aboutsummaryrefslogtreecommitdiffstats
path: root/fs/ocfs2
diff options
context:
space:
mode:
authorGreg Kroah-Hartman <gregkh@suse.de>2006-01-06 15:59:59 -0500
committerGreg Kroah-Hartman <gregkh@suse.de>2006-01-06 15:59:59 -0500
commitccf18968b1bbc2fb117190a1984ac2a826dac228 (patch)
tree7bc8fbf5722aecf1e84fa50c31c657864cba1daa /fs/ocfs2
parente91c021c487110386a07facd0396e6c3b7cf9c1f (diff)
parentd99cf9d679a520d67f81d805b7cb91c68e1847f0 (diff)
Merge ../torvalds-2.6/
Diffstat (limited to 'fs/ocfs2')
-rw-r--r--fs/ocfs2/Makefile33
-rw-r--r--fs/ocfs2/alloc.c2040
-rw-r--r--fs/ocfs2/alloc.h82
-rw-r--r--fs/ocfs2/aops.c643
-rw-r--r--fs/ocfs2/aops.h41
-rw-r--r--fs/ocfs2/buffer_head_io.c232
-rw-r--r--fs/ocfs2/buffer_head_io.h73
-rw-r--r--fs/ocfs2/cluster/Makefile4
-rw-r--r--fs/ocfs2/cluster/endian.h30
-rw-r--r--fs/ocfs2/cluster/heartbeat.c1797
-rw-r--r--fs/ocfs2/cluster/heartbeat.h82
-rw-r--r--fs/ocfs2/cluster/masklog.c166
-rw-r--r--fs/ocfs2/cluster/masklog.h275
-rw-r--r--fs/ocfs2/cluster/nodemanager.c791
-rw-r--r--fs/ocfs2/cluster/nodemanager.h64
-rw-r--r--fs/ocfs2/cluster/ocfs2_heartbeat.h37
-rw-r--r--fs/ocfs2/cluster/ocfs2_nodemanager.h39
-rw-r--r--fs/ocfs2/cluster/quorum.c315
-rw-r--r--fs/ocfs2/cluster/quorum.h36
-rw-r--r--fs/ocfs2/cluster/sys.c124
-rw-r--r--fs/ocfs2/cluster/sys.h33
-rw-r--r--fs/ocfs2/cluster/tcp.c1829
-rw-r--r--fs/ocfs2/cluster/tcp.h113
-rw-r--r--fs/ocfs2/cluster/tcp_internal.h174
-rw-r--r--fs/ocfs2/cluster/ver.c42
-rw-r--r--fs/ocfs2/cluster/ver.h31
-rw-r--r--fs/ocfs2/dcache.c91
-rw-r--r--fs/ocfs2/dcache.h31
-rw-r--r--fs/ocfs2/dir.c618
-rw-r--r--fs/ocfs2/dir.h54
-rw-r--r--fs/ocfs2/dlm/Makefile8
-rw-r--r--fs/ocfs2/dlm/dlmapi.h214
-rw-r--r--fs/ocfs2/dlm/dlmast.c466
-rw-r--r--fs/ocfs2/dlm/dlmcommon.h884
-rw-r--r--fs/ocfs2/dlm/dlmconvert.c530
-rw-r--r--fs/ocfs2/dlm/dlmconvert.h35
-rw-r--r--fs/ocfs2/dlm/dlmdebug.c246
-rw-r--r--fs/ocfs2/dlm/dlmdebug.h30
-rw-r--r--fs/ocfs2/dlm/dlmdomain.c1469
-rw-r--r--fs/ocfs2/dlm/dlmdomain.h36
-rw-r--r--fs/ocfs2/dlm/dlmfs.c640
-rw-r--r--fs/ocfs2/dlm/dlmfsver.c42
-rw-r--r--fs/ocfs2/dlm/dlmfsver.h31
-rw-r--r--fs/ocfs2/dlm/dlmlock.c676
-rw-r--r--fs/ocfs2/dlm/dlmmaster.c2664
-rw-r--r--fs/ocfs2/dlm/dlmrecovery.c2132
-rw-r--r--fs/ocfs2/dlm/dlmthread.c692
-rw-r--r--fs/ocfs2/dlm/dlmunlock.c672
-rw-r--r--fs/ocfs2/dlm/dlmver.c42
-rw-r--r--fs/ocfs2/dlm/dlmver.h31
-rw-r--r--fs/ocfs2/dlm/userdlm.c658
-rw-r--r--fs/ocfs2/dlm/userdlm.h111
-rw-r--r--fs/ocfs2/dlmglue.c2904
-rw-r--r--fs/ocfs2/dlmglue.h111
-rw-r--r--fs/ocfs2/endian.h45
-rw-r--r--fs/ocfs2/export.c248
-rw-r--r--fs/ocfs2/export.h31
-rw-r--r--fs/ocfs2/extent_map.c994
-rw-r--r--fs/ocfs2/extent_map.h46
-rw-r--r--fs/ocfs2/file.c1237
-rw-r--r--fs/ocfs2/file.h57
-rw-r--r--fs/ocfs2/heartbeat.c378
-rw-r--r--fs/ocfs2/heartbeat.h67
-rw-r--r--fs/ocfs2/inode.c1140
-rw-r--r--fs/ocfs2/inode.h145
-rw-r--r--fs/ocfs2/journal.c1652
-rw-r--r--fs/ocfs2/journal.h457
-rw-r--r--fs/ocfs2/localalloc.c983
-rw-r--r--fs/ocfs2/localalloc.h56
-rw-r--r--fs/ocfs2/mmap.c102
-rw-r--r--fs/ocfs2/mmap.h6
-rw-r--r--fs/ocfs2/namei.c2264
-rw-r--r--fs/ocfs2/namei.h58
-rw-r--r--fs/ocfs2/ocfs1_fs_compat.h109
-rw-r--r--fs/ocfs2/ocfs2.h464
-rw-r--r--fs/ocfs2/ocfs2_fs.h638
-rw-r--r--fs/ocfs2/ocfs2_lockid.h73
-rw-r--r--fs/ocfs2/slot_map.c303
-rw-r--r--fs/ocfs2/slot_map.h66
-rw-r--r--fs/ocfs2/suballoc.c1651
-rw-r--r--fs/ocfs2/suballoc.h132
-rw-r--r--fs/ocfs2/super.c1733
-rw-r--r--fs/ocfs2/super.h44
-rw-r--r--fs/ocfs2/symlink.c180
-rw-r--r--fs/ocfs2/symlink.h42
-rw-r--r--fs/ocfs2/sysfile.c131
-rw-r--r--fs/ocfs2/sysfile.h33
-rw-r--r--fs/ocfs2/uptodate.c544
-rw-r--r--fs/ocfs2/uptodate.h44
-rw-r--r--fs/ocfs2/ver.c43
-rw-r--r--fs/ocfs2/ver.h31
-rw-r--r--fs/ocfs2/vote.c1202
-rw-r--r--fs/ocfs2/vote.h56
93 files changed, 42729 insertions, 0 deletions
diff --git a/fs/ocfs2/Makefile b/fs/ocfs2/Makefile
new file mode 100644
index 000000000000..7d3be845a614
--- /dev/null
+++ b/fs/ocfs2/Makefile
@@ -0,0 +1,33 @@
1EXTRA_CFLAGS += -Ifs/ocfs2
2
3EXTRA_CFLAGS += -DCATCH_BH_JBD_RACES
4
5obj-$(CONFIG_OCFS2_FS) += ocfs2.o
6
7ocfs2-objs := \
8 alloc.o \
9 aops.o \
10 buffer_head_io.o \
11 dcache.o \
12 dir.o \
13 dlmglue.o \
14 export.o \
15 extent_map.o \
16 file.o \
17 heartbeat.o \
18 inode.o \
19 journal.o \
20 localalloc.o \
21 mmap.o \
22 namei.o \
23 slot_map.o \
24 suballoc.o \
25 super.o \
26 symlink.o \
27 sysfile.o \
28 uptodate.o \
29 ver.o \
30 vote.o
31
32obj-$(CONFIG_OCFS2_FS) += cluster/
33obj-$(CONFIG_OCFS2_FS) += dlm/
diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c
new file mode 100644
index 000000000000..465f797451ee
--- /dev/null
+++ b/fs/ocfs2/alloc.c
@@ -0,0 +1,2040 @@
1/* -*- mode: c; c-basic-offset: 8; -*-
2 * vim: noexpandtab sw=8 ts=8 sts=0:
3 *
4 * alloc.c
5 *
6 * Extent allocs and frees
7 *
8 * Copyright (C) 2002, 2004 Oracle. All rights reserved.
9 *
10 * This program is free software; you can redistribute it and/or
11 * modify it under the terms of the GNU General Public
12 * License as published by the Free Software Foundation; either
13 * version 2 of the License, or (at your option) any later version.
14 *
15 * This program is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
18 * General Public License for more details.
19 *
20 * You should have received a copy of the GNU General Public
21 * License along with this program; if not, write to the
22 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
23 * Boston, MA 02111-1307, USA.
24 */
25
26#include <linux/fs.h>
27#include <linux/types.h>
28#include <linux/slab.h>
29#include <linux/highmem.h>
30
31#define MLOG_MASK_PREFIX ML_DISK_ALLOC
32#include <cluster/masklog.h>
33
34#include "ocfs2.h"
35
36#include "alloc.h"
37#include "dlmglue.h"
38#include "extent_map.h"
39#include "inode.h"
40#include "journal.h"
41#include "localalloc.h"
42#include "suballoc.h"
43#include "sysfile.h"
44#include "file.h"
45#include "super.h"
46#include "uptodate.h"
47
48#include "buffer_head_io.h"
49
50static int ocfs2_extent_contig(struct inode *inode,
51 struct ocfs2_extent_rec *ext,
52 u64 blkno);
53
54static int ocfs2_create_new_meta_bhs(struct ocfs2_super *osb,
55 struct ocfs2_journal_handle *handle,
56 struct inode *inode,
57 int wanted,
58 struct ocfs2_alloc_context *meta_ac,
59 struct buffer_head *bhs[]);
60
61static int ocfs2_add_branch(struct ocfs2_super *osb,
62 struct ocfs2_journal_handle *handle,
63 struct inode *inode,
64 struct buffer_head *fe_bh,
65 struct buffer_head *eb_bh,
66 struct buffer_head *last_eb_bh,
67 struct ocfs2_alloc_context *meta_ac);
68
69static int ocfs2_shift_tree_depth(struct ocfs2_super *osb,
70 struct ocfs2_journal_handle *handle,
71 struct inode *inode,
72 struct buffer_head *fe_bh,
73 struct ocfs2_alloc_context *meta_ac,
74 struct buffer_head **ret_new_eb_bh);
75
76static int ocfs2_do_insert_extent(struct ocfs2_super *osb,
77 struct ocfs2_journal_handle *handle,
78 struct inode *inode,
79 struct buffer_head *fe_bh,
80 u64 blkno,
81 u32 new_clusters);
82
83static int ocfs2_find_branch_target(struct ocfs2_super *osb,
84 struct inode *inode,
85 struct buffer_head *fe_bh,
86 struct buffer_head **target_bh);
87
88static int ocfs2_find_new_last_ext_blk(struct ocfs2_super *osb,
89 struct inode *inode,
90 struct ocfs2_dinode *fe,
91 unsigned int new_i_clusters,
92 struct buffer_head *old_last_eb,
93 struct buffer_head **new_last_eb);
94
95static void ocfs2_free_truncate_context(struct ocfs2_truncate_context *tc);
96
97static int ocfs2_extent_contig(struct inode *inode,
98 struct ocfs2_extent_rec *ext,
99 u64 blkno)
100{
101 return blkno == (le64_to_cpu(ext->e_blkno) +
102 ocfs2_clusters_to_blocks(inode->i_sb,
103 le32_to_cpu(ext->e_clusters)));
104}
105
106/*
107 * How many free extents have we got before we need more meta data?
108 */
109int ocfs2_num_free_extents(struct ocfs2_super *osb,
110 struct inode *inode,
111 struct ocfs2_dinode *fe)
112{
113 int retval;
114 struct ocfs2_extent_list *el;
115 struct ocfs2_extent_block *eb;
116 struct buffer_head *eb_bh = NULL;
117
118 mlog_entry_void();
119
120 if (!OCFS2_IS_VALID_DINODE(fe)) {
121 OCFS2_RO_ON_INVALID_DINODE(inode->i_sb, fe);
122 retval = -EIO;
123 goto bail;
124 }
125
126 if (fe->i_last_eb_blk) {
127 retval = ocfs2_read_block(osb, le64_to_cpu(fe->i_last_eb_blk),
128 &eb_bh, OCFS2_BH_CACHED, inode);
129 if (retval < 0) {
130 mlog_errno(retval);
131 goto bail;
132 }
133 eb = (struct ocfs2_extent_block *) eb_bh->b_data;
134 el = &eb->h_list;
135 } else
136 el = &fe->id2.i_list;
137
138 BUG_ON(el->l_tree_depth != 0);
139
140 retval = le16_to_cpu(el->l_count) - le16_to_cpu(el->l_next_free_rec);
141bail:
142 if (eb_bh)
143 brelse(eb_bh);
144
145 mlog_exit(retval);
146 return retval;
147}
148
149/* expects array to already be allocated
150 *
151 * sets h_signature, h_blkno, h_suballoc_bit, h_suballoc_slot, and
152 * l_count for you
153 */
154static int ocfs2_create_new_meta_bhs(struct ocfs2_super *osb,
155 struct ocfs2_journal_handle *handle,
156 struct inode *inode,
157 int wanted,
158 struct ocfs2_alloc_context *meta_ac,
159 struct buffer_head *bhs[])
160{
161 int count, status, i;
162 u16 suballoc_bit_start;
163 u32 num_got;
164 u64 first_blkno;
165 struct ocfs2_extent_block *eb;
166
167 mlog_entry_void();
168
169 count = 0;
170 while (count < wanted) {
171 status = ocfs2_claim_metadata(osb,
172 handle,
173 meta_ac,
174 wanted - count,
175 &suballoc_bit_start,
176 &num_got,
177 &first_blkno);
178 if (status < 0) {
179 mlog_errno(status);
180 goto bail;
181 }
182
183 for(i = count; i < (num_got + count); i++) {
184 bhs[i] = sb_getblk(osb->sb, first_blkno);
185 if (bhs[i] == NULL) {
186 status = -EIO;
187 mlog_errno(status);
188 goto bail;
189 }
190 ocfs2_set_new_buffer_uptodate(inode, bhs[i]);
191
192 status = ocfs2_journal_access(handle, inode, bhs[i],
193 OCFS2_JOURNAL_ACCESS_CREATE);
194 if (status < 0) {
195 mlog_errno(status);
196 goto bail;
197 }
198
199 memset(bhs[i]->b_data, 0, osb->sb->s_blocksize);
200 eb = (struct ocfs2_extent_block *) bhs[i]->b_data;
201 /* Ok, setup the minimal stuff here. */
202 strcpy(eb->h_signature, OCFS2_EXTENT_BLOCK_SIGNATURE);
203 eb->h_blkno = cpu_to_le64(first_blkno);
204 eb->h_fs_generation = cpu_to_le32(osb->fs_generation);
205
206#ifndef OCFS2_USE_ALL_METADATA_SUBALLOCATORS
207 /* we always use slot zero's suballocator */
208 eb->h_suballoc_slot = 0;
209#else
210 eb->h_suballoc_slot = cpu_to_le16(osb->slot_num);
211#endif
212 eb->h_suballoc_bit = cpu_to_le16(suballoc_bit_start);
213 eb->h_list.l_count =
214 cpu_to_le16(ocfs2_extent_recs_per_eb(osb->sb));
215
216 suballoc_bit_start++;
217 first_blkno++;
218
219 /* We'll also be dirtied by the caller, so
220 * this isn't absolutely necessary. */
221 status = ocfs2_journal_dirty(handle, bhs[i]);
222 if (status < 0) {
223 mlog_errno(status);
224 goto bail;
225 }
226 }
227
228 count += num_got;
229 }
230
231 status = 0;
232bail:
233 if (status < 0) {
234 for(i = 0; i < wanted; i++) {
235 if (bhs[i])
236 brelse(bhs[i]);
237 bhs[i] = NULL;
238 }
239 }
240 mlog_exit(status);
241 return status;
242}
243
244/*
245 * Add an entire tree branch to our inode. eb_bh is the extent block
246 * to start at, if we don't want to start the branch at the dinode
247 * structure.
248 *
249 * last_eb_bh is required as we have to update it's next_leaf pointer
250 * for the new last extent block.
251 *
252 * the new branch will be 'empty' in the sense that every block will
253 * contain a single record with e_clusters == 0.
254 */
255static int ocfs2_add_branch(struct ocfs2_super *osb,
256 struct ocfs2_journal_handle *handle,
257 struct inode *inode,
258 struct buffer_head *fe_bh,
259 struct buffer_head *eb_bh,
260 struct buffer_head *last_eb_bh,
261 struct ocfs2_alloc_context *meta_ac)
262{
263 int status, new_blocks, i;
264 u64 next_blkno, new_last_eb_blk;
265 struct buffer_head *bh;
266 struct buffer_head **new_eb_bhs = NULL;
267 struct ocfs2_dinode *fe;
268 struct ocfs2_extent_block *eb;
269 struct ocfs2_extent_list *eb_el;
270 struct ocfs2_extent_list *el;
271
272 mlog_entry_void();
273
274 BUG_ON(!last_eb_bh);
275
276 fe = (struct ocfs2_dinode *) fe_bh->b_data;
277
278 if (eb_bh) {
279 eb = (struct ocfs2_extent_block *) eb_bh->b_data;
280 el = &eb->h_list;
281 } else
282 el = &fe->id2.i_list;
283
284 /* we never add a branch to a leaf. */
285 BUG_ON(!el->l_tree_depth);
286
287 new_blocks = le16_to_cpu(el->l_tree_depth);
288
289 /* allocate the number of new eb blocks we need */
290 new_eb_bhs = kcalloc(new_blocks, sizeof(struct buffer_head *),
291 GFP_KERNEL);
292 if (!new_eb_bhs) {
293 status = -ENOMEM;
294 mlog_errno(status);
295 goto bail;
296 }
297
298 status = ocfs2_create_new_meta_bhs(osb, handle, inode, new_blocks,
299 meta_ac, new_eb_bhs);
300 if (status < 0) {
301 mlog_errno(status);
302 goto bail;
303 }
304
305 /* Note: new_eb_bhs[new_blocks - 1] is the guy which will be
306 * linked with the rest of the tree.
307 * conversly, new_eb_bhs[0] is the new bottommost leaf.
308 *
309 * when we leave the loop, new_last_eb_blk will point to the
310 * newest leaf, and next_blkno will point to the topmost extent
311 * block. */
312 next_blkno = new_last_eb_blk = 0;
313 for(i = 0; i < new_blocks; i++) {
314 bh = new_eb_bhs[i];
315 eb = (struct ocfs2_extent_block *) bh->b_data;
316 if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) {
317 OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb, eb);
318 status = -EIO;
319 goto bail;
320 }
321 eb_el = &eb->h_list;
322
323 status = ocfs2_journal_access(handle, inode, bh,
324 OCFS2_JOURNAL_ACCESS_CREATE);
325 if (status < 0) {
326 mlog_errno(status);
327 goto bail;
328 }
329
330 eb->h_next_leaf_blk = 0;
331 eb_el->l_tree_depth = cpu_to_le16(i);
332 eb_el->l_next_free_rec = cpu_to_le16(1);
333 eb_el->l_recs[0].e_cpos = fe->i_clusters;
334 eb_el->l_recs[0].e_blkno = cpu_to_le64(next_blkno);
335 eb_el->l_recs[0].e_clusters = cpu_to_le32(0);
336 if (!eb_el->l_tree_depth)
337 new_last_eb_blk = le64_to_cpu(eb->h_blkno);
338
339 status = ocfs2_journal_dirty(handle, bh);
340 if (status < 0) {
341 mlog_errno(status);
342 goto bail;
343 }
344
345 next_blkno = le64_to_cpu(eb->h_blkno);
346 }
347
348 /* This is a bit hairy. We want to update up to three blocks
349 * here without leaving any of them in an inconsistent state
350 * in case of error. We don't have to worry about
351 * journal_dirty erroring as it won't unless we've aborted the
352 * handle (in which case we would never be here) so reserving
353 * the write with journal_access is all we need to do. */
354 status = ocfs2_journal_access(handle, inode, last_eb_bh,
355 OCFS2_JOURNAL_ACCESS_WRITE);
356 if (status < 0) {
357 mlog_errno(status);
358 goto bail;
359 }
360 status = ocfs2_journal_access(handle, inode, fe_bh,
361 OCFS2_JOURNAL_ACCESS_WRITE);
362 if (status < 0) {
363 mlog_errno(status);
364 goto bail;
365 }
366 if (eb_bh) {
367 status = ocfs2_journal_access(handle, inode, eb_bh,
368 OCFS2_JOURNAL_ACCESS_WRITE);
369 if (status < 0) {
370 mlog_errno(status);
371 goto bail;
372 }
373 }
374
375 /* Link the new branch into the rest of the tree (el will
376 * either be on the fe, or the extent block passed in. */
377 i = le16_to_cpu(el->l_next_free_rec);
378 el->l_recs[i].e_blkno = cpu_to_le64(next_blkno);
379 el->l_recs[i].e_cpos = fe->i_clusters;
380 el->l_recs[i].e_clusters = 0;
381 le16_add_cpu(&el->l_next_free_rec, 1);
382
383 /* fe needs a new last extent block pointer, as does the
384 * next_leaf on the previously last-extent-block. */
385 fe->i_last_eb_blk = cpu_to_le64(new_last_eb_blk);
386
387 eb = (struct ocfs2_extent_block *) last_eb_bh->b_data;
388 eb->h_next_leaf_blk = cpu_to_le64(new_last_eb_blk);
389
390 status = ocfs2_journal_dirty(handle, last_eb_bh);
391 if (status < 0)
392 mlog_errno(status);
393 status = ocfs2_journal_dirty(handle, fe_bh);
394 if (status < 0)
395 mlog_errno(status);
396 if (eb_bh) {
397 status = ocfs2_journal_dirty(handle, eb_bh);
398 if (status < 0)
399 mlog_errno(status);
400 }
401
402 status = 0;
403bail:
404 if (new_eb_bhs) {
405 for (i = 0; i < new_blocks; i++)
406 if (new_eb_bhs[i])
407 brelse(new_eb_bhs[i]);
408 kfree(new_eb_bhs);
409 }
410
411 mlog_exit(status);
412 return status;
413}
414
415/*
416 * adds another level to the allocation tree.
417 * returns back the new extent block so you can add a branch to it
418 * after this call.
419 */
420static int ocfs2_shift_tree_depth(struct ocfs2_super *osb,
421 struct ocfs2_journal_handle *handle,
422 struct inode *inode,
423 struct buffer_head *fe_bh,
424 struct ocfs2_alloc_context *meta_ac,
425 struct buffer_head **ret_new_eb_bh)
426{
427 int status, i;
428 struct buffer_head *new_eb_bh = NULL;
429 struct ocfs2_dinode *fe;
430 struct ocfs2_extent_block *eb;
431 struct ocfs2_extent_list *fe_el;
432 struct ocfs2_extent_list *eb_el;
433
434 mlog_entry_void();
435
436 status = ocfs2_create_new_meta_bhs(osb, handle, inode, 1, meta_ac,
437 &new_eb_bh);
438 if (status < 0) {
439 mlog_errno(status);
440 goto bail;
441 }
442
443 eb = (struct ocfs2_extent_block *) new_eb_bh->b_data;
444 if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) {
445 OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb, eb);
446 status = -EIO;
447 goto bail;
448 }
449
450 eb_el = &eb->h_list;
451 fe = (struct ocfs2_dinode *) fe_bh->b_data;
452 fe_el = &fe->id2.i_list;
453
454 status = ocfs2_journal_access(handle, inode, new_eb_bh,
455 OCFS2_JOURNAL_ACCESS_CREATE);
456 if (status < 0) {
457 mlog_errno(status);
458 goto bail;
459 }
460
461 /* copy the fe data into the new extent block */
462 eb_el->l_tree_depth = fe_el->l_tree_depth;
463 eb_el->l_next_free_rec = fe_el->l_next_free_rec;
464 for(i = 0; i < le16_to_cpu(fe_el->l_next_free_rec); i++) {
465 eb_el->l_recs[i].e_cpos = fe_el->l_recs[i].e_cpos;
466 eb_el->l_recs[i].e_clusters = fe_el->l_recs[i].e_clusters;
467 eb_el->l_recs[i].e_blkno = fe_el->l_recs[i].e_blkno;
468 }
469
470 status = ocfs2_journal_dirty(handle, new_eb_bh);
471 if (status < 0) {
472 mlog_errno(status);
473 goto bail;
474 }
475
476 status = ocfs2_journal_access(handle, inode, fe_bh,
477 OCFS2_JOURNAL_ACCESS_WRITE);
478 if (status < 0) {
479 mlog_errno(status);
480 goto bail;
481 }
482
483 /* update fe now */
484 le16_add_cpu(&fe_el->l_tree_depth, 1);
485 fe_el->l_recs[0].e_cpos = 0;
486 fe_el->l_recs[0].e_blkno = eb->h_blkno;
487 fe_el->l_recs[0].e_clusters = fe->i_clusters;
488 for(i = 1; i < le16_to_cpu(fe_el->l_next_free_rec); i++) {
489 fe_el->l_recs[i].e_cpos = 0;
490 fe_el->l_recs[i].e_clusters = 0;
491 fe_el->l_recs[i].e_blkno = 0;
492 }
493 fe_el->l_next_free_rec = cpu_to_le16(1);
494
495 /* If this is our 1st tree depth shift, then last_eb_blk
496 * becomes the allocated extent block */
497 if (fe_el->l_tree_depth == cpu_to_le16(1))
498 fe->i_last_eb_blk = eb->h_blkno;
499
500 status = ocfs2_journal_dirty(handle, fe_bh);
501 if (status < 0) {
502 mlog_errno(status);
503 goto bail;
504 }
505
506 *ret_new_eb_bh = new_eb_bh;
507 new_eb_bh = NULL;
508 status = 0;
509bail:
510 if (new_eb_bh)
511 brelse(new_eb_bh);
512
513 mlog_exit(status);
514 return status;
515}
516
517/*
518 * Expects the tree to already have room in the rightmost leaf for the
519 * extent. Updates all the extent blocks (and the dinode) on the way
520 * down.
521 */
522static int ocfs2_do_insert_extent(struct ocfs2_super *osb,
523 struct ocfs2_journal_handle *handle,
524 struct inode *inode,
525 struct buffer_head *fe_bh,
526 u64 start_blk,
527 u32 new_clusters)
528{
529 int status, i, num_bhs = 0;
530 u64 next_blkno;
531 u16 next_free;
532 struct buffer_head **eb_bhs = NULL;
533 struct ocfs2_dinode *fe;
534 struct ocfs2_extent_block *eb;
535 struct ocfs2_extent_list *el;
536
537 mlog_entry_void();
538
539 status = ocfs2_journal_access(handle, inode, fe_bh,
540 OCFS2_JOURNAL_ACCESS_WRITE);
541 if (status < 0) {
542 mlog_errno(status);
543 goto bail;
544 }
545
546 fe = (struct ocfs2_dinode *) fe_bh->b_data;
547 el = &fe->id2.i_list;
548 if (el->l_tree_depth) {
549 /* This is another operation where we want to be
550 * careful about our tree updates. An error here means
551 * none of the previous changes we made should roll
552 * forward. As a result, we have to record the buffers
553 * for this part of the tree in an array and reserve a
554 * journal write to them before making any changes. */
555 num_bhs = le16_to_cpu(fe->id2.i_list.l_tree_depth);
556 eb_bhs = kcalloc(num_bhs, sizeof(struct buffer_head *),
557 GFP_KERNEL);
558 if (!eb_bhs) {
559 status = -ENOMEM;
560 mlog_errno(status);
561 goto bail;
562 }
563
564 i = 0;
565 while(el->l_tree_depth) {
566 next_free = le16_to_cpu(el->l_next_free_rec);
567 if (next_free == 0) {
568 ocfs2_error(inode->i_sb,
569 "Dinode %"MLFu64" has a bad "
570 "extent list",
571 OCFS2_I(inode)->ip_blkno);
572 status = -EIO;
573 goto bail;
574 }
575 next_blkno = le64_to_cpu(el->l_recs[next_free - 1].e_blkno);
576
577 BUG_ON(i >= num_bhs);
578 status = ocfs2_read_block(osb, next_blkno, &eb_bhs[i],
579 OCFS2_BH_CACHED, inode);
580 if (status < 0) {
581 mlog_errno(status);
582 goto bail;
583 }
584 eb = (struct ocfs2_extent_block *) eb_bhs[i]->b_data;
585 if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) {
586 OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb,
587 eb);
588 status = -EIO;
589 goto bail;
590 }
591
592 status = ocfs2_journal_access(handle, inode, eb_bhs[i],
593 OCFS2_JOURNAL_ACCESS_WRITE);
594 if (status < 0) {
595 mlog_errno(status);
596 goto bail;
597 }
598
599 el = &eb->h_list;
600 i++;
601 /* When we leave this loop, eb_bhs[num_bhs - 1] will
602 * hold the bottom-most leaf extent block. */
603 }
604 BUG_ON(el->l_tree_depth);
605
606 el = &fe->id2.i_list;
607 /* If we have tree depth, then the fe update is
608 * trivial, and we want to switch el out for the
609 * bottom-most leaf in order to update it with the
610 * actual extent data below. */
611 next_free = le16_to_cpu(el->l_next_free_rec);
612 if (next_free == 0) {
613 ocfs2_error(inode->i_sb,
614 "Dinode %"MLFu64" has a bad "
615 "extent list",
616 OCFS2_I(inode)->ip_blkno);
617 status = -EIO;
618 goto bail;
619 }
620 le32_add_cpu(&el->l_recs[next_free - 1].e_clusters,
621 new_clusters);
622 /* (num_bhs - 1) to avoid the leaf */
623 for(i = 0; i < (num_bhs - 1); i++) {
624 eb = (struct ocfs2_extent_block *) eb_bhs[i]->b_data;
625 el = &eb->h_list;
626
627 /* finally, make our actual change to the
628 * intermediate extent blocks. */
629 next_free = le16_to_cpu(el->l_next_free_rec);
630 le32_add_cpu(&el->l_recs[next_free - 1].e_clusters,
631 new_clusters);
632
633 status = ocfs2_journal_dirty(handle, eb_bhs[i]);
634 if (status < 0)
635 mlog_errno(status);
636 }
637 BUG_ON(i != (num_bhs - 1));
638 /* note that the leaf block wasn't touched in
639 * the loop above */
640 eb = (struct ocfs2_extent_block *) eb_bhs[num_bhs - 1]->b_data;
641 el = &eb->h_list;
642 BUG_ON(el->l_tree_depth);
643 }
644
645 /* yay, we can finally add the actual extent now! */
646 i = le16_to_cpu(el->l_next_free_rec) - 1;
647 if (le16_to_cpu(el->l_next_free_rec) &&
648 ocfs2_extent_contig(inode, &el->l_recs[i], start_blk)) {
649 le32_add_cpu(&el->l_recs[i].e_clusters, new_clusters);
650 } else if (le16_to_cpu(el->l_next_free_rec) &&
651 (le32_to_cpu(el->l_recs[i].e_clusters) == 0)) {
652 /* having an empty extent at eof is legal. */
653 if (el->l_recs[i].e_cpos != fe->i_clusters) {
654 ocfs2_error(inode->i_sb,
655 "Dinode %"MLFu64" trailing extent is bad: "
656 "cpos (%u) != number of clusters (%u)",
657 le32_to_cpu(el->l_recs[i].e_cpos),
658 le32_to_cpu(fe->i_clusters));
659 status = -EIO;
660 goto bail;
661 }
662 el->l_recs[i].e_blkno = cpu_to_le64(start_blk);
663 el->l_recs[i].e_clusters = cpu_to_le32(new_clusters);
664 } else {
665 /* No contiguous record, or no empty record at eof, so
666 * we add a new one. */
667
668 BUG_ON(le16_to_cpu(el->l_next_free_rec) >=
669 le16_to_cpu(el->l_count));
670 i = le16_to_cpu(el->l_next_free_rec);
671
672 el->l_recs[i].e_blkno = cpu_to_le64(start_blk);
673 el->l_recs[i].e_clusters = cpu_to_le32(new_clusters);
674 el->l_recs[i].e_cpos = fe->i_clusters;
675 le16_add_cpu(&el->l_next_free_rec, 1);
676 }
677
678 /*
679 * extent_map errors are not fatal, so they are ignored outside
680 * of flushing the thing.
681 */
682 status = ocfs2_extent_map_append(inode, &el->l_recs[i],
683 new_clusters);
684 if (status) {
685 mlog_errno(status);
686 ocfs2_extent_map_drop(inode, le32_to_cpu(fe->i_clusters));
687 }
688
689 status = ocfs2_journal_dirty(handle, fe_bh);
690 if (status < 0)
691 mlog_errno(status);
692 if (fe->id2.i_list.l_tree_depth) {
693 status = ocfs2_journal_dirty(handle, eb_bhs[num_bhs - 1]);
694 if (status < 0)
695 mlog_errno(status);
696 }
697
698 status = 0;
699bail:
700 if (eb_bhs) {
701 for (i = 0; i < num_bhs; i++)
702 if (eb_bhs[i])
703 brelse(eb_bhs[i]);
704 kfree(eb_bhs);
705 }
706
707 mlog_exit(status);
708 return status;
709}
710
711/*
712 * Should only be called when there is no space left in any of the
713 * leaf nodes. What we want to do is find the lowest tree depth
714 * non-leaf extent block with room for new records. There are three
715 * valid results of this search:
716 *
717 * 1) a lowest extent block is found, then we pass it back in
718 * *lowest_eb_bh and return '0'
719 *
720 * 2) the search fails to find anything, but the dinode has room. We
721 * pass NULL back in *lowest_eb_bh, but still return '0'
722 *
723 * 3) the search fails to find anything AND the dinode is full, in
724 * which case we return > 0
725 *
726 * return status < 0 indicates an error.
727 */
728static int ocfs2_find_branch_target(struct ocfs2_super *osb,
729 struct inode *inode,
730 struct buffer_head *fe_bh,
731 struct buffer_head **target_bh)
732{
733 int status = 0, i;
734 u64 blkno;
735 struct ocfs2_dinode *fe;
736 struct ocfs2_extent_block *eb;
737 struct ocfs2_extent_list *el;
738 struct buffer_head *bh = NULL;
739 struct buffer_head *lowest_bh = NULL;
740
741 mlog_entry_void();
742
743 *target_bh = NULL;
744
745 fe = (struct ocfs2_dinode *) fe_bh->b_data;
746 el = &fe->id2.i_list;
747
748 while(le16_to_cpu(el->l_tree_depth) > 1) {
749 if (le16_to_cpu(el->l_next_free_rec) == 0) {
750 ocfs2_error(inode->i_sb, "Dinode %"MLFu64" has empty "
751 "extent list (next_free_rec == 0)",
752 OCFS2_I(inode)->ip_blkno);
753 status = -EIO;
754 goto bail;
755 }
756 i = le16_to_cpu(el->l_next_free_rec) - 1;
757 blkno = le64_to_cpu(el->l_recs[i].e_blkno);
758 if (!blkno) {
759 ocfs2_error(inode->i_sb, "Dinode %"MLFu64" has extent "
760 "list where extent # %d has no physical "
761 "block start",
762 OCFS2_I(inode)->ip_blkno, i);
763 status = -EIO;
764 goto bail;
765 }
766
767 if (bh) {
768 brelse(bh);
769 bh = NULL;
770 }
771
772 status = ocfs2_read_block(osb, blkno, &bh, OCFS2_BH_CACHED,
773 inode);
774 if (status < 0) {
775 mlog_errno(status);
776 goto bail;
777 }
778
779 eb = (struct ocfs2_extent_block *) bh->b_data;
780 if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) {
781 OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb, eb);
782 status = -EIO;
783 goto bail;
784 }
785 el = &eb->h_list;
786
787 if (le16_to_cpu(el->l_next_free_rec) <
788 le16_to_cpu(el->l_count)) {
789 if (lowest_bh)
790 brelse(lowest_bh);
791 lowest_bh = bh;
792 get_bh(lowest_bh);
793 }
794 }
795
796 /* If we didn't find one and the fe doesn't have any room,
797 * then return '1' */
798 if (!lowest_bh
799 && (fe->id2.i_list.l_next_free_rec == fe->id2.i_list.l_count))
800 status = 1;
801
802 *target_bh = lowest_bh;
803bail:
804 if (bh)
805 brelse(bh);
806
807 mlog_exit(status);
808 return status;
809}
810
811/* the caller needs to update fe->i_clusters */
812int ocfs2_insert_extent(struct ocfs2_super *osb,
813 struct ocfs2_journal_handle *handle,
814 struct inode *inode,
815 struct buffer_head *fe_bh,
816 u64 start_blk,
817 u32 new_clusters,
818 struct ocfs2_alloc_context *meta_ac)
819{
820 int status, i, shift;
821 struct buffer_head *last_eb_bh = NULL;
822 struct buffer_head *bh = NULL;
823 struct ocfs2_dinode *fe;
824 struct ocfs2_extent_block *eb;
825 struct ocfs2_extent_list *el;
826
827 mlog_entry_void();
828
829 mlog(0, "add %u clusters starting at block %"MLFu64" to "
830 "inode %"MLFu64"\n",
831 new_clusters, start_blk, OCFS2_I(inode)->ip_blkno);
832
833 fe = (struct ocfs2_dinode *) fe_bh->b_data;
834 el = &fe->id2.i_list;
835
836 if (el->l_tree_depth) {
837 /* jump to end of tree */
838 status = ocfs2_read_block(osb, le64_to_cpu(fe->i_last_eb_blk),
839 &last_eb_bh, OCFS2_BH_CACHED, inode);
840 if (status < 0) {
841 mlog_exit(status);
842 goto bail;
843 }
844 eb = (struct ocfs2_extent_block *) last_eb_bh->b_data;
845 el = &eb->h_list;
846 }
847
848 /* Can we allocate without adding/shifting tree bits? */
849 i = le16_to_cpu(el->l_next_free_rec) - 1;
850 if (le16_to_cpu(el->l_next_free_rec) == 0
851 || (le16_to_cpu(el->l_next_free_rec) < le16_to_cpu(el->l_count))
852 || le32_to_cpu(el->l_recs[i].e_clusters) == 0
853 || ocfs2_extent_contig(inode, &el->l_recs[i], start_blk))
854 goto out_add;
855
856 mlog(0, "ocfs2_allocate_extent: couldn't do a simple add, traversing "
857 "tree now.\n");
858
859 shift = ocfs2_find_branch_target(osb, inode, fe_bh, &bh);
860 if (shift < 0) {
861 status = shift;
862 mlog_errno(status);
863 goto bail;
864 }
865
866 /* We traveled all the way to the bottom of the allocation tree
867 * and didn't find room for any more extents - we need to add
868 * another tree level */
869 if (shift) {
870 /* if we hit a leaf, we'd better be empty :) */
871 BUG_ON(le16_to_cpu(el->l_next_free_rec) !=
872 le16_to_cpu(el->l_count));
873 BUG_ON(bh);
874 mlog(0, "ocfs2_allocate_extent: need to shift tree depth "
875 "(current = %u)\n",
876 le16_to_cpu(fe->id2.i_list.l_tree_depth));
877
878 /* ocfs2_shift_tree_depth will return us a buffer with
879 * the new extent block (so we can pass that to
880 * ocfs2_add_branch). */
881 status = ocfs2_shift_tree_depth(osb, handle, inode, fe_bh,
882 meta_ac, &bh);
883 if (status < 0) {
884 mlog_errno(status);
885 goto bail;
886 }
887 /* Special case: we have room now if we shifted from
888 * tree_depth 0 */
889 if (fe->id2.i_list.l_tree_depth == cpu_to_le16(1))
890 goto out_add;
891 }
892
893 /* call ocfs2_add_branch to add the final part of the tree with
894 * the new data. */
895 mlog(0, "ocfs2_allocate_extent: add branch. bh = %p\n", bh);
896 status = ocfs2_add_branch(osb, handle, inode, fe_bh, bh, last_eb_bh,
897 meta_ac);
898 if (status < 0) {
899 mlog_errno(status);
900 goto bail;
901 }
902
903out_add:
904 /* Finally, we can add clusters. */
905 status = ocfs2_do_insert_extent(osb, handle, inode, fe_bh,
906 start_blk, new_clusters);
907 if (status < 0)
908 mlog_errno(status);
909
910bail:
911 if (bh)
912 brelse(bh);
913
914 if (last_eb_bh)
915 brelse(last_eb_bh);
916
917 mlog_exit(status);
918 return status;
919}
920
921static inline int ocfs2_truncate_log_needs_flush(struct ocfs2_super *osb)
922{
923 struct buffer_head *tl_bh = osb->osb_tl_bh;
924 struct ocfs2_dinode *di;
925 struct ocfs2_truncate_log *tl;
926
927 di = (struct ocfs2_dinode *) tl_bh->b_data;
928 tl = &di->id2.i_dealloc;
929
930 mlog_bug_on_msg(le16_to_cpu(tl->tl_used) > le16_to_cpu(tl->tl_count),
931 "slot %d, invalid truncate log parameters: used = "
932 "%u, count = %u\n", osb->slot_num,
933 le16_to_cpu(tl->tl_used), le16_to_cpu(tl->tl_count));
934 return le16_to_cpu(tl->tl_used) == le16_to_cpu(tl->tl_count);
935}
936
937static int ocfs2_truncate_log_can_coalesce(struct ocfs2_truncate_log *tl,
938 unsigned int new_start)
939{
940 unsigned int tail_index;
941 unsigned int current_tail;
942
943 /* No records, nothing to coalesce */
944 if (!le16_to_cpu(tl->tl_used))
945 return 0;
946
947 tail_index = le16_to_cpu(tl->tl_used) - 1;
948 current_tail = le32_to_cpu(tl->tl_recs[tail_index].t_start);
949 current_tail += le32_to_cpu(tl->tl_recs[tail_index].t_clusters);
950
951 return current_tail == new_start;
952}
953
954static int ocfs2_truncate_log_append(struct ocfs2_super *osb,
955 struct ocfs2_journal_handle *handle,
956 u64 start_blk,
957 unsigned int num_clusters)
958{
959 int status, index;
960 unsigned int start_cluster, tl_count;
961 struct inode *tl_inode = osb->osb_tl_inode;
962 struct buffer_head *tl_bh = osb->osb_tl_bh;
963 struct ocfs2_dinode *di;
964 struct ocfs2_truncate_log *tl;
965
966 mlog_entry("start_blk = %"MLFu64", num_clusters = %u\n", start_blk,
967 num_clusters);
968
969 BUG_ON(!down_trylock(&tl_inode->i_sem));
970
971 start_cluster = ocfs2_blocks_to_clusters(osb->sb, start_blk);
972
973 di = (struct ocfs2_dinode *) tl_bh->b_data;
974 tl = &di->id2.i_dealloc;
975 if (!OCFS2_IS_VALID_DINODE(di)) {
976 OCFS2_RO_ON_INVALID_DINODE(osb->sb, di);
977 status = -EIO;
978 goto bail;
979 }
980
981 tl_count = le16_to_cpu(tl->tl_count);
982 mlog_bug_on_msg(tl_count > ocfs2_truncate_recs_per_inode(osb->sb) ||
983 tl_count == 0,
984 "Truncate record count on #%"MLFu64" invalid ("
985 "wanted %u, actual %u\n", OCFS2_I(tl_inode)->ip_blkno,
986 ocfs2_truncate_recs_per_inode(osb->sb),
987 le16_to_cpu(tl->tl_count));
988
989 /* Caller should have known to flush before calling us. */
990 index = le16_to_cpu(tl->tl_used);
991 if (index >= tl_count) {
992 status = -ENOSPC;
993 mlog_errno(status);
994 goto bail;
995 }
996
997 status = ocfs2_journal_access(handle, tl_inode, tl_bh,
998 OCFS2_JOURNAL_ACCESS_WRITE);
999 if (status < 0) {
1000 mlog_errno(status);
1001 goto bail;
1002 }
1003
1004 mlog(0, "Log truncate of %u clusters starting at cluster %u to "
1005 "%"MLFu64" (index = %d)\n", num_clusters, start_cluster,
1006 OCFS2_I(tl_inode)->ip_blkno, index);
1007
1008 if (ocfs2_truncate_log_can_coalesce(tl, start_cluster)) {
1009 /*
1010 * Move index back to the record we are coalescing with.
1011 * ocfs2_truncate_log_can_coalesce() guarantees nonzero
1012 */
1013 index--;
1014
1015 num_clusters += le32_to_cpu(tl->tl_recs[index].t_clusters);
1016 mlog(0, "Coalesce with index %u (start = %u, clusters = %u)\n",
1017 index, le32_to_cpu(tl->tl_recs[index].t_start),
1018 num_clusters);
1019 } else {
1020 tl->tl_recs[index].t_start = cpu_to_le32(start_cluster);
1021 tl->tl_used = cpu_to_le16(index + 1);
1022 }
1023 tl->tl_recs[index].t_clusters = cpu_to_le32(num_clusters);
1024
1025 status = ocfs2_journal_dirty(handle, tl_bh);
1026 if (status < 0) {
1027 mlog_errno(status);
1028 goto bail;
1029 }
1030
1031bail:
1032 mlog_exit(status);
1033 return status;
1034}
1035
1036static int ocfs2_replay_truncate_records(struct ocfs2_super *osb,
1037 struct ocfs2_journal_handle *handle,
1038 struct inode *data_alloc_inode,
1039 struct buffer_head *data_alloc_bh)
1040{
1041 int status = 0;
1042 int i;
1043 unsigned int num_clusters;
1044 u64 start_blk;
1045 struct ocfs2_truncate_rec rec;
1046 struct ocfs2_dinode *di;
1047 struct ocfs2_truncate_log *tl;
1048 struct inode *tl_inode = osb->osb_tl_inode;
1049 struct buffer_head *tl_bh = osb->osb_tl_bh;
1050
1051 mlog_entry_void();
1052
1053 di = (struct ocfs2_dinode *) tl_bh->b_data;
1054 tl = &di->id2.i_dealloc;
1055 i = le16_to_cpu(tl->tl_used) - 1;
1056 while (i >= 0) {
1057 /* Caller has given us at least enough credits to
1058 * update the truncate log dinode */
1059 status = ocfs2_journal_access(handle, tl_inode, tl_bh,
1060 OCFS2_JOURNAL_ACCESS_WRITE);
1061 if (status < 0) {
1062 mlog_errno(status);
1063 goto bail;
1064 }
1065
1066 tl->tl_used = cpu_to_le16(i);
1067
1068 status = ocfs2_journal_dirty(handle, tl_bh);
1069 if (status < 0) {
1070 mlog_errno(status);
1071 goto bail;
1072 }
1073
1074 /* TODO: Perhaps we can calculate the bulk of the
1075 * credits up front rather than extending like
1076 * this. */
1077 status = ocfs2_extend_trans(handle,
1078 OCFS2_TRUNCATE_LOG_FLUSH_ONE_REC);
1079 if (status < 0) {
1080 mlog_errno(status);
1081 goto bail;
1082 }
1083
1084 rec = tl->tl_recs[i];
1085 start_blk = ocfs2_clusters_to_blocks(data_alloc_inode->i_sb,
1086 le32_to_cpu(rec.t_start));
1087 num_clusters = le32_to_cpu(rec.t_clusters);
1088
1089 /* if start_blk is not set, we ignore the record as
1090 * invalid. */
1091 if (start_blk) {
1092 mlog(0, "free record %d, start = %u, clusters = %u\n",
1093 i, le32_to_cpu(rec.t_start), num_clusters);
1094
1095 status = ocfs2_free_clusters(handle, data_alloc_inode,
1096 data_alloc_bh, start_blk,
1097 num_clusters);
1098 if (status < 0) {
1099 mlog_errno(status);
1100 goto bail;
1101 }
1102 }
1103 i--;
1104 }
1105
1106bail:
1107 mlog_exit(status);
1108 return status;
1109}
1110
1111/* Expects you to already be holding tl_inode->i_sem */
1112static int __ocfs2_flush_truncate_log(struct ocfs2_super *osb)
1113{
1114 int status;
1115 unsigned int num_to_flush;
1116 struct ocfs2_journal_handle *handle = NULL;
1117 struct inode *tl_inode = osb->osb_tl_inode;
1118 struct inode *data_alloc_inode = NULL;
1119 struct buffer_head *tl_bh = osb->osb_tl_bh;
1120 struct buffer_head *data_alloc_bh = NULL;
1121 struct ocfs2_dinode *di;
1122 struct ocfs2_truncate_log *tl;
1123
1124 mlog_entry_void();
1125
1126 BUG_ON(!down_trylock(&tl_inode->i_sem));
1127
1128 di = (struct ocfs2_dinode *) tl_bh->b_data;
1129 tl = &di->id2.i_dealloc;
1130 if (!OCFS2_IS_VALID_DINODE(di)) {
1131 OCFS2_RO_ON_INVALID_DINODE(osb->sb, di);
1132 status = -EIO;
1133 goto bail;
1134 }
1135
1136 num_to_flush = le16_to_cpu(tl->tl_used);
1137 mlog(0, "Flush %u records from truncate log #%"MLFu64"\n",
1138 num_to_flush, OCFS2_I(tl_inode)->ip_blkno);
1139 if (!num_to_flush) {
1140 status = 0;
1141 goto bail;
1142 }
1143
1144 handle = ocfs2_alloc_handle(osb);
1145 if (!handle) {
1146 status = -ENOMEM;
1147 mlog_errno(status);
1148 goto bail;
1149 }
1150
1151 data_alloc_inode = ocfs2_get_system_file_inode(osb,
1152 GLOBAL_BITMAP_SYSTEM_INODE,
1153 OCFS2_INVALID_SLOT);
1154 if (!data_alloc_inode) {
1155 status = -EINVAL;
1156 mlog(ML_ERROR, "Could not get bitmap inode!\n");
1157 goto bail;
1158 }
1159
1160 ocfs2_handle_add_inode(handle, data_alloc_inode);
1161 status = ocfs2_meta_lock(data_alloc_inode, handle, &data_alloc_bh, 1);
1162 if (status < 0) {
1163 mlog_errno(status);
1164 goto bail;
1165 }
1166
1167 handle = ocfs2_start_trans(osb, handle, OCFS2_TRUNCATE_LOG_UPDATE);
1168 if (IS_ERR(handle)) {
1169 status = PTR_ERR(handle);
1170 handle = NULL;
1171 mlog_errno(status);
1172 goto bail;
1173 }
1174
1175 status = ocfs2_replay_truncate_records(osb, handle, data_alloc_inode,
1176 data_alloc_bh);
1177 if (status < 0) {
1178 mlog_errno(status);
1179 goto bail;
1180 }
1181
1182bail:
1183 if (handle)
1184 ocfs2_commit_trans(handle);
1185
1186 if (data_alloc_inode)
1187 iput(data_alloc_inode);
1188
1189 if (data_alloc_bh)
1190 brelse(data_alloc_bh);
1191
1192 mlog_exit(status);
1193 return status;
1194}
1195
1196int ocfs2_flush_truncate_log(struct ocfs2_super *osb)
1197{
1198 int status;
1199 struct inode *tl_inode = osb->osb_tl_inode;
1200
1201 down(&tl_inode->i_sem);
1202 status = __ocfs2_flush_truncate_log(osb);
1203 up(&tl_inode->i_sem);
1204
1205 return status;
1206}
1207
/*
 * Work function for osb_truncate_log_wq: flushes the truncate log of the
 * ocfs2_super passed in 'data' and logs any failure.
 */
static void ocfs2_truncate_log_worker(void *data)
{
	struct ocfs2_super *osb = data;
	int ret;

	mlog_entry_void();

	ret = ocfs2_flush_truncate_log(osb);
	if (ret < 0)
		mlog_errno(ret);

	mlog_exit(ret);
}
1221
1222#define OCFS2_TRUNCATE_LOG_FLUSH_INTERVAL (2 * HZ)
1223void ocfs2_schedule_truncate_log_flush(struct ocfs2_super *osb,
1224 int cancel)
1225{
1226 if (osb->osb_tl_inode) {
1227 /* We want to push off log flushes while truncates are
1228 * still running. */
1229 if (cancel)
1230 cancel_delayed_work(&osb->osb_truncate_log_wq);
1231
1232 queue_delayed_work(ocfs2_wq, &osb->osb_truncate_log_wq,
1233 OCFS2_TRUNCATE_LOG_FLUSH_INTERVAL);
1234 }
1235}
1236
1237static int ocfs2_get_truncate_log_info(struct ocfs2_super *osb,
1238 int slot_num,
1239 struct inode **tl_inode,
1240 struct buffer_head **tl_bh)
1241{
1242 int status;
1243 struct inode *inode = NULL;
1244 struct buffer_head *bh = NULL;
1245
1246 inode = ocfs2_get_system_file_inode(osb,
1247 TRUNCATE_LOG_SYSTEM_INODE,
1248 slot_num);
1249 if (!inode) {
1250 status = -EINVAL;
1251 mlog(ML_ERROR, "Could not get load truncate log inode!\n");
1252 goto bail;
1253 }
1254
1255 status = ocfs2_read_block(osb, OCFS2_I(inode)->ip_blkno, &bh,
1256 OCFS2_BH_CACHED, inode);
1257 if (status < 0) {
1258 iput(inode);
1259 mlog_errno(status);
1260 goto bail;
1261 }
1262
1263 *tl_inode = inode;
1264 *tl_bh = bh;
1265bail:
1266 mlog_exit(status);
1267 return status;
1268}
1269
1270/* called during the 1st stage of node recovery. we stamp a clean
1271 * truncate log and pass back a copy for processing later. if the
1272 * truncate log does not require processing, a *tl_copy is set to
1273 * NULL. */
1274int ocfs2_begin_truncate_log_recovery(struct ocfs2_super *osb,
1275 int slot_num,
1276 struct ocfs2_dinode **tl_copy)
1277{
1278 int status;
1279 struct inode *tl_inode = NULL;
1280 struct buffer_head *tl_bh = NULL;
1281 struct ocfs2_dinode *di;
1282 struct ocfs2_truncate_log *tl;
1283
1284 *tl_copy = NULL;
1285
1286 mlog(0, "recover truncate log from slot %d\n", slot_num);
1287
1288 status = ocfs2_get_truncate_log_info(osb, slot_num, &tl_inode, &tl_bh);
1289 if (status < 0) {
1290 mlog_errno(status);
1291 goto bail;
1292 }
1293
1294 di = (struct ocfs2_dinode *) tl_bh->b_data;
1295 tl = &di->id2.i_dealloc;
1296 if (!OCFS2_IS_VALID_DINODE(di)) {
1297 OCFS2_RO_ON_INVALID_DINODE(tl_inode->i_sb, di);
1298 status = -EIO;
1299 goto bail;
1300 }
1301
1302 if (le16_to_cpu(tl->tl_used)) {
1303 mlog(0, "We'll have %u logs to recover\n",
1304 le16_to_cpu(tl->tl_used));
1305
1306 *tl_copy = kmalloc(tl_bh->b_size, GFP_KERNEL);
1307 if (!(*tl_copy)) {
1308 status = -ENOMEM;
1309 mlog_errno(status);
1310 goto bail;
1311 }
1312
1313 /* Assuming the write-out below goes well, this copy
1314 * will be passed back to recovery for processing. */
1315 memcpy(*tl_copy, tl_bh->b_data, tl_bh->b_size);
1316
1317 /* All we need to do to clear the truncate log is set
1318 * tl_used. */
1319 tl->tl_used = 0;
1320
1321 status = ocfs2_write_block(osb, tl_bh, tl_inode);
1322 if (status < 0) {
1323 mlog_errno(status);
1324 goto bail;
1325 }
1326 }
1327
1328bail:
1329 if (tl_inode)
1330 iput(tl_inode);
1331 if (tl_bh)
1332 brelse(tl_bh);
1333
1334 if (status < 0 && (*tl_copy)) {
1335 kfree(*tl_copy);
1336 *tl_copy = NULL;
1337 }
1338
1339 mlog_exit(status);
1340 return status;
1341}
1342
1343int ocfs2_complete_truncate_log_recovery(struct ocfs2_super *osb,
1344 struct ocfs2_dinode *tl_copy)
1345{
1346 int status = 0;
1347 int i;
1348 unsigned int clusters, num_recs, start_cluster;
1349 u64 start_blk;
1350 struct ocfs2_journal_handle *handle;
1351 struct inode *tl_inode = osb->osb_tl_inode;
1352 struct ocfs2_truncate_log *tl;
1353
1354 mlog_entry_void();
1355
1356 if (OCFS2_I(tl_inode)->ip_blkno == le64_to_cpu(tl_copy->i_blkno)) {
1357 mlog(ML_ERROR, "Asked to recover my own truncate log!\n");
1358 return -EINVAL;
1359 }
1360
1361 tl = &tl_copy->id2.i_dealloc;
1362 num_recs = le16_to_cpu(tl->tl_used);
1363 mlog(0, "cleanup %u records from %"MLFu64"\n", num_recs,
1364 tl_copy->i_blkno);
1365
1366 down(&tl_inode->i_sem);
1367 for(i = 0; i < num_recs; i++) {
1368 if (ocfs2_truncate_log_needs_flush(osb)) {
1369 status = __ocfs2_flush_truncate_log(osb);
1370 if (status < 0) {
1371 mlog_errno(status);
1372 goto bail_up;
1373 }
1374 }
1375
1376 handle = ocfs2_start_trans(osb, NULL,
1377 OCFS2_TRUNCATE_LOG_UPDATE);
1378 if (IS_ERR(handle)) {
1379 status = PTR_ERR(handle);
1380 mlog_errno(status);
1381 goto bail_up;
1382 }
1383
1384 clusters = le32_to_cpu(tl->tl_recs[i].t_clusters);
1385 start_cluster = le32_to_cpu(tl->tl_recs[i].t_start);
1386 start_blk = ocfs2_clusters_to_blocks(osb->sb, start_cluster);
1387
1388 status = ocfs2_truncate_log_append(osb, handle,
1389 start_blk, clusters);
1390 ocfs2_commit_trans(handle);
1391 if (status < 0) {
1392 mlog_errno(status);
1393 goto bail_up;
1394 }
1395 }
1396
1397bail_up:
1398 up(&tl_inode->i_sem);
1399
1400 mlog_exit(status);
1401 return status;
1402}
1403
1404void ocfs2_truncate_log_shutdown(struct ocfs2_super *osb)
1405{
1406 int status;
1407 struct inode *tl_inode = osb->osb_tl_inode;
1408
1409 mlog_entry_void();
1410
1411 if (tl_inode) {
1412 cancel_delayed_work(&osb->osb_truncate_log_wq);
1413 flush_workqueue(ocfs2_wq);
1414
1415 status = ocfs2_flush_truncate_log(osb);
1416 if (status < 0)
1417 mlog_errno(status);
1418
1419 brelse(osb->osb_tl_bh);
1420 iput(osb->osb_tl_inode);
1421 }
1422
1423 mlog_exit_void();
1424}
1425
1426int ocfs2_truncate_log_init(struct ocfs2_super *osb)
1427{
1428 int status;
1429 struct inode *tl_inode = NULL;
1430 struct buffer_head *tl_bh = NULL;
1431
1432 mlog_entry_void();
1433
1434 status = ocfs2_get_truncate_log_info(osb,
1435 osb->slot_num,
1436 &tl_inode,
1437 &tl_bh);
1438 if (status < 0)
1439 mlog_errno(status);
1440
1441 /* ocfs2_truncate_log_shutdown keys on the existence of
1442 * osb->osb_tl_inode so we don't set any of the osb variables
1443 * until we're sure all is well. */
1444 INIT_WORK(&osb->osb_truncate_log_wq, ocfs2_truncate_log_worker, osb);
1445 osb->osb_tl_bh = tl_bh;
1446 osb->osb_tl_inode = tl_inode;
1447
1448 mlog_exit(status);
1449 return status;
1450}
1451
1452/* This function will figure out whether the currently last extent
1453 * block will be deleted, and if it will, what the new last extent
1454 * block will be so we can update his h_next_leaf_blk field, as well
1455 * as the dinodes i_last_eb_blk */
1456static int ocfs2_find_new_last_ext_blk(struct ocfs2_super *osb,
1457 struct inode *inode,
1458 struct ocfs2_dinode *fe,
1459 u32 new_i_clusters,
1460 struct buffer_head *old_last_eb,
1461 struct buffer_head **new_last_eb)
1462{
1463 int i, status = 0;
1464 u64 block = 0;
1465 struct ocfs2_extent_block *eb;
1466 struct ocfs2_extent_list *el;
1467 struct buffer_head *bh = NULL;
1468
1469 *new_last_eb = NULL;
1470
1471 if (!OCFS2_IS_VALID_DINODE(fe)) {
1472 OCFS2_RO_ON_INVALID_DINODE(inode->i_sb, fe);
1473 status = -EIO;
1474 goto bail;
1475 }
1476
1477 /* we have no tree, so of course, no last_eb. */
1478 if (!fe->id2.i_list.l_tree_depth)
1479 goto bail;
1480
1481 /* trunc to zero special case - this makes tree_depth = 0
1482 * regardless of what it is. */
1483 if (!new_i_clusters)
1484 goto bail;
1485
1486 eb = (struct ocfs2_extent_block *) old_last_eb->b_data;
1487 el = &(eb->h_list);
1488 BUG_ON(!el->l_next_free_rec);
1489
1490 /* Make sure that this guy will actually be empty after we
1491 * clear away the data. */
1492 if (le32_to_cpu(el->l_recs[0].e_cpos) < new_i_clusters)
1493 goto bail;
1494
1495 /* Ok, at this point, we know that last_eb will definitely
1496 * change, so lets traverse the tree and find the second to
1497 * last extent block. */
1498 el = &(fe->id2.i_list);
1499 /* go down the tree, */
1500 do {
1501 for(i = (le16_to_cpu(el->l_next_free_rec) - 1); i >= 0; i--) {
1502 if (le32_to_cpu(el->l_recs[i].e_cpos) <
1503 new_i_clusters) {
1504 block = le64_to_cpu(el->l_recs[i].e_blkno);
1505 break;
1506 }
1507 }
1508 BUG_ON(i < 0);
1509
1510 if (bh) {
1511 brelse(bh);
1512 bh = NULL;
1513 }
1514
1515 status = ocfs2_read_block(osb, block, &bh, OCFS2_BH_CACHED,
1516 inode);
1517 if (status < 0) {
1518 mlog_errno(status);
1519 goto bail;
1520 }
1521 eb = (struct ocfs2_extent_block *) bh->b_data;
1522 el = &eb->h_list;
1523 if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) {
1524 OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb, eb);
1525 status = -EIO;
1526 goto bail;
1527 }
1528 } while (el->l_tree_depth);
1529
1530 *new_last_eb = bh;
1531 get_bh(*new_last_eb);
1532 mlog(0, "returning block %"MLFu64"\n", le64_to_cpu(eb->h_blkno));
1533bail:
1534 if (bh)
1535 brelse(bh);
1536
1537 return status;
1538}
1539
1540static int ocfs2_do_truncate(struct ocfs2_super *osb,
1541 unsigned int clusters_to_del,
1542 struct inode *inode,
1543 struct buffer_head *fe_bh,
1544 struct buffer_head *old_last_eb_bh,
1545 struct ocfs2_journal_handle *handle,
1546 struct ocfs2_truncate_context *tc)
1547{
1548 int status, i, depth;
1549 struct ocfs2_dinode *fe;
1550 struct ocfs2_extent_block *eb;
1551 struct ocfs2_extent_block *last_eb = NULL;
1552 struct ocfs2_extent_list *el;
1553 struct buffer_head *eb_bh = NULL;
1554 struct buffer_head *last_eb_bh = NULL;
1555 u64 next_eb = 0;
1556 u64 delete_blk = 0;
1557
1558 fe = (struct ocfs2_dinode *) fe_bh->b_data;
1559
1560 status = ocfs2_find_new_last_ext_blk(osb,
1561 inode,
1562 fe,
1563 le32_to_cpu(fe->i_clusters) -
1564 clusters_to_del,
1565 old_last_eb_bh,
1566 &last_eb_bh);
1567 if (status < 0) {
1568 mlog_errno(status);
1569 goto bail;
1570 }
1571 if (last_eb_bh)
1572 last_eb = (struct ocfs2_extent_block *) last_eb_bh->b_data;
1573
1574 status = ocfs2_journal_access(handle, inode, fe_bh,
1575 OCFS2_JOURNAL_ACCESS_WRITE);
1576 if (status < 0) {
1577 mlog_errno(status);
1578 goto bail;
1579 }
1580 el = &(fe->id2.i_list);
1581
1582 spin_lock(&OCFS2_I(inode)->ip_lock);
1583 OCFS2_I(inode)->ip_clusters = le32_to_cpu(fe->i_clusters) -
1584 clusters_to_del;
1585 spin_unlock(&OCFS2_I(inode)->ip_lock);
1586 le32_add_cpu(&fe->i_clusters, -clusters_to_del);
1587 fe->i_mtime = cpu_to_le64(CURRENT_TIME.tv_sec);
1588 fe->i_mtime_nsec = cpu_to_le32(CURRENT_TIME.tv_nsec);
1589
1590 i = le16_to_cpu(el->l_next_free_rec) - 1;
1591
1592 BUG_ON(le32_to_cpu(el->l_recs[i].e_clusters) < clusters_to_del);
1593 le32_add_cpu(&el->l_recs[i].e_clusters, -clusters_to_del);
1594 /* tree depth zero, we can just delete the clusters, otherwise
1595 * we need to record the offset of the next level extent block
1596 * as we may overwrite it. */
1597 if (!el->l_tree_depth)
1598 delete_blk = le64_to_cpu(el->l_recs[i].e_blkno)
1599 + ocfs2_clusters_to_blocks(osb->sb,
1600 le32_to_cpu(el->l_recs[i].e_clusters));
1601 else
1602 next_eb = le64_to_cpu(el->l_recs[i].e_blkno);
1603
1604 if (!el->l_recs[i].e_clusters) {
1605 /* if we deleted the whole extent record, then clear
1606 * out the other fields and update the extent
1607 * list. For depth > 0 trees, we've already recorded
1608 * the extent block in 'next_eb' */
1609 el->l_recs[i].e_cpos = 0;
1610 el->l_recs[i].e_blkno = 0;
1611 BUG_ON(!el->l_next_free_rec);
1612 le16_add_cpu(&el->l_next_free_rec, -1);
1613 }
1614
1615 depth = le16_to_cpu(el->l_tree_depth);
1616 if (!fe->i_clusters) {
1617 /* trunc to zero is a special case. */
1618 el->l_tree_depth = 0;
1619 fe->i_last_eb_blk = 0;
1620 } else if (last_eb)
1621 fe->i_last_eb_blk = last_eb->h_blkno;
1622
1623 status = ocfs2_journal_dirty(handle, fe_bh);
1624 if (status < 0) {
1625 mlog_errno(status);
1626 goto bail;
1627 }
1628
1629 if (last_eb) {
1630 /* If there will be a new last extent block, then by
1631 * definition, there cannot be any leaves to the right of
1632 * him. */
1633 status = ocfs2_journal_access(handle, inode, last_eb_bh,
1634 OCFS2_JOURNAL_ACCESS_WRITE);
1635 if (status < 0) {
1636 mlog_errno(status);
1637 goto bail;
1638 }
1639 last_eb->h_next_leaf_blk = 0;
1640 status = ocfs2_journal_dirty(handle, last_eb_bh);
1641 if (status < 0) {
1642 mlog_errno(status);
1643 goto bail;
1644 }
1645 }
1646
1647 /* if our tree depth > 0, update all the tree blocks below us. */
1648 while (depth) {
1649 mlog(0, "traveling tree (depth = %d, next_eb = %"MLFu64")\n",
1650 depth, next_eb);
1651 status = ocfs2_read_block(osb, next_eb, &eb_bh,
1652 OCFS2_BH_CACHED, inode);
1653 if (status < 0) {
1654 mlog_errno(status);
1655 goto bail;
1656 }
1657 eb = (struct ocfs2_extent_block *)eb_bh->b_data;
1658 if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) {
1659 OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb, eb);
1660 status = -EIO;
1661 goto bail;
1662 }
1663 el = &(eb->h_list);
1664
1665 status = ocfs2_journal_access(handle, inode, eb_bh,
1666 OCFS2_JOURNAL_ACCESS_WRITE);
1667 if (status < 0) {
1668 mlog_errno(status);
1669 goto bail;
1670 }
1671
1672 BUG_ON(le16_to_cpu(el->l_next_free_rec) == 0);
1673 BUG_ON(depth != (le16_to_cpu(el->l_tree_depth) + 1));
1674
1675 i = le16_to_cpu(el->l_next_free_rec) - 1;
1676
1677 mlog(0, "extent block %"MLFu64", before: record %d: "
1678 "(%u, %u, %"MLFu64"), next = %u\n",
1679 le64_to_cpu(eb->h_blkno), i,
1680 le32_to_cpu(el->l_recs[i].e_cpos),
1681 le32_to_cpu(el->l_recs[i].e_clusters),
1682 le64_to_cpu(el->l_recs[i].e_blkno),
1683 le16_to_cpu(el->l_next_free_rec));
1684
1685 BUG_ON(le32_to_cpu(el->l_recs[i].e_clusters) < clusters_to_del);
1686 le32_add_cpu(&el->l_recs[i].e_clusters, -clusters_to_del);
1687
1688 next_eb = le64_to_cpu(el->l_recs[i].e_blkno);
1689 /* bottom-most block requires us to delete data.*/
1690 if (!el->l_tree_depth)
1691 delete_blk = le64_to_cpu(el->l_recs[i].e_blkno)
1692 + ocfs2_clusters_to_blocks(osb->sb,
1693 le32_to_cpu(el->l_recs[i].e_clusters));
1694 if (!el->l_recs[i].e_clusters) {
1695 el->l_recs[i].e_cpos = 0;
1696 el->l_recs[i].e_blkno = 0;
1697 BUG_ON(!el->l_next_free_rec);
1698 le16_add_cpu(&el->l_next_free_rec, -1);
1699 }
1700 mlog(0, "extent block %"MLFu64", after: record %d: "
1701 "(%u, %u, %"MLFu64"), next = %u\n",
1702 le64_to_cpu(eb->h_blkno), i,
1703 le32_to_cpu(el->l_recs[i].e_cpos),
1704 le32_to_cpu(el->l_recs[i].e_clusters),
1705 le64_to_cpu(el->l_recs[i].e_blkno),
1706 le16_to_cpu(el->l_next_free_rec));
1707
1708 status = ocfs2_journal_dirty(handle, eb_bh);
1709 if (status < 0) {
1710 mlog_errno(status);
1711 goto bail;
1712 }
1713
1714 if (!el->l_next_free_rec) {
1715 mlog(0, "deleting this extent block.\n");
1716
1717 ocfs2_remove_from_cache(inode, eb_bh);
1718
1719 BUG_ON(eb->h_suballoc_slot);
1720 BUG_ON(el->l_recs[0].e_clusters);
1721 BUG_ON(el->l_recs[0].e_cpos);
1722 BUG_ON(el->l_recs[0].e_blkno);
1723 status = ocfs2_free_extent_block(handle,
1724 tc->tc_ext_alloc_inode,
1725 tc->tc_ext_alloc_bh,
1726 eb);
1727 if (status < 0) {
1728 mlog_errno(status);
1729 goto bail;
1730 }
1731 }
1732 brelse(eb_bh);
1733 eb_bh = NULL;
1734 depth--;
1735 }
1736
1737 BUG_ON(!delete_blk);
1738 status = ocfs2_truncate_log_append(osb, handle, delete_blk,
1739 clusters_to_del);
1740 if (status < 0) {
1741 mlog_errno(status);
1742 goto bail;
1743 }
1744 status = 0;
1745bail:
1746 if (!status)
1747 ocfs2_extent_map_trunc(inode, le32_to_cpu(fe->i_clusters));
1748 else
1749 ocfs2_extent_map_drop(inode, 0);
1750 mlog_exit(status);
1751 return status;
1752}
1753
1754/*
1755 * It is expected, that by the time you call this function,
1756 * inode->i_size and fe->i_size have been adjusted.
1757 *
1758 * WARNING: This will kfree the truncate context
1759 */
1760int ocfs2_commit_truncate(struct ocfs2_super *osb,
1761 struct inode *inode,
1762 struct buffer_head *fe_bh,
1763 struct ocfs2_truncate_context *tc)
1764{
1765 int status, i, credits, tl_sem = 0;
1766 u32 clusters_to_del, target_i_clusters;
1767 u64 last_eb = 0;
1768 struct ocfs2_dinode *fe;
1769 struct ocfs2_extent_block *eb;
1770 struct ocfs2_extent_list *el;
1771 struct buffer_head *last_eb_bh;
1772 struct ocfs2_journal_handle *handle = NULL;
1773 struct inode *tl_inode = osb->osb_tl_inode;
1774
1775 mlog_entry_void();
1776
1777 down_write(&OCFS2_I(inode)->ip_alloc_sem);
1778
1779 target_i_clusters = ocfs2_clusters_for_bytes(osb->sb,
1780 i_size_read(inode));
1781
1782 last_eb_bh = tc->tc_last_eb_bh;
1783 tc->tc_last_eb_bh = NULL;
1784
1785 fe = (struct ocfs2_dinode *) fe_bh->b_data;
1786
1787 if (fe->id2.i_list.l_tree_depth) {
1788 eb = (struct ocfs2_extent_block *) last_eb_bh->b_data;
1789 el = &eb->h_list;
1790 } else
1791 el = &fe->id2.i_list;
1792 last_eb = le64_to_cpu(fe->i_last_eb_blk);
1793start:
1794 mlog(0, "ocfs2_commit_truncate: fe->i_clusters = %u, "
1795 "last_eb = %"MLFu64", fe->i_last_eb_blk = %"MLFu64", "
1796 "fe->id2.i_list.l_tree_depth = %u last_eb_bh = %p\n",
1797 le32_to_cpu(fe->i_clusters), last_eb,
1798 le64_to_cpu(fe->i_last_eb_blk),
1799 le16_to_cpu(fe->id2.i_list.l_tree_depth), last_eb_bh);
1800
1801 if (last_eb != le64_to_cpu(fe->i_last_eb_blk)) {
1802 mlog(0, "last_eb changed!\n");
1803 BUG_ON(!fe->id2.i_list.l_tree_depth);
1804 last_eb = le64_to_cpu(fe->i_last_eb_blk);
1805 /* i_last_eb_blk may have changed, read it if
1806 * necessary. We don't have to worry about the
1807 * truncate to zero case here (where there becomes no
1808 * last_eb) because we never loop back after our work
1809 * is done. */
1810 if (last_eb_bh) {
1811 brelse(last_eb_bh);
1812 last_eb_bh = NULL;
1813 }
1814
1815 status = ocfs2_read_block(osb, last_eb,
1816 &last_eb_bh, OCFS2_BH_CACHED,
1817 inode);
1818 if (status < 0) {
1819 mlog_errno(status);
1820 goto bail;
1821 }
1822 eb = (struct ocfs2_extent_block *) last_eb_bh->b_data;
1823 if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) {
1824 OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb, eb);
1825 status = -EIO;
1826 goto bail;
1827 }
1828 el = &(eb->h_list);
1829 }
1830
1831 /* by now, el will point to the extent list on the bottom most
1832 * portion of this tree. */
1833 i = le16_to_cpu(el->l_next_free_rec) - 1;
1834 if (le32_to_cpu(el->l_recs[i].e_cpos) >= target_i_clusters)
1835 clusters_to_del = le32_to_cpu(el->l_recs[i].e_clusters);
1836 else
1837 clusters_to_del = (le32_to_cpu(el->l_recs[i].e_clusters) +
1838 le32_to_cpu(el->l_recs[i].e_cpos)) -
1839 target_i_clusters;
1840
1841 mlog(0, "clusters_to_del = %u in this pass\n", clusters_to_del);
1842
1843 down(&tl_inode->i_sem);
1844 tl_sem = 1;
1845 /* ocfs2_truncate_log_needs_flush guarantees us at least one
1846 * record is free for use. If there isn't any, we flush to get
1847 * an empty truncate log. */
1848 if (ocfs2_truncate_log_needs_flush(osb)) {
1849 status = __ocfs2_flush_truncate_log(osb);
1850 if (status < 0) {
1851 mlog_errno(status);
1852 goto bail;
1853 }
1854 }
1855
1856 credits = ocfs2_calc_tree_trunc_credits(osb->sb, clusters_to_del,
1857 fe, el);
1858 handle = ocfs2_start_trans(osb, NULL, credits);
1859 if (IS_ERR(handle)) {
1860 status = PTR_ERR(handle);
1861 handle = NULL;
1862 mlog_errno(status);
1863 goto bail;
1864 }
1865
1866 inode->i_ctime = inode->i_mtime = CURRENT_TIME;
1867 status = ocfs2_mark_inode_dirty(handle, inode, fe_bh);
1868 if (status < 0)
1869 mlog_errno(status);
1870
1871 status = ocfs2_do_truncate(osb, clusters_to_del, inode, fe_bh,
1872 last_eb_bh, handle, tc);
1873 if (status < 0) {
1874 mlog_errno(status);
1875 goto bail;
1876 }
1877
1878 up(&tl_inode->i_sem);
1879 tl_sem = 0;
1880
1881 ocfs2_commit_trans(handle);
1882 handle = NULL;
1883
1884 BUG_ON(le32_to_cpu(fe->i_clusters) < target_i_clusters);
1885 if (le32_to_cpu(fe->i_clusters) > target_i_clusters)
1886 goto start;
1887bail:
1888 up_write(&OCFS2_I(inode)->ip_alloc_sem);
1889
1890 ocfs2_schedule_truncate_log_flush(osb, 1);
1891
1892 if (tl_sem)
1893 up(&tl_inode->i_sem);
1894
1895 if (handle)
1896 ocfs2_commit_trans(handle);
1897
1898 if (last_eb_bh)
1899 brelse(last_eb_bh);
1900
1901 /* This will drop the ext_alloc cluster lock for us */
1902 ocfs2_free_truncate_context(tc);
1903
1904 mlog_exit(status);
1905 return status;
1906}
1907
1908
1909/*
1910 * Expects the inode to already be locked. This will figure out which
1911 * inodes need to be locked and will put them on the returned truncate
1912 * context.
1913 */
1914int ocfs2_prepare_truncate(struct ocfs2_super *osb,
1915 struct inode *inode,
1916 struct buffer_head *fe_bh,
1917 struct ocfs2_truncate_context **tc)
1918{
1919 int status, metadata_delete;
1920 unsigned int new_i_clusters;
1921 struct ocfs2_dinode *fe;
1922 struct ocfs2_extent_block *eb;
1923 struct ocfs2_extent_list *el;
1924 struct buffer_head *last_eb_bh = NULL;
1925 struct inode *ext_alloc_inode = NULL;
1926 struct buffer_head *ext_alloc_bh = NULL;
1927
1928 mlog_entry_void();
1929
1930 *tc = NULL;
1931
1932 new_i_clusters = ocfs2_clusters_for_bytes(osb->sb,
1933 i_size_read(inode));
1934 fe = (struct ocfs2_dinode *) fe_bh->b_data;
1935
1936 mlog(0, "fe->i_clusters = %u, new_i_clusters = %u, fe->i_size ="
1937 "%"MLFu64"\n", fe->i_clusters, new_i_clusters, fe->i_size);
1938
1939 if (le32_to_cpu(fe->i_clusters) <= new_i_clusters) {
1940 ocfs2_error(inode->i_sb, "Dinode %"MLFu64" has cluster count "
1941 "%u and size %"MLFu64" whereas struct inode has "
1942 "cluster count %u and size %llu which caused an "
1943 "invalid truncate to %u clusters.",
1944 le64_to_cpu(fe->i_blkno),
1945 le32_to_cpu(fe->i_clusters),
1946 le64_to_cpu(fe->i_size),
1947 OCFS2_I(inode)->ip_clusters, i_size_read(inode),
1948 new_i_clusters);
1949 mlog_meta_lvb(ML_ERROR, &OCFS2_I(inode)->ip_meta_lockres);
1950 status = -EIO;
1951 goto bail;
1952 }
1953
1954 *tc = kcalloc(1, sizeof(struct ocfs2_truncate_context), GFP_KERNEL);
1955 if (!(*tc)) {
1956 status = -ENOMEM;
1957 mlog_errno(status);
1958 goto bail;
1959 }
1960
1961 metadata_delete = 0;
1962 if (fe->id2.i_list.l_tree_depth) {
1963 /* If we have a tree, then the truncate may result in
1964 * metadata deletes. Figure this out from the
1965 * rightmost leaf block.*/
1966 status = ocfs2_read_block(osb, le64_to_cpu(fe->i_last_eb_blk),
1967 &last_eb_bh, OCFS2_BH_CACHED, inode);
1968 if (status < 0) {
1969 mlog_errno(status);
1970 goto bail;
1971 }
1972 eb = (struct ocfs2_extent_block *) last_eb_bh->b_data;
1973 if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) {
1974 OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb, eb);
1975
1976 brelse(last_eb_bh);
1977 status = -EIO;
1978 goto bail;
1979 }
1980 el = &(eb->h_list);
1981 if (le32_to_cpu(el->l_recs[0].e_cpos) >= new_i_clusters)
1982 metadata_delete = 1;
1983 }
1984
1985 (*tc)->tc_last_eb_bh = last_eb_bh;
1986
1987 if (metadata_delete) {
1988 mlog(0, "Will have to delete metadata for this trunc. "
1989 "locking allocator.\n");
1990 ext_alloc_inode = ocfs2_get_system_file_inode(osb, EXTENT_ALLOC_SYSTEM_INODE, 0);
1991 if (!ext_alloc_inode) {
1992 status = -ENOMEM;
1993 mlog_errno(status);
1994 goto bail;
1995 }
1996
1997 down(&ext_alloc_inode->i_sem);
1998 (*tc)->tc_ext_alloc_inode = ext_alloc_inode;
1999
2000 status = ocfs2_meta_lock(ext_alloc_inode,
2001 NULL,
2002 &ext_alloc_bh,
2003 1);
2004 if (status < 0) {
2005 mlog_errno(status);
2006 goto bail;
2007 }
2008 (*tc)->tc_ext_alloc_bh = ext_alloc_bh;
2009 (*tc)->tc_ext_alloc_locked = 1;
2010 }
2011
2012 status = 0;
2013bail:
2014 if (status < 0) {
2015 if (*tc)
2016 ocfs2_free_truncate_context(*tc);
2017 *tc = NULL;
2018 }
2019 mlog_exit_void();
2020 return status;
2021}
2022
2023static void ocfs2_free_truncate_context(struct ocfs2_truncate_context *tc)
2024{
2025 if (tc->tc_ext_alloc_inode) {
2026 if (tc->tc_ext_alloc_locked)
2027 ocfs2_meta_unlock(tc->tc_ext_alloc_inode, 1);
2028
2029 up(&tc->tc_ext_alloc_inode->i_sem);
2030 iput(tc->tc_ext_alloc_inode);
2031 }
2032
2033 if (tc->tc_ext_alloc_bh)
2034 brelse(tc->tc_ext_alloc_bh);
2035
2036 if (tc->tc_last_eb_bh)
2037 brelse(tc->tc_last_eb_bh);
2038
2039 kfree(tc);
2040}
diff --git a/fs/ocfs2/alloc.h b/fs/ocfs2/alloc.h
new file mode 100644
index 000000000000..12ba897743f4
--- /dev/null
+++ b/fs/ocfs2/alloc.h
@@ -0,0 +1,82 @@
1/* -*- mode: c; c-basic-offset: 8; -*-
2 * vim: noexpandtab sw=8 ts=8 sts=0:
3 *
4 * alloc.h
5 *
6 * Function prototypes
7 *
8 * Copyright (C) 2002, 2004 Oracle. All rights reserved.
9 *
10 * This program is free software; you can redistribute it and/or
11 * modify it under the terms of the GNU General Public
12 * License as published by the Free Software Foundation; either
13 * version 2 of the License, or (at your option) any later version.
14 *
15 * This program is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
18 * General Public License for more details.
19 *
20 * You should have received a copy of the GNU General Public
21 * License along with this program; if not, write to the
22 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
23 * Boston, MA 021110-1307, USA.
24 */
25
26#ifndef OCFS2_ALLOC_H
27#define OCFS2_ALLOC_H
28
/* Only pointers to this type are used in this header, so a forward
 * declaration is enough. */
struct ocfs2_alloc_context;

/*
 * Extent insertion entry point.  Judging by the parameter names, it records
 * new_clusters clusters starting at disk block blkno for the inode whose
 * dinode lives in fe_bh, drawing any metadata it needs from meta_ac --
 * NOTE(review): the implementation is not visible from this header; confirm
 * against alloc.c.
 */
int ocfs2_insert_extent(struct ocfs2_super *osb,
			struct ocfs2_journal_handle *handle,
			struct inode *inode,
			struct buffer_head *fe_bh,
			u64 blkno,
			u32 new_clusters,
			struct ocfs2_alloc_context *meta_ac);

/* Presumably reports how many extent records are still free for the given
 * dinode (negative on error) -- TODO confirm against alloc.c. */
int ocfs2_num_free_extents(struct ocfs2_super *osb,
			   struct inode *inode,
			   struct ocfs2_dinode *fe);
40/* how many new metadata chunks would an allocation need at maximum? */
41static inline int ocfs2_extend_meta_needed(struct ocfs2_dinode *fe)
42{
43 /*
44 * Rather than do all the work of determining how much we need
45 * (involves a ton of reads and locks), just ask for the
46 * maximal limit. That's a tree depth shift. So, one block for
47 * level of the tree (current l_tree_depth), one block for the
48 * new tree_depth==0 extent_block, and one block at the new
49 * top-of-the tree.
50 */
51 return le16_to_cpu(fe->id2.i_list.l_tree_depth) + 2;
52}
53
/*
 * Truncate log interface.  Only the prototypes are visible here; the notes
 * below are inferred from the names and should be confirmed against alloc.c.
 */

/* Set up / tear down the per-superblock truncate log state. */
int ocfs2_truncate_log_init(struct ocfs2_super *osb);
void ocfs2_truncate_log_shutdown(struct ocfs2_super *osb);

/* Queue a deferred flush of the truncate log; "cancel" presumably replaces
 * an already pending flush -- TODO confirm. */
void ocfs2_schedule_truncate_log_flush(struct ocfs2_super *osb,
				       int cancel);

/* Flush the truncate log now. */
int ocfs2_flush_truncate_log(struct ocfs2_super *osb);

/* Two-step recovery of the truncate log belonging to slot_num: "begin"
 * hands back a copy through *tl_copy, "complete" consumes that copy. */
int ocfs2_begin_truncate_log_recovery(struct ocfs2_super *osb,
				      int slot_num,
				      struct ocfs2_dinode **tl_copy);
int ocfs2_complete_truncate_log_recovery(struct ocfs2_super *osb,
					 struct ocfs2_dinode *tl_copy);
64
/*
 * State carried from ocfs2_prepare_truncate() to ocfs2_commit_truncate().
 * ocfs2_free_truncate_context() releases everything recorded here.
 */
struct ocfs2_truncate_context {
	/* Extent allocator system inode; set only when metadata will have to
	 * be deleted.  While set, its i_sem is held and a reference is kept. */
	struct inode *tc_ext_alloc_inode;
	/* Buffer head filled in by ocfs2_meta_lock() on the allocator inode. */
	struct buffer_head *tc_ext_alloc_bh;
	int tc_ext_alloc_locked; /* is it cluster locked? */
	/* these get destroyed once it's passed to ocfs2_commit_truncate. */
	/* Buffer head of the last extent block read during prepare (may be
	 * NULL). */
	struct buffer_head *tc_last_eb_bh;
};

/* Build a truncate context in *tc.  On failure the partially built context
 * is freed and *tc is set to NULL. */
int ocfs2_prepare_truncate(struct ocfs2_super *osb,
			   struct inode *inode,
			   struct buffer_head *fe_bh,
			   struct ocfs2_truncate_context **tc);
/* Carry out the truncate described by tc -- implementation not visible in
 * this header. */
int ocfs2_commit_truncate(struct ocfs2_super *osb,
			  struct inode *inode,
			  struct buffer_head *fe_bh,
			  struct ocfs2_truncate_context *tc);
81
82#endif /* OCFS2_ALLOC_H */
diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c
new file mode 100644
index 000000000000..8f4467a930a5
--- /dev/null
+++ b/fs/ocfs2/aops.c
@@ -0,0 +1,643 @@
1/* -*- mode: c; c-basic-offset: 8; -*-
2 * vim: noexpandtab sw=8 ts=8 sts=0:
3 *
4 * Copyright (C) 2002, 2004 Oracle. All rights reserved.
5 *
6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public
8 * License as published by the Free Software Foundation; either
9 * version 2 of the License, or (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 * General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public
17 * License along with this program; if not, write to the
18 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
19 * Boston, MA 021110-1307, USA.
20 */
21
22#include <linux/fs.h>
23#include <linux/slab.h>
24#include <linux/highmem.h>
25#include <linux/pagemap.h>
26#include <asm/byteorder.h>
27
28#define MLOG_MASK_PREFIX ML_FILE_IO
29#include <cluster/masklog.h>
30
31#include "ocfs2.h"
32
33#include "alloc.h"
34#include "aops.h"
35#include "dlmglue.h"
36#include "extent_map.h"
37#include "file.h"
38#include "inode.h"
39#include "journal.h"
40#include "super.h"
41#include "symlink.h"
42
43#include "buffer_head_io.h"
44
45static int ocfs2_symlink_get_block(struct inode *inode, sector_t iblock,
46 struct buffer_head *bh_result, int create)
47{
48 int err = -EIO;
49 int status;
50 struct ocfs2_dinode *fe = NULL;
51 struct buffer_head *bh = NULL;
52 struct buffer_head *buffer_cache_bh = NULL;
53 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
54 void *kaddr;
55
56 mlog_entry("(0x%p, %llu, 0x%p, %d)\n", inode,
57 (unsigned long long)iblock, bh_result, create);
58
59 BUG_ON(ocfs2_inode_is_fast_symlink(inode));
60
61 if ((iblock << inode->i_sb->s_blocksize_bits) > PATH_MAX + 1) {
62 mlog(ML_ERROR, "block offset > PATH_MAX: %llu",
63 (unsigned long long)iblock);
64 goto bail;
65 }
66
67 status = ocfs2_read_block(OCFS2_SB(inode->i_sb),
68 OCFS2_I(inode)->ip_blkno,
69 &bh, OCFS2_BH_CACHED, inode);
70 if (status < 0) {
71 mlog_errno(status);
72 goto bail;
73 }
74 fe = (struct ocfs2_dinode *) bh->b_data;
75
76 if (!OCFS2_IS_VALID_DINODE(fe)) {
77 mlog(ML_ERROR, "Invalid dinode #%"MLFu64": signature = %.*s\n",
78 fe->i_blkno, 7, fe->i_signature);
79 goto bail;
80 }
81
82 if ((u64)iblock >= ocfs2_clusters_to_blocks(inode->i_sb,
83 le32_to_cpu(fe->i_clusters))) {
84 mlog(ML_ERROR, "block offset is outside the allocated size: "
85 "%llu\n", (unsigned long long)iblock);
86 goto bail;
87 }
88
89 /* We don't use the page cache to create symlink data, so if
90 * need be, copy it over from the buffer cache. */
91 if (!buffer_uptodate(bh_result) && ocfs2_inode_is_new(inode)) {
92 u64 blkno = le64_to_cpu(fe->id2.i_list.l_recs[0].e_blkno) +
93 iblock;
94 buffer_cache_bh = sb_getblk(osb->sb, blkno);
95 if (!buffer_cache_bh) {
96 mlog(ML_ERROR, "couldn't getblock for symlink!\n");
97 goto bail;
98 }
99
100 /* we haven't locked out transactions, so a commit
101 * could've happened. Since we've got a reference on
102 * the bh, even if it commits while we're doing the
103 * copy, the data is still good. */
104 if (buffer_jbd(buffer_cache_bh)
105 && ocfs2_inode_is_new(inode)) {
106 kaddr = kmap_atomic(bh_result->b_page, KM_USER0);
107 if (!kaddr) {
108 mlog(ML_ERROR, "couldn't kmap!\n");
109 goto bail;
110 }
111 memcpy(kaddr + (bh_result->b_size * iblock),
112 buffer_cache_bh->b_data,
113 bh_result->b_size);
114 kunmap_atomic(kaddr, KM_USER0);
115 set_buffer_uptodate(bh_result);
116 }
117 brelse(buffer_cache_bh);
118 }
119
120 map_bh(bh_result, inode->i_sb,
121 le64_to_cpu(fe->id2.i_list.l_recs[0].e_blkno) + iblock);
122
123 err = 0;
124
125bail:
126 if (bh)
127 brelse(bh);
128
129 mlog_exit(err);
130 return err;
131}
132
133static int ocfs2_get_block(struct inode *inode, sector_t iblock,
134 struct buffer_head *bh_result, int create)
135{
136 int err = 0;
137 u64 p_blkno, past_eof;
138
139 mlog_entry("(0x%p, %llu, 0x%p, %d)\n", inode,
140 (unsigned long long)iblock, bh_result, create);
141
142 if (OCFS2_I(inode)->ip_flags & OCFS2_INODE_SYSTEM_FILE)
143 mlog(ML_NOTICE, "get_block on system inode 0x%p (%lu)\n",
144 inode, inode->i_ino);
145
146 if (S_ISLNK(inode->i_mode)) {
147 /* this always does I/O for some reason. */
148 err = ocfs2_symlink_get_block(inode, iblock, bh_result, create);
149 goto bail;
150 }
151
152 /* this can happen if another node truncs after our extend! */
153 spin_lock(&OCFS2_I(inode)->ip_lock);
154 if (iblock >= ocfs2_clusters_to_blocks(inode->i_sb,
155 OCFS2_I(inode)->ip_clusters))
156 err = -EIO;
157 spin_unlock(&OCFS2_I(inode)->ip_lock);
158 if (err)
159 goto bail;
160
161 err = ocfs2_extent_map_get_blocks(inode, iblock, 1, &p_blkno,
162 NULL);
163 if (err) {
164 mlog(ML_ERROR, "Error %d from get_blocks(0x%p, %llu, 1, "
165 "%"MLFu64", NULL)\n", err, inode,
166 (unsigned long long)iblock, p_blkno);
167 goto bail;
168 }
169
170 map_bh(bh_result, inode->i_sb, p_blkno);
171
172 if (bh_result->b_blocknr == 0) {
173 err = -EIO;
174 mlog(ML_ERROR, "iblock = %llu p_blkno = %"MLFu64" "
175 "blkno=(%"MLFu64")\n", (unsigned long long)iblock,
176 p_blkno, OCFS2_I(inode)->ip_blkno);
177 }
178
179 past_eof = ocfs2_blocks_for_bytes(inode->i_sb, i_size_read(inode));
180 mlog(0, "Inode %lu, past_eof = %"MLFu64"\n", inode->i_ino, past_eof);
181
182 if (create && (iblock >= past_eof))
183 set_buffer_new(bh_result);
184
185bail:
186 if (err < 0)
187 err = -EIO;
188
189 mlog_exit(err);
190 return err;
191}
192
193static int ocfs2_readpage(struct file *file, struct page *page)
194{
195 struct inode *inode = page->mapping->host;
196 loff_t start = (loff_t)page->index << PAGE_CACHE_SHIFT;
197 int ret, unlock = 1;
198
199 mlog_entry("(0x%p, %lu)\n", file, (page ? page->index : 0));
200
201 ret = ocfs2_meta_lock_with_page(inode, NULL, NULL, 0, page);
202 if (ret != 0) {
203 if (ret == AOP_TRUNCATED_PAGE)
204 unlock = 0;
205 mlog_errno(ret);
206 goto out;
207 }
208
209 down_read(&OCFS2_I(inode)->ip_alloc_sem);
210
211 /*
212 * i_size might have just been updated as we grabed the meta lock. We
213 * might now be discovering a truncate that hit on another node.
214 * block_read_full_page->get_block freaks out if it is asked to read
215 * beyond the end of a file, so we check here. Callers
216 * (generic_file_read, fault->nopage) are clever enough to check i_size
217 * and notice that the page they just read isn't needed.
218 *
219 * XXX sys_readahead() seems to get that wrong?
220 */
221 if (start >= i_size_read(inode)) {
222 char *addr = kmap(page);
223 memset(addr, 0, PAGE_SIZE);
224 flush_dcache_page(page);
225 kunmap(page);
226 SetPageUptodate(page);
227 ret = 0;
228 goto out_alloc;
229 }
230
231 ret = ocfs2_data_lock_with_page(inode, 0, page);
232 if (ret != 0) {
233 if (ret == AOP_TRUNCATED_PAGE)
234 unlock = 0;
235 mlog_errno(ret);
236 goto out_alloc;
237 }
238
239 ret = block_read_full_page(page, ocfs2_get_block);
240 unlock = 0;
241
242 ocfs2_data_unlock(inode, 0);
243out_alloc:
244 up_read(&OCFS2_I(inode)->ip_alloc_sem);
245 ocfs2_meta_unlock(inode, 0);
246out:
247 if (unlock)
248 unlock_page(page);
249 mlog_exit(ret);
250 return ret;
251}
252
253/* Note: Because we don't support holes, our allocation has
254 * already happened (allocation writes zeros to the file data)
255 * so we don't have to worry about ordered writes in
256 * ocfs2_writepage.
257 *
258 * ->writepage is called during the process of invalidating the page cache
259 * during blocked lock processing. It can't block on any cluster locks
260 * to during block mapping. It's relying on the fact that the block
261 * mapping can't have disappeared under the dirty pages that it is
262 * being asked to write back.
263 */
264static int ocfs2_writepage(struct page *page, struct writeback_control *wbc)
265{
266 int ret;
267
268 mlog_entry("(0x%p)\n", page);
269
270 ret = block_write_full_page(page, ocfs2_get_block, wbc);
271
272 mlog_exit(ret);
273
274 return ret;
275}
276
277/*
278 * ocfs2_prepare_write() can be an outer-most ocfs2 call when it is called
279 * from loopback. It must be able to perform its own locking around
280 * ocfs2_get_block().
281 */
282int ocfs2_prepare_write(struct file *file, struct page *page,
283 unsigned from, unsigned to)
284{
285 struct inode *inode = page->mapping->host;
286 int ret;
287
288 mlog_entry("(0x%p, 0x%p, %u, %u)\n", file, page, from, to);
289
290 ret = ocfs2_meta_lock_with_page(inode, NULL, NULL, 0, page);
291 if (ret != 0) {
292 mlog_errno(ret);
293 goto out;
294 }
295
296 down_read(&OCFS2_I(inode)->ip_alloc_sem);
297
298 ret = block_prepare_write(page, from, to, ocfs2_get_block);
299
300 up_read(&OCFS2_I(inode)->ip_alloc_sem);
301
302 ocfs2_meta_unlock(inode, 0);
303out:
304 mlog_exit(ret);
305 return ret;
306}
307
/* Taken from ext3. We don't necessarily need the full blown
 * functionality yet, but IMHO it's better to cut and paste the whole
 * thing so we can avoid introducing our own bugs (and easily pick up
 * their fixes when they happen) --Mark
 *
 * Walks the circular b_this_page list of buffers starting at head and
 * calls fn(handle, bh) on every buffer that overlaps the byte range
 * [from, to) of the page.  Buffers entirely outside the range are skipped;
 * if partial is non-NULL it is set to 1 when such a skipped buffer is not
 * uptodate.
 *
 * The walk stops as soon as fn returns non-zero, and that value is
 * returned; otherwise 0 is returned. */
static int walk_page_buffers(	handle_t *handle,
				struct buffer_head *head,
				unsigned from,
				unsigned to,
				int *partial,
				int (*fn)(	handle_t *handle,
						struct buffer_head *bh))
{
	struct buffer_head *bh;
	unsigned block_start, block_end;
	unsigned blocksize = head->b_size;
	int err, ret = 0;
	struct buffer_head *next;

	/* "bh != head || !block_start" is true on the first pass (offset 0)
	 * and becomes false once the list has wrapped back around to head. */
	for (	bh = head, block_start = 0;
		ret == 0 && (bh != head || !block_start);
		block_start = block_end, bh = next)
	{
		/* Read the link before calling fn on the buffer. */
		next = bh->b_this_page;
		block_end = block_start + blocksize;
		if (block_end <= from || block_start >= to) {
			if (partial && !buffer_uptodate(bh))
				*partial = 1;
			continue;
		}
		err = (*fn)(handle, bh);
		if (!ret)
			ret = err;
	}
	return ret;
}
343
344struct ocfs2_journal_handle *ocfs2_start_walk_page_trans(struct inode *inode,
345 struct page *page,
346 unsigned from,
347 unsigned to)
348{
349 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
350 struct ocfs2_journal_handle *handle = NULL;
351 int ret = 0;
352
353 handle = ocfs2_start_trans(osb, NULL, OCFS2_INODE_UPDATE_CREDITS);
354 if (!handle) {
355 ret = -ENOMEM;
356 mlog_errno(ret);
357 goto out;
358 }
359
360 if (ocfs2_should_order_data(inode)) {
361 ret = walk_page_buffers(handle->k_handle,
362 page_buffers(page),
363 from, to, NULL,
364 ocfs2_journal_dirty_data);
365 if (ret < 0)
366 mlog_errno(ret);
367 }
368out:
369 if (ret) {
370 if (handle)
371 ocfs2_commit_trans(handle);
372 handle = ERR_PTR(ret);
373 }
374 return handle;
375}
376
/*
 * ->commit_write: finish a buffered write of bytes [from, to) within page.
 *
 * Takes the metadata lock (level 1 when the write extends i_size, level 0
 * otherwise) and the data lock at level 1, then calls
 * generic_commit_write().  For an extending write a transaction is started
 * beforehand and the new size and timestamps are copied into the on-disk
 * inode in di_bh and journalled.
 *
 * Locks, transaction and di_bh are released in reverse order through the
 * out_* labels.  Returns 0 on success or the first error encountered.
 */
static int ocfs2_commit_write(struct file *file, struct page *page,
			      unsigned from, unsigned to)
{
	int ret, extending = 0, locklevel = 0;
	loff_t new_i_size;
	struct buffer_head *di_bh = NULL;
	struct inode *inode = page->mapping->host;
	struct ocfs2_journal_handle *handle = NULL;

	mlog_entry("(0x%p, 0x%p, %u, %u)\n", file, page, from, to);

	/* NOTE: ocfs2_file_aio_write has ensured that it's safe for
	 * us to sample inode->i_size here without the metadata lock:
	 *
	 * 1) We're currently holding the inode alloc lock, so no
	 *    nodes can change it underneath us.
	 *
	 * 2) We've had to take the metadata lock at least once
	 *    already to check for extending writes, hence ensuring
	 *    that our current copy is also up to date.
	 */
	/* File offset of the byte just past the end of this write. */
	new_i_size = ((loff_t)page->index << PAGE_CACHE_SHIFT) + to;
	if (new_i_size > i_size_read(inode)) {
		extending = 1;
		locklevel = 1;
	}

	ret = ocfs2_meta_lock_with_page(inode, NULL, &di_bh, locklevel, page);
	if (ret != 0) {
		mlog_errno(ret);
		goto out;
	}

	ret = ocfs2_data_lock_with_page(inode, 1, page);
	if (ret != 0) {
		mlog_errno(ret);
		goto out_unlock_meta;
	}

	if (extending) {
		handle = ocfs2_start_walk_page_trans(inode, page, from, to);
		if (IS_ERR(handle)) {
			ret = PTR_ERR(handle);
			/* Reset so that nothing later mistakes the error
			 * pointer for a live handle. */
			handle = NULL;
			goto out_unlock_data;
		}

		/* Mark our buffer early. We'd rather catch this error up here
		 * as opposed to after a successful commit_write which would
		 * require us to set back inode->i_size. */
		ret = ocfs2_journal_access(handle, inode, di_bh,
					   OCFS2_JOURNAL_ACCESS_WRITE);
		if (ret < 0) {
			mlog_errno(ret);
			goto out_commit;
		}
	}

	/* might update i_size */
	ret = generic_commit_write(file, page, from, to);
	if (ret < 0) {
		mlog_errno(ret);
		goto out_commit;
	}

	if (extending) {
		loff_t size = (u64) i_size_read(inode);
		struct ocfs2_dinode *di =
			(struct ocfs2_dinode *)di_bh->b_data;

		/* ocfs2_mark_inode_dirty is too heavy to use here. */
		inode->i_blocks = ocfs2_align_bytes_to_sectors(size);
		inode->i_ctime = inode->i_mtime = CURRENT_TIME;

		/* Mirror the in-memory values into the on-disk inode. */
		di->i_size = cpu_to_le64(size);
		di->i_ctime = di->i_mtime =
				cpu_to_le64(inode->i_mtime.tv_sec);
		di->i_ctime_nsec = di->i_mtime_nsec =
				cpu_to_le32(inode->i_mtime.tv_nsec);

		ret = ocfs2_journal_dirty(handle, di_bh);
		if (ret < 0) {
			mlog_errno(ret);
			goto out_commit;
		}
	}

	/* An extending write must have left i_size exactly at the end of
	 * the range written. */
	BUG_ON(extending && (i_size_read(inode) != new_i_size));

out_commit:
	if (handle)
		ocfs2_commit_trans(handle);
out_unlock_data:
	ocfs2_data_unlock(inode, 1);
out_unlock_meta:
	ocfs2_meta_unlock(inode, locklevel);
out:
	if (di_bh)
		brelse(di_bh);

	mlog_exit(ret);
	return ret;
}
480
481static sector_t ocfs2_bmap(struct address_space *mapping, sector_t block)
482{
483 sector_t status;
484 u64 p_blkno = 0;
485 int err = 0;
486 struct inode *inode = mapping->host;
487
488 mlog_entry("(block = %llu)\n", (unsigned long long)block);
489
490 /* We don't need to lock journal system files, since they aren't
491 * accessed concurrently from multiple nodes.
492 */
493 if (!INODE_JOURNAL(inode)) {
494 err = ocfs2_meta_lock(inode, NULL, NULL, 0);
495 if (err) {
496 if (err != -ENOENT)
497 mlog_errno(err);
498 goto bail;
499 }
500 down_read(&OCFS2_I(inode)->ip_alloc_sem);
501 }
502
503 err = ocfs2_extent_map_get_blocks(inode, block, 1, &p_blkno,
504 NULL);
505
506 if (!INODE_JOURNAL(inode)) {
507 up_read(&OCFS2_I(inode)->ip_alloc_sem);
508 ocfs2_meta_unlock(inode, 0);
509 }
510
511 if (err) {
512 mlog(ML_ERROR, "get_blocks() failed, block = %llu\n",
513 (unsigned long long)block);
514 mlog_errno(err);
515 goto bail;
516 }
517
518
519bail:
520 status = err ? 0 : p_blkno;
521
522 mlog_exit((int)status);
523
524 return status;
525}
526
527/*
528 * TODO: Make this into a generic get_blocks function.
529 *
530 * From do_direct_io in direct-io.c:
531 * "So what we do is to permit the ->get_blocks function to populate
532 * bh.b_size with the size of IO which is permitted at this offset and
533 * this i_blkbits."
534 *
535 * This function is called directly from get_more_blocks in direct-io.c.
536 *
537 * called like this: dio->get_blocks(dio->inode, fs_startblk,
538 * fs_count, map_bh, dio->rw == WRITE);
539 */
540static int ocfs2_direct_IO_get_blocks(struct inode *inode, sector_t iblock,
541 unsigned long max_blocks,
542 struct buffer_head *bh_result, int create)
543{
544 int ret;
545 u64 vbo_max; /* file offset, max_blocks from iblock */
546 u64 p_blkno;
547 int contig_blocks;
548 unsigned char blocksize_bits;
549
550 if (!inode || !bh_result) {
551 mlog(ML_ERROR, "inode or bh_result is null\n");
552 return -EIO;
553 }
554
555 blocksize_bits = inode->i_sb->s_blocksize_bits;
556
557 /* This function won't even be called if the request isn't all
558 * nicely aligned and of the right size, so there's no need
559 * for us to check any of that. */
560
561 vbo_max = ((u64)iblock + max_blocks) << blocksize_bits;
562
563 spin_lock(&OCFS2_I(inode)->ip_lock);
564 if ((iblock + max_blocks) >
565 ocfs2_clusters_to_blocks(inode->i_sb,
566 OCFS2_I(inode)->ip_clusters)) {
567 spin_unlock(&OCFS2_I(inode)->ip_lock);
568 ret = -EIO;
569 goto bail;
570 }
571 spin_unlock(&OCFS2_I(inode)->ip_lock);
572
573 /* This figures out the size of the next contiguous block, and
574 * our logical offset */
575 ret = ocfs2_extent_map_get_blocks(inode, iblock, 1, &p_blkno,
576 &contig_blocks);
577 if (ret) {
578 mlog(ML_ERROR, "get_blocks() failed iblock=%llu\n",
579 (unsigned long long)iblock);
580 ret = -EIO;
581 goto bail;
582 }
583
584 map_bh(bh_result, inode->i_sb, p_blkno);
585
586 /* make sure we don't map more than max_blocks blocks here as
587 that's all the kernel will handle at this point. */
588 if (max_blocks < contig_blocks)
589 contig_blocks = max_blocks;
590 bh_result->b_size = contig_blocks << blocksize_bits;
591bail:
592 return ret;
593}
594
595/*
596 * ocfs2_dio_end_io is called by the dio core when a dio is finished. We're
597 * particularly interested in the aio/dio case. Like the core uses
598 * i_alloc_sem, we use the rw_lock DLM lock to protect io on one node from
599 * truncation on another.
600 */
601static void ocfs2_dio_end_io(struct kiocb *iocb,
602 loff_t offset,
603 ssize_t bytes,
604 void *private)
605{
606 struct inode *inode = iocb->ki_filp->f_dentry->d_inode;
607
608 /* this io's submitter should not have unlocked this before we could */
609 BUG_ON(!ocfs2_iocb_is_rw_locked(iocb));
610 ocfs2_iocb_clear_rw_locked(iocb);
611 up_read(&inode->i_alloc_sem);
612 ocfs2_rw_unlock(inode, 0);
613}
614
615static ssize_t ocfs2_direct_IO(int rw,
616 struct kiocb *iocb,
617 const struct iovec *iov,
618 loff_t offset,
619 unsigned long nr_segs)
620{
621 struct file *file = iocb->ki_filp;
622 struct inode *inode = file->f_dentry->d_inode->i_mapping->host;
623 int ret;
624
625 mlog_entry_void();
626 ret = blockdev_direct_IO_no_locking(rw, iocb, inode,
627 inode->i_sb->s_bdev, iov, offset,
628 nr_segs,
629 ocfs2_direct_IO_get_blocks,
630 ocfs2_dio_end_io);
631 mlog_exit(ret);
632 return ret;
633}
634
/*
 * Address space operations table.  Every handler except block_sync_page is
 * defined earlier in this file.
 */
struct address_space_operations ocfs2_aops = {
	.readpage	= ocfs2_readpage,
	.writepage	= ocfs2_writepage,
	.prepare_write	= ocfs2_prepare_write,
	.commit_write	= ocfs2_commit_write,
	.bmap		= ocfs2_bmap,
	.sync_page	= block_sync_page,
	.direct_IO	= ocfs2_direct_IO
};
diff --git a/fs/ocfs2/aops.h b/fs/ocfs2/aops.h
new file mode 100644
index 000000000000..d40456d509a0
--- /dev/null
+++ b/fs/ocfs2/aops.h
@@ -0,0 +1,41 @@
1/* -*- mode: c; c-basic-offset: 8; -*-
2 * vim: noexpandtab sw=8 ts=8 sts=0:
3 *
4 * Copyright (C) 2002, 2004, 2005 Oracle. All rights reserved.
5 *
6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public
8 * License as published by the Free Software Foundation; either
9 * version 2 of the License, or (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 * General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public
17 * License along with this program; if not, write to the
18 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
19 * Boston, MA 021110-1307, USA.
20 */
21
22#ifndef OCFS2_AOPS_H
23#define OCFS2_AOPS_H
24
/* ->prepare_write handler; declared here so that code outside aops.c can
 * call it directly. */
int ocfs2_prepare_write(struct file *file, struct page *page,
			unsigned from, unsigned to);

/* Starts a transaction covering the buffers of page in [from, to).
 * Returns the handle, or an ERR_PTR() on failure. */
struct ocfs2_journal_handle *ocfs2_start_walk_page_trans(struct inode *inode,
							  struct page *page,
							  unsigned from,
							  unsigned to);
32
/* all ocfs2_dio_end_io()'s fault */
/*
 * Bit 0 of iocb->private records whether the rw lock is held on behalf of
 * this iocb.  The argument is parenthesised so that any pointer expression
 * can be passed safely, not just a plain identifier.
 */
#define ocfs2_iocb_is_rw_locked(iocb) \
	test_bit(0, (unsigned long *)&(iocb)->private)
#define ocfs2_iocb_set_rw_locked(iocb) \
	set_bit(0, (unsigned long *)&(iocb)->private)
#define ocfs2_iocb_clear_rw_locked(iocb) \
	clear_bit(0, (unsigned long *)&(iocb)->private)
40
#endif /* OCFS2_AOPS_H */
diff --git a/fs/ocfs2/buffer_head_io.c b/fs/ocfs2/buffer_head_io.c
new file mode 100644
index 000000000000..d424041b38e9
--- /dev/null
+++ b/fs/ocfs2/buffer_head_io.c
@@ -0,0 +1,232 @@
1/* -*- mode: c; c-basic-offset: 8; -*-
2 * vim: noexpandtab sw=8 ts=8 sts=0:
3 *
4 * io.c
5 *
6 * Buffer cache handling
7 *
8 * Copyright (C) 2002, 2004 Oracle. All rights reserved.
9 *
10 * This program is free software; you can redistribute it and/or
11 * modify it under the terms of the GNU General Public
12 * License as published by the Free Software Foundation; either
13 * version 2 of the License, or (at your option) any later version.
14 *
15 * This program is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
18 * General Public License for more details.
19 *
20 * You should have received a copy of the GNU General Public
21 * License along with this program; if not, write to the
22 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
23 * Boston, MA 021110-1307, USA.
24 */
25
26#include <linux/fs.h>
27#include <linux/types.h>
28#include <linux/slab.h>
29#include <linux/highmem.h>
30
31#include <cluster/masklog.h>
32
33#include "ocfs2.h"
34
35#include "alloc.h"
36#include "inode.h"
37#include "journal.h"
38#include "uptodate.h"
39
40#include "buffer_head_io.h"
41
42int ocfs2_write_block(struct ocfs2_super *osb, struct buffer_head *bh,
43 struct inode *inode)
44{
45 int ret = 0;
46
47 mlog_entry("(bh->b_blocknr = %llu, inode=%p)\n",
48 (unsigned long long)bh->b_blocknr, inode);
49
50 BUG_ON(bh->b_blocknr < OCFS2_SUPER_BLOCK_BLKNO);
51 BUG_ON(buffer_jbd(bh));
52
53 /* No need to check for a soft readonly file system here. non
54 * journalled writes are only ever done on system files which
55 * can get modified during recovery even if read-only. */
56 if (ocfs2_is_hard_readonly(osb)) {
57 ret = -EROFS;
58 goto out;
59 }
60
61 down(&OCFS2_I(inode)->ip_io_sem);
62
63 lock_buffer(bh);
64 set_buffer_uptodate(bh);
65
66 /* remove from dirty list before I/O. */
67 clear_buffer_dirty(bh);
68
69 get_bh(bh); /* for end_buffer_write_sync() */
70 bh->b_end_io = end_buffer_write_sync;
71 submit_bh(WRITE, bh);
72
73 wait_on_buffer(bh);
74
75 if (buffer_uptodate(bh)) {
76 ocfs2_set_buffer_uptodate(inode, bh);
77 } else {
78 /* We don't need to remove the clustered uptodate
79 * information for this bh as it's not marked locally
80 * uptodate. */
81 ret = -EIO;
82 brelse(bh);
83 }
84
85 up(&OCFS2_I(inode)->ip_io_sem);
86out:
87 mlog_exit(ret);
88 return ret;
89}
90
/*
 * Read nr buffers into bhs[].
 *
 * Slots of bhs[] that are NULL on entry are filled with sb_getblk() for
 * consecutive block numbers starting at block; non-NULL slots are used as
 * passed in.  With OCFS2_BH_CACHED set (and an inode supplied), buffers that
 * ocfs2_buffer_uptodate() accepts are not re-read.  OCFS2_BH_READAHEAD makes
 * the reads go out as READA instead of READ.
 *
 * The work is done in two passes under the inode's ip_io_sem (when inode is
 * non-NULL): the first submits reads, the second waits for them in reverse
 * order and checks the results.
 *
 * Returns 0 on success, -EINVAL for bad arguments, and -EIO if sb_getblk()
 * fails or any buffer is not uptodate afterwards.  A buffer that failed to
 * read is released and its bhs[] slot is set to NULL.
 */
int ocfs2_read_blocks(struct ocfs2_super *osb, u64 block, int nr,
		      struct buffer_head *bhs[], int flags,
		      struct inode *inode)
{
	int status = 0;
	struct super_block *sb;
	int i, ignore_cache = 0;
	struct buffer_head *bh;

	mlog_entry("(block=(%"MLFu64"), nr=(%d), flags=%d, inode=%p)\n",
		   block, nr, flags, inode);

	if (osb == NULL || osb->sb == NULL || bhs == NULL) {
		status = -EINVAL;
		mlog_errno(status);
		goto bail;
	}

	if (nr < 0) {
		mlog(ML_ERROR, "asked to read %d blocks!\n", nr);
		status = -EINVAL;
		mlog_errno(status);
		goto bail;
	}

	if (nr == 0) {
		mlog(ML_BH_IO, "No buffers will be read!\n");
		status = 0;
		goto bail;
	}

	sb = osb->sb;

	/* The cached path consults per-inode uptodate state, so it cannot
	 * be used without an inode. */
	if (flags & OCFS2_BH_CACHED && !inode)
		flags &= ~OCFS2_BH_CACHED;

	if (inode)
		down(&OCFS2_I(inode)->ip_io_sem);
	/* Pass 1: obtain buffers and submit whatever reads are needed. */
	for (i = 0 ; i < nr ; i++) {
		if (bhs[i] == NULL) {
			/* NOTE(review): block only advances when the slot
			 * was empty, so a mix of pre-filled and empty slots
			 * shifts the block numbers of later slots. */
			bhs[i] = sb_getblk(sb, block++);
			if (bhs[i] == NULL) {
				/* NOTE(review): this exit neither waits on
				 * nor releases buffers handled in earlier
				 * iterations; they remain in bhs[]. */
				if (inode)
					up(&OCFS2_I(inode)->ip_io_sem);
				status = -EIO;
				mlog_errno(status);
				goto bail;
			}
		}
		bh = bhs[i];
		ignore_cache = 0;

		if (flags & OCFS2_BH_CACHED &&
		    !ocfs2_buffer_uptodate(inode, bh)) {
			mlog(ML_UPTODATE,
			     "bh (%llu), inode %"MLFu64" not uptodate\n",
			     (unsigned long long)bh->b_blocknr,
			     OCFS2_I(inode)->ip_blkno);
			ignore_cache = 1;
		}

		/* XXX: Can we ever get this and *not* have the cached
		 * flag set? */
		if (buffer_jbd(bh)) {
			if (!(flags & OCFS2_BH_CACHED) || ignore_cache)
				mlog(ML_BH_IO, "trying to sync read a jbd "
					       "managed bh (blocknr = %llu)\n",
				     (unsigned long long)bh->b_blocknr);
			continue;
		}

		if (!(flags & OCFS2_BH_CACHED) || ignore_cache) {
			if (buffer_dirty(bh)) {
				/* This should probably be a BUG, or
				 * at least return an error. */
				mlog(ML_BH_IO, "asking me to sync read a dirty "
					       "buffer! (blocknr = %llu)\n",
				     (unsigned long long)bh->b_blocknr);
				continue;
			}

			lock_buffer(bh);
			/* Re-check now that the buffer is locked: the JBD
			 * bit may have been set in the meantime. */
			if (buffer_jbd(bh)) {
#ifdef CATCH_BH_JBD_RACES
				mlog(ML_ERROR, "block %llu had the JBD bit set "
					       "while I was in lock_buffer!",
				     (unsigned long long)bh->b_blocknr);
				BUG();
#else
				unlock_buffer(bh);
				continue;
#endif
			}
			clear_buffer_uptodate(bh);
			get_bh(bh); /* for end_buffer_read_sync() */
			bh->b_end_io = end_buffer_read_sync;
			if (flags & OCFS2_BH_READAHEAD)
				submit_bh(READA, bh);
			else
				submit_bh(READ, bh);
			continue;
		}
	}

	status = 0;

	/* Pass 2: wait for completion, last buffer first. */
	for (i = (nr - 1); i >= 0; i--) {
		bh = bhs[i];

		/* We know this can't have changed as we hold the
		 * inode sem. Avoid doing any work on the bh if the
		 * journal has it. */
		if (!buffer_jbd(bh))
			wait_on_buffer(bh);

		if (!buffer_uptodate(bh)) {
			/* Status won't be cleared from here on out,
			 * so we can safely record this and loop back
			 * to cleanup the other buffers. Don't need to
			 * remove the clustered uptodate information
			 * for this bh as it's not marked locally
			 * uptodate. */
			status = -EIO;
			brelse(bh);
			bhs[i] = NULL;
			continue;
		}

		if (inode)
			ocfs2_set_buffer_uptodate(inode, bh);
	}
	if (inode)
		up(&OCFS2_I(inode)->ip_io_sem);

	/* block has been advanced by the first loop and ignore_cache holds
	 * the value from the last buffer only, so this line does not describe
	 * the whole request. */
	mlog(ML_BH_IO, "block=(%"MLFu64"), nr=(%d), cached=%s\n", block, nr,
	     (!(flags & OCFS2_BH_CACHED) || ignore_cache) ? "no" : "yes");

bail:

	mlog_exit(status);
	return status;
}
diff --git a/fs/ocfs2/buffer_head_io.h b/fs/ocfs2/buffer_head_io.h
new file mode 100644
index 000000000000..6ecb90937b68
--- /dev/null
+++ b/fs/ocfs2/buffer_head_io.h
@@ -0,0 +1,73 @@
1/* -*- mode: c; c-basic-offset: 8; -*-
2 * vim: noexpandtab sw=8 ts=8 sts=0:
3 *
4 * ocfs2_buffer_head.h
5 *
6 * Buffer cache handling functions defined
7 *
8 * Copyright (C) 2002, 2004 Oracle. All rights reserved.
9 *
10 * This program is free software; you can redistribute it and/or
11 * modify it under the terms of the GNU General Public
12 * License as published by the Free Software Foundation; either
13 * version 2 of the License, or (at your option) any later version.
14 *
15 * This program is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
18 * General Public License for more details.
19 *
20 * You should have received a copy of the GNU General Public
21 * License along with this program; if not, write to the
22 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
23 * Boston, MA 021110-1307, USA.
24 */
25
26#ifndef OCFS2_BUFFER_HEAD_IO_H
27#define OCFS2_BUFFER_HEAD_IO_H
28
29#include <linux/buffer_head.h>
30
/* I/O completion callback -- NOTE(review): no definition is visible in
 * buffer_head_io.c as shown; confirm it is still provided somewhere. */
void ocfs2_end_buffer_io_sync(struct buffer_head *bh,
			      int uptodate);

/* Forward declaration; the definition is at the bottom of this header. */
static inline int ocfs2_read_block(struct ocfs2_super *osb,
				   u64 off,
				   struct buffer_head **bh,
				   int flags,
				   struct inode *inode);

/* Synchronously write one non-journalled buffer. */
int ocfs2_write_block(struct ocfs2_super *osb,
		      struct buffer_head *bh,
		      struct inode *inode);
/* Read nr blocks starting at block into bhs[]; flags is a mask of the
 * OCFS2_BH_* values below. */
int ocfs2_read_blocks(struct ocfs2_super *osb,
		      u64 block,
		      int nr,
		      struct buffer_head *bhs[],
		      int flags,
		      struct inode *inode);


/* Skip the read when the inode's uptodate information says the buffer is
 * already valid. */
#define OCFS2_BH_CACHED            1
#define OCFS2_BH_READAHEAD         8	/* use this to pass READA down to submit_bh */
53
54static inline int ocfs2_read_block(struct ocfs2_super * osb, u64 off,
55 struct buffer_head **bh, int flags,
56 struct inode *inode)
57{
58 int status = 0;
59
60 if (bh == NULL) {
61 printk("ocfs2: bh == NULL\n");
62 status = -EINVAL;
63 goto bail;
64 }
65
66 status = ocfs2_read_blocks(osb, off, 1, bh,
67 flags, inode);
68
69bail:
70 return status;
71}
72
73#endif /* OCFS2_BUFFER_HEAD_IO_H */
diff --git a/fs/ocfs2/cluster/Makefile b/fs/ocfs2/cluster/Makefile
new file mode 100644
index 000000000000..cdd162f13650
--- /dev/null
+++ b/fs/ocfs2/cluster/Makefile
@@ -0,0 +1,4 @@
1obj-$(CONFIG_OCFS2_FS) += ocfs2_nodemanager.o
2
3ocfs2_nodemanager-objs := heartbeat.o masklog.o sys.o nodemanager.o \
4 quorum.o tcp.o ver.o
diff --git a/fs/ocfs2/cluster/endian.h b/fs/ocfs2/cluster/endian.h
new file mode 100644
index 000000000000..2df9082f4e35
--- /dev/null
+++ b/fs/ocfs2/cluster/endian.h
@@ -0,0 +1,30 @@
1/* -*- mode: c; c-basic-offset: 8; -*-
2 * vim: noexpandtab sw=8 ts=8 sts=0:
3 *
4 * Copyright (C) 2005 Oracle. All rights reserved.
5 *
6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public
8 * License as published by the Free Software Foundation; either
9 * version 2 of the License, or (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 * General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public
17 * License along with this program; if not, write to the
18 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
19 * Boston, MA 02111-1307, USA.
20 */
21
22#ifndef OCFS2_CLUSTER_ENDIAN_H
23#define OCFS2_CLUSTER_ENDIAN_H
24
25static inline void be32_add_cpu(__be32 *var, u32 val)
26{
27 *var = cpu_to_be32(be32_to_cpu(*var) + val);
28}
29
30#endif /* OCFS2_CLUSTER_ENDIAN_H */
diff --git a/fs/ocfs2/cluster/heartbeat.c b/fs/ocfs2/cluster/heartbeat.c
new file mode 100644
index 000000000000..7307ba528913
--- /dev/null
+++ b/fs/ocfs2/cluster/heartbeat.c
@@ -0,0 +1,1797 @@
1/* -*- mode: c; c-basic-offset: 8; -*-
2 * vim: noexpandtab sw=8 ts=8 sts=0:
3 *
4 * Copyright (C) 2004, 2005 Oracle. All rights reserved.
5 *
6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public
8 * License as published by the Free Software Foundation; either
9 * version 2 of the License, or (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 * General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public
17 * License along with this program; if not, write to the
18 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
19 * Boston, MA 02111-1307, USA.
20 */
21
22#include <linux/kernel.h>
23#include <linux/sched.h>
24#include <linux/jiffies.h>
25#include <linux/module.h>
26#include <linux/fs.h>
27#include <linux/bio.h>
28#include <linux/blkdev.h>
29#include <linux/delay.h>
30#include <linux/file.h>
31#include <linux/kthread.h>
32#include <linux/configfs.h>
33#include <linux/random.h>
34#include <linux/crc32.h>
35#include <linux/time.h>
36
37#include "heartbeat.h"
38#include "tcp.h"
39#include "nodemanager.h"
40#include "quorum.h"
41
42#include "masklog.h"
43
44
45/*
46 * The first heartbeat pass had one global thread that would serialize all hb
47 * callback calls. This global serializing sem should only be removed once
48 * we've made sure that all callees can deal with being called concurrently
49 * from multiple hb region threads.
50 */
/* Taken for write while callbacks are being fired (o2hb_run_event_list)
 * and for read by o2hb_fill_node_map. */
51static DECLARE_RWSEM(o2hb_callback_sem);
52
53/*
54 * multiple hb threads are watching multiple regions. A node is live
55 * whenever any of the threads sees activity from the node in its region.
56 */
/* o2hb_live_lock is held around accesses to the live slot lists, the
 * live node bitmap, the pending event list and the region list in the
 * functions of this file. */
57static spinlock_t o2hb_live_lock = SPIN_LOCK_UNLOCKED;
/* Per-node list of disk slots (one per region) currently seeing the
 * node as live; linked through o2hb_disk_slot.ds_live_item. */
58static struct list_head o2hb_live_slots[O2NM_MAX_NODES];
/* Bit N is set while o2hb_live_slots[N] is non-empty. */
59static unsigned long o2hb_live_node_bitmap[BITS_TO_LONGS(O2NM_MAX_NODES)];
/* Queued node up/down events waiting to be run by o2hb_run_event_list. */
60static LIST_HEAD(o2hb_node_events);
/* Woken when a region's hr_steady_iterations counter drops to zero. */
61static DECLARE_WAIT_QUEUE_HEAD(o2hb_steady_queue);
62
63static LIST_HEAD(o2hb_all_regions);
64
/* One list of registered callback functions per callback type. */
65static struct o2hb_callback {
66 struct list_head list;
67} o2hb_callbacks[O2HB_NUM_CB];
68
69static struct o2hb_callback *hbcall_from_type(enum o2hb_callback_type type);
70
71#define O2HB_DEFAULT_BLOCK_BITS 9
72
/* Number of equal (unchanged) samples after which o2hb_check_slot
 * declares a live node dead. */
73unsigned int o2hb_dead_threshold = O2HB_DEFAULT_DEAD_THRESHOLD;
74
75/* Only sets a new threshold if there are no active regions.
76 *
77 * No locking or otherwise interesting code is required for reading
78 * o2hb_dead_threshold as it can't change once regions are active and
79 * it's not interesting to anyone until then anyway. */
80static void o2hb_dead_threshold_set(unsigned int threshold)
81{
82 if (threshold > O2HB_MIN_DEAD_THRESHOLD) {
83 spin_lock(&o2hb_live_lock);
84 if (list_empty(&o2hb_all_regions))
85 o2hb_dead_threshold = threshold;
86 spin_unlock(&o2hb_live_lock);
87 }
88}
89
/* A pending node up/down notification.  Instances are filled in and
 * queued by o2hb_queue_node_event() and consumed by
 * o2hb_run_event_list(). */
90struct o2hb_node_event {
/* link on o2hb_node_events; empty when the event is not queued */
91 struct list_head hn_item;
/* which callback list to fire (e.g. O2HB_NODE_UP_CB / O2HB_NODE_DOWN_CB) */
92 enum o2hb_callback_type hn_event_type;
93 struct o2nm_node *hn_node;
94 int hn_node_num;
95};
96
/* Per-node, per-region bookkeeping for one heartbeat slot on disk. */
97struct o2hb_disk_slot {
/* the heartbeat block for this slot as read from / written to disk */
98 struct o2hb_disk_heartbeat_block *ds_raw_block;
99 u8 ds_node_num;
/* hb_seq value seen on the most recent check of this slot */
100 u64 ds_last_time;
/* hb_generation value seen on the most recent check of this slot */
101 u64 ds_last_generation;
/* samples where hb_seq was unchanged (or the crc was bad) */
102 u16 ds_equal_samples;
/* samples where hb_seq differed from the previous one */
103 u16 ds_changed_samples;
/* link on o2hb_live_slots[ds_node_num]; empty while considered dead */
104 struct list_head ds_live_item;
105};
106
107/* each thread owns a region.. when we're asked to tear down the region
108 * we ask the thread to stop, who cleans up the region */
/* State for one heartbeat region on a block device. */
109struct o2hb_region {
/* configfs item; to_o2hb_region() maps it back to the region */
110 struct config_item hr_item;
111
/* list linkage removed under o2hb_live_lock at release time;
 * presumably links the region onto o2hb_all_regions - TODO confirm */
112 struct list_head hr_all_item;
/* when set, the hb thread leaves its loop and skips slot shutdown */
113 unsigned hr_unclean_stop:1;
114
115 /* protected by the hr_callback_sem */
/* NOTE(review): no hr_callback_sem is visible in this file; this
 * probably means o2hb_callback_sem - confirm. */
116 struct task_struct *hr_task;
117
/* number of slots in hr_slots */
118 unsigned int hr_blocks;
/* block offset of slot 0 on the device (added to the slot number
 * when computing a bio's starting sector) */
119 unsigned long long hr_start_block;
120
/* log2 of the block size, and the block size in bytes */
121 unsigned int hr_block_bits;
122 unsigned int hr_block_bytes;
123
124 unsigned int hr_slots_per_page;
125 unsigned int hr_num_pages;
126
/* hr_num_pages pages backing the slot blocks */
127 struct page **hr_slot_data;
128 struct block_device *hr_bdev;
129 struct o2hb_disk_slot *hr_slots;
130
131 /* let the person setting up hb wait for it to return until it
132 * has reached a 'steady' state. This will be fixed when we have
133 * a more complete api that doesn't lead to this sort of fragility. */
134 atomic_t hr_steady_iterations;
135
136 char hr_dev_name[BDEVNAME_SIZE];
137
/* target interval between heartbeat iterations in the hb thread */
138 unsigned int hr_timeout_ms;
139
140 /* randomized as the region goes up and down so that a node
141 * recognizes a node going up and down in one iteration */
142 u64 hr_generation;
143
/* delayed work that runs o2hb_write_timeout(); hr_last_timeout_start
 * is the jiffies value recorded when it was last armed */
144 struct work_struct hr_write_timeout_work;
145 unsigned long hr_last_timeout_start;
146
147 /* Used during o2hb_check_slot to hold a copy of the block
148 * being checked because we temporarily have to zero out the
149 * crc field. */
150 struct o2hb_disk_heartbeat_block *hr_tmp_block;
151};
152
/* Lets a submitter wait for a batch of bios: wc_num_reqs counts the
 * outstanding requests and wc_io_complete is completed when the count
 * reaches zero (see o2hb_bio_wait_dec). */
153struct o2hb_bio_wait_ctxt {
154 atomic_t wc_num_reqs;
155 struct completion wc_io_complete;
156};
157
158static void o2hb_write_timeout(void *arg)
159{
160 struct o2hb_region *reg = arg;
161
162 mlog(ML_ERROR, "Heartbeat write timeout to device %s after %u "
163 "milliseconds\n", reg->hr_dev_name,
164 jiffies_to_msecs(jiffies - reg->hr_last_timeout_start));
165 o2quo_disk_timeout();
166}
167
168static void o2hb_arm_write_timeout(struct o2hb_region *reg)
169{
170 mlog(0, "Queue write timeout for %u ms\n", O2HB_MAX_WRITE_TIMEOUT_MS);
171
172 cancel_delayed_work(&reg->hr_write_timeout_work);
173 reg->hr_last_timeout_start = jiffies;
174 schedule_delayed_work(&reg->hr_write_timeout_work,
175 msecs_to_jiffies(O2HB_MAX_WRITE_TIMEOUT_MS));
176}
177
/* Stop the region's write timeout: cancel the delayed work and then
 * flush the shared workqueue. */
178static void o2hb_disarm_write_timeout(struct o2hb_region *reg)
179{
180 cancel_delayed_work(&reg->hr_write_timeout_work);
181 flush_scheduled_work();
182}
183
/* Prepare 'wc' to wait for 'num_ios' requests: set the outstanding
 * count and initialise the completion. */
184static inline void o2hb_bio_wait_init(struct o2hb_bio_wait_ctxt *wc,
185 unsigned int num_ios)
186{
187 atomic_set(&wc->wc_num_reqs, num_ios);
188 init_completion(&wc->wc_io_complete);
189}
190
191/* Used in error paths too */
192static inline void o2hb_bio_wait_dec(struct o2hb_bio_wait_ctxt *wc,
193 unsigned int num)
194{
195 /* sadly atomic_sub_and_test() isn't available on all platforms. The
196 * good news is that the fast path only completes one at a time */
197 while(num--) {
198 if (atomic_dec_and_test(&wc->wc_num_reqs)) {
199 BUG_ON(num > 0);
200 complete(&wc->wc_io_complete);
201 }
202 }
203}
204
205static void o2hb_wait_on_io(struct o2hb_region *reg,
206 struct o2hb_bio_wait_ctxt *wc)
207{
208 struct address_space *mapping = reg->hr_bdev->bd_inode->i_mapping;
209
210 blk_run_address_space(mapping);
211
212 wait_for_completion(&wc->wc_io_complete);
213}
214
215static int o2hb_bio_end_io(struct bio *bio,
216 unsigned int bytes_done,
217 int error)
218{
219 struct o2hb_bio_wait_ctxt *wc = bio->bi_private;
220
221 if (error)
222 mlog(ML_ERROR, "IO Error %d\n", error);
223
224 if (bio->bi_size)
225 return 1;
226
227 o2hb_bio_wait_dec(wc, 1);
228 return 0;
229}
230
231/* Setup a Bio to cover I/O against num_slots slots starting at
232 * start_slot. */
233static struct bio *o2hb_setup_one_bio(struct o2hb_region *reg,
234 struct o2hb_bio_wait_ctxt *wc,
235 unsigned int start_slot,
236 unsigned int num_slots)
237{
238 int i, nr_vecs, len, first_page, last_page;
239 unsigned int vec_len, vec_start;
240 unsigned int bits = reg->hr_block_bits;
241 unsigned int spp = reg->hr_slots_per_page;
242 struct bio *bio;
243 struct page *page;
244
245 nr_vecs = (num_slots + spp - 1) / spp;
246
247 /* Testing has shown this allocation to take long enough under
248 * GFP_KERNEL that the local node can get fenced. It would be
249 * nicest if we could pre-allocate these bios and avoid this
250 * all together. */
251 bio = bio_alloc(GFP_ATOMIC, nr_vecs);
252 if (!bio) {
253 mlog(ML_ERROR, "Could not alloc slots BIO!\n");
254 bio = ERR_PTR(-ENOMEM);
255 goto bail;
256 }
257
258 /* Must put everything in 512 byte sectors for the bio... */
259 bio->bi_sector = (reg->hr_start_block + start_slot) << (bits - 9);
260 bio->bi_bdev = reg->hr_bdev;
261 bio->bi_private = wc;
262 bio->bi_end_io = o2hb_bio_end_io;
263
264 first_page = start_slot / spp;
265 last_page = first_page + nr_vecs;
266 vec_start = (start_slot << bits) % PAGE_CACHE_SIZE;
267 for(i = first_page; i < last_page; i++) {
268 page = reg->hr_slot_data[i];
269
270 vec_len = PAGE_CACHE_SIZE;
271 /* last page might be short */
272 if (((i + 1) * spp) > (start_slot + num_slots))
273 vec_len = ((num_slots + start_slot) % spp) << bits;
274 vec_len -= vec_start;
275
276 mlog(ML_HB_BIO, "page %d, vec_len = %u, vec_start = %u\n",
277 i, vec_len, vec_start);
278
279 len = bio_add_page(bio, page, vec_len, vec_start);
280 if (len != vec_len) {
281 bio_put(bio);
282 bio = ERR_PTR(-EIO);
283
284 mlog(ML_ERROR, "Error adding page to bio i = %d, "
285 "vec_len = %u, len = %d\n, start = %u\n",
286 i, vec_len, len, vec_start);
287 goto bail;
288 }
289
290 vec_start = 0;
291 }
292
293bail:
294 return bio;
295}
296
297/*
298 * Compute the maximum number of sectors the bdev can handle in one bio,
299 * as a power of two.
300 *
301 * Stolen from oracleasm, thanks Joel!
302 */
303static int compute_max_sectors(struct block_device *bdev)
304{
305 int max_pages, max_sectors, pow_two_sectors;
306
307 struct request_queue *q;
308
309 q = bdev_get_queue(bdev);
310 max_pages = q->max_sectors >> (PAGE_SHIFT - 9);
311 if (max_pages > BIO_MAX_PAGES)
312 max_pages = BIO_MAX_PAGES;
313 if (max_pages > q->max_phys_segments)
314 max_pages = q->max_phys_segments;
315 if (max_pages > q->max_hw_segments)
316 max_pages = q->max_hw_segments;
317 max_pages--; /* Handle I/Os that straddle a page */
318
319 max_sectors = max_pages << (PAGE_SHIFT - 9);
320
321 /* Why is fls() 1-based???? */
322 pow_two_sectors = 1 << (fls(max_sectors) - 1);
323
324 return pow_two_sectors;
325}
326
327static inline void o2hb_compute_request_limits(struct o2hb_region *reg,
328 unsigned int num_slots,
329 unsigned int *num_bios,
330 unsigned int *slots_per_bio)
331{
332 unsigned int max_sectors, io_sectors;
333
334 max_sectors = compute_max_sectors(reg->hr_bdev);
335
336 io_sectors = num_slots << (reg->hr_block_bits - 9);
337
338 *num_bios = (io_sectors + max_sectors - 1) / max_sectors;
339 *slots_per_bio = max_sectors >> (reg->hr_block_bits - 9);
340
341 mlog(ML_HB_BIO, "My io size is %u sectors for %u slots. This "
342 "device can handle %u sectors of I/O\n", io_sectors, num_slots,
343 max_sectors);
344 mlog(ML_HB_BIO, "Will need %u bios holding %u slots each\n",
345 *num_bios, *slots_per_bio);
346}
347
348static int o2hb_read_slots(struct o2hb_region *reg,
349 unsigned int max_slots)
350{
351 unsigned int num_bios, slots_per_bio, start_slot, num_slots;
352 int i, status;
353 struct o2hb_bio_wait_ctxt wc;
354 struct bio **bios;
355 struct bio *bio;
356
357 o2hb_compute_request_limits(reg, max_slots, &num_bios, &slots_per_bio);
358
359 bios = kcalloc(num_bios, sizeof(struct bio *), GFP_KERNEL);
360 if (!bios) {
361 status = -ENOMEM;
362 mlog_errno(status);
363 return status;
364 }
365
366 o2hb_bio_wait_init(&wc, num_bios);
367
368 num_slots = slots_per_bio;
369 for(i = 0; i < num_bios; i++) {
370 start_slot = i * slots_per_bio;
371
372 /* adjust num_slots at last bio */
373 if (max_slots < (start_slot + num_slots))
374 num_slots = max_slots - start_slot;
375
376 bio = o2hb_setup_one_bio(reg, &wc, start_slot, num_slots);
377 if (IS_ERR(bio)) {
378 o2hb_bio_wait_dec(&wc, num_bios - i);
379
380 status = PTR_ERR(bio);
381 mlog_errno(status);
382 goto bail_and_wait;
383 }
384 bios[i] = bio;
385
386 submit_bio(READ, bio);
387 }
388
389 status = 0;
390
391bail_and_wait:
392 o2hb_wait_on_io(reg, &wc);
393
394 if (bios) {
395 for(i = 0; i < num_bios; i++)
396 if (bios[i])
397 bio_put(bios[i]);
398 kfree(bios);
399 }
400
401 return status;
402}
403
404static int o2hb_issue_node_write(struct o2hb_region *reg,
405 struct bio **write_bio,
406 struct o2hb_bio_wait_ctxt *write_wc)
407{
408 int status;
409 unsigned int slot;
410 struct bio *bio;
411
412 o2hb_bio_wait_init(write_wc, 1);
413
414 slot = o2nm_this_node();
415
416 bio = o2hb_setup_one_bio(reg, write_wc, slot, 1);
417 if (IS_ERR(bio)) {
418 status = PTR_ERR(bio);
419 mlog_errno(status);
420 goto bail;
421 }
422
423 submit_bio(WRITE, bio);
424
425 *write_bio = bio;
426 status = 0;
427bail:
428 return status;
429}
430
431static u32 o2hb_compute_block_crc_le(struct o2hb_region *reg,
432 struct o2hb_disk_heartbeat_block *hb_block)
433{
434 __le32 old_cksum;
435 u32 ret;
436
437 /* We want to compute the block crc with a 0 value in the
438 * hb_cksum field. Save it off here and replace after the
439 * crc. */
440 old_cksum = hb_block->hb_cksum;
441 hb_block->hb_cksum = 0;
442
443 ret = crc32_le(0, (unsigned char *) hb_block, reg->hr_block_bytes);
444
445 hb_block->hb_cksum = old_cksum;
446
447 return ret;
448}
449
/* Log the sequence, node number, checksum and generation of a
 * heartbeat block at ML_ERROR level, converting the little-endian
 * on-disk fields to CPU order for printing. */
450static void o2hb_dump_slot(struct o2hb_disk_heartbeat_block *hb_block)
451{
452 mlog(ML_ERROR, "Dump slot information: seq = 0x%"MLFx64", node = %u, "
453 "cksum = 0x%x, generation 0x%"MLFx64"\n",
454 le64_to_cpu(hb_block->hb_seq), hb_block->hb_node,
455 le32_to_cpu(hb_block->hb_cksum),
456 le64_to_cpu(hb_block->hb_generation));
457}
458
459static int o2hb_verify_crc(struct o2hb_region *reg,
460 struct o2hb_disk_heartbeat_block *hb_block)
461{
462 u32 read, computed;
463
464 read = le32_to_cpu(hb_block->hb_cksum);
465 computed = o2hb_compute_block_crc_le(reg, hb_block);
466
467 return read == computed;
468}
469
470/* We want to make sure that nobody is heartbeating on top of us --
471 * this will help detect an invalid configuration. */
472static int o2hb_check_last_timestamp(struct o2hb_region *reg)
473{
474 int node_num, ret;
475 struct o2hb_disk_slot *slot;
476 struct o2hb_disk_heartbeat_block *hb_block;
477
478 node_num = o2nm_this_node();
479
480 ret = 1;
481 slot = &reg->hr_slots[node_num];
482 /* Don't check on our 1st timestamp */
483 if (slot->ds_last_time) {
484 hb_block = slot->ds_raw_block;
485
486 if (le64_to_cpu(hb_block->hb_seq) != slot->ds_last_time)
487 ret = 0;
488 }
489
490 return ret;
491}
492
493static inline void o2hb_prepare_block(struct o2hb_region *reg,
494 u64 generation)
495{
496 int node_num;
497 u64 cputime;
498 struct o2hb_disk_slot *slot;
499 struct o2hb_disk_heartbeat_block *hb_block;
500
501 node_num = o2nm_this_node();
502 slot = &reg->hr_slots[node_num];
503
504 hb_block = (struct o2hb_disk_heartbeat_block *)slot->ds_raw_block;
505 memset(hb_block, 0, reg->hr_block_bytes);
506 /* TODO: time stuff */
507 cputime = CURRENT_TIME.tv_sec;
508 if (!cputime)
509 cputime = 1;
510
511 hb_block->hb_seq = cpu_to_le64(cputime);
512 hb_block->hb_node = node_num;
513 hb_block->hb_generation = cpu_to_le64(generation);
514
515 /* This step must always happen last! */
516 hb_block->hb_cksum = cpu_to_le32(o2hb_compute_block_crc_le(reg,
517 hb_block));
518
519 mlog(ML_HB_BIO, "our node generation = 0x%"MLFx64", cksum = 0x%x\n",
520 cpu_to_le64(generation), le32_to_cpu(hb_block->hb_cksum));
521}
522
523static void o2hb_fire_callbacks(struct o2hb_callback *hbcall,
524 struct o2nm_node *node,
525 int idx)
526{
527 struct list_head *iter;
528 struct o2hb_callback_func *f;
529
530 list_for_each(iter, &hbcall->list) {
531 f = list_entry(iter, struct o2hb_callback_func, hc_item);
532 mlog(ML_HEARTBEAT, "calling funcs %p\n", f);
533 (f->hc_func)(node, idx, f->hc_data);
534 }
535}
536
537/* Will run the list in order until we process the passed event */
/* Process queued node events from the head of o2hb_node_events until
 * 'queued_event' itself has been taken off the list (or the list is
 * empty).  Does nothing if 'queued_event' was never queued.  Callbacks
 * are fired with o2hb_callback_sem held for write and with
 * o2hb_live_lock dropped. */
538static void o2hb_run_event_list(struct o2hb_node_event *queued_event)
539{
540 int empty;
541 struct o2hb_callback *hbcall;
542 struct o2hb_node_event *event;
543
/* an event that is not on the list has an empty hn_item */
544 spin_lock(&o2hb_live_lock);
545 empty = list_empty(&queued_event->hn_item);
546 spin_unlock(&o2hb_live_lock);
547 if (empty)
548 return;
549
550 /* Holding callback sem assures we don't alter the callback
551 * lists when doing this, and serializes ourselves with other
552 * processes wanting callbacks. */
553 down_write(&o2hb_callback_sem);
554
555 spin_lock(&o2hb_live_lock);
556 while (!list_empty(&o2hb_node_events)
557 && !list_empty(&queued_event->hn_item)) {
558 event = list_entry(o2hb_node_events.next,
559 struct o2hb_node_event,
560 hn_item);
/* list_del_init leaves hn_item empty, which is what ends the loop
 * once our own event has been dequeued */
561 list_del_init(&event->hn_item);
/* the spinlock is not held while the callbacks run */
562 spin_unlock(&o2hb_live_lock);
563
564 mlog(ML_HEARTBEAT, "Node %s event for %d\n",
565 event->hn_event_type == O2HB_NODE_UP_CB ? "UP" : "DOWN",
566 event->hn_node_num);
567
568 hbcall = hbcall_from_type(event->hn_event_type);
569
570 /* We should *never* have gotten on to the list with a
571 * bad type... This isn't something that we should try
572 * to recover from. */
573 BUG_ON(IS_ERR(hbcall));
574
575 o2hb_fire_callbacks(hbcall, event->hn_node, event->hn_node_num);
576
577 spin_lock(&o2hb_live_lock);
578 }
579 spin_unlock(&o2hb_live_lock);
580
581 up_write(&o2hb_callback_sem);
582}
583
/* Fill in 'event' with the given type, node and node number and append
 * it to o2hb_node_events.  The caller must hold o2hb_live_lock. */
584static void o2hb_queue_node_event(struct o2hb_node_event *event,
585 enum o2hb_callback_type type,
586 struct o2nm_node *node,
587 int node_num)
588{
589 assert_spin_locked(&o2hb_live_lock);
590
591 event->hn_event_type = type;
592 event->hn_node = node;
593 event->hn_node_num = node_num;
594
595 mlog(ML_HEARTBEAT, "Queue node %s event for node %d\n",
596 type == O2HB_NODE_UP_CB ? "UP" : "DOWN", node_num);
597
598 list_add_tail(&event->hn_item, &o2hb_node_events);
599}
600
/* Take a slot out of the live state when its region is going away.  If
 * the slot was live it is removed from its node's live list, and if
 * that was the node's last live slot the node's bit is cleared and a
 * DOWN event is queued and then run.  Does nothing when the slot's node
 * is not configured. */
601static void o2hb_shutdown_slot(struct o2hb_disk_slot *slot)
602{
/* on-stack event; an empty hn_item means "not queued" */
603 struct o2hb_node_event event =
604 { .hn_item = LIST_HEAD_INIT(event.hn_item), };
605 struct o2nm_node *node;
606
/* reference is dropped by o2nm_node_put() at the end */
607 node = o2nm_get_node_by_num(slot->ds_node_num);
608 if (!node)
609 return;
610
611 spin_lock(&o2hb_live_lock);
612 if (!list_empty(&slot->ds_live_item)) {
613 mlog(ML_HEARTBEAT, "Shutdown, node %d leaves region\n",
614 slot->ds_node_num);
615
616 list_del_init(&slot->ds_live_item);
617
618 if (list_empty(&o2hb_live_slots[slot->ds_node_num])) {
619 clear_bit(slot->ds_node_num, o2hb_live_node_bitmap);
620
621 o2hb_queue_node_event(&event, O2HB_NODE_DOWN_CB, node,
622 slot->ds_node_num);
623 }
624 }
625 spin_unlock(&o2hb_live_lock);
626
/* returns immediately if no event was queued above */
627 o2hb_run_event_list(&event);
628
629 o2nm_node_put(node);
630}
631
/* Examine one slot's freshly read heartbeat block and update the
 * slot's live/dead state.
 *
 * A dead slot becomes live once ds_changed_samples reaches
 * O2HB_LIVE_THRESHOLD; a live slot becomes dead once ds_equal_samples
 * reaches o2hb_dead_threshold or the block's generation changed.  When
 * the transition is the first live slot / last live slot for the node,
 * the global bitmap is updated and an UP / DOWN event is queued and
 * run before returning.
 *
 * Returns 1 if a node event was queued, 0 otherwise (including when the
 * slot's node is not configured). */
632static int o2hb_check_slot(struct o2hb_region *reg,
633 struct o2hb_disk_slot *slot)
634{
635 int changed = 0, gen_changed = 0;
/* on-stack event; an empty hn_item means "not queued" */
636 struct o2hb_node_event event =
637 { .hn_item = LIST_HEAD_INIT(event.hn_item), };
638 struct o2nm_node *node;
639 struct o2hb_disk_heartbeat_block *hb_block = reg->hr_tmp_block;
640 u64 cputime;
641
/* work on a private copy; see the hr_tmp_block comment in the region */
642 memcpy(hb_block, slot->ds_raw_block, reg->hr_block_bytes);
643
644 /* Is this correct? Do we assume that the node doesn't exist
645 * if we're not configured for him? */
646 node = o2nm_get_node_by_num(slot->ds_node_num);
647 if (!node)
648 return 0;
649
650 if (!o2hb_verify_crc(reg, hb_block)) {
651 /* all paths from here will drop o2hb_live_lock for
652 * us. */
653 spin_lock(&o2hb_live_lock);
654
655 /* Don't print an error on the console in this case -
656 * a freshly formatted heartbeat area will not have a
657 * crc set on it. */
658 if (list_empty(&slot->ds_live_item))
659 goto out;
660
661 /* The node is live but pushed out a bad crc. We
662 * consider it a transient miss but don't populate any
663 * other values as they may be junk. */
664 mlog(ML_ERROR, "Node %d has written a bad crc to %s\n",
665 slot->ds_node_num, reg->hr_dev_name);
666 o2hb_dump_slot(hb_block);
667
668 slot->ds_equal_samples++;
669 goto fire_callbacks;
670 }
671
672 /* we don't care if these wrap.. the state transitions below
673 * clear at the right places */
674 cputime = le64_to_cpu(hb_block->hb_seq);
675 if (slot->ds_last_time != cputime)
676 slot->ds_changed_samples++;
677 else
678 slot->ds_equal_samples++;
679 slot->ds_last_time = cputime;
680
681 /* The node changed heartbeat generations. We assume this to
682 * mean it dropped off but came back before we timed out. We
683 * want to consider it down for the time being but don't want
684 * to lose any changed_samples state we might build up to
685 * considering it live again. */
686 if (slot->ds_last_generation != le64_to_cpu(hb_block->hb_generation)) {
687 gen_changed = 1;
688 slot->ds_equal_samples = 0;
689 mlog(ML_HEARTBEAT, "Node %d changed generation (0x%"MLFx64" "
690 "to 0x%"MLFx64")\n", slot->ds_node_num,
691 slot->ds_last_generation,
692 le64_to_cpu(hb_block->hb_generation));
693 }
694
695 slot->ds_last_generation = le64_to_cpu(hb_block->hb_generation);
696
697 mlog(ML_HEARTBEAT, "Slot %d gen 0x%"MLFx64" cksum 0x%x "
698 "seq %"MLFu64" last %"MLFu64" changed %u equal %u\n",
699 slot->ds_node_num, slot->ds_last_generation,
700 le32_to_cpu(hb_block->hb_cksum), le64_to_cpu(hb_block->hb_seq),
701 slot->ds_last_time, slot->ds_changed_samples,
702 slot->ds_equal_samples);
703
704 spin_lock(&o2hb_live_lock);
705
/* o2hb_live_lock is held from here until 'out' */
706fire_callbacks:
707 /* dead nodes only come to life after some number of
708 * changes at any time during their dead time */
709 if (list_empty(&slot->ds_live_item) &&
710 slot->ds_changed_samples >= O2HB_LIVE_THRESHOLD) {
711 mlog(ML_HEARTBEAT, "Node %d (id 0x%"MLFx64") joined my "
712 "region\n", slot->ds_node_num, slot->ds_last_generation);
713
714 /* first on the list generates a callback */
715 if (list_empty(&o2hb_live_slots[slot->ds_node_num])) {
716 set_bit(slot->ds_node_num, o2hb_live_node_bitmap);
717
718 o2hb_queue_node_event(&event, O2HB_NODE_UP_CB, node,
719 slot->ds_node_num);
720
721 changed = 1;
722 }
723
724 list_add_tail(&slot->ds_live_item,
725 &o2hb_live_slots[slot->ds_node_num]);
726
727 slot->ds_equal_samples = 0;
728 goto out;
729 }
730
731 /* if the list is dead, we're done.. */
732 if (list_empty(&slot->ds_live_item))
733 goto out;
734
735 /* live nodes only go dead after enough consecutive missed
736 * samples.. reset the missed counter whenever we see
737 * activity */
738 if (slot->ds_equal_samples >= o2hb_dead_threshold || gen_changed) {
739 mlog(ML_HEARTBEAT, "Node %d left my region\n",
740 slot->ds_node_num);
741
742 /* last off the live_slot generates a callback */
743 list_del_init(&slot->ds_live_item);
744 if (list_empty(&o2hb_live_slots[slot->ds_node_num])) {
745 clear_bit(slot->ds_node_num, o2hb_live_node_bitmap);
746
747 o2hb_queue_node_event(&event, O2HB_NODE_DOWN_CB, node,
748 slot->ds_node_num);
749
750 changed = 1;
751 }
752
753 /* We don't clear this because the node is still
754 * actually writing new blocks. */
755 if (!gen_changed)
756 slot->ds_changed_samples = 0;
757 goto out;
758 }
/* still live and we saw activity: restart both counters */
759 if (slot->ds_changed_samples) {
760 slot->ds_changed_samples = 0;
761 slot->ds_equal_samples = 0;
762 }
763out:
764 spin_unlock(&o2hb_live_lock);
765
/* returns immediately if no event was queued above */
766 o2hb_run_event_list(&event);
767
768 o2nm_node_put(node);
769 return changed;
770}
771
772/* This could be faster if we just implemented a find_last_bit, but I
773 * don't think the circumstances warrant it. */
/*
 * Return the index of the highest set bit among the first 'numbits'
 * bits of 'nodes', found by walking every set bit in ascending order.
 * Returns 'numbits' itself when no bit is set.
 */
static int o2hb_highest_node(unsigned long *nodes,
			     int numbits)
{
	int bit;
	int last_seen = numbits;

	for (bit = find_next_bit(nodes, numbits, 0);
	     bit != -1 && bit < numbits;
	     bit = find_next_bit(nodes, numbits, bit + 1))
		last_seen = bit;

	return last_seen;
}
790
/* Run one heartbeat iteration for a region: read the slots of all
 * configured nodes, issue the write of our own slot, check every
 * configured node's slot while that write is in flight, then wait for
 * the write and re-arm the write timeout.  Any failure before the
 * write is issued abandons the iteration (and so does not re-arm the
 * timeout). */
791static void o2hb_do_disk_heartbeat(struct o2hb_region *reg)
792{
793 int i, ret, highest_node, change = 0;
794 unsigned long configured_nodes[BITS_TO_LONGS(O2NM_MAX_NODES)];
795 struct bio *write_bio;
796 struct o2hb_bio_wait_ctxt write_wc;
797
798 if (o2nm_configured_node_map(configured_nodes, sizeof(configured_nodes)))
799 return;
800
/* o2hb_highest_node() returns O2NM_MAX_NODES when the map is empty */
801 highest_node = o2hb_highest_node(configured_nodes, O2NM_MAX_NODES);
802 if (highest_node >= O2NM_MAX_NODES) {
803 mlog(ML_NOTICE, "ocfs2_heartbeat: no configured nodes found!\n");
804 return;
805 }
806
807 /* No sense in reading the slots of nodes that don't exist
808 * yet. Of course, if the node definitions have holes in them
809 * then we're reading an empty slot anyway... Consider this
810 * best-effort. */
811 ret = o2hb_read_slots(reg, highest_node + 1);
812 if (ret < 0) {
813 mlog_errno(ret);
814 return;
815 }
816
817 /* With an up to date view of the slots, we can check that no
818 * other node has been improperly configured to heartbeat in
819 * our slot. */
820 if (!o2hb_check_last_timestamp(reg))
821 mlog(ML_ERROR, "Device \"%s\": another node is heartbeating "
822 "in our slot!\n", reg->hr_dev_name);
823
824 /* fill in the proper info for our next heartbeat */
825 o2hb_prepare_block(reg, reg->hr_generation);
826
827 /* And fire off the write. Note that we don't wait on this I/O
828 * until later. */
829 ret = o2hb_issue_node_write(reg, &write_bio, &write_wc);
830 if (ret < 0) {
831 mlog_errno(ret);
832 return;
833 }
834
/* visit every configured node; 'change' records whether any slot
 * queued a node up/down event this iteration */
835 i = -1;
836 while((i = find_next_bit(configured_nodes, O2NM_MAX_NODES, i + 1)) < O2NM_MAX_NODES) {
837
838 change |= o2hb_check_slot(reg, &reg->hr_slots[i]);
839 }
840
841 /*
842 * We have to be sure we've advertised ourselves on disk
843 * before we can go to steady state. This ensures that
844 * people we find in our steady state have seen us.
845 */
846 o2hb_wait_on_io(reg, &write_wc);
847 bio_put(write_bio);
848 o2hb_arm_write_timeout(reg);
849
850 /* let the person who launched us know when things are steady */
/* only iterations without any membership change count towards it */
851 if (!change && (atomic_read(&reg->hr_steady_iterations) != 0)) {
852 if (atomic_dec_and_test(&reg->hr_steady_iterations))
853 wake_up(&o2hb_steady_queue);
854 }
855}
856
857/* Subtract b from a, storing the result in a. a *must* have a larger
858 * value than b. */
/*
 * Compute a - b in place, leaving the result in 'a'.  If 'a' is
 * actually earlier than 'b' the result is clamped to zero instead of
 * going negative.  A negative microsecond remainder is normalised by
 * borrowing from the seconds field.
 */
static void o2hb_tv_subtract(struct timeval *a,
			     struct timeval *b)
{
	int a_is_earlier;

	if (a->tv_sec != b->tv_sec)
		a_is_earlier = a->tv_sec < b->tv_sec;
	else
		a_is_earlier = a->tv_usec < b->tv_usec;

	/* just return 0 when a is before b */
	if (a_is_earlier) {
		a->tv_sec = 0;
		a->tv_usec = 0;
		return;
	}

	a->tv_sec -= b->tv_sec;
	a->tv_usec -= b->tv_usec;

	/* borrow whole seconds until the microseconds are non-negative */
	for (; a->tv_usec < 0; a->tv_usec += 1000000)
		a->tv_sec--;
}
877
/*
 * Return the number of whole milliseconds between 'start' and 'end'.
 * Neither argument is modified; the difference comes from
 * o2hb_tv_subtract(), which clamps it to zero when 'end' is earlier
 * than 'start'.
 */
static unsigned int o2hb_elapsed_msecs(struct timeval *start,
				       struct timeval *end)
{
	struct timeval delta;

	delta = *end;
	o2hb_tv_subtract(&delta, start);

	return delta.tv_sec * 1000 + delta.tv_usec / 1000;
}
887
888/*
889 * we ride the region ref that the region dir holds. before the region
890 * dir is removed and drops it ref it will wait to tear down this
891 * thread.
892 */
/* Per-region heartbeat kernel thread.  Repeats o2hb_do_disk_heartbeat()
 * roughly every hr_timeout_ms until asked to stop (or hr_unclean_stop
 * is set), then disarms the write timeout, shuts down the region's
 * slots (skipped on unclean stop) and writes a final block with
 * generation 0.  'data' is the struct o2hb_region.  Always returns 0. */
893static int o2hb_thread(void *data)
894{
895 int i, ret;
896 struct o2hb_region *reg = data;
897 struct bio *write_bio;
898 struct o2hb_bio_wait_ctxt write_wc;
899 struct timeval before_hb, after_hb;
900 unsigned int elapsed_msec;
901
902 mlog(ML_HEARTBEAT|ML_KTHREAD, "hb thread running\n");
903
904 set_user_nice(current, -20);
905
906 while (!kthread_should_stop() && !reg->hr_unclean_stop) {
907 /* We track the time spent inside
908 * o2hb_do_disk_heartbeat so that we avoid more than
909 * hr_timeout_ms between disk writes. On busy systems
910 * this should result in a heartbeat which is less
911 * likely to time itself out. */
912 do_gettimeofday(&before_hb);
913
914 o2hb_do_disk_heartbeat(reg);
915
916 do_gettimeofday(&after_hb);
917 elapsed_msec = o2hb_elapsed_msecs(&before_hb, &after_hb);
918
919 mlog(0, "start = %lu.%lu, end = %lu.%lu, msec = %u\n",
920 before_hb.tv_sec, before_hb.tv_usec,
921 after_hb.tv_sec, after_hb.tv_usec, elapsed_msec);
922
/* sleep only for what is left of the interval, if anything */
923 if (elapsed_msec < reg->hr_timeout_ms) {
924 /* the kthread api has blocked signals for us so no
925 * need to record the return value. */
926 msleep_interruptible(reg->hr_timeout_ms - elapsed_msec);
927 }
928 }
929
930 o2hb_disarm_write_timeout(reg);
931
932 /* unclean stop is only used in very bad situation */
933 for(i = 0; !reg->hr_unclean_stop && i < reg->hr_blocks; i++)
934 o2hb_shutdown_slot(&reg->hr_slots[i]);
935
936 /* Explicit down notification - avoid forcing the other nodes
937 * to timeout on this region when we could just as easily
938 * write a clear generation - thus indicating to them that
939 * this node has left this region.
940 *
941 * XXX: Should we skip this on unclean_stop? */
942 o2hb_prepare_block(reg, 0);
943 ret = o2hb_issue_node_write(reg, &write_bio, &write_wc);
944 if (ret == 0) {
945 o2hb_wait_on_io(reg, &write_wc);
946 bio_put(write_bio);
947 } else {
948 mlog_errno(ret);
949 }
950
951 mlog(ML_HEARTBEAT|ML_KTHREAD, "hb thread exiting\n");
952
953 return 0;
954}
955
956void o2hb_init(void)
957{
958 int i;
959
960 for (i = 0; i < ARRAY_SIZE(o2hb_callbacks); i++)
961 INIT_LIST_HEAD(&o2hb_callbacks[i].list);
962
963 for (i = 0; i < ARRAY_SIZE(o2hb_live_slots); i++)
964 INIT_LIST_HEAD(&o2hb_live_slots[i]);
965
966 INIT_LIST_HEAD(&o2hb_node_events);
967
968 memset(o2hb_live_node_bitmap, 0, sizeof(o2hb_live_node_bitmap));
969}
970
971/* if we're already in a callback then we're already serialized by the sem */
972static void o2hb_fill_node_map_from_callback(unsigned long *map,
973 unsigned bytes)
974{
975 BUG_ON(bytes < (BITS_TO_LONGS(O2NM_MAX_NODES) * sizeof(unsigned long)));
976
977 memcpy(map, &o2hb_live_node_bitmap, bytes);
978}
979
980/*
981 * get a map of all nodes that are heartbeating in any regions
982 */
/* Exported entry point: copies the live node bitmap into 'map' ('bytes'
 * long) while holding o2hb_callback_sem for read and o2hb_live_lock. */
983void o2hb_fill_node_map(unsigned long *map, unsigned bytes)
984{
985 /* callers want to serialize this map and callbacks so that they
986 * can trust that they don't miss nodes coming to the party */
987 down_read(&o2hb_callback_sem);
988 spin_lock(&o2hb_live_lock);
989 o2hb_fill_node_map_from_callback(map, bytes);
990 spin_unlock(&o2hb_live_lock);
991 up_read(&o2hb_callback_sem);
992}
993EXPORT_SYMBOL_GPL(o2hb_fill_node_map);
994
995/*
996 * heartbeat configfs bits. The heartbeat set is a default set under
997 * the cluster set in nodemanager.c.
998 */
999
1000static struct o2hb_region *to_o2hb_region(struct config_item *item)
1001{
1002 return item ? container_of(item, struct o2hb_region, hr_item) : NULL;
1003}
1004
1005/* drop_item only drops its ref after killing the thread, nothing should
1006 * be using the region anymore. this has to clean up any state that
1007 * attributes might have built up. */
1008static void o2hb_region_release(struct config_item *item)
1009{
1010 int i;
1011 struct page *page;
1012 struct o2hb_region *reg = to_o2hb_region(item);
1013
1014 if (reg->hr_tmp_block)
1015 kfree(reg->hr_tmp_block);
1016
1017 if (reg->hr_slot_data) {
1018 for (i = 0; i < reg->hr_num_pages; i++) {
1019 page = reg->hr_slot_data[i];
1020 if (page)
1021 __free_page(page);
1022 }
1023 kfree(reg->hr_slot_data);
1024 }
1025
1026 if (reg->hr_bdev)
1027 blkdev_put(reg->hr_bdev);
1028
1029 if (reg->hr_slots)
1030 kfree(reg->hr_slots);
1031
1032 spin_lock(&o2hb_live_lock);
1033 list_del(&reg->hr_all_item);
1034 spin_unlock(&o2hb_live_lock);
1035
1036 kfree(reg);
1037}
1038
1039static int o2hb_read_block_input(struct o2hb_region *reg,
1040 const char *page,
1041 size_t count,
1042 unsigned long *ret_bytes,
1043 unsigned int *ret_bits)
1044{
1045 unsigned long bytes;
1046 char *p = (char *)page;
1047
1048 bytes = simple_strtoul(p, &p, 0);
1049 if (!p || (*p && (*p != '\n')))
1050 return -EINVAL;
1051
1052 /* Heartbeat and fs min / max block sizes are the same. */
1053 if (bytes > 4096 || bytes < 512)
1054 return -ERANGE;
1055 if (hweight16(bytes) != 1)
1056 return -EINVAL;
1057
1058 if (ret_bytes)
1059 *ret_bytes = bytes;
1060 if (ret_bits)
1061 *ret_bits = ffs(bytes) - 1;
1062
1063 return 0;
1064}
1065
1066static ssize_t o2hb_region_block_bytes_read(struct o2hb_region *reg,
1067 char *page)
1068{
1069 return sprintf(page, "%u\n", reg->hr_block_bytes);
1070}
1071
1072static ssize_t o2hb_region_block_bytes_write(struct o2hb_region *reg,
1073 const char *page,
1074 size_t count)
1075{
1076 int status;
1077 unsigned long block_bytes;
1078 unsigned int block_bits;
1079
1080 if (reg->hr_bdev)
1081 return -EINVAL;
1082
1083 status = o2hb_read_block_input(reg, page, count,
1084 &block_bytes, &block_bits);
1085 if (status)
1086 return status;
1087
1088 reg->hr_block_bytes = (unsigned int)block_bytes;
1089 reg->hr_block_bits = block_bits;
1090
1091 return count;
1092}
1093
1094static ssize_t o2hb_region_start_block_read(struct o2hb_region *reg,
1095 char *page)
1096{
1097 return sprintf(page, "%llu\n", reg->hr_start_block);
1098}
1099
1100static ssize_t o2hb_region_start_block_write(struct o2hb_region *reg,
1101 const char *page,
1102 size_t count)
1103{
1104 unsigned long long tmp;
1105 char *p = (char *)page;
1106
1107 if (reg->hr_bdev)
1108 return -EINVAL;
1109
1110 tmp = simple_strtoull(p, &p, 0);
1111 if (!p || (*p && (*p != '\n')))
1112 return -EINVAL;
1113
1114 reg->hr_start_block = tmp;
1115
1116 return count;
1117}
1118
1119static ssize_t o2hb_region_blocks_read(struct o2hb_region *reg,
1120 char *page)
1121{
1122 return sprintf(page, "%d\n", reg->hr_blocks);
1123}
1124
1125static ssize_t o2hb_region_blocks_write(struct o2hb_region *reg,
1126 const char *page,
1127 size_t count)
1128{
1129 unsigned long tmp;
1130 char *p = (char *)page;
1131
1132 if (reg->hr_bdev)
1133 return -EINVAL;
1134
1135 tmp = simple_strtoul(p, &p, 0);
1136 if (!p || (*p && (*p != '\n')))
1137 return -EINVAL;
1138
1139 if (tmp > O2NM_MAX_NODES || tmp == 0)
1140 return -ERANGE;
1141
1142 reg->hr_blocks = (unsigned int)tmp;
1143
1144 return count;
1145}
1146
1147static ssize_t o2hb_region_dev_read(struct o2hb_region *reg,
1148 char *page)
1149{
1150 unsigned int ret = 0;
1151
1152 if (reg->hr_bdev)
1153 ret = sprintf(page, "%s\n", reg->hr_dev_name);
1154
1155 return ret;
1156}
1157
1158static void o2hb_init_region_params(struct o2hb_region *reg)
1159{
1160 reg->hr_slots_per_page = PAGE_CACHE_SIZE >> reg->hr_block_bits;
1161 reg->hr_timeout_ms = O2HB_REGION_TIMEOUT_MS;
1162
1163 mlog(ML_HEARTBEAT, "hr_start_block = %llu, hr_blocks = %u\n",
1164 reg->hr_start_block, reg->hr_blocks);
1165 mlog(ML_HEARTBEAT, "hr_block_bytes = %u, hr_block_bits = %u\n",
1166 reg->hr_block_bytes, reg->hr_block_bits);
1167 mlog(ML_HEARTBEAT, "hr_timeout_ms = %u\n", reg->hr_timeout_ms);
1168 mlog(ML_HEARTBEAT, "dead threshold = %u\n", o2hb_dead_threshold);
1169}
1170
1171static int o2hb_map_slot_data(struct o2hb_region *reg)
1172{
1173 int i, j;
1174 unsigned int last_slot;
1175 unsigned int spp = reg->hr_slots_per_page;
1176 struct page *page;
1177 char *raw;
1178 struct o2hb_disk_slot *slot;
1179
1180 reg->hr_tmp_block = kmalloc(reg->hr_block_bytes, GFP_KERNEL);
1181 if (reg->hr_tmp_block == NULL) {
1182 mlog_errno(-ENOMEM);
1183 return -ENOMEM;
1184 }
1185
1186 reg->hr_slots = kcalloc(reg->hr_blocks,
1187 sizeof(struct o2hb_disk_slot), GFP_KERNEL);
1188 if (reg->hr_slots == NULL) {
1189 mlog_errno(-ENOMEM);
1190 return -ENOMEM;
1191 }
1192
1193 for(i = 0; i < reg->hr_blocks; i++) {
1194 slot = &reg->hr_slots[i];
1195 slot->ds_node_num = i;
1196 INIT_LIST_HEAD(&slot->ds_live_item);
1197 slot->ds_raw_block = NULL;
1198 }
1199
1200 reg->hr_num_pages = (reg->hr_blocks + spp - 1) / spp;
1201 mlog(ML_HEARTBEAT, "Going to require %u pages to cover %u blocks "
1202 "at %u blocks per page\n",
1203 reg->hr_num_pages, reg->hr_blocks, spp);
1204
1205 reg->hr_slot_data = kcalloc(reg->hr_num_pages, sizeof(struct page *),
1206 GFP_KERNEL);
1207 if (!reg->hr_slot_data) {
1208 mlog_errno(-ENOMEM);
1209 return -ENOMEM;
1210 }
1211
1212 for(i = 0; i < reg->hr_num_pages; i++) {
1213 page = alloc_page(GFP_KERNEL);
1214 if (!page) {
1215 mlog_errno(-ENOMEM);
1216 return -ENOMEM;
1217 }
1218
1219 reg->hr_slot_data[i] = page;
1220
1221 last_slot = i * spp;
1222 raw = page_address(page);
1223 for (j = 0;
1224 (j < spp) && ((j + last_slot) < reg->hr_blocks);
1225 j++) {
1226 BUG_ON((j + last_slot) >= reg->hr_blocks);
1227
1228 slot = &reg->hr_slots[j + last_slot];
1229 slot->ds_raw_block =
1230 (struct o2hb_disk_heartbeat_block *) raw;
1231
1232 raw += reg->hr_block_bytes;
1233 }
1234 }
1235
1236 return 0;
1237}
1238
1239/* Read in all the slots available and populate the tracking
1240 * structures so that we can start with a baseline idea of what's
1241 * there. */
1242static int o2hb_populate_slot_data(struct o2hb_region *reg)
1243{
1244 int ret, i;
1245 struct o2hb_disk_slot *slot;
1246 struct o2hb_disk_heartbeat_block *hb_block;
1247
1248 mlog_entry_void();
1249
1250 ret = o2hb_read_slots(reg, reg->hr_blocks);
1251 if (ret) {
1252 mlog_errno(ret);
1253 goto out;
1254 }
1255
1256 /* We only want to get an idea of the values initially in each
1257 * slot, so we do no verification - o2hb_check_slot will
1258 * actually determine if each configured slot is valid and
1259 * whether any values have changed. */
1260 for(i = 0; i < reg->hr_blocks; i++) {
1261 slot = &reg->hr_slots[i];
1262 hb_block = (struct o2hb_disk_heartbeat_block *) slot->ds_raw_block;
1263
1264 /* Only fill the values that o2hb_check_slot uses to
1265 * determine changing slots */
1266 slot->ds_last_time = le64_to_cpu(hb_block->hb_seq);
1267 slot->ds_last_generation = le64_to_cpu(hb_block->hb_generation);
1268 }
1269
1270out:
1271 mlog_exit(ret);
1272 return ret;
1273}
1274
1275/* this is acting as commit; we set up all of hr_bdev and hr_task or nothing */
1276static ssize_t o2hb_region_dev_write(struct o2hb_region *reg,
1277 const char *page,
1278 size_t count)
1279{
1280 long fd;
1281 int sectsize;
1282 char *p = (char *)page;
1283 struct file *filp = NULL;
1284 struct inode *inode = NULL;
1285 ssize_t ret = -EINVAL;
1286
1287 if (reg->hr_bdev)
1288 goto out;
1289
1290 /* We can't heartbeat without having had our node number
1291 * configured yet. */
1292 if (o2nm_this_node() == O2NM_MAX_NODES)
1293 goto out;
1294
1295 fd = simple_strtol(p, &p, 0);
1296 if (!p || (*p && (*p != '\n')))
1297 goto out;
1298
1299 if (fd < 0 || fd >= INT_MAX)
1300 goto out;
1301
1302 filp = fget(fd);
1303 if (filp == NULL)
1304 goto out;
1305
1306 if (reg->hr_blocks == 0 || reg->hr_start_block == 0 ||
1307 reg->hr_block_bytes == 0)
1308 goto out;
1309
1310 inode = igrab(filp->f_mapping->host);
1311 if (inode == NULL)
1312 goto out;
1313
1314 if (!S_ISBLK(inode->i_mode))
1315 goto out;
1316
1317 reg->hr_bdev = I_BDEV(filp->f_mapping->host);
1318 ret = blkdev_get(reg->hr_bdev, FMODE_WRITE | FMODE_READ, 0);
1319 if (ret) {
1320 reg->hr_bdev = NULL;
1321 goto out;
1322 }
1323 inode = NULL;
1324
1325 bdevname(reg->hr_bdev, reg->hr_dev_name);
1326
1327 sectsize = bdev_hardsect_size(reg->hr_bdev);
1328 if (sectsize != reg->hr_block_bytes) {
1329 mlog(ML_ERROR,
1330 "blocksize %u incorrect for device, expected %d",
1331 reg->hr_block_bytes, sectsize);
1332 ret = -EINVAL;
1333 goto out;
1334 }
1335
1336 o2hb_init_region_params(reg);
1337
1338 /* Generation of zero is invalid */
1339 do {
1340 get_random_bytes(&reg->hr_generation,
1341 sizeof(reg->hr_generation));
1342 } while (reg->hr_generation == 0);
1343
1344 ret = o2hb_map_slot_data(reg);
1345 if (ret) {
1346 mlog_errno(ret);
1347 goto out;
1348 }
1349
1350 ret = o2hb_populate_slot_data(reg);
1351 if (ret) {
1352 mlog_errno(ret);
1353 goto out;
1354 }
1355
1356 INIT_WORK(&reg->hr_write_timeout_work, o2hb_write_timeout, reg);
1357
1358 /*
1359 * A node is considered live after it has beat LIVE_THRESHOLD
1360 * times. We're not steady until we've given them a chance
1361 * _after_ our first read.
1362 */
1363 atomic_set(&reg->hr_steady_iterations, O2HB_LIVE_THRESHOLD + 1);
1364
1365 reg->hr_task = kthread_run(o2hb_thread, reg, "o2hb-%s",
1366 reg->hr_item.ci_name);
1367 if (IS_ERR(reg->hr_task)) {
1368 ret = PTR_ERR(reg->hr_task);
1369 mlog_errno(ret);
1370 reg->hr_task = NULL;
1371 goto out;
1372 }
1373
1374 ret = wait_event_interruptible(o2hb_steady_queue,
1375 atomic_read(&reg->hr_steady_iterations) == 0);
1376 if (ret) {
1377 kthread_stop(reg->hr_task);
1378 reg->hr_task = NULL;
1379 goto out;
1380 }
1381
1382 ret = count;
1383out:
1384 if (filp)
1385 fput(filp);
1386 if (inode)
1387 iput(inode);
1388 if (ret < 0) {
1389 if (reg->hr_bdev) {
1390 blkdev_put(reg->hr_bdev);
1391 reg->hr_bdev = NULL;
1392 }
1393 }
1394 return ret;
1395}
1396
1397struct o2hb_region_attribute {
1398 struct configfs_attribute attr;
1399 ssize_t (*show)(struct o2hb_region *, char *);
1400 ssize_t (*store)(struct o2hb_region *, const char *, size_t);
1401};
1402
1403static struct o2hb_region_attribute o2hb_region_attr_block_bytes = {
1404 .attr = { .ca_owner = THIS_MODULE,
1405 .ca_name = "block_bytes",
1406 .ca_mode = S_IRUGO | S_IWUSR },
1407 .show = o2hb_region_block_bytes_read,
1408 .store = o2hb_region_block_bytes_write,
1409};
1410
1411static struct o2hb_region_attribute o2hb_region_attr_start_block = {
1412 .attr = { .ca_owner = THIS_MODULE,
1413 .ca_name = "start_block",
1414 .ca_mode = S_IRUGO | S_IWUSR },
1415 .show = o2hb_region_start_block_read,
1416 .store = o2hb_region_start_block_write,
1417};
1418
1419static struct o2hb_region_attribute o2hb_region_attr_blocks = {
1420 .attr = { .ca_owner = THIS_MODULE,
1421 .ca_name = "blocks",
1422 .ca_mode = S_IRUGO | S_IWUSR },
1423 .show = o2hb_region_blocks_read,
1424 .store = o2hb_region_blocks_write,
1425};
1426
1427static struct o2hb_region_attribute o2hb_region_attr_dev = {
1428 .attr = { .ca_owner = THIS_MODULE,
1429 .ca_name = "dev",
1430 .ca_mode = S_IRUGO | S_IWUSR },
1431 .show = o2hb_region_dev_read,
1432 .store = o2hb_region_dev_write,
1433};
1434
1435static struct configfs_attribute *o2hb_region_attrs[] = {
1436 &o2hb_region_attr_block_bytes.attr,
1437 &o2hb_region_attr_start_block.attr,
1438 &o2hb_region_attr_blocks.attr,
1439 &o2hb_region_attr_dev.attr,
1440 NULL,
1441};
1442
1443static ssize_t o2hb_region_show(struct config_item *item,
1444 struct configfs_attribute *attr,
1445 char *page)
1446{
1447 struct o2hb_region *reg = to_o2hb_region(item);
1448 struct o2hb_region_attribute *o2hb_region_attr =
1449 container_of(attr, struct o2hb_region_attribute, attr);
1450 ssize_t ret = 0;
1451
1452 if (o2hb_region_attr->show)
1453 ret = o2hb_region_attr->show(reg, page);
1454 return ret;
1455}
1456
1457static ssize_t o2hb_region_store(struct config_item *item,
1458 struct configfs_attribute *attr,
1459 const char *page, size_t count)
1460{
1461 struct o2hb_region *reg = to_o2hb_region(item);
1462 struct o2hb_region_attribute *o2hb_region_attr =
1463 container_of(attr, struct o2hb_region_attribute, attr);
1464 ssize_t ret = -EINVAL;
1465
1466 if (o2hb_region_attr->store)
1467 ret = o2hb_region_attr->store(reg, page, count);
1468 return ret;
1469}
1470
1471static struct configfs_item_operations o2hb_region_item_ops = {
1472 .release = o2hb_region_release,
1473 .show_attribute = o2hb_region_show,
1474 .store_attribute = o2hb_region_store,
1475};
1476
1477static struct config_item_type o2hb_region_type = {
1478 .ct_item_ops = &o2hb_region_item_ops,
1479 .ct_attrs = o2hb_region_attrs,
1480 .ct_owner = THIS_MODULE,
1481};
1482
1483/* heartbeat set */
1484
1485struct o2hb_heartbeat_group {
1486 struct config_group hs_group;
1487 /* some stuff? */
1488};
1489
1490static struct o2hb_heartbeat_group *to_o2hb_heartbeat_group(struct config_group *group)
1491{
1492 return group ?
1493 container_of(group, struct o2hb_heartbeat_group, hs_group)
1494 : NULL;
1495}
1496
1497static struct config_item *o2hb_heartbeat_group_make_item(struct config_group *group,
1498 const char *name)
1499{
1500 struct o2hb_region *reg = NULL;
1501 struct config_item *ret = NULL;
1502
1503 reg = kcalloc(1, sizeof(struct o2hb_region), GFP_KERNEL);
1504 if (reg == NULL)
1505 goto out; /* ENOMEM */
1506
1507 config_item_init_type_name(&reg->hr_item, name, &o2hb_region_type);
1508
1509 ret = &reg->hr_item;
1510
1511 spin_lock(&o2hb_live_lock);
1512 list_add_tail(&reg->hr_all_item, &o2hb_all_regions);
1513 spin_unlock(&o2hb_live_lock);
1514out:
1515 if (ret == NULL)
1516 kfree(reg);
1517
1518 return ret;
1519}
1520
1521static void o2hb_heartbeat_group_drop_item(struct config_group *group,
1522 struct config_item *item)
1523{
1524 struct o2hb_region *reg = to_o2hb_region(item);
1525
1526 /* stop the thread when the user removes the region dir */
1527 if (reg->hr_task) {
1528 kthread_stop(reg->hr_task);
1529 reg->hr_task = NULL;
1530 }
1531
1532 config_item_put(item);
1533}
1534
1535struct o2hb_heartbeat_group_attribute {
1536 struct configfs_attribute attr;
1537 ssize_t (*show)(struct o2hb_heartbeat_group *, char *);
1538 ssize_t (*store)(struct o2hb_heartbeat_group *, const char *, size_t);
1539};
1540
1541static ssize_t o2hb_heartbeat_group_show(struct config_item *item,
1542 struct configfs_attribute *attr,
1543 char *page)
1544{
1545 struct o2hb_heartbeat_group *reg = to_o2hb_heartbeat_group(to_config_group(item));
1546 struct o2hb_heartbeat_group_attribute *o2hb_heartbeat_group_attr =
1547 container_of(attr, struct o2hb_heartbeat_group_attribute, attr);
1548 ssize_t ret = 0;
1549
1550 if (o2hb_heartbeat_group_attr->show)
1551 ret = o2hb_heartbeat_group_attr->show(reg, page);
1552 return ret;
1553}
1554
1555static ssize_t o2hb_heartbeat_group_store(struct config_item *item,
1556 struct configfs_attribute *attr,
1557 const char *page, size_t count)
1558{
1559 struct o2hb_heartbeat_group *reg = to_o2hb_heartbeat_group(to_config_group(item));
1560 struct o2hb_heartbeat_group_attribute *o2hb_heartbeat_group_attr =
1561 container_of(attr, struct o2hb_heartbeat_group_attribute, attr);
1562 ssize_t ret = -EINVAL;
1563
1564 if (o2hb_heartbeat_group_attr->store)
1565 ret = o2hb_heartbeat_group_attr->store(reg, page, count);
1566 return ret;
1567}
1568
1569static ssize_t o2hb_heartbeat_group_threshold_show(struct o2hb_heartbeat_group *group,
1570 char *page)
1571{
1572 return sprintf(page, "%u\n", o2hb_dead_threshold);
1573}
1574
1575static ssize_t o2hb_heartbeat_group_threshold_store(struct o2hb_heartbeat_group *group,
1576 const char *page,
1577 size_t count)
1578{
1579 unsigned long tmp;
1580 char *p = (char *)page;
1581
1582 tmp = simple_strtoul(p, &p, 10);
1583 if (!p || (*p && (*p != '\n')))
1584 return -EINVAL;
1585
1586 /* this will validate ranges for us. */
1587 o2hb_dead_threshold_set((unsigned int) tmp);
1588
1589 return count;
1590}
1591
1592static struct o2hb_heartbeat_group_attribute o2hb_heartbeat_group_attr_threshold = {
1593 .attr = { .ca_owner = THIS_MODULE,
1594 .ca_name = "dead_threshold",
1595 .ca_mode = S_IRUGO | S_IWUSR },
1596 .show = o2hb_heartbeat_group_threshold_show,
1597 .store = o2hb_heartbeat_group_threshold_store,
1598};
1599
1600static struct configfs_attribute *o2hb_heartbeat_group_attrs[] = {
1601 &o2hb_heartbeat_group_attr_threshold.attr,
1602 NULL,
1603};
1604
1605static struct configfs_item_operations o2hb_hearbeat_group_item_ops = {
1606 .show_attribute = o2hb_heartbeat_group_show,
1607 .store_attribute = o2hb_heartbeat_group_store,
1608};
1609
1610static struct configfs_group_operations o2hb_heartbeat_group_group_ops = {
1611 .make_item = o2hb_heartbeat_group_make_item,
1612 .drop_item = o2hb_heartbeat_group_drop_item,
1613};
1614
1615static struct config_item_type o2hb_heartbeat_group_type = {
1616 .ct_group_ops = &o2hb_heartbeat_group_group_ops,
1617 .ct_item_ops = &o2hb_hearbeat_group_item_ops,
1618 .ct_attrs = o2hb_heartbeat_group_attrs,
1619 .ct_owner = THIS_MODULE,
1620};
1621
1622/* this is just here to avoid touching group in heartbeat.h which the
1623 * entire damn world #includes */
1624struct config_group *o2hb_alloc_hb_set(void)
1625{
1626 struct o2hb_heartbeat_group *hs = NULL;
1627 struct config_group *ret = NULL;
1628
1629 hs = kcalloc(1, sizeof(struct o2hb_heartbeat_group), GFP_KERNEL);
1630 if (hs == NULL)
1631 goto out;
1632
1633 config_group_init_type_name(&hs->hs_group, "heartbeat",
1634 &o2hb_heartbeat_group_type);
1635
1636 ret = &hs->hs_group;
1637out:
1638 if (ret == NULL)
1639 kfree(hs);
1640 return ret;
1641}
1642
/* Free a group obtained from o2hb_alloc_hb_set(); NULL is accepted. */
void o2hb_free_hb_set(struct config_group *group)
{
	kfree(to_o2hb_heartbeat_group(group));
}
1648
1649/* hb callback registration and issuing */
1650
1651static struct o2hb_callback *hbcall_from_type(enum o2hb_callback_type type)
1652{
1653 if (type == O2HB_NUM_CB)
1654 return ERR_PTR(-EINVAL);
1655
1656 return &o2hb_callbacks[type];
1657}
1658
1659void o2hb_setup_callback(struct o2hb_callback_func *hc,
1660 enum o2hb_callback_type type,
1661 o2hb_cb_func *func,
1662 void *data,
1663 int priority)
1664{
1665 INIT_LIST_HEAD(&hc->hc_item);
1666 hc->hc_func = func;
1667 hc->hc_data = data;
1668 hc->hc_priority = priority;
1669 hc->hc_type = type;
1670 hc->hc_magic = O2HB_CB_MAGIC;
1671}
1672EXPORT_SYMBOL_GPL(o2hb_setup_callback);
1673
1674int o2hb_register_callback(struct o2hb_callback_func *hc)
1675{
1676 struct o2hb_callback_func *tmp;
1677 struct list_head *iter;
1678 struct o2hb_callback *hbcall;
1679 int ret;
1680
1681 BUG_ON(hc->hc_magic != O2HB_CB_MAGIC);
1682 BUG_ON(!list_empty(&hc->hc_item));
1683
1684 hbcall = hbcall_from_type(hc->hc_type);
1685 if (IS_ERR(hbcall)) {
1686 ret = PTR_ERR(hbcall);
1687 goto out;
1688 }
1689
1690 down_write(&o2hb_callback_sem);
1691
1692 list_for_each(iter, &hbcall->list) {
1693 tmp = list_entry(iter, struct o2hb_callback_func, hc_item);
1694 if (hc->hc_priority < tmp->hc_priority) {
1695 list_add_tail(&hc->hc_item, iter);
1696 break;
1697 }
1698 }
1699 if (list_empty(&hc->hc_item))
1700 list_add_tail(&hc->hc_item, &hbcall->list);
1701
1702 up_write(&o2hb_callback_sem);
1703 ret = 0;
1704out:
1705 mlog(ML_HEARTBEAT, "returning %d on behalf of %p for funcs %p\n",
1706 ret, __builtin_return_address(0), hc);
1707 return ret;
1708}
1709EXPORT_SYMBOL_GPL(o2hb_register_callback);
1710
1711int o2hb_unregister_callback(struct o2hb_callback_func *hc)
1712{
1713 BUG_ON(hc->hc_magic != O2HB_CB_MAGIC);
1714
1715 mlog(ML_HEARTBEAT, "on behalf of %p for funcs %p\n",
1716 __builtin_return_address(0), hc);
1717
1718 if (list_empty(&hc->hc_item))
1719 return 0;
1720
1721 down_write(&o2hb_callback_sem);
1722
1723 list_del_init(&hc->hc_item);
1724
1725 up_write(&o2hb_callback_sem);
1726
1727 return 0;
1728}
1729EXPORT_SYMBOL_GPL(o2hb_unregister_callback);
1730
1731int o2hb_check_node_heartbeating(u8 node_num)
1732{
1733 unsigned long testing_map[BITS_TO_LONGS(O2NM_MAX_NODES)];
1734
1735 o2hb_fill_node_map(testing_map, sizeof(testing_map));
1736 if (!test_bit(node_num, testing_map)) {
1737 mlog(ML_HEARTBEAT,
1738 "node (%u) does not have heartbeating enabled.\n",
1739 node_num);
1740 return 0;
1741 }
1742
1743 return 1;
1744}
1745EXPORT_SYMBOL_GPL(o2hb_check_node_heartbeating);
1746
1747int o2hb_check_node_heartbeating_from_callback(u8 node_num)
1748{
1749 unsigned long testing_map[BITS_TO_LONGS(O2NM_MAX_NODES)];
1750
1751 o2hb_fill_node_map_from_callback(testing_map, sizeof(testing_map));
1752 if (!test_bit(node_num, testing_map)) {
1753 mlog(ML_HEARTBEAT,
1754 "node (%u) does not have heartbeating enabled.\n",
1755 node_num);
1756 return 0;
1757 }
1758
1759 return 1;
1760}
1761EXPORT_SYMBOL_GPL(o2hb_check_node_heartbeating_from_callback);
1762
1763/* Makes sure our local node is configured with a node number, and is
1764 * heartbeating. */
1765int o2hb_check_local_node_heartbeating(void)
1766{
1767 u8 node_num;
1768
1769 /* if this node was set then we have networking */
1770 node_num = o2nm_this_node();
1771 if (node_num == O2NM_MAX_NODES) {
1772 mlog(ML_HEARTBEAT, "this node has not been configured.\n");
1773 return 0;
1774 }
1775
1776 return o2hb_check_node_heartbeating(node_num);
1777}
1778EXPORT_SYMBOL_GPL(o2hb_check_local_node_heartbeating);
1779
1780/*
1781 * this is just a hack until we get the plumbing which flips file systems
1782 * read only and drops the hb ref instead of killing the node dead.
1783 */
1784void o2hb_stop_all_regions(void)
1785{
1786 struct o2hb_region *reg;
1787
1788 mlog(ML_ERROR, "stopping heartbeat on all active regions.\n");
1789
1790 spin_lock(&o2hb_live_lock);
1791
1792 list_for_each_entry(reg, &o2hb_all_regions, hr_all_item)
1793 reg->hr_unclean_stop = 1;
1794
1795 spin_unlock(&o2hb_live_lock);
1796}
1797EXPORT_SYMBOL_GPL(o2hb_stop_all_regions);
diff --git a/fs/ocfs2/cluster/heartbeat.h b/fs/ocfs2/cluster/heartbeat.h
new file mode 100644
index 000000000000..cac6223206a9
--- /dev/null
+++ b/fs/ocfs2/cluster/heartbeat.h
@@ -0,0 +1,82 @@
1/* -*- mode: c; c-basic-offset: 8; -*-
2 * vim: noexpandtab sw=8 ts=8 sts=0:
3 *
4 * heartbeat.h
5 *
6 * Function prototypes
7 *
8 * Copyright (C) 2004 Oracle. All rights reserved.
9 *
10 * This program is free software; you can redistribute it and/or
11 * modify it under the terms of the GNU General Public
12 * License as published by the Free Software Foundation; either
13 * version 2 of the License, or (at your option) any later version.
14 *
15 * This program is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
18 * General Public License for more details.
19 *
20 * You should have received a copy of the GNU General Public
21 * License along with this program; if not, write to the
22 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
23 * Boston, MA 02111-1307, USA.
24 *
25 */
26
27#ifndef O2CLUSTER_HEARTBEAT_H
28#define O2CLUSTER_HEARTBEAT_H
29
30#include "ocfs2_heartbeat.h"
31
/* timeout, in milliseconds, assigned to every region's hr_timeout_ms */
#define O2HB_REGION_TIMEOUT_MS		2000

/* number of changes to be seen as live */
#define O2HB_LIVE_THRESHOLD		2

/* number of equal samples to be seen as dead */
extern unsigned int o2hb_dead_threshold;
#define O2HB_DEFAULT_DEAD_THRESHOLD	7
/* Otherwise MAX_WRITE_TIMEOUT will be zero... */
#define O2HB_MIN_DEAD_THRESHOLD		2
/* region timeout multiplied by one less than the current dead threshold */
#define O2HB_MAX_WRITE_TIMEOUT_MS (O2HB_REGION_TIMEOUT_MS * (o2hb_dead_threshold - 1))

/* stamped into o2hb_callback_func.hc_magic by o2hb_setup_callback() */
#define O2HB_CB_MAGIC			0x51d1e4ec
44
/*
 * callback stuff
 *
 * One callback list exists per type.  O2HB_NUM_CB is not a real type: it
 * counts the types and must stay last.
 */
enum o2hb_callback_type {
	O2HB_NODE_DOWN_CB = 0,
	O2HB_NODE_UP_CB,
	O2HB_NUM_CB
};
51
52struct o2nm_node;
53typedef void (o2hb_cb_func)(struct o2nm_node *, int, void *);
54
55struct o2hb_callback_func {
56 u32 hc_magic;
57 struct list_head hc_item;
58 o2hb_cb_func *hc_func;
59 void *hc_data;
60 int hc_priority;
61 enum o2hb_callback_type hc_type;
62};
63
64struct config_group *o2hb_alloc_hb_set(void);
65void o2hb_free_hb_set(struct config_group *group);
66
67void o2hb_setup_callback(struct o2hb_callback_func *hc,
68 enum o2hb_callback_type type,
69 o2hb_cb_func *func,
70 void *data,
71 int priority);
72int o2hb_register_callback(struct o2hb_callback_func *hc);
73int o2hb_unregister_callback(struct o2hb_callback_func *hc);
74void o2hb_fill_node_map(unsigned long *map,
75 unsigned bytes);
76void o2hb_init(void);
77int o2hb_check_node_heartbeating(u8 node_num);
78int o2hb_check_node_heartbeating_from_callback(u8 node_num);
79int o2hb_check_local_node_heartbeating(void);
80void o2hb_stop_all_regions(void);
81
82#endif /* O2CLUSTER_HEARTBEAT_H */
diff --git a/fs/ocfs2/cluster/masklog.c b/fs/ocfs2/cluster/masklog.c
new file mode 100644
index 000000000000..fd741cea5705
--- /dev/null
+++ b/fs/ocfs2/cluster/masklog.c
@@ -0,0 +1,166 @@
1/* -*- mode: c; c-basic-offset: 8; -*-
2 * vim: noexpandtab sw=8 ts=8 sts=0:
3 *
4 * Copyright (C) 2004, 2005 Oracle. All rights reserved.
5 *
6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public
8 * License as published by the Free Software Foundation; either
9 * version 2 of the License, or (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 * General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public
17 * License along with this program; if not, write to the
18 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
19 * Boston, MA 02111-1307, USA.
20 */
21
22#include <linux/module.h>
23#include <linux/kernel.h>
24#include <linux/proc_fs.h>
25#include <linux/seq_file.h>
26#include <linux/string.h>
27#include <asm/uaccess.h>
28
29#include "masklog.h"
30
31struct mlog_bits mlog_and_bits = MLOG_BITS_RHS(MLOG_INITIAL_AND_MASK);
32EXPORT_SYMBOL_GPL(mlog_and_bits);
33struct mlog_bits mlog_not_bits = MLOG_BITS_RHS(MLOG_INITIAL_NOT_MASK);
34EXPORT_SYMBOL_GPL(mlog_not_bits);
35
36static ssize_t mlog_mask_show(u64 mask, char *buf)
37{
38 char *state;
39
40 if (__mlog_test_u64(mask, mlog_and_bits))
41 state = "allow";
42 else if (__mlog_test_u64(mask, mlog_not_bits))
43 state = "deny";
44 else
45 state = "off";
46
47 return snprintf(buf, PAGE_SIZE, "%s\n", state);
48}
49
50static ssize_t mlog_mask_store(u64 mask, const char *buf, size_t count)
51{
52 if (!strnicmp(buf, "allow", 5)) {
53 __mlog_set_u64(mask, mlog_and_bits);
54 __mlog_clear_u64(mask, mlog_not_bits);
55 } else if (!strnicmp(buf, "deny", 4)) {
56 __mlog_set_u64(mask, mlog_not_bits);
57 __mlog_clear_u64(mask, mlog_and_bits);
58 } else if (!strnicmp(buf, "off", 3)) {
59 __mlog_clear_u64(mask, mlog_not_bits);
60 __mlog_clear_u64(mask, mlog_and_bits);
61 } else
62 return -EINVAL;
63
64 return count;
65}
66
67struct mlog_attribute {
68 struct attribute attr;
69 u64 mask;
70};
71
72#define to_mlog_attr(_attr) container_of(_attr, struct mlog_attribute, attr)
73
74#define define_mask(_name) { \
75 .attr = { \
76 .name = #_name, \
77 .mode = S_IRUGO | S_IWUSR, \
78 }, \
79 .mask = ML_##_name, \
80}
81
82static struct mlog_attribute mlog_attrs[MLOG_MAX_BITS] = {
83 define_mask(ENTRY),
84 define_mask(EXIT),
85 define_mask(TCP),
86 define_mask(MSG),
87 define_mask(SOCKET),
88 define_mask(HEARTBEAT),
89 define_mask(HB_BIO),
90 define_mask(DLMFS),
91 define_mask(DLM),
92 define_mask(DLM_DOMAIN),
93 define_mask(DLM_THREAD),
94 define_mask(DLM_MASTER),
95 define_mask(DLM_RECOVERY),
96 define_mask(AIO),
97 define_mask(JOURNAL),
98 define_mask(DISK_ALLOC),
99 define_mask(SUPER),
100 define_mask(FILE_IO),
101 define_mask(EXTENT_MAP),
102 define_mask(DLM_GLUE),
103 define_mask(BH_IO),
104 define_mask(UPTODATE),
105 define_mask(NAMEI),
106 define_mask(INODE),
107 define_mask(VOTE),
108 define_mask(DCACHE),
109 define_mask(CONN),
110 define_mask(QUORUM),
111 define_mask(EXPORT),
112 define_mask(ERROR),
113 define_mask(NOTICE),
114 define_mask(KTHREAD),
115};
116
117static struct attribute *mlog_attr_ptrs[MLOG_MAX_BITS] = {NULL, };
118
119static ssize_t mlog_show(struct kobject *obj, struct attribute *attr,
120 char *buf)
121{
122 struct mlog_attribute *mlog_attr = to_mlog_attr(attr);
123
124 return mlog_mask_show(mlog_attr->mask, buf);
125}
126
127static ssize_t mlog_store(struct kobject *obj, struct attribute *attr,
128 const char *buf, size_t count)
129{
130 struct mlog_attribute *mlog_attr = to_mlog_attr(attr);
131
132 return mlog_mask_store(mlog_attr->mask, buf, count);
133}
134
135static struct sysfs_ops mlog_attr_ops = {
136 .show = mlog_show,
137 .store = mlog_store,
138};
139
140static struct kobj_type mlog_ktype = {
141 .default_attrs = mlog_attr_ptrs,
142 .sysfs_ops = &mlog_attr_ops,
143};
144
145static struct kset mlog_kset = {
146 .kobj = {.name = "logmask", .ktype = &mlog_ktype},
147};
148
149int mlog_sys_init(struct subsystem *o2cb_subsys)
150{
151 int i = 0;
152
153 while (mlog_attrs[i].attr.mode) {
154 mlog_attr_ptrs[i] = &mlog_attrs[i].attr;
155 i++;
156 }
157 mlog_attr_ptrs[i] = NULL;
158
159 mlog_kset.subsys = o2cb_subsys;
160 return kset_register(&mlog_kset);
161}
162
163void mlog_sys_shutdown(void)
164{
165 kset_unregister(&mlog_kset);
166}
diff --git a/fs/ocfs2/cluster/masklog.h b/fs/ocfs2/cluster/masklog.h
new file mode 100644
index 000000000000..f5ef5ea61a05
--- /dev/null
+++ b/fs/ocfs2/cluster/masklog.h
@@ -0,0 +1,275 @@
1/* -*- mode: c; c-basic-offset: 8; -*-
2 * vim: noexpandtab sw=8 ts=8 sts=0:
3 *
4 * Copyright (C) 2005 Oracle. All rights reserved.
5 *
6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public
8 * License as published by the Free Software Foundation; either
9 * version 2 of the License, or (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 * General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public
17 * License along with this program; if not, write to the
18 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
19 * Boston, MA 021110-1307, USA.
20 */
21
22#ifndef O2CLUSTER_MASKLOG_H
23#define O2CLUSTER_MASKLOG_H
24
25/*
26 * For now this is a trivial wrapper around printk() that gives the critical
27 * ability to enable sets of debugging output at run-time. In the future this
28 * will almost certainly be redirected to relayfs so that it can pay a
29 * substantially lower heisenberg tax.
30 *
31 * Callers associate the message with a bitmask and a global bitmask is
32 * maintained with help from /proc. If any of the bits match the message is
33 * output.
34 *
35 * We must have efficient bit tests on i386 and it seems gcc still emits crazy
36 * code for the 64bit compare. It emits very good code for the dual unsigned
37 * long tests, though, completely avoiding tests that can never pass if the
38 * caller gives a constant bitmask that fills one of the longs with all 0s. So
39 * the desire is to have almost all of the calls decided on by comparing just
40 * one of the longs. This leads to having infrequently given bits that are
41 * frequently matched in the high bits.
42 *
43 * _ERROR and _NOTICE are used for messages that always go to the console and
44 * have appropriate KERN_ prefixes. We wrap these in our function instead of
45 * just calling printk() so that this can eventually make its way through
46 * relayfs along with the debugging messages. Everything else gets KERN_INFO.
47 * The inline tests and macro dance give GCC the opportunity to quite cleverly
48 * only emit the appropriate printk() when the caller passes in a constant
49 * mask, as is almost always the case.
50 *
51 * All this bitmask nonsense is hidden from the /proc interface so that Joel
52 * doesn't have an aneurism. Reading the file gives a straight forward
53 * indication of which bits are on or off:
54 * ENTRY off
55 * EXIT off
56 * TCP off
57 * MSG off
58 * SOCKET off
59 * ERROR off
60 * NOTICE on
61 *
62 * Writing changes the state of a given bit and requires a strictly formatted
63 * single write() call:
64 *
65 * write(fd, "ENTRY on", 8);
66 *
67 * would turn the entry bit on. "1" is also accepted in the place of "on", and
68 * "off" and "0" behave as expected.
69 *
70 * Some trivial shell can flip all the bits on or off:
71 *
72 * log_mask="/proc/fs/ocfs2_nodemanager/log_mask"
73 * cat $log_mask | (
74 * while read bit status; do
75 * # $1 is "on" or "off", say
76 * echo "$bit $1" > $log_mask
77 * done
78 * )
79 */
80
81/* for task_struct */
82#include <linux/sched.h>
83
/* bits that are frequently given and infrequently matched in the low word */
/* NOTE: If you add a flag, you need to also update masklog.c! */
#define ML_ENTRY	0x0000000000000001ULL /* func call entry */
#define ML_EXIT		0x0000000000000002ULL /* func call exit */
#define ML_TCP		0x0000000000000004ULL /* net cluster/tcp.c */
#define ML_MSG		0x0000000000000008ULL /* net network messages */
#define ML_SOCKET	0x0000000000000010ULL /* net socket lifetime */
#define ML_HEARTBEAT	0x0000000000000020ULL /* hb all heartbeat tracking */
#define ML_HB_BIO	0x0000000000000040ULL /* hb io tracing */
#define ML_DLMFS	0x0000000000000080ULL /* dlm user dlmfs */
#define ML_DLM		0x0000000000000100ULL /* dlm general debugging */
#define ML_DLM_DOMAIN	0x0000000000000200ULL /* dlm domain debugging */
#define ML_DLM_THREAD	0x0000000000000400ULL /* dlm domain thread */
#define ML_DLM_MASTER	0x0000000000000800ULL /* dlm master functions */
#define ML_DLM_RECOVERY	0x0000000000001000ULL /* dlm recovery functions */
#define ML_AIO		0x0000000000002000ULL /* ocfs2 aio read and write */
#define ML_JOURNAL	0x0000000000004000ULL /* ocfs2 journalling functions */
#define ML_DISK_ALLOC	0x0000000000008000ULL /* ocfs2 disk allocation */
#define ML_SUPER	0x0000000000010000ULL /* ocfs2 mount / umount */
#define ML_FILE_IO	0x0000000000020000ULL /* ocfs2 file I/O */
#define ML_EXTENT_MAP	0x0000000000040000ULL /* ocfs2 extent map caching */
#define ML_DLM_GLUE	0x0000000000080000ULL /* ocfs2 dlm glue layer */
#define ML_BH_IO	0x0000000000100000ULL /* ocfs2 buffer I/O */
#define ML_UPTODATE	0x0000000000200000ULL /* ocfs2 caching sequence #'s */
#define ML_NAMEI	0x0000000000400000ULL /* ocfs2 directory / namespace */
#define ML_INODE	0x0000000000800000ULL /* ocfs2 inode manipulation */
#define ML_VOTE		0x0000000001000000ULL /* ocfs2 node messaging */
#define ML_DCACHE	0x0000000002000000ULL /* ocfs2 dcache operations */
#define ML_CONN		0x0000000004000000ULL /* net connection management */
#define ML_QUORUM	0x0000000008000000ULL /* net connection quorum */
#define ML_EXPORT	0x0000000010000000ULL /* ocfs2 export operations */
/* bits that are infrequently given and frequently matched in the high word */
#define ML_ERROR	0x0000000100000000ULL /* sent to KERN_ERR */
#define ML_NOTICE	0x0000000200000000ULL /* sent to KERN_NOTICE */
#define ML_KTHREAD	0x0000000400000000ULL /* kernel thread activity */

/* bits used to build the initial "and" and "not" masks */
#define MLOG_INITIAL_AND_MASK (ML_ERROR|ML_NOTICE)
#define MLOG_INITIAL_NOT_MASK (ML_ENTRY|ML_EXIT)
/*
 * A file may define MLOG_MASK_PREFIX before including this header; mlog()
 * ORs it into the mask of every message that file emits.
 */
#ifndef MLOG_MASK_PREFIX
#define MLOG_MASK_PREFIX 0
#endif

/* total number of mask bits; sizes struct mlog_bits and the sysfs table */
#define MLOG_MAX_BITS 64
127
128struct mlog_bits {
129 unsigned long words[MLOG_MAX_BITS / BITS_PER_LONG];
130};
131
132extern struct mlog_bits mlog_and_bits, mlog_not_bits;
133
134#if BITS_PER_LONG == 32
135
136#define __mlog_test_u64(mask, bits) \
137 ( (u32)(mask & 0xffffffff) & bits.words[0] || \
138 ((u64)(mask) >> 32) & bits.words[1] )
139#define __mlog_set_u64(mask, bits) do { \
140 bits.words[0] |= (u32)(mask & 0xffffffff); \
141 bits.words[1] |= (u64)(mask) >> 32; \
142} while (0)
143#define __mlog_clear_u64(mask, bits) do { \
144 bits.words[0] &= ~((u32)(mask & 0xffffffff)); \
145 bits.words[1] &= ~((u64)(mask) >> 32); \
146} while (0)
147#define MLOG_BITS_RHS(mask) { \
148 { \
149 [0] = (u32)(mask & 0xffffffff), \
150 [1] = (u64)(mask) >> 32, \
151 } \
152}
153
154#else /* 32bit long above, 64bit long below */
155
156#define __mlog_test_u64(mask, bits) ((mask) & bits.words[0])
157#define __mlog_set_u64(mask, bits) do { \
158 bits.words[0] |= (mask); \
159} while (0)
160#define __mlog_clear_u64(mask, bits) do { \
161 bits.words[0] &= ~(mask); \
162} while (0)
163#define MLOG_BITS_RHS(mask) { { (mask) } }
164
165#endif
166
/*
 * smp_processor_id() "helpfully" screams when called from preemptible
 * code in current kernels.  sles doesn't have the variants that don't
 * scream.  just do this instead of trying to guess which we're building
 * against.. *sigh*.
 *
 * The value is only a guess: the result of get_cpu() is kept after the
 * matching put_cpu().
 */
#define __mlog_cpu_guess ({		\
	unsigned long _cpu = get_cpu();	\
	put_cpu();			\
	_cpu;				\
})
178
/*
 * Prefix every message with "(pid,cpu):function:line ".
 *
 * In this macro and in mlog() below, the whitespace after the ',' just
 * before ##args is intentional.  Otherwise, gcc 2.95 will eat the
 * previous token if args expands to nothing.
 */
#define __mlog_printk(level, fmt, args...)				\
	printk(level "(%u,%lu):%s:%d " fmt,				\
	       current->pid, __mlog_cpu_guess,				\
	       __PRETTY_FUNCTION__, __LINE__ , ##args)
187
/*
 * Emit a message if any bit of (MLOG_MASK_PREFIX | mask) is set in
 * mlog_and_bits and none is set in mlog_not_bits.  ML_ERROR messages go
 * out at KERN_ERR with an "ERROR: " prefix, ML_NOTICE ones at KERN_NOTICE
 * and everything else at KERN_INFO.
 */
#define mlog(mask, fmt, args...) do {					\
	u64 __m = MLOG_MASK_PREFIX | (mask);				\
	if (__mlog_test_u64(__m, mlog_and_bits)) {			\
		if (!__mlog_test_u64(__m, mlog_not_bits)) {		\
			if (__m & ML_ERROR) {				\
				__mlog_printk(KERN_ERR,			\
					      "ERROR: "fmt , ##args);	\
			} else if (__m & ML_NOTICE) {			\
				__mlog_printk(KERN_NOTICE, fmt , ##args); \
			} else {					\
				__mlog_printk(KERN_INFO, fmt , ##args);	\
			}						\
		}							\
	}								\
} while (0)
199
/*
 * Log a status code as an ML_ERROR message, except for -ERESTARTSYS,
 * -EINTR and AOP_TRUNCATED_PAGE, which are skipped.  @st is evaluated
 * exactly once.
 */
#define mlog_errno(st) do {						\
	int _st = (st);							\
	if (!(_st == -ERESTARTSYS || _st == -EINTR ||			\
	      _st == AOP_TRUNCATED_PAGE))				\
		mlog(ML_ERROR, "status = %lld\n", (long long)_st);	\
} while (0)
206
/* function-entry tracing under ML_ENTRY, with and without arguments */
#define mlog_entry(fmt, args...)	mlog(ML_ENTRY, "ENTRY:" fmt , ##args)

#define mlog_entry_void()		mlog(ML_ENTRY, "ENTRY:\n")
214
/*
 * mlog_exit() logs a function's return value under ML_EXIT.  Where the
 * compiler supports __builtin_types_compatible_p the printk format is
 * picked from the static type of @st; older compilers (and sparse) get a
 * version that always prints the value as a long long.
 */
#if (__GNUC__ > 3 || (__GNUC__ == 3 && __GNUC_MINOR__ >= 1)) && \
    !defined(__CHECKER__)
#define __mlog_st_is(st, type)						\
	__builtin_types_compatible_p(typeof(st), type)
#define mlog_exit(st) do {						\
	if (__mlog_st_is(st, unsigned long)) {				\
		mlog(ML_EXIT, "EXIT: %lu\n", (unsigned long) (st));	\
	} else if (__mlog_st_is(st, signed long)) {			\
		mlog(ML_EXIT, "EXIT: %ld\n", (signed long) (st));	\
	} else if (__mlog_st_is(st, unsigned int) ||			\
		   __mlog_st_is(st, unsigned short) ||			\
		   __mlog_st_is(st, unsigned char)) {			\
		mlog(ML_EXIT, "EXIT: %u\n", (unsigned int) (st));	\
	} else if (__mlog_st_is(st, signed int) ||			\
		   __mlog_st_is(st, signed short) ||			\
		   __mlog_st_is(st, signed char)) {			\
		mlog(ML_EXIT, "EXIT: %d\n", (signed int) (st));		\
	} else if (__mlog_st_is(st, long long)) {			\
		mlog(ML_EXIT, "EXIT: %lld\n", (long long) (st));	\
	} else {							\
		mlog(ML_EXIT, "EXIT: %llu\n", (unsigned long long) (st)); \
	}								\
} while (0)
#else
#define mlog_exit(st) do {						\
	mlog(ML_EXIT, "EXIT: %lld\n", (long long) (st));		\
} while (0)
#endif
243
/* function-exit tracing under ML_EXIT for pointer and void returns */
#define mlog_exit_ptr(ptr)	mlog(ML_EXIT, "EXIT: %p\n", ptr)

#define mlog_exit_void()	mlog(ML_EXIT, "EXIT\n")
251
/*
 * BUG() if @cond is true, after logging the text of the failed expression
 * and a caller-supplied message as two ML_ERROR lines.
 */
#define mlog_bug_on_msg(cond, fmt, args...) do {			\
	if (!(cond))							\
		break;							\
	mlog(ML_ERROR, "bug expression: " #cond "\n");			\
	mlog(ML_ERROR, fmt, ##args);					\
	BUG();								\
} while (0)
259
/*
 * printk length modifiers for 64-bit values: the "l" forms are used unless
 * longs are 32 bits wide or we are building for x86-64, in which case the
 * "ll" forms are used.
 */
#if (BITS_PER_LONG != 32) && !defined(CONFIG_X86_64)
#define MLFi64 "ld"
#define MLFu64 "lu"
#define MLFx64 "lx"
#else
#define MLFi64 "lld"
#define MLFu64 "llu"
#define MLFx64 "llx"
#endif
269
270#include <linux/kobject.h>
271#include <linux/sysfs.h>
/* register / unregister the "logmask" sysfs directory (see masklog.c) */
int mlog_sys_init(struct subsystem *o2cb_subsys);
void mlog_sys_shutdown(void);
274
275#endif /* O2CLUSTER_MASKLOG_H */
diff --git a/fs/ocfs2/cluster/nodemanager.c b/fs/ocfs2/cluster/nodemanager.c
new file mode 100644
index 000000000000..5fd60c105913
--- /dev/null
+++ b/fs/ocfs2/cluster/nodemanager.c
@@ -0,0 +1,791 @@
1/* -*- mode: c; c-basic-offset: 8; -*-
2 * vim: noexpandtab sw=8 ts=8 sts=0:
3 *
4 * Copyright (C) 2004, 2005 Oracle. All rights reserved.
5 *
6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public
8 * License as published by the Free Software Foundation; either
9 * version 2 of the License, or (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 * General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public
17 * License along with this program; if not, write to the
18 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
19 * Boston, MA 021110-1307, USA.
20 */
21
22#include <linux/kernel.h>
23#include <linux/module.h>
24#include <linux/sysctl.h>
25#include <linux/configfs.h>
26
27#include "endian.h"
28#include "tcp.h"
29#include "nodemanager.h"
30#include "heartbeat.h"
31#include "masklog.h"
32#include "sys.h"
33#include "ver.h"
34
/*
 * For now we operate under the assertion that there can be only one
 * cluster active at a time.  Changing this will require trickling
 * cluster references throughout where nodes are looked up.
 */
static struct o2nm_cluster *o2nm_single_cluster;

/* path exported through the "hb_ctl_path" sysctl entry below */
#define OCFS2_MAX_HB_CTL_PATH 256
static char ocfs2_hb_ctl_path[OCFS2_MAX_HB_CTL_PATH] = "/sbin/ocfs2_hb_ctl";
42
43static ctl_table ocfs2_nm_table[] = {
44 {
45 .ctl_name = 1,
46 .procname = "hb_ctl_path",
47 .data = ocfs2_hb_ctl_path,
48 .maxlen = OCFS2_MAX_HB_CTL_PATH,
49 .mode = 0644,
50 .proc_handler = &proc_dostring,
51 .strategy = &sysctl_string,
52 },
53 { .ctl_name = 0 }
54};
55
56static ctl_table ocfs2_mod_table[] = {
57 {
58 .ctl_name = KERN_OCFS2_NM,
59 .procname = "nm",
60 .data = NULL,
61 .maxlen = 0,
62 .mode = 0555,
63 .child = ocfs2_nm_table
64 },
65 { .ctl_name = 0}
66};
67
68static ctl_table ocfs2_kern_table[] = {
69 {
70 .ctl_name = KERN_OCFS2,
71 .procname = "ocfs2",
72 .data = NULL,
73 .maxlen = 0,
74 .mode = 0555,
75 .child = ocfs2_mod_table
76 },
77 { .ctl_name = 0}
78};
79
80static ctl_table ocfs2_root_table[] = {
81 {
82 .ctl_name = CTL_FS,
83 .procname = "fs",
84 .data = NULL,
85 .maxlen = 0,
86 .mode = 0555,
87 .child = ocfs2_kern_table
88 },
89 { .ctl_name = 0 }
90};
91
92static struct ctl_table_header *ocfs2_table_header = NULL;
93
94const char *o2nm_get_hb_ctl_path(void)
95{
96 return ocfs2_hb_ctl_path;
97}
98EXPORT_SYMBOL_GPL(o2nm_get_hb_ctl_path);
99
100struct o2nm_cluster {
101 struct config_group cl_group;
102 unsigned cl_has_local:1;
103 u8 cl_local_node;
104 rwlock_t cl_nodes_lock;
105 struct o2nm_node *cl_nodes[O2NM_MAX_NODES];
106 struct rb_root cl_node_ip_tree;
107 /* this bitmap is part of a hack for disk bitmap.. will go eventually. - zab */
108 unsigned long cl_nodes_bitmap[BITS_TO_LONGS(O2NM_MAX_NODES)];
109};
110
111struct o2nm_node *o2nm_get_node_by_num(u8 node_num)
112{
113 struct o2nm_node *node = NULL;
114
115 if (node_num >= O2NM_MAX_NODES || o2nm_single_cluster == NULL)
116 goto out;
117
118 read_lock(&o2nm_single_cluster->cl_nodes_lock);
119 node = o2nm_single_cluster->cl_nodes[node_num];
120 if (node)
121 config_item_get(&node->nd_item);
122 read_unlock(&o2nm_single_cluster->cl_nodes_lock);
123out:
124 return node;
125}
126EXPORT_SYMBOL_GPL(o2nm_get_node_by_num);
127
128int o2nm_configured_node_map(unsigned long *map, unsigned bytes)
129{
130 struct o2nm_cluster *cluster = o2nm_single_cluster;
131
132 BUG_ON(bytes < (sizeof(cluster->cl_nodes_bitmap)));
133
134 if (cluster == NULL)
135 return -EINVAL;
136
137 read_lock(&cluster->cl_nodes_lock);
138 memcpy(map, cluster->cl_nodes_bitmap, sizeof(cluster->cl_nodes_bitmap));
139 read_unlock(&cluster->cl_nodes_lock);
140
141 return 0;
142}
143EXPORT_SYMBOL_GPL(o2nm_configured_node_map);
144
145static struct o2nm_node *o2nm_node_ip_tree_lookup(struct o2nm_cluster *cluster,
146 __be32 ip_needle,
147 struct rb_node ***ret_p,
148 struct rb_node **ret_parent)
149{
150 struct rb_node **p = &cluster->cl_node_ip_tree.rb_node;
151 struct rb_node *parent = NULL;
152 struct o2nm_node *node, *ret = NULL;
153
154 while (*p) {
155 parent = *p;
156 node = rb_entry(parent, struct o2nm_node, nd_ip_node);
157
158 if (memcmp(&ip_needle, &node->nd_ipv4_address,
159 sizeof(ip_needle)) < 0)
160 p = &(*p)->rb_left;
161 else if (memcmp(&ip_needle, &node->nd_ipv4_address,
162 sizeof(ip_needle)) > 0)
163 p = &(*p)->rb_right;
164 else {
165 ret = node;
166 break;
167 }
168 }
169
170 if (ret_p != NULL)
171 *ret_p = p;
172 if (ret_parent != NULL)
173 *ret_parent = parent;
174
175 return ret;
176}
177
178struct o2nm_node *o2nm_get_node_by_ip(__be32 addr)
179{
180 struct o2nm_node *node = NULL;
181 struct o2nm_cluster *cluster = o2nm_single_cluster;
182
183 if (cluster == NULL)
184 goto out;
185
186 read_lock(&cluster->cl_nodes_lock);
187 node = o2nm_node_ip_tree_lookup(cluster, addr, NULL, NULL);
188 if (node)
189 config_item_get(&node->nd_item);
190 read_unlock(&cluster->cl_nodes_lock);
191
192out:
193 return node;
194}
195EXPORT_SYMBOL_GPL(o2nm_get_node_by_ip);
196
197void o2nm_node_put(struct o2nm_node *node)
198{
199 config_item_put(&node->nd_item);
200}
201EXPORT_SYMBOL_GPL(o2nm_node_put);
202
203void o2nm_node_get(struct o2nm_node *node)
204{
205 config_item_get(&node->nd_item);
206}
207EXPORT_SYMBOL_GPL(o2nm_node_get);
208
209u8 o2nm_this_node(void)
210{
211 u8 node_num = O2NM_MAX_NODES;
212
213 if (o2nm_single_cluster && o2nm_single_cluster->cl_has_local)
214 node_num = o2nm_single_cluster->cl_local_node;
215
216 return node_num;
217}
218EXPORT_SYMBOL_GPL(o2nm_this_node);
219
220/* node configfs bits */
221
222static struct o2nm_cluster *to_o2nm_cluster(struct config_item *item)
223{
224 return item ?
225 container_of(to_config_group(item), struct o2nm_cluster,
226 cl_group)
227 : NULL;
228}
229
230static struct o2nm_node *to_o2nm_node(struct config_item *item)
231{
232 return item ? container_of(item, struct o2nm_node, nd_item) : NULL;
233}
234
235static void o2nm_node_release(struct config_item *item)
236{
237 struct o2nm_node *node = to_o2nm_node(item);
238 kfree(node);
239}
240
241static ssize_t o2nm_node_num_read(struct o2nm_node *node, char *page)
242{
243 return sprintf(page, "%d\n", node->nd_num);
244}
245
246static struct o2nm_cluster *to_o2nm_cluster_from_node(struct o2nm_node *node)
247{
248 /* through the first node_set .parent
249 * mycluster/nodes/mynode == o2nm_cluster->o2nm_node_group->o2nm_node */
250 return to_o2nm_cluster(node->nd_item.ci_parent->ci_parent);
251}
252
/*
 * Indexes into o2nm_node_attrs[]; also used as bit numbers in
 * o2nm_node.nd_set_attributes to record which attributes were written.
 */
enum {
	O2NM_NODE_ATTR_NUM	= 0,
	O2NM_NODE_ATTR_PORT	= 1,
	O2NM_NODE_ATTR_ADDRESS	= 2,
	O2NM_NODE_ATTR_LOCAL	= 3,
};
259
260static ssize_t o2nm_node_num_write(struct o2nm_node *node, const char *page,
261 size_t count)
262{
263 struct o2nm_cluster *cluster = to_o2nm_cluster_from_node(node);
264 unsigned long tmp;
265 char *p = (char *)page;
266
267 tmp = simple_strtoul(p, &p, 0);
268 if (!p || (*p && (*p != '\n')))
269 return -EINVAL;
270
271 if (tmp >= O2NM_MAX_NODES)
272 return -ERANGE;
273
274 /* once we're in the cl_nodes tree networking can look us up by
275 * node number and try to use our address and port attributes
276 * to connect to this node.. make sure that they've been set
277 * before writing the node attribute? */
278 if (!test_bit(O2NM_NODE_ATTR_ADDRESS, &node->nd_set_attributes) ||
279 !test_bit(O2NM_NODE_ATTR_PORT, &node->nd_set_attributes))
280 return -EINVAL; /* XXX */
281
282 write_lock(&cluster->cl_nodes_lock);
283 if (cluster->cl_nodes[tmp])
284 p = NULL;
285 else {
286 cluster->cl_nodes[tmp] = node;
287 node->nd_num = tmp;
288 set_bit(tmp, cluster->cl_nodes_bitmap);
289 }
290 write_unlock(&cluster->cl_nodes_lock);
291 if (p == NULL)
292 return -EEXIST;
293
294 return count;
295}
296static ssize_t o2nm_node_ipv4_port_read(struct o2nm_node *node, char *page)
297{
298 return sprintf(page, "%u\n", ntohs(node->nd_ipv4_port));
299}
300
301static ssize_t o2nm_node_ipv4_port_write(struct o2nm_node *node,
302 const char *page, size_t count)
303{
304 unsigned long tmp;
305 char *p = (char *)page;
306
307 tmp = simple_strtoul(p, &p, 0);
308 if (!p || (*p && (*p != '\n')))
309 return -EINVAL;
310
311 if (tmp == 0)
312 return -EINVAL;
313 if (tmp >= (u16)-1)
314 return -ERANGE;
315
316 node->nd_ipv4_port = htons(tmp);
317
318 return count;
319}
320
321static ssize_t o2nm_node_ipv4_address_read(struct o2nm_node *node, char *page)
322{
323 return sprintf(page, "%u.%u.%u.%u\n", NIPQUAD(node->nd_ipv4_address));
324}
325
326static ssize_t o2nm_node_ipv4_address_write(struct o2nm_node *node,
327 const char *page,
328 size_t count)
329{
330 struct o2nm_cluster *cluster = to_o2nm_cluster_from_node(node);
331 int ret, i;
332 struct rb_node **p, *parent;
333 unsigned int octets[4];
334 __be32 ipv4_addr = 0;
335
336 ret = sscanf(page, "%3u.%3u.%3u.%3u", &octets[3], &octets[2],
337 &octets[1], &octets[0]);
338 if (ret != 4)
339 return -EINVAL;
340
341 for (i = 0; i < ARRAY_SIZE(octets); i++) {
342 if (octets[i] > 255)
343 return -ERANGE;
344 be32_add_cpu(&ipv4_addr, octets[i] << (i * 8));
345 }
346
347 ret = 0;
348 write_lock(&cluster->cl_nodes_lock);
349 if (o2nm_node_ip_tree_lookup(cluster, ipv4_addr, &p, &parent))
350 ret = -EEXIST;
351 else {
352 rb_link_node(&node->nd_ip_node, parent, p);
353 rb_insert_color(&node->nd_ip_node, &cluster->cl_node_ip_tree);
354 }
355 write_unlock(&cluster->cl_nodes_lock);
356 if (ret)
357 return ret;
358
359 memcpy(&node->nd_ipv4_address, &ipv4_addr, sizeof(ipv4_addr));
360
361 return count;
362}
363
364static ssize_t o2nm_node_local_read(struct o2nm_node *node, char *page)
365{
366 return sprintf(page, "%d\n", node->nd_local);
367}
368
369static ssize_t o2nm_node_local_write(struct o2nm_node *node, const char *page,
370 size_t count)
371{
372 struct o2nm_cluster *cluster = to_o2nm_cluster_from_node(node);
373 unsigned long tmp;
374 char *p = (char *)page;
375 ssize_t ret;
376
377 tmp = simple_strtoul(p, &p, 0);
378 if (!p || (*p && (*p != '\n')))
379 return -EINVAL;
380
381 tmp = !!tmp; /* boolean of whether this node wants to be local */
382
383 /* setting local turns on networking rx for now so we require having
384 * set everything else first */
385 if (!test_bit(O2NM_NODE_ATTR_ADDRESS, &node->nd_set_attributes) ||
386 !test_bit(O2NM_NODE_ATTR_NUM, &node->nd_set_attributes) ||
387 !test_bit(O2NM_NODE_ATTR_PORT, &node->nd_set_attributes))
388 return -EINVAL; /* XXX */
389
390 /* the only failure case is trying to set a new local node
391 * when a different one is already set */
392 if (tmp && tmp == cluster->cl_has_local &&
393 cluster->cl_local_node != node->nd_num)
394 return -EBUSY;
395
396 /* bring up the rx thread if we're setting the new local node. */
397 if (tmp && !cluster->cl_has_local) {
398 ret = o2net_start_listening(node);
399 if (ret)
400 return ret;
401 }
402
403 if (!tmp && cluster->cl_has_local &&
404 cluster->cl_local_node == node->nd_num) {
405 o2net_stop_listening(node);
406 cluster->cl_local_node = O2NM_INVALID_NODE_NUM;
407 }
408
409 node->nd_local = tmp;
410 if (node->nd_local) {
411 cluster->cl_has_local = tmp;
412 cluster->cl_local_node = node->nd_num;
413 }
414
415 return count;
416}
417
418struct o2nm_node_attribute {
419 struct configfs_attribute attr;
420 ssize_t (*show)(struct o2nm_node *, char *);
421 ssize_t (*store)(struct o2nm_node *, const char *, size_t);
422};
423
424static struct o2nm_node_attribute o2nm_node_attr_num = {
425 .attr = { .ca_owner = THIS_MODULE,
426 .ca_name = "num",
427 .ca_mode = S_IRUGO | S_IWUSR },
428 .show = o2nm_node_num_read,
429 .store = o2nm_node_num_write,
430};
431
432static struct o2nm_node_attribute o2nm_node_attr_ipv4_port = {
433 .attr = { .ca_owner = THIS_MODULE,
434 .ca_name = "ipv4_port",
435 .ca_mode = S_IRUGO | S_IWUSR },
436 .show = o2nm_node_ipv4_port_read,
437 .store = o2nm_node_ipv4_port_write,
438};
439
440static struct o2nm_node_attribute o2nm_node_attr_ipv4_address = {
441 .attr = { .ca_owner = THIS_MODULE,
442 .ca_name = "ipv4_address",
443 .ca_mode = S_IRUGO | S_IWUSR },
444 .show = o2nm_node_ipv4_address_read,
445 .store = o2nm_node_ipv4_address_write,
446};
447
448static struct o2nm_node_attribute o2nm_node_attr_local = {
449 .attr = { .ca_owner = THIS_MODULE,
450 .ca_name = "local",
451 .ca_mode = S_IRUGO | S_IWUSR },
452 .show = o2nm_node_local_read,
453 .store = o2nm_node_local_write,
454};
455
456static struct configfs_attribute *o2nm_node_attrs[] = {
457 [O2NM_NODE_ATTR_NUM] = &o2nm_node_attr_num.attr,
458 [O2NM_NODE_ATTR_PORT] = &o2nm_node_attr_ipv4_port.attr,
459 [O2NM_NODE_ATTR_ADDRESS] = &o2nm_node_attr_ipv4_address.attr,
460 [O2NM_NODE_ATTR_LOCAL] = &o2nm_node_attr_local.attr,
461 NULL,
462};
463
464static int o2nm_attr_index(struct configfs_attribute *attr)
465{
466 int i;
467 for (i = 0; i < ARRAY_SIZE(o2nm_node_attrs); i++) {
468 if (attr == o2nm_node_attrs[i])
469 return i;
470 }
471 BUG();
472 return 0;
473}
474
475static ssize_t o2nm_node_show(struct config_item *item,
476 struct configfs_attribute *attr,
477 char *page)
478{
479 struct o2nm_node *node = to_o2nm_node(item);
480 struct o2nm_node_attribute *o2nm_node_attr =
481 container_of(attr, struct o2nm_node_attribute, attr);
482 ssize_t ret = 0;
483
484 if (o2nm_node_attr->show)
485 ret = o2nm_node_attr->show(node, page);
486 return ret;
487}
488
489static ssize_t o2nm_node_store(struct config_item *item,
490 struct configfs_attribute *attr,
491 const char *page, size_t count)
492{
493 struct o2nm_node *node = to_o2nm_node(item);
494 struct o2nm_node_attribute *o2nm_node_attr =
495 container_of(attr, struct o2nm_node_attribute, attr);
496 ssize_t ret;
497 int attr_index = o2nm_attr_index(attr);
498
499 if (o2nm_node_attr->store == NULL) {
500 ret = -EINVAL;
501 goto out;
502 }
503
504 if (test_bit(attr_index, &node->nd_set_attributes))
505 return -EBUSY;
506
507 ret = o2nm_node_attr->store(node, page, count);
508 if (ret < count)
509 goto out;
510
511 set_bit(attr_index, &node->nd_set_attributes);
512out:
513 return ret;
514}
515
516static struct configfs_item_operations o2nm_node_item_ops = {
517 .release = o2nm_node_release,
518 .show_attribute = o2nm_node_show,
519 .store_attribute = o2nm_node_store,
520};
521
522static struct config_item_type o2nm_node_type = {
523 .ct_item_ops = &o2nm_node_item_ops,
524 .ct_attrs = o2nm_node_attrs,
525 .ct_owner = THIS_MODULE,
526};
527
528/* node set */
529
530struct o2nm_node_group {
531 struct config_group ns_group;
532 /* some stuff? */
533};
534
535#if 0
536static struct o2nm_node_group *to_o2nm_node_group(struct config_group *group)
537{
538 return group ?
539 container_of(group, struct o2nm_node_group, ns_group)
540 : NULL;
541}
542#endif
543
544static struct config_item *o2nm_node_group_make_item(struct config_group *group,
545 const char *name)
546{
547 struct o2nm_node *node = NULL;
548 struct config_item *ret = NULL;
549
550 if (strlen(name) > O2NM_MAX_NAME_LEN)
551 goto out; /* ENAMETOOLONG */
552
553 node = kcalloc(1, sizeof(struct o2nm_node), GFP_KERNEL);
554 if (node == NULL)
555 goto out; /* ENOMEM */
556
557 strcpy(node->nd_name, name); /* use item.ci_namebuf instead? */
558 config_item_init_type_name(&node->nd_item, name, &o2nm_node_type);
559 spin_lock_init(&node->nd_lock);
560
561 ret = &node->nd_item;
562
563out:
564 if (ret == NULL)
565 kfree(node);
566
567 return ret;
568}
569
570static void o2nm_node_group_drop_item(struct config_group *group,
571 struct config_item *item)
572{
573 struct o2nm_node *node = to_o2nm_node(item);
574 struct o2nm_cluster *cluster = to_o2nm_cluster(group->cg_item.ci_parent);
575
576 o2net_disconnect_node(node);
577
578 if (cluster->cl_has_local &&
579 (cluster->cl_local_node == node->nd_num)) {
580 cluster->cl_has_local = 0;
581 cluster->cl_local_node = O2NM_INVALID_NODE_NUM;
582 o2net_stop_listening(node);
583 }
584
585 /* XXX call into net to stop this node from trading messages */
586
587 write_lock(&cluster->cl_nodes_lock);
588
589 /* XXX sloppy */
590 if (node->nd_ipv4_address)
591 rb_erase(&node->nd_ip_node, &cluster->cl_node_ip_tree);
592
593 /* nd_num might be 0 if the node number hasn't been set.. */
594 if (cluster->cl_nodes[node->nd_num] == node) {
595 cluster->cl_nodes[node->nd_num] = NULL;
596 clear_bit(node->nd_num, cluster->cl_nodes_bitmap);
597 }
598 write_unlock(&cluster->cl_nodes_lock);
599
600 config_item_put(item);
601}
602
603static struct configfs_group_operations o2nm_node_group_group_ops = {
604 .make_item = o2nm_node_group_make_item,
605 .drop_item = o2nm_node_group_drop_item,
606};
607
608static struct config_item_type o2nm_node_group_type = {
609 .ct_group_ops = &o2nm_node_group_group_ops,
610 .ct_owner = THIS_MODULE,
611};
612
613/* cluster */
614
615static void o2nm_cluster_release(struct config_item *item)
616{
617 struct o2nm_cluster *cluster = to_o2nm_cluster(item);
618
619 kfree(cluster->cl_group.default_groups);
620 kfree(cluster);
621}
622
623static struct configfs_item_operations o2nm_cluster_item_ops = {
624 .release = o2nm_cluster_release,
625};
626
627static struct config_item_type o2nm_cluster_type = {
628 .ct_item_ops = &o2nm_cluster_item_ops,
629 .ct_owner = THIS_MODULE,
630};
631
632/* cluster set */
633
634struct o2nm_cluster_group {
635 struct configfs_subsystem cs_subsys;
636 /* some stuff? */
637};
638
639#if 0
640static struct o2nm_cluster_group *to_o2nm_cluster_group(struct config_group *group)
641{
642 return group ?
643 container_of(to_configfs_subsystem(group), struct o2nm_cluster_group, cs_subsys)
644 : NULL;
645}
646#endif
647
648static struct config_group *o2nm_cluster_group_make_group(struct config_group *group,
649 const char *name)
650{
651 struct o2nm_cluster *cluster = NULL;
652 struct o2nm_node_group *ns = NULL;
653 struct config_group *o2hb_group = NULL, *ret = NULL;
654 void *defs = NULL;
655
656 /* this runs under the parent dir's i_sem; there can be only
657 * one caller in here at a time */
658 if (o2nm_single_cluster)
659 goto out; /* ENOSPC */
660
661 cluster = kcalloc(1, sizeof(struct o2nm_cluster), GFP_KERNEL);
662 ns = kcalloc(1, sizeof(struct o2nm_node_group), GFP_KERNEL);
663 defs = kcalloc(3, sizeof(struct config_group *), GFP_KERNEL);
664 o2hb_group = o2hb_alloc_hb_set();
665 if (cluster == NULL || ns == NULL || o2hb_group == NULL || defs == NULL)
666 goto out;
667
668 config_group_init_type_name(&cluster->cl_group, name,
669 &o2nm_cluster_type);
670 config_group_init_type_name(&ns->ns_group, "node",
671 &o2nm_node_group_type);
672
673 cluster->cl_group.default_groups = defs;
674 cluster->cl_group.default_groups[0] = &ns->ns_group;
675 cluster->cl_group.default_groups[1] = o2hb_group;
676 cluster->cl_group.default_groups[2] = NULL;
677 rwlock_init(&cluster->cl_nodes_lock);
678 cluster->cl_node_ip_tree = RB_ROOT;
679
680 ret = &cluster->cl_group;
681 o2nm_single_cluster = cluster;
682
683out:
684 if (ret == NULL) {
685 kfree(cluster);
686 kfree(ns);
687 o2hb_free_hb_set(o2hb_group);
688 kfree(defs);
689 }
690
691 return ret;
692}
693
694static void o2nm_cluster_group_drop_item(struct config_group *group, struct config_item *item)
695{
696 struct o2nm_cluster *cluster = to_o2nm_cluster(item);
697 int i;
698 struct config_item *killme;
699
700 BUG_ON(o2nm_single_cluster != cluster);
701 o2nm_single_cluster = NULL;
702
703 for (i = 0; cluster->cl_group.default_groups[i]; i++) {
704 killme = &cluster->cl_group.default_groups[i]->cg_item;
705 cluster->cl_group.default_groups[i] = NULL;
706 config_item_put(killme);
707 }
708
709 config_item_put(item);
710}
711
712static struct configfs_group_operations o2nm_cluster_group_group_ops = {
713 .make_group = o2nm_cluster_group_make_group,
714 .drop_item = o2nm_cluster_group_drop_item,
715};
716
717static struct config_item_type o2nm_cluster_group_type = {
718 .ct_group_ops = &o2nm_cluster_group_group_ops,
719 .ct_owner = THIS_MODULE,
720};
721
722static struct o2nm_cluster_group o2nm_cluster_group = {
723 .cs_subsys = {
724 .su_group = {
725 .cg_item = {
726 .ci_namebuf = "cluster",
727 .ci_type = &o2nm_cluster_group_type,
728 },
729 },
730 },
731};
732
733static void __exit exit_o2nm(void)
734{
735 if (ocfs2_table_header)
736 unregister_sysctl_table(ocfs2_table_header);
737
738 /* XXX sync with hb callbacks and shut down hb? */
739 o2net_unregister_hb_callbacks();
740 configfs_unregister_subsystem(&o2nm_cluster_group.cs_subsys);
741 o2cb_sys_shutdown();
742
743 o2net_exit();
744}
745
746static int __init init_o2nm(void)
747{
748 int ret = -1;
749
750 cluster_print_version();
751
752 o2hb_init();
753 o2net_init();
754
755 ocfs2_table_header = register_sysctl_table(ocfs2_root_table, 0);
756 if (!ocfs2_table_header) {
757 printk(KERN_ERR "nodemanager: unable to register sysctl\n");
758 ret = -ENOMEM; /* or something. */
759 goto out;
760 }
761
762 ret = o2net_register_hb_callbacks();
763 if (ret)
764 goto out_sysctl;
765
766 config_group_init(&o2nm_cluster_group.cs_subsys.su_group);
767 init_MUTEX(&o2nm_cluster_group.cs_subsys.su_sem);
768 ret = configfs_register_subsystem(&o2nm_cluster_group.cs_subsys);
769 if (ret) {
770 printk(KERN_ERR "nodemanager: Registration returned %d\n", ret);
771 goto out_callbacks;
772 }
773
774 ret = o2cb_sys_init();
775 if (!ret)
776 goto out;
777
778 configfs_unregister_subsystem(&o2nm_cluster_group.cs_subsys);
779out_callbacks:
780 o2net_unregister_hb_callbacks();
781out_sysctl:
782 unregister_sysctl_table(ocfs2_table_header);
783out:
784 return ret;
785}
786
787MODULE_AUTHOR("Oracle");
788MODULE_LICENSE("GPL");
789
790module_init(init_o2nm)
791module_exit(exit_o2nm)
diff --git a/fs/ocfs2/cluster/nodemanager.h b/fs/ocfs2/cluster/nodemanager.h
new file mode 100644
index 000000000000..fce8033c310f
--- /dev/null
+++ b/fs/ocfs2/cluster/nodemanager.h
@@ -0,0 +1,64 @@
1/* -*- mode: c; c-basic-offset: 8; -*-
2 * vim: noexpandtab sw=8 ts=8 sts=0:
3 *
4 * nodemanager.h
5 *
6 * Function prototypes
7 *
8 * Copyright (C) 2004 Oracle. All rights reserved.
9 *
10 * This program is free software; you can redistribute it and/or
11 * modify it under the terms of the GNU General Public
12 * License as published by the Free Software Foundation; either
13 * version 2 of the License, or (at your option) any later version.
14 *
15 * This program is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
18 * General Public License for more details.
19 *
20 * You should have received a copy of the GNU General Public
21 * License along with this program; if not, write to the
22 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
23 * Boston, MA 02111-1307, USA.
24 *
25 */
26
27#ifndef O2CLUSTER_NODEMANAGER_H
28#define O2CLUSTER_NODEMANAGER_H
29
30#include "ocfs2_nodemanager.h"
31
32/* This totally doesn't belong here. */
33#include <linux/configfs.h>
34#include <linux/rbtree.h>
35
36#define KERN_OCFS2 988
37#define KERN_OCFS2_NM 1
38
39const char *o2nm_get_hb_ctl_path(void);
40
41struct o2nm_node {
42 spinlock_t nd_lock;
43 struct config_item nd_item;
44 char nd_name[O2NM_MAX_NAME_LEN+1]; /* replace? */
45 __u8 nd_num;
46 /* only one address per node, as attributes, for now. */
47 __be32 nd_ipv4_address;
48 __be16 nd_ipv4_port;
49 struct rb_node nd_ip_node;
50 /* there can be only one local node for now */
51 int nd_local;
52
53 unsigned long nd_set_attributes;
54};
55
56u8 o2nm_this_node(void);
57
58int o2nm_configured_node_map(unsigned long *map, unsigned bytes);
59struct o2nm_node *o2nm_get_node_by_num(u8 node_num);
60struct o2nm_node *o2nm_get_node_by_ip(__be32 addr);
61void o2nm_node_get(struct o2nm_node *node);
62void o2nm_node_put(struct o2nm_node *node);
63
64#endif /* O2CLUSTER_NODEMANAGER_H */
diff --git a/fs/ocfs2/cluster/ocfs2_heartbeat.h b/fs/ocfs2/cluster/ocfs2_heartbeat.h
new file mode 100644
index 000000000000..94096069cb43
--- /dev/null
+++ b/fs/ocfs2/cluster/ocfs2_heartbeat.h
@@ -0,0 +1,37 @@
1/* -*- mode: c; c-basic-offset: 8; -*-
2 * vim: noexpandtab sw=8 ts=8 sts=0:
3 *
4 * ocfs2_heartbeat.h
5 *
6 * On-disk structures for ocfs2_heartbeat
7 *
8 * Copyright (C) 2002, 2004 Oracle. All rights reserved.
9 *
10 * This program is free software; you can redistribute it and/or
11 * modify it under the terms of the GNU General Public
12 * License as published by the Free Software Foundation; either
13 * version 2 of the License, or (at your option) any later version.
14 *
15 * This program is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
18 * General Public License for more details.
19 *
20 * You should have received a copy of the GNU General Public
21 * License along with this program; if not, write to the
22 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
23 * Boston, MA 02111-1307, USA.
24 */
25
26#ifndef _OCFS2_HEARTBEAT_H
27#define _OCFS2_HEARTBEAT_H
28
29struct o2hb_disk_heartbeat_block {
30 __le64 hb_seq;
31 __u8 hb_node;
32 __u8 hb_pad1[3];
33 __le32 hb_cksum;
34 __le64 hb_generation;
35};
36
37#endif /* _OCFS2_HEARTBEAT_H */
diff --git a/fs/ocfs2/cluster/ocfs2_nodemanager.h b/fs/ocfs2/cluster/ocfs2_nodemanager.h
new file mode 100644
index 000000000000..5b9854bad571
--- /dev/null
+++ b/fs/ocfs2/cluster/ocfs2_nodemanager.h
@@ -0,0 +1,39 @@
1/* -*- mode: c; c-basic-offset: 8; -*-
2 * vim: noexpandtab sw=8 ts=8 sts=0:
3 *
4 * ocfs2_nodemanager.h
5 *
6 * Header describing the interface between userspace and the kernel
7 * for the ocfs2_nodemanager module.
8 *
9 * Copyright (C) 2002, 2004 Oracle. All rights reserved.
10 *
11 * This program is free software; you can redistribute it and/or
12 * modify it under the terms of the GNU General Public
13 * License as published by the Free Software Foundation; either
14 * version 2 of the License, or (at your option) any later version.
15 *
16 * This program is distributed in the hope that it will be useful,
17 * but WITHOUT ANY WARRANTY; without even the implied warranty of
18 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
19 * General Public License for more details.
20 *
21 * You should have received a copy of the GNU General Public
22 * License along with this program; if not, write to the
23 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
23 * Boston, MA 02111-1307, USA.
25 *
26 */
27
28#ifndef _OCFS2_NODEMANAGER_H
29#define _OCFS2_NODEMANAGER_H
30
31#define O2NM_API_VERSION 5
32
33#define O2NM_MAX_NODES 255
34#define O2NM_INVALID_NODE_NUM 255
35
36/* host name, group name, cluster name all 64 bytes */
37#define O2NM_MAX_NAME_LEN 64 // __NEW_UTS_LEN
38
39#endif /* _OCFS2_NODEMANAGER_H */
diff --git a/fs/ocfs2/cluster/quorum.c b/fs/ocfs2/cluster/quorum.c
new file mode 100644
index 000000000000..7bba98fbfc15
--- /dev/null
+++ b/fs/ocfs2/cluster/quorum.c
@@ -0,0 +1,315 @@
1/* -*- mode: c; c-basic-offset: 8; -*-
2 *
3 * vim: noexpandtab sw=8 ts=8 sts=0:
4 *
5 * Copyright (C) 2005 Oracle. All rights reserved.
6 *
7 * This program is free software; you can redistribute it and/or
8 * modify it under the terms of the GNU General Public
9 * License as published by the Free Software Foundation; either
10 * version 2 of the License, or (at your option) any later version.
11 *
12 * This program is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15 * General Public License for more details.
16 *
17 * You should have received a copy of the GNU General Public
18 * License along with this program; if not, write to the
19 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
20 * Boston, MA 02111-1307, USA.
21 */
22
23/* This quorum hack is only here until we transition to some more rational
24 * approach that is driven from userspace. Honest. No foolin'.
25 *
26 * Imagine two nodes lose network connectivity to each other but they're still
27 * up and operating in every other way. Presumably a network timeout indicates
28 * that a node is broken and should be recovered. They can't both recover each
29 * other and both carry on without serialising their access to the file system.
30 * They need to decide who is authoritative. Now extend that problem to
31 * arbitrary groups of nodes losing connectivity between each other.
32 *
33 * So we declare that a node which has given up on connecting to a majority
34 * of nodes who are still heartbeating will fence itself.
35 *
36 * There are huge opportunities for races here. After we give up on a node's
37 * connection we need to wait long enough to give heartbeat an opportunity
38 * to declare the node as truly dead. We also need to be careful with the
39 * race between when we see a node start heartbeating and when we connect
40 * to it.
41 *
42 * So nodes that are in this transition put a hold on the quorum decision
43 * with a counter. As they fall out of this transition they drop the count
44 * and if they're the last, they fire off the decision.
45 */
46#include <linux/kernel.h>
47#include <linux/slab.h>
48#include <linux/workqueue.h>
49
50#include "heartbeat.h"
51#include "nodemanager.h"
52#define MLOG_MASK_PREFIX ML_QUORUM
53#include "masklog.h"
54#include "quorum.h"
55
/*
 * Quorum bookkeeping shared by every o2quo_* function in this file.  There
 * is a single static instance, o2quo_state.
 */
56static struct o2quo_state {
/* held around every read or update of the fields below */
57 spinlock_t qs_lock;
/* runs o2quo_make_decision(); scheduled from o2quo_clear_hold() */
58 struct work_struct qs_work;
/* set by o2quo_hb_still_up(); consumed when the last hold is dropped */
59 int qs_pending;
/* number of nodes currently heartbeating, and which ones */
60 int qs_heartbeating;
61 unsigned long qs_hb_bm[BITS_TO_LONGS(O2NM_MAX_NODES)];
/* number of nodes currently connected, and which ones */
62 int qs_connected;
63 unsigned long qs_conn_bm[BITS_TO_LONGS(O2NM_MAX_NODES)];
/* number of nodes holding off the quorum decision, and which ones */
64 int qs_holds;
65 unsigned long qs_hold_bm[BITS_TO_LONGS(O2NM_MAX_NODES)];
66} o2quo_state;
67
/*
 * Fence this node by calling o2hb_stop_all_regions() and then panicking.
 *
 * This is horribly heavy-handed.  It should instead flip the file system
 * read-only and call some userspace script.
 */
static void o2quo_fence_self(void)
{
	/*
	 * panic() spins with interrupts enabled, and with preempt other
	 * threads can still be scheduled, etc.
	 */
	o2hb_stop_all_regions();
	panic("ocfs2 is very sorry to be fencing this system by panicing\n");
}
77
/*
 * Report that a write to a heartbeat region timed out.
 *
 * The other nodes in the cluster may consider us dead at that point, so we
 * "fence" ourselves to avoid scribbling on the disk after they think they
 * have recovered us.  This can't solve every problem related to writeout
 * after recovery, but the hack at least closes some of those gaps.  Once
 * real fencing exists this can go away, since our node would be fenced
 * externally before other nodes begin recovery.
 */
void o2quo_disk_timeout(void)
{
	o2quo_fence_self();
}
90
91static void o2quo_make_decision(void *arg)
92{
93 int quorum;
94 int lowest_hb, lowest_reachable = 0, fence = 0;
95 struct o2quo_state *qs = &o2quo_state;
96
97 spin_lock(&qs->qs_lock);
98
99 lowest_hb = find_first_bit(qs->qs_hb_bm, O2NM_MAX_NODES);
100 if (lowest_hb != O2NM_MAX_NODES)
101 lowest_reachable = test_bit(lowest_hb, qs->qs_conn_bm);
102
103 mlog(0, "heartbeating: %d, connected: %d, "
104 "lowest: %d (%sreachable)\n", qs->qs_heartbeating,
105 qs->qs_connected, lowest_hb, lowest_reachable ? "" : "un");
106
107 if (!test_bit(o2nm_this_node(), qs->qs_hb_bm) ||
108 qs->qs_heartbeating == 1)
109 goto out;
110
111 if (qs->qs_heartbeating & 1) {
112 /* the odd numbered cluster case is straight forward --
113 * if we can't talk to the majority we're hosed */
114 quorum = (qs->qs_heartbeating + 1)/2;
115 if (qs->qs_connected < quorum) {
116 mlog(ML_ERROR, "fencing this node because it is "
117 "only connected to %u nodes and %u is needed "
118 "to make a quorum out of %u heartbeating nodes\n",
119 qs->qs_connected, quorum,
120 qs->qs_heartbeating);
121 fence = 1;
122 }
123 } else {
124 /* the even numbered cluster adds the possibility of each half
125 * of the cluster being able to talk amongst themselves.. in
126 * that case we're hosed if we can't talk to the group that has
127 * the lowest numbered node */
128 quorum = qs->qs_heartbeating / 2;
129 if (qs->qs_connected < quorum) {
130 mlog(ML_ERROR, "fencing this node because it is "
131 "only connected to %u nodes and %u is needed "
132 "to make a quorum out of %u heartbeating nodes\n",
133 qs->qs_connected, quorum,
134 qs->qs_heartbeating);
135 fence = 1;
136 }
137 else if ((qs->qs_connected == quorum) &&
138 !lowest_reachable) {
139 mlog(ML_ERROR, "fencing this node because it is "
140 "connected to a half-quorum of %u out of %u "
141 "nodes which doesn't include the lowest active "
142 "node %u\n", quorum, qs->qs_heartbeating,
143 lowest_hb);
144 fence = 1;
145 }
146 }
147
148out:
149 spin_unlock(&qs->qs_lock);
150 if (fence)
151 o2quo_fence_self();
152}
153
154static void o2quo_set_hold(struct o2quo_state *qs, u8 node)
155{
156 assert_spin_locked(&qs->qs_lock);
157
158 if (!test_and_set_bit(node, qs->qs_hold_bm)) {
159 qs->qs_holds++;
160 mlog_bug_on_msg(qs->qs_holds == O2NM_MAX_NODES,
161 "node %u\n", node);
162 mlog(0, "node %u, %d total\n", node, qs->qs_holds);
163 }
164}
165
166static void o2quo_clear_hold(struct o2quo_state *qs, u8 node)
167{
168 assert_spin_locked(&qs->qs_lock);
169
170 if (test_and_clear_bit(node, qs->qs_hold_bm)) {
171 mlog(0, "node %u, %d total\n", node, qs->qs_holds - 1);
172 if (--qs->qs_holds == 0) {
173 if (qs->qs_pending) {
174 qs->qs_pending = 0;
175 schedule_work(&qs->qs_work);
176 }
177 }
178 mlog_bug_on_msg(qs->qs_holds < 0, "node %u, holds %d\n",
179 node, qs->qs_holds);
180 }
181}
182
183/* as a node comes up we delay the quorum decision until we know the fate of
184 * the connection. the hold will be droped in conn_up or hb_down. it might be
185 * perpetuated by con_err until hb_down. if we already have a conn, we might
186 * be dropping a hold that conn_up got. */
187void o2quo_hb_up(u8 node)
188{
189 struct o2quo_state *qs = &o2quo_state;
190
191 spin_lock(&qs->qs_lock);
192
193 qs->qs_heartbeating++;
194 mlog_bug_on_msg(qs->qs_heartbeating == O2NM_MAX_NODES,
195 "node %u\n", node);
196 mlog_bug_on_msg(test_bit(node, qs->qs_hb_bm), "node %u\n", node);
197 set_bit(node, qs->qs_hb_bm);
198
199 mlog(0, "node %u, %d total\n", node, qs->qs_heartbeating);
200
201 if (!test_bit(node, qs->qs_conn_bm))
202 o2quo_set_hold(qs, node);
203 else
204 o2quo_clear_hold(qs, node);
205
206 spin_unlock(&qs->qs_lock);
207}
208
209/* hb going down releases any holds we might have had due to this node from
210 * conn_up, conn_err, or hb_up */
211void o2quo_hb_down(u8 node)
212{
213 struct o2quo_state *qs = &o2quo_state;
214
215 spin_lock(&qs->qs_lock);
216
217 qs->qs_heartbeating--;
218 mlog_bug_on_msg(qs->qs_heartbeating < 0,
219 "node %u, %d heartbeating\n",
220 node, qs->qs_heartbeating);
221 mlog_bug_on_msg(!test_bit(node, qs->qs_hb_bm), "node %u\n", node);
222 clear_bit(node, qs->qs_hb_bm);
223
224 mlog(0, "node %u, %d total\n", node, qs->qs_heartbeating);
225
226 o2quo_clear_hold(qs, node);
227
228 spin_unlock(&qs->qs_lock);
229}
230
231/* this tells us that we've decided that the node is still heartbeating
232 * even though we've lost it's conn. it must only be called after conn_err
233 * and indicates that we must now make a quorum decision in the future,
234 * though we might be doing so after waiting for holds to drain. Here
235 * we'll be dropping the hold from conn_err. */
236void o2quo_hb_still_up(u8 node)
237{
238 struct o2quo_state *qs = &o2quo_state;
239
240 spin_lock(&qs->qs_lock);
241
242 mlog(0, "node %u\n", node);
243
244 qs->qs_pending = 1;
245 o2quo_clear_hold(qs, node);
246
247 spin_unlock(&qs->qs_lock);
248}
249
250/* This is analagous to hb_up. as a node's connection comes up we delay the
251 * quorum decision until we see it heartbeating. the hold will be droped in
252 * hb_up or hb_down. it might be perpetuated by con_err until hb_down. if
253 * it's already heartbeating we we might be dropping a hold that conn_up got.
254 * */
255void o2quo_conn_up(u8 node)
256{
257 struct o2quo_state *qs = &o2quo_state;
258
259 spin_lock(&qs->qs_lock);
260
261 qs->qs_connected++;
262 mlog_bug_on_msg(qs->qs_connected == O2NM_MAX_NODES,
263 "node %u\n", node);
264 mlog_bug_on_msg(test_bit(node, qs->qs_conn_bm), "node %u\n", node);
265 set_bit(node, qs->qs_conn_bm);
266
267 mlog(0, "node %u, %d total\n", node, qs->qs_connected);
268
269 if (!test_bit(node, qs->qs_hb_bm))
270 o2quo_set_hold(qs, node);
271 else
272 o2quo_clear_hold(qs, node);
273
274 spin_unlock(&qs->qs_lock);
275}
276
277/* we've decided that we won't ever be connecting to the node again. if it's
278 * still heartbeating we grab a hold that will delay decisions until either the
279 * node stops heartbeating from hb_down or the caller decides that the node is
280 * still up and calls still_up */
281void o2quo_conn_err(u8 node)
282{
283 struct o2quo_state *qs = &o2quo_state;
284
285 spin_lock(&qs->qs_lock);
286
287 if (test_bit(node, qs->qs_conn_bm)) {
288 qs->qs_connected--;
289 mlog_bug_on_msg(qs->qs_connected < 0,
290 "node %u, connected %d\n",
291 node, qs->qs_connected);
292
293 clear_bit(node, qs->qs_conn_bm);
294 }
295
296 mlog(0, "node %u, %d total\n", node, qs->qs_connected);
297
298 if (test_bit(node, qs->qs_hb_bm))
299 o2quo_set_hold(qs, node);
300
301 spin_unlock(&qs->qs_lock);
302}
303
304void o2quo_init(void)
305{
306 struct o2quo_state *qs = &o2quo_state;
307
308 spin_lock_init(&qs->qs_lock);
309 INIT_WORK(&qs->qs_work, o2quo_make_decision, NULL);
310}
311
/* Quorum teardown: flush the shared workqueue that qs_work is queued on. */
void o2quo_exit(void)
{
	flush_scheduled_work();
}
diff --git a/fs/ocfs2/cluster/quorum.h b/fs/ocfs2/cluster/quorum.h
new file mode 100644
index 000000000000..6649cc6f67c9
--- /dev/null
+++ b/fs/ocfs2/cluster/quorum.h
@@ -0,0 +1,36 @@
1/* -*- mode: c; c-basic-offset: 8; -*-
2 * vim: noexpandtab sw=8 ts=8 sts=0:
3 *
4 * Copyright (C) 2005 Oracle. All rights reserved.
5 *
6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public
8 * License as published by the Free Software Foundation; either
9 * version 2 of the License, or (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 * General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public
17 * License along with this program; if not, write to the
18 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
19 * Boston, MA 02111-1307, USA.
20 *
21 */
22
23#ifndef O2CLUSTER_QUORUM_H
24#define O2CLUSTER_QUORUM_H
25
26void o2quo_init(void);
27void o2quo_exit(void);
28
29void o2quo_hb_up(u8 node);
30void o2quo_hb_down(u8 node);
31void o2quo_hb_still_up(u8 node);
32void o2quo_conn_up(u8 node);
33void o2quo_conn_err(u8 node);
34void o2quo_disk_timeout(void);
35
36#endif /* O2CLUSTER_QUORUM_H */
diff --git a/fs/ocfs2/cluster/sys.c b/fs/ocfs2/cluster/sys.c
new file mode 100644
index 000000000000..1d9f6acafa2e
--- /dev/null
+++ b/fs/ocfs2/cluster/sys.c
@@ -0,0 +1,124 @@
1/* -*- mode: c; c-basic-offset: 8; -*-
2 * vim: noexpandtab sw=8 ts=8 sts=0:
3 *
4 * sys.c
5 *
6 * OCFS2 cluster sysfs interface
7 *
8 * Copyright (C) 2005 Oracle. All rights reserved.
9 *
10 * This program is free software; you can redistribute it and/or
11 * modify it under the terms of the GNU General Public
12 * License as published by the Free Software Foundation,
13 * version 2 of the License.
14 *
15 * This program is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
18 * General Public License for more details.
19 *
20 * You should have received a copy of the GNU General Public
21 * License along with this program; if not, write to the
22 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
23 * Boston, MA 02111-1307, USA.
24 *
25 */
26
27#include <linux/kernel.h>
28#include <linux/module.h>
29#include <linux/kobject.h>
30#include <linux/sysfs.h>
31
32#include "ocfs2_nodemanager.h"
33#include "masklog.h"
34#include "sys.h"
35
36struct o2cb_attribute {
37 struct attribute attr;
38 ssize_t (*show)(char *buf);
39 ssize_t (*store)(const char *buf, size_t count);
40};
41
42#define O2CB_ATTR(_name, _mode, _show, _store) \
43struct o2cb_attribute o2cb_attr_##_name = __ATTR(_name, _mode, _show, _store)
44
45#define to_o2cb_subsys(k) container_of(to_kset(k), struct subsystem, kset)
46#define to_o2cb_attr(_attr) container_of(_attr, struct o2cb_attribute, attr)
47
48static ssize_t o2cb_interface_revision_show(char *buf)
49{
50 return snprintf(buf, PAGE_SIZE, "%u\n", O2NM_API_VERSION);
51}
52
53static O2CB_ATTR(interface_revision, S_IFREG | S_IRUGO, o2cb_interface_revision_show, NULL);
54
55static struct attribute *o2cb_attrs[] = {
56 &o2cb_attr_interface_revision.attr,
57 NULL,
58};
59
60static ssize_t
61o2cb_show(struct kobject * kobj, struct attribute * attr, char * buffer);
62static ssize_t
63o2cb_store(struct kobject * kobj, struct attribute * attr,
64 const char * buffer, size_t count);
65static struct sysfs_ops o2cb_sysfs_ops = {
66 .show = o2cb_show,
67 .store = o2cb_store,
68};
69
70static struct kobj_type o2cb_subsys_type = {
71 .default_attrs = o2cb_attrs,
72 .sysfs_ops = &o2cb_sysfs_ops,
73};
74
75/* gives us o2cb_subsys */
76static decl_subsys(o2cb, NULL, NULL);
77
78static ssize_t
79o2cb_show(struct kobject * kobj, struct attribute * attr, char * buffer)
80{
81 struct o2cb_attribute *o2cb_attr = to_o2cb_attr(attr);
82 struct subsystem *sbs = to_o2cb_subsys(kobj);
83
84 BUG_ON(sbs != &o2cb_subsys);
85
86 if (o2cb_attr->show)
87 return o2cb_attr->show(buffer);
88 return -EIO;
89}
90
91static ssize_t
92o2cb_store(struct kobject * kobj, struct attribute * attr,
93 const char * buffer, size_t count)
94{
95 struct o2cb_attribute *o2cb_attr = to_o2cb_attr(attr);
96 struct subsystem *sbs = to_o2cb_subsys(kobj);
97
98 BUG_ON(sbs != &o2cb_subsys);
99
100 if (o2cb_attr->store)
101 return o2cb_attr->store(buffer, count);
102 return -EIO;
103}
104
105void o2cb_sys_shutdown(void)
106{
107 mlog_sys_shutdown();
108 subsystem_unregister(&o2cb_subsys);
109}
110
111int o2cb_sys_init(void)
112{
113 int ret;
114
115 o2cb_subsys.kset.kobj.ktype = &o2cb_subsys_type;
116 ret = subsystem_register(&o2cb_subsys);
117 if (ret)
118 return ret;
119
120 ret = mlog_sys_init(&o2cb_subsys);
121 if (ret)
122 subsystem_unregister(&o2cb_subsys);
123 return ret;
124}
diff --git a/fs/ocfs2/cluster/sys.h b/fs/ocfs2/cluster/sys.h
new file mode 100644
index 000000000000..d66b8ab0045e
--- /dev/null
+++ b/fs/ocfs2/cluster/sys.h
@@ -0,0 +1,33 @@
1/* -*- mode: c; c-basic-offset: 8; -*-
2 * vim: noexpandtab sw=8 ts=8 sts=0:
3 *
4 * sys.h
5 *
6 * Function prototypes for o2cb sysfs interface
7 *
8 * Copyright (C) 2005 Oracle. All rights reserved.
9 *
10 * This program is free software; you can redistribute it and/or
11 * modify it under the terms of the GNU General Public
12 * License as published by the Free Software Foundation,
13 * version 2 of the License.
14 *
15 * This program is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
18 * General Public License for more details.
19 *
20 * You should have received a copy of the GNU General Public
21 * License along with this program; if not, write to the
22 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
23 * Boston, MA 02111-1307, USA.
24 *
25 */
26
27#ifndef O2CLUSTER_SYS_H
28#define O2CLUSTER_SYS_H
29
30void o2cb_sys_shutdown(void);
31int o2cb_sys_init(void);
32
33#endif /* O2CLUSTER_SYS_H */
diff --git a/fs/ocfs2/cluster/tcp.c b/fs/ocfs2/cluster/tcp.c
new file mode 100644
index 000000000000..35d92c01a972
--- /dev/null
+++ b/fs/ocfs2/cluster/tcp.c
@@ -0,0 +1,1829 @@
1/* -*- mode: c; c-basic-offset: 8; -*-
2 *
3 * vim: noexpandtab sw=8 ts=8 sts=0:
4 *
5 * Copyright (C) 2004 Oracle. All rights reserved.
6 *
7 * This program is free software; you can redistribute it and/or
8 * modify it under the terms of the GNU General Public
9 * License as published by the Free Software Foundation; either
10 * version 2 of the License, or (at your option) any later version.
11 *
12 * This program is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15 * General Public License for more details.
16 *
17 * You should have received a copy of the GNU General Public
18 * License along with this program; if not, write to the
19 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
20 * Boston, MA 02111-1307, USA.
21 *
22 * ----
23 *
24 * Callers for this were originally written against a very simple synchronous
25 * API. This implementation reflects those simple callers. Some day I'm sure
26 * we'll need to move to a more robust posting/callback mechanism.
27 *
28 * Transmit calls pass in kernel virtual addresses and block copying this into
29 * the socket's tx buffers via a usual blocking sendmsg. They'll block waiting
30 * for a failed socket to timeout. TX callers can also pass in a pointer to an
31 * 'int' which gets filled with an errno off the wire in response to the
32 * message they send.
33 *
34 * Handlers for unsolicited messages are registered. Each socket has a page
35 * that incoming data is copied into. First the header, then the data.
36 * Handlers are called from only one thread with a reference to this per-socket
37 * page. This page is destroyed after the handler call, so it can't be
38 * referenced beyond the call. Handlers may block but are discouraged from
39 * doing so.
40 *
41 * Any framing errors (bad magic, large payload lengths) close a connection.
42 *
43 * Our sock_container holds the state we associate with a socket. Its current
44 * framing state is held there as well as the refcounting we do around when it
45 * is safe to tear down the socket. The socket is only finally torn down from
46 * the container when the container loses all of its references -- so as long
47 * as you hold a ref on the container you can trust that the socket is valid
48 * for use with kernel socket APIs.
49 *
50 * Connections are initiated between a pair of nodes when the node with the
51 * higher node number gets a heartbeat callback which indicates that the lower
52 * numbered node has started heartbeating. The lower numbered node is passive
53 * and only accepts the connection if the higher numbered node is heartbeating.
54 */
55
56#include <linux/kernel.h>
57#include <linux/jiffies.h>
58#include <linux/slab.h>
59#include <linux/idr.h>
60#include <linux/kref.h>
61#include <net/tcp.h>
62
63#include <asm/uaccess.h>
64
65#include "heartbeat.h"
66#include "tcp.h"
67#include "nodemanager.h"
68#define MLOG_MASK_PREFIX ML_TCP
69#include "masklog.h"
70#include "quorum.h"
71
72#include "tcp_internal.h"
73
74/*
75 * The linux network stack isn't sparse endian clean.. It has macros like
76 * ntohs() which perform the endian checks and structs like sockaddr_in
77 * which aren't annotated. So __force is found here to get the build
78 * clean. When they emerge from the dark ages and annotate the code
79 * we can remove these.
80 */
81
82#define SC_NODEF_FMT "node %s (num %u) at %u.%u.%u.%u:%u"
83#define SC_NODEF_ARGS(sc) sc->sc_node->nd_name, sc->sc_node->nd_num, \
84 NIPQUAD(sc->sc_node->nd_ipv4_address), \
85 ntohs(sc->sc_node->nd_ipv4_port)
86
87/*
88 * In the following two log macros, the whitespace after the ',' just
89 * before ##args is intentional. Otherwise, gcc 2.95 will eat the
90 * previous token if args expands to nothing.
91 */
92#define msglog(hdr, fmt, args...) do { \
93 typeof(hdr) __hdr = (hdr); \
94 mlog(ML_MSG, "[mag %u len %u typ %u stat %d sys_stat %d " \
95 "key %08x num %u] " fmt, \
96 be16_to_cpu(__hdr->magic), be16_to_cpu(__hdr->data_len), \
97 be16_to_cpu(__hdr->msg_type), be32_to_cpu(__hdr->status), \
98 be32_to_cpu(__hdr->sys_status), be32_to_cpu(__hdr->key), \
99 be32_to_cpu(__hdr->msg_num) , ##args); \
100} while (0)
101
102#define sclog(sc, fmt, args...) do { \
103 typeof(sc) __sc = (sc); \
104 mlog(ML_SOCKET, "[sc %p refs %d sock %p node %u page %p " \
105 "pg_off %zu] " fmt, __sc, \
106 atomic_read(&__sc->sc_kref.refcount), __sc->sc_sock, \
107 __sc->sc_node->nd_num, __sc->sc_page, __sc->sc_page_off , \
108 ##args); \
109} while (0)
110
111static rwlock_t o2net_handler_lock = RW_LOCK_UNLOCKED;
112static struct rb_root o2net_handler_tree = RB_ROOT;
113
114static struct o2net_node o2net_nodes[O2NM_MAX_NODES];
115
116/* XXX someday we'll need better accounting */
117static struct socket *o2net_listen_sock = NULL;
118
119/*
120 * listen work is only queued by the listening socket callbacks on the
121 * o2net_wq. teardown detaches the callbacks before destroying the workqueue.
122 * quorum work is queued as sock containers are shutdown.. stop_listening
123 * tears down all the node's sock containers, preventing future shutdowns
124 * and queued quorum work, before canceling delayed quorum work and
125 * destroying the work queue.
126 */
127static struct workqueue_struct *o2net_wq;
128static struct work_struct o2net_listen_work;
129
130static struct o2hb_callback_func o2net_hb_up, o2net_hb_down;
131#define O2NET_HB_PRI 0x1
132
133static struct o2net_handshake *o2net_hand;
134static struct o2net_msg *o2net_keep_req, *o2net_keep_resp;
135
136static int o2net_sys_err_translations[O2NET_ERR_MAX] =
137 {[O2NET_ERR_NONE] = 0,
138 [O2NET_ERR_NO_HNDLR] = -ENOPROTOOPT,
139 [O2NET_ERR_OVERFLOW] = -EOVERFLOW,
140 [O2NET_ERR_DIED] = -EHOSTDOWN,};
141
142/* can't quite avoid *all* internal declarations :/ */
143static void o2net_sc_connect_completed(void *arg);
144static void o2net_rx_until_empty(void *arg);
145static void o2net_shutdown_sc(void *arg);
146static void o2net_listen_data_ready(struct sock *sk, int bytes);
147static void o2net_sc_send_keep_req(void *arg);
148static void o2net_idle_timer(unsigned long data);
149static void o2net_sc_postpone_idle(struct o2net_sock_container *sc);
150
151static inline int o2net_sys_err_to_errno(enum o2net_system_error err)
152{
153 int trans;
154 BUG_ON(err >= O2NET_ERR_MAX);
155 trans = o2net_sys_err_translations[err];
156
157 /* Just in case we mess up the translation table above */
158 BUG_ON(err != O2NET_ERR_NONE && trans == 0);
159 return trans;
160}
161
162static struct o2net_node * o2net_nn_from_num(u8 node_num)
163{
164 BUG_ON(node_num >= ARRAY_SIZE(o2net_nodes));
165 return &o2net_nodes[node_num];
166}
167
168static u8 o2net_num_from_nn(struct o2net_node *nn)
169{
170 BUG_ON(nn == NULL);
171 return nn - o2net_nodes;
172}
173
174/* ------------------------------------------------------------ */
175
176static int o2net_prep_nsw(struct o2net_node *nn, struct o2net_status_wait *nsw)
177{
178 int ret = 0;
179
180 do {
181 if (!idr_pre_get(&nn->nn_status_idr, GFP_ATOMIC)) {
182 ret = -EAGAIN;
183 break;
184 }
185 spin_lock(&nn->nn_lock);
186 ret = idr_get_new(&nn->nn_status_idr, nsw, &nsw->ns_id);
187 if (ret == 0)
188 list_add_tail(&nsw->ns_node_item,
189 &nn->nn_status_list);
190 spin_unlock(&nn->nn_lock);
191 } while (ret == -EAGAIN);
192
193 if (ret == 0) {
194 init_waitqueue_head(&nsw->ns_wq);
195 nsw->ns_sys_status = O2NET_ERR_NONE;
196 nsw->ns_status = 0;
197 }
198
199 return ret;
200}
201
202static void o2net_complete_nsw_locked(struct o2net_node *nn,
203 struct o2net_status_wait *nsw,
204 enum o2net_system_error sys_status,
205 s32 status)
206{
207 assert_spin_locked(&nn->nn_lock);
208
209 if (!list_empty(&nsw->ns_node_item)) {
210 list_del_init(&nsw->ns_node_item);
211 nsw->ns_sys_status = sys_status;
212 nsw->ns_status = status;
213 idr_remove(&nn->nn_status_idr, nsw->ns_id);
214 wake_up(&nsw->ns_wq);
215 }
216}
217
218static void o2net_complete_nsw(struct o2net_node *nn,
219 struct o2net_status_wait *nsw,
220 u64 id, enum o2net_system_error sys_status,
221 s32 status)
222{
223 spin_lock(&nn->nn_lock);
224 if (nsw == NULL) {
225 if (id > INT_MAX)
226 goto out;
227
228 nsw = idr_find(&nn->nn_status_idr, id);
229 if (nsw == NULL)
230 goto out;
231 }
232
233 o2net_complete_nsw_locked(nn, nsw, sys_status, status);
234
235out:
236 spin_unlock(&nn->nn_lock);
237 return;
238}
239
240static void o2net_complete_nodes_nsw(struct o2net_node *nn)
241{
242 struct list_head *iter, *tmp;
243 unsigned int num_kills = 0;
244 struct o2net_status_wait *nsw;
245
246 assert_spin_locked(&nn->nn_lock);
247
248 list_for_each_safe(iter, tmp, &nn->nn_status_list) {
249 nsw = list_entry(iter, struct o2net_status_wait, ns_node_item);
250 o2net_complete_nsw_locked(nn, nsw, O2NET_ERR_DIED, 0);
251 num_kills++;
252 }
253
254 mlog(0, "completed %d messages for node %u\n", num_kills,
255 o2net_num_from_nn(nn));
256}
257
258static int o2net_nsw_completed(struct o2net_node *nn,
259 struct o2net_status_wait *nsw)
260{
261 int completed;
262 spin_lock(&nn->nn_lock);
263 completed = list_empty(&nsw->ns_node_item);
264 spin_unlock(&nn->nn_lock);
265 return completed;
266}
267
268/* ------------------------------------------------------------ */
269
270static void sc_kref_release(struct kref *kref)
271{
272 struct o2net_sock_container *sc = container_of(kref,
273 struct o2net_sock_container, sc_kref);
274 sclog(sc, "releasing\n");
275
276 if (sc->sc_sock) {
277 sock_release(sc->sc_sock);
278 sc->sc_sock = NULL;
279 }
280
281 o2nm_node_put(sc->sc_node);
282 sc->sc_node = NULL;
283
284 kfree(sc);
285}
286
287static void sc_put(struct o2net_sock_container *sc)
288{
289 sclog(sc, "put\n");
290 kref_put(&sc->sc_kref, sc_kref_release);
291}
292static void sc_get(struct o2net_sock_container *sc)
293{
294 sclog(sc, "get\n");
295 kref_get(&sc->sc_kref);
296}
297static struct o2net_sock_container *sc_alloc(struct o2nm_node *node)
298{
299 struct o2net_sock_container *sc, *ret = NULL;
300 struct page *page = NULL;
301
302 page = alloc_page(GFP_NOFS);
303 sc = kcalloc(1, sizeof(*sc), GFP_NOFS);
304 if (sc == NULL || page == NULL)
305 goto out;
306
307 kref_init(&sc->sc_kref);
308 o2nm_node_get(node);
309 sc->sc_node = node;
310
311 INIT_WORK(&sc->sc_connect_work, o2net_sc_connect_completed, sc);
312 INIT_WORK(&sc->sc_rx_work, o2net_rx_until_empty, sc);
313 INIT_WORK(&sc->sc_shutdown_work, o2net_shutdown_sc, sc);
314 INIT_WORK(&sc->sc_keepalive_work, o2net_sc_send_keep_req, sc);
315
316 init_timer(&sc->sc_idle_timeout);
317 sc->sc_idle_timeout.function = o2net_idle_timer;
318 sc->sc_idle_timeout.data = (unsigned long)sc;
319
320 sclog(sc, "alloced\n");
321
322 ret = sc;
323 sc->sc_page = page;
324 sc = NULL;
325 page = NULL;
326
327out:
328 if (page)
329 __free_page(page);
330 kfree(sc);
331
332 return ret;
333}
334
335/* ------------------------------------------------------------ */
336
337static void o2net_sc_queue_work(struct o2net_sock_container *sc,
338 struct work_struct *work)
339{
340 sc_get(sc);
341 if (!queue_work(o2net_wq, work))
342 sc_put(sc);
343}
344static void o2net_sc_queue_delayed_work(struct o2net_sock_container *sc,
345 struct work_struct *work,
346 int delay)
347{
348 sc_get(sc);
349 if (!queue_delayed_work(o2net_wq, work, delay))
350 sc_put(sc);
351}

/*
 * Cancel delayed sc work.  If the cancel succeeded, the reference that
 * was taken when the work was queued is dropped here.
 */
static void o2net_sc_cancel_delayed_work(struct o2net_sock_container *sc,
					 struct work_struct *work)
{
	int cancelled = cancel_delayed_work(work);

	if (cancelled)
		sc_put(sc);
}
358
359static void o2net_set_nn_state(struct o2net_node *nn,
360 struct o2net_sock_container *sc,
361 unsigned valid, int err)
362{
363 int was_valid = nn->nn_sc_valid;
364 int was_err = nn->nn_persistent_error;
365 struct o2net_sock_container *old_sc = nn->nn_sc;
366
367 assert_spin_locked(&nn->nn_lock);
368
369 /* the node num comparison and single connect/accept path should stop
370 * an non-null sc from being overwritten with another */
371 BUG_ON(sc && nn->nn_sc && nn->nn_sc != sc);
372 mlog_bug_on_msg(err && valid, "err %d valid %u\n", err, valid);
373 mlog_bug_on_msg(valid && !sc, "valid %u sc %p\n", valid, sc);
374
375 /* we won't reconnect after our valid conn goes away for
376 * this hb iteration.. here so it shows up in the logs */
377 if (was_valid && !valid && err == 0)
378 err = -ENOTCONN;
379
380 mlog(ML_CONN, "node %u sc: %p -> %p, valid %u -> %u, err %d -> %d\n",
381 o2net_num_from_nn(nn), nn->nn_sc, sc, nn->nn_sc_valid, valid,
382 nn->nn_persistent_error, err);
383
384 nn->nn_sc = sc;
385 nn->nn_sc_valid = valid ? 1 : 0;
386 nn->nn_persistent_error = err;
387
388 /* mirrors o2net_tx_can_proceed() */
389 if (nn->nn_persistent_error || nn->nn_sc_valid)
390 wake_up(&nn->nn_sc_wq);
391
392 if (!was_err && nn->nn_persistent_error) {
393 o2quo_conn_err(o2net_num_from_nn(nn));
394 queue_delayed_work(o2net_wq, &nn->nn_still_up,
395 msecs_to_jiffies(O2NET_QUORUM_DELAY_MS));
396 }
397
398 if (was_valid && !valid) {
399 mlog(ML_NOTICE, "no longer connected to " SC_NODEF_FMT "\n",
400 SC_NODEF_ARGS(old_sc));
401 o2net_complete_nodes_nsw(nn);
402 }
403
404 if (!was_valid && valid) {
405 o2quo_conn_up(o2net_num_from_nn(nn));
406 /* this is a bit of a hack. we only try reconnecting
407 * when heartbeating starts until we get a connection.
408 * if that connection then dies we don't try reconnecting.
409 * the only way to start connecting again is to down
410 * heartbeat and bring it back up. */
411 cancel_delayed_work(&nn->nn_connect_expired);
412 mlog(ML_NOTICE, "%s " SC_NODEF_FMT "\n",
413 o2nm_this_node() > sc->sc_node->nd_num ?
414 "connected to" : "accepted connection from",
415 SC_NODEF_ARGS(sc));
416 }
417
418 /* trigger the connecting worker func as long as we're not valid,
419 * it will back off if it shouldn't connect. This can be called
420 * from node config teardown and so needs to be careful about
421 * the work queue actually being up. */
422 if (!valid && o2net_wq) {
423 unsigned long delay;
424 /* delay if we're withing a RECONNECT_DELAY of the
425 * last attempt */
426 delay = (nn->nn_last_connect_attempt +
427 msecs_to_jiffies(O2NET_RECONNECT_DELAY_MS))
428 - jiffies;
429 if (delay > msecs_to_jiffies(O2NET_RECONNECT_DELAY_MS))
430 delay = 0;
431 mlog(ML_CONN, "queueing conn attempt in %lu jiffies\n", delay);
432 queue_delayed_work(o2net_wq, &nn->nn_connect_work, delay);
433 }
434
435 /* keep track of the nn's sc ref for the caller */
436 if ((old_sc == NULL) && sc)
437 sc_get(sc);
438 if (old_sc && (old_sc != sc)) {
439 o2net_sc_queue_work(old_sc, &old_sc->sc_shutdown_work);
440 sc_put(old_sc);
441 }
442}
443
444/* see o2net_register_callbacks() */
445static void o2net_data_ready(struct sock *sk, int bytes)
446{
447 void (*ready)(struct sock *sk, int bytes);
448
449 read_lock(&sk->sk_callback_lock);
450 if (sk->sk_user_data) {
451 struct o2net_sock_container *sc = sk->sk_user_data;
452 sclog(sc, "data_ready hit\n");
453 do_gettimeofday(&sc->sc_tv_data_ready);
454 o2net_sc_queue_work(sc, &sc->sc_rx_work);
455 ready = sc->sc_data_ready;
456 } else {
457 ready = sk->sk_data_ready;
458 }
459 read_unlock(&sk->sk_callback_lock);
460
461 ready(sk, bytes);
462}
463
464/* see o2net_register_callbacks() */
465static void o2net_state_change(struct sock *sk)
466{
467 void (*state_change)(struct sock *sk);
468 struct o2net_sock_container *sc;
469
470 read_lock(&sk->sk_callback_lock);
471 sc = sk->sk_user_data;
472 if (sc == NULL) {
473 state_change = sk->sk_state_change;
474 goto out;
475 }
476
477 sclog(sc, "state_change to %d\n", sk->sk_state);
478
479 state_change = sc->sc_state_change;
480
481 switch(sk->sk_state) {
482 /* ignore connecting sockets as they make progress */
483 case TCP_SYN_SENT:
484 case TCP_SYN_RECV:
485 break;
486 case TCP_ESTABLISHED:
487 o2net_sc_queue_work(sc, &sc->sc_connect_work);
488 break;
489 default:
490 o2net_sc_queue_work(sc, &sc->sc_shutdown_work);
491 break;
492 }
493out:
494 read_unlock(&sk->sk_callback_lock);
495 state_change(sk);
496}
497
498/*
499 * we register callbacks so we can queue work on events before calling
500 * the original callbacks. our callbacks our careful to test user_data
501 * to discover when they've reaced with o2net_unregister_callbacks().
502 */
503static void o2net_register_callbacks(struct sock *sk,
504 struct o2net_sock_container *sc)
505{
506 write_lock_bh(&sk->sk_callback_lock);
507
508 /* accepted sockets inherit the old listen socket data ready */
509 if (sk->sk_data_ready == o2net_listen_data_ready) {
510 sk->sk_data_ready = sk->sk_user_data;
511 sk->sk_user_data = NULL;
512 }
513
514 BUG_ON(sk->sk_user_data != NULL);
515 sk->sk_user_data = sc;
516 sc_get(sc);
517
518 sc->sc_data_ready = sk->sk_data_ready;
519 sc->sc_state_change = sk->sk_state_change;
520 sk->sk_data_ready = o2net_data_ready;
521 sk->sk_state_change = o2net_state_change;
522
523 write_unlock_bh(&sk->sk_callback_lock);
524}
525
526static int o2net_unregister_callbacks(struct sock *sk,
527 struct o2net_sock_container *sc)
528{
529 int ret = 0;
530
531 write_lock_bh(&sk->sk_callback_lock);
532 if (sk->sk_user_data == sc) {
533 ret = 1;
534 sk->sk_user_data = NULL;
535 sk->sk_data_ready = sc->sc_data_ready;
536 sk->sk_state_change = sc->sc_state_change;
537 }
538 write_unlock_bh(&sk->sk_callback_lock);
539
540 return ret;
541}
542
543/*
544 * this is a little helper that is called by callers who have seen a problem
545 * with an sc and want to detach it from the nn if someone already hasn't beat
546 * them to it. if an error is given then the shutdown will be persistent
547 * and pending transmits will be canceled.
548 */
549static void o2net_ensure_shutdown(struct o2net_node *nn,
550 struct o2net_sock_container *sc,
551 int err)
552{
553 spin_lock(&nn->nn_lock);
554 if (nn->nn_sc == sc)
555 o2net_set_nn_state(nn, NULL, 0, err);
556 spin_unlock(&nn->nn_lock);
557}
558
559/*
560 * This work queue function performs the blocking parts of socket shutdown. A
561 * few paths lead here. set_nn_state will trigger this callback if it sees an
562 * sc detached from the nn. state_change will also trigger this callback
563 * directly when it sees errors. In that case we need to call set_nn_state
564 * ourselves as state_change couldn't get the nn_lock and call set_nn_state
565 * itself.
566 */
567static void o2net_shutdown_sc(void *arg)
568{
569 struct o2net_sock_container *sc = arg;
570 struct o2net_node *nn = o2net_nn_from_num(sc->sc_node->nd_num);
571
572 sclog(sc, "shutting down\n");
573
574 /* drop the callbacks ref and call shutdown only once */
575 if (o2net_unregister_callbacks(sc->sc_sock->sk, sc)) {
576 /* we shouldn't flush as we're in the thread, the
577 * races with pending sc work structs are harmless */
578 del_timer_sync(&sc->sc_idle_timeout);
579 o2net_sc_cancel_delayed_work(sc, &sc->sc_keepalive_work);
580 sc_put(sc);
581 sc->sc_sock->ops->shutdown(sc->sc_sock,
582 RCV_SHUTDOWN|SEND_SHUTDOWN);
583 }
584
585 /* not fatal so failed connects before the other guy has our
586 * heartbeat can be retried */
587 o2net_ensure_shutdown(nn, sc, 0);
588 sc_put(sc);
589}
590
591/* ------------------------------------------------------------ */
592
593static int o2net_handler_cmp(struct o2net_msg_handler *nmh, u32 msg_type,
594 u32 key)
595{
596 int ret = memcmp(&nmh->nh_key, &key, sizeof(key));
597
598 if (ret == 0)
599 ret = memcmp(&nmh->nh_msg_type, &msg_type, sizeof(msg_type));
600
601 return ret;
602}
603
604static struct o2net_msg_handler *
605o2net_handler_tree_lookup(u32 msg_type, u32 key, struct rb_node ***ret_p,
606 struct rb_node **ret_parent)
607{
608 struct rb_node **p = &o2net_handler_tree.rb_node;
609 struct rb_node *parent = NULL;
610 struct o2net_msg_handler *nmh, *ret = NULL;
611 int cmp;
612
613 while (*p) {
614 parent = *p;
615 nmh = rb_entry(parent, struct o2net_msg_handler, nh_node);
616 cmp = o2net_handler_cmp(nmh, msg_type, key);
617
618 if (cmp < 0)
619 p = &(*p)->rb_left;
620 else if (cmp > 0)
621 p = &(*p)->rb_right;
622 else {
623 ret = nmh;
624 break;
625 }
626 }
627
628 if (ret_p != NULL)
629 *ret_p = p;
630 if (ret_parent != NULL)
631 *ret_parent = parent;
632
633 return ret;
634}
635
636static void o2net_handler_kref_release(struct kref *kref)
637{
638 struct o2net_msg_handler *nmh;
639 nmh = container_of(kref, struct o2net_msg_handler, nh_kref);
640
641 kfree(nmh);
642}
643
644static void o2net_handler_put(struct o2net_msg_handler *nmh)
645{
646 kref_put(&nmh->nh_kref, o2net_handler_kref_release);
647}
648
649/* max_len is protection for the handler func. incoming messages won't
650 * be given to the handler if their payload is longer than the max. */
651int o2net_register_handler(u32 msg_type, u32 key, u32 max_len,
652 o2net_msg_handler_func *func, void *data,
653 struct list_head *unreg_list)
654{
655 struct o2net_msg_handler *nmh = NULL;
656 struct rb_node **p, *parent;
657 int ret = 0;
658
659 if (max_len > O2NET_MAX_PAYLOAD_BYTES) {
660 mlog(0, "max_len for message handler out of range: %u\n",
661 max_len);
662 ret = -EINVAL;
663 goto out;
664 }
665
666 if (!msg_type) {
667 mlog(0, "no message type provided: %u, %p\n", msg_type, func);
668 ret = -EINVAL;
669 goto out;
670
671 }
672 if (!func) {
673 mlog(0, "no message handler provided: %u, %p\n",
674 msg_type, func);
675 ret = -EINVAL;
676 goto out;
677 }
678
679 nmh = kcalloc(1, sizeof(struct o2net_msg_handler), GFP_NOFS);
680 if (nmh == NULL) {
681 ret = -ENOMEM;
682 goto out;
683 }
684
685 nmh->nh_func = func;
686 nmh->nh_func_data = data;
687 nmh->nh_msg_type = msg_type;
688 nmh->nh_max_len = max_len;
689 nmh->nh_key = key;
690 /* the tree and list get this ref.. they're both removed in
691 * unregister when this ref is dropped */
692 kref_init(&nmh->nh_kref);
693 INIT_LIST_HEAD(&nmh->nh_unregister_item);
694
695 write_lock(&o2net_handler_lock);
696 if (o2net_handler_tree_lookup(msg_type, key, &p, &parent))
697 ret = -EEXIST;
698 else {
699 rb_link_node(&nmh->nh_node, parent, p);
700 rb_insert_color(&nmh->nh_node, &o2net_handler_tree);
701 list_add_tail(&nmh->nh_unregister_item, unreg_list);
702
703 mlog(ML_TCP, "registered handler func %p type %u key %08x\n",
704 func, msg_type, key);
705 /* we've had some trouble with handlers seemingly vanishing. */
706 mlog_bug_on_msg(o2net_handler_tree_lookup(msg_type, key, &p,
707 &parent) == NULL,
708 "couldn't find handler we *just* registerd "
709 "for type %u key %08x\n", msg_type, key);
710 }
711 write_unlock(&o2net_handler_lock);
712 if (ret)
713 goto out;
714
715out:
716 if (ret)
717 kfree(nmh);
718
719 return ret;
720}
721EXPORT_SYMBOL_GPL(o2net_register_handler);
722
723void o2net_unregister_handler_list(struct list_head *list)
724{
725 struct list_head *pos, *n;
726 struct o2net_msg_handler *nmh;
727
728 write_lock(&o2net_handler_lock);
729 list_for_each_safe(pos, n, list) {
730 nmh = list_entry(pos, struct o2net_msg_handler,
731 nh_unregister_item);
732 mlog(ML_TCP, "unregistering handler func %p type %u key %08x\n",
733 nmh->nh_func, nmh->nh_msg_type, nmh->nh_key);
734 rb_erase(&nmh->nh_node, &o2net_handler_tree);
735 list_del_init(&nmh->nh_unregister_item);
736 kref_put(&nmh->nh_kref, o2net_handler_kref_release);
737 }
738 write_unlock(&o2net_handler_lock);
739}
740EXPORT_SYMBOL_GPL(o2net_unregister_handler_list);
741
742static struct o2net_msg_handler *o2net_handler_get(u32 msg_type, u32 key)
743{
744 struct o2net_msg_handler *nmh;
745
746 read_lock(&o2net_handler_lock);
747 nmh = o2net_handler_tree_lookup(msg_type, key, NULL, NULL);
748 if (nmh)
749 kref_get(&nmh->nh_kref);
750 read_unlock(&o2net_handler_lock);
751
752 return nmh;
753}
754
755/* ------------------------------------------------------------ */
756
757static int o2net_recv_tcp_msg(struct socket *sock, void *data, size_t len)
758{
759 int ret;
760 mm_segment_t oldfs;
761 struct kvec vec = {
762 .iov_len = len,
763 .iov_base = data,
764 };
765 struct msghdr msg = {
766 .msg_iovlen = 1,
767 .msg_iov = (struct iovec *)&vec,
768 .msg_flags = MSG_DONTWAIT,
769 };
770
771 oldfs = get_fs();
772 set_fs(get_ds());
773 ret = sock_recvmsg(sock, &msg, len, msg.msg_flags);
774 set_fs(oldfs);
775
776 return ret;
777}
778
779static int o2net_send_tcp_msg(struct socket *sock, struct kvec *vec,
780 size_t veclen, size_t total)
781{
782 int ret;
783 mm_segment_t oldfs;
784 struct msghdr msg = {
785 .msg_iov = (struct iovec *)vec,
786 .msg_iovlen = veclen,
787 };
788
789 if (sock == NULL) {
790 ret = -EINVAL;
791 goto out;
792 }
793
794 oldfs = get_fs();
795 set_fs(get_ds());
796 ret = sock_sendmsg(sock, &msg, total);
797 set_fs(oldfs);
798 if (ret != total) {
799 mlog(ML_ERROR, "sendmsg returned %d instead of %zu\n", ret,
800 total);
801 if (ret >= 0)
802 ret = -EPIPE; /* should be smarter, I bet */
803 goto out;
804 }
805
806 ret = 0;
807out:
808 if (ret < 0)
809 mlog(0, "returning error: %d\n", ret);
810 return ret;
811}
812
813static void o2net_sendpage(struct o2net_sock_container *sc,
814 void *kmalloced_virt,
815 size_t size)
816{
817 struct o2net_node *nn = o2net_nn_from_num(sc->sc_node->nd_num);
818 ssize_t ret;
819
820
821 ret = sc->sc_sock->ops->sendpage(sc->sc_sock,
822 virt_to_page(kmalloced_virt),
823 (long)kmalloced_virt & ~PAGE_MASK,
824 size, MSG_DONTWAIT);
825 if (ret != size) {
826 mlog(ML_ERROR, "sendpage of size %zu to " SC_NODEF_FMT
827 " failed with %zd\n", size, SC_NODEF_ARGS(sc), ret);
828 o2net_ensure_shutdown(nn, sc, 0);
829 }
830}
831
832static void o2net_init_msg(struct o2net_msg *msg, u16 data_len, u16 msg_type, u32 key)
833{
834 memset(msg, 0, sizeof(struct o2net_msg));
835 msg->magic = cpu_to_be16(O2NET_MSG_MAGIC);
836 msg->data_len = cpu_to_be16(data_len);
837 msg->msg_type = cpu_to_be16(msg_type);
838 msg->sys_status = cpu_to_be32(O2NET_ERR_NONE);
839 msg->status = 0;
840 msg->key = cpu_to_be32(key);
841}
842
843static int o2net_tx_can_proceed(struct o2net_node *nn,
844 struct o2net_sock_container **sc_ret,
845 int *error)
846{
847 int ret = 0;
848
849 spin_lock(&nn->nn_lock);
850 if (nn->nn_persistent_error) {
851 ret = 1;
852 *sc_ret = NULL;
853 *error = nn->nn_persistent_error;
854 } else if (nn->nn_sc_valid) {
855 kref_get(&nn->nn_sc->sc_kref);
856
857 ret = 1;
858 *sc_ret = nn->nn_sc;
859 *error = 0;
860 }
861 spin_unlock(&nn->nn_lock);
862
863 return ret;
864}
865
866int o2net_send_message_vec(u32 msg_type, u32 key, struct kvec *caller_vec,
867 size_t caller_veclen, u8 target_node, int *status)
868{
869 int ret, error = 0;
870 struct o2net_msg *msg = NULL;
871 size_t veclen, caller_bytes = 0;
872 struct kvec *vec = NULL;
873 struct o2net_sock_container *sc = NULL;
874 struct o2net_node *nn = o2net_nn_from_num(target_node);
875 struct o2net_status_wait nsw = {
876 .ns_node_item = LIST_HEAD_INIT(nsw.ns_node_item),
877 };
878
879 if (o2net_wq == NULL) {
880 mlog(0, "attempt to tx without o2netd running\n");
881 ret = -ESRCH;
882 goto out;
883 }
884
885 if (caller_veclen == 0) {
886 mlog(0, "bad kvec array length\n");
887 ret = -EINVAL;
888 goto out;
889 }
890
891 caller_bytes = iov_length((struct iovec *)caller_vec, caller_veclen);
892 if (caller_bytes > O2NET_MAX_PAYLOAD_BYTES) {
893 mlog(0, "total payload len %zu too large\n", caller_bytes);
894 ret = -EINVAL;
895 goto out;
896 }
897
898 if (target_node == o2nm_this_node()) {
899 ret = -ELOOP;
900 goto out;
901 }
902
903 ret = wait_event_interruptible(nn->nn_sc_wq,
904 o2net_tx_can_proceed(nn, &sc, &error));
905 if (!ret && error)
906 ret = error;
907 if (ret)
908 goto out;
909
910 veclen = caller_veclen + 1;
911 vec = kmalloc(sizeof(struct kvec) * veclen, GFP_ATOMIC);
912 if (vec == NULL) {
913 mlog(0, "failed to %zu element kvec!\n", veclen);
914 ret = -ENOMEM;
915 goto out;
916 }
917
918 msg = kmalloc(sizeof(struct o2net_msg), GFP_ATOMIC);
919 if (!msg) {
920 mlog(0, "failed to allocate a o2net_msg!\n");
921 ret = -ENOMEM;
922 goto out;
923 }
924
925 o2net_init_msg(msg, caller_bytes, msg_type, key);
926
927 vec[0].iov_len = sizeof(struct o2net_msg);
928 vec[0].iov_base = msg;
929 memcpy(&vec[1], caller_vec, caller_veclen * sizeof(struct kvec));
930
931 ret = o2net_prep_nsw(nn, &nsw);
932 if (ret)
933 goto out;
934
935 msg->msg_num = cpu_to_be32(nsw.ns_id);
936
937 /* finally, convert the message header to network byte-order
938 * and send */
939 ret = o2net_send_tcp_msg(sc->sc_sock, vec, veclen,
940 sizeof(struct o2net_msg) + caller_bytes);
941 msglog(msg, "sending returned %d\n", ret);
942 if (ret < 0) {
943 mlog(0, "error returned from o2net_send_tcp_msg=%d\n", ret);
944 goto out;
945 }
946
947 /* wait on other node's handler */
948 wait_event(nsw.ns_wq, o2net_nsw_completed(nn, &nsw));
949
950 /* Note that we avoid overwriting the callers status return
951 * variable if a system error was reported on the other
952 * side. Callers beware. */
953 ret = o2net_sys_err_to_errno(nsw.ns_sys_status);
954 if (status && !ret)
955 *status = nsw.ns_status;
956
957 mlog(0, "woken, returning system status %d, user status %d\n",
958 ret, nsw.ns_status);
959out:
960 if (sc)
961 sc_put(sc);
962 if (vec)
963 kfree(vec);
964 if (msg)
965 kfree(msg);
966 o2net_complete_nsw(nn, &nsw, 0, 0, 0);
967 return ret;
968}
969EXPORT_SYMBOL_GPL(o2net_send_message_vec);
970
971int o2net_send_message(u32 msg_type, u32 key, void *data, u32 len,
972 u8 target_node, int *status)
973{
974 struct kvec vec = {
975 .iov_base = data,
976 .iov_len = len,
977 };
978 return o2net_send_message_vec(msg_type, key, &vec, 1,
979 target_node, status);
980}
981EXPORT_SYMBOL_GPL(o2net_send_message);
982
983static int o2net_send_status_magic(struct socket *sock, struct o2net_msg *hdr,
984 enum o2net_system_error syserr, int err)
985{
986 struct kvec vec = {
987 .iov_base = hdr,
988 .iov_len = sizeof(struct o2net_msg),
989 };
990
991 BUG_ON(syserr >= O2NET_ERR_MAX);
992
993 /* leave other fields intact from the incoming message, msg_num
994 * in particular */
995 hdr->sys_status = cpu_to_be32(syserr);
996 hdr->status = cpu_to_be32(err);
997 hdr->magic = cpu_to_be16(O2NET_MSG_STATUS_MAGIC); // twiddle the magic
998 hdr->data_len = 0;
999
1000 msglog(hdr, "about to send status magic %d\n", err);
1001 /* hdr has been in host byteorder this whole time */
1002 return o2net_send_tcp_msg(sock, &vec, 1, sizeof(struct o2net_msg));
1003}
1004
1005/* this returns -errno if the header was unknown or too large, etc.
1006 * after this is called the buffer us reused for the next message */
1007static int o2net_process_message(struct o2net_sock_container *sc,
1008 struct o2net_msg *hdr)
1009{
1010 struct o2net_node *nn = o2net_nn_from_num(sc->sc_node->nd_num);
1011 int ret = 0, handler_status;
1012 enum o2net_system_error syserr;
1013 struct o2net_msg_handler *nmh = NULL;
1014
1015 msglog(hdr, "processing message\n");
1016
1017 o2net_sc_postpone_idle(sc);
1018
1019 switch(be16_to_cpu(hdr->magic)) {
1020 case O2NET_MSG_STATUS_MAGIC:
1021 /* special type for returning message status */
1022 o2net_complete_nsw(nn, NULL,
1023 be32_to_cpu(hdr->msg_num),
1024 be32_to_cpu(hdr->sys_status),
1025 be32_to_cpu(hdr->status));
1026 goto out;
1027 case O2NET_MSG_KEEP_REQ_MAGIC:
1028 o2net_sendpage(sc, o2net_keep_resp,
1029 sizeof(*o2net_keep_resp));
1030 goto out;
1031 case O2NET_MSG_KEEP_RESP_MAGIC:
1032 goto out;
1033 case O2NET_MSG_MAGIC:
1034 break;
1035 default:
1036 msglog(hdr, "bad magic\n");
1037 ret = -EINVAL;
1038 goto out;
1039 break;
1040 }
1041
1042 /* find a handler for it */
1043 handler_status = 0;
1044 nmh = o2net_handler_get(be16_to_cpu(hdr->msg_type),
1045 be32_to_cpu(hdr->key));
1046 if (!nmh) {
1047 mlog(ML_TCP, "couldn't find handler for type %u key %08x\n",
1048 be16_to_cpu(hdr->msg_type), be32_to_cpu(hdr->key));
1049 syserr = O2NET_ERR_NO_HNDLR;
1050 goto out_respond;
1051 }
1052
1053 syserr = O2NET_ERR_NONE;
1054
1055 if (be16_to_cpu(hdr->data_len) > nmh->nh_max_len)
1056 syserr = O2NET_ERR_OVERFLOW;
1057
1058 if (syserr != O2NET_ERR_NONE)
1059 goto out_respond;
1060
1061 do_gettimeofday(&sc->sc_tv_func_start);
1062 sc->sc_msg_key = be32_to_cpu(hdr->key);
1063 sc->sc_msg_type = be16_to_cpu(hdr->msg_type);
1064 handler_status = (nmh->nh_func)(hdr, sizeof(struct o2net_msg) +
1065 be16_to_cpu(hdr->data_len),
1066 nmh->nh_func_data);
1067 do_gettimeofday(&sc->sc_tv_func_stop);
1068
1069out_respond:
1070 /* this destroys the hdr, so don't use it after this */
1071 ret = o2net_send_status_magic(sc->sc_sock, hdr, syserr,
1072 handler_status);
1073 hdr = NULL;
1074 mlog(0, "sending handler status %d, syserr %d returned %d\n",
1075 handler_status, syserr, ret);
1076
1077out:
1078 if (nmh)
1079 o2net_handler_put(nmh);
1080 return ret;
1081}
1082
1083static int o2net_check_handshake(struct o2net_sock_container *sc)
1084{
1085 struct o2net_handshake *hand = page_address(sc->sc_page);
1086 struct o2net_node *nn = o2net_nn_from_num(sc->sc_node->nd_num);
1087
1088 if (hand->protocol_version != cpu_to_be64(O2NET_PROTOCOL_VERSION)) {
1089 mlog(ML_NOTICE, SC_NODEF_FMT " advertised net protocol "
1090 "version %llu but %llu is required, disconnecting\n",
1091 SC_NODEF_ARGS(sc),
1092 (unsigned long long)be64_to_cpu(hand->protocol_version),
1093 O2NET_PROTOCOL_VERSION);
1094
1095 /* don't bother reconnecting if its the wrong version. */
1096 o2net_ensure_shutdown(nn, sc, -ENOTCONN);
1097 return -1;
1098 }
1099
1100 sc->sc_handshake_ok = 1;
1101
1102 spin_lock(&nn->nn_lock);
1103 /* set valid and queue the idle timers only if it hasn't been
1104 * shut down already */
1105 if (nn->nn_sc == sc) {
1106 o2net_sc_postpone_idle(sc);
1107 o2net_set_nn_state(nn, sc, 1, 0);
1108 }
1109 spin_unlock(&nn->nn_lock);
1110
1111 /* shift everything up as though it wasn't there */
1112 sc->sc_page_off -= sizeof(struct o2net_handshake);
1113 if (sc->sc_page_off)
1114 memmove(hand, hand + 1, sc->sc_page_off);
1115
1116 return 0;
1117}
1118
1119/* this demuxes the queued rx bytes into header or payload bits and calls
1120 * handlers as each full message is read off the socket. it returns -error,
1121 * == 0 eof, or > 0 for progress made.*/
1122static int o2net_advance_rx(struct o2net_sock_container *sc)
1123{
1124 struct o2net_msg *hdr;
1125 int ret = 0;
1126 void *data;
1127 size_t datalen;
1128
1129 sclog(sc, "receiving\n");
1130 do_gettimeofday(&sc->sc_tv_advance_start);
1131
1132 /* do we need more header? */
1133 if (sc->sc_page_off < sizeof(struct o2net_msg)) {
1134 data = page_address(sc->sc_page) + sc->sc_page_off;
1135 datalen = sizeof(struct o2net_msg) - sc->sc_page_off;
1136 ret = o2net_recv_tcp_msg(sc->sc_sock, data, datalen);
1137 if (ret > 0) {
1138 sc->sc_page_off += ret;
1139
1140 /* this working relies on the handshake being
1141 * smaller than the normal message header */
1142 if (sc->sc_page_off >= sizeof(struct o2net_handshake)&&
1143 !sc->sc_handshake_ok && o2net_check_handshake(sc)) {
1144 ret = -EPROTO;
1145 goto out;
1146 }
1147
1148 /* only swab incoming here.. we can
1149 * only get here once as we cross from
1150 * being under to over */
1151 if (sc->sc_page_off == sizeof(struct o2net_msg)) {
1152 hdr = page_address(sc->sc_page);
1153 if (be16_to_cpu(hdr->data_len) >
1154 O2NET_MAX_PAYLOAD_BYTES)
1155 ret = -EOVERFLOW;
1156 }
1157 }
1158 if (ret <= 0)
1159 goto out;
1160 }
1161
1162 if (sc->sc_page_off < sizeof(struct o2net_msg)) {
1163 /* oof, still don't have a header */
1164 goto out;
1165 }
1166
1167 /* this was swabbed above when we first read it */
1168 hdr = page_address(sc->sc_page);
1169
1170 msglog(hdr, "at page_off %zu\n", sc->sc_page_off);
1171
1172 /* do we need more payload? */
1173 if (sc->sc_page_off - sizeof(struct o2net_msg) < be16_to_cpu(hdr->data_len)) {
1174 /* need more payload */
1175 data = page_address(sc->sc_page) + sc->sc_page_off;
1176 datalen = (sizeof(struct o2net_msg) + be16_to_cpu(hdr->data_len)) -
1177 sc->sc_page_off;
1178 ret = o2net_recv_tcp_msg(sc->sc_sock, data, datalen);
1179 if (ret > 0)
1180 sc->sc_page_off += ret;
1181 if (ret <= 0)
1182 goto out;
1183 }
1184
1185 if (sc->sc_page_off - sizeof(struct o2net_msg) == be16_to_cpu(hdr->data_len)) {
1186 /* we can only get here once, the first time we read
1187 * the payload.. so set ret to progress if the handler
1188 * works out. after calling this the message is toast */
1189 ret = o2net_process_message(sc, hdr);
1190 if (ret == 0)
1191 ret = 1;
1192 sc->sc_page_off = 0;
1193 }
1194
1195out:
1196 sclog(sc, "ret = %d\n", ret);
1197 do_gettimeofday(&sc->sc_tv_advance_stop);
1198 return ret;
1199}
1200
1201/* this work func is triggerd by data ready. it reads until it can read no
1202 * more. it interprets 0, eof, as fatal. if data_ready hits while we're doing
1203 * our work the work struct will be marked and we'll be called again. */
1204static void o2net_rx_until_empty(void *arg)
1205{
1206 struct o2net_sock_container *sc = arg;
1207 int ret;
1208
1209 do {
1210 ret = o2net_advance_rx(sc);
1211 } while (ret > 0);
1212
1213 if (ret <= 0 && ret != -EAGAIN) {
1214 struct o2net_node *nn = o2net_nn_from_num(sc->sc_node->nd_num);
1215 sclog(sc, "saw error %d, closing\n", ret);
1216 /* not permanent so read failed handshake can retry */
1217 o2net_ensure_shutdown(nn, sc, 0);
1218 }
1219
1220 sc_put(sc);
1221}
1222
1223static int o2net_set_nodelay(struct socket *sock)
1224{
1225 int ret, val = 1;
1226 mm_segment_t oldfs;
1227
1228 oldfs = get_fs();
1229 set_fs(KERNEL_DS);
1230
1231 /*
1232 * Dear unsuspecting programmer,
1233 *
1234 * Don't use sock_setsockopt() for SOL_TCP. It doesn't check its level
1235 * argument and assumes SOL_SOCKET so, say, your TCP_NODELAY will
1236 * silently turn into SO_DEBUG.
1237 *
1238 * Yours,
1239 * Keeper of hilariously fragile interfaces.
1240 */
1241 ret = sock->ops->setsockopt(sock, SOL_TCP, TCP_NODELAY,
1242 (char __user *)&val, sizeof(val));
1243
1244 set_fs(oldfs);
1245 return ret;
1246}
1247
1248/* ------------------------------------------------------------ */
1249
1250/* called when a connect completes and after a sock is accepted. the
1251 * rx path will see the response and mark the sc valid */
1252static void o2net_sc_connect_completed(void *arg)
1253{
1254 struct o2net_sock_container *sc = arg;
1255
1256 mlog(ML_MSG, "sc sending handshake with ver %llu id %llx\n",
1257 (unsigned long long)O2NET_PROTOCOL_VERSION,
1258 (unsigned long long)be64_to_cpu(o2net_hand->connector_id));
1259
1260 o2net_sendpage(sc, o2net_hand, sizeof(*o2net_hand));
1261 sc_put(sc);
1262}
1263
1264/* this is called as a work_struct func. */
1265static void o2net_sc_send_keep_req(void *arg)
1266{
1267 struct o2net_sock_container *sc = arg;
1268
1269 o2net_sendpage(sc, o2net_keep_req, sizeof(*o2net_keep_req));
1270 sc_put(sc);
1271}
1272
1273/* socket shutdown does a del_timer_sync against this as it tears down.
1274 * we can't start this timer until we've got to the point in sc buildup
1275 * where shutdown is going to be involved */
1276static void o2net_idle_timer(unsigned long data)
1277{
1278 struct o2net_sock_container *sc = (struct o2net_sock_container *)data;
1279 struct timeval now;
1280
1281 do_gettimeofday(&now);
1282
1283 mlog(ML_NOTICE, "connection to " SC_NODEF_FMT " has been idle for 10 "
1284 "seconds, shutting it down.\n", SC_NODEF_ARGS(sc));
1285 mlog(ML_NOTICE, "here are some times that might help debug the "
1286 "situation: (tmr %ld.%ld now %ld.%ld dr %ld.%ld adv "
1287 "%ld.%ld:%ld.%ld func (%08x:%u) %ld.%ld:%ld.%ld)\n",
1288 sc->sc_tv_timer.tv_sec, sc->sc_tv_timer.tv_usec,
1289 now.tv_sec, now.tv_usec,
1290 sc->sc_tv_data_ready.tv_sec, sc->sc_tv_data_ready.tv_usec,
1291 sc->sc_tv_advance_start.tv_sec, sc->sc_tv_advance_start.tv_usec,
1292 sc->sc_tv_advance_stop.tv_sec, sc->sc_tv_advance_stop.tv_usec,
1293 sc->sc_msg_key, sc->sc_msg_type,
1294 sc->sc_tv_func_start.tv_sec, sc->sc_tv_func_start.tv_usec,
1295 sc->sc_tv_func_stop.tv_sec, sc->sc_tv_func_stop.tv_usec);
1296
1297 o2net_sc_queue_work(sc, &sc->sc_shutdown_work);
1298}
1299
1300static void o2net_sc_postpone_idle(struct o2net_sock_container *sc)
1301{
1302 o2net_sc_cancel_delayed_work(sc, &sc->sc_keepalive_work);
1303 o2net_sc_queue_delayed_work(sc, &sc->sc_keepalive_work,
1304 O2NET_KEEPALIVE_DELAY_SECS * HZ);
1305 do_gettimeofday(&sc->sc_tv_timer);
1306 mod_timer(&sc->sc_idle_timeout,
1307 jiffies + (O2NET_IDLE_TIMEOUT_SECS * HZ));
1308}
1309
1310/* this work func is kicked whenever a path sets the nn state which doesn't
1311 * have valid set. This includes seeing hb come up, losing a connection,
1312 * having a connect attempt fail, etc. This centralizes the logic which decides
1313 * if a connect attempt should be made or if we should give up and all future
1314 * transmit attempts should fail */
1315static void o2net_start_connect(void *arg)
1316{
1317 struct o2net_node *nn = arg;
1318 struct o2net_sock_container *sc = NULL;
1319 struct o2nm_node *node = NULL;
1320 struct socket *sock = NULL;
1321 struct sockaddr_in myaddr = {0, }, remoteaddr = {0, };
1322 int ret = 0;
1323
1324 /* if we're greater we initiate tx, otherwise we accept */
1325 if (o2nm_this_node() <= o2net_num_from_nn(nn))
1326 goto out;
1327
1328 /* watch for racing with tearing a node down */
1329 node = o2nm_get_node_by_num(o2net_num_from_nn(nn));
1330 if (node == NULL) {
1331 ret = 0;
1332 goto out;
1333 }
1334
1335 spin_lock(&nn->nn_lock);
1336 /* see if we already have one pending or have given up */
1337 if (nn->nn_sc || nn->nn_persistent_error)
1338 arg = NULL;
1339 spin_unlock(&nn->nn_lock);
1340 if (arg == NULL) /* *shrug*, needed some indicator */
1341 goto out;
1342
1343 nn->nn_last_connect_attempt = jiffies;
1344
1345 sc = sc_alloc(node);
1346 if (sc == NULL) {
1347 mlog(0, "couldn't allocate sc\n");
1348 ret = -ENOMEM;
1349 goto out;
1350 }
1351
1352 ret = sock_create(PF_INET, SOCK_STREAM, IPPROTO_TCP, &sock);
1353 if (ret < 0) {
1354 mlog(0, "can't create socket: %d\n", ret);
1355 goto out;
1356 }
1357 sc->sc_sock = sock; /* freed by sc_kref_release */
1358
1359 sock->sk->sk_allocation = GFP_ATOMIC;
1360
1361 myaddr.sin_family = AF_INET;
1362 myaddr.sin_port = (__force u16)htons(0); /* any port */
1363
1364 ret = sock->ops->bind(sock, (struct sockaddr *)&myaddr,
1365 sizeof(myaddr));
1366 if (ret) {
1367 mlog(0, "bind failed: %d\n", ret);
1368 goto out;
1369 }
1370
1371 ret = o2net_set_nodelay(sc->sc_sock);
1372 if (ret) {
1373 mlog(ML_ERROR, "setting TCP_NODELAY failed with %d\n", ret);
1374 goto out;
1375 }
1376
1377 o2net_register_callbacks(sc->sc_sock->sk, sc);
1378
1379 spin_lock(&nn->nn_lock);
1380 /* handshake completion will set nn->nn_sc_valid */
1381 o2net_set_nn_state(nn, sc, 0, 0);
1382 spin_unlock(&nn->nn_lock);
1383
1384 remoteaddr.sin_family = AF_INET;
1385 remoteaddr.sin_addr.s_addr = (__force u32)node->nd_ipv4_address;
1386 remoteaddr.sin_port = (__force u16)node->nd_ipv4_port;
1387
1388 ret = sc->sc_sock->ops->connect(sc->sc_sock,
1389 (struct sockaddr *)&remoteaddr,
1390 sizeof(remoteaddr),
1391 O_NONBLOCK);
1392 if (ret == -EINPROGRESS)
1393 ret = 0;
1394
1395out:
1396 if (ret) {
1397 mlog(ML_NOTICE, "connect attempt to " SC_NODEF_FMT " failed "
1398 "with errno %d\n", SC_NODEF_ARGS(sc), ret);
1399 /* 0 err so that another will be queued and attempted
1400 * from set_nn_state */
1401 if (sc)
1402 o2net_ensure_shutdown(nn, sc, 0);
1403 }
1404 if (sc)
1405 sc_put(sc);
1406 if (node)
1407 o2nm_node_put(node);
1408
1409 return;
1410}
1411
1412static void o2net_connect_expired(void *arg)
1413{
1414 struct o2net_node *nn = arg;
1415
1416 spin_lock(&nn->nn_lock);
1417 if (!nn->nn_sc_valid) {
1418 mlog(ML_ERROR, "no connection established with node %u after "
1419 "%u seconds, giving up and returning errors.\n",
1420 o2net_num_from_nn(nn), O2NET_IDLE_TIMEOUT_SECS);
1421
1422 o2net_set_nn_state(nn, NULL, 0, -ENOTCONN);
1423 }
1424 spin_unlock(&nn->nn_lock);
1425}
1426
/*
 * Delayed work that reports to the quorum code that the node behind @arg
 * (a struct o2net_node) is still heartbeating.
 */
static void o2net_still_up(void *arg)
{
	o2quo_hb_still_up(o2net_num_from_nn((struct o2net_node *)arg));
}
1433
1434/* ------------------------------------------------------------ */
1435
1436void o2net_disconnect_node(struct o2nm_node *node)
1437{
1438 struct o2net_node *nn = o2net_nn_from_num(node->nd_num);
1439
1440 /* don't reconnect until it's heartbeating again */
1441 spin_lock(&nn->nn_lock);
1442 o2net_set_nn_state(nn, NULL, 0, -ENOTCONN);
1443 spin_unlock(&nn->nn_lock);
1444
1445 if (o2net_wq) {
1446 cancel_delayed_work(&nn->nn_connect_expired);
1447 cancel_delayed_work(&nn->nn_connect_work);
1448 cancel_delayed_work(&nn->nn_still_up);
1449 flush_workqueue(o2net_wq);
1450 }
1451}
1452
/*
 * Heartbeat "node down" callback: tell the quorum code about it and, unless
 * the node is ourselves, disconnect from it.  @data is unused.
 */
static void o2net_hb_node_down_cb(struct o2nm_node *node, int node_num,
				  void *data)
{
	o2quo_hb_down(node_num);

	/* there is no connection to ourselves to tear down */
	if (node_num == o2nm_this_node())
		return;

	o2net_disconnect_node(node);
}
1461
1462static void o2net_hb_node_up_cb(struct o2nm_node *node, int node_num,
1463 void *data)
1464{
1465 struct o2net_node *nn = o2net_nn_from_num(node_num);
1466
1467 o2quo_hb_up(node_num);
1468
1469 /* ensure an immediate connect attempt */
1470 nn->nn_last_connect_attempt = jiffies -
1471 (msecs_to_jiffies(O2NET_RECONNECT_DELAY_MS) + 1);
1472
1473 if (node_num != o2nm_this_node()) {
1474 /* heartbeat doesn't work unless a local node number is
1475 * configured and doing so brings up the o2net_wq, so we can
1476 * use it.. */
1477 queue_delayed_work(o2net_wq, &nn->nn_connect_expired,
1478 O2NET_IDLE_TIMEOUT_SECS * HZ);
1479
1480 /* believe it or not, accept and node hearbeating testing
1481 * can succeed for this node before we got here.. so
1482 * only use set_nn_state to clear the persistent error
1483 * if that hasn't already happened */
1484 spin_lock(&nn->nn_lock);
1485 if (nn->nn_persistent_error)
1486 o2net_set_nn_state(nn, NULL, 0, 0);
1487 spin_unlock(&nn->nn_lock);
1488 }
1489}
1490
1491void o2net_unregister_hb_callbacks(void)
1492{
1493 int ret;
1494
1495 ret = o2hb_unregister_callback(&o2net_hb_up);
1496 if (ret < 0)
1497 mlog(ML_ERROR, "Status return %d unregistering heartbeat up "
1498 "callback!\n", ret);
1499
1500 ret = o2hb_unregister_callback(&o2net_hb_down);
1501 if (ret < 0)
1502 mlog(ML_ERROR, "Status return %d unregistering heartbeat down "
1503 "callback!\n", ret);
1504}
1505
1506int o2net_register_hb_callbacks(void)
1507{
1508 int ret;
1509
1510 o2hb_setup_callback(&o2net_hb_down, O2HB_NODE_DOWN_CB,
1511 o2net_hb_node_down_cb, NULL, O2NET_HB_PRI);
1512 o2hb_setup_callback(&o2net_hb_up, O2HB_NODE_UP_CB,
1513 o2net_hb_node_up_cb, NULL, O2NET_HB_PRI);
1514
1515 ret = o2hb_register_callback(&o2net_hb_up);
1516 if (ret == 0)
1517 ret = o2hb_register_callback(&o2net_hb_down);
1518
1519 if (ret)
1520 o2net_unregister_hb_callbacks();
1521
1522 return ret;
1523}
1524
1525/* ------------------------------------------------------------ */
1526
1527static int o2net_accept_one(struct socket *sock)
1528{
1529 int ret, slen;
1530 struct sockaddr_in sin;
1531 struct socket *new_sock = NULL;
1532 struct o2nm_node *node = NULL;
1533 struct o2net_sock_container *sc = NULL;
1534 struct o2net_node *nn;
1535
1536 BUG_ON(sock == NULL);
1537 ret = sock_create_lite(sock->sk->sk_family, sock->sk->sk_type,
1538 sock->sk->sk_protocol, &new_sock);
1539 if (ret)
1540 goto out;
1541
1542 new_sock->type = sock->type;
1543 new_sock->ops = sock->ops;
1544 ret = sock->ops->accept(sock, new_sock, O_NONBLOCK);
1545 if (ret < 0)
1546 goto out;
1547
1548 new_sock->sk->sk_allocation = GFP_ATOMIC;
1549
1550 ret = o2net_set_nodelay(new_sock);
1551 if (ret) {
1552 mlog(ML_ERROR, "setting TCP_NODELAY failed with %d\n", ret);
1553 goto out;
1554 }
1555
1556 slen = sizeof(sin);
1557 ret = new_sock->ops->getname(new_sock, (struct sockaddr *) &sin,
1558 &slen, 1);
1559 if (ret < 0)
1560 goto out;
1561
1562 node = o2nm_get_node_by_ip((__force __be32)sin.sin_addr.s_addr);
1563 if (node == NULL) {
1564 mlog(ML_NOTICE, "attempt to connect from unknown node at "
1565 "%u.%u.%u.%u:%d\n", NIPQUAD(sin.sin_addr.s_addr),
1566 ntohs((__force __be16)sin.sin_port));
1567 ret = -EINVAL;
1568 goto out;
1569 }
1570
1571 if (o2nm_this_node() > node->nd_num) {
1572 mlog(ML_NOTICE, "unexpected connect attempted from a lower "
1573 "numbered node '%s' at " "%u.%u.%u.%u:%d with num %u\n",
1574 node->nd_name, NIPQUAD(sin.sin_addr.s_addr),
1575 ntohs((__force __be16)sin.sin_port), node->nd_num);
1576 ret = -EINVAL;
1577 goto out;
1578 }
1579
1580 /* this happens all the time when the other node sees our heartbeat
1581 * and tries to connect before we see their heartbeat */
1582 if (!o2hb_check_node_heartbeating_from_callback(node->nd_num)) {
1583 mlog(ML_CONN, "attempt to connect from node '%s' at "
1584 "%u.%u.%u.%u:%d but it isn't heartbeating\n",
1585 node->nd_name, NIPQUAD(sin.sin_addr.s_addr),
1586 ntohs((__force __be16)sin.sin_port));
1587 ret = -EINVAL;
1588 goto out;
1589 }
1590
1591 nn = o2net_nn_from_num(node->nd_num);
1592
1593 spin_lock(&nn->nn_lock);
1594 if (nn->nn_sc)
1595 ret = -EBUSY;
1596 else
1597 ret = 0;
1598 spin_unlock(&nn->nn_lock);
1599 if (ret) {
1600 mlog(ML_NOTICE, "attempt to connect from node '%s' at "
1601 "%u.%u.%u.%u:%d but it already has an open connection\n",
1602 node->nd_name, NIPQUAD(sin.sin_addr.s_addr),
1603 ntohs((__force __be16)sin.sin_port));
1604 goto out;
1605 }
1606
1607 sc = sc_alloc(node);
1608 if (sc == NULL) {
1609 ret = -ENOMEM;
1610 goto out;
1611 }
1612
1613 sc->sc_sock = new_sock;
1614 new_sock = NULL;
1615
1616 spin_lock(&nn->nn_lock);
1617 o2net_set_nn_state(nn, sc, 0, 0);
1618 spin_unlock(&nn->nn_lock);
1619
1620 o2net_register_callbacks(sc->sc_sock->sk, sc);
1621 o2net_sc_queue_work(sc, &sc->sc_rx_work);
1622
1623 o2net_sendpage(sc, o2net_hand, sizeof(*o2net_hand));
1624
1625out:
1626 if (new_sock)
1627 sock_release(new_sock);
1628 if (node)
1629 o2nm_node_put(node);
1630 if (sc)
1631 sc_put(sc);
1632 return ret;
1633}
1634
/*
 * Listen work function: keep accepting connections from the listening
 * socket passed as @arg until o2net_accept_one() returns non-zero, giving
 * up the cpu between connections if needed.
 */
static void o2net_accept_many(void *arg)
{
	struct socket *listener = arg;

	for (;;) {
		if (o2net_accept_one(listener) != 0)
			break;
		cond_resched();
	}
}
1641
1642static void o2net_listen_data_ready(struct sock *sk, int bytes)
1643{
1644 void (*ready)(struct sock *sk, int bytes);
1645
1646 read_lock(&sk->sk_callback_lock);
1647 ready = sk->sk_user_data;
1648 if (ready == NULL) { /* check for teardown race */
1649 ready = sk->sk_data_ready;
1650 goto out;
1651 }
1652
1653 /* ->sk_data_ready is also called for a newly established child socket
1654 * before it has been accepted and the acceptor has set up their
1655 * data_ready.. we only want to queue listen work for our listening
1656 * socket */
1657 if (sk->sk_state == TCP_LISTEN) {
1658 mlog(ML_TCP, "bytes: %d\n", bytes);
1659 queue_work(o2net_wq, &o2net_listen_work);
1660 }
1661
1662out:
1663 read_unlock(&sk->sk_callback_lock);
1664 ready(sk, bytes);
1665}
1666
1667static int o2net_open_listening_sock(__be16 port)
1668{
1669 struct socket *sock = NULL;
1670 int ret;
1671 struct sockaddr_in sin = {
1672 .sin_family = PF_INET,
1673 .sin_addr = { .s_addr = (__force u32)htonl(INADDR_ANY) },
1674 .sin_port = (__force u16)port,
1675 };
1676
1677 ret = sock_create(PF_INET, SOCK_STREAM, IPPROTO_TCP, &sock);
1678 if (ret < 0) {
1679 mlog(ML_ERROR, "unable to create socket, ret=%d\n", ret);
1680 goto out;
1681 }
1682
1683 sock->sk->sk_allocation = GFP_ATOMIC;
1684
1685 write_lock_bh(&sock->sk->sk_callback_lock);
1686 sock->sk->sk_user_data = sock->sk->sk_data_ready;
1687 sock->sk->sk_data_ready = o2net_listen_data_ready;
1688 write_unlock_bh(&sock->sk->sk_callback_lock);
1689
1690 o2net_listen_sock = sock;
1691 INIT_WORK(&o2net_listen_work, o2net_accept_many, sock);
1692
1693 sock->sk->sk_reuse = 1;
1694 ret = sock->ops->bind(sock, (struct sockaddr *)&sin, sizeof(sin));
1695 if (ret < 0) {
1696 mlog(ML_ERROR, "unable to bind socket to port %d, ret=%d\n",
1697 ntohs(port), ret);
1698 goto out;
1699 }
1700
1701 ret = sock->ops->listen(sock, 64);
1702 if (ret < 0) {
1703 mlog(ML_ERROR, "unable to listen on port %d, ret=%d\n",
1704 ntohs(port), ret);
1705 }
1706
1707out:
1708 if (ret) {
1709 o2net_listen_sock = NULL;
1710 if (sock)
1711 sock_release(sock);
1712 }
1713 return ret;
1714}
1715
1716/*
1717 * called from node manager when we should bring up our network listening
1718 * socket. node manager handles all the serialization to only call this
1719 * once and to match it with o2net_stop_listening(). note,
1720 * o2nm_this_node() doesn't work yet as we're being called while it
1721 * is being set up.
1722 */
1723int o2net_start_listening(struct o2nm_node *node)
1724{
1725 int ret = 0;
1726
1727 BUG_ON(o2net_wq != NULL);
1728 BUG_ON(o2net_listen_sock != NULL);
1729
1730 mlog(ML_KTHREAD, "starting o2net thread...\n");
1731 o2net_wq = create_singlethread_workqueue("o2net");
1732 if (o2net_wq == NULL) {
1733 mlog(ML_ERROR, "unable to launch o2net thread\n");
1734 return -ENOMEM; /* ? */
1735 }
1736
1737 ret = o2net_open_listening_sock(node->nd_ipv4_port);
1738 if (ret) {
1739 destroy_workqueue(o2net_wq);
1740 o2net_wq = NULL;
1741 } else
1742 o2quo_conn_up(node->nd_num);
1743
1744 return ret;
1745}
1746
1747/* again, o2nm_this_node() doesn't work here as we're involved in
1748 * tearing it down */
1749void o2net_stop_listening(struct o2nm_node *node)
1750{
1751 struct socket *sock = o2net_listen_sock;
1752 size_t i;
1753
1754 BUG_ON(o2net_wq == NULL);
1755 BUG_ON(o2net_listen_sock == NULL);
1756
1757 /* stop the listening socket from generating work */
1758 write_lock_bh(&sock->sk->sk_callback_lock);
1759 sock->sk->sk_data_ready = sock->sk->sk_user_data;
1760 sock->sk->sk_user_data = NULL;
1761 write_unlock_bh(&sock->sk->sk_callback_lock);
1762
1763 for (i = 0; i < ARRAY_SIZE(o2net_nodes); i++) {
1764 struct o2nm_node *node = o2nm_get_node_by_num(i);
1765 if (node) {
1766 o2net_disconnect_node(node);
1767 o2nm_node_put(node);
1768 }
1769 }
1770
1771 /* finish all work and tear down the work queue */
1772 mlog(ML_KTHREAD, "waiting for o2net thread to exit....\n");
1773 destroy_workqueue(o2net_wq);
1774 o2net_wq = NULL;
1775
1776 sock_release(o2net_listen_sock);
1777 o2net_listen_sock = NULL;
1778
1779 o2quo_conn_err(node->nd_num);
1780}
1781
1782/* ------------------------------------------------------------ */
1783
1784int o2net_init(void)
1785{
1786 unsigned long i;
1787
1788 o2quo_init();
1789
1790 o2net_hand = kcalloc(1, sizeof(struct o2net_handshake), GFP_KERNEL);
1791 o2net_keep_req = kcalloc(1, sizeof(struct o2net_msg), GFP_KERNEL);
1792 o2net_keep_resp = kcalloc(1, sizeof(struct o2net_msg), GFP_KERNEL);
1793 if (!o2net_hand || !o2net_keep_req || !o2net_keep_resp) {
1794 kfree(o2net_hand);
1795 kfree(o2net_keep_req);
1796 kfree(o2net_keep_resp);
1797 return -ENOMEM;
1798 }
1799
1800 o2net_hand->protocol_version = cpu_to_be64(O2NET_PROTOCOL_VERSION);
1801 o2net_hand->connector_id = cpu_to_be64(1);
1802
1803 o2net_keep_req->magic = cpu_to_be16(O2NET_MSG_KEEP_REQ_MAGIC);
1804 o2net_keep_resp->magic = cpu_to_be16(O2NET_MSG_KEEP_RESP_MAGIC);
1805
1806 for (i = 0; i < ARRAY_SIZE(o2net_nodes); i++) {
1807 struct o2net_node *nn = o2net_nn_from_num(i);
1808
1809 spin_lock_init(&nn->nn_lock);
1810 INIT_WORK(&nn->nn_connect_work, o2net_start_connect, nn);
1811 INIT_WORK(&nn->nn_connect_expired, o2net_connect_expired, nn);
1812 INIT_WORK(&nn->nn_still_up, o2net_still_up, nn);
1813 /* until we see hb from a node we'll return einval */
1814 nn->nn_persistent_error = -ENOTCONN;
1815 init_waitqueue_head(&nn->nn_sc_wq);
1816 idr_init(&nn->nn_status_idr);
1817 INIT_LIST_HEAD(&nn->nn_status_list);
1818 }
1819
1820 return 0;
1821}
1822
1823void o2net_exit(void)
1824{
1825 o2quo_exit();
1826 kfree(o2net_hand);
1827 kfree(o2net_keep_req);
1828 kfree(o2net_keep_resp);
1829}
diff --git a/fs/ocfs2/cluster/tcp.h b/fs/ocfs2/cluster/tcp.h
new file mode 100644
index 000000000000..a6f4585501c8
--- /dev/null
+++ b/fs/ocfs2/cluster/tcp.h
@@ -0,0 +1,113 @@
1/* -*- mode: c; c-basic-offset: 8; -*-
2 * vim: noexpandtab sw=8 ts=8 sts=0:
3 *
4 * tcp.h
5 *
6 * Function prototypes
7 *
8 * Copyright (C) 2004 Oracle. All rights reserved.
9 *
10 * This program is free software; you can redistribute it and/or
11 * modify it under the terms of the GNU General Public
12 * License as published by the Free Software Foundation; either
13 * version 2 of the License, or (at your option) any later version.
14 *
15 * This program is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
18 * General Public License for more details.
19 *
20 * You should have received a copy of the GNU General Public
21 * License along with this program; if not, write to the
22 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
23 * Boston, MA 02111-1307, USA.
24 *
25 */
26
27#ifndef O2CLUSTER_TCP_H
28#define O2CLUSTER_TCP_H
29
30#include <linux/socket.h>
31#ifdef __KERNEL__
32#include <net/sock.h>
33#include <linux/tcp.h>
34#else
35#include <sys/socket.h>
36#endif
37#include <linux/inet.h>
38#include <linux/in.h>
39
40struct o2net_msg
41{
42 __be16 magic;
43 __be16 data_len;
44 __be16 msg_type;
45 __be16 pad1;
46 __be32 sys_status;
47 __be32 status;
48 __be32 key;
49 __be32 msg_num;
50 __u8 buf[0];
51};
52
53typedef int (o2net_msg_handler_func)(struct o2net_msg *msg, u32 len, void *data);
54
55#define O2NET_MAX_PAYLOAD_BYTES (4096 - sizeof(struct o2net_msg))
56
/*
 * Decide whether @err and/or the state of @sock mean that the link is down.
 *
 * Returns 1 if @sock is given and is neither in TCP_ESTABLISHED nor in
 * TCP_CLOSE_WAIT, or if @err is one of the errors listed below; returns 0
 * otherwise, in particular for any non-negative @err on a healthy or
 * absent socket.
 *
 * TODO: figure this out....  (the list of errors is a best guess)
 */
static inline int o2net_link_down(int err, struct socket *sock)
{
	if (sock != NULL &&
	    sock->sk->sk_state != TCP_ESTABLISHED &&
	    sock->sk->sk_state != TCP_CLOSE_WAIT)
		return 1;

	if (err >= 0)
		return 0;

	/* When the server has died, an ICMP port unreachable message
	 * prompts ECONNREFUSED. */
	return err == -ERESTARTSYS ||
	       err == -EBADF ||
	       err == -ECONNREFUSED ||
	       err == -ENOTCONN ||
	       err == -ECONNRESET ||
	       err == -EPIPE;
}
82
/* driver states; NOTE(review): no user is visible in this header */
enum {
	O2NET_DRIVER_UNINITED = 0,
	O2NET_DRIVER_READY = 1,
};
87
88int o2net_init_tcp_sock(struct inode *inode);
89int o2net_send_message(u32 msg_type, u32 key, void *data, u32 len,
90 u8 target_node, int *status);
91int o2net_send_message_vec(u32 msg_type, u32 key, struct kvec *vec,
92 size_t veclen, u8 target_node, int *status);
93int o2net_broadcast_message(u32 msg_type, u32 key, void *data, u32 len,
94 struct inode *group);
95
96int o2net_register_handler(u32 msg_type, u32 key, u32 max_len,
97 o2net_msg_handler_func *func, void *data,
98 struct list_head *unreg_list);
99void o2net_unregister_handler_list(struct list_head *list);
100
101struct o2nm_node;
102int o2net_register_hb_callbacks(void);
103void o2net_unregister_hb_callbacks(void);
104int o2net_start_listening(struct o2nm_node *node);
105void o2net_stop_listening(struct o2nm_node *node);
106void o2net_disconnect_node(struct o2nm_node *node);
107
108int o2net_init(void);
109void o2net_exit(void);
110int o2net_proc_init(struct proc_dir_entry *parent);
111void o2net_proc_exit(struct proc_dir_entry *parent);
112
113#endif /* O2CLUSTER_TCP_H */
diff --git a/fs/ocfs2/cluster/tcp_internal.h b/fs/ocfs2/cluster/tcp_internal.h
new file mode 100644
index 000000000000..ff9e2e2104c2
--- /dev/null
+++ b/fs/ocfs2/cluster/tcp_internal.h
@@ -0,0 +1,174 @@
1/* -*- mode: c; c-basic-offset: 8; -*-
2 * vim: noexpandtab sw=8 ts=8 sts=0:
3 *
4 * Copyright (C) 2005 Oracle. All rights reserved.
5 *
6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public
8 * License as published by the Free Software Foundation; either
9 * version 2 of the License, or (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 * General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public
17 * License along with this program; if not, write to the
18 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
19 * Boston, MA 02111-1307, USA.
20 */
21
22#ifndef O2CLUSTER_TCP_INTERNAL_H
23#define O2CLUSTER_TCP_INTERNAL_H
24
/*
 * 16 bit magic values.  NOTE(review): presumably all of these go into the
 * 'magic' field of struct o2net_msg; only the two keepalive ones can be
 * seen being stored there (by o2net_init()).
 */
#define O2NET_MSG_MAGIC			((u16)0xfa55)
#define O2NET_MSG_STATUS_MAGIC		((u16)0xfa56)
#define O2NET_MSG_KEEP_REQ_MAGIC	((u16)0xfa57)
#define O2NET_MSG_KEEP_RESP_MAGIC	((u16)0xfa58)

/* same as hb delay, we're waiting for another node to recognize our hb */
#define O2NET_RECONNECT_DELAY_MS	O2HB_REGION_TIMEOUT_MS

/* we're delaying our quorum decision so that heartbeat will have timed
 * out truly dead nodes by the time we come around to making decisions
 * on their number */
#define O2NET_QUORUM_DELAY_MS \
	((o2hb_dead_threshold + 2) * O2HB_REGION_TIMEOUT_MS)

/* seconds until the keepalive work runs again after being postponed */
#define O2NET_KEEPALIVE_DELAY_SECS	5
/* seconds until the idle timer fires after being postponed; also how long
 * a node gets to establish a connection after its heartbeat comes up */
#define O2NET_IDLE_TIMEOUT_SECS		10
40
41/*
42 * This version number represents quite a lot, unfortunately. It not
43 * only represents the raw network message protocol on the wire but also
44 * locking semantics of the file system using the protocol. It should
45 * be somewhere else, I'm sure, but right now it isn't.
46 *
47 * New in version 2:
48 * - full 64 bit i_size in the metadata lock lvbs
49 * - introduction of "rw" lock and pushing meta/data locking down
50 */
51#define O2NET_PROTOCOL_VERSION 2ULL
52struct o2net_handshake {
53 __be64 protocol_version;
54 __be64 connector_id;
55};
56
57struct o2net_node {
58 /* this is never called from int/bh */
59 spinlock_t nn_lock;
60
61 /* set the moment an sc is allocated and a connect is started */
62 struct o2net_sock_container *nn_sc;
63 /* _valid is only set after the handshake passes and tx can happen */
64 unsigned nn_sc_valid:1;
65 /* if this is set tx just returns it */
66 int nn_persistent_error;
67
68 /* threads waiting for an sc to arrive wait on the wq for generation
69 * to increase. it is increased when a connecting socket succeeds
70 * or fails or when an accepted socket is attached. */
71 wait_queue_head_t nn_sc_wq;
72
73 struct idr nn_status_idr;
74 struct list_head nn_status_list;
75
76 /* connects are attempted from when heartbeat comes up until either hb
77 * goes down, the node is unconfigured, no connect attempts succeed
78 * before O2NET_CONN_IDLE_DELAY, or a connect succeeds. connect_work
79 * is queued from set_nn_state both from hb up and from itself if a
80 * connect attempt fails and so can be self-arming. shutdown is
81 * careful to first mark the nn such that no connects will be attempted
82 * before canceling delayed connect work and flushing the queue. */
83 struct work_struct nn_connect_work;
84 unsigned long nn_last_connect_attempt;
85
86 /* this is queued as nodes come up and is canceled when a connection is
87 * established. this expiring gives up on the node and errors out
88 * transmits */
89 struct work_struct nn_connect_expired;
90
91 /* after we give up on a socket we wait a while before deciding
92 * that it is still heartbeating and that we should do some
93 * quorum work */
94 struct work_struct nn_still_up;
95};
96
97struct o2net_sock_container {
98 struct kref sc_kref;
99 /* the next two are vaild for the life time of the sc */
100 struct socket *sc_sock;
101 struct o2nm_node *sc_node;
102
103 /* all of these sc work structs hold refs on the sc while they are
104 * queued. they should not be able to ref a freed sc. the teardown
105 * race is with o2net_wq destruction in o2net_stop_listening() */
106
107 /* rx and connect work are generated from socket callbacks. sc
108 * shutdown removes the callbacks and then flushes the work queue */
109 struct work_struct sc_rx_work;
110 struct work_struct sc_connect_work;
111 /* shutdown work is triggered in two ways. the simple way is
112 * for a code path calls ensure_shutdown which gets a lock, removes
113 * the sc from the nn, and queues the work. in this case the
114 * work is single-shot. the work is also queued from a sock
115 * callback, though, and in this case the work will find the sc
116 * still on the nn and will call ensure_shutdown itself.. this
117 * ends up triggering the shutdown work again, though nothing
118 * will be done in that second iteration. so work queue teardown
119 * has to be careful to remove the sc from the nn before waiting
120 * on the work queue so that the shutdown work doesn't remove the
121 * sc and rearm itself.
122 */
123 struct work_struct sc_shutdown_work;
124
125 struct timer_list sc_idle_timeout;
126 struct work_struct sc_keepalive_work;
127
128 unsigned sc_handshake_ok:1;
129
130 struct page *sc_page;
131 size_t sc_page_off;
132
133 /* original handlers for the sockets */
134 void (*sc_state_change)(struct sock *sk);
135 void (*sc_data_ready)(struct sock *sk, int bytes);
136
137 struct timeval sc_tv_timer;
138 struct timeval sc_tv_data_ready;
139 struct timeval sc_tv_advance_start;
140 struct timeval sc_tv_advance_stop;
141 struct timeval sc_tv_func_start;
142 struct timeval sc_tv_func_stop;
143 u32 sc_msg_key;
144 u16 sc_msg_type;
145};
146
147struct o2net_msg_handler {
148 struct rb_node nh_node;
149 u32 nh_max_len;
150 u32 nh_msg_type;
151 u32 nh_key;
152 o2net_msg_handler_func *nh_func;
153 o2net_msg_handler_func *nh_func_data;
154 struct kref nh_kref;
155 struct list_head nh_unregister_item;
156};
157
/* o2net-level status codes, kept in o2net_status_wait.ns_sys_status */
enum o2net_system_error {
	O2NET_ERR_NONE = 0,
	O2NET_ERR_NO_HNDLR,
	O2NET_ERR_OVERFLOW,
	O2NET_ERR_DIED,
	O2NET_ERR_MAX		/* one past the last code above */
};
165
166struct o2net_status_wait {
167 enum o2net_system_error ns_sys_status;
168 s32 ns_status;
169 int ns_id;
170 wait_queue_head_t ns_wq;
171 struct list_head ns_node_item;
172};
173
174#endif /* O2CLUSTER_TCP_INTERNAL_H */
diff --git a/fs/ocfs2/cluster/ver.c b/fs/ocfs2/cluster/ver.c
new file mode 100644
index 000000000000..7286c48bb30d
--- /dev/null
+++ b/fs/ocfs2/cluster/ver.c
@@ -0,0 +1,42 @@
1/* -*- mode: c; c-basic-offset: 8; -*-
2 * vim: noexpandtab sw=8 ts=8 sts=0:
3 *
4 * ver.c
5 *
6 * version string
7 *
8 * Copyright (C) 2002, 2005 Oracle. All rights reserved.
9 *
10 * This program is free software; you can redistribute it and/or
11 * modify it under the terms of the GNU General Public
12 * License as published by the Free Software Foundation; either
13 * version 2 of the License, or (at your option) any later version.
14 *
15 * This program is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
18 * General Public License for more details.
19 *
20 * You should have received a copy of the GNU General Public
21 * License along with this program; if not, write to the
22 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
22 * Boston, MA 02111-1307, USA.
24 */
25
26#include <linux/module.h>
27#include <linux/kernel.h>
28
29#include "ver.h"
30
31#define CLUSTER_BUILD_VERSION "1.3.3"
32
33#define VERSION_STR "OCFS2 Node Manager " CLUSTER_BUILD_VERSION
34
35void cluster_print_version(void)
36{
37 printk(KERN_INFO "%s\n", VERSION_STR);
38}
39
40MODULE_DESCRIPTION(VERSION_STR);
41
42MODULE_VERSION(CLUSTER_BUILD_VERSION);
diff --git a/fs/ocfs2/cluster/ver.h b/fs/ocfs2/cluster/ver.h
new file mode 100644
index 000000000000..32554c3382c2
--- /dev/null
+++ b/fs/ocfs2/cluster/ver.h
@@ -0,0 +1,31 @@
1/* -*- mode: c; c-basic-offset: 8; -*-
2 * vim: noexpandtab sw=8 ts=8 sts=0:
3 *
4 * ver.h
5 *
6 * Function prototypes
7 *
8 * Copyright (C) 2005 Oracle. All rights reserved.
9 *
10 * This program is free software; you can redistribute it and/or
11 * modify it under the terms of the GNU General Public
12 * License as published by the Free Software Foundation; either
13 * version 2 of the License, or (at your option) any later version.
14 *
15 * This program is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
18 * General Public License for more details.
19 *
20 * You should have received a copy of the GNU General Public
21 * License along with this program; if not, write to the
22 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
22 * Boston, MA 02111-1307, USA.
24 */
25
26#ifndef O2CLUSTER_VER_H
27#define O2CLUSTER_VER_H
28
29void cluster_print_version(void);
30
31#endif /* O2CLUSTER_VER_H */
diff --git a/fs/ocfs2/dcache.c b/fs/ocfs2/dcache.c
new file mode 100644
index 000000000000..bd85182e97bc
--- /dev/null
+++ b/fs/ocfs2/dcache.c
@@ -0,0 +1,91 @@
1/* -*- mode: c; c-basic-offset: 8; -*-
2 * vim: noexpandtab sw=8 ts=8 sts=0:
3 *
4 * dcache.c
5 *
6 * dentry cache handling code
7 *
8 * Copyright (C) 2002, 2004 Oracle. All rights reserved.
9 *
10 * This program is free software; you can redistribute it and/or
11 * modify it under the terms of the GNU General Public
12 * License as published by the Free Software Foundation; either
13 * version 2 of the License, or (at your option) any later version.
14 *
15 * This program is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
18 * General Public License for more details.
19 *
20 * You should have received a copy of the GNU General Public
21 * License along with this program; if not, write to the
22 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
22 * Boston, MA 02111-1307, USA.
24 */
25
26#include <linux/fs.h>
27#include <linux/types.h>
28#include <linux/slab.h>
29#include <linux/namei.h>
30
31#define MLOG_MASK_PREFIX ML_DCACHE
32#include <cluster/masklog.h>
33
34#include "ocfs2.h"
35
36#include "alloc.h"
37#include "dcache.h"
38#include "file.h"
39#include "inode.h"
40
41static int ocfs2_dentry_revalidate(struct dentry *dentry,
42 struct nameidata *nd)
43{
44 struct inode *inode = dentry->d_inode;
45 int ret = 0; /* if all else fails, just return false */
46 struct ocfs2_super *osb;
47
48 mlog_entry("(0x%p, '%.*s')\n", dentry,
49 dentry->d_name.len, dentry->d_name.name);
50
51 /* Never trust a negative dentry - force a new lookup. */
52 if (inode == NULL) {
53 mlog(0, "negative dentry: %.*s\n", dentry->d_name.len,
54 dentry->d_name.name);
55 goto bail;
56 }
57
58 osb = OCFS2_SB(inode->i_sb);
59
60 BUG_ON(!osb);
61
62 if (inode != osb->root_inode) {
63 spin_lock(&OCFS2_I(inode)->ip_lock);
64 /* did we or someone else delete this inode? */
65 if (OCFS2_I(inode)->ip_flags & OCFS2_INODE_DELETED) {
66 spin_unlock(&OCFS2_I(inode)->ip_lock);
67 mlog(0, "inode (%"MLFu64") deleted, returning false\n",
68 OCFS2_I(inode)->ip_blkno);
69 goto bail;
70 }
71 spin_unlock(&OCFS2_I(inode)->ip_lock);
72
73 if (!inode->i_nlink) {
74 mlog(0, "Inode %"MLFu64" orphaned, returning false "
75 "dir = %d\n", OCFS2_I(inode)->ip_blkno,
76 S_ISDIR(inode->i_mode));
77 goto bail;
78 }
79 }
80
81 ret = 1;
82
83bail:
84 mlog_exit(ret);
85
86 return ret;
87}
88
89struct dentry_operations ocfs2_dentry_ops = {
90 .d_revalidate = ocfs2_dentry_revalidate,
91};
diff --git a/fs/ocfs2/dcache.h b/fs/ocfs2/dcache.h
new file mode 100644
index 000000000000..90072771114b
--- /dev/null
+++ b/fs/ocfs2/dcache.h
@@ -0,0 +1,31 @@
1/* -*- mode: c; c-basic-offset: 8; -*-
2 * vim: noexpandtab sw=8 ts=8 sts=0:
3 *
4 * dcache.h
5 *
6 * Function prototypes
7 *
8 * Copyright (C) 2002, 2004 Oracle. All rights reserved.
9 *
10 * This program is free software; you can redistribute it and/or
11 * modify it under the terms of the GNU General Public
12 * License as published by the Free Software Foundation; either
13 * version 2 of the License, or (at your option) any later version.
14 *
15 * This program is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
18 * General Public License for more details.
19 *
20 * You should have received a copy of the GNU General Public
21 * License along with this program; if not, write to the
22 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
22 * Boston, MA 02111-1307, USA.
24 */
25
26#ifndef OCFS2_DCACHE_H
27#define OCFS2_DCACHE_H
28
29extern struct dentry_operations ocfs2_dentry_ops;
30
31#endif /* OCFS2_DCACHE_H */
diff --git a/fs/ocfs2/dir.c b/fs/ocfs2/dir.c
new file mode 100644
index 000000000000..856e20ae8263
--- /dev/null
+++ b/fs/ocfs2/dir.c
@@ -0,0 +1,618 @@
1/* -*- mode: c; c-basic-offset: 8; -*-
2 * vim: noexpandtab sw=8 ts=8 sts=0:
3 *
4 * dir.c
5 *
6 * Creates, reads, walks and deletes directory-nodes
7 *
8 * Copyright (C) 2002, 2004 Oracle. All rights reserved.
9 *
10 * Portions of this code from linux/fs/ext3/dir.c
11 *
12 * Copyright (C) 1992, 1993, 1994, 1995
13 * Remy Card (card@masi.ibp.fr)
14 * Laboratoire MASI - Institut Blaise Pascal
15 * Universite Pierre et Marie Curie (Paris VI)
16 *
17 * from
18 *
19 * linux/fs/minix/dir.c
20 *
21 * Copyright (C) 1991, 1992 Linus Torvalds
22 *
23 * This program is free software; you can redistribute it and/or
24 * modify it under the terms of the GNU General Public
25 * License as published by the Free Software Foundation; either
26 * version 2 of the License, or (at your option) any later version.
27 *
28 * This program is distributed in the hope that it will be useful,
29 * but WITHOUT ANY WARRANTY; without even the implied warranty of
30 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
31 * General Public License for more details.
32 *
33 * You should have received a copy of the GNU General Public
34 * License along with this program; if not, write to the
35 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
36 * Boston, MA 02111-1307, USA.
37 */
38
39#include <linux/fs.h>
40#include <linux/types.h>
41#include <linux/slab.h>
42#include <linux/highmem.h>
43
44#define MLOG_MASK_PREFIX ML_NAMEI
45#include <cluster/masklog.h>
46
47#include "ocfs2.h"
48
49#include "alloc.h"
50#include "dir.h"
51#include "dlmglue.h"
52#include "extent_map.h"
53#include "file.h"
54#include "inode.h"
55#include "journal.h"
56#include "namei.h"
57#include "suballoc.h"
58#include "uptodate.h"
59
60#include "buffer_head_io.h"
61
/* Translation table from an on-disk dirent file_type to the DT_* value
 * handed to filldir.  ocfs2_readdir() only indexes it after checking
 * that file_type is below OCFS2_FT_MAX. */
62static unsigned char ocfs2_filetype_table[] = {
63 DT_UNKNOWN, DT_REG, DT_DIR, DT_CHR, DT_BLK, DT_FIFO, DT_SOCK, DT_LNK
64};
65
/* Forward declaration; ocfs2_prepare_dir_for_insert() calls this to add
 * a block when no existing block has room for the new entry. */
66static int ocfs2_extend_dir(struct ocfs2_super *osb,
67 struct inode *dir,
68 struct buffer_head *parent_fe_bh,
69 struct buffer_head **new_de_bh);
70/*
71 * ocfs2_readdir()
72 *
73 */
74int ocfs2_readdir(struct file * filp, void * dirent, filldir_t filldir)
75{
76 int error = 0;
77 unsigned long offset, blk;
78 int i, num, stored;
79 struct buffer_head * bh, * tmp;
80 struct ocfs2_dir_entry * de;
81 int err;
82 struct inode *inode = filp->f_dentry->d_inode;
83 struct super_block * sb = inode->i_sb;
84 int have_disk_lock = 0;
85
86 mlog_entry("dirino=%"MLFu64"\n", OCFS2_I(inode)->ip_blkno);
87
88 stored = 0;
89 bh = NULL;
90
91 error = ocfs2_meta_lock(inode, NULL, NULL, 0);
92 if (error < 0) {
93 if (error != -ENOENT)
94 mlog_errno(error);
95 /* we haven't got any yet, so propagate the error. */
96 stored = error;
97 goto bail;
98 }
99 have_disk_lock = 1;
100
101 offset = filp->f_pos & (sb->s_blocksize - 1);
102
103 while (!error && !stored && filp->f_pos < i_size_read(inode)) {
104 blk = (filp->f_pos) >> sb->s_blocksize_bits;
105 bh = ocfs2_bread(inode, blk, &err, 0);
106 if (!bh) {
107 mlog(ML_ERROR, "directory #%"MLFu64" contains a hole "
108 "at offset %lld\n",
109 OCFS2_I(inode)->ip_blkno,
110 filp->f_pos);
111 filp->f_pos += sb->s_blocksize - offset;
112 continue;
113 }
114
115 /*
116 * Do the readahead (8k)
117 */
118 if (!offset) {
119 for (i = 16 >> (sb->s_blocksize_bits - 9), num = 0;
120 i > 0; i--) {
121 tmp = ocfs2_bread(inode, ++blk, &err, 1);
122 if (tmp)
123 brelse(tmp);
124 }
125 }
126
127revalidate:
128 /* If the dir block has changed since the last call to
129 * readdir(2), then we might be pointing to an invalid
130 * dirent right now. Scan from the start of the block
131 * to make sure. */
132 if (filp->f_version != inode->i_version) {
133 for (i = 0; i < sb->s_blocksize && i < offset; ) {
134 de = (struct ocfs2_dir_entry *) (bh->b_data + i);
135 /* It's too expensive to do a full
136 * dirent test each time round this
137 * loop, but we do have to test at
138 * least that it is non-zero. A
139 * failure will be detected in the
140 * dirent test below. */
141 if (le16_to_cpu(de->rec_len) <
142 OCFS2_DIR_REC_LEN(1))
143 break;
144 i += le16_to_cpu(de->rec_len);
145 }
146 offset = i;
147 filp->f_pos = (filp->f_pos & ~(sb->s_blocksize - 1))
148 | offset;
149 filp->f_version = inode->i_version;
150 }
151
152 while (!error && filp->f_pos < i_size_read(inode)
153 && offset < sb->s_blocksize) {
154 de = (struct ocfs2_dir_entry *) (bh->b_data + offset);
155 if (!ocfs2_check_dir_entry(inode, de, bh, offset)) {
156 /* On error, skip the f_pos to the
157 next block. */
158 filp->f_pos = (filp->f_pos |
159 (sb->s_blocksize - 1)) + 1;
160 brelse(bh);
161 goto bail;
162 }
163 offset += le16_to_cpu(de->rec_len);
164 if (le64_to_cpu(de->inode)) {
165 /* We might block in the next section
166 * if the data destination is
167 * currently swapped out. So, use a
168 * version stamp to detect whether or
169 * not the directory has been modified
170 * during the copy operation.
171 */
172 unsigned long version = filp->f_version;
173 unsigned char d_type = DT_UNKNOWN;
174
175 if (de->file_type < OCFS2_FT_MAX)
176 d_type = ocfs2_filetype_table[de->file_type];
177 error = filldir(dirent, de->name,
178 de->name_len,
179 filp->f_pos,
180 ino_from_blkno(sb, le64_to_cpu(de->inode)),
181 d_type);
182 if (error)
183 break;
184 if (version != filp->f_version)
185 goto revalidate;
186 stored ++;
187 }
188 filp->f_pos += le16_to_cpu(de->rec_len);
189 }
190 offset = 0;
191 brelse(bh);
192 }
193
194 stored = 0;
195bail:
196 if (have_disk_lock)
197 ocfs2_meta_unlock(inode, 0);
198
199 mlog_exit(stored);
200
201 return stored;
202}
203
204/*
205 * NOTE: this should always be called with parent dir i_sem taken.
206 */
207int ocfs2_find_files_on_disk(const char *name,
208 int namelen,
209 u64 *blkno,
210 struct inode *inode,
211 struct buffer_head **dirent_bh,
212 struct ocfs2_dir_entry **dirent)
213{
214 int status = -ENOENT;
215 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
216
217 mlog_entry("(osb=%p, parent=%"MLFu64", name='%.*s', blkno=%p, "
218 "inode=%p)\n",
219 osb, OCFS2_I(inode)->ip_blkno, namelen, name, blkno, inode);
220
221 *dirent_bh = ocfs2_find_entry(name, namelen, inode, dirent);
222 if (!*dirent_bh || !*dirent) {
223 status = -ENOENT;
224 goto leave;
225 }
226
227 *blkno = le64_to_cpu((*dirent)->inode);
228
229 status = 0;
230leave:
231 if (status < 0) {
232 *dirent = NULL;
233 if (*dirent_bh) {
234 brelse(*dirent_bh);
235 *dirent_bh = NULL;
236 }
237 }
238
239 mlog_exit(status);
240 return status;
241}
242
243/* Check for a name within a directory.
244 *
245 * Return 0 if the name does not exist
246 * Return -EEXIST if the directory contains the name
247 *
248 * Callers should have i_sem + a cluster lock on dir
249 */
250int ocfs2_check_dir_for_entry(struct inode *dir,
251 const char *name,
252 int namelen)
253{
254 int ret;
255 struct buffer_head *dirent_bh = NULL;
256 struct ocfs2_dir_entry *dirent = NULL;
257
258 mlog_entry("dir %"MLFu64", name '%.*s'\n", OCFS2_I(dir)->ip_blkno,
259 namelen, name);
260
261 ret = -EEXIST;
262 dirent_bh = ocfs2_find_entry(name, namelen, dir, &dirent);
263 if (dirent_bh)
264 goto bail;
265
266 ret = 0;
267bail:
268 if (dirent_bh)
269 brelse(dirent_bh);
270
271 mlog_exit(ret);
272 return ret;
273}
274
275/*
276 * routine to check that the specified directory is empty (for rmdir)
277 */
278int ocfs2_empty_dir(struct inode *inode)
279{
280 unsigned long offset;
281 struct buffer_head * bh;
282 struct ocfs2_dir_entry * de, * de1;
283 struct super_block * sb;
284 int err;
285
286 sb = inode->i_sb;
287 if ((i_size_read(inode) <
288 (OCFS2_DIR_REC_LEN(1) + OCFS2_DIR_REC_LEN(2))) ||
289 !(bh = ocfs2_bread(inode, 0, &err, 0))) {
290 mlog(ML_ERROR, "bad directory (dir #%"MLFu64") - "
291 "no data block\n",
292 OCFS2_I(inode)->ip_blkno);
293 return 1;
294 }
295
296 de = (struct ocfs2_dir_entry *) bh->b_data;
297 de1 = (struct ocfs2_dir_entry *)
298 ((char *)de + le16_to_cpu(de->rec_len));
299 if ((le64_to_cpu(de->inode) != OCFS2_I(inode)->ip_blkno) ||
300 !le64_to_cpu(de1->inode) ||
301 strcmp(".", de->name) ||
302 strcmp("..", de1->name)) {
303 mlog(ML_ERROR, "bad directory (dir #%"MLFu64") - "
304 "no `.' or `..'\n",
305 OCFS2_I(inode)->ip_blkno);
306 brelse(bh);
307 return 1;
308 }
309 offset = le16_to_cpu(de->rec_len) + le16_to_cpu(de1->rec_len);
310 de = (struct ocfs2_dir_entry *)((char *)de1 + le16_to_cpu(de1->rec_len));
311 while (offset < i_size_read(inode) ) {
312 if (!bh || (void *)de >= (void *)(bh->b_data + sb->s_blocksize)) {
313 brelse(bh);
314 bh = ocfs2_bread(inode,
315 offset >> sb->s_blocksize_bits, &err, 0);
316 if (!bh) {
317 mlog(ML_ERROR, "directory #%"MLFu64" contains "
318 "a hole at offset %lu\n",
319 OCFS2_I(inode)->ip_blkno, offset);
320 offset += sb->s_blocksize;
321 continue;
322 }
323 de = (struct ocfs2_dir_entry *) bh->b_data;
324 }
325 if (!ocfs2_check_dir_entry(inode, de, bh, offset)) {
326 brelse(bh);
327 return 1;
328 }
329 if (le64_to_cpu(de->inode)) {
330 brelse(bh);
331 return 0;
332 }
333 offset += le16_to_cpu(de->rec_len);
334 de = (struct ocfs2_dir_entry *)
335 ((char *)de + le16_to_cpu(de->rec_len));
336 }
337 brelse(bh);
338 return 1;
339}
340
341/* returns a bh of the 1st new block in the allocation. */
342int ocfs2_do_extend_dir(struct super_block *sb,
343 struct ocfs2_journal_handle *handle,
344 struct inode *dir,
345 struct buffer_head *parent_fe_bh,
346 struct ocfs2_alloc_context *data_ac,
347 struct ocfs2_alloc_context *meta_ac,
348 struct buffer_head **new_bh)
349{
350 int status;
351 int extend;
352 u64 p_blkno;
353
354 spin_lock(&OCFS2_I(dir)->ip_lock);
355 extend = (i_size_read(dir) == ocfs2_clusters_to_bytes(sb, OCFS2_I(dir)->ip_clusters));
356 spin_unlock(&OCFS2_I(dir)->ip_lock);
357
358 if (extend) {
359 status = ocfs2_do_extend_allocation(OCFS2_SB(sb), dir, 1,
360 parent_fe_bh, handle,
361 data_ac, meta_ac, NULL);
362 BUG_ON(status == -EAGAIN);
363 if (status < 0) {
364 mlog_errno(status);
365 goto bail;
366 }
367 }
368
369 status = ocfs2_extent_map_get_blocks(dir, (dir->i_blocks >>
370 (sb->s_blocksize_bits - 9)),
371 1, &p_blkno, NULL);
372 if (status < 0) {
373 mlog_errno(status);
374 goto bail;
375 }
376
377 *new_bh = sb_getblk(sb, p_blkno);
378 if (!*new_bh) {
379 status = -EIO;
380 mlog_errno(status);
381 goto bail;
382 }
383 status = 0;
384bail:
385 mlog_exit(status);
386 return status;
387}
388
389/* assumes you already have a cluster lock on the directory. */
390static int ocfs2_extend_dir(struct ocfs2_super *osb,
391 struct inode *dir,
392 struct buffer_head *parent_fe_bh,
393 struct buffer_head **new_de_bh)
394{
395 int status = 0;
396 int credits, num_free_extents;
397 loff_t dir_i_size;
398 struct ocfs2_dinode *fe = (struct ocfs2_dinode *) parent_fe_bh->b_data;
399 struct ocfs2_alloc_context *data_ac = NULL;
400 struct ocfs2_alloc_context *meta_ac = NULL;
401 struct ocfs2_journal_handle *handle = NULL;
402 struct buffer_head *new_bh = NULL;
403 struct ocfs2_dir_entry * de;
404 struct super_block *sb = osb->sb;
405
406 mlog_entry_void();
407
408 dir_i_size = i_size_read(dir);
409 mlog(0, "extending dir %"MLFu64" (i_size = %lld)\n",
410 OCFS2_I(dir)->ip_blkno, dir_i_size);
411
412 handle = ocfs2_alloc_handle(osb);
413 if (handle == NULL) {
414 status = -ENOMEM;
415 mlog_errno(status);
416 goto bail;
417 }
418
419 /* dir->i_size is always block aligned. */
420 spin_lock(&OCFS2_I(dir)->ip_lock);
421 if (dir_i_size == ocfs2_clusters_to_bytes(sb, OCFS2_I(dir)->ip_clusters)) {
422 spin_unlock(&OCFS2_I(dir)->ip_lock);
423 num_free_extents = ocfs2_num_free_extents(osb, dir, fe);
424 if (num_free_extents < 0) {
425 status = num_free_extents;
426 mlog_errno(status);
427 goto bail;
428 }
429
430 if (!num_free_extents) {
431 status = ocfs2_reserve_new_metadata(osb, handle,
432 fe, &meta_ac);
433 if (status < 0) {
434 if (status != -ENOSPC)
435 mlog_errno(status);
436 goto bail;
437 }
438 }
439
440 status = ocfs2_reserve_clusters(osb, handle, 1, &data_ac);
441 if (status < 0) {
442 if (status != -ENOSPC)
443 mlog_errno(status);
444 goto bail;
445 }
446
447 credits = ocfs2_calc_extend_credits(sb, fe, 1);
448 } else {
449 spin_unlock(&OCFS2_I(dir)->ip_lock);
450 credits = OCFS2_SIMPLE_DIR_EXTEND_CREDITS;
451 }
452
453 handle = ocfs2_start_trans(osb, handle, credits);
454 if (IS_ERR(handle)) {
455 status = PTR_ERR(handle);
456 handle = NULL;
457 mlog_errno(status);
458 goto bail;
459 }
460
461 status = ocfs2_do_extend_dir(osb->sb, handle, dir, parent_fe_bh,
462 data_ac, meta_ac, &new_bh);
463 if (status < 0) {
464 mlog_errno(status);
465 goto bail;
466 }
467
468 ocfs2_set_new_buffer_uptodate(dir, new_bh);
469
470 status = ocfs2_journal_access(handle, dir, new_bh,
471 OCFS2_JOURNAL_ACCESS_CREATE);
472 if (status < 0) {
473 mlog_errno(status);
474 goto bail;
475 }
476 memset(new_bh->b_data, 0, sb->s_blocksize);
477 de = (struct ocfs2_dir_entry *) new_bh->b_data;
478 de->inode = 0;
479 de->rec_len = cpu_to_le16(sb->s_blocksize);
480 status = ocfs2_journal_dirty(handle, new_bh);
481 if (status < 0) {
482 mlog_errno(status);
483 goto bail;
484 }
485
486 dir_i_size += dir->i_sb->s_blocksize;
487 i_size_write(dir, dir_i_size);
488 dir->i_blocks = ocfs2_align_bytes_to_sectors(dir_i_size);
489 status = ocfs2_mark_inode_dirty(handle, dir, parent_fe_bh);
490 if (status < 0) {
491 mlog_errno(status);
492 goto bail;
493 }
494
495 *new_de_bh = new_bh;
496 get_bh(*new_de_bh);
497bail:
498 if (handle)
499 ocfs2_commit_trans(handle);
500
501 if (data_ac)
502 ocfs2_free_alloc_context(data_ac);
503 if (meta_ac)
504 ocfs2_free_alloc_context(meta_ac);
505
506 if (new_bh)
507 brelse(new_bh);
508
509 mlog_exit(status);
510 return status;
511}
512
513/*
514 * Search the dir for a good spot, extending it if necessary. The
515 * block containing an appropriate record is returned in ret_de_bh.
516 */
517int ocfs2_prepare_dir_for_insert(struct ocfs2_super *osb,
518 struct inode *dir,
519 struct buffer_head *parent_fe_bh,
520 const char *name,
521 int namelen,
522 struct buffer_head **ret_de_bh)
523{
524 unsigned long offset;
525 struct buffer_head * bh = NULL;
526 unsigned short rec_len;
527 struct ocfs2_dinode *fe;
528 struct ocfs2_dir_entry *de;
529 struct super_block *sb;
530 int status;
531
532 mlog_entry_void();
533
534 mlog(0, "getting ready to insert namelen %d into dir %"MLFu64"\n",
535 namelen, OCFS2_I(dir)->ip_blkno);
536
537 BUG_ON(!S_ISDIR(dir->i_mode));
538 fe = (struct ocfs2_dinode *) parent_fe_bh->b_data;
539 BUG_ON(le64_to_cpu(fe->i_size) != i_size_read(dir));
540
541 sb = dir->i_sb;
542
543 if (!namelen) {
544 status = -EINVAL;
545 mlog_errno(status);
546 goto bail;
547 }
548
549 bh = ocfs2_bread(dir, 0, &status, 0);
550 if (!bh) {
551 mlog_errno(status);
552 goto bail;
553 }
554
555 rec_len = OCFS2_DIR_REC_LEN(namelen);
556 offset = 0;
557 de = (struct ocfs2_dir_entry *) bh->b_data;
558 while (1) {
559 if ((char *)de >= sb->s_blocksize + bh->b_data) {
560 brelse(bh);
561 bh = NULL;
562
563 if (i_size_read(dir) <= offset) {
564 status = ocfs2_extend_dir(osb,
565 dir,
566 parent_fe_bh,
567 &bh);
568 if (status < 0) {
569 mlog_errno(status);
570 goto bail;
571 }
572 BUG_ON(!bh);
573 *ret_de_bh = bh;
574 get_bh(*ret_de_bh);
575 goto bail;
576 }
577 bh = ocfs2_bread(dir,
578 offset >> sb->s_blocksize_bits,
579 &status,
580 0);
581 if (!bh) {
582 mlog_errno(status);
583 goto bail;
584 }
585 /* move to next block */
586 de = (struct ocfs2_dir_entry *) bh->b_data;
587 }
588 if (!ocfs2_check_dir_entry(dir, de, bh, offset)) {
589 status = -ENOENT;
590 goto bail;
591 }
592 if (ocfs2_match(namelen, name, de)) {
593 status = -EEXIST;
594 goto bail;
595 }
596 if (((le64_to_cpu(de->inode) == 0) &&
597 (le16_to_cpu(de->rec_len) >= rec_len)) ||
598 (le16_to_cpu(de->rec_len) >=
599 (OCFS2_DIR_REC_LEN(de->name_len) + rec_len))) {
600 /* Ok, we found a spot. Return this bh and let
601 * the caller actually fill it in. */
602 *ret_de_bh = bh;
603 get_bh(*ret_de_bh);
604 status = 0;
605 goto bail;
606 }
607 offset += le16_to_cpu(de->rec_len);
608 de = (struct ocfs2_dir_entry *)((char *) de + le16_to_cpu(de->rec_len));
609 }
610
611 status = 0;
612bail:
613 if (bh)
614 brelse(bh);
615
616 mlog_exit(status);
617 return status;
618}
diff --git a/fs/ocfs2/dir.h b/fs/ocfs2/dir.h
new file mode 100644
index 000000000000..5f614ec9649c
--- /dev/null
+++ b/fs/ocfs2/dir.h
@@ -0,0 +1,54 @@
1/* -*- mode: c; c-basic-offset: 8; -*-
2 * vim: noexpandtab sw=8 ts=8 sts=0:
3 *
4 * dir.h
5 *
6 * Function prototypes
7 *
8 * Copyright (C) 2002, 2004 Oracle. All rights reserved.
9 *
10 * This program is free software; you can redistribute it and/or
11 * modify it under the terms of the GNU General Public
12 * License as published by the Free Software Foundation; either
13 * version 2 of the License, or (at your option) any later version.
14 *
15 * This program is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
18 * General Public License for more details.
19 *
20 * You should have received a copy of the GNU General Public
21 * License along with this program; if not, write to the
22 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
23 * Boston, MA 02111-1307, USA.
24 */
25
26#ifndef OCFS2_DIR_H
27#define OCFS2_DIR_H
28
/* Returns 0 if name is not in dir, -EEXIST if it is. */
29int ocfs2_check_dir_for_entry(struct inode *dir,
30 const char *name,
31 int namelen);
/* Returns 0 if an in-use entry exists beyond the first two records,
 * 1 otherwise (including when the directory looks corrupt). */
32int ocfs2_empty_dir(struct inode *inode); /* FIXME: to namei.c */
/* Looks name up in inode; on success returns 0 and fills *blkno,
 * *dirent_bh and *dirent, otherwise returns -ENOENT with both output
 * pointers set to NULL. */
33int ocfs2_find_files_on_disk(const char *name,
34 int namelen,
35 u64 *blkno,
36 struct inode *inode,
37 struct buffer_head **dirent_bh,
38 struct ocfs2_dir_entry **dirent);
/* Directory read routine: feeds in-use entries to filldir starting at
 * filp->f_pos. */
39int ocfs2_readdir(struct file *filp, void *dirent, filldir_t filldir);
/* Finds (or creates, by extending dir) a block with room for an entry
 * of namelen bytes and returns a reference to it in *ret_de_bh. */
40int ocfs2_prepare_dir_for_insert(struct ocfs2_super *osb,
41 struct inode *dir,
42 struct buffer_head *parent_fe_bh,
43 const char *name,
44 int namelen,
45 struct buffer_head **ret_de_bh);
/* Only pointers to the allocation context are needed below. */
46struct ocfs2_alloc_context;
/* Returns in *new_bh a buffer for the block just past dir's current
 * i_blocks, allocating a cluster first when the directory is full. */
47int ocfs2_do_extend_dir(struct super_block *sb,
48 struct ocfs2_journal_handle *handle,
49 struct inode *dir,
50 struct buffer_head *parent_fe_bh,
51 struct ocfs2_alloc_context *data_ac,
52 struct ocfs2_alloc_context *meta_ac,
53 struct buffer_head **new_bh);
54#endif /* OCFS2_DIR_H */
diff --git a/fs/ocfs2/dlm/Makefile b/fs/ocfs2/dlm/Makefile
new file mode 100644
index 000000000000..ce3f7c29d270
--- /dev/null
+++ b/fs/ocfs2/dlm/Makefile
@@ -0,0 +1,8 @@
# Add fs/ocfs2 to the include search path for this directory.
1EXTRA_CFLAGS += -Ifs/ocfs2
2
# Both objects are controlled by CONFIG_OCFS2_FS.
3obj-$(CONFIG_OCFS2_FS) += ocfs2_dlm.o ocfs2_dlmfs.o
4
# Objects that make up ocfs2_dlm.o.
5ocfs2_dlm-objs := dlmdomain.o dlmdebug.o dlmthread.o dlmrecovery.o \
6 dlmmaster.o dlmast.o dlmconvert.o dlmlock.o dlmunlock.o dlmver.o
7
# Objects that make up ocfs2_dlmfs.o.
8ocfs2_dlmfs-objs := userdlm.o dlmfs.o dlmfsver.o
diff --git a/fs/ocfs2/dlm/dlmapi.h b/fs/ocfs2/dlm/dlmapi.h
new file mode 100644
index 000000000000..53652f51c0e1
--- /dev/null
+++ b/fs/ocfs2/dlm/dlmapi.h
@@ -0,0 +1,214 @@
1/* -*- mode: c; c-basic-offset: 8; -*-
2 * vim: noexpandtab sw=8 ts=8 sts=0:
3 *
4 * dlmapi.h
5 *
6 * externally exported dlm interfaces
7 *
8 * Copyright (C) 2004 Oracle. All rights reserved.
9 *
10 * This program is free software; you can redistribute it and/or
11 * modify it under the terms of the GNU General Public
12 * License as published by the Free Software Foundation; either
13 * version 2 of the License, or (at your option) any later version.
14 *
15 * This program is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
18 * General Public License for more details.
19 *
20 * You should have received a copy of the GNU General Public
21 * License along with this program; if not, write to the
22 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
23 * Boston, MA 02111-1307, USA.
24 *
25 */
26
27#ifndef DLMAPI_H
28#define DLMAPI_H
29
30struct dlm_lock;
31struct dlm_ctxt;
32
/* Status codes used by the dlm interfaces declared in this header.
 * The values are consecutive starting at 0; DLM_MAXSTATS is not a
 * status itself but the upper bound used for validation. */
33/* NOTE: changes made to this enum should be reflected in dlmdebug.c */
34enum dlm_status {
35 DLM_NORMAL = 0, /* 0: request in progress */
36 DLM_GRANTED, /* 1: request granted */
37 DLM_DENIED, /* 2: request denied */
38 DLM_DENIED_NOLOCKS, /* 3: request denied, out of system resources */
39 DLM_WORKING, /* 4: async request in progress */
40 DLM_BLOCKED, /* 5: lock request blocked */
41 DLM_BLOCKED_ORPHAN, /* 6: lock request blocked by an orphan lock */
42 DLM_DENIED_GRACE_PERIOD, /* 7: topological change in progress */
43 DLM_SYSERR, /* 8: system error */
44 DLM_NOSUPPORT, /* 9: unsupported */
45 DLM_CANCELGRANT, /* 10: can't cancel convert: already granted */
46 DLM_IVLOCKID, /* 11: bad lockid */
47 DLM_SYNC, /* 12: synchronous request granted */
48 DLM_BADTYPE, /* 13: bad resource type */
49 DLM_BADRESOURCE, /* 14: bad resource handle */
50 DLM_MAXHANDLES, /* 15: no more resource handles */
51 DLM_NOCLINFO, /* 16: can't contact cluster manager */
52 DLM_NOLOCKMGR, /* 17: can't contact lock manager */
53 DLM_NOPURGED, /* 18: can't contact purge daemon */
54 DLM_BADARGS, /* 19: bad api args */
55 DLM_VOID, /* 20: no status */
56 DLM_NOTQUEUED, /* 21: NOQUEUE was specified and request failed */
57 DLM_IVBUFLEN, /* 22: invalid resource name length */
58 DLM_CVTUNGRANT, /* 23: attempted to convert ungranted lock */
59 DLM_BADPARAM, /* 24: invalid lock mode specified */
60 DLM_VALNOTVALID, /* 25: value block has been invalidated */
61 DLM_REJECTED, /* 26: request rejected, unrecognized client */
62 DLM_ABORT, /* 27: blocked lock request cancelled */
63 DLM_CANCEL, /* 28: conversion request cancelled */
64 DLM_IVRESHANDLE, /* 29: invalid resource handle */
65 DLM_DEADLOCK, /* 30: deadlock recovery refused this request */
66 DLM_DENIED_NOASTS, /* 31: failed to allocate AST */
67 DLM_FORWARD, /* 32: request must wait for primary's response */
68 DLM_TIMEOUT, /* 33: timeout value for lock has expired */
69 DLM_IVGROUPID, /* 34: invalid group specification */
70 DLM_VERS_CONFLICT, /* 35: version conflicts prevent request handling */
71 DLM_BAD_DEVICE_PATH, /* 36: Locks device does not exist or path wrong */
72 DLM_NO_DEVICE_PERMISSION, /* 37: Client has insufficient permissions for device */
73 DLM_NO_CONTROL_DEVICE, /* 38: Cannot set options on opened device */
74
75 DLM_RECOVERING, /* 39: extension, allows caller to fail a lock
76 request if it is being recovered */
77 DLM_MIGRATING, /* 40: extension, allows caller to fail a lock
78 request if it is being migrated */
79 DLM_MAXSTATS, /* 41: upper limit for return code validation */
80};
81
82/* for pretty-printing dlm_status error messages */
83const char *dlm_errmsg(enum dlm_status err);
84/* for pretty-printing dlm_status error names */
85const char *dlm_errname(enum dlm_status err);
86
/* Eventually the DLM will use standard errno values, but in the
 * meantime this lets us track dlm errors as they bubble up.  When we
 * bring its error reporting into line with the rest of the stack,
 * these can just be replaced with calls to mlog_errno.
 *
 * Logs the status name at ML_ERROR unless the status is DLM_RECOVERING,
 * DLM_MIGRATING or DLM_FORWARD.  The argument is copied into a local
 * first so that it is evaluated exactly once, which keeps the macro
 * safe for arguments with side effects. */
#define dlm_error(st) do {						\
	enum dlm_status _dlm_st = (st);					\
	if (_dlm_st != DLM_RECOVERING &&				\
	    _dlm_st != DLM_MIGRATING &&					\
	    _dlm_st != DLM_FORWARD)					\
		mlog(ML_ERROR, "dlm status = %s\n",			\
		     dlm_errname(_dlm_st));				\
} while (0)
97
/* DLM_LKSB_* single-bit values, 0x01 through 0x80.
 * NOTE(review): presumably these are stored in dlm_lockstatus.flags;
 * confirm against the users of that field. */
98#define DLM_LKSB_UNUSED1 0x01
99#define DLM_LKSB_PUT_LVB 0x02
100#define DLM_LKSB_GET_LVB 0x04
101#define DLM_LKSB_UNUSED2 0x08
102#define DLM_LKSB_UNUSED3 0x10
103#define DLM_LKSB_UNUSED4 0x20
104#define DLM_LKSB_UNUSED5 0x40
105#define DLM_LKSB_UNUSED6 0x80
106
/* Size in bytes of the lvb array in struct dlm_lockstatus. */
107#define DLM_LVB_LEN 64
108
109/* Callers are only allowed access to the lvb and status members of
110 * this struct. */
111struct dlm_lockstatus {
112 enum dlm_status status;
113 u32 flags;
114 struct dlm_lock *lockid;
115 char lvb[DLM_LVB_LEN];
116};
117
118/* Valid lock modes. */
119#define LKM_IVMODE (-1) /* invalid mode */
120#define LKM_NLMODE 0 /* null lock */
121#define LKM_CRMODE 1 /* concurrent read; unsupported */
122#define LKM_CWMODE 2 /* concurrent write; unsupported */
123#define LKM_PRMODE 3 /* protected read */
124#define LKM_PWMODE 4 /* protected write; unsupported */
125#define LKM_EXMODE 5 /* exclusive */
/* Largest mode value defined above (same value as LKM_EXMODE). */
126#define LKM_MAXMODE 5
/* NOTE(review): presumably masks the mode out of a combined value;
 * confirm against users. */
127#define LKM_MODEMASK 0xff
128
129/* Flags passed to dlmlock and dlmunlock:
130 * reserved: flags used by the "real" dlm
131 * only a few are supported by this dlm
132 * (U) = unsupported by ocfs2 dlm */
133#define LKM_ORPHAN 0x00000010 /* this lock is orphanable (U) */
134#define LKM_PARENTABLE 0x00000020 /* this lock was orphaned (U) */
135#define LKM_BLOCK 0x00000040 /* blocking lock request (U) */
136#define LKM_LOCAL 0x00000080 /* local lock request */
137#define LKM_VALBLK 0x00000100 /* lock value block request */
138#define LKM_NOQUEUE 0x00000200 /* non blocking request */
139#define LKM_CONVERT 0x00000400 /* conversion request */
140#define LKM_NODLCKWT 0x00000800 /* this lock won't deadlock (U) */
141#define LKM_UNLOCK 0x00001000 /* deallocate this lock */
142#define LKM_CANCEL 0x00002000 /* cancel conversion request */
143#define LKM_DEQALL 0x00004000 /* remove all locks held by proc (U) */
144#define LKM_INVVALBLK 0x00008000 /* invalidate lock value block */
145#define LKM_SYNCSTS 0x00010000 /* return synchronous status if possible (U) */
146#define LKM_TIMEOUT 0x00020000 /* lock request contains timeout (U) */
147#define LKM_SNGLDLCK 0x00040000 /* request can self-deadlock (U) */
148#define LKM_FINDLOCAL 0x00080000 /* find local lock request (U) */
149#define LKM_PROC_OWNED 0x00100000 /* owned by process, not group (U) */
150#define LKM_XID 0x00200000 /* use transaction id for deadlock (U) */
151#define LKM_XID_CONFLICT 0x00400000 /* do not allow lock inheritance (U) */
152#define LKM_FORCE 0x00800000 /* force unlock flag */
153#define LKM_REVVALBLK 0x01000000 /* temporary solution: re-validate
154 lock value block (U) */
155/* unused */
156#define LKM_UNUSED1 0x00000001 /* unused */
157#define LKM_UNUSED2 0x00000002 /* unused */
158#define LKM_UNUSED3 0x00000004 /* unused */
159#define LKM_UNUSED4 0x00000008 /* unused */
160#define LKM_UNUSED5 0x02000000 /* unused */
161#define LKM_UNUSED6 0x04000000 /* unused */
162#define LKM_UNUSED7 0x08000000 /* unused */
163
164/* ocfs2 extensions: internal only
165 * should never be used by caller */
166#define LKM_MIGRATION 0x10000000 /* extension: lockres is to be migrated
167 to another node */
168#define LKM_PUT_LVB 0x20000000 /* extension: lvb is being passed
169 should be applied to lockres */
170#define LKM_GET_LVB 0x40000000 /* extension: lvb should be copied
171 from lockres when lock is granted */
172#define LKM_RECOVERY 0x80000000 /* extension: flag for recovery lock
173 used to avoid recovery rwsem */
174
175
/* Callback types accepted by dlmlock() and dlmunlock().
 * NOTE(review): the void * argument is presumably the `data` pointer
 * passed to those calls, and the int given to the bast presumably a
 * lock mode; confirm against the implementation. */
176typedef void (dlm_astlockfunc_t)(void *);
177typedef void (dlm_bastlockfunc_t)(void *, int);
178typedef void (dlm_astunlockfunc_t)(void *, enum dlm_status);
179
/* Lock request: takes a domain context, a lock mode, a caller-supplied
 * lock status block, LKM_* flags, a resource name, and ast/bast
 * callbacks with their data pointer.  Returns a dlm_status. */
180enum dlm_status dlmlock(struct dlm_ctxt *dlm,
181 int mode,
182 struct dlm_lockstatus *lksb,
183 int flags,
184 const char *name,
185 dlm_astlockfunc_t *ast,
186 void *data,
187 dlm_bastlockfunc_t *bast);
188
/* Unlock request for the lock described by lksb; unlockast receives a
 * void * and a dlm_status.  Returns a dlm_status. */
189enum dlm_status dlmunlock(struct dlm_ctxt *dlm,
190 struct dlm_lockstatus *lksb,
191 int flags,
192 dlm_astunlockfunc_t *unlockast,
193 void *data);
194
/* Register for the named domain; returns the dlm_ctxt used by the
 * other calls in this header. */
195struct dlm_ctxt * dlm_register_domain(const char *domain, u32 key);
196
197void dlm_unregister_domain(struct dlm_ctxt *dlm);
198
199void dlm_print_one_lock(struct dlm_lock *lockid);
200
/* Eviction callback: a function taking an int and a void *, bundled
 * with its data pointer and a list linkage.
 * NOTE(review): the int is presumably a node number; confirm. */
201typedef void (dlm_eviction_func)(int, void *);
202struct dlm_eviction_cb {
203 struct list_head ec_item;
204 dlm_eviction_func *ec_func;
205 void *ec_data;
206};
207void dlm_setup_eviction_cb(struct dlm_eviction_cb *cb,
208 dlm_eviction_func *f,
209 void *data);
210void dlm_register_eviction_cb(struct dlm_ctxt *dlm,
211 struct dlm_eviction_cb *cb);
212void dlm_unregister_eviction_cb(struct dlm_eviction_cb *cb);
213
214#endif /* DLMAPI_H */
diff --git a/fs/ocfs2/dlm/dlmast.c b/fs/ocfs2/dlm/dlmast.c
new file mode 100644
index 000000000000..8d17d28ef91c
--- /dev/null
+++ b/fs/ocfs2/dlm/dlmast.c
@@ -0,0 +1,466 @@
1/* -*- mode: c; c-basic-offset: 8; -*-
2 * vim: noexpandtab sw=8 ts=8 sts=0:
3 *
4 * dlmast.c
5 *
6 * AST and BAST functionality for local and remote nodes
7 *
8 * Copyright (C) 2004 Oracle. All rights reserved.
9 *
10 * This program is free software; you can redistribute it and/or
11 * modify it under the terms of the GNU General Public
12 * License as published by the Free Software Foundation; either
13 * version 2 of the License, or (at your option) any later version.
14 *
15 * This program is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
18 * General Public License for more details.
19 *
20 * You should have received a copy of the GNU General Public
21 * License along with this program; if not, write to the
22 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
23 * Boston, MA 02111-1307, USA.
24 *
25 */
26
27
28#include <linux/module.h>
29#include <linux/fs.h>
30#include <linux/types.h>
31#include <linux/slab.h>
32#include <linux/highmem.h>
33#include <linux/utsname.h>
34#include <linux/init.h>
35#include <linux/sysctl.h>
36#include <linux/random.h>
37#include <linux/blkdev.h>
38#include <linux/socket.h>
39#include <linux/inet.h>
40#include <linux/spinlock.h>
41
42
43#include "cluster/heartbeat.h"
44#include "cluster/nodemanager.h"
45#include "cluster/tcp.h"
46#include "cluster/endian.h"
47
48#include "dlmapi.h"
49#include "dlmcommon.h"
50
51#define MLOG_MASK_PREFIX ML_DLM
52#include "cluster/masklog.h"
53
54static void dlm_update_lvb(struct dlm_ctxt *dlm, struct dlm_lock_resource *res,
55 struct dlm_lock *lock);
56static int dlm_should_cancel_bast(struct dlm_ctxt *dlm, struct dlm_lock *lock);
57
58/* Should be called as an ast gets queued to see if the new
59 * lock level will obsolete a pending bast.
60 * For example, if dlm_thread queued a bast for an EX lock that
61 * was blocking another EX, but before sending the bast the
62 * lock owner downconverted to NL, the bast is now obsolete.
63 * Only the ast should be sent.
64 * This is needed because the lock and convert paths can queue
65 * asts out-of-band (not waiting for dlm_thread) in order to
66 * allow for LKM_NOQUEUE to get immediate responses. */
67static int dlm_should_cancel_bast(struct dlm_ctxt *dlm, struct dlm_lock *lock)
68{
69 assert_spin_locked(&dlm->ast_lock);
70 assert_spin_locked(&lock->spinlock);
71
72 if (lock->ml.highest_blocked == LKM_IVMODE)
73 return 0;
74 BUG_ON(lock->ml.highest_blocked == LKM_NLMODE);
75
76 if (lock->bast_pending &&
77 list_empty(&lock->bast_list))
78 /* old bast already sent, ok */
79 return 0;
80
81 if (lock->ml.type == LKM_EXMODE)
82 /* EX blocks anything left, any bast still valid */
83 return 0;
84 else if (lock->ml.type == LKM_NLMODE)
85 /* NL blocks nothing, no reason to send any bast, cancel it */
86 return 1;
87 else if (lock->ml.highest_blocked != LKM_EXMODE)
88 /* PR only blocks EX */
89 return 1;
90
91 return 0;
92}
93
94static void __dlm_queue_ast(struct dlm_ctxt *dlm, struct dlm_lock *lock)
95{
96 mlog_entry_void();
97
98 BUG_ON(!dlm);
99 BUG_ON(!lock);
100
101 assert_spin_locked(&dlm->ast_lock);
102 if (!list_empty(&lock->ast_list)) {
103 mlog(ML_ERROR, "ast list not empty!! pending=%d, newlevel=%d\n",
104 lock->ast_pending, lock->ml.type);
105 BUG();
106 }
107 BUG_ON(!list_empty(&lock->ast_list));
108 if (lock->ast_pending)
109 mlog(0, "lock has an ast getting flushed right now\n");
110
111 /* putting lock on list, add a ref */
112 dlm_lock_get(lock);
113 spin_lock(&lock->spinlock);
114
115 /* check to see if this ast obsoletes the bast */
116 if (dlm_should_cancel_bast(dlm, lock)) {
117 struct dlm_lock_resource *res = lock->lockres;
118 mlog(0, "%s: cancelling bast for %.*s\n",
119 dlm->name, res->lockname.len, res->lockname.name);
120 lock->bast_pending = 0;
121 list_del_init(&lock->bast_list);
122 lock->ml.highest_blocked = LKM_IVMODE;
123 /* removing lock from list, remove a ref. guaranteed
124 * this won't be the last ref because of the get above,
125 * so res->spinlock will not be taken here */
126 dlm_lock_put(lock);
127 /* free up the reserved bast that we are cancelling.
128 * guaranteed that this will not be the last reserved
129 * ast because *both* an ast and a bast were reserved
130 * to get to this point. the res->spinlock will not be
131 * taken here */
132 dlm_lockres_release_ast(dlm, res);
133 }
134 list_add_tail(&lock->ast_list, &dlm->pending_asts);
135 lock->ast_pending = 1;
136 spin_unlock(&lock->spinlock);
137}
138
139void dlm_queue_ast(struct dlm_ctxt *dlm, struct dlm_lock *lock)
140{
141 mlog_entry_void();
142
143 BUG_ON(!dlm);
144 BUG_ON(!lock);
145
146 spin_lock(&dlm->ast_lock);
147 __dlm_queue_ast(dlm, lock);
148 spin_unlock(&dlm->ast_lock);
149}
150
151
152static void __dlm_queue_bast(struct dlm_ctxt *dlm, struct dlm_lock *lock)
153{
154 mlog_entry_void();
155
156 BUG_ON(!dlm);
157 BUG_ON(!lock);
158 assert_spin_locked(&dlm->ast_lock);
159
160 BUG_ON(!list_empty(&lock->bast_list));
161 if (lock->bast_pending)
162 mlog(0, "lock has a bast getting flushed right now\n");
163
164 /* putting lock on list, add a ref */
165 dlm_lock_get(lock);
166 spin_lock(&lock->spinlock);
167 list_add_tail(&lock->bast_list, &dlm->pending_basts);
168 lock->bast_pending = 1;
169 spin_unlock(&lock->spinlock);
170}
171
172void dlm_queue_bast(struct dlm_ctxt *dlm, struct dlm_lock *lock)
173{
174 mlog_entry_void();
175
176 BUG_ON(!dlm);
177 BUG_ON(!lock);
178
179 spin_lock(&dlm->ast_lock);
180 __dlm_queue_bast(dlm, lock);
181 spin_unlock(&dlm->ast_lock);
182}
183
184static void dlm_update_lvb(struct dlm_ctxt *dlm, struct dlm_lock_resource *res,
185 struct dlm_lock *lock)
186{
187 struct dlm_lockstatus *lksb = lock->lksb;
188 BUG_ON(!lksb);
189
190 /* only updates if this node masters the lockres */
191 if (res->owner == dlm->node_num) {
192
193 spin_lock(&res->spinlock);
194 /* check the lksb flags for the direction */
195 if (lksb->flags & DLM_LKSB_GET_LVB) {
196 mlog(0, "getting lvb from lockres for %s node\n",
197 lock->ml.node == dlm->node_num ? "master" :
198 "remote");
199 memcpy(lksb->lvb, res->lvb, DLM_LVB_LEN);
200 } else if (lksb->flags & DLM_LKSB_PUT_LVB) {
201 mlog(0, "setting lvb from lockres for %s node\n",
202 lock->ml.node == dlm->node_num ? "master" :
203 "remote");
204 memcpy(res->lvb, lksb->lvb, DLM_LVB_LEN);
205 }
206 spin_unlock(&res->spinlock);
207 }
208
209 /* reset any lvb flags on the lksb */
210 lksb->flags &= ~(DLM_LKSB_PUT_LVB|DLM_LKSB_GET_LVB);
211}
212
213void dlm_do_local_ast(struct dlm_ctxt *dlm, struct dlm_lock_resource *res,
214 struct dlm_lock *lock)
215{
216 dlm_astlockfunc_t *fn;
217 struct dlm_lockstatus *lksb;
218
219 mlog_entry_void();
220
221 lksb = lock->lksb;
222 fn = lock->ast;
223 BUG_ON(lock->ml.node != dlm->node_num);
224
225 dlm_update_lvb(dlm, res, lock);
226 (*fn)(lock->astdata);
227}
228
229
230int dlm_do_remote_ast(struct dlm_ctxt *dlm, struct dlm_lock_resource *res,
231 struct dlm_lock *lock)
232{
233 int ret;
234 struct dlm_lockstatus *lksb;
235 int lksbflags;
236
237 mlog_entry_void();
238
239 lksb = lock->lksb;
240 BUG_ON(lock->ml.node == dlm->node_num);
241
242 lksbflags = lksb->flags;
243 dlm_update_lvb(dlm, res, lock);
244
245 /* lock request came from another node
246 * go do the ast over there */
247 ret = dlm_send_proxy_ast(dlm, res, lock, lksbflags);
248 return ret;
249}
250
251void dlm_do_local_bast(struct dlm_ctxt *dlm, struct dlm_lock_resource *res,
252 struct dlm_lock *lock, int blocked_type)
253{
254 dlm_bastlockfunc_t *fn = lock->bast;
255
256 mlog_entry_void();
257 BUG_ON(lock->ml.node != dlm->node_num);
258
259 (*fn)(lock->astdata, blocked_type);
260}
261
262
263
264int dlm_proxy_ast_handler(struct o2net_msg *msg, u32 len, void *data)
265{
266 int ret;
267 unsigned int locklen;
268 struct dlm_ctxt *dlm = data;
269 struct dlm_lock_resource *res = NULL;
270 struct dlm_lock *lock = NULL;
271 struct dlm_proxy_ast *past = (struct dlm_proxy_ast *) msg->buf;
272 char *name;
273 struct list_head *iter, *head=NULL;
274 u64 cookie;
275 u32 flags;
276
277 if (!dlm_grab(dlm)) {
278 dlm_error(DLM_REJECTED);
279 return DLM_REJECTED;
280 }
281
282 mlog_bug_on_msg(!dlm_domain_fully_joined(dlm),
283 "Domain %s not fully joined!\n", dlm->name);
284
285 name = past->name;
286 locklen = past->namelen;
287 cookie = be64_to_cpu(past->cookie);
288 flags = be32_to_cpu(past->flags);
289
290 if (locklen > DLM_LOCKID_NAME_MAX) {
291 ret = DLM_IVBUFLEN;
292 mlog(ML_ERROR, "Invalid name length in proxy ast handler!\n");
293 goto leave;
294 }
295
296 if ((flags & (LKM_PUT_LVB|LKM_GET_LVB)) ==
297 (LKM_PUT_LVB|LKM_GET_LVB)) {
298 mlog(ML_ERROR, "both PUT and GET lvb specified\n");
299 ret = DLM_BADARGS;
300 goto leave;
301 }
302
303 mlog(0, "lvb: %s\n", flags & LKM_PUT_LVB ? "put lvb" :
304 (flags & LKM_GET_LVB ? "get lvb" : "none"));
305
306 mlog(0, "type=%d, blocked_type=%d\n", past->type, past->blocked_type);
307
308 if (past->type != DLM_AST &&
309 past->type != DLM_BAST) {
310 mlog(ML_ERROR, "Unknown ast type! %d, cookie=%"MLFu64", "
311 "name=%.*s\n", past->type, cookie, locklen, name);
312 ret = DLM_IVLOCKID;
313 goto leave;
314 }
315
316 res = dlm_lookup_lockres(dlm, name, locklen);
317 if (!res) {
318 mlog(ML_ERROR, "got %sast for unknown lockres! "
319 "cookie=%"MLFu64", name=%.*s, namelen=%u\n",
320 past->type == DLM_AST ? "" : "b",
321 cookie, locklen, name, locklen);
322 ret = DLM_IVLOCKID;
323 goto leave;
324 }
325
326 /* cannot get a proxy ast message if this node owns it */
327 BUG_ON(res->owner == dlm->node_num);
328
329 mlog(0, "lockres %.*s\n", res->lockname.len, res->lockname.name);
330
331 spin_lock(&res->spinlock);
332 if (res->state & DLM_LOCK_RES_RECOVERING) {
333 mlog(0, "responding with DLM_RECOVERING!\n");
334 ret = DLM_RECOVERING;
335 goto unlock_out;
336 }
337 if (res->state & DLM_LOCK_RES_MIGRATING) {
338 mlog(0, "responding with DLM_MIGRATING!\n");
339 ret = DLM_MIGRATING;
340 goto unlock_out;
341 }
342 /* try convert queue for both ast/bast */
343 head = &res->converting;
344 lock = NULL;
345 list_for_each(iter, head) {
346 lock = list_entry (iter, struct dlm_lock, list);
347 if (be64_to_cpu(lock->ml.cookie) == cookie)
348 goto do_ast;
349 }
350
351 /* if not on convert, try blocked for ast, granted for bast */
352 if (past->type == DLM_AST)
353 head = &res->blocked;
354 else
355 head = &res->granted;
356
357 list_for_each(iter, head) {
358 lock = list_entry (iter, struct dlm_lock, list);
359 if (be64_to_cpu(lock->ml.cookie) == cookie)
360 goto do_ast;
361 }
362
363 mlog(ML_ERROR, "got %sast for unknown lock! cookie=%"MLFu64", "
364 "name=%.*s, namelen=%u\n",
365 past->type == DLM_AST ? "" : "b", cookie, locklen, name, locklen);
366
367 ret = DLM_NORMAL;
368unlock_out:
369 spin_unlock(&res->spinlock);
370 goto leave;
371
372do_ast:
373 ret = DLM_NORMAL;
374 if (past->type == DLM_AST) {
375 /* do not alter lock refcount. switching lists. */
376 list_del_init(&lock->list);
377 list_add_tail(&lock->list, &res->granted);
378 mlog(0, "ast: adding to granted list... type=%d, "
379 "convert_type=%d\n", lock->ml.type, lock->ml.convert_type);
380 if (lock->ml.convert_type != LKM_IVMODE) {
381 lock->ml.type = lock->ml.convert_type;
382 lock->ml.convert_type = LKM_IVMODE;
383 } else {
384 // should already be there....
385 }
386
387 lock->lksb->status = DLM_NORMAL;
388
389 /* if we requested the lvb, fetch it into our lksb now */
390 if (flags & LKM_GET_LVB) {
391 BUG_ON(!(lock->lksb->flags & DLM_LKSB_GET_LVB));
392 memcpy(lock->lksb->lvb, past->lvb, DLM_LVB_LEN);
393 }
394 }
395 spin_unlock(&res->spinlock);
396
397 if (past->type == DLM_AST)
398 dlm_do_local_ast(dlm, res, lock);
399 else
400 dlm_do_local_bast(dlm, res, lock, past->blocked_type);
401
402leave:
403
404 if (res)
405 dlm_lockres_put(res);
406
407 dlm_put(dlm);
408 return ret;
409}
410
411
412
413int dlm_send_proxy_ast_msg(struct dlm_ctxt *dlm, struct dlm_lock_resource *res,
414 struct dlm_lock *lock, int msg_type,
415 int blocked_type, int flags)
416{
417 int ret = 0;
418 struct dlm_proxy_ast past;
419 struct kvec vec[2];
420 size_t veclen = 1;
421 int status;
422
423 mlog_entry("res %.*s, to=%u, type=%d, blocked_type=%d\n",
424 res->lockname.len, res->lockname.name, lock->ml.node,
425 msg_type, blocked_type);
426
427 memset(&past, 0, sizeof(struct dlm_proxy_ast));
428 past.node_idx = dlm->node_num;
429 past.type = msg_type;
430 past.blocked_type = blocked_type;
431 past.namelen = res->lockname.len;
432 memcpy(past.name, res->lockname.name, past.namelen);
433 past.cookie = lock->ml.cookie;
434
435 vec[0].iov_len = sizeof(struct dlm_proxy_ast);
436 vec[0].iov_base = &past;
437 if (flags & DLM_LKSB_GET_LVB) {
438 mlog(0, "returning requested LVB data\n");
439 be32_add_cpu(&past.flags, LKM_GET_LVB);
440 vec[1].iov_len = DLM_LVB_LEN;
441 vec[1].iov_base = lock->lksb->lvb;
442 veclen++;
443 }
444
445 ret = o2net_send_message_vec(DLM_PROXY_AST_MSG, dlm->key, vec, veclen,
446 lock->ml.node, &status);
447 if (ret < 0)
448 mlog_errno(ret);
449 else {
450 if (status == DLM_RECOVERING) {
451 mlog(ML_ERROR, "sent AST to node %u, it thinks this "
452 "node is dead!\n", lock->ml.node);
453 BUG();
454 } else if (status == DLM_MIGRATING) {
455 mlog(ML_ERROR, "sent AST to node %u, it returned "
456 "DLM_MIGRATING!\n", lock->ml.node);
457 BUG();
458 } else if (status != DLM_NORMAL) {
459 mlog(ML_ERROR, "AST to node %u returned %d!\n",
460 lock->ml.node, status);
461 /* ignore it */
462 }
463 ret = 0;
464 }
465 return ret;
466}
diff --git a/fs/ocfs2/dlm/dlmcommon.h b/fs/ocfs2/dlm/dlmcommon.h
new file mode 100644
index 000000000000..3fecba0a6023
--- /dev/null
+++ b/fs/ocfs2/dlm/dlmcommon.h
@@ -0,0 +1,884 @@
1/* -*- mode: c; c-basic-offset: 8; -*-
2 * vim: noexpandtab sw=8 ts=8 sts=0:
3 *
4 * dlmcommon.h
5 *
6 * Copyright (C) 2004 Oracle. All rights reserved.
7 *
8 * This program is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU General Public
10 * License as published by the Free Software Foundation; either
11 * version 2 of the License, or (at your option) any later version.
12 *
13 * This program is distributed in the hope that it will be useful,
14 * but WITHOUT ANY WARRANTY; without even the implied warranty of
15 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16 * General Public License for more details.
17 *
18 * You should have received a copy of the GNU General Public
19 * License along with this program; if not, write to the
20 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
21 * Boston, MA 02111-1307, USA.
22 *
23 */
24
25#ifndef DLMCOMMON_H
26#define DLMCOMMON_H
27
28#include <linux/kref.h>
29
30#define DLM_HB_NODE_DOWN_PRI (0xf000000)
31#define DLM_HB_NODE_UP_PRI (0x8000000)
32
33#define DLM_LOCKID_NAME_MAX 32
34
35#define DLM_DOMAIN_NAME_MAX_LEN 255
36#define DLM_LOCK_RES_OWNER_UNKNOWN O2NM_MAX_NODES
37#define DLM_THREAD_SHUFFLE_INTERVAL 5 // flush everything every 5 passes
38#define DLM_THREAD_MS 200 // flush at least every 200 ms
39
40#define DLM_HASH_BITS 7
41#define DLM_HASH_SIZE (1 << DLM_HASH_BITS)
42#define DLM_HASH_MASK (DLM_HASH_SIZE - 1)
43
44enum dlm_ast_type {
45 DLM_AST = 0,
46 DLM_BAST,
47 DLM_ASTUNLOCK
48};
49
50
51#define LKM_VALID_FLAGS (LKM_VALBLK | LKM_CONVERT | LKM_UNLOCK | \
52 LKM_CANCEL | LKM_INVVALBLK | LKM_FORCE | \
53 LKM_RECOVERY | LKM_LOCAL | LKM_NOQUEUE)
54
#define DLM_RECOVERY_LOCK_NAME       "$RECOVERY"
#define DLM_RECOVERY_LOCK_NAME_LEN   9

/*
 * Tell whether a lock name is the special recovery lock name.
 *
 * @lock_name: name bytes (need not be NUL-terminated)
 * @name_len:  number of bytes in @lock_name
 *
 * Returns 1 when @name_len equals DLM_RECOVERY_LOCK_NAME_LEN and the bytes
 * match DLM_RECOVERY_LOCK_NAME exactly, 0 otherwise.
 */
static inline int dlm_is_recovery_lock(const char *lock_name, int name_len)
{
	/* a different length can never match; skip the byte comparison */
	if (name_len != DLM_RECOVERY_LOCK_NAME_LEN)
		return 0;

	return memcmp(lock_name, DLM_RECOVERY_LOCK_NAME, name_len) == 0;
}
65
66#define DLM_RECO_STATE_ACTIVE 0x0001
67
68struct dlm_recovery_ctxt
69{
70 struct list_head resources;
71 struct list_head received;
72 struct list_head node_data;
73 u8 new_master;
74 u8 dead_node;
75 u16 state;
76 unsigned long node_map[BITS_TO_LONGS(O2NM_MAX_NODES)];
77 wait_queue_head_t event;
78};
79
80enum dlm_ctxt_state {
81 DLM_CTXT_NEW = 0,
82 DLM_CTXT_JOINED,
83 DLM_CTXT_IN_SHUTDOWN,
84 DLM_CTXT_LEAVING,
85};
86
87struct dlm_ctxt
88{
89 struct list_head list;
90 struct list_head *resources;
91 struct list_head dirty_list;
92 struct list_head purge_list;
93 struct list_head pending_asts;
94 struct list_head pending_basts;
95 unsigned int purge_count;
96 spinlock_t spinlock;
97 spinlock_t ast_lock;
98 char *name;
99 u8 node_num;
100 u32 key;
101 u8 joining_node;
102 wait_queue_head_t dlm_join_events;
103 unsigned long live_nodes_map[BITS_TO_LONGS(O2NM_MAX_NODES)];
104 unsigned long domain_map[BITS_TO_LONGS(O2NM_MAX_NODES)];
105 unsigned long recovery_map[BITS_TO_LONGS(O2NM_MAX_NODES)];
106 struct dlm_recovery_ctxt reco;
107 spinlock_t master_lock;
108 struct list_head master_list;
109 struct list_head mle_hb_events;
110
111 /* these give a really vague idea of the system load */
112 atomic_t local_resources;
113 atomic_t remote_resources;
114 atomic_t unknown_resources;
115
116 /* NOTE: Next three are protected by dlm_domain_lock */
117 struct kref dlm_refs;
118 enum dlm_ctxt_state dlm_state;
119 unsigned int num_joins;
120
121 struct o2hb_callback_func dlm_hb_up;
122 struct o2hb_callback_func dlm_hb_down;
123 struct task_struct *dlm_thread_task;
124 struct task_struct *dlm_reco_thread_task;
125 wait_queue_head_t dlm_thread_wq;
126 wait_queue_head_t dlm_reco_thread_wq;
127 wait_queue_head_t ast_wq;
128 wait_queue_head_t migration_wq;
129
130 struct work_struct dispatched_work;
131 struct list_head work_list;
132 spinlock_t work_lock;
133 struct list_head dlm_domain_handlers;
134 struct list_head dlm_eviction_callbacks;
135};
136
137/* these keventd work queue items are for less-frequently
138 * called functions that cannot be directly called from the
139 * net message handlers for some reason, usually because
140 * they need to send net messages of their own. */
141void dlm_dispatch_work(void *data);
142
143struct dlm_lock_resource;
144struct dlm_work_item;
145
146typedef void (dlm_workfunc_t)(struct dlm_work_item *, void *);
147
148struct dlm_request_all_locks_priv
149{
150 u8 reco_master;
151 u8 dead_node;
152};
153
154struct dlm_mig_lockres_priv
155{
156 struct dlm_lock_resource *lockres;
157 u8 real_master;
158};
159
160struct dlm_assert_master_priv
161{
162 struct dlm_lock_resource *lockres;
163 u8 request_from;
164 u32 flags;
165 unsigned ignore_higher:1;
166};
167
168
169struct dlm_work_item
170{
171 struct list_head list;
172 dlm_workfunc_t *func;
173 struct dlm_ctxt *dlm;
174 void *data;
175 union {
176 struct dlm_request_all_locks_priv ral;
177 struct dlm_mig_lockres_priv ml;
178 struct dlm_assert_master_priv am;
179 } u;
180};
181
182static inline void dlm_init_work_item(struct dlm_ctxt *dlm,
183 struct dlm_work_item *i,
184 dlm_workfunc_t *f, void *data)
185{
186 memset(i, 0, sizeof(*i));
187 i->func = f;
188 INIT_LIST_HEAD(&i->list);
189 i->data = data;
190 i->dlm = dlm; /* must have already done a dlm_grab on this! */
191}
192
193
194
195static inline void __dlm_set_joining_node(struct dlm_ctxt *dlm,
196 u8 node)
197{
198 assert_spin_locked(&dlm->spinlock);
199
200 dlm->joining_node = node;
201 wake_up(&dlm->dlm_join_events);
202}
203
204#define DLM_LOCK_RES_UNINITED 0x00000001
205#define DLM_LOCK_RES_RECOVERING 0x00000002
206#define DLM_LOCK_RES_READY 0x00000004
207#define DLM_LOCK_RES_DIRTY 0x00000008
208#define DLM_LOCK_RES_IN_PROGRESS 0x00000010
209#define DLM_LOCK_RES_MIGRATING 0x00000020
210
211#define DLM_PURGE_INTERVAL_MS (8 * 1000)
212
213struct dlm_lock_resource
214{
215 /* WARNING: Please see the comment in dlm_init_lockres before
216 * adding fields here. */
217 struct list_head list;
218 struct kref refs;
219
220 /* please keep these next 3 in this order
221 * some funcs want to iterate over all lists */
222 struct list_head granted;
223 struct list_head converting;
224 struct list_head blocked;
225
226 struct list_head dirty;
227 struct list_head recovering; // dlm_recovery_ctxt.resources list
228
229 /* unused lock resources have their last_used stamped and are
230 * put on a list for the dlm thread to run. */
231 struct list_head purge;
232 unsigned long last_used;
233
234 unsigned migration_pending:1;
235 atomic_t asts_reserved;
236 spinlock_t spinlock;
237 wait_queue_head_t wq;
238 u8 owner; //node which owns the lock resource, or unknown
239 u16 state;
240 struct qstr lockname;
241 char lvb[DLM_LVB_LEN];
242};
243
244struct dlm_migratable_lock
245{
246 __be64 cookie;
247
248 /* these 3 are just padding for the in-memory structure, but
249 * list and flags are actually used when sent over the wire */
250 __be16 pad1;
251 u8 list; // 0=granted, 1=converting, 2=blocked
252 u8 flags;
253
254 s8 type;
255 s8 convert_type;
256 s8 highest_blocked;
257 u8 node;
258}; // 16 bytes
259
/*
 * In-memory state of a single lock on a lock resource.
 */
260struct dlm_lock
261{
 /* cookie, granted/convert modes, highest blocked mode and owning node */
262 struct dlm_migratable_lock ml;
263
 /* links the lock onto one of the lockres granted/converting/blocked lists */
264 struct list_head list;
 /* links the lock onto dlm->pending_asts while an ast is queued */
265 struct list_head ast_list;
 /* links the lock onto dlm->pending_basts while a bast is queued */
266 struct list_head bast_list;
 /* lock resource this lock is attached to */
267 struct dlm_lock_resource *lockres;
 /* held while the ast/bast list membership and pending bits are changed */
268 spinlock_t spinlock;
 /* NOTE(review): presumably the count behind dlm_lock_get()/dlm_lock_put() - confirm */
269 struct kref lock_refs;
270
271 // ast and bast must be callable while holding a spinlock!
 /* called as ast(astdata) by dlm_do_local_ast() */
272 dlm_astlockfunc_t *ast;
 /* called as bast(astdata, blocked_type) by dlm_do_local_bast() */
273 dlm_bastlockfunc_t *bast;
 /* opaque argument handed to both callbacks */
274 void *astdata;
 /* lock status block: status, flags and lvb */
275 struct dlm_lockstatus *lksb;
 /* ast_pending / bast_pending are set when the lock is put on the
  * corresponding pending list */
276 unsigned ast_pending:1,
277 bast_pending:1,
278 convert_pending:1,
279 lock_pending:1,
280 cancel_pending:1,
281 unlock_pending:1,
282 lksb_kernel_allocated:1;
283};
284
285
286#define DLM_LKSB_UNUSED1 0x01
287#define DLM_LKSB_PUT_LVB 0x02
288#define DLM_LKSB_GET_LVB 0x04
289#define DLM_LKSB_UNUSED2 0x08
290#define DLM_LKSB_UNUSED3 0x10
291#define DLM_LKSB_UNUSED4 0x20
292#define DLM_LKSB_UNUSED5 0x40
293#define DLM_LKSB_UNUSED6 0x80
294
295
296enum dlm_lockres_list {
297 DLM_GRANTED_LIST = 0,
298 DLM_CONVERTING_LIST,
299 DLM_BLOCKED_LIST
300};
301
302static inline struct list_head *
303dlm_list_idx_to_ptr(struct dlm_lock_resource *res, enum dlm_lockres_list idx)
304{
305 struct list_head *ret = NULL;
306 if (idx == DLM_GRANTED_LIST)
307 ret = &res->granted;
308 else if (idx == DLM_CONVERTING_LIST)
309 ret = &res->converting;
310 else if (idx == DLM_BLOCKED_LIST)
311 ret = &res->blocked;
312 else
313 BUG();
314 return ret;
315}
316
317
318
319
320struct dlm_node_iter
321{
322 unsigned long node_map[BITS_TO_LONGS(O2NM_MAX_NODES)];
323 int curnode;
324};
325
326
327enum {
328 DLM_MASTER_REQUEST_MSG = 500,
329 DLM_UNUSED_MSG1, /* 501 */
330 DLM_ASSERT_MASTER_MSG, /* 502 */
331 DLM_CREATE_LOCK_MSG, /* 503 */
332 DLM_CONVERT_LOCK_MSG, /* 504 */
333 DLM_PROXY_AST_MSG, /* 505 */
334 DLM_UNLOCK_LOCK_MSG, /* 506 */
335 DLM_UNUSED_MSG2, /* 507 */
336 DLM_MIGRATE_REQUEST_MSG, /* 508 */
337 DLM_MIG_LOCKRES_MSG, /* 509 */
338 DLM_QUERY_JOIN_MSG, /* 510 */
339 DLM_ASSERT_JOINED_MSG, /* 511 */
340 DLM_CANCEL_JOIN_MSG, /* 512 */
341 DLM_EXIT_DOMAIN_MSG, /* 513 */
342 DLM_MASTER_REQUERY_MSG, /* 514 */
343 DLM_LOCK_REQUEST_MSG, /* 515 */
344 DLM_RECO_DATA_DONE_MSG, /* 516 */
345 DLM_BEGIN_RECO_MSG, /* 517 */
346 DLM_FINALIZE_RECO_MSG /* 518 */
347};
348
349struct dlm_reco_node_data
350{
351 int state;
352 u8 node_num;
353 struct list_head list;
354};
355
356enum {
357 DLM_RECO_NODE_DATA_DEAD = -1,
358 DLM_RECO_NODE_DATA_INIT = 0,
359 DLM_RECO_NODE_DATA_REQUESTING,
360 DLM_RECO_NODE_DATA_REQUESTED,
361 DLM_RECO_NODE_DATA_RECEIVING,
362 DLM_RECO_NODE_DATA_DONE,
363 DLM_RECO_NODE_DATA_FINALIZE_SENT,
364};
365
366
367enum {
368 DLM_MASTER_RESP_NO = 0,
369 DLM_MASTER_RESP_YES,
370 DLM_MASTER_RESP_MAYBE,
371 DLM_MASTER_RESP_ERROR
372};
373
374
375struct dlm_master_request
376{
377 u8 node_idx;
378 u8 namelen;
379 __be16 pad1;
380 __be32 flags;
381
382 u8 name[O2NM_MAX_NAME_LEN];
383};
384
385#define DLM_ASSERT_MASTER_MLE_CLEANUP 0x00000001
386#define DLM_ASSERT_MASTER_REQUERY 0x00000002
387#define DLM_ASSERT_MASTER_FINISH_MIGRATION 0x00000004
388struct dlm_assert_master
389{
390 u8 node_idx;
391 u8 namelen;
392 __be16 pad1;
393 __be32 flags;
394
395 u8 name[O2NM_MAX_NAME_LEN];
396};
397
398struct dlm_migrate_request
399{
400 u8 master;
401 u8 new_master;
402 u8 namelen;
403 u8 pad1;
404 __be32 pad2;
405 u8 name[O2NM_MAX_NAME_LEN];
406};
407
408struct dlm_master_requery
409{
410 u8 pad1;
411 u8 pad2;
412 u8 node_idx;
413 u8 namelen;
414 __be32 pad3;
415 u8 name[O2NM_MAX_NAME_LEN];
416};
417
418#define DLM_MRES_RECOVERY 0x01
419#define DLM_MRES_MIGRATION 0x02
420#define DLM_MRES_ALL_DONE 0x04
421
422/*
423 * We would like to get one whole lockres into a single network
424 * message whenever possible. Generally speaking, there will be
425 * at most one dlm_lock on a lockres for each node in the cluster,
426 * plus (infrequently) any additional locks coming in from userdlm.
427 *
428 * struct _dlm_lockres_page
429 * {
430 * dlm_migratable_lockres mres;
431 * dlm_migratable_lock ml[DLM_MAX_MIGRATABLE_LOCKS];
432 * u8 pad[DLM_MIG_LOCKRES_RESERVED];
433 * };
434 *
435 * from ../cluster/tcp.h
436 * NET_MAX_PAYLOAD_BYTES (4096 - sizeof(net_msg))
437 * (roughly 4080 bytes)
438 * and sizeof(dlm_migratable_lockres) = 112 bytes
439 * and sizeof(dlm_migratable_lock) = 16 bytes
440 *
441 * Choosing DLM_MAX_MIGRATABLE_LOCKS=240 and
442 * DLM_MIG_LOCKRES_RESERVED=128 means we have this:
443 *
444 * (DLM_MAX_MIGRATABLE_LOCKS * sizeof(dlm_migratable_lock)) +
445 * sizeof(dlm_migratable_lockres) + DLM_MIG_LOCKRES_RESERVED =
446 * NET_MAX_PAYLOAD_BYTES
447 * (240 * 16) + 112 + 128 = 4080
448 *
449 * So a lockres would need more than 240 locks before it would
450 * use more than one network packet to recover. Not too bad.
451 */
452#define DLM_MAX_MIGRATABLE_LOCKS 240
453
454struct dlm_migratable_lockres
455{
456 u8 master;
457 u8 lockname_len;
458 u8 num_locks; // locks sent in this structure
459 u8 flags;
460 __be32 total_locks; // locks to be sent for this migration cookie
461 __be64 mig_cookie; // cookie for this lockres migration
462 // or zero if not needed
463 // 16 bytes
464 u8 lockname[DLM_LOCKID_NAME_MAX];
465 // 48 bytes
466 u8 lvb[DLM_LVB_LEN];
467 // 112 bytes
468 struct dlm_migratable_lock ml[0]; // 16 bytes each, begins at byte 112
469};
470#define DLM_MIG_LOCKRES_MAX_LEN \
471 (sizeof(struct dlm_migratable_lockres) + \
472 (sizeof(struct dlm_migratable_lock) * \
473 DLM_MAX_MIGRATABLE_LOCKS) )
474
475/* from above, 128 bytes
476 * for some undetermined future use */
477#define DLM_MIG_LOCKRES_RESERVED (NET_MAX_PAYLOAD_BYTES - \
478 DLM_MIG_LOCKRES_MAX_LEN)
479
480struct dlm_create_lock
481{
482 __be64 cookie;
483
484 __be32 flags;
485 u8 pad1;
486 u8 node_idx;
487 s8 requested_type;
488 u8 namelen;
489
490 u8 name[O2NM_MAX_NAME_LEN];
491};
492
493struct dlm_convert_lock
494{
495 __be64 cookie;
496
497 __be32 flags;
498 u8 pad1;
499 u8 node_idx;
500 s8 requested_type;
501 u8 namelen;
502
503 u8 name[O2NM_MAX_NAME_LEN];
504
505 s8 lvb[0];
506};
507#define DLM_CONVERT_LOCK_MAX_LEN (sizeof(struct dlm_convert_lock)+DLM_LVB_LEN)
508
509struct dlm_unlock_lock
510{
511 __be64 cookie;
512
513 __be32 flags;
514 __be16 pad1;
515 u8 node_idx;
516 u8 namelen;
517
518 u8 name[O2NM_MAX_NAME_LEN];
519
520 s8 lvb[0];
521};
522#define DLM_UNLOCK_LOCK_MAX_LEN (sizeof(struct dlm_unlock_lock)+DLM_LVB_LEN)
523
/*
 * Message body sent by dlm_send_proxy_ast_msg() and consumed by
 * dlm_proxy_ast_handler() to deliver an ast or bast on another node.
 */
524struct dlm_proxy_ast
525{
 /* big-endian cookie of the target lock; matched against lock->ml.cookie */
526 __be64 cookie;
527
 /* big-endian flags; the sender sets LKM_GET_LVB when lvb data follows */
528 __be32 flags;
 /* node number of the sender */
529 u8 node_idx;
 /* DLM_AST or DLM_BAST; the handler rejects anything else */
530 u8 type;
 /* blocked mode, passed to the bast callback by the handler */
531 u8 blocked_type;
 /* bytes used in name[]; the handler rejects values above DLM_LOCKID_NAME_MAX */
532 u8 namelen;
533
 /* lock resource name, namelen bytes copied in by the sender */
534 u8 name[O2NM_MAX_NAME_LEN];
535
 /* DLM_LVB_LEN bytes, sent as a second segment only with LKM_GET_LVB */
536 s8 lvb[0];
537};
538#define DLM_PROXY_AST_MAX_LEN (sizeof(struct dlm_proxy_ast)+DLM_LVB_LEN)
539
540#define DLM_MOD_KEY (0x666c6172)
541enum dlm_query_join_response {
542 JOIN_DISALLOW = 0,
543 JOIN_OK,
544 JOIN_OK_NO_MAP,
545};
546
547struct dlm_lock_request
548{
549 u8 node_idx;
550 u8 dead_node;
551 __be16 pad1;
552 __be32 pad2;
553};
554
555struct dlm_reco_data_done
556{
557 u8 node_idx;
558 u8 dead_node;
559 __be16 pad1;
560 __be32 pad2;
561
562 /* unused for now */
563 /* eventually we can use this to attempt
564 * lvb recovery based on each node's info */
565 u8 reco_lvb[DLM_LVB_LEN];
566};
567
568struct dlm_begin_reco
569{
570 u8 node_idx;
571 u8 dead_node;
572 __be16 pad1;
573 __be32 pad2;
574};
575
576
577struct dlm_query_join_request
578{
579 u8 node_idx;
580 u8 pad1[2];
581 u8 name_len;
582 u8 domain[O2NM_MAX_NAME_LEN];
583};
584
585struct dlm_assert_joined
586{
587 u8 node_idx;
588 u8 pad1[2];
589 u8 name_len;
590 u8 domain[O2NM_MAX_NAME_LEN];
591};
592
593struct dlm_cancel_join
594{
595 u8 node_idx;
596 u8 pad1[2];
597 u8 name_len;
598 u8 domain[O2NM_MAX_NAME_LEN];
599};
600
601struct dlm_exit_domain
602{
603 u8 node_idx;
604 u8 pad1[3];
605};
606
607struct dlm_finalize_reco
608{
609 u8 node_idx;
610 u8 dead_node;
611 __be16 pad1;
612 __be32 pad2;
613};
614
615static inline enum dlm_status
616__dlm_lockres_state_to_status(struct dlm_lock_resource *res)
617{
618 enum dlm_status status = DLM_NORMAL;
619
620 assert_spin_locked(&res->spinlock);
621
622 if (res->state & DLM_LOCK_RES_RECOVERING)
623 status = DLM_RECOVERING;
624 else if (res->state & DLM_LOCK_RES_MIGRATING)
625 status = DLM_MIGRATING;
626 else if (res->state & DLM_LOCK_RES_IN_PROGRESS)
627 status = DLM_FORWARD;
628
629 return status;
630}
631
632struct dlm_lock * dlm_new_lock(int type, u8 node, u64 cookie,
633 struct dlm_lockstatus *lksb);
634void dlm_lock_get(struct dlm_lock *lock);
635void dlm_lock_put(struct dlm_lock *lock);
636
637void dlm_lock_attach_lockres(struct dlm_lock *lock,
638 struct dlm_lock_resource *res);
639
640int dlm_create_lock_handler(struct o2net_msg *msg, u32 len, void *data);
641int dlm_convert_lock_handler(struct o2net_msg *msg, u32 len, void *data);
642int dlm_proxy_ast_handler(struct o2net_msg *msg, u32 len, void *data);
643
644void dlm_revert_pending_convert(struct dlm_lock_resource *res,
645 struct dlm_lock *lock);
646void dlm_revert_pending_lock(struct dlm_lock_resource *res,
647 struct dlm_lock *lock);
648
649int dlm_unlock_lock_handler(struct o2net_msg *msg, u32 len, void *data);
650void dlm_commit_pending_cancel(struct dlm_lock_resource *res,
651 struct dlm_lock *lock);
652void dlm_commit_pending_unlock(struct dlm_lock_resource *res,
653 struct dlm_lock *lock);
654
655int dlm_launch_thread(struct dlm_ctxt *dlm);
656void dlm_complete_thread(struct dlm_ctxt *dlm);
657int dlm_launch_recovery_thread(struct dlm_ctxt *dlm);
658void dlm_complete_recovery_thread(struct dlm_ctxt *dlm);
659void dlm_wait_for_recovery(struct dlm_ctxt *dlm);
660
661void dlm_put(struct dlm_ctxt *dlm);
662struct dlm_ctxt *dlm_grab(struct dlm_ctxt *dlm);
663int dlm_domain_fully_joined(struct dlm_ctxt *dlm);
664
665void __dlm_lockres_calc_usage(struct dlm_ctxt *dlm,
666 struct dlm_lock_resource *res);
667void dlm_lockres_calc_usage(struct dlm_ctxt *dlm,
668 struct dlm_lock_resource *res);
669void dlm_purge_lockres(struct dlm_ctxt *dlm,
670 struct dlm_lock_resource *lockres);
671void dlm_lockres_get(struct dlm_lock_resource *res);
672void dlm_lockres_put(struct dlm_lock_resource *res);
673void __dlm_unhash_lockres(struct dlm_lock_resource *res);
674void __dlm_insert_lockres(struct dlm_ctxt *dlm,
675 struct dlm_lock_resource *res);
676struct dlm_lock_resource * __dlm_lookup_lockres(struct dlm_ctxt *dlm,
677 const char *name,
678 unsigned int len);
679struct dlm_lock_resource * dlm_lookup_lockres(struct dlm_ctxt *dlm,
680 const char *name,
681 unsigned int len);
682
683int dlm_is_host_down(int errno);
684void dlm_change_lockres_owner(struct dlm_ctxt *dlm,
685 struct dlm_lock_resource *res,
686 u8 owner);
687struct dlm_lock_resource * dlm_get_lock_resource(struct dlm_ctxt *dlm,
688 const char *lockid,
689 int flags);
690struct dlm_lock_resource *dlm_new_lockres(struct dlm_ctxt *dlm,
691 const char *name,
692 unsigned int namelen);
693
694void dlm_queue_ast(struct dlm_ctxt *dlm, struct dlm_lock *lock);
695void dlm_queue_bast(struct dlm_ctxt *dlm, struct dlm_lock *lock);
696void dlm_do_local_ast(struct dlm_ctxt *dlm,
697 struct dlm_lock_resource *res,
698 struct dlm_lock *lock);
699int dlm_do_remote_ast(struct dlm_ctxt *dlm,
700 struct dlm_lock_resource *res,
701 struct dlm_lock *lock);
702void dlm_do_local_bast(struct dlm_ctxt *dlm,
703 struct dlm_lock_resource *res,
704 struct dlm_lock *lock,
705 int blocked_type);
706int dlm_send_proxy_ast_msg(struct dlm_ctxt *dlm,
707 struct dlm_lock_resource *res,
708 struct dlm_lock *lock,
709 int msg_type,
710 int blocked_type, int flags);
711static inline int dlm_send_proxy_bast(struct dlm_ctxt *dlm,
712 struct dlm_lock_resource *res,
713 struct dlm_lock *lock,
714 int blocked_type)
715{
716 return dlm_send_proxy_ast_msg(dlm, res, lock, DLM_BAST,
717 blocked_type, 0);
718}
719
720static inline int dlm_send_proxy_ast(struct dlm_ctxt *dlm,
721 struct dlm_lock_resource *res,
722 struct dlm_lock *lock,
723 int flags)
724{
725 return dlm_send_proxy_ast_msg(dlm, res, lock, DLM_AST,
726 0, flags);
727}
728
729void dlm_print_one_lock_resource(struct dlm_lock_resource *res);
730void __dlm_print_one_lock_resource(struct dlm_lock_resource *res);
731
732u8 dlm_nm_this_node(struct dlm_ctxt *dlm);
733void dlm_kick_thread(struct dlm_ctxt *dlm, struct dlm_lock_resource *res);
734void __dlm_dirty_lockres(struct dlm_ctxt *dlm, struct dlm_lock_resource *res);
735
736
737int dlm_nm_init(struct dlm_ctxt *dlm);
738int dlm_heartbeat_init(struct dlm_ctxt *dlm);
739void dlm_hb_node_down_cb(struct o2nm_node *node, int idx, void *data);
740void dlm_hb_node_up_cb(struct o2nm_node *node, int idx, void *data);
741
742int dlm_lockres_is_dirty(struct dlm_ctxt *dlm, struct dlm_lock_resource *res);
743int dlm_migrate_lockres(struct dlm_ctxt *dlm,
744 struct dlm_lock_resource *res,
745 u8 target);
746int dlm_finish_migration(struct dlm_ctxt *dlm,
747 struct dlm_lock_resource *res,
748 u8 old_master);
749void dlm_lockres_release_ast(struct dlm_ctxt *dlm,
750 struct dlm_lock_resource *res);
751void __dlm_lockres_reserve_ast(struct dlm_lock_resource *res);
752
753int dlm_master_request_handler(struct o2net_msg *msg, u32 len, void *data);
754int dlm_assert_master_handler(struct o2net_msg *msg, u32 len, void *data);
755int dlm_migrate_request_handler(struct o2net_msg *msg, u32 len, void *data);
756int dlm_mig_lockres_handler(struct o2net_msg *msg, u32 len, void *data);
757int dlm_master_requery_handler(struct o2net_msg *msg, u32 len, void *data);
758int dlm_request_all_locks_handler(struct o2net_msg *msg, u32 len, void *data);
759int dlm_reco_data_done_handler(struct o2net_msg *msg, u32 len, void *data);
760int dlm_begin_reco_handler(struct o2net_msg *msg, u32 len, void *data);
761int dlm_finalize_reco_handler(struct o2net_msg *msg, u32 len, void *data);
762
763int dlm_dispatch_assert_master(struct dlm_ctxt *dlm,
764 struct dlm_lock_resource *res,
765 int ignore_higher,
766 u8 request_from,
767 u32 flags);
768
769
770int dlm_send_one_lockres(struct dlm_ctxt *dlm,
771 struct dlm_lock_resource *res,
772 struct dlm_migratable_lockres *mres,
773 u8 send_to,
774 u8 flags);
775void dlm_move_lockres_to_recovery_list(struct dlm_ctxt *dlm,
776 struct dlm_lock_resource *res);
777
778/* will exit holding res->spinlock, but may drop in function */
779void __dlm_wait_on_lockres_flags(struct dlm_lock_resource *res, int flags);
780void __dlm_wait_on_lockres_flags_set(struct dlm_lock_resource *res, int flags);
781
782/* will exit holding res->spinlock, but may drop in function */
783static inline void __dlm_wait_on_lockres(struct dlm_lock_resource *res)
784{
785 __dlm_wait_on_lockres_flags(res, (DLM_LOCK_RES_IN_PROGRESS|
786 DLM_LOCK_RES_RECOVERING|
787 DLM_LOCK_RES_MIGRATING));
788}
789
790
791int dlm_init_mle_cache(void);
792void dlm_destroy_mle_cache(void);
793void dlm_hb_event_notify_attached(struct dlm_ctxt *dlm, int idx, int node_up);
794void dlm_clean_master_list(struct dlm_ctxt *dlm,
795 u8 dead_node);
796int dlm_lock_basts_flushed(struct dlm_ctxt *dlm, struct dlm_lock *lock);
797
798
799static inline const char * dlm_lock_mode_name(int mode)
800{
801 switch (mode) {
802 case LKM_EXMODE:
803 return "EX";
804 case LKM_PRMODE:
805 return "PR";
806 case LKM_NLMODE:
807 return "NL";
808 }
809 return "UNKNOWN";
810}
811
812
813static inline int dlm_lock_compatible(int existing, int request)
814{
815 /* NO_LOCK compatible with all */
816 if (request == LKM_NLMODE ||
817 existing == LKM_NLMODE)
818 return 1;
819
820 /* EX incompatible with all non-NO_LOCK */
821 if (request == LKM_EXMODE)
822 return 0;
823
824 /* request must be PR, which is compatible with PR */
825 if (existing == LKM_PRMODE)
826 return 1;
827
828 return 0;
829}
830
831static inline int dlm_lock_on_list(struct list_head *head,
832 struct dlm_lock *lock)
833{
834 struct list_head *iter;
835 struct dlm_lock *tmplock;
836
837 list_for_each(iter, head) {
838 tmplock = list_entry(iter, struct dlm_lock, list);
839 if (tmplock == lock)
840 return 1;
841 }
842 return 0;
843}
844
845
846static inline enum dlm_status dlm_err_to_dlm_status(int err)
847{
848 enum dlm_status ret;
849 if (err == -ENOMEM)
850 ret = DLM_SYSERR;
851 else if (err == -ETIMEDOUT || o2net_link_down(err, NULL))
852 ret = DLM_NOLOCKMGR;
853 else if (err == -EINVAL)
854 ret = DLM_BADPARAM;
855 else if (err == -ENAMETOOLONG)
856 ret = DLM_IVBUFLEN;
857 else
858 ret = DLM_BADARGS;
859 return ret;
860}
861
862
863static inline void dlm_node_iter_init(unsigned long *map,
864 struct dlm_node_iter *iter)
865{
866 memcpy(iter->node_map, map, sizeof(iter->node_map));
867 iter->curnode = -1;
868}
869
870static inline int dlm_node_iter_next(struct dlm_node_iter *iter)
871{
872 int bit;
873 bit = find_next_bit(iter->node_map, O2NM_MAX_NODES, iter->curnode+1);
874 if (bit >= O2NM_MAX_NODES) {
875 iter->curnode = O2NM_MAX_NODES;
876 return -ENOENT;
877 }
878 iter->curnode = bit;
879 return bit;
880}
881
882
883
884#endif /* DLMCOMMON_H */
diff --git a/fs/ocfs2/dlm/dlmconvert.c b/fs/ocfs2/dlm/dlmconvert.c
new file mode 100644
index 000000000000..6001b22a997d
--- /dev/null
+++ b/fs/ocfs2/dlm/dlmconvert.c
@@ -0,0 +1,530 @@
1/* -*- mode: c; c-basic-offset: 8; -*-
2 * vim: noexpandtab sw=8 ts=8 sts=0:
3 *
4 * dlmconvert.c
5 *
6 * underlying calls for lock conversion
7 *
8 * Copyright (C) 2004 Oracle. All rights reserved.
9 *
10 * This program is free software; you can redistribute it and/or
11 * modify it under the terms of the GNU General Public
12 * License as published by the Free Software Foundation; either
13 * version 2 of the License, or (at your option) any later version.
14 *
15 * This program is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
18 * General Public License for more details.
19 *
20 * You should have received a copy of the GNU General Public
21 * License along with this program; if not, write to the
22 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
23 * Boston, MA 021110-1307, USA.
24 *
25 */
26
27
28#include <linux/module.h>
29#include <linux/fs.h>
30#include <linux/types.h>
31#include <linux/slab.h>
32#include <linux/highmem.h>
33#include <linux/utsname.h>
34#include <linux/init.h>
35#include <linux/sysctl.h>
36#include <linux/random.h>
37#include <linux/blkdev.h>
38#include <linux/socket.h>
39#include <linux/inet.h>
40#include <linux/spinlock.h>
41
42
43#include "cluster/heartbeat.h"
44#include "cluster/nodemanager.h"
45#include "cluster/tcp.h"
46
47#include "dlmapi.h"
48#include "dlmcommon.h"
49
50#include "dlmconvert.h"
51
52#define MLOG_MASK_PREFIX ML_DLM
53#include "cluster/masklog.h"
54
55/* NOTE: __dlmconvert_master is the only function in here that
56 * needs a spinlock held on entry (res->spinlock) and it is the
57 * only one that holds a lock on exit (res->spinlock).
58 * All other functions in here need no locks and drop all of
59 * the locks that they acquire. */
60static enum dlm_status __dlmconvert_master(struct dlm_ctxt *dlm,
61 struct dlm_lock_resource *res,
62 struct dlm_lock *lock, int flags,
63 int type, int *call_ast,
64 int *kick_thread);
65static enum dlm_status dlm_send_remote_convert_request(struct dlm_ctxt *dlm,
66 struct dlm_lock_resource *res,
67 struct dlm_lock *lock, int flags, int type);
68
69/*
70 * this is only called directly by dlmlock(), and only when the
71 * local node is the owner of the lockres
72 * locking:
73 * caller needs: none
74 * taken: takes and drops res->spinlock
75 * held on exit: none
76 * returns: see __dlmconvert_master
77 */
78enum dlm_status dlmconvert_master(struct dlm_ctxt *dlm,
79 struct dlm_lock_resource *res,
80 struct dlm_lock *lock, int flags, int type)
81{
82 int call_ast = 0, kick_thread = 0;
83 enum dlm_status status;
84
85 spin_lock(&res->spinlock);
86 /* we are not in a network handler, this is fine */
87 __dlm_wait_on_lockres(res);
88 __dlm_lockres_reserve_ast(res);
89 res->state |= DLM_LOCK_RES_IN_PROGRESS;
90
91 status = __dlmconvert_master(dlm, res, lock, flags, type,
92 &call_ast, &kick_thread);
93
94 res->state &= ~DLM_LOCK_RES_IN_PROGRESS;
95 spin_unlock(&res->spinlock);
96 wake_up(&res->wq);
97 if (status != DLM_NORMAL && status != DLM_NOTQUEUED)
98 dlm_error(status);
99
100 /* either queue the ast or release it */
101 if (call_ast)
102 dlm_queue_ast(dlm, lock);
103 else
104 dlm_lockres_release_ast(dlm, res);
105
106 if (kick_thread)
107 dlm_kick_thread(dlm, res);
108
109 return status;
110}
111
/* performs lock conversion at the lockres master site
 *
 * A request for a mode <= the current mode is granted in place.  An
 * upconvert is granted immediately only if the new mode is compatible
 * with every other lock on res->granted and with both the current and
 * the requested mode of every lock on res->converting; otherwise the
 * lock is moved to the tail of res->converting (or DLM_NOTQUEUED is
 * returned when LKM_NOQUEUE is set).
 *
 * locking:
 *   caller needs:  res->spinlock
 *   taken:         takes and drops lock->spinlock
 *   held on exit:  res->spinlock
 * returns: DLM_NORMAL, DLM_NOTQUEUED, DLM_DENIED
 *   call_ast: set to 1 only when the new mode was granted immediately
 *   kick_thread: set to 1 whenever DLM_NORMAL is returned (granted or
 *                queued); the caller should then call dlm_kick_thread
 */
static enum dlm_status __dlmconvert_master(struct dlm_ctxt *dlm,
					   struct dlm_lock_resource *res,
					   struct dlm_lock *lock, int flags,
					   int type, int *call_ast,
					   int *kick_thread)
{
	enum dlm_status status = DLM_NORMAL;
	struct list_head *iter;
	struct dlm_lock *tmplock=NULL;

	assert_spin_locked(&res->spinlock);

	mlog_entry("type=%d, convert_type=%d, new convert_type=%d\n",
		   lock->ml.type, lock->ml.convert_type, type);

	spin_lock(&lock->spinlock);

	/* already converting? */
	if (lock->ml.convert_type != LKM_IVMODE) {
		mlog(ML_ERROR, "attempted to convert a lock with a lock "
		     "conversion pending\n");
		status = DLM_DENIED;
		goto unlock_exit;
	}

	/* must be on grant queue to convert */
	if (!dlm_lock_on_list(&res->granted, lock)) {
		mlog(ML_ERROR, "attempted to convert a lock not on grant "
		     "queue\n");
		status = DLM_DENIED;
		goto unlock_exit;
	}

	/* decide the lvb direction from the mode currently held */
	if (flags & LKM_VALBLK) {
		switch (lock->ml.type) {
			case LKM_EXMODE:
				/* EX + LKM_VALBLK + convert == set lvb */
				mlog(0, "will set lvb: converting %s->%s\n",
				     dlm_lock_mode_name(lock->ml.type),
				     dlm_lock_mode_name(type));
				lock->lksb->flags |= DLM_LKSB_PUT_LVB;
				break;
			case LKM_PRMODE:
			case LKM_NLMODE:
				/* refetch if new level is not NL */
				if (type > LKM_NLMODE) {
					mlog(0, "will fetch new value into "
					     "lvb: converting %s->%s\n",
					     dlm_lock_mode_name(lock->ml.type),
					     dlm_lock_mode_name(type));
					lock->lksb->flags |= DLM_LKSB_GET_LVB;
				} else {
					mlog(0, "will NOT fetch new value "
					     "into lvb: converting %s->%s\n",
					     dlm_lock_mode_name(lock->ml.type),
					     dlm_lock_mode_name(type));
					/* only the local copy of flags changes */
					flags &= ~(LKM_VALBLK);
				}
				break;
		}
	}


	/* in-place downconvert? */
	if (type <= lock->ml.type)
		goto grant;

	/* upconvert from here on */
	status = DLM_NORMAL;
	/* every other granted lock must tolerate the requested mode */
	list_for_each(iter, &res->granted) {
		tmplock = list_entry(iter, struct dlm_lock, list);
		if (tmplock == lock)
			continue;
		if (!dlm_lock_compatible(tmplock->ml.type, type))
			goto switch_queues;
	}

	list_for_each(iter, &res->converting) {
		tmplock = list_entry(iter, struct dlm_lock, list);
		if (!dlm_lock_compatible(tmplock->ml.type, type))
			goto switch_queues;
		/* existing conversion requests take precedence */
		if (!dlm_lock_compatible(tmplock->ml.convert_type, type))
			goto switch_queues;
	}

	/* fall thru to grant */

grant:
	mlog(0, "res %.*s, granting %s lock\n", res->lockname.len,
	     res->lockname.name, dlm_lock_mode_name(type));
	/* immediately grant the new lock type */
	lock->lksb->status = DLM_NORMAL;
	/* NOTE(review): the test matches the local node, yet the message
	 * says "nonlocal" - confirm which of the two is intended */
	if (lock->ml.node == dlm->node_num)
		mlog(0, "doing in-place convert for nonlocal lock\n");
	lock->ml.type = type;
	status = DLM_NORMAL;
	*call_ast = 1;
	goto unlock_exit;

switch_queues:
	if (flags & LKM_NOQUEUE) {
		mlog(0, "failed to convert NOQUEUE lock %.*s from "
		     "%d to %d...\n", res->lockname.len, res->lockname.name,
		     lock->ml.type, type);
		status = DLM_NOTQUEUED;
		goto unlock_exit;
	}
	mlog(0, "res %.*s, queueing...\n", res->lockname.len,
	     res->lockname.name);

	/* status is still DLM_NORMAL here: the request was accepted and
	 * now waits on the converting queue */
	lock->ml.convert_type = type;
	/* do not alter lock refcount.  switching lists. */
	list_del_init(&lock->list);
	list_add_tail(&lock->list, &res->converting);

unlock_exit:
	spin_unlock(&lock->spinlock);
	/* dump the whole resource to help diagnose a denied convert */
	if (status == DLM_DENIED) {
		__dlm_print_one_lock_resource(res);
	}
	if (status == DLM_NORMAL)
		*kick_thread = 1;
	return status;
}
246
247void dlm_revert_pending_convert(struct dlm_lock_resource *res,
248 struct dlm_lock *lock)
249{
250 /* do not alter lock refcount. switching lists. */
251 list_del_init(&lock->list);
252 list_add_tail(&lock->list, &res->granted);
253 lock->ml.convert_type = LKM_IVMODE;
254 lock->lksb->flags &= ~(DLM_LKSB_GET_LVB|DLM_LKSB_PUT_LVB);
255}
256
/* messages the master site to do lock conversion
 *
 * The lock is moved to the local converting queue and marked
 * convert_pending before the request is sent; if the request comes back
 * with anything other than DLM_NORMAL the move is undone with
 * dlm_revert_pending_convert().
 *
 * locking:
 *   caller needs:  none
 *   taken:         takes and drops res->spinlock, uses DLM_LOCK_RES_IN_PROGRESS
 *   held on exit:  none
 * returns: DLM_NORMAL, DLM_RECOVERING, DLM_DENIED, status from remote node
 */
enum dlm_status dlmconvert_remote(struct dlm_ctxt *dlm,
				  struct dlm_lock_resource *res,
				  struct dlm_lock *lock, int flags, int type)
{
	enum dlm_status status;

	/* NOTE(review): this debug line reads lock and res state before
	 * res->spinlock is taken */
	mlog(0, "type=%d, convert_type=%d, busy=%d\n", lock->ml.type,
	     lock->ml.convert_type, res->state & DLM_LOCK_RES_IN_PROGRESS);

	spin_lock(&res->spinlock);
	if (res->state & DLM_LOCK_RES_RECOVERING) {
		mlog(0, "bailing out early since res is RECOVERING "
		     "on secondary queue\n");
		/* __dlm_print_one_lock_resource(res); */
		status = DLM_RECOVERING;
		goto bail;
	}
	/* will exit this call with spinlock held */
	__dlm_wait_on_lockres(res);

	if (lock->ml.convert_type != LKM_IVMODE) {
		__dlm_print_one_lock_resource(res);
		mlog(ML_ERROR, "converting a remote lock that is already "
		     "converting! (cookie=%"MLFu64", conv=%d)\n",
		     lock->ml.cookie, lock->ml.convert_type);
		status = DLM_DENIED;
		goto bail;
	}
	res->state |= DLM_LOCK_RES_IN_PROGRESS;
	/* move lock to local convert queue */
	/* do not alter lock refcount.  switching lists. */
	list_del_init(&lock->list);
	list_add_tail(&lock->list, &res->converting);
	lock->convert_pending = 1;
	lock->ml.convert_type = type;

	/* translate LKM_VALBLK into an explicit lvb direction for the
	 * message: holding EX means put, converting to NL means no lvb
	 * traffic at all, everything else means get */
	if (flags & LKM_VALBLK) {
		if (lock->ml.type == LKM_EXMODE) {
			flags |= LKM_PUT_LVB;
			lock->lksb->flags |= DLM_LKSB_PUT_LVB;
		} else {
			if (lock->ml.convert_type == LKM_NLMODE)
				flags &= ~LKM_VALBLK;
			else {
				flags |= LKM_GET_LVB;
				lock->lksb->flags |= DLM_LKSB_GET_LVB;
			}
		}
	}
	spin_unlock(&res->spinlock);

	/* no locks held here.
	 * need to wait for a reply as to whether it got queued or not. */
	status = dlm_send_remote_convert_request(dlm, res, lock, flags, type);

	spin_lock(&res->spinlock);
	res->state &= ~DLM_LOCK_RES_IN_PROGRESS;
	lock->convert_pending = 0;
	/* if it failed, move it back to granted queue */
	if (status != DLM_NORMAL) {
		if (status != DLM_NOTQUEUED)
			dlm_error(status);
		dlm_revert_pending_convert(res, lock);
	}
bail:
	spin_unlock(&res->spinlock);

	/* TODO: should this be a wake_one? */
	/* wake up any IN_PROGRESS waiters */
	wake_up(&res->wq);

	return status;
}
337
338/* sends DLM_CONVERT_LOCK_MSG to master site
339 * locking:
340 * caller needs: none
341 * taken: none
342 * held on exit: none
343 * returns: DLM_NOLOCKMGR, status from remote node
344 */
345static enum dlm_status dlm_send_remote_convert_request(struct dlm_ctxt *dlm,
346 struct dlm_lock_resource *res,
347 struct dlm_lock *lock, int flags, int type)
348{
349 struct dlm_convert_lock convert;
350 int tmpret;
351 enum dlm_status ret;
352 int status = 0;
353 struct kvec vec[2];
354 size_t veclen = 1;
355
356 mlog_entry("%.*s\n", res->lockname.len, res->lockname.name);
357
358 memset(&convert, 0, sizeof(struct dlm_convert_lock));
359 convert.node_idx = dlm->node_num;
360 convert.requested_type = type;
361 convert.cookie = lock->ml.cookie;
362 convert.namelen = res->lockname.len;
363 convert.flags = cpu_to_be32(flags);
364 memcpy(convert.name, res->lockname.name, convert.namelen);
365
366 vec[0].iov_len = sizeof(struct dlm_convert_lock);
367 vec[0].iov_base = &convert;
368
369 if (flags & LKM_PUT_LVB) {
370 /* extra data to send if we are updating lvb */
371 vec[1].iov_len = DLM_LVB_LEN;
372 vec[1].iov_base = lock->lksb->lvb;
373 veclen++;
374 }
375
376 tmpret = o2net_send_message_vec(DLM_CONVERT_LOCK_MSG, dlm->key,
377 vec, veclen, res->owner, &status);
378 if (tmpret >= 0) {
379 // successfully sent and received
380 ret = status; // this is already a dlm_status
381 if (ret == DLM_RECOVERING) {
382 mlog(0, "node %u returned DLM_RECOVERING from convert "
383 "message!\n", res->owner);
384 } else if (ret == DLM_MIGRATING) {
385 mlog(0, "node %u returned DLM_MIGRATING from convert "
386 "message!\n", res->owner);
387 } else if (ret == DLM_FORWARD) {
388 mlog(0, "node %u returned DLM_FORWARD from convert "
389 "message!\n", res->owner);
390 } else if (ret != DLM_NORMAL && ret != DLM_NOTQUEUED)
391 dlm_error(ret);
392 } else {
393 mlog_errno(tmpret);
394 if (dlm_is_host_down(tmpret)) {
395 ret = DLM_RECOVERING;
396 mlog(0, "node %u died so returning DLM_RECOVERING "
397 "from convert message!\n", res->owner);
398 } else {
399 ret = dlm_err_to_dlm_status(tmpret);
400 }
401 }
402
403 return ret;
404}
405
406/* handler for DLM_CONVERT_LOCK_MSG on master site
407 * locking:
408 * caller needs: none
409 * taken: takes and drop res->spinlock
410 * held on exit: none
411 * returns: DLM_NORMAL, DLM_IVLOCKID, DLM_BADARGS,
412 * status from __dlmconvert_master
413 */
414int dlm_convert_lock_handler(struct o2net_msg *msg, u32 len, void *data)
415{
416 struct dlm_ctxt *dlm = data;
417 struct dlm_convert_lock *cnv = (struct dlm_convert_lock *)msg->buf;
418 struct dlm_lock_resource *res = NULL;
419 struct list_head *iter;
420 struct dlm_lock *lock = NULL;
421 struct dlm_lockstatus *lksb;
422 enum dlm_status status = DLM_NORMAL;
423 u32 flags;
424 int call_ast = 0, kick_thread = 0;
425
426 if (!dlm_grab(dlm)) {
427 dlm_error(DLM_REJECTED);
428 return DLM_REJECTED;
429 }
430
431 mlog_bug_on_msg(!dlm_domain_fully_joined(dlm),
432 "Domain %s not fully joined!\n", dlm->name);
433
434 if (cnv->namelen > DLM_LOCKID_NAME_MAX) {
435 status = DLM_IVBUFLEN;
436 dlm_error(status);
437 goto leave;
438 }
439
440 flags = be32_to_cpu(cnv->flags);
441
442 if ((flags & (LKM_PUT_LVB|LKM_GET_LVB)) ==
443 (LKM_PUT_LVB|LKM_GET_LVB)) {
444 mlog(ML_ERROR, "both PUT and GET lvb specified\n");
445 status = DLM_BADARGS;
446 goto leave;
447 }
448
449 mlog(0, "lvb: %s\n", flags & LKM_PUT_LVB ? "put lvb" :
450 (flags & LKM_GET_LVB ? "get lvb" : "none"));
451
452 status = DLM_IVLOCKID;
453 res = dlm_lookup_lockres(dlm, cnv->name, cnv->namelen);
454 if (!res) {
455 dlm_error(status);
456 goto leave;
457 }
458
459 spin_lock(&res->spinlock);
460 list_for_each(iter, &res->granted) {
461 lock = list_entry(iter, struct dlm_lock, list);
462 if (lock->ml.cookie == cnv->cookie &&
463 lock->ml.node == cnv->node_idx) {
464 dlm_lock_get(lock);
465 break;
466 }
467 lock = NULL;
468 }
469 spin_unlock(&res->spinlock);
470 if (!lock) {
471 status = DLM_IVLOCKID;
472 dlm_error(status);
473 goto leave;
474 }
475
476 /* found the lock */
477 lksb = lock->lksb;
478
479 /* see if caller needed to get/put lvb */
480 if (flags & LKM_PUT_LVB) {
481 BUG_ON(lksb->flags & (DLM_LKSB_PUT_LVB|DLM_LKSB_GET_LVB));
482 lksb->flags |= DLM_LKSB_PUT_LVB;
483 memcpy(&lksb->lvb[0], &cnv->lvb[0], DLM_LVB_LEN);
484 } else if (flags & LKM_GET_LVB) {
485 BUG_ON(lksb->flags & (DLM_LKSB_PUT_LVB|DLM_LKSB_GET_LVB));
486 lksb->flags |= DLM_LKSB_GET_LVB;
487 }
488
489 spin_lock(&res->spinlock);
490 status = __dlm_lockres_state_to_status(res);
491 if (status == DLM_NORMAL) {
492 __dlm_lockres_reserve_ast(res);
493 res->state |= DLM_LOCK_RES_IN_PROGRESS;
494 status = __dlmconvert_master(dlm, res, lock, flags,
495 cnv->requested_type,
496 &call_ast, &kick_thread);
497 res->state &= ~DLM_LOCK_RES_IN_PROGRESS;
498 }
499 spin_unlock(&res->spinlock);
500
501 if (status != DLM_NORMAL) {
502 if (status != DLM_NOTQUEUED)
503 dlm_error(status);
504 lksb->flags &= ~(DLM_LKSB_GET_LVB|DLM_LKSB_PUT_LVB);
505 }
506
507leave:
508 if (!lock)
509 mlog(ML_ERROR, "did not find lock to convert on grant queue! "
510 "cookie=%"MLFu64"\n",
511 cnv->cookie);
512 else
513 dlm_lock_put(lock);
514
515 /* either queue the ast or release it */
516 if (call_ast)
517 dlm_queue_ast(dlm, lock);
518 else
519 dlm_lockres_release_ast(dlm, res);
520
521 if (kick_thread)
522 dlm_kick_thread(dlm, res);
523
524 if (res)
525 dlm_lockres_put(res);
526
527 dlm_put(dlm);
528
529 return status;
530}
diff --git a/fs/ocfs2/dlm/dlmconvert.h b/fs/ocfs2/dlm/dlmconvert.h
new file mode 100644
index 000000000000..b2e3677df878
--- /dev/null
+++ b/fs/ocfs2/dlm/dlmconvert.h
@@ -0,0 +1,35 @@
1/* -*- mode: c; c-basic-offset: 8; -*-
2 * vim: noexpandtab sw=8 ts=8 sts=0:
3 *
4 * dlmconvert.h
5 *
6 * Copyright (C) 2004 Oracle. All rights reserved.
7 *
8 * This program is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU General Public
10 * License as published by the Free Software Foundation; either
11 * version 2 of the License, or (at your option) any later version.
12 *
13 * This program is distributed in the hope that it will be useful,
14 * but WITHOUT ANY WARRANTY; without even the implied warranty of
15 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16 * General Public License for more details.
17 *
18 * You should have received a copy of the GNU General Public
19 * License along with this program; if not, write to the
20 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
21 * Boston, MA 021110-1307, USA.
22 *
23 */
24
25#ifndef DLMCONVERT_H
26#define DLMCONVERT_H
27
28enum dlm_status dlmconvert_master(struct dlm_ctxt *dlm,
29 struct dlm_lock_resource *res,
30 struct dlm_lock *lock, int flags, int type);
31enum dlm_status dlmconvert_remote(struct dlm_ctxt *dlm,
32 struct dlm_lock_resource *res,
33 struct dlm_lock *lock, int flags, int type);
34
35#endif
diff --git a/fs/ocfs2/dlm/dlmdebug.c b/fs/ocfs2/dlm/dlmdebug.c
new file mode 100644
index 000000000000..f339fe27975a
--- /dev/null
+++ b/fs/ocfs2/dlm/dlmdebug.c
@@ -0,0 +1,246 @@
1/* -*- mode: c; c-basic-offset: 8; -*-
2 * vim: noexpandtab sw=8 ts=8 sts=0:
3 *
4 * dlmdebug.c
5 *
6 * debug functionality for the dlm
7 *
8 * Copyright (C) 2004 Oracle. All rights reserved.
9 *
10 * This program is free software; you can redistribute it and/or
11 * modify it under the terms of the GNU General Public
12 * License as published by the Free Software Foundation; either
13 * version 2 of the License, or (at your option) any later version.
14 *
15 * This program is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
18 * General Public License for more details.
19 *
20 * You should have received a copy of the GNU General Public
21 * License along with this program; if not, write to the
22 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
23 * Boston, MA 021110-1307, USA.
24 *
25 */
26
27#include <linux/types.h>
28#include <linux/slab.h>
29#include <linux/highmem.h>
30#include <linux/utsname.h>
31#include <linux/sysctl.h>
32#include <linux/spinlock.h>
33
34#include "cluster/heartbeat.h"
35#include "cluster/nodemanager.h"
36#include "cluster/tcp.h"
37
38#include "dlmapi.h"
39#include "dlmcommon.h"
40#include "dlmdebug.h"
41
42#include "dlmdomain.h"
43#include "dlmdebug.h"
44
45#define MLOG_MASK_PREFIX ML_DLM
46#include "cluster/masklog.h"
47
48void dlm_print_one_lock_resource(struct dlm_lock_resource *res)
49{
50 mlog(ML_NOTICE, "lockres: %.*s, owner=%u, state=%u\n",
51 res->lockname.len, res->lockname.name,
52 res->owner, res->state);
53 spin_lock(&res->spinlock);
54 __dlm_print_one_lock_resource(res);
55 spin_unlock(&res->spinlock);
56}
57
58void __dlm_print_one_lock_resource(struct dlm_lock_resource *res)
59{
60 struct list_head *iter2;
61 struct dlm_lock *lock;
62
63 assert_spin_locked(&res->spinlock);
64
65 mlog(ML_NOTICE, "lockres: %.*s, owner=%u, state=%u\n",
66 res->lockname.len, res->lockname.name,
67 res->owner, res->state);
68 mlog(ML_NOTICE, " last used: %lu, on purge list: %s\n",
69 res->last_used, list_empty(&res->purge) ? "no" : "yes");
70 mlog(ML_NOTICE, " granted queue: \n");
71 list_for_each(iter2, &res->granted) {
72 lock = list_entry(iter2, struct dlm_lock, list);
73 spin_lock(&lock->spinlock);
74 mlog(ML_NOTICE, " type=%d, conv=%d, node=%u, "
75 "cookie=%"MLFu64", ast=(empty=%c,pend=%c), bast=(empty=%c,pend=%c)\n",
76 lock->ml.type, lock->ml.convert_type, lock->ml.node, lock->ml.cookie,
77 list_empty(&lock->ast_list) ? 'y' : 'n',
78 lock->ast_pending ? 'y' : 'n',
79 list_empty(&lock->bast_list) ? 'y' : 'n',
80 lock->bast_pending ? 'y' : 'n');
81 spin_unlock(&lock->spinlock);
82 }
83 mlog(ML_NOTICE, " converting queue: \n");
84 list_for_each(iter2, &res->converting) {
85 lock = list_entry(iter2, struct dlm_lock, list);
86 spin_lock(&lock->spinlock);
87 mlog(ML_NOTICE, " type=%d, conv=%d, node=%u, "
88 "cookie=%"MLFu64", ast=(empty=%c,pend=%c), bast=(empty=%c,pend=%c)\n",
89 lock->ml.type, lock->ml.convert_type, lock->ml.node, lock->ml.cookie,
90 list_empty(&lock->ast_list) ? 'y' : 'n',
91 lock->ast_pending ? 'y' : 'n',
92 list_empty(&lock->bast_list) ? 'y' : 'n',
93 lock->bast_pending ? 'y' : 'n');
94 spin_unlock(&lock->spinlock);
95 }
96 mlog(ML_NOTICE, " blocked queue: \n");
97 list_for_each(iter2, &res->blocked) {
98 lock = list_entry(iter2, struct dlm_lock, list);
99 spin_lock(&lock->spinlock);
100 mlog(ML_NOTICE, " type=%d, conv=%d, node=%u, "
101 "cookie=%"MLFu64", ast=(empty=%c,pend=%c), bast=(empty=%c,pend=%c)\n",
102 lock->ml.type, lock->ml.convert_type, lock->ml.node, lock->ml.cookie,
103 list_empty(&lock->ast_list) ? 'y' : 'n',
104 lock->ast_pending ? 'y' : 'n',
105 list_empty(&lock->bast_list) ? 'y' : 'n',
106 lock->bast_pending ? 'y' : 'n');
107 spin_unlock(&lock->spinlock);
108 }
109}
110
111void dlm_print_one_lock(struct dlm_lock *lockid)
112{
113 dlm_print_one_lock_resource(lockid->lockres);
114}
115EXPORT_SYMBOL_GPL(dlm_print_one_lock);
116
117void dlm_dump_lock_resources(struct dlm_ctxt *dlm)
118{
119 struct dlm_lock_resource *res;
120 struct list_head *iter;
121 struct list_head *bucket;
122 int i;
123
124 mlog(ML_NOTICE, "struct dlm_ctxt: %s, node=%u, key=%u\n",
125 dlm->name, dlm->node_num, dlm->key);
126 if (!dlm || !dlm->name) {
127 mlog(ML_ERROR, "dlm=%p\n", dlm);
128 return;
129 }
130
131 spin_lock(&dlm->spinlock);
132 for (i=0; i<DLM_HASH_SIZE; i++) {
133 bucket = &(dlm->resources[i]);
134 list_for_each(iter, bucket) {
135 res = list_entry(iter, struct dlm_lock_resource, list);
136 dlm_print_one_lock_resource(res);
137 }
138 }
139 spin_unlock(&dlm->spinlock);
140}
141
142static const char *dlm_errnames[] = {
143 [DLM_NORMAL] = "DLM_NORMAL",
144 [DLM_GRANTED] = "DLM_GRANTED",
145 [DLM_DENIED] = "DLM_DENIED",
146 [DLM_DENIED_NOLOCKS] = "DLM_DENIED_NOLOCKS",
147 [DLM_WORKING] = "DLM_WORKING",
148 [DLM_BLOCKED] = "DLM_BLOCKED",
149 [DLM_BLOCKED_ORPHAN] = "DLM_BLOCKED_ORPHAN",
150 [DLM_DENIED_GRACE_PERIOD] = "DLM_DENIED_GRACE_PERIOD",
151 [DLM_SYSERR] = "DLM_SYSERR",
152 [DLM_NOSUPPORT] = "DLM_NOSUPPORT",
153 [DLM_CANCELGRANT] = "DLM_CANCELGRANT",
154 [DLM_IVLOCKID] = "DLM_IVLOCKID",
155 [DLM_SYNC] = "DLM_SYNC",
156 [DLM_BADTYPE] = "DLM_BADTYPE",
157 [DLM_BADRESOURCE] = "DLM_BADRESOURCE",
158 [DLM_MAXHANDLES] = "DLM_MAXHANDLES",
159 [DLM_NOCLINFO] = "DLM_NOCLINFO",
160 [DLM_NOLOCKMGR] = "DLM_NOLOCKMGR",
161 [DLM_NOPURGED] = "DLM_NOPURGED",
162 [DLM_BADARGS] = "DLM_BADARGS",
163 [DLM_VOID] = "DLM_VOID",
164 [DLM_NOTQUEUED] = "DLM_NOTQUEUED",
165 [DLM_IVBUFLEN] = "DLM_IVBUFLEN",
166 [DLM_CVTUNGRANT] = "DLM_CVTUNGRANT",
167 [DLM_BADPARAM] = "DLM_BADPARAM",
168 [DLM_VALNOTVALID] = "DLM_VALNOTVALID",
169 [DLM_REJECTED] = "DLM_REJECTED",
170 [DLM_ABORT] = "DLM_ABORT",
171 [DLM_CANCEL] = "DLM_CANCEL",
172 [DLM_IVRESHANDLE] = "DLM_IVRESHANDLE",
173 [DLM_DEADLOCK] = "DLM_DEADLOCK",
174 [DLM_DENIED_NOASTS] = "DLM_DENIED_NOASTS",
175 [DLM_FORWARD] = "DLM_FORWARD",
176 [DLM_TIMEOUT] = "DLM_TIMEOUT",
177 [DLM_IVGROUPID] = "DLM_IVGROUPID",
178 [DLM_VERS_CONFLICT] = "DLM_VERS_CONFLICT",
179 [DLM_BAD_DEVICE_PATH] = "DLM_BAD_DEVICE_PATH",
180 [DLM_NO_DEVICE_PERMISSION] = "DLM_NO_DEVICE_PERMISSION",
181 [DLM_NO_CONTROL_DEVICE ] = "DLM_NO_CONTROL_DEVICE ",
182 [DLM_RECOVERING] = "DLM_RECOVERING",
183 [DLM_MIGRATING] = "DLM_MIGRATING",
184 [DLM_MAXSTATS] = "DLM_MAXSTATS",
185};
186
187static const char *dlm_errmsgs[] = {
188 [DLM_NORMAL] = "request in progress",
189 [DLM_GRANTED] = "request granted",
190 [DLM_DENIED] = "request denied",
191 [DLM_DENIED_NOLOCKS] = "request denied, out of system resources",
192 [DLM_WORKING] = "async request in progress",
193 [DLM_BLOCKED] = "lock request blocked",
194 [DLM_BLOCKED_ORPHAN] = "lock request blocked by a orphan lock",
195 [DLM_DENIED_GRACE_PERIOD] = "topological change in progress",
196 [DLM_SYSERR] = "system error",
197 [DLM_NOSUPPORT] = "unsupported",
198 [DLM_CANCELGRANT] = "can't cancel convert: already granted",
199 [DLM_IVLOCKID] = "bad lockid",
200 [DLM_SYNC] = "synchronous request granted",
201 [DLM_BADTYPE] = "bad resource type",
202 [DLM_BADRESOURCE] = "bad resource handle",
203 [DLM_MAXHANDLES] = "no more resource handles",
204 [DLM_NOCLINFO] = "can't contact cluster manager",
205 [DLM_NOLOCKMGR] = "can't contact lock manager",
206 [DLM_NOPURGED] = "can't contact purge daemon",
207 [DLM_BADARGS] = "bad api args",
208 [DLM_VOID] = "no status",
209 [DLM_NOTQUEUED] = "NOQUEUE was specified and request failed",
210 [DLM_IVBUFLEN] = "invalid resource name length",
211 [DLM_CVTUNGRANT] = "attempted to convert ungranted lock",
212 [DLM_BADPARAM] = "invalid lock mode specified",
213 [DLM_VALNOTVALID] = "value block has been invalidated",
214 [DLM_REJECTED] = "request rejected, unrecognized client",
215 [DLM_ABORT] = "blocked lock request cancelled",
216 [DLM_CANCEL] = "conversion request cancelled",
217 [DLM_IVRESHANDLE] = "invalid resource handle",
218 [DLM_DEADLOCK] = "deadlock recovery refused this request",
219 [DLM_DENIED_NOASTS] = "failed to allocate AST",
220 [DLM_FORWARD] = "request must wait for primary's response",
221 [DLM_TIMEOUT] = "timeout value for lock has expired",
222 [DLM_IVGROUPID] = "invalid group specification",
223 [DLM_VERS_CONFLICT] = "version conflicts prevent request handling",
224 [DLM_BAD_DEVICE_PATH] = "Locks device does not exist or path wrong",
225 [DLM_NO_DEVICE_PERMISSION] = "Client has insufficient perms for device",
226 [DLM_NO_CONTROL_DEVICE] = "Cannot set options on opened device ",
227 [DLM_RECOVERING] = "lock resource being recovered",
228 [DLM_MIGRATING] = "lock resource being migrated",
229 [DLM_MAXSTATS] = "invalid error number",
230};
231
232const char *dlm_errmsg(enum dlm_status err)
233{
234 if (err >= DLM_MAXSTATS || err < 0)
235 return dlm_errmsgs[DLM_MAXSTATS];
236 return dlm_errmsgs[err];
237}
238EXPORT_SYMBOL_GPL(dlm_errmsg);
239
240const char *dlm_errname(enum dlm_status err)
241{
242 if (err >= DLM_MAXSTATS || err < 0)
243 return dlm_errnames[DLM_MAXSTATS];
244 return dlm_errnames[err];
245}
246EXPORT_SYMBOL_GPL(dlm_errname);
diff --git a/fs/ocfs2/dlm/dlmdebug.h b/fs/ocfs2/dlm/dlmdebug.h
new file mode 100644
index 000000000000..6858510c3ccd
--- /dev/null
+++ b/fs/ocfs2/dlm/dlmdebug.h
@@ -0,0 +1,30 @@
1/* -*- mode: c; c-basic-offset: 8; -*-
2 * vim: noexpandtab sw=8 ts=8 sts=0:
3 *
4 * dlmdebug.h
5 *
6 * Copyright (C) 2004 Oracle. All rights reserved.
7 *
8 * This program is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU General Public
10 * License as published by the Free Software Foundation; either
11 * version 2 of the License, or (at your option) any later version.
12 *
13 * This program is distributed in the hope that it will be useful,
14 * but WITHOUT ANY WARRANTY; without even the implied warranty of
15 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16 * General Public License for more details.
17 *
18 * You should have received a copy of the GNU General Public
19 * License along with this program; if not, write to the
20 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
21 * Boston, MA 02111-1307, USA.
22 *
23 */
24
25#ifndef DLMDEBUG_H
26#define DLMDEBUG_H
27
28void dlm_dump_lock_resources(struct dlm_ctxt *dlm);
29
30#endif
diff --git a/fs/ocfs2/dlm/dlmdomain.c b/fs/ocfs2/dlm/dlmdomain.c
new file mode 100644
index 000000000000..da3c22045f89
--- /dev/null
+++ b/fs/ocfs2/dlm/dlmdomain.c
@@ -0,0 +1,1469 @@
1/* -*- mode: c; c-basic-offset: 8; -*-
2 * vim: noexpandtab sw=8 ts=8 sts=0:
3 *
4 * dlmdomain.c
5 *
6 * defines domain join / leave apis
7 *
8 * Copyright (C) 2004 Oracle. All rights reserved.
9 *
10 * This program is free software; you can redistribute it and/or
11 * modify it under the terms of the GNU General Public
12 * License as published by the Free Software Foundation; either
13 * version 2 of the License, or (at your option) any later version.
14 *
15 * This program is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
18 * General Public License for more details.
19 *
20 * You should have received a copy of the GNU General Public
21 * License along with this program; if not, write to the
22 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
23 * Boston, MA 02111-1307, USA.
24 *
25 */
26
27#include <linux/module.h>
28#include <linux/types.h>
29#include <linux/slab.h>
30#include <linux/highmem.h>
31#include <linux/utsname.h>
32#include <linux/init.h>
33#include <linux/spinlock.h>
34#include <linux/delay.h>
35#include <linux/err.h>
36
37#include "cluster/heartbeat.h"
38#include "cluster/nodemanager.h"
39#include "cluster/tcp.h"
40
41#include "dlmapi.h"
42#include "dlmcommon.h"
43
44#include "dlmdebug.h"
45#include "dlmdomain.h"
46
47#include "dlmver.h"
48
49#define MLOG_MASK_PREFIX (ML_DLM|ML_DLM_DOMAIN)
50#include "cluster/masklog.h"
51
52/*
53 *
54 * spinlock lock ordering: if multiple locks are needed, obey this ordering:
55 * dlm_domain_lock
56 * struct dlm_ctxt->spinlock
57 * struct dlm_lock_resource->spinlock
58 * struct dlm_ctxt->master_lock
59 * struct dlm_ctxt->ast_lock
60 * dlm_master_list_entry->spinlock
61 * dlm_lock->spinlock
62 *
63 */
64
/* Guards the dlm_domains list; in this file it is also held around every
 * read or write of a context's dlm_state and num_joins. */
DEFINE_SPINLOCK(dlm_domain_lock);

/* Every dlm_ctxt known to this node, linked through dlm_ctxt->list. */
LIST_HEAD(dlm_domains);

/* Woken when a join attempt finishes, when a domain is taken off
 * dlm_domains, and when a context is about to be freed. */
static DECLARE_WAIT_QUEUE_HEAD(dlm_domain_events);

/* Base delay, in milliseconds, used when backing off between retries. */
#define DLM_DOMAIN_BACKOFF_MS 200
70
71static int dlm_query_join_handler(struct o2net_msg *msg, u32 len, void *data);
72static int dlm_assert_joined_handler(struct o2net_msg *msg, u32 len, void *data);
73static int dlm_cancel_join_handler(struct o2net_msg *msg, u32 len, void *data);
74static int dlm_exit_domain_handler(struct o2net_msg *msg, u32 len, void *data);
75
76static void dlm_unregister_domain_handlers(struct dlm_ctxt *dlm);
77
78void __dlm_unhash_lockres(struct dlm_lock_resource *lockres)
79{
80 list_del_init(&lockres->list);
81 dlm_lockres_put(lockres);
82}
83
84void __dlm_insert_lockres(struct dlm_ctxt *dlm,
85 struct dlm_lock_resource *res)
86{
87 struct list_head *bucket;
88 struct qstr *q;
89
90 assert_spin_locked(&dlm->spinlock);
91
92 q = &res->lockname;
93 q->hash = full_name_hash(q->name, q->len);
94 bucket = &(dlm->resources[q->hash & DLM_HASH_MASK]);
95
96 /* get a reference for our hashtable */
97 dlm_lockres_get(res);
98
99 list_add_tail(&res->list, bucket);
100}
101
102struct dlm_lock_resource * __dlm_lookup_lockres(struct dlm_ctxt *dlm,
103 const char *name,
104 unsigned int len)
105{
106 unsigned int hash;
107 struct list_head *iter;
108 struct dlm_lock_resource *tmpres=NULL;
109 struct list_head *bucket;
110
111 mlog_entry("%.*s\n", len, name);
112
113 assert_spin_locked(&dlm->spinlock);
114
115 hash = full_name_hash(name, len);
116
117 bucket = &(dlm->resources[hash & DLM_HASH_MASK]);
118
119 /* check for pre-existing lock */
120 list_for_each(iter, bucket) {
121 tmpres = list_entry(iter, struct dlm_lock_resource, list);
122 if (tmpres->lockname.len == len &&
123 memcmp(tmpres->lockname.name, name, len) == 0) {
124 dlm_lockres_get(tmpres);
125 break;
126 }
127
128 tmpres = NULL;
129 }
130 return tmpres;
131}
132
133struct dlm_lock_resource * dlm_lookup_lockres(struct dlm_ctxt *dlm,
134 const char *name,
135 unsigned int len)
136{
137 struct dlm_lock_resource *res;
138
139 spin_lock(&dlm->spinlock);
140 res = __dlm_lookup_lockres(dlm, name, len);
141 spin_unlock(&dlm->spinlock);
142 return res;
143}
144
145static struct dlm_ctxt * __dlm_lookup_domain_full(const char *domain, int len)
146{
147 struct dlm_ctxt *tmp = NULL;
148 struct list_head *iter;
149
150 assert_spin_locked(&dlm_domain_lock);
151
152 /* tmp->name here is always NULL terminated,
153 * but domain may not be! */
154 list_for_each(iter, &dlm_domains) {
155 tmp = list_entry (iter, struct dlm_ctxt, list);
156 if (strlen(tmp->name) == len &&
157 memcmp(tmp->name, domain, len)==0)
158 break;
159 tmp = NULL;
160 }
161
162 return tmp;
163}
164
165/* For null terminated domain strings ONLY */
166static struct dlm_ctxt * __dlm_lookup_domain(const char *domain)
167{
168 assert_spin_locked(&dlm_domain_lock);
169
170 return __dlm_lookup_domain_full(domain, strlen(domain));
171}
172
173
174/* returns true on one of two conditions:
175 * 1) the domain does not exist
176 * 2) the domain exists and it's state is "joined" */
177static int dlm_wait_on_domain_helper(const char *domain)
178{
179 int ret = 0;
180 struct dlm_ctxt *tmp = NULL;
181
182 spin_lock(&dlm_domain_lock);
183
184 tmp = __dlm_lookup_domain(domain);
185 if (!tmp)
186 ret = 1;
187 else if (tmp->dlm_state == DLM_CTXT_JOINED)
188 ret = 1;
189
190 spin_unlock(&dlm_domain_lock);
191 return ret;
192}
193
194static void dlm_free_ctxt_mem(struct dlm_ctxt *dlm)
195{
196 if (dlm->resources)
197 free_page((unsigned long) dlm->resources);
198
199 if (dlm->name)
200 kfree(dlm->name);
201
202 kfree(dlm);
203}
204
205/* A little strange - this function will be called while holding
206 * dlm_domain_lock and is expected to be holding it on the way out. We
207 * will however drop and reacquire it multiple times */
208static void dlm_ctxt_release(struct kref *kref)
209{
210 struct dlm_ctxt *dlm;
211
212 dlm = container_of(kref, struct dlm_ctxt, dlm_refs);
213
214 BUG_ON(dlm->num_joins);
215 BUG_ON(dlm->dlm_state == DLM_CTXT_JOINED);
216
217 /* we may still be in the list if we hit an error during join. */
218 list_del_init(&dlm->list);
219
220 spin_unlock(&dlm_domain_lock);
221
222 mlog(0, "freeing memory from domain %s\n", dlm->name);
223
224 wake_up(&dlm_domain_events);
225
226 dlm_free_ctxt_mem(dlm);
227
228 spin_lock(&dlm_domain_lock);
229}
230
231void dlm_put(struct dlm_ctxt *dlm)
232{
233 spin_lock(&dlm_domain_lock);
234 kref_put(&dlm->dlm_refs, dlm_ctxt_release);
235 spin_unlock(&dlm_domain_lock);
236}
237
238static void __dlm_get(struct dlm_ctxt *dlm)
239{
240 kref_get(&dlm->dlm_refs);
241}
242
243/* given a questionable reference to a dlm object, gets a reference if
244 * it can find it in the list, otherwise returns NULL in which case
245 * you shouldn't trust your pointer. */
246struct dlm_ctxt *dlm_grab(struct dlm_ctxt *dlm)
247{
248 struct list_head *iter;
249 struct dlm_ctxt *target = NULL;
250
251 spin_lock(&dlm_domain_lock);
252
253 list_for_each(iter, &dlm_domains) {
254 target = list_entry (iter, struct dlm_ctxt, list);
255
256 if (target == dlm) {
257 __dlm_get(target);
258 break;
259 }
260
261 target = NULL;
262 }
263
264 spin_unlock(&dlm_domain_lock);
265
266 return target;
267}
268
269int dlm_domain_fully_joined(struct dlm_ctxt *dlm)
270{
271 int ret;
272
273 spin_lock(&dlm_domain_lock);
274 ret = (dlm->dlm_state == DLM_CTXT_JOINED) ||
275 (dlm->dlm_state == DLM_CTXT_IN_SHUTDOWN);
276 spin_unlock(&dlm_domain_lock);
277
278 return ret;
279}
280
281static void dlm_complete_dlm_shutdown(struct dlm_ctxt *dlm)
282{
283 dlm_unregister_domain_handlers(dlm);
284 dlm_complete_thread(dlm);
285 dlm_complete_recovery_thread(dlm);
286
287 /* We've left the domain. Now we can take ourselves out of the
288 * list and allow the kref stuff to help us free the
289 * memory. */
290 spin_lock(&dlm_domain_lock);
291 list_del_init(&dlm->list);
292 spin_unlock(&dlm_domain_lock);
293
294 /* Wake up anyone waiting for us to remove this domain */
295 wake_up(&dlm_domain_events);
296}
297
298static void dlm_migrate_all_locks(struct dlm_ctxt *dlm)
299{
300 int i;
301 struct dlm_lock_resource *res;
302
303 mlog(0, "Migrating locks from domain %s\n", dlm->name);
304restart:
305 spin_lock(&dlm->spinlock);
306 for (i=0; i<DLM_HASH_SIZE; i++) {
307 while (!list_empty(&dlm->resources[i])) {
308 res = list_entry(dlm->resources[i].next,
309 struct dlm_lock_resource, list);
310 /* need reference when manually grabbing lockres */
311 dlm_lockres_get(res);
312 /* this should unhash the lockres
313 * and exit with dlm->spinlock */
314 mlog(0, "purging res=%p\n", res);
315 if (dlm_lockres_is_dirty(dlm, res)) {
316 /* HACK! this should absolutely go.
317 * need to figure out why some empty
318 * lockreses are still marked dirty */
319 mlog(ML_ERROR, "lockres %.*s dirty!\n",
320 res->lockname.len, res->lockname.name);
321
322 spin_unlock(&dlm->spinlock);
323 dlm_kick_thread(dlm, res);
324 wait_event(dlm->ast_wq, !dlm_lockres_is_dirty(dlm, res));
325 dlm_lockres_put(res);
326 goto restart;
327 }
328 dlm_purge_lockres(dlm, res);
329 dlm_lockres_put(res);
330 }
331 }
332 spin_unlock(&dlm->spinlock);
333
334 mlog(0, "DONE Migrating locks from domain %s\n", dlm->name);
335}
336
337static int dlm_no_joining_node(struct dlm_ctxt *dlm)
338{
339 int ret;
340
341 spin_lock(&dlm->spinlock);
342 ret = dlm->joining_node == DLM_LOCK_RES_OWNER_UNKNOWN;
343 spin_unlock(&dlm->spinlock);
344
345 return ret;
346}
347
348static void dlm_mark_domain_leaving(struct dlm_ctxt *dlm)
349{
350 /* Yikes, a double spinlock! I need domain_lock for the dlm
351 * state and the dlm spinlock for join state... Sorry! */
352again:
353 spin_lock(&dlm_domain_lock);
354 spin_lock(&dlm->spinlock);
355
356 if (dlm->joining_node != DLM_LOCK_RES_OWNER_UNKNOWN) {
357 mlog(0, "Node %d is joining, we wait on it.\n",
358 dlm->joining_node);
359 spin_unlock(&dlm->spinlock);
360 spin_unlock(&dlm_domain_lock);
361
362 wait_event(dlm->dlm_join_events, dlm_no_joining_node(dlm));
363 goto again;
364 }
365
366 dlm->dlm_state = DLM_CTXT_LEAVING;
367 spin_unlock(&dlm->spinlock);
368 spin_unlock(&dlm_domain_lock);
369}
370
371static void __dlm_print_nodes(struct dlm_ctxt *dlm)
372{
373 int node = -1;
374
375 assert_spin_locked(&dlm->spinlock);
376
377 mlog(ML_NOTICE, "Nodes in my domain (\"%s\"):\n", dlm->name);
378
379 while ((node = find_next_bit(dlm->domain_map, O2NM_MAX_NODES,
380 node + 1)) < O2NM_MAX_NODES) {
381 mlog(ML_NOTICE, " node %d\n", node);
382 }
383}
384
385static int dlm_exit_domain_handler(struct o2net_msg *msg, u32 len, void *data)
386{
387 struct dlm_ctxt *dlm = data;
388 unsigned int node;
389 struct dlm_exit_domain *exit_msg = (struct dlm_exit_domain *) msg->buf;
390
391 mlog_entry("%p %u %p", msg, len, data);
392
393 if (!dlm_grab(dlm))
394 return 0;
395
396 node = exit_msg->node_idx;
397
398 mlog(0, "Node %u leaves domain %s\n", node, dlm->name);
399
400 spin_lock(&dlm->spinlock);
401 clear_bit(node, dlm->domain_map);
402 __dlm_print_nodes(dlm);
403
404 /* notify anything attached to the heartbeat events */
405 dlm_hb_event_notify_attached(dlm, node, 0);
406
407 spin_unlock(&dlm->spinlock);
408
409 dlm_put(dlm);
410
411 return 0;
412}
413
414static int dlm_send_one_domain_exit(struct dlm_ctxt *dlm,
415 unsigned int node)
416{
417 int status;
418 struct dlm_exit_domain leave_msg;
419
420 mlog(0, "Asking node %u if we can leave the domain %s me = %u\n",
421 node, dlm->name, dlm->node_num);
422
423 memset(&leave_msg, 0, sizeof(leave_msg));
424 leave_msg.node_idx = dlm->node_num;
425
426 status = o2net_send_message(DLM_EXIT_DOMAIN_MSG, dlm->key,
427 &leave_msg, sizeof(leave_msg), node,
428 NULL);
429
430 mlog(0, "status return %d from o2net_send_message\n", status);
431
432 return status;
433}
434
435
436static void dlm_leave_domain(struct dlm_ctxt *dlm)
437{
438 int node, clear_node, status;
439
440 /* At this point we've migrated away all our locks and won't
441 * accept mastership of new ones. The dlm is responsible for
442 * almost nothing now. We make sure not to confuse any joining
443 * nodes and then commence shutdown procedure. */
444
445 spin_lock(&dlm->spinlock);
446 /* Clear ourselves from the domain map */
447 clear_bit(dlm->node_num, dlm->domain_map);
448 while ((node = find_next_bit(dlm->domain_map, O2NM_MAX_NODES,
449 0)) < O2NM_MAX_NODES) {
450 /* Drop the dlm spinlock. This is safe wrt the domain_map.
451 * -nodes cannot be added now as the
452 * query_join_handlers knows to respond with OK_NO_MAP
453 * -we catch the right network errors if a node is
454 * removed from the map while we're sending him the
455 * exit message. */
456 spin_unlock(&dlm->spinlock);
457
458 clear_node = 1;
459
460 status = dlm_send_one_domain_exit(dlm, node);
461 if (status < 0 &&
462 status != -ENOPROTOOPT &&
463 status != -ENOTCONN) {
464 mlog(ML_NOTICE, "Error %d sending domain exit message "
465 "to node %d\n", status, node);
466
467 /* Not sure what to do here but lets sleep for
468 * a bit in case this was a transient
469 * error... */
470 msleep(DLM_DOMAIN_BACKOFF_MS);
471 clear_node = 0;
472 }
473
474 spin_lock(&dlm->spinlock);
475 /* If we're not clearing the node bit then we intend
476 * to loop back around to try again. */
477 if (clear_node)
478 clear_bit(node, dlm->domain_map);
479 }
480 spin_unlock(&dlm->spinlock);
481}
482
483int dlm_joined(struct dlm_ctxt *dlm)
484{
485 int ret = 0;
486
487 spin_lock(&dlm_domain_lock);
488
489 if (dlm->dlm_state == DLM_CTXT_JOINED)
490 ret = 1;
491
492 spin_unlock(&dlm_domain_lock);
493
494 return ret;
495}
496
497int dlm_shutting_down(struct dlm_ctxt *dlm)
498{
499 int ret = 0;
500
501 spin_lock(&dlm_domain_lock);
502
503 if (dlm->dlm_state == DLM_CTXT_IN_SHUTDOWN)
504 ret = 1;
505
506 spin_unlock(&dlm_domain_lock);
507
508 return ret;
509}
510
511void dlm_unregister_domain(struct dlm_ctxt *dlm)
512{
513 int leave = 0;
514
515 spin_lock(&dlm_domain_lock);
516 BUG_ON(dlm->dlm_state != DLM_CTXT_JOINED);
517 BUG_ON(!dlm->num_joins);
518
519 dlm->num_joins--;
520 if (!dlm->num_joins) {
521 /* We mark it "in shutdown" now so new register
522 * requests wait until we've completely left the
523 * domain. Don't use DLM_CTXT_LEAVING yet as we still
524 * want new domain joins to communicate with us at
525 * least until we've completed migration of our
526 * resources. */
527 dlm->dlm_state = DLM_CTXT_IN_SHUTDOWN;
528 leave = 1;
529 }
530 spin_unlock(&dlm_domain_lock);
531
532 if (leave) {
533 mlog(0, "shutting down domain %s\n", dlm->name);
534
535 /* We changed dlm state, notify the thread */
536 dlm_kick_thread(dlm, NULL);
537
538 dlm_migrate_all_locks(dlm);
539 dlm_mark_domain_leaving(dlm);
540 dlm_leave_domain(dlm);
541 dlm_complete_dlm_shutdown(dlm);
542 }
543 dlm_put(dlm);
544}
545EXPORT_SYMBOL_GPL(dlm_unregister_domain);
546
547static int dlm_query_join_handler(struct o2net_msg *msg, u32 len, void *data)
548{
549 struct dlm_query_join_request *query;
550 enum dlm_query_join_response response;
551 struct dlm_ctxt *dlm = NULL;
552
553 query = (struct dlm_query_join_request *) msg->buf;
554
555 mlog(0, "node %u wants to join domain %s\n", query->node_idx,
556 query->domain);
557
558 /*
559 * If heartbeat doesn't consider the node live, tell it
560 * to back off and try again. This gives heartbeat a chance
561 * to catch up.
562 */
563 if (!o2hb_check_node_heartbeating(query->node_idx)) {
564 mlog(0, "node %u is not in our live map yet\n",
565 query->node_idx);
566
567 response = JOIN_DISALLOW;
568 goto respond;
569 }
570
571 response = JOIN_OK_NO_MAP;
572
573 spin_lock(&dlm_domain_lock);
574 dlm = __dlm_lookup_domain_full(query->domain, query->name_len);
575 /* Once the dlm ctxt is marked as leaving then we don't want
576 * to be put in someone's domain map. */
577 if (dlm && dlm->dlm_state != DLM_CTXT_LEAVING) {
578 spin_lock(&dlm->spinlock);
579
580 if (dlm->dlm_state == DLM_CTXT_NEW &&
581 dlm->joining_node == DLM_LOCK_RES_OWNER_UNKNOWN) {
582 /*If this is a brand new context and we
583 * haven't started our join process yet, then
584 * the other node won the race. */
585 response = JOIN_OK_NO_MAP;
586 } else if (dlm->joining_node != DLM_LOCK_RES_OWNER_UNKNOWN) {
587 /* Disallow parallel joins. */
588 response = JOIN_DISALLOW;
589 } else {
590 /* Alright we're fully a part of this domain
591 * so we keep some state as to who's joining
592 * and indicate to him that needs to be fixed
593 * up. */
594 response = JOIN_OK;
595 __dlm_set_joining_node(dlm, query->node_idx);
596 }
597
598 spin_unlock(&dlm->spinlock);
599 }
600 spin_unlock(&dlm_domain_lock);
601
602respond:
603 mlog(0, "We respond with %u\n", response);
604
605 return response;
606}
607
608static int dlm_assert_joined_handler(struct o2net_msg *msg, u32 len, void *data)
609{
610 struct dlm_assert_joined *assert;
611 struct dlm_ctxt *dlm = NULL;
612
613 assert = (struct dlm_assert_joined *) msg->buf;
614
615 mlog(0, "node %u asserts join on domain %s\n", assert->node_idx,
616 assert->domain);
617
618 spin_lock(&dlm_domain_lock);
619 dlm = __dlm_lookup_domain_full(assert->domain, assert->name_len);
620 /* XXX should we consider no dlm ctxt an error? */
621 if (dlm) {
622 spin_lock(&dlm->spinlock);
623
624 /* Alright, this node has officially joined our
625 * domain. Set him in the map and clean up our
626 * leftover join state. */
627 BUG_ON(dlm->joining_node != assert->node_idx);
628 set_bit(assert->node_idx, dlm->domain_map);
629 __dlm_set_joining_node(dlm, DLM_LOCK_RES_OWNER_UNKNOWN);
630
631 __dlm_print_nodes(dlm);
632
633 /* notify anything attached to the heartbeat events */
634 dlm_hb_event_notify_attached(dlm, assert->node_idx, 1);
635
636 spin_unlock(&dlm->spinlock);
637 }
638 spin_unlock(&dlm_domain_lock);
639
640 return 0;
641}
642
643static int dlm_cancel_join_handler(struct o2net_msg *msg, u32 len, void *data)
644{
645 struct dlm_cancel_join *cancel;
646 struct dlm_ctxt *dlm = NULL;
647
648 cancel = (struct dlm_cancel_join *) msg->buf;
649
650 mlog(0, "node %u cancels join on domain %s\n", cancel->node_idx,
651 cancel->domain);
652
653 spin_lock(&dlm_domain_lock);
654 dlm = __dlm_lookup_domain_full(cancel->domain, cancel->name_len);
655
656 if (dlm) {
657 spin_lock(&dlm->spinlock);
658
659 /* Yikes, this guy wants to cancel his join. No
660 * problem, we simply cleanup our join state. */
661 BUG_ON(dlm->joining_node != cancel->node_idx);
662 __dlm_set_joining_node(dlm, DLM_LOCK_RES_OWNER_UNKNOWN);
663
664 spin_unlock(&dlm->spinlock);
665 }
666 spin_unlock(&dlm_domain_lock);
667
668 return 0;
669}
670
671static int dlm_send_one_join_cancel(struct dlm_ctxt *dlm,
672 unsigned int node)
673{
674 int status;
675 struct dlm_cancel_join cancel_msg;
676
677 memset(&cancel_msg, 0, sizeof(cancel_msg));
678 cancel_msg.node_idx = dlm->node_num;
679 cancel_msg.name_len = strlen(dlm->name);
680 memcpy(cancel_msg.domain, dlm->name, cancel_msg.name_len);
681
682 status = o2net_send_message(DLM_CANCEL_JOIN_MSG, DLM_MOD_KEY,
683 &cancel_msg, sizeof(cancel_msg), node,
684 NULL);
685 if (status < 0) {
686 mlog_errno(status);
687 goto bail;
688 }
689
690bail:
691 return status;
692}
693
694/* map_size should be in bytes. */
695static int dlm_send_join_cancels(struct dlm_ctxt *dlm,
696 unsigned long *node_map,
697 unsigned int map_size)
698{
699 int status, tmpstat;
700 unsigned int node;
701
702 if (map_size != (BITS_TO_LONGS(O2NM_MAX_NODES) *
703 sizeof(unsigned long))) {
704 mlog(ML_ERROR,
705 "map_size %u != BITS_TO_LONGS(O2NM_MAX_NODES) %u\n",
706 map_size, BITS_TO_LONGS(O2NM_MAX_NODES));
707 return -EINVAL;
708 }
709
710 status = 0;
711 node = -1;
712 while ((node = find_next_bit(node_map, O2NM_MAX_NODES,
713 node + 1)) < O2NM_MAX_NODES) {
714 if (node == dlm->node_num)
715 continue;
716
717 tmpstat = dlm_send_one_join_cancel(dlm, node);
718 if (tmpstat) {
719 mlog(ML_ERROR, "Error return %d cancelling join on "
720 "node %d\n", tmpstat, node);
721 if (!status)
722 status = tmpstat;
723 }
724 }
725
726 if (status)
727 mlog_errno(status);
728 return status;
729}
730
731static int dlm_request_join(struct dlm_ctxt *dlm,
732 int node,
733 enum dlm_query_join_response *response)
734{
735 int status, retval;
736 struct dlm_query_join_request join_msg;
737
738 mlog(0, "querying node %d\n", node);
739
740 memset(&join_msg, 0, sizeof(join_msg));
741 join_msg.node_idx = dlm->node_num;
742 join_msg.name_len = strlen(dlm->name);
743 memcpy(join_msg.domain, dlm->name, join_msg.name_len);
744
745 status = o2net_send_message(DLM_QUERY_JOIN_MSG, DLM_MOD_KEY, &join_msg,
746 sizeof(join_msg), node, &retval);
747 if (status < 0 && status != -ENOPROTOOPT) {
748 mlog_errno(status);
749 goto bail;
750 }
751
752 /* -ENOPROTOOPT from the net code means the other side isn't
753 listening for our message type -- that's fine, it means
754 his dlm isn't up, so we can consider him a 'yes' but not
755 joined into the domain. */
756 if (status == -ENOPROTOOPT) {
757 status = 0;
758 *response = JOIN_OK_NO_MAP;
759 } else if (retval == JOIN_DISALLOW ||
760 retval == JOIN_OK ||
761 retval == JOIN_OK_NO_MAP) {
762 *response = retval;
763 } else {
764 status = -EINVAL;
765 mlog(ML_ERROR, "invalid response %d from node %u\n", retval,
766 node);
767 }
768
769 mlog(0, "status %d, node %d response is %d\n", status, node,
770 *response);
771
772bail:
773 return status;
774}
775
776static int dlm_send_one_join_assert(struct dlm_ctxt *dlm,
777 unsigned int node)
778{
779 int status;
780 struct dlm_assert_joined assert_msg;
781
782 mlog(0, "Sending join assert to node %u\n", node);
783
784 memset(&assert_msg, 0, sizeof(assert_msg));
785 assert_msg.node_idx = dlm->node_num;
786 assert_msg.name_len = strlen(dlm->name);
787 memcpy(assert_msg.domain, dlm->name, assert_msg.name_len);
788
789 status = o2net_send_message(DLM_ASSERT_JOINED_MSG, DLM_MOD_KEY,
790 &assert_msg, sizeof(assert_msg), node,
791 NULL);
792 if (status < 0)
793 mlog_errno(status);
794
795 return status;
796}
797
798static void dlm_send_join_asserts(struct dlm_ctxt *dlm,
799 unsigned long *node_map)
800{
801 int status, node, live;
802
803 status = 0;
804 node = -1;
805 while ((node = find_next_bit(node_map, O2NM_MAX_NODES,
806 node + 1)) < O2NM_MAX_NODES) {
807 if (node == dlm->node_num)
808 continue;
809
810 do {
811 /* It is very important that this message be
812 * received so we spin until either the node
813 * has died or it gets the message. */
814 status = dlm_send_one_join_assert(dlm, node);
815
816 spin_lock(&dlm->spinlock);
817 live = test_bit(node, dlm->live_nodes_map);
818 spin_unlock(&dlm->spinlock);
819
820 if (status) {
821 mlog(ML_ERROR, "Error return %d asserting "
822 "join on node %d\n", status, node);
823
824 /* give us some time between errors... */
825 if (live)
826 msleep(DLM_DOMAIN_BACKOFF_MS);
827 }
828 } while (status && live);
829 }
830}
831
832struct domain_join_ctxt {
833 unsigned long live_map[BITS_TO_LONGS(O2NM_MAX_NODES)];
834 unsigned long yes_resp_map[BITS_TO_LONGS(O2NM_MAX_NODES)];
835};
836
837static int dlm_should_restart_join(struct dlm_ctxt *dlm,
838 struct domain_join_ctxt *ctxt,
839 enum dlm_query_join_response response)
840{
841 int ret;
842
843 if (response == JOIN_DISALLOW) {
844 mlog(0, "Latest response of disallow -- should restart\n");
845 return 1;
846 }
847
848 spin_lock(&dlm->spinlock);
849 /* For now, we restart the process if the node maps have
850 * changed at all */
851 ret = memcmp(ctxt->live_map, dlm->live_nodes_map,
852 sizeof(dlm->live_nodes_map));
853 spin_unlock(&dlm->spinlock);
854
855 if (ret)
856 mlog(0, "Node maps changed -- should restart\n");
857
858 return ret;
859}
860
861static int dlm_try_to_join_domain(struct dlm_ctxt *dlm)
862{
863 int status = 0, tmpstat, node;
864 struct domain_join_ctxt *ctxt;
865 enum dlm_query_join_response response;
866
867 mlog_entry("%p", dlm);
868
869 ctxt = kcalloc(1, sizeof(*ctxt), GFP_KERNEL);
870 if (!ctxt) {
871 status = -ENOMEM;
872 mlog_errno(status);
873 goto bail;
874 }
875
876 /* group sem locking should work for us here -- we're already
877 * registered for heartbeat events so filling this should be
878 * atomic wrt getting those handlers called. */
879 o2hb_fill_node_map(dlm->live_nodes_map, sizeof(dlm->live_nodes_map));
880
881 spin_lock(&dlm->spinlock);
882 memcpy(ctxt->live_map, dlm->live_nodes_map, sizeof(ctxt->live_map));
883
884 __dlm_set_joining_node(dlm, dlm->node_num);
885
886 spin_unlock(&dlm->spinlock);
887
888 node = -1;
889 while ((node = find_next_bit(ctxt->live_map, O2NM_MAX_NODES,
890 node + 1)) < O2NM_MAX_NODES) {
891 if (node == dlm->node_num)
892 continue;
893
894 status = dlm_request_join(dlm, node, &response);
895 if (status < 0) {
896 mlog_errno(status);
897 goto bail;
898 }
899
900 /* Ok, either we got a response or the node doesn't have a
901 * dlm up. */
902 if (response == JOIN_OK)
903 set_bit(node, ctxt->yes_resp_map);
904
905 if (dlm_should_restart_join(dlm, ctxt, response)) {
906 status = -EAGAIN;
907 goto bail;
908 }
909 }
910
911 mlog(0, "Yay, done querying nodes!\n");
912
913 /* Yay, everyone agree's we can join the domain. My domain is
914 * comprised of all nodes who were put in the
915 * yes_resp_map. Copy that into our domain map and send a join
916 * assert message to clean up everyone elses state. */
917 spin_lock(&dlm->spinlock);
918 memcpy(dlm->domain_map, ctxt->yes_resp_map,
919 sizeof(ctxt->yes_resp_map));
920 set_bit(dlm->node_num, dlm->domain_map);
921 spin_unlock(&dlm->spinlock);
922
923 dlm_send_join_asserts(dlm, ctxt->yes_resp_map);
924
925 /* Joined state *must* be set before the joining node
926 * information, otherwise the query_join handler may read no
927 * current joiner but a state of NEW and tell joining nodes
928 * we're not in the domain. */
929 spin_lock(&dlm_domain_lock);
930 dlm->dlm_state = DLM_CTXT_JOINED;
931 dlm->num_joins++;
932 spin_unlock(&dlm_domain_lock);
933
934bail:
935 spin_lock(&dlm->spinlock);
936 __dlm_set_joining_node(dlm, DLM_LOCK_RES_OWNER_UNKNOWN);
937 if (!status)
938 __dlm_print_nodes(dlm);
939 spin_unlock(&dlm->spinlock);
940
941 if (ctxt) {
942 /* Do we need to send a cancel message to any nodes? */
943 if (status < 0) {
944 tmpstat = dlm_send_join_cancels(dlm,
945 ctxt->yes_resp_map,
946 sizeof(ctxt->yes_resp_map));
947 if (tmpstat < 0)
948 mlog_errno(tmpstat);
949 }
950 kfree(ctxt);
951 }
952
953 mlog(0, "returning %d\n", status);
954 return status;
955}
956
957static void dlm_unregister_domain_handlers(struct dlm_ctxt *dlm)
958{
959 o2hb_unregister_callback(&dlm->dlm_hb_up);
960 o2hb_unregister_callback(&dlm->dlm_hb_down);
961 o2net_unregister_handler_list(&dlm->dlm_domain_handlers);
962}
963
964static int dlm_register_domain_handlers(struct dlm_ctxt *dlm)
965{
966 int status;
967
968 mlog(0, "registering handlers.\n");
969
970 o2hb_setup_callback(&dlm->dlm_hb_down, O2HB_NODE_DOWN_CB,
971 dlm_hb_node_down_cb, dlm, DLM_HB_NODE_DOWN_PRI);
972 status = o2hb_register_callback(&dlm->dlm_hb_down);
973 if (status)
974 goto bail;
975
976 o2hb_setup_callback(&dlm->dlm_hb_up, O2HB_NODE_UP_CB,
977 dlm_hb_node_up_cb, dlm, DLM_HB_NODE_UP_PRI);
978 status = o2hb_register_callback(&dlm->dlm_hb_up);
979 if (status)
980 goto bail;
981
982 status = o2net_register_handler(DLM_MASTER_REQUEST_MSG, dlm->key,
983 sizeof(struct dlm_master_request),
984 dlm_master_request_handler,
985 dlm, &dlm->dlm_domain_handlers);
986 if (status)
987 goto bail;
988
989 status = o2net_register_handler(DLM_ASSERT_MASTER_MSG, dlm->key,
990 sizeof(struct dlm_assert_master),
991 dlm_assert_master_handler,
992 dlm, &dlm->dlm_domain_handlers);
993 if (status)
994 goto bail;
995
996 status = o2net_register_handler(DLM_CREATE_LOCK_MSG, dlm->key,
997 sizeof(struct dlm_create_lock),
998 dlm_create_lock_handler,
999 dlm, &dlm->dlm_domain_handlers);
1000 if (status)
1001 goto bail;
1002
1003 status = o2net_register_handler(DLM_CONVERT_LOCK_MSG, dlm->key,
1004 DLM_CONVERT_LOCK_MAX_LEN,
1005 dlm_convert_lock_handler,
1006 dlm, &dlm->dlm_domain_handlers);
1007 if (status)
1008 goto bail;
1009
1010 status = o2net_register_handler(DLM_UNLOCK_LOCK_MSG, dlm->key,
1011 DLM_UNLOCK_LOCK_MAX_LEN,
1012 dlm_unlock_lock_handler,
1013 dlm, &dlm->dlm_domain_handlers);
1014 if (status)
1015 goto bail;
1016
1017 status = o2net_register_handler(DLM_PROXY_AST_MSG, dlm->key,
1018 DLM_PROXY_AST_MAX_LEN,
1019 dlm_proxy_ast_handler,
1020 dlm, &dlm->dlm_domain_handlers);
1021 if (status)
1022 goto bail;
1023
1024 status = o2net_register_handler(DLM_EXIT_DOMAIN_MSG, dlm->key,
1025 sizeof(struct dlm_exit_domain),
1026 dlm_exit_domain_handler,
1027 dlm, &dlm->dlm_domain_handlers);
1028 if (status)
1029 goto bail;
1030
1031 status = o2net_register_handler(DLM_MIGRATE_REQUEST_MSG, dlm->key,
1032 sizeof(struct dlm_migrate_request),
1033 dlm_migrate_request_handler,
1034 dlm, &dlm->dlm_domain_handlers);
1035 if (status)
1036 goto bail;
1037
1038 status = o2net_register_handler(DLM_MIG_LOCKRES_MSG, dlm->key,
1039 DLM_MIG_LOCKRES_MAX_LEN,
1040 dlm_mig_lockres_handler,
1041 dlm, &dlm->dlm_domain_handlers);
1042 if (status)
1043 goto bail;
1044
1045 status = o2net_register_handler(DLM_MASTER_REQUERY_MSG, dlm->key,
1046 sizeof(struct dlm_master_requery),
1047 dlm_master_requery_handler,
1048 dlm, &dlm->dlm_domain_handlers);
1049 if (status)
1050 goto bail;
1051
1052 status = o2net_register_handler(DLM_LOCK_REQUEST_MSG, dlm->key,
1053 sizeof(struct dlm_lock_request),
1054 dlm_request_all_locks_handler,
1055 dlm, &dlm->dlm_domain_handlers);
1056 if (status)
1057 goto bail;
1058
1059 status = o2net_register_handler(DLM_RECO_DATA_DONE_MSG, dlm->key,
1060 sizeof(struct dlm_reco_data_done),
1061 dlm_reco_data_done_handler,
1062 dlm, &dlm->dlm_domain_handlers);
1063 if (status)
1064 goto bail;
1065
1066 status = o2net_register_handler(DLM_BEGIN_RECO_MSG, dlm->key,
1067 sizeof(struct dlm_begin_reco),
1068 dlm_begin_reco_handler,
1069 dlm, &dlm->dlm_domain_handlers);
1070 if (status)
1071 goto bail;
1072
1073 status = o2net_register_handler(DLM_FINALIZE_RECO_MSG, dlm->key,
1074 sizeof(struct dlm_finalize_reco),
1075 dlm_finalize_reco_handler,
1076 dlm, &dlm->dlm_domain_handlers);
1077 if (status)
1078 goto bail;
1079
1080bail:
1081 if (status)
1082 dlm_unregister_domain_handlers(dlm);
1083
1084 return status;
1085}
1086
1087static int dlm_join_domain(struct dlm_ctxt *dlm)
1088{
1089 int status;
1090
1091 BUG_ON(!dlm);
1092
1093 mlog(0, "Join domain %s\n", dlm->name);
1094
1095 status = dlm_register_domain_handlers(dlm);
1096 if (status) {
1097 mlog_errno(status);
1098 goto bail;
1099 }
1100
1101 status = dlm_launch_thread(dlm);
1102 if (status < 0) {
1103 mlog_errno(status);
1104 goto bail;
1105 }
1106
1107 status = dlm_launch_recovery_thread(dlm);
1108 if (status < 0) {
1109 mlog_errno(status);
1110 goto bail;
1111 }
1112
1113 do {
1114 unsigned int backoff;
1115 status = dlm_try_to_join_domain(dlm);
1116
1117 /* If we're racing another node to the join, then we
1118 * need to back off temporarily and let them
1119 * complete. */
1120 if (status == -EAGAIN) {
1121 if (signal_pending(current)) {
1122 status = -ERESTARTSYS;
1123 goto bail;
1124 }
1125
1126 /*
1127 * <chip> After you!
1128 * <dale> No, after you!
1129 * <chip> I insist!
1130 * <dale> But you first!
1131 * ...
1132 */
1133 backoff = (unsigned int)(jiffies & 0x3);
1134 backoff *= DLM_DOMAIN_BACKOFF_MS;
1135 mlog(0, "backoff %d\n", backoff);
1136 msleep(backoff);
1137 }
1138 } while (status == -EAGAIN);
1139
1140 if (status < 0) {
1141 mlog_errno(status);
1142 goto bail;
1143 }
1144
1145 status = 0;
1146bail:
1147 wake_up(&dlm_domain_events);
1148
1149 if (status) {
1150 dlm_unregister_domain_handlers(dlm);
1151 dlm_complete_thread(dlm);
1152 dlm_complete_recovery_thread(dlm);
1153 }
1154
1155 return status;
1156}
1157
1158static struct dlm_ctxt *dlm_alloc_ctxt(const char *domain,
1159 u32 key)
1160{
1161 int i;
1162 struct dlm_ctxt *dlm = NULL;
1163
1164 dlm = kcalloc(1, sizeof(*dlm), GFP_KERNEL);
1165 if (!dlm) {
1166 mlog_errno(-ENOMEM);
1167 goto leave;
1168 }
1169
1170 dlm->name = kmalloc(strlen(domain) + 1, GFP_KERNEL);
1171 if (dlm->name == NULL) {
1172 mlog_errno(-ENOMEM);
1173 kfree(dlm);
1174 dlm = NULL;
1175 goto leave;
1176 }
1177
1178 dlm->resources = (struct list_head *) __get_free_page(GFP_KERNEL);
1179 if (!dlm->resources) {
1180 mlog_errno(-ENOMEM);
1181 kfree(dlm->name);
1182 kfree(dlm);
1183 dlm = NULL;
1184 goto leave;
1185 }
1186 memset(dlm->resources, 0, PAGE_SIZE);
1187
1188 for (i=0; i<DLM_HASH_SIZE; i++)
1189 INIT_LIST_HEAD(&dlm->resources[i]);
1190
1191 strcpy(dlm->name, domain);
1192 dlm->key = key;
1193 dlm->node_num = o2nm_this_node();
1194
1195 spin_lock_init(&dlm->spinlock);
1196 spin_lock_init(&dlm->master_lock);
1197 spin_lock_init(&dlm->ast_lock);
1198 INIT_LIST_HEAD(&dlm->list);
1199 INIT_LIST_HEAD(&dlm->dirty_list);
1200 INIT_LIST_HEAD(&dlm->reco.resources);
1201 INIT_LIST_HEAD(&dlm->reco.received);
1202 INIT_LIST_HEAD(&dlm->reco.node_data);
1203 INIT_LIST_HEAD(&dlm->purge_list);
1204 INIT_LIST_HEAD(&dlm->dlm_domain_handlers);
1205 dlm->reco.state = 0;
1206
1207 INIT_LIST_HEAD(&dlm->pending_asts);
1208 INIT_LIST_HEAD(&dlm->pending_basts);
1209
1210 mlog(0, "dlm->recovery_map=%p, &(dlm->recovery_map[0])=%p\n",
1211 dlm->recovery_map, &(dlm->recovery_map[0]));
1212
1213 memset(dlm->recovery_map, 0, sizeof(dlm->recovery_map));
1214 memset(dlm->live_nodes_map, 0, sizeof(dlm->live_nodes_map));
1215 memset(dlm->domain_map, 0, sizeof(dlm->domain_map));
1216
1217 dlm->dlm_thread_task = NULL;
1218 dlm->dlm_reco_thread_task = NULL;
1219 init_waitqueue_head(&dlm->dlm_thread_wq);
1220 init_waitqueue_head(&dlm->dlm_reco_thread_wq);
1221 init_waitqueue_head(&dlm->reco.event);
1222 init_waitqueue_head(&dlm->ast_wq);
1223 init_waitqueue_head(&dlm->migration_wq);
1224 INIT_LIST_HEAD(&dlm->master_list);
1225 INIT_LIST_HEAD(&dlm->mle_hb_events);
1226
1227 dlm->joining_node = DLM_LOCK_RES_OWNER_UNKNOWN;
1228 init_waitqueue_head(&dlm->dlm_join_events);
1229
1230 dlm->reco.new_master = O2NM_INVALID_NODE_NUM;
1231 dlm->reco.dead_node = O2NM_INVALID_NODE_NUM;
1232 atomic_set(&dlm->local_resources, 0);
1233 atomic_set(&dlm->remote_resources, 0);
1234 atomic_set(&dlm->unknown_resources, 0);
1235
1236 spin_lock_init(&dlm->work_lock);
1237 INIT_LIST_HEAD(&dlm->work_list);
1238 INIT_WORK(&dlm->dispatched_work, dlm_dispatch_work, dlm);
1239
1240 kref_init(&dlm->dlm_refs);
1241 dlm->dlm_state = DLM_CTXT_NEW;
1242
1243 INIT_LIST_HEAD(&dlm->dlm_eviction_callbacks);
1244
1245 mlog(0, "context init: refcount %u\n",
1246 atomic_read(&dlm->dlm_refs.refcount));
1247
1248leave:
1249 return dlm;
1250}
1251
1252/*
1253 * dlm_register_domain: one-time setup per "domain"
1254 */
1255struct dlm_ctxt * dlm_register_domain(const char *domain,
1256 u32 key)
1257{
1258 int ret;
1259 struct dlm_ctxt *dlm = NULL;
1260 struct dlm_ctxt *new_ctxt = NULL;
1261
1262 if (strlen(domain) > O2NM_MAX_NAME_LEN) {
1263 ret = -ENAMETOOLONG;
1264 mlog(ML_ERROR, "domain name length too long\n");
1265 goto leave;
1266 }
1267
1268 if (!o2hb_check_local_node_heartbeating()) {
1269 mlog(ML_ERROR, "the local node has not been configured, or is "
1270 "not heartbeating\n");
1271 ret = -EPROTO;
1272 goto leave;
1273 }
1274
1275 mlog(0, "register called for domain \"%s\"\n", domain);
1276
1277retry:
1278 dlm = NULL;
1279 if (signal_pending(current)) {
1280 ret = -ERESTARTSYS;
1281 mlog_errno(ret);
1282 goto leave;
1283 }
1284
1285 spin_lock(&dlm_domain_lock);
1286
1287 dlm = __dlm_lookup_domain(domain);
1288 if (dlm) {
1289 if (dlm->dlm_state != DLM_CTXT_JOINED) {
1290 spin_unlock(&dlm_domain_lock);
1291
1292 mlog(0, "This ctxt is not joined yet!\n");
1293 wait_event_interruptible(dlm_domain_events,
1294 dlm_wait_on_domain_helper(
1295 domain));
1296 goto retry;
1297 }
1298
1299 __dlm_get(dlm);
1300 dlm->num_joins++;
1301
1302 spin_unlock(&dlm_domain_lock);
1303
1304 ret = 0;
1305 goto leave;
1306 }
1307
1308 /* doesn't exist */
1309 if (!new_ctxt) {
1310 spin_unlock(&dlm_domain_lock);
1311
1312 new_ctxt = dlm_alloc_ctxt(domain, key);
1313 if (new_ctxt)
1314 goto retry;
1315
1316 ret = -ENOMEM;
1317 mlog_errno(ret);
1318 goto leave;
1319 }
1320
1321 /* a little variable switch-a-roo here... */
1322 dlm = new_ctxt;
1323 new_ctxt = NULL;
1324
1325 /* add the new domain */
1326 list_add_tail(&dlm->list, &dlm_domains);
1327 spin_unlock(&dlm_domain_lock);
1328
1329 ret = dlm_join_domain(dlm);
1330 if (ret) {
1331 mlog_errno(ret);
1332 dlm_put(dlm);
1333 goto leave;
1334 }
1335
1336 ret = 0;
1337leave:
1338 if (new_ctxt)
1339 dlm_free_ctxt_mem(new_ctxt);
1340
1341 if (ret < 0)
1342 dlm = ERR_PTR(ret);
1343
1344 return dlm;
1345}
1346EXPORT_SYMBOL_GPL(dlm_register_domain);
1347
1348static LIST_HEAD(dlm_join_handlers);
1349
1350static void dlm_unregister_net_handlers(void)
1351{
1352 o2net_unregister_handler_list(&dlm_join_handlers);
1353}
1354
1355static int dlm_register_net_handlers(void)
1356{
1357 int status = 0;
1358
1359 status = o2net_register_handler(DLM_QUERY_JOIN_MSG, DLM_MOD_KEY,
1360 sizeof(struct dlm_query_join_request),
1361 dlm_query_join_handler,
1362 NULL, &dlm_join_handlers);
1363 if (status)
1364 goto bail;
1365
1366 status = o2net_register_handler(DLM_ASSERT_JOINED_MSG, DLM_MOD_KEY,
1367 sizeof(struct dlm_assert_joined),
1368 dlm_assert_joined_handler,
1369 NULL, &dlm_join_handlers);
1370 if (status)
1371 goto bail;
1372
1373 status = o2net_register_handler(DLM_CANCEL_JOIN_MSG, DLM_MOD_KEY,
1374 sizeof(struct dlm_cancel_join),
1375 dlm_cancel_join_handler,
1376 NULL, &dlm_join_handlers);
1377
1378bail:
1379 if (status < 0)
1380 dlm_unregister_net_handlers();
1381
1382 return status;
1383}
1384
1385/* Domain eviction callback handling.
1386 *
1387 * The file system requires notification of node death *before* the
1388 * dlm completes its recovery work, otherwise it may be able to
1389 * acquire locks on resources requiring recovery. Since the dlm can
1390 * evict a node from its domain *before* heartbeat fires, a similar
1391 * mechanism is required. */
1392
1393/* Eviction is not expected to happen often, so a per-domain lock is
1394 * not necessary. Eviction callbacks are allowed to sleep for short
1395 * periods of time. */
1396static DECLARE_RWSEM(dlm_callback_sem);
1397
1398void dlm_fire_domain_eviction_callbacks(struct dlm_ctxt *dlm,
1399 int node_num)
1400{
1401 struct list_head *iter;
1402 struct dlm_eviction_cb *cb;
1403
1404 down_read(&dlm_callback_sem);
1405 list_for_each(iter, &dlm->dlm_eviction_callbacks) {
1406 cb = list_entry(iter, struct dlm_eviction_cb, ec_item);
1407
1408 cb->ec_func(node_num, cb->ec_data);
1409 }
1410 up_read(&dlm_callback_sem);
1411}
1412
1413void dlm_setup_eviction_cb(struct dlm_eviction_cb *cb,
1414 dlm_eviction_func *f,
1415 void *data)
1416{
1417 INIT_LIST_HEAD(&cb->ec_item);
1418 cb->ec_func = f;
1419 cb->ec_data = data;
1420}
1421EXPORT_SYMBOL_GPL(dlm_setup_eviction_cb);
1422
1423void dlm_register_eviction_cb(struct dlm_ctxt *dlm,
1424 struct dlm_eviction_cb *cb)
1425{
1426 down_write(&dlm_callback_sem);
1427 list_add_tail(&cb->ec_item, &dlm->dlm_eviction_callbacks);
1428 up_write(&dlm_callback_sem);
1429}
1430EXPORT_SYMBOL_GPL(dlm_register_eviction_cb);
1431
1432void dlm_unregister_eviction_cb(struct dlm_eviction_cb *cb)
1433{
1434 down_write(&dlm_callback_sem);
1435 list_del_init(&cb->ec_item);
1436 up_write(&dlm_callback_sem);
1437}
1438EXPORT_SYMBOL_GPL(dlm_unregister_eviction_cb);
1439
1440static int __init dlm_init(void)
1441{
1442 int status;
1443
1444 dlm_print_version();
1445
1446 status = dlm_init_mle_cache();
1447 if (status)
1448 return -1;
1449
1450 status = dlm_register_net_handlers();
1451 if (status) {
1452 dlm_destroy_mle_cache();
1453 return -1;
1454 }
1455
1456 return 0;
1457}
1458
1459static void __exit dlm_exit (void)
1460{
1461 dlm_unregister_net_handlers();
1462 dlm_destroy_mle_cache();
1463}
1464
1465MODULE_AUTHOR("Oracle");
1466MODULE_LICENSE("GPL");
1467
1468module_init(dlm_init);
1469module_exit(dlm_exit);
diff --git a/fs/ocfs2/dlm/dlmdomain.h b/fs/ocfs2/dlm/dlmdomain.h
new file mode 100644
index 000000000000..2f7f60bfeb3b
--- /dev/null
+++ b/fs/ocfs2/dlm/dlmdomain.h
@@ -0,0 +1,36 @@
1/* -*- mode: c; c-basic-offset: 8; -*-
2 * vim: noexpandtab sw=8 ts=8 sts=0:
3 *
4 * dlmdomain.h
5 *
6 * Copyright (C) 2004 Oracle. All rights reserved.
7 *
8 * This program is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU General Public
10 * License as published by the Free Software Foundation; either
11 * version 2 of the License, or (at your option) any later version.
12 *
13 * This program is distributed in the hope that it will be useful,
14 * but WITHOUT ANY WARRANTY; without even the implied warranty of
15 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16 * General Public License for more details.
17 *
18 * You should have received a copy of the GNU General Public
19 * License along with this program; if not, write to the
20 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
21 * Boston, MA 02111-1307, USA.
22 *
23 */
24
25#ifndef DLMDOMAIN_H
26#define DLMDOMAIN_H
27
28extern spinlock_t dlm_domain_lock;
29extern struct list_head dlm_domains;
30
31int dlm_joined(struct dlm_ctxt *dlm);
32int dlm_shutting_down(struct dlm_ctxt *dlm);
33void dlm_fire_domain_eviction_callbacks(struct dlm_ctxt *dlm,
34 int node_num);
35
36#endif
diff --git a/fs/ocfs2/dlm/dlmfs.c b/fs/ocfs2/dlm/dlmfs.c
new file mode 100644
index 000000000000..dd2d24dc25e0
--- /dev/null
+++ b/fs/ocfs2/dlm/dlmfs.c
@@ -0,0 +1,640 @@
1/* -*- mode: c; c-basic-offset: 8; -*-
2 * vim: noexpandtab sw=8 ts=8 sts=0:
3 *
4 * dlmfs.c
5 *
6 * Code which implements the kernel side of a minimal userspace
7 * interface to our DLM. This file handles the virtual file system
8 * used for communication with userspace. Credit should go to ramfs,
9 * which was a template for the fs side of this module.
10 *
11 * Copyright (C) 2003, 2004 Oracle. All rights reserved.
12 *
13 * This program is free software; you can redistribute it and/or
14 * modify it under the terms of the GNU General Public
15 * License as published by the Free Software Foundation; either
16 * version 2 of the License, or (at your option) any later version.
17 *
18 * This program is distributed in the hope that it will be useful,
19 * but WITHOUT ANY WARRANTY; without even the implied warranty of
20 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
21 * General Public License for more details.
22 *
23 * You should have received a copy of the GNU General Public
24 * License along with this program; if not, write to the
25 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
26 * Boston, MA 02111-1307, USA.
27 */
28
29/* Simple VFS hooks based on: */
30/*
31 * Resizable simple ram filesystem for Linux.
32 *
33 * Copyright (C) 2000 Linus Torvalds.
34 * 2000 Transmeta Corp.
35 */
36
37#include <linux/module.h>
38#include <linux/fs.h>
39#include <linux/pagemap.h>
40#include <linux/types.h>
41#include <linux/slab.h>
42#include <linux/highmem.h>
43#include <linux/init.h>
44#include <linux/string.h>
45#include <linux/smp_lock.h>
46#include <linux/backing-dev.h>
47
48#include <asm/uaccess.h>
49
50
51#include "cluster/nodemanager.h"
52#include "cluster/heartbeat.h"
53#include "cluster/tcp.h"
54
55#include "dlmapi.h"
56
57#include "userdlm.h"
58
59#include "dlmfsver.h"
60
61#define MLOG_MASK_PREFIX ML_DLMFS
62#include "cluster/masklog.h"
63
64static struct super_operations dlmfs_ops;
65static struct file_operations dlmfs_file_operations;
66static struct inode_operations dlmfs_dir_inode_operations;
67static struct inode_operations dlmfs_root_inode_operations;
68static struct inode_operations dlmfs_file_inode_operations;
69static kmem_cache_t *dlmfs_inode_cache;
70
71struct workqueue_struct *user_dlm_worker;
72
73/*
74 * decodes a set of open flags into a valid lock level and a set of flags.
75 * returns < 0 if we have invalid flags
76 * flags which mean something to us:
77 * O_RDONLY -> PRMODE level
78 * O_WRONLY -> EXMODE level
79 *
80 * O_NONBLOCK -> LKM_NOQUEUE
81 */
82static int dlmfs_decode_open_flags(int open_flags,
83 int *level,
84 int *flags)
85{
86 if (open_flags & (O_WRONLY|O_RDWR))
87 *level = LKM_EXMODE;
88 else
89 *level = LKM_PRMODE;
90
91 *flags = 0;
92 if (open_flags & O_NONBLOCK)
93 *flags |= LKM_NOQUEUE;
94
95 return 0;
96}
97
98static int dlmfs_file_open(struct inode *inode,
99 struct file *file)
100{
101 int status, level, flags;
102 struct dlmfs_filp_private *fp = NULL;
103 struct dlmfs_inode_private *ip;
104
105 if (S_ISDIR(inode->i_mode))
106 BUG();
107
108 mlog(0, "open called on inode %lu, flags 0x%x\n", inode->i_ino,
109 file->f_flags);
110
111 status = dlmfs_decode_open_flags(file->f_flags, &level, &flags);
112 if (status < 0)
113 goto bail;
114
115 /* We don't want to honor O_APPEND at read/write time as it
116 * doesn't make sense for LVB writes. */
117 file->f_flags &= ~O_APPEND;
118
119 fp = kmalloc(sizeof(*fp), GFP_KERNEL);
120 if (!fp) {
121 status = -ENOMEM;
122 goto bail;
123 }
124 fp->fp_lock_level = level;
125
126 ip = DLMFS_I(inode);
127
128 status = user_dlm_cluster_lock(&ip->ip_lockres, level, flags);
129 if (status < 0) {
130 /* this is a strange error to return here but I want
131 * to be able userspace to be able to distinguish a
132 * valid lock request from one that simply couldn't be
133 * granted. */
134 if (flags & LKM_NOQUEUE && status == -EAGAIN)
135 status = -ETXTBSY;
136 kfree(fp);
137 goto bail;
138 }
139
140 file->private_data = fp;
141bail:
142 return status;
143}
144
145static int dlmfs_file_release(struct inode *inode,
146 struct file *file)
147{
148 int level, status;
149 struct dlmfs_inode_private *ip = DLMFS_I(inode);
150 struct dlmfs_filp_private *fp =
151 (struct dlmfs_filp_private *) file->private_data;
152
153 if (S_ISDIR(inode->i_mode))
154 BUG();
155
156 mlog(0, "close called on inode %lu\n", inode->i_ino);
157
158 status = 0;
159 if (fp) {
160 level = fp->fp_lock_level;
161 if (level != LKM_IVMODE)
162 user_dlm_cluster_unlock(&ip->ip_lockres, level);
163
164 kfree(fp);
165 file->private_data = NULL;
166 }
167
168 return 0;
169}
170
171static ssize_t dlmfs_file_read(struct file *filp,
172 char __user *buf,
173 size_t count,
174 loff_t *ppos)
175{
176 int bytes_left;
177 ssize_t readlen;
178 char *lvb_buf;
179 struct inode *inode = filp->f_dentry->d_inode;
180
181 mlog(0, "inode %lu, count = %zu, *ppos = %llu\n",
182 inode->i_ino, count, *ppos);
183
184 if (*ppos >= i_size_read(inode))
185 return 0;
186
187 if (!count)
188 return 0;
189
190 if (!access_ok(VERIFY_WRITE, buf, count))
191 return -EFAULT;
192
193 /* don't read past the lvb */
194 if ((count + *ppos) > i_size_read(inode))
195 readlen = i_size_read(inode) - *ppos;
196 else
197 readlen = count - *ppos;
198
199 lvb_buf = kmalloc(readlen, GFP_KERNEL);
200 if (!lvb_buf)
201 return -ENOMEM;
202
203 user_dlm_read_lvb(inode, lvb_buf, readlen);
204 bytes_left = __copy_to_user(buf, lvb_buf, readlen);
205 readlen -= bytes_left;
206
207 kfree(lvb_buf);
208
209 *ppos = *ppos + readlen;
210
211 mlog(0, "read %zd bytes\n", readlen);
212 return readlen;
213}
214
215static ssize_t dlmfs_file_write(struct file *filp,
216 const char __user *buf,
217 size_t count,
218 loff_t *ppos)
219{
220 int bytes_left;
221 ssize_t writelen;
222 char *lvb_buf;
223 struct inode *inode = filp->f_dentry->d_inode;
224
225 mlog(0, "inode %lu, count = %zu, *ppos = %llu\n",
226 inode->i_ino, count, *ppos);
227
228 if (*ppos >= i_size_read(inode))
229 return -ENOSPC;
230
231 if (!count)
232 return 0;
233
234 if (!access_ok(VERIFY_READ, buf, count))
235 return -EFAULT;
236
237 /* don't write past the lvb */
238 if ((count + *ppos) > i_size_read(inode))
239 writelen = i_size_read(inode) - *ppos;
240 else
241 writelen = count - *ppos;
242
243 lvb_buf = kmalloc(writelen, GFP_KERNEL);
244 if (!lvb_buf)
245 return -ENOMEM;
246
247 bytes_left = copy_from_user(lvb_buf, buf, writelen);
248 writelen -= bytes_left;
249 if (writelen)
250 user_dlm_write_lvb(inode, lvb_buf, writelen);
251
252 kfree(lvb_buf);
253
254 *ppos = *ppos + writelen;
255 mlog(0, "wrote %zd bytes\n", writelen);
256 return writelen;
257}
258
259static void dlmfs_init_once(void *foo,
260 kmem_cache_t *cachep,
261 unsigned long flags)
262{
263 struct dlmfs_inode_private *ip =
264 (struct dlmfs_inode_private *) foo;
265
266 if ((flags & (SLAB_CTOR_VERIFY|SLAB_CTOR_CONSTRUCTOR)) ==
267 SLAB_CTOR_CONSTRUCTOR) {
268 ip->ip_dlm = NULL;
269 ip->ip_parent = NULL;
270
271 inode_init_once(&ip->ip_vfs_inode);
272 }
273}
274
275static struct inode *dlmfs_alloc_inode(struct super_block *sb)
276{
277 struct dlmfs_inode_private *ip;
278
279 ip = kmem_cache_alloc(dlmfs_inode_cache, SLAB_NOFS);
280 if (!ip)
281 return NULL;
282
283 return &ip->ip_vfs_inode;
284}
285
286static void dlmfs_destroy_inode(struct inode *inode)
287{
288 kmem_cache_free(dlmfs_inode_cache, DLMFS_I(inode));
289}
290
291static void dlmfs_clear_inode(struct inode *inode)
292{
293 int status;
294 struct dlmfs_inode_private *ip;
295
296 if (!inode)
297 return;
298
299 mlog(0, "inode %lu\n", inode->i_ino);
300
301 ip = DLMFS_I(inode);
302
303 if (S_ISREG(inode->i_mode)) {
304 status = user_dlm_destroy_lock(&ip->ip_lockres);
305 if (status < 0)
306 mlog_errno(status);
307 iput(ip->ip_parent);
308 goto clear_fields;
309 }
310
311 mlog(0, "we're a directory, ip->ip_dlm = 0x%p\n", ip->ip_dlm);
312 /* we must be a directory. If required, lets unregister the
313 * dlm context now. */
314 if (ip->ip_dlm)
315 user_dlm_unregister_context(ip->ip_dlm);
316clear_fields:
317 ip->ip_parent = NULL;
318 ip->ip_dlm = NULL;
319}
320
321static struct backing_dev_info dlmfs_backing_dev_info = {
322 .ra_pages = 0, /* No readahead */
323 .capabilities = BDI_CAP_NO_ACCT_DIRTY | BDI_CAP_NO_WRITEBACK,
324};
325
326static struct inode *dlmfs_get_root_inode(struct super_block *sb)
327{
328 struct inode *inode = new_inode(sb);
329 int mode = S_IFDIR | 0755;
330 struct dlmfs_inode_private *ip;
331
332 if (inode) {
333 ip = DLMFS_I(inode);
334
335 inode->i_mode = mode;
336 inode->i_uid = current->fsuid;
337 inode->i_gid = current->fsgid;
338 inode->i_blksize = PAGE_CACHE_SIZE;
339 inode->i_blocks = 0;
340 inode->i_mapping->backing_dev_info = &dlmfs_backing_dev_info;
341 inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
342 inode->i_nlink++;
343
344 inode->i_fop = &simple_dir_operations;
345 inode->i_op = &dlmfs_root_inode_operations;
346 }
347
348 return inode;
349}
350
351static struct inode *dlmfs_get_inode(struct inode *parent,
352 struct dentry *dentry,
353 int mode)
354{
355 struct super_block *sb = parent->i_sb;
356 struct inode * inode = new_inode(sb);
357 struct dlmfs_inode_private *ip;
358
359 if (!inode)
360 return NULL;
361
362 inode->i_mode = mode;
363 inode->i_uid = current->fsuid;
364 inode->i_gid = current->fsgid;
365 inode->i_blksize = PAGE_CACHE_SIZE;
366 inode->i_blocks = 0;
367 inode->i_mapping->backing_dev_info = &dlmfs_backing_dev_info;
368 inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
369
370 ip = DLMFS_I(inode);
371 ip->ip_dlm = DLMFS_I(parent)->ip_dlm;
372
373 switch (mode & S_IFMT) {
374 default:
375 /* for now we don't support anything other than
376 * directories and regular files. */
377 BUG();
378 break;
379 case S_IFREG:
380 inode->i_op = &dlmfs_file_inode_operations;
381 inode->i_fop = &dlmfs_file_operations;
382
383 i_size_write(inode, DLM_LVB_LEN);
384
385 user_dlm_lock_res_init(&ip->ip_lockres, dentry);
386
387 /* released at clear_inode time, this insures that we
388 * get to drop the dlm reference on each lock *before*
389 * we call the unregister code for releasing parent
390 * directories. */
391 ip->ip_parent = igrab(parent);
392 BUG_ON(!ip->ip_parent);
393 break;
394 case S_IFDIR:
395 inode->i_op = &dlmfs_dir_inode_operations;
396 inode->i_fop = &simple_dir_operations;
397
398 /* directory inodes start off with i_nlink ==
399 * 2 (for "." entry) */
400 inode->i_nlink++;
401 break;
402 }
403
404 if (parent->i_mode & S_ISGID) {
405 inode->i_gid = parent->i_gid;
406 if (S_ISDIR(mode))
407 inode->i_mode |= S_ISGID;
408 }
409
410 return inode;
411}
412
413/*
414 * File creation. Allocate an inode, and we're done..
415 */
416/* SMP-safe */
417static int dlmfs_mkdir(struct inode * dir,
418 struct dentry * dentry,
419 int mode)
420{
421 int status;
422 struct inode *inode = NULL;
423 struct qstr *domain = &dentry->d_name;
424 struct dlmfs_inode_private *ip;
425 struct dlm_ctxt *dlm;
426
427 mlog(0, "mkdir %.*s\n", domain->len, domain->name);
428
429 /* verify that we have a proper domain */
430 if (domain->len >= O2NM_MAX_NAME_LEN) {
431 status = -EINVAL;
432 mlog(ML_ERROR, "invalid domain name for directory.\n");
433 goto bail;
434 }
435
436 inode = dlmfs_get_inode(dir, dentry, mode | S_IFDIR);
437 if (!inode) {
438 status = -ENOMEM;
439 mlog_errno(status);
440 goto bail;
441 }
442
443 ip = DLMFS_I(inode);
444
445 dlm = user_dlm_register_context(domain);
446 if (IS_ERR(dlm)) {
447 status = PTR_ERR(dlm);
448 mlog(ML_ERROR, "Error %d could not register domain \"%.*s\"\n",
449 status, domain->len, domain->name);
450 goto bail;
451 }
452 ip->ip_dlm = dlm;
453
454 dir->i_nlink++;
455 d_instantiate(dentry, inode);
456 dget(dentry); /* Extra count - pin the dentry in core */
457
458 status = 0;
459bail:
460 if (status < 0)
461 iput(inode);
462 return status;
463}
464
465static int dlmfs_create(struct inode *dir,
466 struct dentry *dentry,
467 int mode,
468 struct nameidata *nd)
469{
470 int status = 0;
471 struct inode *inode;
472 struct qstr *name = &dentry->d_name;
473
474 mlog(0, "create %.*s\n", name->len, name->name);
475
476 /* verify name is valid and doesn't contain any dlm reserved
477 * characters */
478 if (name->len >= USER_DLM_LOCK_ID_MAX_LEN ||
479 name->name[0] == '$') {
480 status = -EINVAL;
481 mlog(ML_ERROR, "invalid lock name, %.*s\n", name->len,
482 name->name);
483 goto bail;
484 }
485
486 inode = dlmfs_get_inode(dir, dentry, mode | S_IFREG);
487 if (!inode) {
488 status = -ENOMEM;
489 mlog_errno(status);
490 goto bail;
491 }
492
493 d_instantiate(dentry, inode);
494 dget(dentry); /* Extra count - pin the dentry in core */
495bail:
496 return status;
497}
498
499static int dlmfs_unlink(struct inode *dir,
500 struct dentry *dentry)
501{
502 int status;
503 struct inode *inode = dentry->d_inode;
504
505 mlog(0, "unlink inode %lu\n", inode->i_ino);
506
507 /* if there are no current holders, or none that are waiting
508 * to acquire a lock, this basically destroys our lockres. */
509 status = user_dlm_destroy_lock(&DLMFS_I(inode)->ip_lockres);
510 if (status < 0) {
511 mlog(ML_ERROR, "unlink %.*s, error %d from destroy\n",
512 dentry->d_name.len, dentry->d_name.name, status);
513 goto bail;
514 }
515 status = simple_unlink(dir, dentry);
516bail:
517 return status;
518}
519
520static int dlmfs_fill_super(struct super_block * sb,
521 void * data,
522 int silent)
523{
524 struct inode * inode;
525 struct dentry * root;
526
527 sb->s_maxbytes = MAX_LFS_FILESIZE;
528 sb->s_blocksize = PAGE_CACHE_SIZE;
529 sb->s_blocksize_bits = PAGE_CACHE_SHIFT;
530 sb->s_magic = DLMFS_MAGIC;
531 sb->s_op = &dlmfs_ops;
532 inode = dlmfs_get_root_inode(sb);
533 if (!inode)
534 return -ENOMEM;
535
536 root = d_alloc_root(inode);
537 if (!root) {
538 iput(inode);
539 return -ENOMEM;
540 }
541 sb->s_root = root;
542 return 0;
543}
544
545static struct file_operations dlmfs_file_operations = {
546 .open = dlmfs_file_open,
547 .release = dlmfs_file_release,
548 .read = dlmfs_file_read,
549 .write = dlmfs_file_write,
550};
551
552static struct inode_operations dlmfs_dir_inode_operations = {
553 .create = dlmfs_create,
554 .lookup = simple_lookup,
555 .unlink = dlmfs_unlink,
556};
557
558/* this way we can restrict mkdir to only the toplevel of the fs. */
559static struct inode_operations dlmfs_root_inode_operations = {
560 .lookup = simple_lookup,
561 .mkdir = dlmfs_mkdir,
562 .rmdir = simple_rmdir,
563};
564
565static struct super_operations dlmfs_ops = {
566 .statfs = simple_statfs,
567 .alloc_inode = dlmfs_alloc_inode,
568 .destroy_inode = dlmfs_destroy_inode,
569 .clear_inode = dlmfs_clear_inode,
570 .drop_inode = generic_delete_inode,
571};
572
573static struct inode_operations dlmfs_file_inode_operations = {
574 .getattr = simple_getattr,
575};
576
577static struct super_block *dlmfs_get_sb(struct file_system_type *fs_type,
578 int flags, const char *dev_name, void *data)
579{
580 return get_sb_nodev(fs_type, flags, data, dlmfs_fill_super);
581}
582
583static struct file_system_type dlmfs_fs_type = {
584 .owner = THIS_MODULE,
585 .name = "ocfs2_dlmfs",
586 .get_sb = dlmfs_get_sb,
587 .kill_sb = kill_litter_super,
588};
589
590static int __init init_dlmfs_fs(void)
591{
592 int status;
593 int cleanup_inode = 0, cleanup_worker = 0;
594
595 dlmfs_print_version();
596
597 dlmfs_inode_cache = kmem_cache_create("dlmfs_inode_cache",
598 sizeof(struct dlmfs_inode_private),
599 0, SLAB_HWCACHE_ALIGN|SLAB_RECLAIM_ACCOUNT,
600 dlmfs_init_once, NULL);
601 if (!dlmfs_inode_cache)
602 return -ENOMEM;
603 cleanup_inode = 1;
604
605 user_dlm_worker = create_singlethread_workqueue("user_dlm");
606 if (!user_dlm_worker) {
607 status = -ENOMEM;
608 goto bail;
609 }
610 cleanup_worker = 1;
611
612 status = register_filesystem(&dlmfs_fs_type);
613bail:
614 if (status) {
615 if (cleanup_inode)
616 kmem_cache_destroy(dlmfs_inode_cache);
617 if (cleanup_worker)
618 destroy_workqueue(user_dlm_worker);
619 } else
620 printk("OCFS2 User DLM kernel interface loaded\n");
621 return status;
622}
623
624static void __exit exit_dlmfs_fs(void)
625{
626 unregister_filesystem(&dlmfs_fs_type);
627
628 flush_workqueue(user_dlm_worker);
629 destroy_workqueue(user_dlm_worker);
630
631 if (kmem_cache_destroy(dlmfs_inode_cache))
632 printk(KERN_INFO "dlmfs_inode_cache: not all structures "
633 "were freed\n");
634}
635
636MODULE_AUTHOR("Oracle");
637MODULE_LICENSE("GPL");
638
639module_init(init_dlmfs_fs)
640module_exit(exit_dlmfs_fs)
diff --git a/fs/ocfs2/dlm/dlmfsver.c b/fs/ocfs2/dlm/dlmfsver.c
new file mode 100644
index 000000000000..d2be3ad841f9
--- /dev/null
+++ b/fs/ocfs2/dlm/dlmfsver.c
@@ -0,0 +1,42 @@
1/* -*- mode: c; c-basic-offset: 8; -*-
2 * vim: noexpandtab sw=8 ts=8 sts=0:
3 *
4 * dlmfsver.c
5 *
6 * version string
7 *
8 * Copyright (C) 2002, 2005 Oracle. All rights reserved.
9 *
10 * This program is free software; you can redistribute it and/or
11 * modify it under the terms of the GNU General Public
12 * License as published by the Free Software Foundation; either
13 * version 2 of the License, or (at your option) any later version.
14 *
15 * This program is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
18 * General Public License for more details.
19 *
20 * You should have received a copy of the GNU General Public
21 * License along with this program; if not, write to the
22 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
23 * Boston, MA 02111-1307, USA.
24 */
25
26#include <linux/module.h>
27#include <linux/kernel.h>
28
29#include "dlmfsver.h"
30
31#define DLM_BUILD_VERSION "1.3.3"
32
33#define VERSION_STR "OCFS2 DLMFS " DLM_BUILD_VERSION
34
35void dlmfs_print_version(void)
36{
37 printk(KERN_INFO "%s\n", VERSION_STR);
38}
39
40MODULE_DESCRIPTION(VERSION_STR);
41
42MODULE_VERSION(DLM_BUILD_VERSION);
diff --git a/fs/ocfs2/dlm/dlmfsver.h b/fs/ocfs2/dlm/dlmfsver.h
new file mode 100644
index 000000000000..f35eadbed25c
--- /dev/null
+++ b/fs/ocfs2/dlm/dlmfsver.h
@@ -0,0 +1,31 @@
1/* -*- mode: c; c-basic-offset: 8; -*-
2 * vim: noexpandtab sw=8 ts=8 sts=0:
3 *
4 * dlmfsver.h
5 *
6 * Function prototypes
7 *
8 * Copyright (C) 2005 Oracle. All rights reserved.
9 *
10 * This program is free software; you can redistribute it and/or
11 * modify it under the terms of the GNU General Public
12 * License as published by the Free Software Foundation; either
13 * version 2 of the License, or (at your option) any later version.
14 *
15 * This program is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
18 * General Public License for more details.
19 *
20 * You should have received a copy of the GNU General Public
21 * License along with this program; if not, write to the
22 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
23 * Boston, MA 02111-1307, USA.
24 */
25
26#ifndef DLMFS_VER_H
27#define DLMFS_VER_H
28
29void dlmfs_print_version(void);
30
31#endif /* DLMFS_VER_H */
diff --git a/fs/ocfs2/dlm/dlmlock.c b/fs/ocfs2/dlm/dlmlock.c
new file mode 100644
index 000000000000..d1a0038557a3
--- /dev/null
+++ b/fs/ocfs2/dlm/dlmlock.c
@@ -0,0 +1,676 @@
1/* -*- mode: c; c-basic-offset: 8; -*-
2 * vim: noexpandtab sw=8 ts=8 sts=0:
3 *
4 * dlmlock.c
5 *
6 * underlying calls for lock creation
7 *
8 * Copyright (C) 2004 Oracle. All rights reserved.
9 *
10 * This program is free software; you can redistribute it and/or
11 * modify it under the terms of the GNU General Public
12 * License as published by the Free Software Foundation; either
13 * version 2 of the License, or (at your option) any later version.
14 *
15 * This program is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
18 * General Public License for more details.
19 *
20 * You should have received a copy of the GNU General Public
21 * License along with this program; if not, write to the
22 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
23 * Boston, MA 02111-1307, USA.
24 *
25 */
26
27
28#include <linux/module.h>
29#include <linux/fs.h>
30#include <linux/types.h>
31#include <linux/slab.h>
32#include <linux/highmem.h>
33#include <linux/utsname.h>
34#include <linux/init.h>
35#include <linux/sysctl.h>
36#include <linux/random.h>
37#include <linux/blkdev.h>
38#include <linux/socket.h>
39#include <linux/inet.h>
40#include <linux/spinlock.h>
41#include <linux/delay.h>
42
43
44#include "cluster/heartbeat.h"
45#include "cluster/nodemanager.h"
46#include "cluster/tcp.h"
47
48#include "dlmapi.h"
49#include "dlmcommon.h"
50
51#include "dlmconvert.h"
52
53#define MLOG_MASK_PREFIX ML_DLM
54#include "cluster/masklog.h"
55
56static spinlock_t dlm_cookie_lock = SPIN_LOCK_UNLOCKED;
57static u64 dlm_next_cookie = 1;
58
59static enum dlm_status dlm_send_remote_lock_request(struct dlm_ctxt *dlm,
60 struct dlm_lock_resource *res,
61 struct dlm_lock *lock, int flags);
62static void dlm_init_lock(struct dlm_lock *newlock, int type,
63 u8 node, u64 cookie);
64static void dlm_lock_release(struct kref *kref);
65static void dlm_lock_detach_lockres(struct dlm_lock *lock);
66
67/* Tell us whether we can grant a new lock request.
68 * locking:
69 * caller needs: res->spinlock
70 * taken: none
71 * held on exit: none
72 * returns: 1 if the lock can be granted, 0 otherwise.
73 */
74static int dlm_can_grant_new_lock(struct dlm_lock_resource *res,
75 struct dlm_lock *lock)
76{
77 struct list_head *iter;
78 struct dlm_lock *tmplock;
79
80 list_for_each(iter, &res->granted) {
81 tmplock = list_entry(iter, struct dlm_lock, list);
82
83 if (!dlm_lock_compatible(tmplock->ml.type, lock->ml.type))
84 return 0;
85 }
86
87 list_for_each(iter, &res->converting) {
88 tmplock = list_entry(iter, struct dlm_lock, list);
89
90 if (!dlm_lock_compatible(tmplock->ml.type, lock->ml.type))
91 return 0;
92 }
93
94 return 1;
95}
96
97/* performs lock creation at the lockres master site
98 * locking:
99 * caller needs: none
100 * taken: takes and drops res->spinlock
101 * held on exit: none
102 * returns: DLM_NORMAL, DLM_NOTQUEUED
103 */
/* Queue a new lock on a lock resource handled by this node: grant it at
 * once when dlm_can_grant_new_lock() allows, otherwise either refuse it
 * (LKM_NOQUEUE) or park it on the blocked queue. */
104static enum dlm_status dlmlock_master(struct dlm_ctxt *dlm,
105 struct dlm_lock_resource *res,
106 struct dlm_lock *lock, int flags)
107{
108 int call_ast = 0, kick_thread = 0;
109 enum dlm_status status = DLM_NORMAL;
110
111 mlog_entry("type=%d\n", lock->ml.type);
112
113 spin_lock(&res->spinlock);
114 /* if called from dlm_create_lock_handler, need to
115 * ensure it will not sleep in dlm_wait_on_lockres */
116 status = __dlm_lockres_state_to_status(res);
117 if (status != DLM_NORMAL &&
118 lock->ml.node != dlm->node_num) {
119 /* erf. state changed after lock was dropped. */
120 spin_unlock(&res->spinlock);
121 dlm_error(status);
122 return status;
123 }
124 __dlm_wait_on_lockres(res);
/* reserve an ast while res->spinlock is held; the reservation is
 * either consumed by dlm_queue_ast() or released below */
125 __dlm_lockres_reserve_ast(res);
126
127 if (dlm_can_grant_new_lock(res, lock)) {
128 mlog(0, "I can grant this lock right away\n");
129 /* got it right away */
130 lock->lksb->status = DLM_NORMAL;
131 status = DLM_NORMAL;
/* the granted queue holds its own reference on the lock */
132 dlm_lock_get(lock);
133 list_add_tail(&lock->list, &res->granted);
134
135 /* for the recovery lock, we can't allow the ast
136 * to be queued since the dlmthread is already
137 * frozen. but the recovery lock is always locked
138 * with LKM_NOQUEUE so we do not need the ast in
139 * this special case */
140 if (!dlm_is_recovery_lock(res->lockname.name,
141 res->lockname.len)) {
142 kick_thread = 1;
143 call_ast = 1;
144 }
145 } else {
146 /* for NOQUEUE request, unless we get the
147 * lock right away, return DLM_NOTQUEUED */
148 if (flags & LKM_NOQUEUE)
149 status = DLM_NOTQUEUED;
150 else {
/* the blocked queue holds its own reference on the lock */
151 dlm_lock_get(lock);
152 list_add_tail(&lock->list, &res->blocked);
153 kick_thread = 1;
154 }
155 }
156
157 spin_unlock(&res->spinlock);
158 wake_up(&res->wq);
159
160 /* either queue the ast or release it */
161 if (call_ast)
162 dlm_queue_ast(dlm, lock);
163 else
164 dlm_lockres_release_ast(dlm, res);
165
166 dlm_lockres_calc_usage(dlm, res);
167 if (kick_thread)
168 dlm_kick_thread(dlm, res);
169
170 return status;
171}
172
/* Undo the local bookkeeping for a lock request that did not succeed:
 * unlink the lock from whatever queue it is on and clear the
 * DLM_LKSB_GET_LVB flag in its lksb.  'res' is not used here.
 * NOTE(review): dlmlock_remote() calls this with res->spinlock held. */
173void dlm_revert_pending_lock(struct dlm_lock_resource *res,
174 struct dlm_lock *lock)
175{
176 /* remove from local queue if it failed */
177 list_del_init(&lock->list);
178 lock->lksb->flags &= ~DLM_LKSB_GET_LVB;
179}
180
181
182/*
183 * locking:
184 * caller needs: none
185 * taken: takes and drops res->spinlock
186 * held on exit: none
187 * returns: DLM_DENIED, DLM_RECOVERING, or net status
188 */
/* Request a new lock on a resource owned by another node.  The lock is
 * placed on the local blocked queue and flagged lock_pending while the
 * create-lock message is outstanding; if the request fails the local
 * queueing is reverted and the queue's reference is dropped. */
189static enum dlm_status dlmlock_remote(struct dlm_ctxt *dlm,
190 struct dlm_lock_resource *res,
191 struct dlm_lock *lock, int flags)
192{
193 enum dlm_status status = DLM_DENIED;
194
195 mlog_entry("type=%d\n", lock->ml.type);
196 mlog(0, "lockres %.*s, flags = 0x%x\n", res->lockname.len,
197 res->lockname.name, flags);
198
199 spin_lock(&res->spinlock);
200
201 /* will exit this call with spinlock held */
202 __dlm_wait_on_lockres(res);
203 res->state |= DLM_LOCK_RES_IN_PROGRESS;
204
205 /* add lock to local (secondary) queue */
206 dlm_lock_get(lock);
207 list_add_tail(&lock->list, &res->blocked);
208 lock->lock_pending = 1;
209 spin_unlock(&res->spinlock);
210
211 /* spec seems to say that you will get DLM_NORMAL when the lock
212 * has been queued, meaning we need to wait for a reply here. */
213 status = dlm_send_remote_lock_request(dlm, res, lock, flags);
214
215 spin_lock(&res->spinlock);
/* request finished (either way): clear the in-progress markers */
216 res->state &= ~DLM_LOCK_RES_IN_PROGRESS;
217 lock->lock_pending = 0;
218 if (status != DLM_NORMAL) {
/* DLM_NOTQUEUED is not logged as an error */
219 if (status != DLM_NOTQUEUED)
220 dlm_error(status);
221 dlm_revert_pending_lock(res, lock);
/* drop the reference taken for the blocked queue above */
222 dlm_lock_put(lock);
223 }
224 spin_unlock(&res->spinlock);
225
226 dlm_lockres_calc_usage(dlm, res);
227
228 wake_up(&res->wq);
229 return status;
230}
231
232
233/* for remote lock creation.
234 * locking:
235 * caller needs: none, but need res->state & DLM_LOCK_RES_IN_PROGRESS
236 * taken: none
237 * held on exit: none
238 * returns: DLM_NOLOCKMGR, or net status
239 */
240static enum dlm_status dlm_send_remote_lock_request(struct dlm_ctxt *dlm,
241 struct dlm_lock_resource *res,
242 struct dlm_lock *lock, int flags)
243{
244 struct dlm_create_lock create;
245 int tmpret, status = 0;
246 enum dlm_status ret;
247
248 mlog_entry_void();
249
250 memset(&create, 0, sizeof(create));
251 create.node_idx = dlm->node_num;
252 create.requested_type = lock->ml.type;
253 create.cookie = lock->ml.cookie;
254 create.namelen = res->lockname.len;
255 create.flags = cpu_to_be32(flags);
256 memcpy(create.name, res->lockname.name, create.namelen);
257
258 tmpret = o2net_send_message(DLM_CREATE_LOCK_MSG, dlm->key, &create,
259 sizeof(create), res->owner, &status);
260 if (tmpret >= 0) {
261 // successfully sent and received
262 ret = status; // this is already a dlm_status
263 } else {
264 mlog_errno(tmpret);
265 if (dlm_is_host_down(tmpret)) {
266 ret = DLM_RECOVERING;
267 mlog(0, "node %u died so returning DLM_RECOVERING "
268 "from lock message!\n", res->owner);
269 } else {
270 ret = dlm_err_to_dlm_status(tmpret);
271 }
272 }
273
274 return ret;
275}
276
/* Take an additional reference on a lock (kref on lock->lock_refs). */
277void dlm_lock_get(struct dlm_lock *lock)
278{
279 kref_get(&lock->lock_refs);
280}
281
/* Drop a reference on a lock; dlm_lock_release() runs when the last
 * reference goes away. */
282void dlm_lock_put(struct dlm_lock *lock)
283{
284 kref_put(&lock->lock_refs, dlm_lock_release);
285}
286
287static void dlm_lock_release(struct kref *kref)
288{
289 struct dlm_lock *lock;
290
291 lock = container_of(kref, struct dlm_lock, lock_refs);
292
293 BUG_ON(!list_empty(&lock->list));
294 BUG_ON(!list_empty(&lock->ast_list));
295 BUG_ON(!list_empty(&lock->bast_list));
296 BUG_ON(lock->ast_pending);
297 BUG_ON(lock->bast_pending);
298
299 dlm_lock_detach_lockres(lock);
300
301 if (lock->lksb_kernel_allocated) {
302 mlog(0, "freeing kernel-allocated lksb\n");
303 kfree(lock->lksb);
304 }
305 kfree(lock);
306}
307
308/* associate a lock with its lockres, getting a ref on the lockres */
/* (dlm_lock_detach_lockres() drops that ref when the lock is released) */
309void dlm_lock_attach_lockres(struct dlm_lock *lock,
310 struct dlm_lock_resource *res)
311{
312 dlm_lockres_get(res);
313 lock->lockres = res;
314}
315
316/* drop ref on lockres, if there is still one associated with lock */
317static void dlm_lock_detach_lockres(struct dlm_lock *lock)
318{
319 struct dlm_lock_resource *res;
320
321 res = lock->lockres;
322 if (res) {
323 lock->lockres = NULL;
324 mlog(0, "removing lock's lockres reference\n");
325 dlm_lockres_put(res);
326 }
327}
328
329static void dlm_init_lock(struct dlm_lock *newlock, int type,
330 u8 node, u64 cookie)
331{
332 INIT_LIST_HEAD(&newlock->list);
333 INIT_LIST_HEAD(&newlock->ast_list);
334 INIT_LIST_HEAD(&newlock->bast_list);
335 spin_lock_init(&newlock->spinlock);
336 newlock->ml.type = type;
337 newlock->ml.convert_type = LKM_IVMODE;
338 newlock->ml.highest_blocked = LKM_IVMODE;
339 newlock->ml.node = node;
340 newlock->ml.pad1 = 0;
341 newlock->ml.list = 0;
342 newlock->ml.flags = 0;
343 newlock->ast = NULL;
344 newlock->bast = NULL;
345 newlock->astdata = NULL;
346 newlock->ml.cookie = cpu_to_be64(cookie);
347 newlock->ast_pending = 0;
348 newlock->bast_pending = 0;
349 newlock->convert_pending = 0;
350 newlock->lock_pending = 0;
351 newlock->unlock_pending = 0;
352 newlock->cancel_pending = 0;
353 newlock->lksb_kernel_allocated = 0;
354
355 kref_init(&newlock->lock_refs);
356}
357
358struct dlm_lock * dlm_new_lock(int type, u8 node, u64 cookie,
359 struct dlm_lockstatus *lksb)
360{
361 struct dlm_lock *lock;
362 int kernel_allocated = 0;
363
364 lock = kcalloc(1, sizeof(*lock), GFP_KERNEL);
365 if (!lock)
366 return NULL;
367
368 if (!lksb) {
369 /* zero memory only if kernel-allocated */
370 lksb = kcalloc(1, sizeof(*lksb), GFP_KERNEL);
371 if (!lksb) {
372 kfree(lock);
373 return NULL;
374 }
375 kernel_allocated = 1;
376 }
377
378 dlm_init_lock(lock, type, node, cookie);
379 if (kernel_allocated)
380 lock->lksb_kernel_allocated = 1;
381 lock->lksb = lksb;
382 lksb->lockid = lock;
383 return lock;
384}
385
386/* handler for lock creation net message
387 * locking:
388 * caller needs: none
389 * taken: takes and drops res->spinlock
390 * held on exit: none
391 * returns: DLM_NORMAL, DLM_SYSERR, DLM_IVLOCKID, DLM_NOTQUEUED
392 */
/* Network handler for a create-lock message: builds a lock (with a
 * kernel-allocated lksb) for the requesting node, looks up the named
 * lock resource and queues the lock via dlmlock_master().  The
 * resulting dlm_status is the handler's return value; DLM_REJECTED is
 * returned if dlm_grab() fails, DLM_IVBUFLEN if the name is too long. */
393int dlm_create_lock_handler(struct o2net_msg *msg, u32 len, void *data)
394{
395 struct dlm_ctxt *dlm = data;
396 struct dlm_create_lock *create = (struct dlm_create_lock *)msg->buf;
397 struct dlm_lock_resource *res = NULL;
398 struct dlm_lock *newlock = NULL;
399 struct dlm_lockstatus *lksb = NULL;
400 enum dlm_status status = DLM_NORMAL;
401 char *name;
402 unsigned int namelen;
403
404 BUG_ON(!dlm);
405
406 mlog_entry_void();
407
408 if (!dlm_grab(dlm))
409 return DLM_REJECTED;
410
411 mlog_bug_on_msg(!dlm_domain_fully_joined(dlm),
412 "Domain %s not fully joined!\n", dlm->name);
413
414 name = create->name;
415 namelen = create->namelen;
416
417 status = DLM_IVBUFLEN;
418 if (namelen > DLM_LOCKID_NAME_MAX) {
419 dlm_error(status);
420 goto leave;
421 }
422
423 status = DLM_SYSERR;
/* NULL lksb: dlm_new_lock() allocates one owned by the lock */
424 newlock = dlm_new_lock(create->requested_type,
425 create->node_idx,
426 be64_to_cpu(create->cookie), NULL);
427 if (!newlock) {
428 dlm_error(status);
429 goto leave;
430 }
431
432 lksb = newlock->lksb;
433
434 if (be32_to_cpu(create->flags) & LKM_GET_LVB) {
435 lksb->flags |= DLM_LKSB_GET_LVB;
436 mlog(0, "set DLM_LKSB_GET_LVB flag\n");
437 }
438
439 status = DLM_IVLOCKID;
440 res = dlm_lookup_lockres(dlm, name, namelen);
441 if (!res) {
442 dlm_error(status);
443 goto leave;
444 }
445
/* sample the resource state under its spinlock before proceeding */
446 spin_lock(&res->spinlock);
447 status = __dlm_lockres_state_to_status(res);
448 spin_unlock(&res->spinlock);
449
450 if (status != DLM_NORMAL) {
451 mlog(0, "lockres recovering/migrating/in-progress\n");
452 goto leave;
453 }
454
455 dlm_lock_attach_lockres(newlock, res);
456
457 status = dlmlock_master(dlm, res, newlock, be32_to_cpu(create->flags));
458leave:
/* on any failure drop the reference obtained from dlm_new_lock() */
459 if (status != DLM_NORMAL)
460 if (newlock)
461 dlm_lock_put(newlock);
462
/* drop the reference obtained from dlm_lookup_lockres() */
463 if (res)
464 dlm_lockres_put(res);
465
466 dlm_put(dlm);
467
468 return status;
469}
470
471
472/* fetch next node-local (u8 nodenum + u56 cookie) into u64 */
473static inline void dlm_get_next_cookie(u8 node_num, u64 *cookie)
474{
475 u64 tmpnode = node_num;
476
477 /* shift single byte of node num into top 8 bits */
478 tmpnode <<= 56;
479
480 spin_lock(&dlm_cookie_lock);
481 *cookie = (dlm_next_cookie | tmpnode);
482 if (++dlm_next_cookie & 0xff00000000000000ull) {
483 mlog(0, "This node's cookie will now wrap!\n");
484 dlm_next_cookie = 1;
485 }
486 spin_unlock(&dlm_cookie_lock);
487}
488
489enum dlm_status dlmlock(struct dlm_ctxt *dlm, int mode,
490 struct dlm_lockstatus *lksb, int flags,
491 const char *name, dlm_astlockfunc_t *ast, void *data,
492 dlm_bastlockfunc_t *bast)
493{
494 enum dlm_status status;
495 struct dlm_lock_resource *res = NULL;
496 struct dlm_lock *lock = NULL;
497 int convert = 0, recovery = 0;
498
499 /* yes this function is a mess.
500 * TODO: clean this up. lots of common code in the
501 * lock and convert paths, especially in the retry blocks */
502 if (!lksb) {
503 dlm_error(DLM_BADARGS);
504 return DLM_BADARGS;
505 }
506
507 status = DLM_BADPARAM;
508 if (mode != LKM_EXMODE && mode != LKM_PRMODE && mode != LKM_NLMODE) {
509 dlm_error(status);
510 goto error;
511 }
512
513 if (flags & ~LKM_VALID_FLAGS) {
514 dlm_error(status);
515 goto error;
516 }
517
518 convert = (flags & LKM_CONVERT);
519 recovery = (flags & LKM_RECOVERY);
520
521 if (recovery &&
522 (!dlm_is_recovery_lock(name, strlen(name)) || convert) ) {
523 dlm_error(status);
524 goto error;
525 }
526 if (convert && (flags & LKM_LOCAL)) {
527 mlog(ML_ERROR, "strange LOCAL convert request!\n");
528 goto error;
529 }
530
531 if (convert) {
532 /* CONVERT request */
533
534 /* if converting, must pass in a valid dlm_lock */
535 lock = lksb->lockid;
536 if (!lock) {
537 mlog(ML_ERROR, "NULL lock pointer in convert "
538 "request\n");
539 goto error;
540 }
541
542 res = lock->lockres;
543 if (!res) {
544 mlog(ML_ERROR, "NULL lockres pointer in convert "
545 "request\n");
546 goto error;
547 }
548 dlm_lockres_get(res);
549
550 /* XXX: for ocfs2 purposes, the ast/bast/astdata/lksb are
551 * static after the original lock call. convert requests will
552 * ensure that everything is the same, or return DLM_BADARGS.
553 * this means that DLM_DENIED_NOASTS will never be returned.
554 */
555 if (lock->lksb != lksb || lock->ast != ast ||
556 lock->bast != bast || lock->astdata != data) {
557 status = DLM_BADARGS;
558 mlog(ML_ERROR, "new args: lksb=%p, ast=%p, bast=%p, "
559 "astdata=%p\n", lksb, ast, bast, data);
560 mlog(ML_ERROR, "orig args: lksb=%p, ast=%p, bast=%p, "
561 "astdata=%p\n", lock->lksb, lock->ast,
562 lock->bast, lock->astdata);
563 goto error;
564 }
565retry_convert:
566 dlm_wait_for_recovery(dlm);
567
568 if (res->owner == dlm->node_num)
569 status = dlmconvert_master(dlm, res, lock, flags, mode);
570 else
571 status = dlmconvert_remote(dlm, res, lock, flags, mode);
572 if (status == DLM_RECOVERING || status == DLM_MIGRATING ||
573 status == DLM_FORWARD) {
574 /* for now, see how this works without sleeping
575 * and just retry right away. I suspect the reco
576 * or migration will complete fast enough that
577 * no waiting will be necessary */
578 mlog(0, "retrying convert with migration/recovery/"
579 "in-progress\n");
580 msleep(100);
581 goto retry_convert;
582 }
583 } else {
584 u64 tmpcookie;
585
586 /* LOCK request */
587 status = DLM_BADARGS;
588 if (!name) {
589 dlm_error(status);
590 goto error;
591 }
592
593 status = DLM_IVBUFLEN;
594 if (strlen(name) > DLM_LOCKID_NAME_MAX || strlen(name) < 1) {
595 dlm_error(status);
596 goto error;
597 }
598
599 dlm_get_next_cookie(dlm->node_num, &tmpcookie);
600 lock = dlm_new_lock(mode, dlm->node_num, tmpcookie, lksb);
601 if (!lock) {
602 dlm_error(status);
603 goto error;
604 }
605
606 if (!recovery)
607 dlm_wait_for_recovery(dlm);
608
609 /* find or create the lock resource */
610 res = dlm_get_lock_resource(dlm, name, flags);
611 if (!res) {
612 status = DLM_IVLOCKID;
613 dlm_error(status);
614 goto error;
615 }
616
617 mlog(0, "type=%d, flags = 0x%x\n", mode, flags);
618 mlog(0, "creating lock: lock=%p res=%p\n", lock, res);
619
620 dlm_lock_attach_lockres(lock, res);
621 lock->ast = ast;
622 lock->bast = bast;
623 lock->astdata = data;
624
625retry_lock:
626 if (flags & LKM_VALBLK) {
627 mlog(0, "LKM_VALBLK passed by caller\n");
628
629 /* LVB requests for non PR, PW or EX locks are
630 * ignored. */
631 if (mode < LKM_PRMODE)
632 flags &= ~LKM_VALBLK;
633 else {
634 flags |= LKM_GET_LVB;
635 lock->lksb->flags |= DLM_LKSB_GET_LVB;
636 }
637 }
638
639 if (res->owner == dlm->node_num)
640 status = dlmlock_master(dlm, res, lock, flags);
641 else
642 status = dlmlock_remote(dlm, res, lock, flags);
643
644 if (status == DLM_RECOVERING || status == DLM_MIGRATING ||
645 status == DLM_FORWARD) {
646 mlog(0, "retrying lock with migration/"
647 "recovery/in progress\n");
648 msleep(100);
649 dlm_wait_for_recovery(dlm);
650 goto retry_lock;
651 }
652
653 if (status != DLM_NORMAL) {
654 lock->lksb->flags &= ~DLM_LKSB_GET_LVB;
655 if (status != DLM_NOTQUEUED)
656 dlm_error(status);
657 goto error;
658 }
659 }
660
661error:
662 if (status != DLM_NORMAL) {
663 if (lock && !convert)
664 dlm_lock_put(lock);
665 // this is kind of unnecessary
666 lksb->status = status;
667 }
668
669 /* put lockres ref from the convert path
670 * or from dlm_get_lock_resource */
671 if (res)
672 dlm_lockres_put(res);
673
674 return status;
675}
676EXPORT_SYMBOL_GPL(dlmlock);
diff --git a/fs/ocfs2/dlm/dlmmaster.c b/fs/ocfs2/dlm/dlmmaster.c
new file mode 100644
index 000000000000..27e984f7e4cd
--- /dev/null
+++ b/fs/ocfs2/dlm/dlmmaster.c
@@ -0,0 +1,2664 @@
1/* -*- mode: c; c-basic-offset: 8; -*-
2 * vim: noexpandtab sw=8 ts=8 sts=0:
3 *
4 * dlmmaster.c
5 *
6 * standalone DLM module
7 *
8 * Copyright (C) 2004 Oracle. All rights reserved.
9 *
10 * This program is free software; you can redistribute it and/or
11 * modify it under the terms of the GNU General Public
12 * License as published by the Free Software Foundation; either
13 * version 2 of the License, or (at your option) any later version.
14 *
15 * This program is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
18 * General Public License for more details.
19 *
20 * You should have received a copy of the GNU General Public
21 * License along with this program; if not, write to the
22 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
23 * Boston, MA 02111-1307, USA.
24 *
25 */
26
27
28#include <linux/module.h>
29#include <linux/fs.h>
30#include <linux/types.h>
31#include <linux/slab.h>
32#include <linux/highmem.h>
33#include <linux/utsname.h>
34#include <linux/init.h>
35#include <linux/sysctl.h>
36#include <linux/random.h>
37#include <linux/blkdev.h>
38#include <linux/socket.h>
39#include <linux/inet.h>
40#include <linux/spinlock.h>
41#include <linux/delay.h>
42
43
44#include "cluster/heartbeat.h"
45#include "cluster/nodemanager.h"
46#include "cluster/tcp.h"
47
48#include "dlmapi.h"
49#include "dlmcommon.h"
50#include "dlmdebug.h"
51#include "dlmdomain.h"
52
53#define MLOG_MASK_PREFIX (ML_DLM|ML_DLM_MASTER)
54#include "cluster/masklog.h"
55
/* Kind of master list entry (mle).  A DLM_MLE_MASTER entry refers to a
 * lock resource (u.res); DLM_MLE_BLOCK and DLM_MLE_MIGRATION entries
 * carry their own copy of the lock name (u.name). */
56enum dlm_mle_type {
57 DLM_MLE_BLOCK,
58 DLM_MLE_MASTER,
59 DLM_MLE_MIGRATION
60};
61
/* Length-prefixed copy of a lock name, stored inline in an mle that has
 * no lock resource to take the name from. */
62struct dlm_lock_name
63{
64 u8 len; /* number of valid bytes in name[] */
65 u8 name[DLM_LOCKID_NAME_MAX]; /* name bytes, filled with memcpy() */
66};
67
/* Master list entry (mle): reference-counted record kept on
 * dlm->master_list, identified by lock name, and attached to the
 * domain's heartbeat events so its node_map follows nodes going up and
 * down. */
68struct dlm_master_list_entry
69{
70 struct list_head list; /* linkage on dlm->master_list */
71 struct list_head hb_events; /* linkage on dlm->mle_hb_events */
72 struct dlm_ctxt *dlm; /* domain this entry belongs to */
73 spinlock_t spinlock; /* held while node_map is updated on hb events */
74 wait_queue_head_t wq; /* initialised in dlm_init_mle() */
75 atomic_t woken; /* starts at 0 */
76 struct kref mle_refs; /* released through dlm_mle_release() */
/* maybe_map and response_map start out zeroed; vote_map and node_map
 * start as copies of dlm->domain_map with the local node cleared */
77 unsigned long maybe_map[BITS_TO_LONGS(O2NM_MAX_NODES)];
78 unsigned long vote_map[BITS_TO_LONGS(O2NM_MAX_NODES)];
79 unsigned long response_map[BITS_TO_LONGS(O2NM_MAX_NODES)];
80 unsigned long node_map[BITS_TO_LONGS(O2NM_MAX_NODES)];
/* both start as O2NM_MAX_NODES — presumably "not known yet"; confirm */
81 u8 master;
82 u8 new_master;
83 enum dlm_mle_type type; /* selects which member of u is valid */
84 struct o2hb_callback_func mle_hb_up;
85 struct o2hb_callback_func mle_hb_down;
86 union {
87 struct dlm_lock_resource *res; /* DLM_MLE_MASTER */
88 struct dlm_lock_name name; /* DLM_MLE_BLOCK, DLM_MLE_MIGRATION */
89 } u;
90};
91
92static void dlm_mle_node_down(struct dlm_ctxt *dlm,
93 struct dlm_master_list_entry *mle,
94 struct o2nm_node *node,
95 int idx);
96static void dlm_mle_node_up(struct dlm_ctxt *dlm,
97 struct dlm_master_list_entry *mle,
98 struct o2nm_node *node,
99 int idx);
100
101static void dlm_assert_master_worker(struct dlm_work_item *item, void *data);
102static int dlm_do_assert_master(struct dlm_ctxt *dlm, const char *lockname,
103 unsigned int namelen, void *nodemap,
104 u32 flags);
105
106static inline int dlm_mle_equal(struct dlm_ctxt *dlm,
107 struct dlm_master_list_entry *mle,
108 const char *name,
109 unsigned int namelen)
110{
111 struct dlm_lock_resource *res;
112
113 if (dlm != mle->dlm)
114 return 0;
115
116 if (mle->type == DLM_MLE_BLOCK ||
117 mle->type == DLM_MLE_MIGRATION) {
118 if (namelen != mle->u.name.len ||
119 memcmp(name, mle->u.name.name, namelen)!=0)
120 return 0;
121 } else {
122 res = mle->u.res;
123 if (namelen != res->lockname.len ||
124 memcmp(res->lockname.name, name, namelen) != 0)
125 return 0;
126 }
127 return 1;
128}
129
130#if 0
131/* Code here is included but defined out as it aids debugging */
132
133void dlm_print_one_mle(struct dlm_master_list_entry *mle)
134{
135 int i = 0, refs;
136 char *type;
137 char attached;
138 u8 master;
139 unsigned int namelen;
140 const char *name;
141 struct kref *k;
142
143 k = &mle->mle_refs;
144 if (mle->type == DLM_MLE_BLOCK)
145 type = "BLK";
146 else if (mle->type == DLM_MLE_MASTER)
147 type = "MAS";
148 else
149 type = "MIG";
150 refs = atomic_read(&k->refcount);
151 master = mle->master;
152 attached = (list_empty(&mle->hb_events) ? 'N' : 'Y');
153
154 if (mle->type != DLM_MLE_MASTER) {
155 namelen = mle->u.name.len;
156 name = mle->u.name.name;
157 } else {
158 namelen = mle->u.res->lockname.len;
159 name = mle->u.res->lockname.name;
160 }
161
162 mlog(ML_NOTICE, " #%3d: %3s %3d %3u %3u %c (%d)%.*s\n",
163 i, type, refs, master, mle->new_master, attached,
164 namelen, namelen, name);
165}
166
167static void dlm_dump_mles(struct dlm_ctxt *dlm)
168{
169 struct dlm_master_list_entry *mle;
170 struct list_head *iter;
171
172 mlog(ML_NOTICE, "dumping all mles for domain %s:\n", dlm->name);
173 mlog(ML_NOTICE, " ####: type refs owner new events? lockname nodemap votemap respmap maybemap\n");
174 spin_lock(&dlm->master_lock);
175 list_for_each(iter, &dlm->master_list) {
176 mle = list_entry(iter, struct dlm_master_list_entry, list);
177 dlm_print_one_mle(mle);
178 }
179 spin_unlock(&dlm->master_lock);
180}
181
182int dlm_dump_all_mles(const char __user *data, unsigned int len)
183{
184 struct list_head *iter;
185 struct dlm_ctxt *dlm;
186
187 spin_lock(&dlm_domain_lock);
188 list_for_each(iter, &dlm_domains) {
189 dlm = list_entry (iter, struct dlm_ctxt, list);
190 mlog(ML_NOTICE, "found dlm: %p, name=%s\n", dlm, dlm->name);
191 dlm_dump_mles(dlm);
192 }
193 spin_unlock(&dlm_domain_lock);
194 return len;
195}
196EXPORT_SYMBOL_GPL(dlm_dump_all_mles);
197
198#endif /* 0 */
199
200
201static kmem_cache_t *dlm_mle_cache = NULL;
202
203
204static void dlm_mle_release(struct kref *kref);
205static void dlm_init_mle(struct dlm_master_list_entry *mle,
206 enum dlm_mle_type type,
207 struct dlm_ctxt *dlm,
208 struct dlm_lock_resource *res,
209 const char *name,
210 unsigned int namelen);
211static void dlm_put_mle(struct dlm_master_list_entry *mle);
212static void __dlm_put_mle(struct dlm_master_list_entry *mle);
213static int dlm_find_mle(struct dlm_ctxt *dlm,
214 struct dlm_master_list_entry **mle,
215 char *name, unsigned int namelen);
216
217static int dlm_do_master_request(struct dlm_master_list_entry *mle, int to);
218
219
220static int dlm_wait_for_lock_mastery(struct dlm_ctxt *dlm,
221 struct dlm_lock_resource *res,
222 struct dlm_master_list_entry *mle,
223 int *blocked);
224static int dlm_restart_lock_mastery(struct dlm_ctxt *dlm,
225 struct dlm_lock_resource *res,
226 struct dlm_master_list_entry *mle,
227 int blocked);
228static int dlm_add_migration_mle(struct dlm_ctxt *dlm,
229 struct dlm_lock_resource *res,
230 struct dlm_master_list_entry *mle,
231 struct dlm_master_list_entry **oldmle,
232 const char *name, unsigned int namelen,
233 u8 new_master, u8 master);
234
235static u8 dlm_pick_migration_target(struct dlm_ctxt *dlm,
236 struct dlm_lock_resource *res);
237static void dlm_remove_nonlocal_locks(struct dlm_ctxt *dlm,
238 struct dlm_lock_resource *res);
239static int dlm_mark_lockres_migrating(struct dlm_ctxt *dlm,
240 struct dlm_lock_resource *res,
241 u8 target);
242
243
/*
 * dlm_is_host_down() - does a send error mean the peer is unreachable?
 * @err: negative errno value returned by the networking code
 *
 * The parameter is deliberately not called "errno": that identifier is
 * reserved for the C library's errno macro and breaks the build wherever
 * the macro is visible.
 *
 * Returns 1 when @err is one of the connection/host/network failure
 * codes listed below, 0 for every other value (including 0 and
 * positive numbers).
 */
int dlm_is_host_down(int err)
{
	switch (err) {
	case -EBADF:
	case -ECONNREFUSED:
	case -ENOTCONN:
	case -ECONNRESET:
	case -EPIPE:
	case -EHOSTDOWN:
	case -EHOSTUNREACH:
	case -ETIMEDOUT:
	case -ECONNABORTED:
	case -ENETDOWN:
	case -ENETUNREACH:
	case -ENETRESET:
	case -ESHUTDOWN:
	case -ENOPROTOOPT:
	case -EINVAL:   /* if returned from our tcp code,
			   this means there is no socket */
		return 1;
	default:
		return 0;
	}
}
267
268
269/*
270 * MASTER LIST FUNCTIONS
271 */
272
273
274/*
275 * regarding master list entries and heartbeat callbacks:
276 *
277 * in order to avoid sleeping and allocation that occurs in
278 * heartbeat, master list entries are simply attached to the
279 * dlm's established heartbeat callbacks. the mle is attached
280 * when it is created, and since the dlm->spinlock is held at
281 * that time, any heartbeat event will be properly discovered
282 * by the mle. the mle needs to be detached from the
283 * dlm->mle_hb_events list as soon as heartbeat events are no
284 * longer useful to the mle, and before the mle is freed.
285 *
286 * as a general rule, heartbeat events are no longer needed by
287 * the mle once an "answer" regarding the lock master has been
288 * received.
289 */
/* Add the mle to dlm->mle_hb_events so that
 * dlm_hb_event_notify_attached() will pass node up/down events to it.
 * Caller must hold dlm->spinlock (asserted). */
290static inline void __dlm_mle_attach_hb_events(struct dlm_ctxt *dlm,
291 struct dlm_master_list_entry *mle)
292{
293 assert_spin_locked(&dlm->spinlock);
294
295 list_add_tail(&mle->hb_events, &dlm->mle_hb_events);
296}
297
298
299static inline void __dlm_mle_detach_hb_events(struct dlm_ctxt *dlm,
300 struct dlm_master_list_entry *mle)
301{
302 if (!list_empty(&mle->hb_events))
303 list_del_init(&mle->hb_events);
304}
305
306
/* Locked wrapper: detach the mle from heartbeat events while holding
 * dlm->spinlock.  Must not be called with dlm->spinlock already held. */
307static inline void dlm_mle_detach_hb_events(struct dlm_ctxt *dlm,
308 struct dlm_master_list_entry *mle)
309{
310 spin_lock(&dlm->spinlock);
311 __dlm_mle_detach_hb_events(dlm, mle);
312 spin_unlock(&dlm->spinlock);
313}
314
315/* remove from list and free */
316static void __dlm_put_mle(struct dlm_master_list_entry *mle)
317{
318 struct dlm_ctxt *dlm;
319 dlm = mle->dlm;
320
321 assert_spin_locked(&dlm->spinlock);
322 assert_spin_locked(&dlm->master_lock);
323 BUG_ON(!atomic_read(&mle->mle_refs.refcount));
324
325 kref_put(&mle->mle_refs, dlm_mle_release);
326}
327
328
329/* must not have any spinlocks coming in */
330static void dlm_put_mle(struct dlm_master_list_entry *mle)
331{
332 struct dlm_ctxt *dlm;
333 dlm = mle->dlm;
334
335 spin_lock(&dlm->spinlock);
336 spin_lock(&dlm->master_lock);
337 __dlm_put_mle(mle);
338 spin_unlock(&dlm->master_lock);
339 spin_unlock(&dlm->spinlock);
340}
341
/* Take an additional reference on an mle. */
342static inline void dlm_get_mle(struct dlm_master_list_entry *mle)
343{
344 kref_get(&mle->mle_refs);
345}
346
347static void dlm_init_mle(struct dlm_master_list_entry *mle,
348 enum dlm_mle_type type,
349 struct dlm_ctxt *dlm,
350 struct dlm_lock_resource *res,
351 const char *name,
352 unsigned int namelen)
353{
354 assert_spin_locked(&dlm->spinlock);
355
356 mle->dlm = dlm;
357 mle->type = type;
358 INIT_LIST_HEAD(&mle->list);
359 INIT_LIST_HEAD(&mle->hb_events);
360 memset(mle->maybe_map, 0, sizeof(mle->maybe_map));
361 spin_lock_init(&mle->spinlock);
362 init_waitqueue_head(&mle->wq);
363 atomic_set(&mle->woken, 0);
364 kref_init(&mle->mle_refs);
365 memset(mle->response_map, 0, sizeof(mle->response_map));
366 mle->master = O2NM_MAX_NODES;
367 mle->new_master = O2NM_MAX_NODES;
368
369 if (mle->type == DLM_MLE_MASTER) {
370 BUG_ON(!res);
371 mle->u.res = res;
372 } else if (mle->type == DLM_MLE_BLOCK) {
373 BUG_ON(!name);
374 memcpy(mle->u.name.name, name, namelen);
375 mle->u.name.len = namelen;
376 } else /* DLM_MLE_MIGRATION */ {
377 BUG_ON(!name);
378 memcpy(mle->u.name.name, name, namelen);
379 mle->u.name.len = namelen;
380 }
381
382 /* copy off the node_map and register hb callbacks on our copy */
383 memcpy(mle->node_map, dlm->domain_map, sizeof(mle->node_map));
384 memcpy(mle->vote_map, dlm->domain_map, sizeof(mle->vote_map));
385 clear_bit(dlm->node_num, mle->vote_map);
386 clear_bit(dlm->node_num, mle->node_map);
387
388 /* attach the mle to the domain node up/down events */
389 __dlm_mle_attach_hb_events(dlm, mle);
390}
391
392
393/* returns 1 if found, 0 if not */
394static int dlm_find_mle(struct dlm_ctxt *dlm,
395 struct dlm_master_list_entry **mle,
396 char *name, unsigned int namelen)
397{
398 struct dlm_master_list_entry *tmpmle;
399 struct list_head *iter;
400
401 assert_spin_locked(&dlm->master_lock);
402
403 list_for_each(iter, &dlm->master_list) {
404 tmpmle = list_entry(iter, struct dlm_master_list_entry, list);
405 if (!dlm_mle_equal(dlm, tmpmle, name, namelen))
406 continue;
407 dlm_get_mle(tmpmle);
408 *mle = tmpmle;
409 return 1;
410 }
411 return 0;
412}
413
414void dlm_hb_event_notify_attached(struct dlm_ctxt *dlm, int idx, int node_up)
415{
416 struct dlm_master_list_entry *mle;
417 struct list_head *iter;
418
419 assert_spin_locked(&dlm->spinlock);
420
421 list_for_each(iter, &dlm->mle_hb_events) {
422 mle = list_entry(iter, struct dlm_master_list_entry,
423 hb_events);
424 if (node_up)
425 dlm_mle_node_up(dlm, mle, NULL, idx);
426 else
427 dlm_mle_node_down(dlm, mle, NULL, idx);
428 }
429}
430
431static void dlm_mle_node_down(struct dlm_ctxt *dlm,
432 struct dlm_master_list_entry *mle,
433 struct o2nm_node *node, int idx)
434{
435 spin_lock(&mle->spinlock);
436
437 if (!test_bit(idx, mle->node_map))
438 mlog(0, "node %u already removed from nodemap!\n", idx);
439 else
440 clear_bit(idx, mle->node_map);
441
442 spin_unlock(&mle->spinlock);
443}
444
445static void dlm_mle_node_up(struct dlm_ctxt *dlm,
446 struct dlm_master_list_entry *mle,
447 struct o2nm_node *node, int idx)
448{
449 spin_lock(&mle->spinlock);
450
451 if (test_bit(idx, mle->node_map))
452 mlog(0, "node %u already in node map!\n", idx);
453 else
454 set_bit(idx, mle->node_map);
455
456 spin_unlock(&mle->spinlock);
457}
458
459
460int dlm_init_mle_cache(void)
461{
462 dlm_mle_cache = kmem_cache_create("dlm_mle_cache",
463 sizeof(struct dlm_master_list_entry),
464 0, SLAB_HWCACHE_ALIGN,
465 NULL, NULL);
466 if (dlm_mle_cache == NULL)
467 return -ENOMEM;
468 return 0;
469}
470
/* Destroy the mle slab cache, if it was ever created. */
471void dlm_destroy_mle_cache(void)
472{
473 if (dlm_mle_cache)
474 kmem_cache_destroy(dlm_mle_cache);
475}
476
477static void dlm_mle_release(struct kref *kref)
478{
479 struct dlm_master_list_entry *mle;
480 struct dlm_ctxt *dlm;
481
482 mlog_entry_void();
483
484 mle = container_of(kref, struct dlm_master_list_entry, mle_refs);
485 dlm = mle->dlm;
486
487 if (mle->type != DLM_MLE_MASTER) {
488 mlog(0, "calling mle_release for %.*s, type %d\n",
489 mle->u.name.len, mle->u.name.name, mle->type);
490 } else {
491 mlog(0, "calling mle_release for %.*s, type %d\n",
492 mle->u.res->lockname.len,
493 mle->u.res->lockname.name, mle->type);
494 }
495 assert_spin_locked(&dlm->spinlock);
496 assert_spin_locked(&dlm->master_lock);
497
498 /* remove from list if not already */
499 if (!list_empty(&mle->list))
500 list_del_init(&mle->list);
501
502 /* detach the mle from the domain node up/down events */
503 __dlm_mle_detach_hb_events(dlm, mle);
504
505 /* NOTE: kfree under spinlock here.
506 * if this is bad, we can move this to a freelist. */
507 kmem_cache_free(dlm_mle_cache, mle);
508}
509
510
511/*
512 * LOCK RESOURCE FUNCTIONS
513 */
514
515static void dlm_set_lockres_owner(struct dlm_ctxt *dlm,
516 struct dlm_lock_resource *res,
517 u8 owner)
518{
519 assert_spin_locked(&res->spinlock);
520
521 mlog_entry("%.*s, %u\n", res->lockname.len, res->lockname.name, owner);
522
523 if (owner == dlm->node_num)
524 atomic_inc(&dlm->local_resources);
525 else if (owner == DLM_LOCK_RES_OWNER_UNKNOWN)
526 atomic_inc(&dlm->unknown_resources);
527 else
528 atomic_inc(&dlm->remote_resources);
529
530 res->owner = owner;
531}
532
533void dlm_change_lockres_owner(struct dlm_ctxt *dlm,
534 struct dlm_lock_resource *res, u8 owner)
535{
536 assert_spin_locked(&res->spinlock);
537
538 if (owner == res->owner)
539 return;
540
541 if (res->owner == dlm->node_num)
542 atomic_dec(&dlm->local_resources);
543 else if (res->owner == DLM_LOCK_RES_OWNER_UNKNOWN)
544 atomic_dec(&dlm->unknown_resources);
545 else
546 atomic_dec(&dlm->remote_resources);
547
548 dlm_set_lockres_owner(dlm, res, owner);
549}
550
551
552static void dlm_lockres_release(struct kref *kref)
553{
554 struct dlm_lock_resource *res;
555
556 res = container_of(kref, struct dlm_lock_resource, refs);
557
558 /* This should not happen -- all lockres' have a name
559 * associated with them at init time. */
560 BUG_ON(!res->lockname.name);
561
562 mlog(0, "destroying lockres %.*s\n", res->lockname.len,
563 res->lockname.name);
564
565 /* By the time we're ready to blow this guy away, we shouldn't
566 * be on any lists. */
567 BUG_ON(!list_empty(&res->list));
568 BUG_ON(!list_empty(&res->granted));
569 BUG_ON(!list_empty(&res->converting));
570 BUG_ON(!list_empty(&res->blocked));
571 BUG_ON(!list_empty(&res->dirty));
572 BUG_ON(!list_empty(&res->recovering));
573 BUG_ON(!list_empty(&res->purge));
574
575 kfree(res->lockname.name);
576
577 kfree(res);
578}
579
580void dlm_lockres_get(struct dlm_lock_resource *res)
581{
582 kref_get(&res->refs);
583}
584
585void dlm_lockres_put(struct dlm_lock_resource *res)
586{
587 kref_put(&res->refs, dlm_lockres_release);
588}
589
590static void dlm_init_lockres(struct dlm_ctxt *dlm,
591 struct dlm_lock_resource *res,
592 const char *name, unsigned int namelen)
593{
594 char *qname;
595
596 /* If we memset here, we lose our reference to the kmalloc'd
597 * res->lockname.name, so be sure to init every field
598 * correctly! */
599
600 qname = (char *) res->lockname.name;
601 memcpy(qname, name, namelen);
602
603 res->lockname.len = namelen;
604 res->lockname.hash = full_name_hash(name, namelen);
605
606 init_waitqueue_head(&res->wq);
607 spin_lock_init(&res->spinlock);
608 INIT_LIST_HEAD(&res->list);
609 INIT_LIST_HEAD(&res->granted);
610 INIT_LIST_HEAD(&res->converting);
611 INIT_LIST_HEAD(&res->blocked);
612 INIT_LIST_HEAD(&res->dirty);
613 INIT_LIST_HEAD(&res->recovering);
614 INIT_LIST_HEAD(&res->purge);
615 atomic_set(&res->asts_reserved, 0);
616 res->migration_pending = 0;
617
618 kref_init(&res->refs);
619
620 /* just for consistency */
621 spin_lock(&res->spinlock);
622 dlm_set_lockres_owner(dlm, res, DLM_LOCK_RES_OWNER_UNKNOWN);
623 spin_unlock(&res->spinlock);
624
625 res->state = DLM_LOCK_RES_IN_PROGRESS;
626
627 res->last_used = 0;
628
629 memset(res->lvb, 0, DLM_LVB_LEN);
630}
631
632struct dlm_lock_resource *dlm_new_lockres(struct dlm_ctxt *dlm,
633 const char *name,
634 unsigned int namelen)
635{
636 struct dlm_lock_resource *res;
637
638 res = kmalloc(sizeof(struct dlm_lock_resource), GFP_KERNEL);
639 if (!res)
640 return NULL;
641
642 res->lockname.name = kmalloc(namelen, GFP_KERNEL);
643 if (!res->lockname.name) {
644 kfree(res);
645 return NULL;
646 }
647
648 dlm_init_lockres(dlm, res, name, namelen);
649 return res;
650}
651
652/*
653 * lookup a lock resource by name.
654 * may already exist in the hashtable.
655 * lockid is null terminated
656 *
657 * if not, allocate enough for the lockres and for
658 * the temporary structure used in doing the mastering.
659 *
660 * also, do a lookup in the dlm->master_list to see
661 * if another node has begun mastering the same lock.
662 * if so, there should be a block entry in there
663 * for this name, and we should *not* attempt to master
664 * the lock here. need to wait around for that node
665 * to assert_master (or die).
666 *
667 */
668struct dlm_lock_resource * dlm_get_lock_resource(struct dlm_ctxt *dlm,
669 const char *lockid,
670 int flags)
671{
672 struct dlm_lock_resource *tmpres=NULL, *res=NULL;
673 struct dlm_master_list_entry *mle = NULL;
674 struct dlm_master_list_entry *alloc_mle = NULL;
675 int blocked = 0;
676 int ret, nodenum;
677 struct dlm_node_iter iter;
678 unsigned int namelen;
679 int tries = 0;
680
681 BUG_ON(!lockid);
682
683 namelen = strlen(lockid);
684
685 mlog(0, "get lockres %s (len %d)\n", lockid, namelen);
686
687lookup:
688 spin_lock(&dlm->spinlock);
689 tmpres = __dlm_lookup_lockres(dlm, lockid, namelen);
690 if (tmpres) {
691 spin_unlock(&dlm->spinlock);
692 mlog(0, "found in hash!\n");
693 if (res)
694 dlm_lockres_put(res);
695 res = tmpres;
696 goto leave;
697 }
698
699 if (!res) {
700 spin_unlock(&dlm->spinlock);
701 mlog(0, "allocating a new resource\n");
702 /* nothing found and we need to allocate one. */
703 alloc_mle = (struct dlm_master_list_entry *)
704 kmem_cache_alloc(dlm_mle_cache, GFP_KERNEL);
705 if (!alloc_mle)
706 goto leave;
707 res = dlm_new_lockres(dlm, lockid, namelen);
708 if (!res)
709 goto leave;
710 goto lookup;
711 }
712
713 mlog(0, "no lockres found, allocated our own: %p\n", res);
714
715 if (flags & LKM_LOCAL) {
716 /* caller knows it's safe to assume it's not mastered elsewhere
717 * DONE! return right away */
718 spin_lock(&res->spinlock);
719 dlm_change_lockres_owner(dlm, res, dlm->node_num);
720 __dlm_insert_lockres(dlm, res);
721 spin_unlock(&res->spinlock);
722 spin_unlock(&dlm->spinlock);
723 /* lockres still marked IN_PROGRESS */
724 goto wake_waiters;
725 }
726
727 /* check master list to see if another node has started mastering it */
728 spin_lock(&dlm->master_lock);
729
730 /* if we found a block, wait for lock to be mastered by another node */
731 blocked = dlm_find_mle(dlm, &mle, (char *)lockid, namelen);
732 if (blocked) {
733 if (mle->type == DLM_MLE_MASTER) {
734 mlog(ML_ERROR, "master entry for nonexistent lock!\n");
735 BUG();
736 } else if (mle->type == DLM_MLE_MIGRATION) {
737 /* migration is in progress! */
738 /* the good news is that we now know the
739 * "current" master (mle->master). */
740
741 spin_unlock(&dlm->master_lock);
742 assert_spin_locked(&dlm->spinlock);
743
744 /* set the lockres owner and hash it */
745 spin_lock(&res->spinlock);
746 dlm_set_lockres_owner(dlm, res, mle->master);
747 __dlm_insert_lockres(dlm, res);
748 spin_unlock(&res->spinlock);
749 spin_unlock(&dlm->spinlock);
750
751 /* master is known, detach */
752 dlm_mle_detach_hb_events(dlm, mle);
753 dlm_put_mle(mle);
754 mle = NULL;
755 goto wake_waiters;
756 }
757 } else {
758 /* go ahead and try to master lock on this node */
759 mle = alloc_mle;
760 /* make sure this does not get freed below */
761 alloc_mle = NULL;
762 dlm_init_mle(mle, DLM_MLE_MASTER, dlm, res, NULL, 0);
763 set_bit(dlm->node_num, mle->maybe_map);
764 list_add(&mle->list, &dlm->master_list);
765 }
766
767 /* at this point there is either a DLM_MLE_BLOCK or a
768 * DLM_MLE_MASTER on the master list, so it's safe to add the
769 * lockres to the hashtable. anyone who finds the lock will
770 * still have to wait on the IN_PROGRESS. */
771
772 /* finally add the lockres to its hash bucket */
773 __dlm_insert_lockres(dlm, res);
774 /* get an extra ref on the mle in case this is a BLOCK
775 * if so, the creator of the BLOCK may try to put the last
776 * ref at this time in the assert master handler, so we
777 * need an extra one to keep from a bad ptr deref. */
778 dlm_get_mle(mle);
779 spin_unlock(&dlm->master_lock);
780 spin_unlock(&dlm->spinlock);
781
782 /* must wait for lock to be mastered elsewhere */
783 if (blocked)
784 goto wait;
785
786redo_request:
787 ret = -EINVAL;
788 dlm_node_iter_init(mle->vote_map, &iter);
789 while ((nodenum = dlm_node_iter_next(&iter)) >= 0) {
790 ret = dlm_do_master_request(mle, nodenum);
791 if (ret < 0)
792 mlog_errno(ret);
793 if (mle->master != O2NM_MAX_NODES) {
794 /* found a master ! */
795 break;
796 }
797 }
798
799wait:
800 /* keep going until the response map includes all nodes */
801 ret = dlm_wait_for_lock_mastery(dlm, res, mle, &blocked);
802 if (ret < 0) {
803 mlog(0, "%s:%.*s: node map changed, redo the "
804 "master request now, blocked=%d\n",
805 dlm->name, res->lockname.len,
806 res->lockname.name, blocked);
807 if (++tries > 20) {
808 mlog(ML_ERROR, "%s:%.*s: spinning on "
809 "dlm_wait_for_lock_mastery, blocked=%d\n",
810 dlm->name, res->lockname.len,
811 res->lockname.name, blocked);
812 dlm_print_one_lock_resource(res);
813 /* dlm_print_one_mle(mle); */
814 tries = 0;
815 }
816 goto redo_request;
817 }
818
819 mlog(0, "lockres mastered by %u\n", res->owner);
820 /* make sure we never continue without this */
821 BUG_ON(res->owner == O2NM_MAX_NODES);
822
823 /* master is known, detach if not already detached */
824 dlm_mle_detach_hb_events(dlm, mle);
825 dlm_put_mle(mle);
826 /* put the extra ref */
827 dlm_put_mle(mle);
828
829wake_waiters:
830 spin_lock(&res->spinlock);
831 res->state &= ~DLM_LOCK_RES_IN_PROGRESS;
832 spin_unlock(&res->spinlock);
833 wake_up(&res->wq);
834
835leave:
836 /* need to free the unused mle */
837 if (alloc_mle)
838 kmem_cache_free(dlm_mle_cache, alloc_mle);
839
840 return res;
841}
842
843
844#define DLM_MASTERY_TIMEOUT_MS 5000
845
846static int dlm_wait_for_lock_mastery(struct dlm_ctxt *dlm,
847 struct dlm_lock_resource *res,
848 struct dlm_master_list_entry *mle,
849 int *blocked)
850{
851 u8 m;
852 int ret, bit;
853 int map_changed, voting_done;
854 int assert, sleep;
855
856recheck:
857 ret = 0;
858 assert = 0;
859
860 /* check if another node has already become the owner */
861 spin_lock(&res->spinlock);
862 if (res->owner != DLM_LOCK_RES_OWNER_UNKNOWN) {
863 spin_unlock(&res->spinlock);
864 goto leave;
865 }
866 spin_unlock(&res->spinlock);
867
868 spin_lock(&mle->spinlock);
869 m = mle->master;
870 map_changed = (memcmp(mle->vote_map, mle->node_map,
871 sizeof(mle->vote_map)) != 0);
872 voting_done = (memcmp(mle->vote_map, mle->response_map,
873 sizeof(mle->vote_map)) == 0);
874
875 /* restart if we hit any errors */
876 if (map_changed) {
877 int b;
878 mlog(0, "%s: %.*s: node map changed, restarting\n",
879 dlm->name, res->lockname.len, res->lockname.name);
880 ret = dlm_restart_lock_mastery(dlm, res, mle, *blocked);
881 b = (mle->type == DLM_MLE_BLOCK);
882 if ((*blocked && !b) || (!*blocked && b)) {
883 mlog(0, "%s:%.*s: status change: old=%d new=%d\n",
884 dlm->name, res->lockname.len, res->lockname.name,
885 *blocked, b);
886 *blocked = b;
887 }
888 spin_unlock(&mle->spinlock);
889 if (ret < 0) {
890 mlog_errno(ret);
891 goto leave;
892 }
893 mlog(0, "%s:%.*s: restart lock mastery succeeded, "
894 "rechecking now\n", dlm->name, res->lockname.len,
895 res->lockname.name);
896 goto recheck;
897 }
898
899 if (m != O2NM_MAX_NODES) {
900 /* another node has done an assert!
901 * all done! */
902 sleep = 0;
903 } else {
904 sleep = 1;
905 /* have all nodes responded? */
906 if (voting_done && !*blocked) {
907 bit = find_next_bit(mle->maybe_map, O2NM_MAX_NODES, 0);
908 if (dlm->node_num <= bit) {
909 /* my node number is lowest.
910 * now tell other nodes that I am
911 * mastering this. */
912 mle->master = dlm->node_num;
913 assert = 1;
914 sleep = 0;
915 }
916 /* if voting is done, but we have not received
917 * an assert master yet, we must sleep */
918 }
919 }
920
921 spin_unlock(&mle->spinlock);
922
923 /* sleep if we haven't finished voting yet */
924 if (sleep) {
925 unsigned long timeo = msecs_to_jiffies(DLM_MASTERY_TIMEOUT_MS);
926
927 /*
928 if (atomic_read(&mle->mle_refs.refcount) < 2)
929 mlog(ML_ERROR, "mle (%p) refs=%d, name=%.*s\n", mle,
930 atomic_read(&mle->mle_refs.refcount),
931 res->lockname.len, res->lockname.name);
932 */
933 atomic_set(&mle->woken, 0);
934 (void)wait_event_timeout(mle->wq,
935 (atomic_read(&mle->woken) == 1),
936 timeo);
937 if (res->owner == O2NM_MAX_NODES) {
938 mlog(0, "waiting again\n");
939 goto recheck;
940 }
941 mlog(0, "done waiting, master is %u\n", res->owner);
942 ret = 0;
943 goto leave;
944 }
945
946 ret = 0; /* done */
947 if (assert) {
948 m = dlm->node_num;
949 mlog(0, "about to master %.*s here, this=%u\n",
950 res->lockname.len, res->lockname.name, m);
951 ret = dlm_do_assert_master(dlm, res->lockname.name,
952 res->lockname.len, mle->vote_map, 0);
953 if (ret) {
954 /* This is a failure in the network path,
955 * not in the response to the assert_master
956 * (any nonzero response is a BUG on this node).
957 * Most likely a socket just got disconnected
958 * due to node death. */
959 mlog_errno(ret);
960 }
961 /* no longer need to restart lock mastery.
962 * all living nodes have been contacted. */
963 ret = 0;
964 }
965
966 /* set the lockres owner */
967 spin_lock(&res->spinlock);
968 dlm_change_lockres_owner(dlm, res, m);
969 spin_unlock(&res->spinlock);
970
971leave:
972 return ret;
973}
974
975struct dlm_bitmap_diff_iter
976{
977 int curnode;
978 unsigned long *orig_bm;
979 unsigned long *cur_bm;
980 unsigned long diff_bm[BITS_TO_LONGS(O2NM_MAX_NODES)];
981};
982
/* How a node's membership changed between two node bitmaps. */
enum dlm_node_state_change {
	NODE_DOWN	= -1,
	NODE_NO_CHANGE	= 0,
	NODE_UP		= 1
};
989
990static void dlm_bitmap_diff_iter_init(struct dlm_bitmap_diff_iter *iter,
991 unsigned long *orig_bm,
992 unsigned long *cur_bm)
993{
994 unsigned long p1, p2;
995 int i;
996
997 iter->curnode = -1;
998 iter->orig_bm = orig_bm;
999 iter->cur_bm = cur_bm;
1000
1001 for (i = 0; i < BITS_TO_LONGS(O2NM_MAX_NODES); i++) {
1002 p1 = *(iter->orig_bm + i);
1003 p2 = *(iter->cur_bm + i);
1004 iter->diff_bm[i] = (p1 & ~p2) | (p2 & ~p1);
1005 }
1006}
1007
1008static int dlm_bitmap_diff_iter_next(struct dlm_bitmap_diff_iter *iter,
1009 enum dlm_node_state_change *state)
1010{
1011 int bit;
1012
1013 if (iter->curnode >= O2NM_MAX_NODES)
1014 return -ENOENT;
1015
1016 bit = find_next_bit(iter->diff_bm, O2NM_MAX_NODES,
1017 iter->curnode+1);
1018 if (bit >= O2NM_MAX_NODES) {
1019 iter->curnode = O2NM_MAX_NODES;
1020 return -ENOENT;
1021 }
1022
1023 /* if it was there in the original then this node died */
1024 if (test_bit(bit, iter->orig_bm))
1025 *state = NODE_DOWN;
1026 else
1027 *state = NODE_UP;
1028
1029 iter->curnode = bit;
1030 return bit;
1031}
1032
1033
1034static int dlm_restart_lock_mastery(struct dlm_ctxt *dlm,
1035 struct dlm_lock_resource *res,
1036 struct dlm_master_list_entry *mle,
1037 int blocked)
1038{
1039 struct dlm_bitmap_diff_iter bdi;
1040 enum dlm_node_state_change sc;
1041 int node;
1042 int ret = 0;
1043
1044 mlog(0, "something happened such that the "
1045 "master process may need to be restarted!\n");
1046
1047 assert_spin_locked(&mle->spinlock);
1048
1049 dlm_bitmap_diff_iter_init(&bdi, mle->vote_map, mle->node_map);
1050 node = dlm_bitmap_diff_iter_next(&bdi, &sc);
1051 while (node >= 0) {
1052 if (sc == NODE_UP) {
1053 /* a node came up. easy. might not even need
1054 * to talk to it if its node number is higher
1055 * or if we are already blocked. */
1056 mlog(0, "node up! %d\n", node);
1057 if (blocked)
1058 goto next;
1059
1060 if (node > dlm->node_num) {
1061 mlog(0, "node > this node. skipping.\n");
1062 goto next;
1063 }
1064
1065 /* redo the master request, but only for the new node */
1066 mlog(0, "sending request to new node\n");
1067 clear_bit(node, mle->response_map);
1068 set_bit(node, mle->vote_map);
1069 } else {
1070 mlog(ML_ERROR, "node down! %d\n", node);
1071
1072 /* if the node wasn't involved in mastery skip it,
1073 * but clear it out from the maps so that it will
1074 * not affect mastery of this lockres */
1075 clear_bit(node, mle->response_map);
1076 clear_bit(node, mle->vote_map);
1077 if (!test_bit(node, mle->maybe_map))
1078 goto next;
1079
1080 /* if we're already blocked on lock mastery, and the
1081 * dead node wasn't the expected master, or there is
1082 * another node in the maybe_map, keep waiting */
1083 if (blocked) {
1084 int lowest = find_next_bit(mle->maybe_map,
1085 O2NM_MAX_NODES, 0);
1086
1087 /* act like it was never there */
1088 clear_bit(node, mle->maybe_map);
1089
1090 if (node != lowest)
1091 goto next;
1092
1093 mlog(ML_ERROR, "expected master %u died while "
1094 "this node was blocked waiting on it!\n",
1095 node);
1096 lowest = find_next_bit(mle->maybe_map,
1097 O2NM_MAX_NODES,
1098 lowest+1);
1099 if (lowest < O2NM_MAX_NODES) {
1100 mlog(0, "still blocked. waiting "
1101 "on %u now\n", lowest);
1102 goto next;
1103 }
1104
1105 /* mle is an MLE_BLOCK, but there is now
1106 * nothing left to block on. we need to return
1107 * all the way back out and try again with
1108 * an MLE_MASTER. dlm_do_local_recovery_cleanup
1109 * has already run, so the mle refcount is ok */
1110 mlog(0, "no longer blocking. we can "
1111 "try to master this here\n");
1112 mle->type = DLM_MLE_MASTER;
1113 memset(mle->maybe_map, 0,
1114 sizeof(mle->maybe_map));
1115 memset(mle->response_map, 0,
1116 sizeof(mle->maybe_map));
1117 memcpy(mle->vote_map, mle->node_map,
1118 sizeof(mle->node_map));
1119 mle->u.res = res;
1120 set_bit(dlm->node_num, mle->maybe_map);
1121
1122 ret = -EAGAIN;
1123 goto next;
1124 }
1125
1126 clear_bit(node, mle->maybe_map);
1127 if (node > dlm->node_num)
1128 goto next;
1129
1130 mlog(0, "dead node in map!\n");
1131 /* yuck. go back and re-contact all nodes
1132 * in the vote_map, removing this node. */
1133 memset(mle->response_map, 0,
1134 sizeof(mle->response_map));
1135 }
1136 ret = -EAGAIN;
1137next:
1138 node = dlm_bitmap_diff_iter_next(&bdi, &sc);
1139 }
1140 return ret;
1141}
1142
1143
1144/*
1145 * DLM_MASTER_REQUEST_MSG
1146 *
1147 * returns: 0 on success,
1148 * -errno on a network error
1149 *
1150 * on error, the caller should assume the target node is "dead"
1151 *
1152 */
1153
1154static int dlm_do_master_request(struct dlm_master_list_entry *mle, int to)
1155{
1156 struct dlm_ctxt *dlm = mle->dlm;
1157 struct dlm_master_request request;
1158 int ret, response=0, resend;
1159
1160 memset(&request, 0, sizeof(request));
1161 request.node_idx = dlm->node_num;
1162
1163 BUG_ON(mle->type == DLM_MLE_MIGRATION);
1164
1165 if (mle->type != DLM_MLE_MASTER) {
1166 request.namelen = mle->u.name.len;
1167 memcpy(request.name, mle->u.name.name, request.namelen);
1168 } else {
1169 request.namelen = mle->u.res->lockname.len;
1170 memcpy(request.name, mle->u.res->lockname.name,
1171 request.namelen);
1172 }
1173
1174again:
1175 ret = o2net_send_message(DLM_MASTER_REQUEST_MSG, dlm->key, &request,
1176 sizeof(request), to, &response);
1177 if (ret < 0) {
1178 if (ret == -ESRCH) {
1179 /* should never happen */
1180 mlog(ML_ERROR, "TCP stack not ready!\n");
1181 BUG();
1182 } else if (ret == -EINVAL) {
1183 mlog(ML_ERROR, "bad args passed to o2net!\n");
1184 BUG();
1185 } else if (ret == -ENOMEM) {
1186 mlog(ML_ERROR, "out of memory while trying to send "
1187 "network message! retrying\n");
1188 /* this is totally crude */
1189 msleep(50);
1190 goto again;
1191 } else if (!dlm_is_host_down(ret)) {
1192 /* not a network error. bad. */
1193 mlog_errno(ret);
1194 mlog(ML_ERROR, "unhandled error!");
1195 BUG();
1196 }
1197 /* all other errors should be network errors,
1198 * and likely indicate node death */
1199 mlog(ML_ERROR, "link to %d went down!\n", to);
1200 goto out;
1201 }
1202
1203 ret = 0;
1204 resend = 0;
1205 spin_lock(&mle->spinlock);
1206 switch (response) {
1207 case DLM_MASTER_RESP_YES:
1208 set_bit(to, mle->response_map);
1209 mlog(0, "node %u is the master, response=YES\n", to);
1210 mle->master = to;
1211 break;
1212 case DLM_MASTER_RESP_NO:
1213 mlog(0, "node %u not master, response=NO\n", to);
1214 set_bit(to, mle->response_map);
1215 break;
1216 case DLM_MASTER_RESP_MAYBE:
1217 mlog(0, "node %u not master, response=MAYBE\n", to);
1218 set_bit(to, mle->response_map);
1219 set_bit(to, mle->maybe_map);
1220 break;
1221 case DLM_MASTER_RESP_ERROR:
1222 mlog(0, "node %u hit an error, resending\n", to);
1223 resend = 1;
1224 response = 0;
1225 break;
1226 default:
1227 mlog(ML_ERROR, "bad response! %u\n", response);
1228 BUG();
1229 }
1230 spin_unlock(&mle->spinlock);
1231 if (resend) {
1232 /* this is also totally crude */
1233 msleep(50);
1234 goto again;
1235 }
1236
1237out:
1238 return ret;
1239}
1240
1241/*
1242 * locks that can be taken here:
1243 * dlm->spinlock
1244 * res->spinlock
1245 * mle->spinlock
1246 * dlm->master_list
1247 *
1248 * if possible, TRIM THIS DOWN!!!
1249 */
1250int dlm_master_request_handler(struct o2net_msg *msg, u32 len, void *data)
1251{
1252 u8 response = DLM_MASTER_RESP_MAYBE;
1253 struct dlm_ctxt *dlm = data;
1254 struct dlm_lock_resource *res;
1255 struct dlm_master_request *request = (struct dlm_master_request *) msg->buf;
1256 struct dlm_master_list_entry *mle = NULL, *tmpmle = NULL;
1257 char *name;
1258 unsigned int namelen;
1259 int found, ret;
1260 int set_maybe;
1261
1262 if (!dlm_grab(dlm))
1263 return DLM_MASTER_RESP_NO;
1264
1265 if (!dlm_domain_fully_joined(dlm)) {
1266 response = DLM_MASTER_RESP_NO;
1267 goto send_response;
1268 }
1269
1270 name = request->name;
1271 namelen = request->namelen;
1272
1273 if (namelen > DLM_LOCKID_NAME_MAX) {
1274 response = DLM_IVBUFLEN;
1275 goto send_response;
1276 }
1277
1278way_up_top:
1279 spin_lock(&dlm->spinlock);
1280 res = __dlm_lookup_lockres(dlm, name, namelen);
1281 if (res) {
1282 spin_unlock(&dlm->spinlock);
1283
1284 /* take care of the easy cases up front */
1285 spin_lock(&res->spinlock);
1286 if (res->state & DLM_LOCK_RES_RECOVERING) {
1287 spin_unlock(&res->spinlock);
1288 mlog(0, "returning DLM_MASTER_RESP_ERROR since res is "
1289 "being recovered\n");
1290 response = DLM_MASTER_RESP_ERROR;
1291 if (mle)
1292 kmem_cache_free(dlm_mle_cache, mle);
1293 goto send_response;
1294 }
1295
1296 if (res->owner == dlm->node_num) {
1297 u32 flags = DLM_ASSERT_MASTER_MLE_CLEANUP;
1298 spin_unlock(&res->spinlock);
1299 // mlog(0, "this node is the master\n");
1300 response = DLM_MASTER_RESP_YES;
1301 if (mle)
1302 kmem_cache_free(dlm_mle_cache, mle);
1303
1304 /* this node is the owner.
1305 * there is some extra work that needs to
1306 * happen now. the requesting node has
1307 * caused all nodes up to this one to
1308 * create mles. this node now needs to
1309 * go back and clean those up. */
1310 mlog(0, "%u is the owner of %.*s, cleaning everyone else\n",
1311 dlm->node_num, res->lockname.len, res->lockname.name);
1312 ret = dlm_dispatch_assert_master(dlm, res, 1,
1313 request->node_idx,
1314 flags);
1315 if (ret < 0) {
1316 mlog(ML_ERROR, "failed to dispatch assert "
1317 "master work\n");
1318 response = DLM_MASTER_RESP_ERROR;
1319 }
1320 goto send_response;
1321 } else if (res->owner != DLM_LOCK_RES_OWNER_UNKNOWN) {
1322 spin_unlock(&res->spinlock);
1323 // mlog(0, "node %u is the master\n", res->owner);
1324 response = DLM_MASTER_RESP_NO;
1325 if (mle)
1326 kmem_cache_free(dlm_mle_cache, mle);
1327 goto send_response;
1328 }
1329
1330 /* ok, there is no owner. either this node is
1331 * being blocked, or it is actively trying to
1332 * master this lock. */
1333 if (!(res->state & DLM_LOCK_RES_IN_PROGRESS)) {
1334 mlog(ML_ERROR, "lock with no owner should be "
1335 "in-progress!\n");
1336 BUG();
1337 }
1338
1339 // mlog(0, "lockres is in progress...\n");
1340 spin_lock(&dlm->master_lock);
1341 found = dlm_find_mle(dlm, &tmpmle, name, namelen);
1342 if (!found) {
1343 mlog(ML_ERROR, "no mle found for this lock!\n");
1344 BUG();
1345 }
1346 set_maybe = 1;
1347 spin_lock(&tmpmle->spinlock);
1348 if (tmpmle->type == DLM_MLE_BLOCK) {
1349 // mlog(0, "this node is waiting for "
1350 // "lockres to be mastered\n");
1351 response = DLM_MASTER_RESP_NO;
1352 } else if (tmpmle->type == DLM_MLE_MIGRATION) {
1353 mlog(0, "node %u is master, but trying to migrate to "
1354 "node %u.\n", tmpmle->master, tmpmle->new_master);
1355 if (tmpmle->master == dlm->node_num) {
1356 response = DLM_MASTER_RESP_YES;
1357 mlog(ML_ERROR, "no owner on lockres, but this "
1358 "node is trying to migrate it to %u?!\n",
1359 tmpmle->new_master);
1360 BUG();
1361 } else {
1362 /* the real master can respond on its own */
1363 response = DLM_MASTER_RESP_NO;
1364 }
1365 } else if (tmpmle->master != DLM_LOCK_RES_OWNER_UNKNOWN) {
1366 set_maybe = 0;
1367 if (tmpmle->master == dlm->node_num)
1368 response = DLM_MASTER_RESP_YES;
1369 else
1370 response = DLM_MASTER_RESP_NO;
1371 } else {
1372 // mlog(0, "this node is attempting to "
1373 // "master lockres\n");
1374 response = DLM_MASTER_RESP_MAYBE;
1375 }
1376 if (set_maybe)
1377 set_bit(request->node_idx, tmpmle->maybe_map);
1378 spin_unlock(&tmpmle->spinlock);
1379
1380 spin_unlock(&dlm->master_lock);
1381 spin_unlock(&res->spinlock);
1382
1383 /* keep the mle attached to heartbeat events */
1384 dlm_put_mle(tmpmle);
1385 if (mle)
1386 kmem_cache_free(dlm_mle_cache, mle);
1387 goto send_response;
1388 }
1389
1390 /*
1391 * lockres doesn't exist on this node
1392 * if there is an MLE_BLOCK, return NO
1393 * if there is an MLE_MASTER, return MAYBE
1394 * otherwise, add an MLE_BLOCK, return NO
1395 */
1396 spin_lock(&dlm->master_lock);
1397 found = dlm_find_mle(dlm, &tmpmle, name, namelen);
1398 if (!found) {
1399 /* this lockid has never been seen on this node yet */
1400 // mlog(0, "no mle found\n");
1401 if (!mle) {
1402 spin_unlock(&dlm->master_lock);
1403 spin_unlock(&dlm->spinlock);
1404
1405 mle = (struct dlm_master_list_entry *)
1406 kmem_cache_alloc(dlm_mle_cache, GFP_KERNEL);
1407 if (!mle) {
1408 // bad bad bad... this sucks.
1409 response = DLM_MASTER_RESP_ERROR;
1410 goto send_response;
1411 }
1412 spin_lock(&dlm->spinlock);
1413 dlm_init_mle(mle, DLM_MLE_BLOCK, dlm, NULL,
1414 name, namelen);
1415 spin_unlock(&dlm->spinlock);
1416 goto way_up_top;
1417 }
1418
1419 // mlog(0, "this is second time thru, already allocated, "
1420 // "add the block.\n");
1421 set_bit(request->node_idx, mle->maybe_map);
1422 list_add(&mle->list, &dlm->master_list);
1423 response = DLM_MASTER_RESP_NO;
1424 } else {
1425 // mlog(0, "mle was found\n");
1426 set_maybe = 1;
1427 spin_lock(&tmpmle->spinlock);
1428 if (tmpmle->type == DLM_MLE_BLOCK)
1429 response = DLM_MASTER_RESP_NO;
1430 else if (tmpmle->type == DLM_MLE_MIGRATION) {
1431 mlog(0, "migration mle was found (%u->%u)\n",
1432 tmpmle->master, tmpmle->new_master);
1433 if (tmpmle->master == dlm->node_num) {
1434 mlog(ML_ERROR, "no lockres, but migration mle "
1435 "says that this node is master!\n");
1436 BUG();
1437 }
1438 /* real master can respond on its own */
1439 response = DLM_MASTER_RESP_NO;
1440 } else {
1441 if (tmpmle->master == dlm->node_num) {
1442 response = DLM_MASTER_RESP_YES;
1443 set_maybe = 0;
1444 } else
1445 response = DLM_MASTER_RESP_MAYBE;
1446 }
1447 if (set_maybe)
1448 set_bit(request->node_idx, tmpmle->maybe_map);
1449 spin_unlock(&tmpmle->spinlock);
1450 }
1451 spin_unlock(&dlm->master_lock);
1452 spin_unlock(&dlm->spinlock);
1453
1454 if (found) {
1455 /* keep the mle attached to heartbeat events */
1456 dlm_put_mle(tmpmle);
1457 }
1458send_response:
1459 dlm_put(dlm);
1460 return response;
1461}
1462
1463/*
1464 * DLM_ASSERT_MASTER_MSG
1465 */
1466
1467
1468/*
1469 * NOTE: this can be used for debugging
1470 * can periodically run all locks owned by this node
1471 * and re-assert across the cluster...
1472 */
1473static int dlm_do_assert_master(struct dlm_ctxt *dlm, const char *lockname,
1474 unsigned int namelen, void *nodemap,
1475 u32 flags)
1476{
1477 struct dlm_assert_master assert;
1478 int to, tmpret;
1479 struct dlm_node_iter iter;
1480 int ret = 0;
1481
1482 BUG_ON(namelen > O2NM_MAX_NAME_LEN);
1483
1484 /* note that if this nodemap is empty, it returns 0 */
1485 dlm_node_iter_init(nodemap, &iter);
1486 while ((to = dlm_node_iter_next(&iter)) >= 0) {
1487 int r = 0;
1488 mlog(0, "sending assert master to %d (%.*s)\n", to,
1489 namelen, lockname);
1490 memset(&assert, 0, sizeof(assert));
1491 assert.node_idx = dlm->node_num;
1492 assert.namelen = namelen;
1493 memcpy(assert.name, lockname, namelen);
1494 assert.flags = cpu_to_be32(flags);
1495
1496 tmpret = o2net_send_message(DLM_ASSERT_MASTER_MSG, dlm->key,
1497 &assert, sizeof(assert), to, &r);
1498 if (tmpret < 0) {
1499 mlog(ML_ERROR, "assert_master returned %d!\n", tmpret);
1500 if (!dlm_is_host_down(tmpret)) {
1501 mlog(ML_ERROR, "unhandled error!\n");
1502 BUG();
1503 }
1504 /* a node died. finish out the rest of the nodes. */
1505 mlog(ML_ERROR, "link to %d went down!\n", to);
1506 /* any nonzero status return will do */
1507 ret = tmpret;
1508 } else if (r < 0) {
1509 /* ok, something horribly messed. kill thyself. */
1510 mlog(ML_ERROR,"during assert master of %.*s to %u, "
1511 "got %d.\n", namelen, lockname, to, r);
1512 dlm_dump_lock_resources(dlm);
1513 BUG();
1514 }
1515 }
1516
1517 return ret;
1518}
1519
1520/*
1521 * locks that can be taken here:
1522 * dlm->spinlock
1523 * res->spinlock
1524 * mle->spinlock
1525 * dlm->master_list
1526 *
1527 * if possible, TRIM THIS DOWN!!!
1528 */
1529int dlm_assert_master_handler(struct o2net_msg *msg, u32 len, void *data)
1530{
1531 struct dlm_ctxt *dlm = data;
1532 struct dlm_master_list_entry *mle = NULL;
1533 struct dlm_assert_master *assert = (struct dlm_assert_master *)msg->buf;
1534 struct dlm_lock_resource *res = NULL;
1535 char *name;
1536 unsigned int namelen;
1537 u32 flags;
1538
1539 if (!dlm_grab(dlm))
1540 return 0;
1541
1542 name = assert->name;
1543 namelen = assert->namelen;
1544 flags = be32_to_cpu(assert->flags);
1545
1546 if (namelen > DLM_LOCKID_NAME_MAX) {
1547 mlog(ML_ERROR, "Invalid name length!");
1548 goto done;
1549 }
1550
1551 spin_lock(&dlm->spinlock);
1552
1553 if (flags)
1554 mlog(0, "assert_master with flags: %u\n", flags);
1555
1556 /* find the MLE */
1557 spin_lock(&dlm->master_lock);
1558 if (!dlm_find_mle(dlm, &mle, name, namelen)) {
1559 /* not an error, could be master just re-asserting */
1560 mlog(0, "just got an assert_master from %u, but no "
1561 "MLE for it! (%.*s)\n", assert->node_idx,
1562 namelen, name);
1563 } else {
1564 int bit = find_next_bit (mle->maybe_map, O2NM_MAX_NODES, 0);
1565 if (bit >= O2NM_MAX_NODES) {
1566 /* not necessarily an error, though less likely.
1567 * could be master just re-asserting. */
1568 mlog(ML_ERROR, "no bits set in the maybe_map, but %u "
1569 "is asserting! (%.*s)\n", assert->node_idx,
1570 namelen, name);
1571 } else if (bit != assert->node_idx) {
1572 if (flags & DLM_ASSERT_MASTER_MLE_CLEANUP) {
1573 mlog(0, "master %u was found, %u should "
1574 "back off\n", assert->node_idx, bit);
1575 } else {
1576 /* with the fix for bug 569, a higher node
1577 * number winning the mastery will respond
1578 * YES to mastery requests, but this node
1579 * had no way of knowing. let it pass. */
1580 mlog(ML_ERROR, "%u is the lowest node, "
1581 "%u is asserting. (%.*s) %u must "
1582 "have begun after %u won.\n", bit,
1583 assert->node_idx, namelen, name, bit,
1584 assert->node_idx);
1585 }
1586 }
1587 }
1588 spin_unlock(&dlm->master_lock);
1589
1590 /* ok everything checks out with the MLE
1591 * now check to see if there is a lockres */
1592 res = __dlm_lookup_lockres(dlm, name, namelen);
1593 if (res) {
1594 spin_lock(&res->spinlock);
1595 if (res->state & DLM_LOCK_RES_RECOVERING) {
1596 mlog(ML_ERROR, "%u asserting but %.*s is "
1597 "RECOVERING!\n", assert->node_idx, namelen, name);
1598 goto kill;
1599 }
1600 if (!mle) {
1601 if (res->owner != assert->node_idx) {
1602 mlog(ML_ERROR, "assert_master from "
1603 "%u, but current owner is "
1604 "%u! (%.*s)\n",
1605 assert->node_idx, res->owner,
1606 namelen, name);
1607 goto kill;
1608 }
1609 } else if (mle->type != DLM_MLE_MIGRATION) {
1610 if (res->owner != DLM_LOCK_RES_OWNER_UNKNOWN) {
1611 /* owner is just re-asserting */
1612 if (res->owner == assert->node_idx) {
1613 mlog(0, "owner %u re-asserting on "
1614 "lock %.*s\n", assert->node_idx,
1615 namelen, name);
1616 goto ok;
1617 }
1618 mlog(ML_ERROR, "got assert_master from "
1619 "node %u, but %u is the owner! "
1620 "(%.*s)\n", assert->node_idx,
1621 res->owner, namelen, name);
1622 goto kill;
1623 }
1624 if (!(res->state & DLM_LOCK_RES_IN_PROGRESS)) {
1625 mlog(ML_ERROR, "got assert from %u, but lock "
1626 "with no owner should be "
1627 "in-progress! (%.*s)\n",
1628 assert->node_idx,
1629 namelen, name);
1630 goto kill;
1631 }
1632 } else /* mle->type == DLM_MLE_MIGRATION */ {
1633 /* should only be getting an assert from new master */
1634 if (assert->node_idx != mle->new_master) {
1635 mlog(ML_ERROR, "got assert from %u, but "
1636 "new master is %u, and old master "
1637 "was %u (%.*s)\n",
1638 assert->node_idx, mle->new_master,
1639 mle->master, namelen, name);
1640 goto kill;
1641 }
1642
1643 }
1644ok:
1645 spin_unlock(&res->spinlock);
1646 }
1647 spin_unlock(&dlm->spinlock);
1648
1649 // mlog(0, "woo! got an assert_master from node %u!\n",
1650 // assert->node_idx);
1651 if (mle) {
1652 int extra_ref;
1653
1654 spin_lock(&mle->spinlock);
1655 extra_ref = !!(mle->type == DLM_MLE_BLOCK
1656 || mle->type == DLM_MLE_MIGRATION);
1657 mle->master = assert->node_idx;
1658 atomic_set(&mle->woken, 1);
1659 wake_up(&mle->wq);
1660 spin_unlock(&mle->spinlock);
1661
1662 if (mle->type == DLM_MLE_MIGRATION && res) {
1663 mlog(0, "finishing off migration of lockres %.*s, "
1664 "from %u to %u\n",
1665 res->lockname.len, res->lockname.name,
1666 dlm->node_num, mle->new_master);
1667 spin_lock(&res->spinlock);
1668 res->state &= ~DLM_LOCK_RES_MIGRATING;
1669 dlm_change_lockres_owner(dlm, res, mle->new_master);
1670 BUG_ON(res->state & DLM_LOCK_RES_DIRTY);
1671 spin_unlock(&res->spinlock);
1672 }
1673 /* master is known, detach if not already detached */
1674 dlm_mle_detach_hb_events(dlm, mle);
1675 dlm_put_mle(mle);
1676
1677 if (extra_ref) {
1678 /* the assert master message now balances the extra
1679 * ref given by the master / migration request message.
1680 * if this is the last put, it will be removed
1681 * from the list. */
1682 dlm_put_mle(mle);
1683 }
1684 }
1685
1686done:
1687 if (res)
1688 dlm_lockres_put(res);
1689 dlm_put(dlm);
1690 return 0;
1691
1692kill:
1693 /* kill the caller! */
1694 spin_unlock(&res->spinlock);
1695 spin_unlock(&dlm->spinlock);
1696 dlm_lockres_put(res);
1697 mlog(ML_ERROR, "Bad message received from another node. Dumping state "
1698 "and killing the other node now! This node is OK and can continue.\n");
1699 dlm_dump_lock_resources(dlm);
1700 dlm_put(dlm);
1701 return -EINVAL;
1702}
1703
1704int dlm_dispatch_assert_master(struct dlm_ctxt *dlm,
1705 struct dlm_lock_resource *res,
1706 int ignore_higher, u8 request_from, u32 flags)
1707{
1708 struct dlm_work_item *item;
1709 item = kcalloc(1, sizeof(*item), GFP_KERNEL);
1710 if (!item)
1711 return -ENOMEM;
1712
1713
1714 /* queue up work for dlm_assert_master_worker */
1715 dlm_grab(dlm); /* get an extra ref for the work item */
1716 dlm_init_work_item(dlm, item, dlm_assert_master_worker, NULL);
1717 item->u.am.lockres = res; /* already have a ref */
1718 /* can optionally ignore node numbers higher than this node */
1719 item->u.am.ignore_higher = ignore_higher;
1720 item->u.am.request_from = request_from;
1721 item->u.am.flags = flags;
1722
1723 spin_lock(&dlm->work_lock);
1724 list_add_tail(&item->list, &dlm->work_list);
1725 spin_unlock(&dlm->work_lock);
1726
1727 schedule_work(&dlm->dispatched_work);
1728 return 0;
1729}
1730
1731static void dlm_assert_master_worker(struct dlm_work_item *item, void *data)
1732{
1733 struct dlm_ctxt *dlm = data;
1734 int ret = 0;
1735 struct dlm_lock_resource *res;
1736 unsigned long nodemap[BITS_TO_LONGS(O2NM_MAX_NODES)];
1737 int ignore_higher;
1738 int bit;
1739 u8 request_from;
1740 u32 flags;
1741
1742 dlm = item->dlm;
1743 res = item->u.am.lockres;
1744 ignore_higher = item->u.am.ignore_higher;
1745 request_from = item->u.am.request_from;
1746 flags = item->u.am.flags;
1747
1748 spin_lock(&dlm->spinlock);
1749 memcpy(nodemap, dlm->domain_map, sizeof(nodemap));
1750 spin_unlock(&dlm->spinlock);
1751
1752 clear_bit(dlm->node_num, nodemap);
1753 if (ignore_higher) {
1754 /* if is this just to clear up mles for nodes below
1755 * this node, do not send the message to the original
1756 * caller or any node number higher than this */
1757 clear_bit(request_from, nodemap);
1758 bit = dlm->node_num;
1759 while (1) {
1760 bit = find_next_bit(nodemap, O2NM_MAX_NODES,
1761 bit+1);
1762 if (bit >= O2NM_MAX_NODES)
1763 break;
1764 clear_bit(bit, nodemap);
1765 }
1766 }
1767
1768 /* this call now finishes out the nodemap
1769 * even if one or more nodes die */
1770 mlog(0, "worker about to master %.*s here, this=%u\n",
1771 res->lockname.len, res->lockname.name, dlm->node_num);
1772 ret = dlm_do_assert_master(dlm, res->lockname.name,
1773 res->lockname.len,
1774 nodemap, flags);
1775 if (ret < 0) {
1776 /* no need to restart, we are done */
1777 mlog_errno(ret);
1778 }
1779
1780 dlm_lockres_put(res);
1781
1782 mlog(0, "finished with dlm_assert_master_worker\n");
1783}
1784
1785
1786/*
1787 * DLM_MIGRATE_LOCKRES
1788 */
1789
1790
1791int dlm_migrate_lockres(struct dlm_ctxt *dlm, struct dlm_lock_resource *res,
1792 u8 target)
1793{
1794 struct dlm_master_list_entry *mle = NULL;
1795 struct dlm_master_list_entry *oldmle = NULL;
1796 struct dlm_migratable_lockres *mres = NULL;
1797 int ret = -EINVAL;
1798 const char *name;
1799 unsigned int namelen;
1800 int mle_added = 0;
1801 struct list_head *queue, *iter;
1802 int i;
1803 struct dlm_lock *lock;
1804 int empty = 1;
1805
1806 if (!dlm_grab(dlm))
1807 return -EINVAL;
1808
1809 name = res->lockname.name;
1810 namelen = res->lockname.len;
1811
1812 mlog(0, "migrating %.*s to %u\n", namelen, name, target);
1813
1814 /*
1815 * ensure this lockres is a proper candidate for migration
1816 */
1817 spin_lock(&res->spinlock);
1818 if (res->owner == DLM_LOCK_RES_OWNER_UNKNOWN) {
1819 mlog(0, "cannot migrate lockres with unknown owner!\n");
1820 spin_unlock(&res->spinlock);
1821 goto leave;
1822 }
1823 if (res->owner != dlm->node_num) {
1824 mlog(0, "cannot migrate lockres this node doesn't own!\n");
1825 spin_unlock(&res->spinlock);
1826 goto leave;
1827 }
1828 mlog(0, "checking queues...\n");
1829 queue = &res->granted;
1830 for (i=0; i<3; i++) {
1831 list_for_each(iter, queue) {
1832 lock = list_entry (iter, struct dlm_lock, list);
1833 empty = 0;
1834 if (lock->ml.node == dlm->node_num) {
1835 mlog(0, "found a lock owned by this node "
1836 "still on the %s queue! will not "
1837 "migrate this lockres\n",
1838 i==0 ? "granted" :
1839 (i==1 ? "converting" : "blocked"));
1840 spin_unlock(&res->spinlock);
1841 ret = -ENOTEMPTY;
1842 goto leave;
1843 }
1844 }
1845 queue++;
1846 }
1847 mlog(0, "all locks on this lockres are nonlocal. continuing\n");
1848 spin_unlock(&res->spinlock);
1849
1850 /* no work to do */
1851 if (empty) {
1852 mlog(0, "no locks were found on this lockres! done!\n");
1853 ret = 0;
1854 goto leave;
1855 }
1856
1857 /*
1858 * preallocate up front
1859 * if this fails, abort
1860 */
1861
1862 ret = -ENOMEM;
1863 mres = (struct dlm_migratable_lockres *) __get_free_page(GFP_KERNEL);
1864 if (!mres) {
1865 mlog_errno(ret);
1866 goto leave;
1867 }
1868
1869 mle = (struct dlm_master_list_entry *) kmem_cache_alloc(dlm_mle_cache,
1870 GFP_KERNEL);
1871 if (!mle) {
1872 mlog_errno(ret);
1873 goto leave;
1874 }
1875 ret = 0;
1876
1877 /*
1878 * find a node to migrate the lockres to
1879 */
1880
1881 mlog(0, "picking a migration node\n");
1882 spin_lock(&dlm->spinlock);
1883 /* pick a new node */
1884 if (!test_bit(target, dlm->domain_map) ||
1885 target >= O2NM_MAX_NODES) {
1886 target = dlm_pick_migration_target(dlm, res);
1887 }
1888 mlog(0, "node %u chosen for migration\n", target);
1889
1890 if (target >= O2NM_MAX_NODES ||
1891 !test_bit(target, dlm->domain_map)) {
1892 /* target chosen is not alive */
1893 ret = -EINVAL;
1894 }
1895
1896 if (ret) {
1897 spin_unlock(&dlm->spinlock);
1898 goto fail;
1899 }
1900
1901 mlog(0, "continuing with target = %u\n", target);
1902
1903 /*
1904 * clear any existing master requests and
1905 * add the migration mle to the list
1906 */
1907 spin_lock(&dlm->master_lock);
1908 ret = dlm_add_migration_mle(dlm, res, mle, &oldmle, name,
1909 namelen, target, dlm->node_num);
1910 spin_unlock(&dlm->master_lock);
1911 spin_unlock(&dlm->spinlock);
1912
1913 if (ret == -EEXIST) {
1914 mlog(0, "another process is already migrating it\n");
1915 goto fail;
1916 }
1917 mle_added = 1;
1918
1919 /*
1920 * set the MIGRATING flag and flush asts
1921 * if we fail after this we need to re-dirty the lockres
1922 */
1923 if (dlm_mark_lockres_migrating(dlm, res, target) < 0) {
1924 mlog(ML_ERROR, "tried to migrate %.*s to %u, but "
1925 "the target went down.\n", res->lockname.len,
1926 res->lockname.name, target);
1927 spin_lock(&res->spinlock);
1928 res->state &= ~DLM_LOCK_RES_MIGRATING;
1929 spin_unlock(&res->spinlock);
1930 ret = -EINVAL;
1931 }
1932
1933fail:
1934 if (oldmle) {
1935 /* master is known, detach if not already detached */
1936 dlm_mle_detach_hb_events(dlm, oldmle);
1937 dlm_put_mle(oldmle);
1938 }
1939
1940 if (ret < 0) {
1941 if (mle_added) {
1942 dlm_mle_detach_hb_events(dlm, mle);
1943 dlm_put_mle(mle);
1944 } else if (mle) {
1945 kmem_cache_free(dlm_mle_cache, mle);
1946 }
1947 goto leave;
1948 }
1949
1950 /*
1951 * at this point, we have a migration target, an mle
1952 * in the master list, and the MIGRATING flag set on
1953 * the lockres
1954 */
1955
1956
1957 /* get an extra reference on the mle.
1958 * otherwise the assert_master from the new
1959 * master will destroy this.
1960 * also, make sure that all callers of dlm_get_mle
1961 * take both dlm->spinlock and dlm->master_lock */
1962 spin_lock(&dlm->spinlock);
1963 spin_lock(&dlm->master_lock);
1964 dlm_get_mle(mle);
1965 spin_unlock(&dlm->master_lock);
1966 spin_unlock(&dlm->spinlock);
1967
1968 /* notify new node and send all lock state */
1969 /* call send_one_lockres with migration flag.
1970 * this serves as notice to the target node that a
1971 * migration is starting. */
1972 ret = dlm_send_one_lockres(dlm, res, mres, target,
1973 DLM_MRES_MIGRATION);
1974
1975 if (ret < 0) {
1976 mlog(0, "migration to node %u failed with %d\n",
1977 target, ret);
1978 /* migration failed, detach and clean up mle */
1979 dlm_mle_detach_hb_events(dlm, mle);
1980 dlm_put_mle(mle);
1981 dlm_put_mle(mle);
1982 goto leave;
1983 }
1984
1985 /* at this point, the target sends a message to all nodes,
1986 * (using dlm_do_migrate_request). this node is skipped since
1987 * we had to put an mle in the list to begin the process. this
1988 * node now waits for target to do an assert master. this node
1989 * will be the last one notified, ensuring that the migration
1990 * is complete everywhere. if the target dies while this is
1991 * going on, some nodes could potentially see the target as the
1992 * master, so it is important that my recovery finds the migration
1993 * mle and sets the master to UNKNONWN. */
1994
1995
1996 /* wait for new node to assert master */
1997 while (1) {
1998 ret = wait_event_interruptible_timeout(mle->wq,
1999 (atomic_read(&mle->woken) == 1),
2000 msecs_to_jiffies(5000));
2001
2002 if (ret >= 0) {
2003 if (atomic_read(&mle->woken) == 1 ||
2004 res->owner == target)
2005 break;
2006
2007 mlog(0, "timed out during migration\n");
2008 }
2009 if (ret == -ERESTARTSYS) {
2010 /* migration failed, detach and clean up mle */
2011 dlm_mle_detach_hb_events(dlm, mle);
2012 dlm_put_mle(mle);
2013 dlm_put_mle(mle);
2014 goto leave;
2015 }
2016 /* TODO: if node died: stop, clean up, return error */
2017 }
2018
2019 /* all done, set the owner, clear the flag */
2020 spin_lock(&res->spinlock);
2021 dlm_set_lockres_owner(dlm, res, target);
2022 res->state &= ~DLM_LOCK_RES_MIGRATING;
2023 dlm_remove_nonlocal_locks(dlm, res);
2024 spin_unlock(&res->spinlock);
2025 wake_up(&res->wq);
2026
2027 /* master is known, detach if not already detached */
2028 dlm_mle_detach_hb_events(dlm, mle);
2029 dlm_put_mle(mle);
2030 ret = 0;
2031
2032 dlm_lockres_calc_usage(dlm, res);
2033
2034leave:
2035 /* re-dirty the lockres if we failed */
2036 if (ret < 0)
2037 dlm_kick_thread(dlm, res);
2038
2039 /* TODO: cleanup */
2040 if (mres)
2041 free_page((unsigned long)mres);
2042
2043 dlm_put(dlm);
2044
2045 mlog(0, "returning %d\n", ret);
2046 return ret;
2047}
2048EXPORT_SYMBOL_GPL(dlm_migrate_lockres);
2049
2050int dlm_lock_basts_flushed(struct dlm_ctxt *dlm, struct dlm_lock *lock)
2051{
2052 int ret;
2053 spin_lock(&dlm->ast_lock);
2054 spin_lock(&lock->spinlock);
2055 ret = (list_empty(&lock->bast_list) && !lock->bast_pending);
2056 spin_unlock(&lock->spinlock);
2057 spin_unlock(&dlm->ast_lock);
2058 return ret;
2059}
2060
2061static int dlm_migration_can_proceed(struct dlm_ctxt *dlm,
2062 struct dlm_lock_resource *res,
2063 u8 mig_target)
2064{
2065 int can_proceed;
2066 spin_lock(&res->spinlock);
2067 can_proceed = !!(res->state & DLM_LOCK_RES_MIGRATING);
2068 spin_unlock(&res->spinlock);
2069
2070 /* target has died, so make the caller break out of the
2071 * wait_event, but caller must recheck the domain_map */
2072 spin_lock(&dlm->spinlock);
2073 if (!test_bit(mig_target, dlm->domain_map))
2074 can_proceed = 1;
2075 spin_unlock(&dlm->spinlock);
2076 return can_proceed;
2077}
2078
2079int dlm_lockres_is_dirty(struct dlm_ctxt *dlm, struct dlm_lock_resource *res)
2080{
2081 int ret;
2082 spin_lock(&res->spinlock);
2083 ret = !!(res->state & DLM_LOCK_RES_DIRTY);
2084 spin_unlock(&res->spinlock);
2085 return ret;
2086}
2087
2088
2089static int dlm_mark_lockres_migrating(struct dlm_ctxt *dlm,
2090 struct dlm_lock_resource *res,
2091 u8 target)
2092{
2093 int ret = 0;
2094
2095 mlog(0, "dlm_mark_lockres_migrating: %.*s, from %u to %u\n",
2096 res->lockname.len, res->lockname.name, dlm->node_num,
2097 target);
2098 /* need to set MIGRATING flag on lockres. this is done by
2099 * ensuring that all asts have been flushed for this lockres. */
2100 spin_lock(&res->spinlock);
2101 BUG_ON(res->migration_pending);
2102 res->migration_pending = 1;
2103 /* strategy is to reserve an extra ast then release
2104 * it below, letting the release do all of the work */
2105 __dlm_lockres_reserve_ast(res);
2106 spin_unlock(&res->spinlock);
2107
2108 /* now flush all the pending asts.. hang out for a bit */
2109 dlm_kick_thread(dlm, res);
2110 wait_event(dlm->ast_wq, !dlm_lockres_is_dirty(dlm, res));
2111 dlm_lockres_release_ast(dlm, res);
2112
2113 mlog(0, "about to wait on migration_wq, dirty=%s\n",
2114 res->state & DLM_LOCK_RES_DIRTY ? "yes" : "no");
2115 /* if the extra ref we just put was the final one, this
2116 * will pass thru immediately. otherwise, we need to wait
2117 * for the last ast to finish. */
2118again:
2119 ret = wait_event_interruptible_timeout(dlm->migration_wq,
2120 dlm_migration_can_proceed(dlm, res, target),
2121 msecs_to_jiffies(1000));
2122 if (ret < 0) {
2123 mlog(0, "woken again: migrating? %s, dead? %s\n",
2124 res->state & DLM_LOCK_RES_MIGRATING ? "yes":"no",
2125 test_bit(target, dlm->domain_map) ? "no":"yes");
2126 } else {
2127 mlog(0, "all is well: migrating? %s, dead? %s\n",
2128 res->state & DLM_LOCK_RES_MIGRATING ? "yes":"no",
2129 test_bit(target, dlm->domain_map) ? "no":"yes");
2130 }
2131 if (!dlm_migration_can_proceed(dlm, res, target)) {
2132 mlog(0, "trying again...\n");
2133 goto again;
2134 }
2135
2136 /* did the target go down or die? */
2137 spin_lock(&dlm->spinlock);
2138 if (!test_bit(target, dlm->domain_map)) {
2139 mlog(ML_ERROR, "aha. migration target %u just went down\n",
2140 target);
2141 ret = -EHOSTDOWN;
2142 }
2143 spin_unlock(&dlm->spinlock);
2144
2145 /*
2146 * at this point:
2147 *
2148 * o the DLM_LOCK_RES_MIGRATING flag is set
2149 * o there are no pending asts on this lockres
2150 * o all processes trying to reserve an ast on this
2151 * lockres must wait for the MIGRATING flag to clear
2152 */
2153 return ret;
2154}
2155
2156/* last step in the migration process.
2157 * original master calls this to free all of the dlm_lock
2158 * structures that used to be for other nodes. */
2159static void dlm_remove_nonlocal_locks(struct dlm_ctxt *dlm,
2160 struct dlm_lock_resource *res)
2161{
2162 struct list_head *iter, *iter2;
2163 struct list_head *queue = &res->granted;
2164 int i;
2165 struct dlm_lock *lock;
2166
2167 assert_spin_locked(&res->spinlock);
2168
2169 BUG_ON(res->owner == dlm->node_num);
2170
2171 for (i=0; i<3; i++) {
2172 list_for_each_safe(iter, iter2, queue) {
2173 lock = list_entry (iter, struct dlm_lock, list);
2174 if (lock->ml.node != dlm->node_num) {
2175 mlog(0, "putting lock for node %u\n",
2176 lock->ml.node);
2177 /* be extra careful */
2178 BUG_ON(!list_empty(&lock->ast_list));
2179 BUG_ON(!list_empty(&lock->bast_list));
2180 BUG_ON(lock->ast_pending);
2181 BUG_ON(lock->bast_pending);
2182 list_del_init(&lock->list);
2183 dlm_lock_put(lock);
2184 }
2185 }
2186 queue++;
2187 }
2188}
2189
2190/* for now this is not too intelligent. we will
2191 * need stats to make this do the right thing.
2192 * this just finds the first lock on one of the
2193 * queues and uses that node as the target. */
2194static u8 dlm_pick_migration_target(struct dlm_ctxt *dlm,
2195 struct dlm_lock_resource *res)
2196{
2197 int i;
2198 struct list_head *queue = &res->granted;
2199 struct list_head *iter;
2200 struct dlm_lock *lock;
2201 int nodenum;
2202
2203 assert_spin_locked(&dlm->spinlock);
2204
2205 spin_lock(&res->spinlock);
2206 for (i=0; i<3; i++) {
2207 list_for_each(iter, queue) {
2208 /* up to the caller to make sure this node
2209 * is alive */
2210 lock = list_entry (iter, struct dlm_lock, list);
2211 if (lock->ml.node != dlm->node_num) {
2212 spin_unlock(&res->spinlock);
2213 return lock->ml.node;
2214 }
2215 }
2216 queue++;
2217 }
2218 spin_unlock(&res->spinlock);
2219 mlog(0, "have not found a suitable target yet! checking domain map\n");
2220
2221 /* ok now we're getting desperate. pick anyone alive. */
2222 nodenum = -1;
2223 while (1) {
2224 nodenum = find_next_bit(dlm->domain_map,
2225 O2NM_MAX_NODES, nodenum+1);
2226 mlog(0, "found %d in domain map\n", nodenum);
2227 if (nodenum >= O2NM_MAX_NODES)
2228 break;
2229 if (nodenum != dlm->node_num) {
2230 mlog(0, "picking %d\n", nodenum);
2231 return nodenum;
2232 }
2233 }
2234
2235 mlog(0, "giving up. no master to migrate to\n");
2236 return DLM_LOCK_RES_OWNER_UNKNOWN;
2237}
2238
2239
2240
2241/* this is called by the new master once all lockres
2242 * data has been received */
2243static int dlm_do_migrate_request(struct dlm_ctxt *dlm,
2244 struct dlm_lock_resource *res,
2245 u8 master, u8 new_master,
2246 struct dlm_node_iter *iter)
2247{
2248 struct dlm_migrate_request migrate;
2249 int ret, status = 0;
2250 int nodenum;
2251
2252 memset(&migrate, 0, sizeof(migrate));
2253 migrate.namelen = res->lockname.len;
2254 memcpy(migrate.name, res->lockname.name, migrate.namelen);
2255 migrate.new_master = new_master;
2256 migrate.master = master;
2257
2258 ret = 0;
2259
2260 /* send message to all nodes, except the master and myself */
2261 while ((nodenum = dlm_node_iter_next(iter)) >= 0) {
2262 if (nodenum == master ||
2263 nodenum == new_master)
2264 continue;
2265
2266 ret = o2net_send_message(DLM_MIGRATE_REQUEST_MSG, dlm->key,
2267 &migrate, sizeof(migrate), nodenum,
2268 &status);
2269 if (ret < 0)
2270 mlog_errno(ret);
2271 else if (status < 0) {
2272 mlog(0, "migrate request (node %u) returned %d!\n",
2273 nodenum, status);
2274 ret = status;
2275 }
2276 }
2277
2278 if (ret < 0)
2279 mlog_errno(ret);
2280
2281 mlog(0, "returning ret=%d\n", ret);
2282 return ret;
2283}
2284
2285
2286/* if there is an existing mle for this lockres, we now know who the master is.
2287 * (the one who sent us *this* message) we can clear it up right away.
2288 * since the process that put the mle on the list still has a reference to it,
2289 * we can unhash it now, set the master and wake the process. as a result,
2290 * we will have no mle in the list to start with. now we can add an mle for
2291 * the migration and this should be the only one found for those scanning the
2292 * list. */
2293int dlm_migrate_request_handler(struct o2net_msg *msg, u32 len, void *data)
2294{
2295 struct dlm_ctxt *dlm = data;
2296 struct dlm_lock_resource *res = NULL;
2297 struct dlm_migrate_request *migrate = (struct dlm_migrate_request *) msg->buf;
2298 struct dlm_master_list_entry *mle = NULL, *oldmle = NULL;
2299 const char *name;
2300 unsigned int namelen;
2301 int ret = 0;
2302
2303 if (!dlm_grab(dlm))
2304 return -EINVAL;
2305
2306 name = migrate->name;
2307 namelen = migrate->namelen;
2308
2309 /* preallocate.. if this fails, abort */
2310 mle = (struct dlm_master_list_entry *) kmem_cache_alloc(dlm_mle_cache,
2311 GFP_KERNEL);
2312
2313 if (!mle) {
2314 ret = -ENOMEM;
2315 goto leave;
2316 }
2317
2318 /* check for pre-existing lock */
2319 spin_lock(&dlm->spinlock);
2320 res = __dlm_lookup_lockres(dlm, name, namelen);
2321 spin_lock(&dlm->master_lock);
2322
2323 if (res) {
2324 spin_lock(&res->spinlock);
2325 if (res->state & DLM_LOCK_RES_RECOVERING) {
2326 /* if all is working ok, this can only mean that we got
2327 * a migrate request from a node that we now see as
2328 * dead. what can we do here? drop it to the floor? */
2329 spin_unlock(&res->spinlock);
2330 mlog(ML_ERROR, "Got a migrate request, but the "
2331 "lockres is marked as recovering!");
2332 kmem_cache_free(dlm_mle_cache, mle);
2333 ret = -EINVAL; /* need a better solution */
2334 goto unlock;
2335 }
2336 res->state |= DLM_LOCK_RES_MIGRATING;
2337 spin_unlock(&res->spinlock);
2338 }
2339
2340 /* ignore status. only nonzero status would BUG. */
2341 ret = dlm_add_migration_mle(dlm, res, mle, &oldmle,
2342 name, namelen,
2343 migrate->new_master,
2344 migrate->master);
2345
2346unlock:
2347 spin_unlock(&dlm->master_lock);
2348 spin_unlock(&dlm->spinlock);
2349
2350 if (oldmle) {
2351 /* master is known, detach if not already detached */
2352 dlm_mle_detach_hb_events(dlm, oldmle);
2353 dlm_put_mle(oldmle);
2354 }
2355
2356 if (res)
2357 dlm_lockres_put(res);
2358leave:
2359 dlm_put(dlm);
2360 return ret;
2361}
2362
2363/* must be holding dlm->spinlock and dlm->master_lock
2364 * when adding a migration mle, we can clear any other mles
2365 * in the master list because we know with certainty that
2366 * the master is "master". so we remove any old mle from
2367 * the list after setting it's master field, and then add
2368 * the new migration mle. this way we can hold with the rule
2369 * of having only one mle for a given lock name at all times. */
2370static int dlm_add_migration_mle(struct dlm_ctxt *dlm,
2371 struct dlm_lock_resource *res,
2372 struct dlm_master_list_entry *mle,
2373 struct dlm_master_list_entry **oldmle,
2374 const char *name, unsigned int namelen,
2375 u8 new_master, u8 master)
2376{
2377 int found;
2378 int ret = 0;
2379
2380 *oldmle = NULL;
2381
2382 mlog_entry_void();
2383
2384 assert_spin_locked(&dlm->spinlock);
2385 assert_spin_locked(&dlm->master_lock);
2386
2387 /* caller is responsible for any ref taken here on oldmle */
2388 found = dlm_find_mle(dlm, oldmle, (char *)name, namelen);
2389 if (found) {
2390 struct dlm_master_list_entry *tmp = *oldmle;
2391 spin_lock(&tmp->spinlock);
2392 if (tmp->type == DLM_MLE_MIGRATION) {
2393 if (master == dlm->node_num) {
2394 /* ah another process raced me to it */
2395 mlog(0, "tried to migrate %.*s, but some "
2396 "process beat me to it\n",
2397 namelen, name);
2398 ret = -EEXIST;
2399 } else {
2400 /* bad. 2 NODES are trying to migrate! */
2401 mlog(ML_ERROR, "migration error mle: "
2402 "master=%u new_master=%u // request: "
2403 "master=%u new_master=%u // "
2404 "lockres=%.*s\n",
2405 tmp->master, tmp->new_master,
2406 master, new_master,
2407 namelen, name);
2408 BUG();
2409 }
2410 } else {
2411 /* this is essentially what assert_master does */
2412 tmp->master = master;
2413 atomic_set(&tmp->woken, 1);
2414 wake_up(&tmp->wq);
2415 /* remove it from the list so that only one
2416 * mle will be found */
2417 list_del_init(&tmp->list);
2418 }
2419 spin_unlock(&tmp->spinlock);
2420 }
2421
2422 /* now add a migration mle to the tail of the list */
2423 dlm_init_mle(mle, DLM_MLE_MIGRATION, dlm, res, name, namelen);
2424 mle->new_master = new_master;
2425 mle->master = master;
2426 /* do this for consistency with other mle types */
2427 set_bit(new_master, mle->maybe_map);
2428 list_add(&mle->list, &dlm->master_list);
2429
2430 return ret;
2431}
2432
2433
2434void dlm_clean_master_list(struct dlm_ctxt *dlm, u8 dead_node)
2435{
2436 struct list_head *iter, *iter2;
2437 struct dlm_master_list_entry *mle;
2438 struct dlm_lock_resource *res;
2439
2440 mlog_entry("dlm=%s, dead node=%u\n", dlm->name, dead_node);
2441top:
2442 assert_spin_locked(&dlm->spinlock);
2443
2444 /* clean the master list */
2445 spin_lock(&dlm->master_lock);
2446 list_for_each_safe(iter, iter2, &dlm->master_list) {
2447 mle = list_entry(iter, struct dlm_master_list_entry, list);
2448
2449 BUG_ON(mle->type != DLM_MLE_BLOCK &&
2450 mle->type != DLM_MLE_MASTER &&
2451 mle->type != DLM_MLE_MIGRATION);
2452
2453 /* MASTER mles are initiated locally. the waiting
2454 * process will notice the node map change
2455 * shortly. let that happen as normal. */
2456 if (mle->type == DLM_MLE_MASTER)
2457 continue;
2458
2459
2460 /* BLOCK mles are initiated by other nodes.
2461 * need to clean up if the dead node would have
2462 * been the master. */
2463 if (mle->type == DLM_MLE_BLOCK) {
2464 int bit;
2465
2466 spin_lock(&mle->spinlock);
2467 bit = find_next_bit(mle->maybe_map, O2NM_MAX_NODES, 0);
2468 if (bit != dead_node) {
2469 mlog(0, "mle found, but dead node %u would "
2470 "not have been master\n", dead_node);
2471 spin_unlock(&mle->spinlock);
2472 } else {
2473 /* must drop the refcount by one since the
2474 * assert_master will never arrive. this
2475 * may result in the mle being unlinked and
2476 * freed, but there may still be a process
2477 * waiting in the dlmlock path which is fine. */
2478 mlog(ML_ERROR, "node %u was expected master\n",
2479 dead_node);
2480 atomic_set(&mle->woken, 1);
2481 spin_unlock(&mle->spinlock);
2482 wake_up(&mle->wq);
2483 /* final put will take care of list removal */
2484 __dlm_put_mle(mle);
2485 }
2486 continue;
2487 }
2488
2489 /* everything else is a MIGRATION mle */
2490
2491 /* the rule for MIGRATION mles is that the master
2492 * becomes UNKNOWN if *either* the original or
2493 * the new master dies. all UNKNOWN lockreses
2494 * are sent to whichever node becomes the recovery
2495 * master. the new master is responsible for
2496 * determining if there is still a master for
2497 * this lockres, or if he needs to take over
2498 * mastery. either way, this node should expect
2499 * another message to resolve this. */
2500 if (mle->master != dead_node &&
2501 mle->new_master != dead_node)
2502 continue;
2503
2504 /* if we have reached this point, this mle needs to
2505 * be removed from the list and freed. */
2506
2507 /* remove from the list early. NOTE: unlinking
2508 * list_head while in list_for_each_safe */
2509 spin_lock(&mle->spinlock);
2510 list_del_init(&mle->list);
2511 atomic_set(&mle->woken, 1);
2512 spin_unlock(&mle->spinlock);
2513 wake_up(&mle->wq);
2514
2515 mlog(0, "node %u died during migration from "
2516 "%u to %u!\n", dead_node,
2517 mle->master, mle->new_master);
2518 /* if there is a lockres associated with this
2519 * mle, find it and set its owner to UNKNOWN */
2520 res = __dlm_lookup_lockres(dlm, mle->u.name.name,
2521 mle->u.name.len);
2522 if (res) {
2523 /* unfortunately if we hit this rare case, our
2524 * lock ordering is messed. we need to drop
2525 * the master lock so that we can take the
2526 * lockres lock, meaning that we will have to
2527 * restart from the head of list. */
2528 spin_unlock(&dlm->master_lock);
2529
2530 /* move lockres onto recovery list */
2531 spin_lock(&res->spinlock);
2532 dlm_set_lockres_owner(dlm, res,
2533 DLM_LOCK_RES_OWNER_UNKNOWN);
2534 dlm_move_lockres_to_recovery_list(dlm, res);
2535 spin_unlock(&res->spinlock);
2536 dlm_lockres_put(res);
2537
2538 /* dump the mle */
2539 spin_lock(&dlm->master_lock);
2540 __dlm_put_mle(mle);
2541 spin_unlock(&dlm->master_lock);
2542
2543 /* restart */
2544 goto top;
2545 }
2546
2547 /* this may be the last reference */
2548 __dlm_put_mle(mle);
2549 }
2550 spin_unlock(&dlm->master_lock);
2551}
2552
2553
2554int dlm_finish_migration(struct dlm_ctxt *dlm, struct dlm_lock_resource *res,
2555 u8 old_master)
2556{
2557 struct dlm_node_iter iter;
2558 int ret = 0;
2559
2560 spin_lock(&dlm->spinlock);
2561 dlm_node_iter_init(dlm->domain_map, &iter);
2562 clear_bit(old_master, iter.node_map);
2563 clear_bit(dlm->node_num, iter.node_map);
2564 spin_unlock(&dlm->spinlock);
2565
2566 mlog(0, "now time to do a migrate request to other nodes\n");
2567 ret = dlm_do_migrate_request(dlm, res, old_master,
2568 dlm->node_num, &iter);
2569 if (ret < 0) {
2570 mlog_errno(ret);
2571 goto leave;
2572 }
2573
2574 mlog(0, "doing assert master of %.*s to all except the original node\n",
2575 res->lockname.len, res->lockname.name);
2576 /* this call now finishes out the nodemap
2577 * even if one or more nodes die */
2578 ret = dlm_do_assert_master(dlm, res->lockname.name,
2579 res->lockname.len, iter.node_map,
2580 DLM_ASSERT_MASTER_FINISH_MIGRATION);
2581 if (ret < 0) {
2582 /* no longer need to retry. all living nodes contacted. */
2583 mlog_errno(ret);
2584 ret = 0;
2585 }
2586
2587 memset(iter.node_map, 0, sizeof(iter.node_map));
2588 set_bit(old_master, iter.node_map);
2589 mlog(0, "doing assert master of %.*s back to %u\n",
2590 res->lockname.len, res->lockname.name, old_master);
2591 ret = dlm_do_assert_master(dlm, res->lockname.name,
2592 res->lockname.len, iter.node_map,
2593 DLM_ASSERT_MASTER_FINISH_MIGRATION);
2594 if (ret < 0) {
2595 mlog(0, "assert master to original master failed "
2596 "with %d.\n", ret);
2597 /* the only nonzero status here would be because of
2598 * a dead original node. we're done. */
2599 ret = 0;
2600 }
2601
2602 /* all done, set the owner, clear the flag */
2603 spin_lock(&res->spinlock);
2604 dlm_set_lockres_owner(dlm, res, dlm->node_num);
2605 res->state &= ~DLM_LOCK_RES_MIGRATING;
2606 spin_unlock(&res->spinlock);
2607 /* re-dirty it on the new master */
2608 dlm_kick_thread(dlm, res);
2609 wake_up(&res->wq);
2610leave:
2611 return ret;
2612}
2613
2614/*
2615 * LOCKRES AST REFCOUNT
2616 * this is integral to migration
2617 */
2618
2619/* for future intent to call an ast, reserve one ahead of time.
2620 * this should be called only after waiting on the lockres
2621 * with dlm_wait_on_lockres, and while still holding the
2622 * spinlock after the call. */
2623void __dlm_lockres_reserve_ast(struct dlm_lock_resource *res)
2624{
2625 assert_spin_locked(&res->spinlock);
2626 if (res->state & DLM_LOCK_RES_MIGRATING) {
2627 __dlm_print_one_lock_resource(res);
2628 }
2629 BUG_ON(res->state & DLM_LOCK_RES_MIGRATING);
2630
2631 atomic_inc(&res->asts_reserved);
2632}
2633
2634/*
2635 * used to drop the reserved ast, either because it went unused,
2636 * or because the ast/bast was actually called.
2637 *
2638 * also, if there is a pending migration on this lockres,
2639 * and this was the last pending ast on the lockres,
2640 * atomically set the MIGRATING flag before we drop the lock.
2641 * this is how we ensure that migration can proceed with no
2642 * asts in progress. note that it is ok if the state of the
2643 * queues is such that a lock should be granted in the future
2644 * or that a bast should be fired, because the new master will
2645 * shuffle the lists on this lockres as soon as it is migrated.
2646 */
2647void dlm_lockres_release_ast(struct dlm_ctxt *dlm,
2648 struct dlm_lock_resource *res)
2649{
2650 if (!atomic_dec_and_lock(&res->asts_reserved, &res->spinlock))
2651 return;
2652
2653 if (!res->migration_pending) {
2654 spin_unlock(&res->spinlock);
2655 return;
2656 }
2657
2658 BUG_ON(res->state & DLM_LOCK_RES_MIGRATING);
2659 res->migration_pending = 0;
2660 res->state |= DLM_LOCK_RES_MIGRATING;
2661 spin_unlock(&res->spinlock);
2662 wake_up(&res->wq);
2663 wake_up(&dlm->migration_wq);
2664}
diff --git a/fs/ocfs2/dlm/dlmrecovery.c b/fs/ocfs2/dlm/dlmrecovery.c
new file mode 100644
index 000000000000..0c8eb1093f00
--- /dev/null
+++ b/fs/ocfs2/dlm/dlmrecovery.c
@@ -0,0 +1,2132 @@
1/* -*- mode: c; c-basic-offset: 8; -*-
2 * vim: noexpandtab sw=8 ts=8 sts=0:
3 *
4 * dlmrecovery.c
5 *
6 * recovery stuff
7 *
8 * Copyright (C) 2004 Oracle. All rights reserved.
9 *
10 * This program is free software; you can redistribute it and/or
11 * modify it under the terms of the GNU General Public
12 * License as published by the Free Software Foundation; either
13 * version 2 of the License, or (at your option) any later version.
14 *
15 * This program is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
18 * General Public License for more details.
19 *
20 * You should have received a copy of the GNU General Public
21 * License along with this program; if not, write to the
22 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
23 * Boston, MA 02111-1307, USA.
24 *
25 */
26
27
28#include <linux/module.h>
29#include <linux/fs.h>
30#include <linux/types.h>
31#include <linux/slab.h>
32#include <linux/highmem.h>
33#include <linux/utsname.h>
34#include <linux/init.h>
35#include <linux/sysctl.h>
36#include <linux/random.h>
37#include <linux/blkdev.h>
38#include <linux/socket.h>
39#include <linux/inet.h>
40#include <linux/timer.h>
41#include <linux/kthread.h>
42
43
44#include "cluster/heartbeat.h"
45#include "cluster/nodemanager.h"
46#include "cluster/tcp.h"
47
48#include "dlmapi.h"
49#include "dlmcommon.h"
50#include "dlmdomain.h"
51
52#define MLOG_MASK_PREFIX (ML_DLM|ML_DLM_RECOVERY)
53#include "cluster/masklog.h"
54
55static void dlm_do_local_recovery_cleanup(struct dlm_ctxt *dlm, u8 dead_node);
56
57static int dlm_recovery_thread(void *data);
58void dlm_complete_recovery_thread(struct dlm_ctxt *dlm);
59int dlm_launch_recovery_thread(struct dlm_ctxt *dlm);
60static void dlm_kick_recovery_thread(struct dlm_ctxt *dlm);
61static int dlm_do_recovery(struct dlm_ctxt *dlm);
62
63static int dlm_pick_recovery_master(struct dlm_ctxt *dlm);
64static int dlm_remaster_locks(struct dlm_ctxt *dlm, u8 dead_node);
65static int dlm_init_recovery_area(struct dlm_ctxt *dlm, u8 dead_node);
66static int dlm_request_all_locks(struct dlm_ctxt *dlm,
67 u8 request_from, u8 dead_node);
68static void dlm_destroy_recovery_area(struct dlm_ctxt *dlm, u8 dead_node);
69
70static inline int dlm_num_locks_in_lockres(struct dlm_lock_resource *res);
71static void dlm_init_migratable_lockres(struct dlm_migratable_lockres *mres,
72 const char *lockname, int namelen,
73 int total_locks, u64 cookie,
74 u8 flags, u8 master);
75static int dlm_send_mig_lockres_msg(struct dlm_ctxt *dlm,
76 struct dlm_migratable_lockres *mres,
77 u8 send_to,
78 struct dlm_lock_resource *res,
79 int total_locks);
80static int dlm_lockres_master_requery(struct dlm_ctxt *dlm,
81 struct dlm_lock_resource *res,
82 u8 *real_master);
83static int dlm_process_recovery_data(struct dlm_ctxt *dlm,
84 struct dlm_lock_resource *res,
85 struct dlm_migratable_lockres *mres);
86static int dlm_do_master_requery(struct dlm_ctxt *dlm,
87 struct dlm_lock_resource *res,
88 u8 nodenum, u8 *real_master);
89static int dlm_send_finalize_reco_message(struct dlm_ctxt *dlm);
90static int dlm_send_all_done_msg(struct dlm_ctxt *dlm,
91 u8 dead_node, u8 send_to);
92static int dlm_send_begin_reco_message(struct dlm_ctxt *dlm, u8 dead_node);
93static void dlm_move_reco_locks_to_list(struct dlm_ctxt *dlm,
94 struct list_head *list, u8 dead_node);
95static void dlm_finish_local_lockres_recovery(struct dlm_ctxt *dlm,
96 u8 dead_node, u8 new_master);
97static void dlm_reco_ast(void *astdata);
98static void dlm_reco_bast(void *astdata, int blocked_type);
99static void dlm_reco_unlock_ast(void *astdata, enum dlm_status st);
100static void dlm_request_all_locks_worker(struct dlm_work_item *item,
101 void *data);
102static void dlm_mig_lockres_worker(struct dlm_work_item *item, void *data);
103
104static u64 dlm_get_next_mig_cookie(void);
105
106static spinlock_t dlm_reco_state_lock = SPIN_LOCK_UNLOCKED;
107static spinlock_t dlm_mig_cookie_lock = SPIN_LOCK_UNLOCKED;
108static u64 dlm_mig_cookie = 1;
109
110static u64 dlm_get_next_mig_cookie(void)
111{
112 u64 c;
113 spin_lock(&dlm_mig_cookie_lock);
114 c = dlm_mig_cookie;
115 if (dlm_mig_cookie == (~0ULL))
116 dlm_mig_cookie = 1;
117 else
118 dlm_mig_cookie++;
119 spin_unlock(&dlm_mig_cookie_lock);
120 return c;
121}
122
123static inline void dlm_reset_recovery(struct dlm_ctxt *dlm)
124{
125 spin_lock(&dlm->spinlock);
126 clear_bit(dlm->reco.dead_node, dlm->recovery_map);
127 dlm->reco.dead_node = O2NM_INVALID_NODE_NUM;
128 dlm->reco.new_master = O2NM_INVALID_NODE_NUM;
129 spin_unlock(&dlm->spinlock);
130}
131
132/* Worker function used during recovery. */
133void dlm_dispatch_work(void *data)
134{
135 struct dlm_ctxt *dlm = (struct dlm_ctxt *)data;
136 LIST_HEAD(tmp_list);
137 struct list_head *iter, *iter2;
138 struct dlm_work_item *item;
139 dlm_workfunc_t *workfunc;
140
141 spin_lock(&dlm->work_lock);
142 list_splice_init(&dlm->work_list, &tmp_list);
143 spin_unlock(&dlm->work_lock);
144
145 list_for_each_safe(iter, iter2, &tmp_list) {
146 item = list_entry(iter, struct dlm_work_item, list);
147 workfunc = item->func;
148 list_del_init(&item->list);
149
150 /* already have ref on dlm to avoid having
151 * it disappear. just double-check. */
152 BUG_ON(item->dlm != dlm);
153
154 /* this is allowed to sleep and
155 * call network stuff */
156 workfunc(item, item->data);
157
158 dlm_put(dlm);
159 kfree(item);
160 }
161}
162
163/*
164 * RECOVERY THREAD
165 */
166
167static void dlm_kick_recovery_thread(struct dlm_ctxt *dlm)
168{
169 /* wake the recovery thread
170 * this will wake the reco thread in one of three places
171 * 1) sleeping with no recovery happening
172 * 2) sleeping with recovery mastered elsewhere
173 * 3) recovery mastered here, waiting on reco data */
174
175 wake_up(&dlm->dlm_reco_thread_wq);
176}
177
178/* Launch the recovery thread */
179int dlm_launch_recovery_thread(struct dlm_ctxt *dlm)
180{
181 mlog(0, "starting dlm recovery thread...\n");
182
183 dlm->dlm_reco_thread_task = kthread_run(dlm_recovery_thread, dlm,
184 "dlm_reco_thread");
185 if (IS_ERR(dlm->dlm_reco_thread_task)) {
186 mlog_errno(PTR_ERR(dlm->dlm_reco_thread_task));
187 dlm->dlm_reco_thread_task = NULL;
188 return -EINVAL;
189 }
190
191 return 0;
192}
193
194void dlm_complete_recovery_thread(struct dlm_ctxt *dlm)
195{
196 if (dlm->dlm_reco_thread_task) {
197 mlog(0, "waiting for dlm recovery thread to exit\n");
198 kthread_stop(dlm->dlm_reco_thread_task);
199 dlm->dlm_reco_thread_task = NULL;
200 }
201}
202
203
204
205/*
206 * this is lame, but here's how recovery works...
207 * 1) all recovery threads cluster wide will work on recovering
208 * ONE node at a time
209 * 2) negotiate who will take over all the locks for the dead node.
210 * that's right... ALL the locks.
211 * 3) once a new master is chosen, everyone scans all locks
212 * and moves aside those mastered by the dead guy
213 * 4) each of these locks should be locked until recovery is done
214 * 5) the new master collects up all of secondary lock queue info
215 * one lock at a time, forcing each node to communicate back
216 * before continuing
217 * 6) each secondary lock queue responds with the full known lock info
218 * 7) once the new master has run all its locks, it sends an ALLDONE!
219 * message to everyone
220 * 8) upon receiving this message, the secondary queue node unlocks
221 * and responds to the ALLDONE
222 * 9) once the new master gets responses from everyone, he unlocks
223 * everything and recovery for this dead node is done
224 *10) go back to 2) while there are still dead nodes
225 *
226 */
227
228
229#define DLM_RECO_THREAD_TIMEOUT_MS (5 * 1000)
230
231static int dlm_recovery_thread(void *data)
232{
233 int status;
234 struct dlm_ctxt *dlm = data;
235 unsigned long timeout = msecs_to_jiffies(DLM_RECO_THREAD_TIMEOUT_MS);
236
237 mlog(0, "dlm thread running for %s...\n", dlm->name);
238
239 while (!kthread_should_stop()) {
240 if (dlm_joined(dlm)) {
241 status = dlm_do_recovery(dlm);
242 if (status == -EAGAIN) {
243 /* do not sleep, recheck immediately. */
244 continue;
245 }
246 if (status < 0)
247 mlog_errno(status);
248 }
249
250 wait_event_interruptible_timeout(dlm->dlm_reco_thread_wq,
251 kthread_should_stop(),
252 timeout);
253 }
254
255 mlog(0, "quitting DLM recovery thread\n");
256 return 0;
257}
258
259/* callers of the top-level api calls (dlmlock/dlmunlock) should
260 * block on the dlm->reco.event when recovery is in progress.
261 * the dlm recovery thread will set this state when it begins
262 * recovering a dead node (as the new master or not) and clear
263 * the state and wake as soon as all affected lock resources have
264 * been marked with the RECOVERY flag */
265static int dlm_in_recovery(struct dlm_ctxt *dlm)
266{
267 int in_recovery;
268 spin_lock(&dlm->spinlock);
269 in_recovery = !!(dlm->reco.state & DLM_RECO_STATE_ACTIVE);
270 spin_unlock(&dlm->spinlock);
271 return in_recovery;
272}
273
274
275void dlm_wait_for_recovery(struct dlm_ctxt *dlm)
276{
277 wait_event(dlm->reco.event, !dlm_in_recovery(dlm));
278}
279
280static void dlm_begin_recovery(struct dlm_ctxt *dlm)
281{
282 spin_lock(&dlm->spinlock);
283 BUG_ON(dlm->reco.state & DLM_RECO_STATE_ACTIVE);
284 dlm->reco.state |= DLM_RECO_STATE_ACTIVE;
285 spin_unlock(&dlm->spinlock);
286}
287
288static void dlm_end_recovery(struct dlm_ctxt *dlm)
289{
290 spin_lock(&dlm->spinlock);
291 BUG_ON(!(dlm->reco.state & DLM_RECO_STATE_ACTIVE));
292 dlm->reco.state &= ~DLM_RECO_STATE_ACTIVE;
293 spin_unlock(&dlm->spinlock);
294 wake_up(&dlm->reco.event);
295}
296
297static int dlm_do_recovery(struct dlm_ctxt *dlm)
298{
299 int status = 0;
300
301 spin_lock(&dlm->spinlock);
302
303 /* check to see if the new master has died */
304 if (dlm->reco.new_master != O2NM_INVALID_NODE_NUM &&
305 test_bit(dlm->reco.new_master, dlm->recovery_map)) {
306 mlog(0, "new master %u died while recovering %u!\n",
307 dlm->reco.new_master, dlm->reco.dead_node);
308 /* unset the new_master, leave dead_node */
309 dlm->reco.new_master = O2NM_INVALID_NODE_NUM;
310 }
311
312 /* select a target to recover */
313 if (dlm->reco.dead_node == O2NM_INVALID_NODE_NUM) {
314 int bit;
315
316 bit = find_next_bit (dlm->recovery_map, O2NM_MAX_NODES+1, 0);
317 if (bit >= O2NM_MAX_NODES || bit < 0)
318 dlm->reco.dead_node = O2NM_INVALID_NODE_NUM;
319 else
320 dlm->reco.dead_node = bit;
321 } else if (!test_bit(dlm->reco.dead_node, dlm->recovery_map)) {
322 /* BUG? */
323 mlog(ML_ERROR, "dead_node %u no longer in recovery map!\n",
324 dlm->reco.dead_node);
325 dlm->reco.dead_node = O2NM_INVALID_NODE_NUM;
326 }
327
328 if (dlm->reco.dead_node == O2NM_INVALID_NODE_NUM) {
329 // mlog(0, "nothing to recover! sleeping now!\n");
330 spin_unlock(&dlm->spinlock);
331 /* return to main thread loop and sleep. */
332 return 0;
333 }
334 mlog(0, "recovery thread found node %u in the recovery map!\n",
335 dlm->reco.dead_node);
336 spin_unlock(&dlm->spinlock);
337
338 /* take write barrier */
339 /* (stops the list reshuffling thread, proxy ast handling) */
340 dlm_begin_recovery(dlm);
341
342 if (dlm->reco.new_master == dlm->node_num)
343 goto master_here;
344
345 if (dlm->reco.new_master == O2NM_INVALID_NODE_NUM) {
346 /* choose a new master */
347 if (!dlm_pick_recovery_master(dlm)) {
348 /* already notified everyone. go. */
349 dlm->reco.new_master = dlm->node_num;
350 goto master_here;
351 }
352 mlog(0, "another node will master this recovery session.\n");
353 }
354 mlog(0, "dlm=%s, new_master=%u, this node=%u, dead_node=%u\n",
355 dlm->name, dlm->reco.new_master,
356 dlm->node_num, dlm->reco.dead_node);
357
358 /* it is safe to start everything back up here
359 * because all of the dead node's lock resources
360 * have been marked as in-recovery */
361 dlm_end_recovery(dlm);
362
363 /* sleep out in main dlm_recovery_thread loop. */
364 return 0;
365
366master_here:
367 mlog(0, "mastering recovery of %s:%u here(this=%u)!\n",
368 dlm->name, dlm->reco.dead_node, dlm->node_num);
369
370 status = dlm_remaster_locks(dlm, dlm->reco.dead_node);
371 if (status < 0) {
372 mlog(ML_ERROR, "error %d remastering locks for node %u, "
373 "retrying.\n", status, dlm->reco.dead_node);
374 } else {
375 /* success! see if any other nodes need recovery */
376 dlm_reset_recovery(dlm);
377 }
378 dlm_end_recovery(dlm);
379
380 /* continue and look for another dead node */
381 return -EAGAIN;
382}
383
384static int dlm_remaster_locks(struct dlm_ctxt *dlm, u8 dead_node)
385{
386 int status = 0;
387 struct dlm_reco_node_data *ndata;
388 struct list_head *iter;
389 int all_nodes_done;
390 int destroy = 0;
391 int pass = 0;
392
393 status = dlm_init_recovery_area(dlm, dead_node);
394 if (status < 0)
395 goto leave;
396
397 /* safe to access the node data list without a lock, since this
398 * process is the only one to change the list */
399 list_for_each(iter, &dlm->reco.node_data) {
400 ndata = list_entry (iter, struct dlm_reco_node_data, list);
401 BUG_ON(ndata->state != DLM_RECO_NODE_DATA_INIT);
402 ndata->state = DLM_RECO_NODE_DATA_REQUESTING;
403
404 mlog(0, "requesting lock info from node %u\n",
405 ndata->node_num);
406
407 if (ndata->node_num == dlm->node_num) {
408 ndata->state = DLM_RECO_NODE_DATA_DONE;
409 continue;
410 }
411
412 status = dlm_request_all_locks(dlm, ndata->node_num, dead_node);
413 if (status < 0) {
414 mlog_errno(status);
415 if (dlm_is_host_down(status))
416 ndata->state = DLM_RECO_NODE_DATA_DEAD;
417 else {
418 destroy = 1;
419 goto leave;
420 }
421 }
422
423 switch (ndata->state) {
424 case DLM_RECO_NODE_DATA_INIT:
425 case DLM_RECO_NODE_DATA_FINALIZE_SENT:
426 case DLM_RECO_NODE_DATA_REQUESTED:
427 BUG();
428 break;
429 case DLM_RECO_NODE_DATA_DEAD:
430 mlog(0, "node %u died after requesting "
431 "recovery info for node %u\n",
432 ndata->node_num, dead_node);
433 // start all over
434 destroy = 1;
435 status = -EAGAIN;
436 goto leave;
437 case DLM_RECO_NODE_DATA_REQUESTING:
438 ndata->state = DLM_RECO_NODE_DATA_REQUESTED;
439 mlog(0, "now receiving recovery data from "
440 "node %u for dead node %u\n",
441 ndata->node_num, dead_node);
442 break;
443 case DLM_RECO_NODE_DATA_RECEIVING:
444 mlog(0, "already receiving recovery data from "
445 "node %u for dead node %u\n",
446 ndata->node_num, dead_node);
447 break;
448 case DLM_RECO_NODE_DATA_DONE:
449 mlog(0, "already DONE receiving recovery data "
450 "from node %u for dead node %u\n",
451 ndata->node_num, dead_node);
452 break;
453 }
454 }
455
456 mlog(0, "done requesting all lock info\n");
457
458 /* nodes should be sending reco data now
459 * just need to wait */
460
461 while (1) {
462 /* check all the nodes now to see if we are
463 * done, or if anyone died */
464 all_nodes_done = 1;
465 spin_lock(&dlm_reco_state_lock);
466 list_for_each(iter, &dlm->reco.node_data) {
467 ndata = list_entry (iter, struct dlm_reco_node_data, list);
468
469 mlog(0, "checking recovery state of node %u\n",
470 ndata->node_num);
471 switch (ndata->state) {
472 case DLM_RECO_NODE_DATA_INIT:
473 case DLM_RECO_NODE_DATA_REQUESTING:
474 mlog(ML_ERROR, "bad ndata state for "
475 "node %u: state=%d\n",
476 ndata->node_num, ndata->state);
477 BUG();
478 break;
479 case DLM_RECO_NODE_DATA_DEAD:
480 mlog(0, "node %u died after "
481 "requesting recovery info for "
482 "node %u\n", ndata->node_num,
483 dead_node);
484 spin_unlock(&dlm_reco_state_lock);
485 // start all over
486 destroy = 1;
487 status = -EAGAIN;
488 goto leave;
489 case DLM_RECO_NODE_DATA_RECEIVING:
490 case DLM_RECO_NODE_DATA_REQUESTED:
491 all_nodes_done = 0;
492 break;
493 case DLM_RECO_NODE_DATA_DONE:
494 break;
495 case DLM_RECO_NODE_DATA_FINALIZE_SENT:
496 break;
497 }
498 }
499 spin_unlock(&dlm_reco_state_lock);
500
501 mlog(0, "pass #%d, all_nodes_done?: %s\n", ++pass,
502 all_nodes_done?"yes":"no");
503 if (all_nodes_done) {
504 int ret;
505
506 /* all nodes are now in DLM_RECO_NODE_DATA_DONE state
507 * just send a finalize message to everyone and
508 * clean up */
509 mlog(0, "all nodes are done! send finalize\n");
510 ret = dlm_send_finalize_reco_message(dlm);
511 if (ret < 0)
512 mlog_errno(ret);
513
514 spin_lock(&dlm->spinlock);
515 dlm_finish_local_lockres_recovery(dlm, dead_node,
516 dlm->node_num);
517 spin_unlock(&dlm->spinlock);
518 mlog(0, "should be done with recovery!\n");
519
520 mlog(0, "finishing recovery of %s at %lu, "
521 "dead=%u, this=%u, new=%u\n", dlm->name,
522 jiffies, dlm->reco.dead_node,
523 dlm->node_num, dlm->reco.new_master);
524 destroy = 1;
525 status = ret;
526 /* rescan everything marked dirty along the way */
527 dlm_kick_thread(dlm, NULL);
528 break;
529 }
530 /* wait to be signalled, with periodic timeout
531 * to check for node death */
532 wait_event_interruptible_timeout(dlm->dlm_reco_thread_wq,
533 kthread_should_stop(),
534 msecs_to_jiffies(DLM_RECO_THREAD_TIMEOUT_MS));
535
536 }
537
538leave:
539 if (destroy)
540 dlm_destroy_recovery_area(dlm, dead_node);
541
542 mlog_exit(status);
543 return status;
544}
545
546static int dlm_init_recovery_area(struct dlm_ctxt *dlm, u8 dead_node)
547{
548 int num=0;
549 struct dlm_reco_node_data *ndata;
550
551 spin_lock(&dlm->spinlock);
552 memcpy(dlm->reco.node_map, dlm->domain_map, sizeof(dlm->domain_map));
553 /* nodes can only be removed (by dying) after dropping
554 * this lock, and death will be trapped later, so this should do */
555 spin_unlock(&dlm->spinlock);
556
557 while (1) {
558 num = find_next_bit (dlm->reco.node_map, O2NM_MAX_NODES, num);
559 if (num >= O2NM_MAX_NODES) {
560 break;
561 }
562 BUG_ON(num == dead_node);
563
564 ndata = kcalloc(1, sizeof(*ndata), GFP_KERNEL);
565 if (!ndata) {
566 dlm_destroy_recovery_area(dlm, dead_node);
567 return -ENOMEM;
568 }
569 ndata->node_num = num;
570 ndata->state = DLM_RECO_NODE_DATA_INIT;
571 spin_lock(&dlm_reco_state_lock);
572 list_add_tail(&ndata->list, &dlm->reco.node_data);
573 spin_unlock(&dlm_reco_state_lock);
574 num++;
575 }
576
577 return 0;
578}
579
580static void dlm_destroy_recovery_area(struct dlm_ctxt *dlm, u8 dead_node)
581{
582 struct list_head *iter, *iter2;
583 struct dlm_reco_node_data *ndata;
584 LIST_HEAD(tmplist);
585
586 spin_lock(&dlm_reco_state_lock);
587 list_splice_init(&dlm->reco.node_data, &tmplist);
588 spin_unlock(&dlm_reco_state_lock);
589
590 list_for_each_safe(iter, iter2, &tmplist) {
591 ndata = list_entry (iter, struct dlm_reco_node_data, list);
592 list_del_init(&ndata->list);
593 kfree(ndata);
594 }
595}
596
597static int dlm_request_all_locks(struct dlm_ctxt *dlm, u8 request_from,
598 u8 dead_node)
599{
600 struct dlm_lock_request lr;
601 enum dlm_status ret;
602
603 mlog(0, "\n");
604
605
606 mlog(0, "dlm_request_all_locks: dead node is %u, sending request "
607 "to %u\n", dead_node, request_from);
608
609 memset(&lr, 0, sizeof(lr));
610 lr.node_idx = dlm->node_num;
611 lr.dead_node = dead_node;
612
613 // send message
614 ret = DLM_NOLOCKMGR;
615 ret = o2net_send_message(DLM_LOCK_REQUEST_MSG, dlm->key,
616 &lr, sizeof(lr), request_from, NULL);
617
618 /* negative status is handled by caller */
619 if (ret < 0)
620 mlog_errno(ret);
621
622 // return from here, then
623 // sleep until all received or error
624 return ret;
625
626}
627
628int dlm_request_all_locks_handler(struct o2net_msg *msg, u32 len, void *data)
629{
630 struct dlm_ctxt *dlm = data;
631 struct dlm_lock_request *lr = (struct dlm_lock_request *)msg->buf;
632 char *buf = NULL;
633 struct dlm_work_item *item = NULL;
634
635 if (!dlm_grab(dlm))
636 return -EINVAL;
637
638 BUG_ON(lr->dead_node != dlm->reco.dead_node);
639
640 item = kcalloc(1, sizeof(*item), GFP_KERNEL);
641 if (!item) {
642 dlm_put(dlm);
643 return -ENOMEM;
644 }
645
646 /* this will get freed by dlm_request_all_locks_worker */
647 buf = (char *) __get_free_page(GFP_KERNEL);
648 if (!buf) {
649 kfree(item);
650 dlm_put(dlm);
651 return -ENOMEM;
652 }
653
654 /* queue up work for dlm_request_all_locks_worker */
655 dlm_grab(dlm); /* get an extra ref for the work item */
656 dlm_init_work_item(dlm, item, dlm_request_all_locks_worker, buf);
657 item->u.ral.reco_master = lr->node_idx;
658 item->u.ral.dead_node = lr->dead_node;
659 spin_lock(&dlm->work_lock);
660 list_add_tail(&item->list, &dlm->work_list);
661 spin_unlock(&dlm->work_lock);
662 schedule_work(&dlm->dispatched_work);
663
664 dlm_put(dlm);
665 return 0;
666}
667
668static void dlm_request_all_locks_worker(struct dlm_work_item *item, void *data)
669{
670 struct dlm_migratable_lockres *mres;
671 struct dlm_lock_resource *res;
672 struct dlm_ctxt *dlm;
673 LIST_HEAD(resources);
674 struct list_head *iter;
675 int ret;
676 u8 dead_node, reco_master;
677
678 dlm = item->dlm;
679 dead_node = item->u.ral.dead_node;
680 reco_master = item->u.ral.reco_master;
681 BUG_ON(dead_node != dlm->reco.dead_node);
682 BUG_ON(reco_master != dlm->reco.new_master);
683
684 mres = (struct dlm_migratable_lockres *)data;
685
686 /* lock resources should have already been moved to the
687 * dlm->reco.resources list. now move items from that list
688 * to a temp list if the dead owner matches. note that the
689 * whole cluster recovers only one node at a time, so we
690 * can safely move UNKNOWN lock resources for each recovery
691 * session. */
692 dlm_move_reco_locks_to_list(dlm, &resources, dead_node);
693
694 /* now we can begin blasting lockreses without the dlm lock */
695 list_for_each(iter, &resources) {
696 res = list_entry (iter, struct dlm_lock_resource, recovering);
697 ret = dlm_send_one_lockres(dlm, res, mres, reco_master,
698 DLM_MRES_RECOVERY);
699 if (ret < 0)
700 mlog_errno(ret);
701 }
702
703 /* move the resources back to the list */
704 spin_lock(&dlm->spinlock);
705 list_splice_init(&resources, &dlm->reco.resources);
706 spin_unlock(&dlm->spinlock);
707
708 ret = dlm_send_all_done_msg(dlm, dead_node, reco_master);
709 if (ret < 0)
710 mlog_errno(ret);
711
712 free_page((unsigned long)data);
713}
714
715
716static int dlm_send_all_done_msg(struct dlm_ctxt *dlm, u8 dead_node, u8 send_to)
717{
718 int ret, tmpret;
719 struct dlm_reco_data_done done_msg;
720
721 memset(&done_msg, 0, sizeof(done_msg));
722 done_msg.node_idx = dlm->node_num;
723 done_msg.dead_node = dead_node;
724 mlog(0, "sending DATA DONE message to %u, "
725 "my node=%u, dead node=%u\n", send_to, done_msg.node_idx,
726 done_msg.dead_node);
727
728 ret = o2net_send_message(DLM_RECO_DATA_DONE_MSG, dlm->key, &done_msg,
729 sizeof(done_msg), send_to, &tmpret);
730 /* negative status is ignored by the caller */
731 if (ret >= 0)
732 ret = tmpret;
733 return ret;
734}
735
736
737int dlm_reco_data_done_handler(struct o2net_msg *msg, u32 len, void *data)
738{
739 struct dlm_ctxt *dlm = data;
740 struct dlm_reco_data_done *done = (struct dlm_reco_data_done *)msg->buf;
741 struct list_head *iter;
742 struct dlm_reco_node_data *ndata = NULL;
743 int ret = -EINVAL;
744
745 if (!dlm_grab(dlm))
746 return -EINVAL;
747
748 mlog(0, "got DATA DONE: dead_node=%u, reco.dead_node=%u, "
749 "node_idx=%u, this node=%u\n", done->dead_node,
750 dlm->reco.dead_node, done->node_idx, dlm->node_num);
751 BUG_ON(done->dead_node != dlm->reco.dead_node);
752
753 spin_lock(&dlm_reco_state_lock);
754 list_for_each(iter, &dlm->reco.node_data) {
755 ndata = list_entry (iter, struct dlm_reco_node_data, list);
756 if (ndata->node_num != done->node_idx)
757 continue;
758
759 switch (ndata->state) {
760 case DLM_RECO_NODE_DATA_INIT:
761 case DLM_RECO_NODE_DATA_DEAD:
762 case DLM_RECO_NODE_DATA_DONE:
763 case DLM_RECO_NODE_DATA_FINALIZE_SENT:
764 mlog(ML_ERROR, "bad ndata state for node %u:"
765 " state=%d\n", ndata->node_num,
766 ndata->state);
767 BUG();
768 break;
769 case DLM_RECO_NODE_DATA_RECEIVING:
770 case DLM_RECO_NODE_DATA_REQUESTED:
771 case DLM_RECO_NODE_DATA_REQUESTING:
772 mlog(0, "node %u is DONE sending "
773 "recovery data!\n",
774 ndata->node_num);
775
776 ndata->state = DLM_RECO_NODE_DATA_DONE;
777 ret = 0;
778 break;
779 }
780 }
781 spin_unlock(&dlm_reco_state_lock);
782
783 /* wake the recovery thread, some node is done */
784 if (!ret)
785 dlm_kick_recovery_thread(dlm);
786
787 if (ret < 0)
788 mlog(ML_ERROR, "failed to find recovery node data for node "
789 "%u\n", done->node_idx);
790 dlm_put(dlm);
791
792 mlog(0, "leaving reco data done handler, ret=%d\n", ret);
793 return ret;
794}
795
796static void dlm_move_reco_locks_to_list(struct dlm_ctxt *dlm,
797 struct list_head *list,
798 u8 dead_node)
799{
800 struct dlm_lock_resource *res;
801 struct list_head *iter, *iter2;
802
803 spin_lock(&dlm->spinlock);
804 list_for_each_safe(iter, iter2, &dlm->reco.resources) {
805 res = list_entry (iter, struct dlm_lock_resource, recovering);
806 if (dlm_is_recovery_lock(res->lockname.name,
807 res->lockname.len))
808 continue;
809 if (res->owner == dead_node) {
810 mlog(0, "found lockres owned by dead node while "
811 "doing recovery for node %u. sending it.\n",
812 dead_node);
813 list_del_init(&res->recovering);
814 list_add_tail(&res->recovering, list);
815 } else if (res->owner == DLM_LOCK_RES_OWNER_UNKNOWN) {
816 mlog(0, "found UNKNOWN owner while doing recovery "
817 "for node %u. sending it.\n", dead_node);
818 list_del_init(&res->recovering);
819 list_add_tail(&res->recovering, list);
820 }
821 }
822 spin_unlock(&dlm->spinlock);
823}
824
825static inline int dlm_num_locks_in_lockres(struct dlm_lock_resource *res)
826{
827 int total_locks = 0;
828 struct list_head *iter, *queue = &res->granted;
829 int i;
830
831 for (i=0; i<3; i++) {
832 list_for_each(iter, queue)
833 total_locks++;
834 queue++;
835 }
836 return total_locks;
837}
838
839
840static int dlm_send_mig_lockres_msg(struct dlm_ctxt *dlm,
841 struct dlm_migratable_lockres *mres,
842 u8 send_to,
843 struct dlm_lock_resource *res,
844 int total_locks)
845{
846 u64 mig_cookie = be64_to_cpu(mres->mig_cookie);
847 int mres_total_locks = be32_to_cpu(mres->total_locks);
848 int sz, ret = 0, status = 0;
849 u8 orig_flags = mres->flags,
850 orig_master = mres->master;
851
852 BUG_ON(mres->num_locks > DLM_MAX_MIGRATABLE_LOCKS);
853 if (!mres->num_locks)
854 return 0;
855
856 sz = sizeof(struct dlm_migratable_lockres) +
857 (mres->num_locks * sizeof(struct dlm_migratable_lock));
858
859 /* add an all-done flag if we reached the last lock */
860 orig_flags = mres->flags;
861 BUG_ON(total_locks > mres_total_locks);
862 if (total_locks == mres_total_locks)
863 mres->flags |= DLM_MRES_ALL_DONE;
864
865 /* send it */
866 ret = o2net_send_message(DLM_MIG_LOCKRES_MSG, dlm->key, mres,
867 sz, send_to, &status);
868 if (ret < 0) {
869 /* XXX: negative status is not handled.
870 * this will end up killing this node. */
871 mlog_errno(ret);
872 } else {
873 /* might get an -ENOMEM back here */
874 ret = status;
875 if (ret < 0) {
876 mlog_errno(ret);
877
878 if (ret == -EFAULT) {
879 mlog(ML_ERROR, "node %u told me to kill "
880 "myself!\n", send_to);
881 BUG();
882 }
883 }
884 }
885
886 /* zero and reinit the message buffer */
887 dlm_init_migratable_lockres(mres, res->lockname.name,
888 res->lockname.len, mres_total_locks,
889 mig_cookie, orig_flags, orig_master);
890 return ret;
891}
892
893static void dlm_init_migratable_lockres(struct dlm_migratable_lockres *mres,
894 const char *lockname, int namelen,
895 int total_locks, u64 cookie,
896 u8 flags, u8 master)
897{
898 /* mres here is one full page */
899 memset(mres, 0, PAGE_SIZE);
900 mres->lockname_len = namelen;
901 memcpy(mres->lockname, lockname, namelen);
902 mres->num_locks = 0;
903 mres->total_locks = cpu_to_be32(total_locks);
904 mres->mig_cookie = cpu_to_be64(cookie);
905 mres->flags = flags;
906 mres->master = master;
907}
908
909
910/* returns 1 if this lock fills the network structure,
911 * 0 otherwise */
912static int dlm_add_lock_to_array(struct dlm_lock *lock,
913 struct dlm_migratable_lockres *mres, int queue)
914{
915 struct dlm_migratable_lock *ml;
916 int lock_num = mres->num_locks;
917
918 ml = &(mres->ml[lock_num]);
919 ml->cookie = lock->ml.cookie;
920 ml->type = lock->ml.type;
921 ml->convert_type = lock->ml.convert_type;
922 ml->highest_blocked = lock->ml.highest_blocked;
923 ml->list = queue;
924 if (lock->lksb) {
925 ml->flags = lock->lksb->flags;
926 /* send our current lvb */
927 if (ml->type == LKM_EXMODE ||
928 ml->type == LKM_PRMODE) {
929 /* if it is already set, this had better be a PR
930 * and it has to match */
931 if (mres->lvb[0] && (ml->type == LKM_EXMODE ||
932 memcmp(mres->lvb, lock->lksb->lvb, DLM_LVB_LEN))) {
933 mlog(ML_ERROR, "mismatched lvbs!\n");
934 __dlm_print_one_lock_resource(lock->lockres);
935 BUG();
936 }
937 memcpy(mres->lvb, lock->lksb->lvb, DLM_LVB_LEN);
938 }
939 }
940 ml->node = lock->ml.node;
941 mres->num_locks++;
942 /* we reached the max, send this network message */
943 if (mres->num_locks == DLM_MAX_MIGRATABLE_LOCKS)
944 return 1;
945 return 0;
946}
947
948
949int dlm_send_one_lockres(struct dlm_ctxt *dlm, struct dlm_lock_resource *res,
950 struct dlm_migratable_lockres *mres,
951 u8 send_to, u8 flags)
952{
953 struct list_head *queue, *iter;
954 int total_locks, i;
955 u64 mig_cookie = 0;
956 struct dlm_lock *lock;
957 int ret = 0;
958
959 BUG_ON(!(flags & (DLM_MRES_RECOVERY|DLM_MRES_MIGRATION)));
960
961 mlog(0, "sending to %u\n", send_to);
962
963 total_locks = dlm_num_locks_in_lockres(res);
964 if (total_locks > DLM_MAX_MIGRATABLE_LOCKS) {
965 /* rare, but possible */
966 mlog(0, "argh. lockres has %d locks. this will "
967 "require more than one network packet to "
968 "migrate\n", total_locks);
969 mig_cookie = dlm_get_next_mig_cookie();
970 }
971
972 dlm_init_migratable_lockres(mres, res->lockname.name,
973 res->lockname.len, total_locks,
974 mig_cookie, flags, res->owner);
975
976 total_locks = 0;
977 for (i=DLM_GRANTED_LIST; i<=DLM_BLOCKED_LIST; i++) {
978 queue = dlm_list_idx_to_ptr(res, i);
979 list_for_each(iter, queue) {
980 lock = list_entry (iter, struct dlm_lock, list);
981
982 /* add another lock. */
983 total_locks++;
984 if (!dlm_add_lock_to_array(lock, mres, i))
985 continue;
986
987 /* this filled the lock message,
988 * we must send it immediately. */
989 ret = dlm_send_mig_lockres_msg(dlm, mres, send_to,
990 res, total_locks);
991 if (ret < 0) {
992 // TODO
993 mlog(ML_ERROR, "dlm_send_mig_lockres_msg "
994 "returned %d, TODO\n", ret);
995 BUG();
996 }
997 }
998 }
999 /* flush any remaining locks */
1000 ret = dlm_send_mig_lockres_msg(dlm, mres, send_to, res, total_locks);
1001 if (ret < 0) {
1002 // TODO
1003 mlog(ML_ERROR, "dlm_send_mig_lockres_msg returned %d, "
1004 "TODO\n", ret);
1005 BUG();
1006 }
1007 return ret;
1008}
1009
1010
1011
1012/*
1013 * this message will contain no more than one page worth of
1014 * recovery data, and it will work on only one lockres.
1015 * there may be many locks in this page, and we may need to wait
1016 * for additional packets to complete all the locks (rare, but
1017 * possible).
1018 */
1019/*
1020 * NOTE: the allocation error cases here are scary
1021 * we really cannot afford to fail an alloc in recovery
1022 * do we spin? returning an error only delays the problem really
1023 */
1024
1025int dlm_mig_lockres_handler(struct o2net_msg *msg, u32 len, void *data)
1026{
1027 struct dlm_ctxt *dlm = data;
1028 struct dlm_migratable_lockres *mres =
1029 (struct dlm_migratable_lockres *)msg->buf;
1030 int ret = 0;
1031 u8 real_master;
1032 char *buf = NULL;
1033 struct dlm_work_item *item = NULL;
1034 struct dlm_lock_resource *res = NULL;
1035
1036 if (!dlm_grab(dlm))
1037 return -EINVAL;
1038
1039 BUG_ON(!(mres->flags & (DLM_MRES_RECOVERY|DLM_MRES_MIGRATION)));
1040
1041 real_master = mres->master;
1042 if (real_master == DLM_LOCK_RES_OWNER_UNKNOWN) {
1043 /* cannot migrate a lockres with no master */
1044 BUG_ON(!(mres->flags & DLM_MRES_RECOVERY));
1045 }
1046
1047 mlog(0, "%s message received from node %u\n",
1048 (mres->flags & DLM_MRES_RECOVERY) ?
1049 "recovery" : "migration", mres->master);
1050 if (mres->flags & DLM_MRES_ALL_DONE)
1051 mlog(0, "all done flag. all lockres data received!\n");
1052
1053 ret = -ENOMEM;
1054 buf = kmalloc(be16_to_cpu(msg->data_len), GFP_KERNEL);
1055 item = kcalloc(1, sizeof(*item), GFP_KERNEL);
1056 if (!buf || !item)
1057 goto leave;
1058
1059 /* lookup the lock to see if we have a secondary queue for this
1060 * already... just add the locks in and this will have its owner
1061 * and RECOVERY flag changed when it completes. */
1062 res = dlm_lookup_lockres(dlm, mres->lockname, mres->lockname_len);
1063 if (res) {
1064 /* this will get a ref on res */
1065 /* mark it as recovering/migrating and hash it */
1066 spin_lock(&res->spinlock);
1067 if (mres->flags & DLM_MRES_RECOVERY) {
1068 res->state |= DLM_LOCK_RES_RECOVERING;
1069 } else {
1070 if (res->state & DLM_LOCK_RES_MIGRATING) {
1071 /* this is at least the second
1072 * lockres message */
1073 mlog(0, "lock %.*s is already migrating\n",
1074 mres->lockname_len,
1075 mres->lockname);
1076 } else if (res->state & DLM_LOCK_RES_RECOVERING) {
1077 /* caller should BUG */
1078 mlog(ML_ERROR, "node is attempting to migrate "
1079 "lock %.*s, but marked as recovering!\n",
1080 mres->lockname_len, mres->lockname);
1081 ret = -EFAULT;
1082 spin_unlock(&res->spinlock);
1083 goto leave;
1084 }
1085 res->state |= DLM_LOCK_RES_MIGRATING;
1086 }
1087 spin_unlock(&res->spinlock);
1088 } else {
1089 /* need to allocate, just like if it was
1090 * mastered here normally */
1091 res = dlm_new_lockres(dlm, mres->lockname, mres->lockname_len);
1092 if (!res)
1093 goto leave;
1094
1095 /* to match the ref that we would have gotten if
1096 * dlm_lookup_lockres had succeeded */
1097 dlm_lockres_get(res);
1098
1099 /* mark it as recovering/migrating and hash it */
1100 if (mres->flags & DLM_MRES_RECOVERY)
1101 res->state |= DLM_LOCK_RES_RECOVERING;
1102 else
1103 res->state |= DLM_LOCK_RES_MIGRATING;
1104
1105 spin_lock(&dlm->spinlock);
1106 __dlm_insert_lockres(dlm, res);
1107 spin_unlock(&dlm->spinlock);
1108
1109 /* now that the new lockres is inserted,
1110 * make it usable by other processes */
1111 spin_lock(&res->spinlock);
1112 res->state &= ~DLM_LOCK_RES_IN_PROGRESS;
1113 spin_unlock(&res->spinlock);
1114
1115 /* add an extra ref for just-allocated lockres
1116 * otherwise the lockres will be purged immediately */
1117 dlm_lockres_get(res);
1118
1119 }
1120
1121 /* at this point we have allocated everything we need,
1122 * and we have a hashed lockres with an extra ref and
1123 * the proper res->state flags. */
1124 ret = 0;
1125 if (mres->master == DLM_LOCK_RES_OWNER_UNKNOWN) {
1126 /* migration cannot have an unknown master */
1127 BUG_ON(!(mres->flags & DLM_MRES_RECOVERY));
1128 mlog(0, "recovery has passed me a lockres with an "
1129 "unknown owner.. will need to requery: "
1130 "%.*s\n", mres->lockname_len, mres->lockname);
1131 } else {
1132 spin_lock(&res->spinlock);
1133 dlm_change_lockres_owner(dlm, res, dlm->node_num);
1134 spin_unlock(&res->spinlock);
1135 }
1136
1137 /* queue up work for dlm_mig_lockres_worker */
1138 dlm_grab(dlm); /* get an extra ref for the work item */
1139 memcpy(buf, msg->buf, be16_to_cpu(msg->data_len)); /* copy the whole message */
1140 dlm_init_work_item(dlm, item, dlm_mig_lockres_worker, buf);
1141 item->u.ml.lockres = res; /* already have a ref */
1142 item->u.ml.real_master = real_master;
1143 spin_lock(&dlm->work_lock);
1144 list_add_tail(&item->list, &dlm->work_list);
1145 spin_unlock(&dlm->work_lock);
1146 schedule_work(&dlm->dispatched_work);
1147
1148leave:
1149 dlm_put(dlm);
1150 if (ret < 0) {
1151 if (buf)
1152 kfree(buf);
1153 if (item)
1154 kfree(item);
1155 }
1156
1157 mlog_exit(ret);
1158 return ret;
1159}
1160
1161
1162static void dlm_mig_lockres_worker(struct dlm_work_item *item, void *data)
1163{
1164 struct dlm_ctxt *dlm = data;
1165 struct dlm_migratable_lockres *mres;
1166 int ret = 0;
1167 struct dlm_lock_resource *res;
1168 u8 real_master;
1169
1170 dlm = item->dlm;
1171 mres = (struct dlm_migratable_lockres *)data;
1172
1173 res = item->u.ml.lockres;
1174 real_master = item->u.ml.real_master;
1175
1176 if (real_master == DLM_LOCK_RES_OWNER_UNKNOWN) {
1177 /* this case is super-rare. only occurs if
1178 * node death happens during migration. */
1179again:
1180 ret = dlm_lockres_master_requery(dlm, res, &real_master);
1181 if (ret < 0) {
1182 mlog(0, "dlm_lockres_master_requery failure: %d\n",
1183 ret);
1184 goto again;
1185 }
1186 if (real_master == DLM_LOCK_RES_OWNER_UNKNOWN) {
1187 mlog(0, "lockres %.*s not claimed. "
1188 "this node will take it.\n",
1189 res->lockname.len, res->lockname.name);
1190 } else {
1191 mlog(0, "master needs to respond to sender "
1192 "that node %u still owns %.*s\n",
1193 real_master, res->lockname.len,
1194 res->lockname.name);
1195 /* cannot touch this lockres */
1196 goto leave;
1197 }
1198 }
1199
1200 ret = dlm_process_recovery_data(dlm, res, mres);
1201 if (ret < 0)
1202 mlog(0, "dlm_process_recovery_data returned %d\n", ret);
1203 else
1204 mlog(0, "dlm_process_recovery_data succeeded\n");
1205
1206 if ((mres->flags & (DLM_MRES_MIGRATION|DLM_MRES_ALL_DONE)) ==
1207 (DLM_MRES_MIGRATION|DLM_MRES_ALL_DONE)) {
1208 ret = dlm_finish_migration(dlm, res, mres->master);
1209 if (ret < 0)
1210 mlog_errno(ret);
1211 }
1212
1213leave:
1214 kfree(data);
1215 mlog_exit(ret);
1216}
1217
1218
1219
1220static int dlm_lockres_master_requery(struct dlm_ctxt *dlm,
1221 struct dlm_lock_resource *res,
1222 u8 *real_master)
1223{
1224 struct dlm_node_iter iter;
1225 int nodenum;
1226 int ret = 0;
1227
1228 *real_master = DLM_LOCK_RES_OWNER_UNKNOWN;
1229
1230 /* we only reach here if one of the two nodes in a
1231 * migration died while the migration was in progress.
1232 * at this point we need to requery the master. we
1233 * know that the new_master got as far as creating
1234 * an mle on at least one node, but we do not know
1235 * if any nodes had actually cleared the mle and set
1236 * the master to the new_master. the old master
1237 * is supposed to set the owner to UNKNOWN in the
1238 * event of a new_master death, so the only possible
1239 * responses that we can get from nodes here are
1240 * that the master is new_master, or that the master
1241 * is UNKNOWN.
1242 * if all nodes come back with UNKNOWN then we know
1243 * the lock needs remastering here.
1244 * if any node comes back with a valid master, check
1245 * to see if that master is the one that we are
1246 * recovering. if so, then the new_master died and
1247 * we need to remaster this lock. if not, then the
1248 * new_master survived and that node will respond to
1249 * other nodes about the owner.
1250 * if there is an owner, this node needs to dump this
1251 * lockres and alert the sender that this lockres
1252 * was rejected. */
1253 spin_lock(&dlm->spinlock);
1254 dlm_node_iter_init(dlm->domain_map, &iter);
1255 spin_unlock(&dlm->spinlock);
1256
1257 while ((nodenum = dlm_node_iter_next(&iter)) >= 0) {
1258 /* do not send to self */
1259 if (nodenum == dlm->node_num)
1260 continue;
1261 ret = dlm_do_master_requery(dlm, res, nodenum, real_master);
1262 if (ret < 0) {
1263 mlog_errno(ret);
1264 BUG();
1265 /* TODO: need to figure a way to restart this */
1266 }
1267 if (*real_master != DLM_LOCK_RES_OWNER_UNKNOWN) {
1268 mlog(0, "lock master is %u\n", *real_master);
1269 break;
1270 }
1271 }
1272 return ret;
1273}
1274
1275
1276static int dlm_do_master_requery(struct dlm_ctxt *dlm,
1277 struct dlm_lock_resource *res,
1278 u8 nodenum, u8 *real_master)
1279{
1280 int ret = -EINVAL;
1281 struct dlm_master_requery req;
1282 int status = DLM_LOCK_RES_OWNER_UNKNOWN;
1283
1284 memset(&req, 0, sizeof(req));
1285 req.node_idx = dlm->node_num;
1286 req.namelen = res->lockname.len;
1287 memcpy(req.name, res->lockname.name, res->lockname.len);
1288
1289 ret = o2net_send_message(DLM_MASTER_REQUERY_MSG, dlm->key,
1290 &req, sizeof(req), nodenum, &status);
1291 /* XXX: negative status not handled properly here. */
1292 if (ret < 0)
1293 mlog_errno(ret);
1294 else {
1295 BUG_ON(status < 0);
1296 BUG_ON(status > DLM_LOCK_RES_OWNER_UNKNOWN);
1297 *real_master = (u8) (status & 0xff);
1298 mlog(0, "node %u responded to master requery with %u\n",
1299 nodenum, *real_master);
1300 ret = 0;
1301 }
1302 return ret;
1303}
1304
1305
1306/* this function cannot error, so unless the sending
1307 * or receiving of the message failed, the owner can
1308 * be trusted */
1309int dlm_master_requery_handler(struct o2net_msg *msg, u32 len, void *data)
1310{
1311 struct dlm_ctxt *dlm = data;
1312 struct dlm_master_requery *req = (struct dlm_master_requery *)msg->buf;
1313 struct dlm_lock_resource *res = NULL;
1314 int master = DLM_LOCK_RES_OWNER_UNKNOWN;
1315 u32 flags = DLM_ASSERT_MASTER_REQUERY;
1316
1317 if (!dlm_grab(dlm)) {
1318 /* since the domain has gone away on this
1319 * node, the proper response is UNKNOWN */
1320 return master;
1321 }
1322
1323 spin_lock(&dlm->spinlock);
1324 res = __dlm_lookup_lockres(dlm, req->name, req->namelen);
1325 if (res) {
1326 spin_lock(&res->spinlock);
1327 master = res->owner;
1328 if (master == dlm->node_num) {
1329 int ret = dlm_dispatch_assert_master(dlm, res,
1330 0, 0, flags);
1331 if (ret < 0) {
1332 mlog_errno(-ENOMEM);
1333 /* retry!? */
1334 BUG();
1335 }
1336 }
1337 spin_unlock(&res->spinlock);
1338 }
1339 spin_unlock(&dlm->spinlock);
1340
1341 dlm_put(dlm);
1342 return master;
1343}
1344
1345static inline struct list_head *
1346dlm_list_num_to_pointer(struct dlm_lock_resource *res, int list_num)
1347{
1348 struct list_head *ret;
1349 BUG_ON(list_num < 0);
1350 BUG_ON(list_num > 2);
1351 ret = &(res->granted);
1352 ret += list_num;
1353 return ret;
1354}
1355/* TODO: do ast flush business
1356 * TODO: do MIGRATING and RECOVERING spinning
1357 */
1358
1359/*
1360* NOTE about in-flight requests during migration:
1361*
1362* Before attempting the migrate, the master has marked the lockres as
1363* MIGRATING and then flushed all of its pending ASTS. So any in-flight
1364* requests either got queued before the MIGRATING flag got set, in which
1365* case the lock data will reflect the change and a return message is on
1366* the way, or the request failed to get in before MIGRATING got set. In
1367* this case, the caller will be told to spin and wait for the MIGRATING
1368* flag to be dropped, then recheck the master.
1369* This holds true for the convert, cancel and unlock cases, and since lvb
1370* updates are tied to these same messages, it applies to lvb updates as
1371* well. For the lock case, there is no way a lock can be on the master
1372* queue and not be on the secondary queue since the lock is always added
1373* locally first. This means that the new target node will never be sent
1374* a lock that he doesn't already have on the list.
1375* In total, this means that the local lock is correct and should not be
1376* updated to match the one sent by the master. Any messages sent back
1377* from the master before the MIGRATING flag will bring the lock properly
1378* up-to-date, and the change will be ordered properly for the waiter.
1379* We will *not* attempt to modify the lock underneath the waiter.
1380*/
1381
1382static int dlm_process_recovery_data(struct dlm_ctxt *dlm,
1383 struct dlm_lock_resource *res,
1384 struct dlm_migratable_lockres *mres)
1385{
1386 struct dlm_migratable_lock *ml;
1387 struct list_head *queue;
1388 struct dlm_lock *newlock = NULL;
1389 struct dlm_lockstatus *lksb = NULL;
1390 int ret = 0;
1391 int i;
1392 struct list_head *iter;
1393 struct dlm_lock *lock = NULL;
1394
1395 mlog(0, "running %d locks for this lockres\n", mres->num_locks);
1396 for (i=0; i<mres->num_locks; i++) {
1397 ml = &(mres->ml[i]);
1398 BUG_ON(ml->highest_blocked != LKM_IVMODE);
1399 newlock = NULL;
1400 lksb = NULL;
1401
1402 queue = dlm_list_num_to_pointer(res, ml->list);
1403
1404 /* if the lock is for the local node it needs to
1405 * be moved to the proper location within the queue.
1406 * do not allocate a new lock structure. */
1407 if (ml->node == dlm->node_num) {
1408 /* MIGRATION ONLY! */
1409 BUG_ON(!(mres->flags & DLM_MRES_MIGRATION));
1410
1411 spin_lock(&res->spinlock);
1412 list_for_each(iter, queue) {
1413 lock = list_entry (iter, struct dlm_lock, list);
1414 if (lock->ml.cookie != ml->cookie)
1415 lock = NULL;
1416 else
1417 break;
1418 }
1419
1420 /* lock is always created locally first, and
1421 * destroyed locally last. it must be on the list */
1422 if (!lock) {
1423 mlog(ML_ERROR, "could not find local lock "
1424 "with cookie %"MLFu64"!\n",
1425 ml->cookie);
1426 BUG();
1427 }
1428 BUG_ON(lock->ml.node != ml->node);
1429
1430 /* see NOTE above about why we do not update
1431 * to match the master here */
1432
1433 /* move the lock to its proper place */
1434 /* do not alter lock refcount. switching lists. */
1435 list_del_init(&lock->list);
1436 list_add_tail(&lock->list, queue);
1437 spin_unlock(&res->spinlock);
1438
1439 mlog(0, "just reordered a local lock!\n");
1440 continue;
1441 }
1442
1443 /* lock is for another node. */
1444 newlock = dlm_new_lock(ml->type, ml->node,
1445 be64_to_cpu(ml->cookie), NULL);
1446 if (!newlock) {
1447 ret = -ENOMEM;
1448 goto leave;
1449 }
1450 lksb = newlock->lksb;
1451 dlm_lock_attach_lockres(newlock, res);
1452
1453 if (ml->convert_type != LKM_IVMODE) {
1454 BUG_ON(queue != &res->converting);
1455 newlock->ml.convert_type = ml->convert_type;
1456 }
1457 lksb->flags |= (ml->flags &
1458 (DLM_LKSB_PUT_LVB|DLM_LKSB_GET_LVB));
1459
1460 if (mres->lvb[0]) {
1461 if (lksb->flags & DLM_LKSB_PUT_LVB) {
1462 /* other node was trying to update
1463 * lvb when node died. recreate the
1464 * lksb with the updated lvb. */
1465 memcpy(lksb->lvb, mres->lvb, DLM_LVB_LEN);
1466 } else {
1467 /* otherwise, the node is sending its
1468 * most recent valid lvb info */
1469 BUG_ON(ml->type != LKM_EXMODE &&
1470 ml->type != LKM_PRMODE);
1471 if (res->lvb[0] && (ml->type == LKM_EXMODE ||
1472 memcmp(res->lvb, mres->lvb, DLM_LVB_LEN))) {
1473 mlog(ML_ERROR, "received bad lvb!\n");
1474 __dlm_print_one_lock_resource(res);
1475 BUG();
1476 }
1477 memcpy(res->lvb, mres->lvb, DLM_LVB_LEN);
1478 }
1479 }
1480
1481
1482 /* NOTE:
1483 * wrt lock queue ordering and recovery:
1484 * 1. order of locks on granted queue is
1485 * meaningless.
1486 * 2. order of locks on converting queue is
1487 * LOST with the node death. sorry charlie.
1488 * 3. order of locks on the blocked queue is
1489 * also LOST.
1490 * order of locks does not affect integrity, it
1491 * just means that a lock request may get pushed
1492 * back in line as a result of the node death.
1493 * also note that for a given node the lock order
1494 * for its secondary queue locks is preserved
1495 * relative to each other, but clearly *not*
1496 * preserved relative to locks from other nodes.
1497 */
1498 spin_lock(&res->spinlock);
1499 dlm_lock_get(newlock);
1500 list_add_tail(&newlock->list, queue);
1501 spin_unlock(&res->spinlock);
1502 }
1503 mlog(0, "done running all the locks\n");
1504
1505leave:
1506 if (ret < 0) {
1507 mlog_errno(ret);
1508 if (newlock)
1509 dlm_lock_put(newlock);
1510 }
1511
1512 mlog_exit(ret);
1513 return ret;
1514}
1515
1516void dlm_move_lockres_to_recovery_list(struct dlm_ctxt *dlm,
1517 struct dlm_lock_resource *res)
1518{
1519 int i;
1520 struct list_head *queue, *iter, *iter2;
1521 struct dlm_lock *lock;
1522
1523 res->state |= DLM_LOCK_RES_RECOVERING;
1524 if (!list_empty(&res->recovering))
1525 list_del_init(&res->recovering);
1526 list_add_tail(&res->recovering, &dlm->reco.resources);
1527
1528 /* find any pending locks and put them back on proper list */
1529 for (i=DLM_BLOCKED_LIST; i>=DLM_GRANTED_LIST; i--) {
1530 queue = dlm_list_idx_to_ptr(res, i);
1531 list_for_each_safe(iter, iter2, queue) {
1532 lock = list_entry (iter, struct dlm_lock, list);
1533 dlm_lock_get(lock);
1534 if (lock->convert_pending) {
1535 /* move converting lock back to granted */
1536 BUG_ON(i != DLM_CONVERTING_LIST);
1537 mlog(0, "node died with convert pending "
1538 "on %.*s. move back to granted list.\n",
1539 res->lockname.len, res->lockname.name);
1540 dlm_revert_pending_convert(res, lock);
1541 lock->convert_pending = 0;
1542 } else if (lock->lock_pending) {
1543 /* remove pending lock requests completely */
1544 BUG_ON(i != DLM_BLOCKED_LIST);
1545 mlog(0, "node died with lock pending "
1546 "on %.*s. remove from blocked list and skip.\n",
1547 res->lockname.len, res->lockname.name);
1548 /* lock will be floating until ref in
1549 * dlmlock_remote is freed after the network
1550 * call returns. ok for it to not be on any
1551 * list since no ast can be called
1552 * (the master is dead). */
1553 dlm_revert_pending_lock(res, lock);
1554 lock->lock_pending = 0;
1555 } else if (lock->unlock_pending) {
1556 /* if an unlock was in progress, treat as
1557 * if this had completed successfully
1558 * before sending this lock state to the
1559 * new master. note that the dlm_unlock
1560 * call is still responsible for calling
1561 * the unlockast. that will happen after
1562 * the network call times out. for now,
1563 * just move lists to prepare the new
1564 * recovery master. */
1565 BUG_ON(i != DLM_GRANTED_LIST);
1566 mlog(0, "node died with unlock pending "
1567 "on %.*s. remove from blocked list and skip.\n",
1568 res->lockname.len, res->lockname.name);
1569 dlm_commit_pending_unlock(res, lock);
1570 lock->unlock_pending = 0;
1571 } else if (lock->cancel_pending) {
1572 /* if a cancel was in progress, treat as
1573 * if this had completed successfully
1574 * before sending this lock state to the
1575 * new master */
1576 BUG_ON(i != DLM_CONVERTING_LIST);
1577 mlog(0, "node died with cancel pending "
1578 "on %.*s. move back to granted list.\n",
1579 res->lockname.len, res->lockname.name);
1580 dlm_commit_pending_cancel(res, lock);
1581 lock->cancel_pending = 0;
1582 }
1583 dlm_lock_put(lock);
1584 }
1585 }
1586}
1587
1588
1589
1590/* removes all recovered locks from the recovery list.
1591 * sets the res->owner to the new master.
1592 * unsets the RECOVERY flag and wakes waiters. */
1593static void dlm_finish_local_lockres_recovery(struct dlm_ctxt *dlm,
1594 u8 dead_node, u8 new_master)
1595{
1596 int i;
1597 struct list_head *iter, *iter2, *bucket;
1598 struct dlm_lock_resource *res;
1599
1600 mlog_entry_void();
1601
1602 assert_spin_locked(&dlm->spinlock);
1603
1604 list_for_each_safe(iter, iter2, &dlm->reco.resources) {
1605 res = list_entry (iter, struct dlm_lock_resource, recovering);
1606 if (res->owner == dead_node) {
1607 list_del_init(&res->recovering);
1608 spin_lock(&res->spinlock);
1609 dlm_change_lockres_owner(dlm, res, new_master);
1610 res->state &= ~DLM_LOCK_RES_RECOVERING;
1611 __dlm_dirty_lockres(dlm, res);
1612 spin_unlock(&res->spinlock);
1613 wake_up(&res->wq);
1614 }
1615 }
1616
1617 /* this will become unnecessary eventually, but
1618 * for now we need to run the whole hash, clear
1619 * the RECOVERING state and set the owner
1620 * if necessary */
1621 for (i=0; i<DLM_HASH_SIZE; i++) {
1622 bucket = &(dlm->resources[i]);
1623 list_for_each(iter, bucket) {
1624 res = list_entry (iter, struct dlm_lock_resource, list);
1625 if (res->state & DLM_LOCK_RES_RECOVERING) {
1626 if (res->owner == dead_node) {
1627 mlog(0, "(this=%u) res %.*s owner=%u "
1628 "was not on recovering list, but "
1629 "clearing state anyway\n",
1630 dlm->node_num, res->lockname.len,
1631 res->lockname.name, new_master);
1632 } else if (res->owner == dlm->node_num) {
1633 mlog(0, "(this=%u) res %.*s owner=%u "
1634 "was not on recovering list, "
1635 "owner is THIS node, clearing\n",
1636 dlm->node_num, res->lockname.len,
1637 res->lockname.name, new_master);
1638 } else
1639 continue;
1640
1641 spin_lock(&res->spinlock);
1642 dlm_change_lockres_owner(dlm, res, new_master);
1643 res->state &= ~DLM_LOCK_RES_RECOVERING;
1644 __dlm_dirty_lockres(dlm, res);
1645 spin_unlock(&res->spinlock);
1646 wake_up(&res->wq);
1647 }
1648 }
1649 }
1650}
1651
1652static inline int dlm_lvb_needs_invalidation(struct dlm_lock *lock, int local)
1653{
1654 if (local) {
1655 if (lock->ml.type != LKM_EXMODE &&
1656 lock->ml.type != LKM_PRMODE)
1657 return 1;
1658 } else if (lock->ml.type == LKM_EXMODE)
1659 return 1;
1660 return 0;
1661}
1662
1663static void dlm_revalidate_lvb(struct dlm_ctxt *dlm,
1664 struct dlm_lock_resource *res, u8 dead_node)
1665{
1666 struct list_head *iter, *queue;
1667 struct dlm_lock *lock;
1668 int blank_lvb = 0, local = 0;
1669 int i;
1670 u8 search_node;
1671
1672 assert_spin_locked(&dlm->spinlock);
1673 assert_spin_locked(&res->spinlock);
1674
1675 if (res->owner == dlm->node_num)
1676 /* if this node owned the lockres, and if the dead node
1677 * had an EX when he died, blank out the lvb */
1678 search_node = dead_node;
1679 else {
1680 /* if this is a secondary lockres, and we had no EX or PR
1681 * locks granted, we can no longer trust the lvb */
1682 search_node = dlm->node_num;
1683 local = 1; /* check local state for valid lvb */
1684 }
1685
1686 for (i=DLM_GRANTED_LIST; i<=DLM_CONVERTING_LIST; i++) {
1687 queue = dlm_list_idx_to_ptr(res, i);
1688 list_for_each(iter, queue) {
1689 lock = list_entry (iter, struct dlm_lock, list);
1690 if (lock->ml.node == search_node) {
1691 if (dlm_lvb_needs_invalidation(lock, local)) {
1692 /* zero the lksb lvb and lockres lvb */
1693 blank_lvb = 1;
1694 memset(lock->lksb->lvb, 0, DLM_LVB_LEN);
1695 }
1696 }
1697 }
1698 }
1699
1700 if (blank_lvb) {
1701 mlog(0, "clearing %.*s lvb, dead node %u had EX\n",
1702 res->lockname.len, res->lockname.name, dead_node);
1703 memset(res->lvb, 0, DLM_LVB_LEN);
1704 }
1705}
1706
1707static void dlm_free_dead_locks(struct dlm_ctxt *dlm,
1708 struct dlm_lock_resource *res, u8 dead_node)
1709{
1710 struct list_head *iter, *tmpiter;
1711 struct dlm_lock *lock;
1712
1713 /* this node is the lockres master:
1714 * 1) remove any stale locks for the dead node
1715 * 2) if the dead node had an EX when he died, blank out the lvb
1716 */
1717 assert_spin_locked(&dlm->spinlock);
1718 assert_spin_locked(&res->spinlock);
1719
1720 /* TODO: check pending_asts, pending_basts here */
1721 list_for_each_safe(iter, tmpiter, &res->granted) {
1722 lock = list_entry (iter, struct dlm_lock, list);
1723 if (lock->ml.node == dead_node) {
1724 list_del_init(&lock->list);
1725 dlm_lock_put(lock);
1726 }
1727 }
1728 list_for_each_safe(iter, tmpiter, &res->converting) {
1729 lock = list_entry (iter, struct dlm_lock, list);
1730 if (lock->ml.node == dead_node) {
1731 list_del_init(&lock->list);
1732 dlm_lock_put(lock);
1733 }
1734 }
1735 list_for_each_safe(iter, tmpiter, &res->blocked) {
1736 lock = list_entry (iter, struct dlm_lock, list);
1737 if (lock->ml.node == dead_node) {
1738 list_del_init(&lock->list);
1739 dlm_lock_put(lock);
1740 }
1741 }
1742
1743 /* do not kick thread yet */
1744 __dlm_dirty_lockres(dlm, res);
1745}
1746
1747/* if this node is the recovery master, and there are no
1748 * locks for a given lockres owned by this node that are in
1749 * either PR or EX mode, zero out the lvb before requesting.
1750 *
1751 */
1752
1753
1754static void dlm_do_local_recovery_cleanup(struct dlm_ctxt *dlm, u8 dead_node)
1755{
1756 struct list_head *iter;
1757 struct dlm_lock_resource *res;
1758 int i;
1759 struct list_head *bucket;
1760
1761
1762 /* purge any stale mles */
1763 dlm_clean_master_list(dlm, dead_node);
1764
1765 /*
1766 * now clean up all lock resources. there are two rules:
1767 *
1768 * 1) if the dead node was the master, move the lockres
1769 * to the recovering list. set the RECOVERING flag.
1770 * this lockres needs to be cleaned up before it can
1771 * be used further.
1772 *
1773 * 2) if this node was the master, remove all locks from
1774 * each of the lockres queues that were owned by the
1775 * dead node. once recovery finishes, the dlm thread
1776 * can be kicked again to see if any ASTs or BASTs
1777 * need to be fired as a result.
1778 */
1779 for (i=0; i<DLM_HASH_SIZE; i++) {
1780 bucket = &(dlm->resources[i]);
1781 list_for_each(iter, bucket) {
1782 res = list_entry (iter, struct dlm_lock_resource, list);
1783 if (dlm_is_recovery_lock(res->lockname.name,
1784 res->lockname.len))
1785 continue;
1786
1787 spin_lock(&res->spinlock);
1788 /* zero the lvb if necessary */
1789 dlm_revalidate_lvb(dlm, res, dead_node);
1790 if (res->owner == dead_node)
1791 dlm_move_lockres_to_recovery_list(dlm, res);
1792 else if (res->owner == dlm->node_num) {
1793 dlm_free_dead_locks(dlm, res, dead_node);
1794 __dlm_lockres_calc_usage(dlm, res);
1795 }
1796 spin_unlock(&res->spinlock);
1797 }
1798 }
1799
1800}
1801
1802static void __dlm_hb_node_down(struct dlm_ctxt *dlm, int idx)
1803{
1804 assert_spin_locked(&dlm->spinlock);
1805
1806 /* check to see if the node is already considered dead */
1807 if (!test_bit(idx, dlm->live_nodes_map)) {
1808 mlog(0, "for domain %s, node %d is already dead. "
1809 "another node likely did recovery already.\n",
1810 dlm->name, idx);
1811 return;
1812 }
1813
1814 /* check to see if we do not care about this node */
1815 if (!test_bit(idx, dlm->domain_map)) {
1816 /* This also catches the case that we get a node down
1817 * but haven't joined the domain yet. */
1818 mlog(0, "node %u already removed from domain!\n", idx);
1819 return;
1820 }
1821
1822 clear_bit(idx, dlm->live_nodes_map);
1823
1824 /* Clean up join state on node death. */
1825 if (dlm->joining_node == idx) {
1826 mlog(0, "Clearing join state for node %u\n", idx);
1827 __dlm_set_joining_node(dlm, DLM_LOCK_RES_OWNER_UNKNOWN);
1828 }
1829
1830 /* make sure local cleanup occurs before the heartbeat events */
1831 if (!test_bit(idx, dlm->recovery_map))
1832 dlm_do_local_recovery_cleanup(dlm, idx);
1833
1834 /* notify anything attached to the heartbeat events */
1835 dlm_hb_event_notify_attached(dlm, idx, 0);
1836
1837 mlog(0, "node %u being removed from domain map!\n", idx);
1838 clear_bit(idx, dlm->domain_map);
1839 /* wake up migration waiters if a node goes down.
1840 * perhaps later we can genericize this for other waiters. */
1841 wake_up(&dlm->migration_wq);
1842
1843 if (test_bit(idx, dlm->recovery_map))
1844 mlog(0, "domain %s, node %u already added "
1845 "to recovery map!\n", dlm->name, idx);
1846 else
1847 set_bit(idx, dlm->recovery_map);
1848}
1849
1850void dlm_hb_node_down_cb(struct o2nm_node *node, int idx, void *data)
1851{
1852 struct dlm_ctxt *dlm = data;
1853
1854 if (!dlm_grab(dlm))
1855 return;
1856
1857 spin_lock(&dlm->spinlock);
1858 __dlm_hb_node_down(dlm, idx);
1859 spin_unlock(&dlm->spinlock);
1860
1861 dlm_put(dlm);
1862}
1863
1864void dlm_hb_node_up_cb(struct o2nm_node *node, int idx, void *data)
1865{
1866 struct dlm_ctxt *dlm = data;
1867
1868 if (!dlm_grab(dlm))
1869 return;
1870
1871 spin_lock(&dlm->spinlock);
1872
1873 set_bit(idx, dlm->live_nodes_map);
1874
1875 /* notify any mles attached to the heartbeat events */
1876 dlm_hb_event_notify_attached(dlm, idx, 1);
1877
1878 spin_unlock(&dlm->spinlock);
1879
1880 dlm_put(dlm);
1881}
1882
1883static void dlm_reco_ast(void *astdata)
1884{
1885 struct dlm_ctxt *dlm = astdata;
1886 mlog(0, "ast for recovery lock fired!, this=%u, dlm=%s\n",
1887 dlm->node_num, dlm->name);
1888}
1889static void dlm_reco_bast(void *astdata, int blocked_type)
1890{
1891 struct dlm_ctxt *dlm = astdata;
1892 mlog(0, "bast for recovery lock fired!, this=%u, dlm=%s\n",
1893 dlm->node_num, dlm->name);
1894}
1895static void dlm_reco_unlock_ast(void *astdata, enum dlm_status st)
1896{
1897 mlog(0, "unlockast for recovery lock fired!\n");
1898}
1899
1900
1901static int dlm_pick_recovery_master(struct dlm_ctxt *dlm)
1902{
1903 enum dlm_status ret;
1904 struct dlm_lockstatus lksb;
1905 int status = -EINVAL;
1906
1907 mlog(0, "starting recovery of %s at %lu, dead=%u, this=%u\n",
1908 dlm->name, jiffies, dlm->reco.dead_node, dlm->node_num);
1909retry:
1910 memset(&lksb, 0, sizeof(lksb));
1911
1912 ret = dlmlock(dlm, LKM_EXMODE, &lksb, LKM_NOQUEUE|LKM_RECOVERY,
1913 DLM_RECOVERY_LOCK_NAME, dlm_reco_ast, dlm, dlm_reco_bast);
1914
1915 if (ret == DLM_NORMAL) {
1916 mlog(0, "dlm=%s dlmlock says I got it (this=%u)\n",
1917 dlm->name, dlm->node_num);
1918 /* I am master, send message to all nodes saying
1919 * that I am beginning a recovery session */
1920 status = dlm_send_begin_reco_message(dlm,
1921 dlm->reco.dead_node);
1922
1923 /* recovery lock is a special case. ast will not get fired,
1924 * so just go ahead and unlock it. */
1925 ret = dlmunlock(dlm, &lksb, 0, dlm_reco_unlock_ast, dlm);
1926 if (ret != DLM_NORMAL) {
1927 /* this would really suck. this could only happen
1928 * if there was a network error during the unlock
1929 * because of node death. this means the unlock
1930 * is actually "done" and the lock structure is
1931 * even freed. we can continue, but only
1932 * because this specific lock name is special. */
1933 mlog(0, "dlmunlock returned %d\n", ret);
1934 }
1935
1936 if (status < 0) {
1937 mlog(0, "failed to send recovery message. "
1938 "must retry with new node map.\n");
1939 goto retry;
1940 }
1941 } else if (ret == DLM_NOTQUEUED) {
1942 mlog(0, "dlm=%s dlmlock says another node got it (this=%u)\n",
1943 dlm->name, dlm->node_num);
1944 /* another node is master. wait on
1945 * reco.new_master != O2NM_INVALID_NODE_NUM */
1946 status = -EEXIST;
1947 }
1948
1949 return status;
1950}
1951
1952static int dlm_send_begin_reco_message(struct dlm_ctxt *dlm, u8 dead_node)
1953{
1954 struct dlm_begin_reco br;
1955 int ret = 0;
1956 struct dlm_node_iter iter;
1957 int nodenum;
1958 int status;
1959
1960 mlog_entry("%u\n", dead_node);
1961
1962 mlog(0, "dead node is %u\n", dead_node);
1963
1964 spin_lock(&dlm->spinlock);
1965 dlm_node_iter_init(dlm->domain_map, &iter);
1966 spin_unlock(&dlm->spinlock);
1967
1968 clear_bit(dead_node, iter.node_map);
1969
1970 memset(&br, 0, sizeof(br));
1971 br.node_idx = dlm->node_num;
1972 br.dead_node = dead_node;
1973
1974 while ((nodenum = dlm_node_iter_next(&iter)) >= 0) {
1975 ret = 0;
1976 if (nodenum == dead_node) {
1977 mlog(0, "not sending begin reco to dead node "
1978 "%u\n", dead_node);
1979 continue;
1980 }
1981 if (nodenum == dlm->node_num) {
1982 mlog(0, "not sending begin reco to self\n");
1983 continue;
1984 }
1985
1986 ret = -EINVAL;
1987 mlog(0, "attempting to send begin reco msg to %d\n",
1988 nodenum);
1989 ret = o2net_send_message(DLM_BEGIN_RECO_MSG, dlm->key,
1990 &br, sizeof(br), nodenum, &status);
1991 /* negative status is handled ok by caller here */
1992 if (ret >= 0)
1993 ret = status;
1994 if (ret < 0) {
1995 struct dlm_lock_resource *res;
1996 mlog_errno(ret);
1997 mlog(ML_ERROR, "begin reco of dlm %s to node %u "
1998 " returned %d\n", dlm->name, nodenum, ret);
1999 res = dlm_lookup_lockres(dlm, DLM_RECOVERY_LOCK_NAME,
2000 DLM_RECOVERY_LOCK_NAME_LEN);
2001 if (res) {
2002 dlm_print_one_lock_resource(res);
2003 dlm_lockres_put(res);
2004 } else {
2005 mlog(ML_ERROR, "recovery lock not found\n");
2006 }
2007 break;
2008 }
2009 }
2010
2011 return ret;
2012}
2013
2014int dlm_begin_reco_handler(struct o2net_msg *msg, u32 len, void *data)
2015{
2016 struct dlm_ctxt *dlm = data;
2017 struct dlm_begin_reco *br = (struct dlm_begin_reco *)msg->buf;
2018
2019 /* ok to return 0, domain has gone away */
2020 if (!dlm_grab(dlm))
2021 return 0;
2022
2023 mlog(0, "node %u wants to recover node %u\n",
2024 br->node_idx, br->dead_node);
2025
2026 dlm_fire_domain_eviction_callbacks(dlm, br->dead_node);
2027
2028 spin_lock(&dlm->spinlock);
2029 if (dlm->reco.new_master != O2NM_INVALID_NODE_NUM) {
2030 mlog(0, "new_master already set to %u!\n",
2031 dlm->reco.new_master);
2032 }
2033 if (dlm->reco.dead_node != O2NM_INVALID_NODE_NUM) {
2034 mlog(0, "dead_node already set to %u!\n",
2035 dlm->reco.dead_node);
2036 }
2037 dlm->reco.new_master = br->node_idx;
2038 dlm->reco.dead_node = br->dead_node;
2039 if (!test_bit(br->dead_node, dlm->recovery_map)) {
2040 mlog(ML_ERROR, "recovery master %u sees %u as dead, but this "
2041 "node has not yet. marking %u as dead\n",
2042 br->node_idx, br->dead_node, br->dead_node);
2043 __dlm_hb_node_down(dlm, br->dead_node);
2044 }
2045 spin_unlock(&dlm->spinlock);
2046
2047 dlm_kick_recovery_thread(dlm);
2048 dlm_put(dlm);
2049 return 0;
2050}
2051
2052static int dlm_send_finalize_reco_message(struct dlm_ctxt *dlm)
2053{
2054 int ret = 0;
2055 struct dlm_finalize_reco fr;
2056 struct dlm_node_iter iter;
2057 int nodenum;
2058 int status;
2059
2060 mlog(0, "finishing recovery for node %s:%u\n",
2061 dlm->name, dlm->reco.dead_node);
2062
2063 spin_lock(&dlm->spinlock);
2064 dlm_node_iter_init(dlm->domain_map, &iter);
2065 spin_unlock(&dlm->spinlock);
2066
2067 memset(&fr, 0, sizeof(fr));
2068 fr.node_idx = dlm->node_num;
2069 fr.dead_node = dlm->reco.dead_node;
2070
2071 while ((nodenum = dlm_node_iter_next(&iter)) >= 0) {
2072 if (nodenum == dlm->node_num)
2073 continue;
2074 ret = o2net_send_message(DLM_FINALIZE_RECO_MSG, dlm->key,
2075 &fr, sizeof(fr), nodenum, &status);
2076 if (ret >= 0) {
2077 ret = status;
2078 if (dlm_is_host_down(ret)) {
2079 /* this has no effect on this recovery
2080 * session, so set the status to zero to
2081 * finish out the last recovery */
2082 mlog(ML_ERROR, "node %u went down after this "
2083 "node finished recovery.\n", nodenum);
2084 ret = 0;
2085 }
2086 }
2087 if (ret < 0) {
2088 mlog_errno(ret);
2089 break;
2090 }
2091 }
2092
2093 return ret;
2094}
2095
2096int dlm_finalize_reco_handler(struct o2net_msg *msg, u32 len, void *data)
2097{
2098 struct dlm_ctxt *dlm = data;
2099 struct dlm_finalize_reco *fr = (struct dlm_finalize_reco *)msg->buf;
2100
2101 /* ok to return 0, domain has gone away */
2102 if (!dlm_grab(dlm))
2103 return 0;
2104
2105 mlog(0, "node %u finalizing recovery of node %u\n",
2106 fr->node_idx, fr->dead_node);
2107
2108 spin_lock(&dlm->spinlock);
2109
2110 if (dlm->reco.new_master != fr->node_idx) {
2111 mlog(ML_ERROR, "node %u sent recovery finalize msg, but node "
2112 "%u is supposed to be the new master, dead=%u\n",
2113 fr->node_idx, dlm->reco.new_master, fr->dead_node);
2114 BUG();
2115 }
2116 if (dlm->reco.dead_node != fr->dead_node) {
2117 mlog(ML_ERROR, "node %u sent recovery finalize msg for dead "
2118 "node %u, but node %u is supposed to be dead\n",
2119 fr->node_idx, fr->dead_node, dlm->reco.dead_node);
2120 BUG();
2121 }
2122
2123 dlm_finish_local_lockres_recovery(dlm, fr->dead_node, fr->node_idx);
2124
2125 spin_unlock(&dlm->spinlock);
2126
2127 dlm_reset_recovery(dlm);
2128
2129 dlm_kick_recovery_thread(dlm);
2130 dlm_put(dlm);
2131 return 0;
2132}
diff --git a/fs/ocfs2/dlm/dlmthread.c b/fs/ocfs2/dlm/dlmthread.c
new file mode 100644
index 000000000000..5be9d14f12cb
--- /dev/null
+++ b/fs/ocfs2/dlm/dlmthread.c
@@ -0,0 +1,692 @@
1/* -*- mode: c; c-basic-offset: 8; -*-
2 * vim: noexpandtab sw=8 ts=8 sts=0:
3 *
4 * dlmthread.c
5 *
6 * standalone DLM module
7 *
8 * Copyright (C) 2004 Oracle. All rights reserved.
9 *
10 * This program is free software; you can redistribute it and/or
11 * modify it under the terms of the GNU General Public
12 * License as published by the Free Software Foundation; either
13 * version 2 of the License, or (at your option) any later version.
14 *
15 * This program is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
18 * General Public License for more details.
19 *
20 * You should have received a copy of the GNU General Public
21 * License along with this program; if not, write to the
22 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
23 * Boston, MA 02111-1307, USA.
24 *
25 */
26
27
28#include <linux/module.h>
29#include <linux/fs.h>
30#include <linux/types.h>
31#include <linux/slab.h>
32#include <linux/highmem.h>
33#include <linux/utsname.h>
34#include <linux/init.h>
35#include <linux/sysctl.h>
36#include <linux/random.h>
37#include <linux/blkdev.h>
38#include <linux/socket.h>
39#include <linux/inet.h>
40#include <linux/timer.h>
41#include <linux/kthread.h>
42
43
44#include "cluster/heartbeat.h"
45#include "cluster/nodemanager.h"
46#include "cluster/tcp.h"
47
48#include "dlmapi.h"
49#include "dlmcommon.h"
50#include "dlmdomain.h"
51
52#define MLOG_MASK_PREFIX (ML_DLM|ML_DLM_THREAD)
53#include "cluster/masklog.h"
54
55static int dlm_thread(void *data);
56
57static void dlm_flush_asts(struct dlm_ctxt *dlm);
58
59#define dlm_lock_is_remote(dlm, lock) ((lock)->ml.node != (dlm)->node_num)
60
61/* will exit holding res->spinlock, but may drop in function */
62/* waits until flags are cleared on res->state */
63void __dlm_wait_on_lockres_flags(struct dlm_lock_resource *res, int flags)
64{
65 DECLARE_WAITQUEUE(wait, current);
66
67 assert_spin_locked(&res->spinlock);
68
69 add_wait_queue(&res->wq, &wait);
70repeat:
71 set_current_state(TASK_UNINTERRUPTIBLE);
72 if (res->state & flags) {
73 spin_unlock(&res->spinlock);
74 schedule();
75 spin_lock(&res->spinlock);
76 goto repeat;
77 }
78 remove_wait_queue(&res->wq, &wait);
79 current->state = TASK_RUNNING;
80}
81
82
83static int __dlm_lockres_unused(struct dlm_lock_resource *res)
84{
85 if (list_empty(&res->granted) &&
86 list_empty(&res->converting) &&
87 list_empty(&res->blocked) &&
88 list_empty(&res->dirty))
89 return 1;
90 return 0;
91}
92
93
94/* Call whenever you may have added or deleted something from one of
95 * the lockres queue's. This will figure out whether it belongs on the
96 * unused list or not and does the appropriate thing. */
97void __dlm_lockres_calc_usage(struct dlm_ctxt *dlm,
98 struct dlm_lock_resource *res)
99{
100 mlog_entry("%.*s\n", res->lockname.len, res->lockname.name);
101
102 assert_spin_locked(&dlm->spinlock);
103 assert_spin_locked(&res->spinlock);
104
105 if (__dlm_lockres_unused(res)){
106 if (list_empty(&res->purge)) {
107 mlog(0, "putting lockres %.*s from purge list\n",
108 res->lockname.len, res->lockname.name);
109
110 res->last_used = jiffies;
111 list_add_tail(&res->purge, &dlm->purge_list);
112 dlm->purge_count++;
113 }
114 } else if (!list_empty(&res->purge)) {
115 mlog(0, "removing lockres %.*s from purge list\n",
116 res->lockname.len, res->lockname.name);
117
118 list_del_init(&res->purge);
119 dlm->purge_count--;
120 }
121}
122
123void dlm_lockres_calc_usage(struct dlm_ctxt *dlm,
124 struct dlm_lock_resource *res)
125{
126 mlog_entry("%.*s\n", res->lockname.len, res->lockname.name);
127 spin_lock(&dlm->spinlock);
128 spin_lock(&res->spinlock);
129
130 __dlm_lockres_calc_usage(dlm, res);
131
132 spin_unlock(&res->spinlock);
133 spin_unlock(&dlm->spinlock);
134}
135
136/* TODO: Eventual API: Called with the dlm spinlock held, may drop it
137 * to do migration, but will re-acquire before exit. */
138void dlm_purge_lockres(struct dlm_ctxt *dlm, struct dlm_lock_resource *lockres)
139{
140 int master;
141 int ret;
142
143 spin_lock(&lockres->spinlock);
144 master = lockres->owner == dlm->node_num;
145 spin_unlock(&lockres->spinlock);
146
147 mlog(0, "purging lockres %.*s, master = %d\n", lockres->lockname.len,
148 lockres->lockname.name, master);
149
150 /* Non master is the easy case -- no migration required, just
151 * quit. */
152 if (!master)
153 goto finish;
154
155 /* Wheee! Migrate lockres here! */
156 spin_unlock(&dlm->spinlock);
157again:
158
159 ret = dlm_migrate_lockres(dlm, lockres, O2NM_MAX_NODES);
160 if (ret == -ENOTEMPTY) {
161 mlog(ML_ERROR, "lockres %.*s still has local locks!\n",
162 lockres->lockname.len, lockres->lockname.name);
163
164 BUG();
165 } else if (ret < 0) {
166 mlog(ML_NOTICE, "lockres %.*s: migrate failed, retrying\n",
167 lockres->lockname.len, lockres->lockname.name);
168 goto again;
169 }
170
171 spin_lock(&dlm->spinlock);
172
173finish:
174 if (!list_empty(&lockres->purge)) {
175 list_del_init(&lockres->purge);
176 dlm->purge_count--;
177 }
178 __dlm_unhash_lockres(lockres);
179}
180
181static void dlm_run_purge_list(struct dlm_ctxt *dlm,
182 int purge_now)
183{
184 unsigned int run_max, unused;
185 unsigned long purge_jiffies;
186 struct dlm_lock_resource *lockres;
187
188 spin_lock(&dlm->spinlock);
189 run_max = dlm->purge_count;
190
191 while(run_max && !list_empty(&dlm->purge_list)) {
192 run_max--;
193
194 lockres = list_entry(dlm->purge_list.next,
195 struct dlm_lock_resource, purge);
196
197 /* Status of the lockres *might* change so double
198 * check. If the lockres is unused, holding the dlm
199 * spinlock will prevent people from getting and more
200 * refs on it -- there's no need to keep the lockres
201 * spinlock. */
202 spin_lock(&lockres->spinlock);
203 unused = __dlm_lockres_unused(lockres);
204 spin_unlock(&lockres->spinlock);
205
206 if (!unused)
207 continue;
208
209 purge_jiffies = lockres->last_used +
210 msecs_to_jiffies(DLM_PURGE_INTERVAL_MS);
211
212 /* Make sure that we want to be processing this guy at
213 * this time. */
214 if (!purge_now && time_after(purge_jiffies, jiffies)) {
215 /* Since resources are added to the purge list
216 * in tail order, we can stop at the first
217 * unpurgable resource -- anyone added after
218 * him will have a greater last_used value */
219 break;
220 }
221
222 list_del_init(&lockres->purge);
223 dlm->purge_count--;
224
225 /* This may drop and reacquire the dlm spinlock if it
226 * has to do migration. */
227 mlog(0, "calling dlm_purge_lockres!\n");
228 dlm_purge_lockres(dlm, lockres);
229 mlog(0, "DONE calling dlm_purge_lockres!\n");
230
231 /* Avoid adding any scheduling latencies */
232 cond_resched_lock(&dlm->spinlock);
233 }
234
235 spin_unlock(&dlm->spinlock);
236}
237
238static void dlm_shuffle_lists(struct dlm_ctxt *dlm,
239 struct dlm_lock_resource *res)
240{
241 struct dlm_lock *lock, *target;
242 struct list_head *iter;
243 struct list_head *head;
244 int can_grant = 1;
245
246 //mlog(0, "res->lockname.len=%d\n", res->lockname.len);
247 //mlog(0, "res->lockname.name=%p\n", res->lockname.name);
248 //mlog(0, "shuffle res %.*s\n", res->lockname.len,
249 // res->lockname.name);
250
251 /* because this function is called with the lockres
252 * spinlock, and because we know that it is not migrating/
253 * recovering/in-progress, it is fine to reserve asts and
254 * basts right before queueing them all throughout */
255 assert_spin_locked(&res->spinlock);
256 BUG_ON((res->state & (DLM_LOCK_RES_MIGRATING|
257 DLM_LOCK_RES_RECOVERING|
258 DLM_LOCK_RES_IN_PROGRESS)));
259
260converting:
261 if (list_empty(&res->converting))
262 goto blocked;
263 mlog(0, "res %.*s has locks on a convert queue\n", res->lockname.len,
264 res->lockname.name);
265
266 target = list_entry(res->converting.next, struct dlm_lock, list);
267 if (target->ml.convert_type == LKM_IVMODE) {
268 mlog(ML_ERROR, "%.*s: converting a lock with no "
269 "convert_type!\n", res->lockname.len, res->lockname.name);
270 BUG();
271 }
272 head = &res->granted;
273 list_for_each(iter, head) {
274 lock = list_entry(iter, struct dlm_lock, list);
275 if (lock==target)
276 continue;
277 if (!dlm_lock_compatible(lock->ml.type,
278 target->ml.convert_type)) {
279 can_grant = 0;
280 /* queue the BAST if not already */
281 if (lock->ml.highest_blocked == LKM_IVMODE) {
282 __dlm_lockres_reserve_ast(res);
283 dlm_queue_bast(dlm, lock);
284 }
285 /* update the highest_blocked if needed */
286 if (lock->ml.highest_blocked < target->ml.convert_type)
287 lock->ml.highest_blocked =
288 target->ml.convert_type;
289 }
290 }
291 head = &res->converting;
292 list_for_each(iter, head) {
293 lock = list_entry(iter, struct dlm_lock, list);
294 if (lock==target)
295 continue;
296 if (!dlm_lock_compatible(lock->ml.type,
297 target->ml.convert_type)) {
298 can_grant = 0;
299 if (lock->ml.highest_blocked == LKM_IVMODE) {
300 __dlm_lockres_reserve_ast(res);
301 dlm_queue_bast(dlm, lock);
302 }
303 if (lock->ml.highest_blocked < target->ml.convert_type)
304 lock->ml.highest_blocked =
305 target->ml.convert_type;
306 }
307 }
308
309 /* we can convert the lock */
310 if (can_grant) {
311 spin_lock(&target->spinlock);
312 BUG_ON(target->ml.highest_blocked != LKM_IVMODE);
313
314 mlog(0, "calling ast for converting lock: %.*s, have: %d, "
315 "granting: %d, node: %u\n", res->lockname.len,
316 res->lockname.name, target->ml.type,
317 target->ml.convert_type, target->ml.node);
318
319 target->ml.type = target->ml.convert_type;
320 target->ml.convert_type = LKM_IVMODE;
321 list_del_init(&target->list);
322 list_add_tail(&target->list, &res->granted);
323
324 BUG_ON(!target->lksb);
325 target->lksb->status = DLM_NORMAL;
326
327 spin_unlock(&target->spinlock);
328
329 __dlm_lockres_reserve_ast(res);
330 dlm_queue_ast(dlm, target);
331 /* go back and check for more */
332 goto converting;
333 }
334
335blocked:
336 if (list_empty(&res->blocked))
337 goto leave;
338 target = list_entry(res->blocked.next, struct dlm_lock, list);
339
340 head = &res->granted;
341 list_for_each(iter, head) {
342 lock = list_entry(iter, struct dlm_lock, list);
343 if (lock==target)
344 continue;
345 if (!dlm_lock_compatible(lock->ml.type, target->ml.type)) {
346 can_grant = 0;
347 if (lock->ml.highest_blocked == LKM_IVMODE) {
348 __dlm_lockres_reserve_ast(res);
349 dlm_queue_bast(dlm, lock);
350 }
351 if (lock->ml.highest_blocked < target->ml.type)
352 lock->ml.highest_blocked = target->ml.type;
353 }
354 }
355
356 head = &res->converting;
357 list_for_each(iter, head) {
358 lock = list_entry(iter, struct dlm_lock, list);
359 if (lock==target)
360 continue;
361 if (!dlm_lock_compatible(lock->ml.type, target->ml.type)) {
362 can_grant = 0;
363 if (lock->ml.highest_blocked == LKM_IVMODE) {
364 __dlm_lockres_reserve_ast(res);
365 dlm_queue_bast(dlm, lock);
366 }
367 if (lock->ml.highest_blocked < target->ml.type)
368 lock->ml.highest_blocked = target->ml.type;
369 }
370 }
371
372 /* we can grant the blocked lock (only
373 * possible if converting list empty) */
374 if (can_grant) {
375 spin_lock(&target->spinlock);
376 BUG_ON(target->ml.highest_blocked != LKM_IVMODE);
377
378 mlog(0, "calling ast for blocked lock: %.*s, granting: %d, "
379 "node: %u\n", res->lockname.len, res->lockname.name,
380 target->ml.type, target->ml.node);
381
382 // target->ml.type is already correct
383 list_del_init(&target->list);
384 list_add_tail(&target->list, &res->granted);
385
386 BUG_ON(!target->lksb);
387 target->lksb->status = DLM_NORMAL;
388
389 spin_unlock(&target->spinlock);
390
391 __dlm_lockres_reserve_ast(res);
392 dlm_queue_ast(dlm, target);
393 /* go back and check for more */
394 goto converting;
395 }
396
397leave:
398 return;
399}
400
401/* must have NO locks when calling this with res !=NULL * */
402void dlm_kick_thread(struct dlm_ctxt *dlm, struct dlm_lock_resource *res)
403{
404 mlog_entry("dlm=%p, res=%p\n", dlm, res);
405 if (res) {
406 spin_lock(&dlm->spinlock);
407 spin_lock(&res->spinlock);
408 __dlm_dirty_lockres(dlm, res);
409 spin_unlock(&res->spinlock);
410 spin_unlock(&dlm->spinlock);
411 }
412 wake_up(&dlm->dlm_thread_wq);
413}
414
415void __dlm_dirty_lockres(struct dlm_ctxt *dlm, struct dlm_lock_resource *res)
416{
417 mlog_entry("dlm=%p, res=%p\n", dlm, res);
418
419 assert_spin_locked(&dlm->spinlock);
420 assert_spin_locked(&res->spinlock);
421
422 /* don't shuffle secondary queues */
423 if ((res->owner == dlm->node_num) &&
424 !(res->state & DLM_LOCK_RES_DIRTY)) {
425 list_add_tail(&res->dirty, &dlm->dirty_list);
426 res->state |= DLM_LOCK_RES_DIRTY;
427 }
428}
429
430
431/* Launch the NM thread for the mounted volume */
432int dlm_launch_thread(struct dlm_ctxt *dlm)
433{
434 mlog(0, "starting dlm thread...\n");
435
436 dlm->dlm_thread_task = kthread_run(dlm_thread, dlm, "dlm_thread");
437 if (IS_ERR(dlm->dlm_thread_task)) {
438 mlog_errno(PTR_ERR(dlm->dlm_thread_task));
439 dlm->dlm_thread_task = NULL;
440 return -EINVAL;
441 }
442
443 return 0;
444}
445
446void dlm_complete_thread(struct dlm_ctxt *dlm)
447{
448 if (dlm->dlm_thread_task) {
449 mlog(ML_KTHREAD, "waiting for dlm thread to exit\n");
450 kthread_stop(dlm->dlm_thread_task);
451 dlm->dlm_thread_task = NULL;
452 }
453}
454
455static int dlm_dirty_list_empty(struct dlm_ctxt *dlm)
456{
457 int empty;
458
459 spin_lock(&dlm->spinlock);
460 empty = list_empty(&dlm->dirty_list);
461 spin_unlock(&dlm->spinlock);
462
463 return empty;
464}
465
466static void dlm_flush_asts(struct dlm_ctxt *dlm)
467{
468 int ret;
469 struct dlm_lock *lock;
470 struct dlm_lock_resource *res;
471 u8 hi;
472
473 spin_lock(&dlm->ast_lock);
474 while (!list_empty(&dlm->pending_asts)) {
475 lock = list_entry(dlm->pending_asts.next,
476 struct dlm_lock, ast_list);
477 /* get an extra ref on lock */
478 dlm_lock_get(lock);
479 res = lock->lockres;
480 mlog(0, "delivering an ast for this lockres\n");
481
482 BUG_ON(!lock->ast_pending);
483
484 /* remove from list (including ref) */
485 list_del_init(&lock->ast_list);
486 dlm_lock_put(lock);
487 spin_unlock(&dlm->ast_lock);
488
489 if (lock->ml.node != dlm->node_num) {
490 ret = dlm_do_remote_ast(dlm, res, lock);
491 if (ret < 0)
492 mlog_errno(ret);
493 } else
494 dlm_do_local_ast(dlm, res, lock);
495
496 spin_lock(&dlm->ast_lock);
497
498 /* possible that another ast was queued while
499 * we were delivering the last one */
500 if (!list_empty(&lock->ast_list)) {
501 mlog(0, "aha another ast got queued while "
502 "we were finishing the last one. will "
503 "keep the ast_pending flag set.\n");
504 } else
505 lock->ast_pending = 0;
506
507 /* drop the extra ref.
508 * this may drop it completely. */
509 dlm_lock_put(lock);
510 dlm_lockres_release_ast(dlm, res);
511 }
512
513 while (!list_empty(&dlm->pending_basts)) {
514 lock = list_entry(dlm->pending_basts.next,
515 struct dlm_lock, bast_list);
516 /* get an extra ref on lock */
517 dlm_lock_get(lock);
518 res = lock->lockres;
519
520 BUG_ON(!lock->bast_pending);
521
522 /* get the highest blocked lock, and reset */
523 spin_lock(&lock->spinlock);
524 BUG_ON(lock->ml.highest_blocked <= LKM_IVMODE);
525 hi = lock->ml.highest_blocked;
526 lock->ml.highest_blocked = LKM_IVMODE;
527 spin_unlock(&lock->spinlock);
528
529 /* remove from list (including ref) */
530 list_del_init(&lock->bast_list);
531 dlm_lock_put(lock);
532 spin_unlock(&dlm->ast_lock);
533
534 mlog(0, "delivering a bast for this lockres "
535 "(blocked = %d\n", hi);
536
537 if (lock->ml.node != dlm->node_num) {
538 ret = dlm_send_proxy_bast(dlm, res, lock, hi);
539 if (ret < 0)
540 mlog_errno(ret);
541 } else
542 dlm_do_local_bast(dlm, res, lock, hi);
543
544 spin_lock(&dlm->ast_lock);
545
546 /* possible that another bast was queued while
547 * we were delivering the last one */
548 if (!list_empty(&lock->bast_list)) {
549 mlog(0, "aha another bast got queued while "
550 "we were finishing the last one. will "
551 "keep the bast_pending flag set.\n");
552 } else
553 lock->bast_pending = 0;
554
555 /* drop the extra ref.
556 * this may drop it completely. */
557 dlm_lock_put(lock);
558 dlm_lockres_release_ast(dlm, res);
559 }
560 wake_up(&dlm->ast_wq);
561 spin_unlock(&dlm->ast_lock);
562}
563
564
565#define DLM_THREAD_TIMEOUT_MS (4 * 1000)
566#define DLM_THREAD_MAX_DIRTY 100
567#define DLM_THREAD_MAX_ASTS 10
568
569static int dlm_thread(void *data)
570{
571 struct dlm_lock_resource *res;
572 struct dlm_ctxt *dlm = data;
573 unsigned long timeout = msecs_to_jiffies(DLM_THREAD_TIMEOUT_MS);
574
575 mlog(0, "dlm thread running for %s...\n", dlm->name);
576
577 while (!kthread_should_stop()) {
578 int n = DLM_THREAD_MAX_DIRTY;
579
580 /* dlm_shutting_down is very point-in-time, but that
581 * doesn't matter as we'll just loop back around if we
582 * get false on the leading edge of a state
583 * transition. */
584 dlm_run_purge_list(dlm, dlm_shutting_down(dlm));
585
586 /* We really don't want to hold dlm->spinlock while
587 * calling dlm_shuffle_lists on each lockres that
588 * needs to have its queues adjusted and AST/BASTs
589 * run. So let's pull each entry off the dirty_list
590 * and drop dlm->spinlock ASAP. Once off the list,
591 * res->spinlock needs to be taken again to protect
592 * the queues while calling dlm_shuffle_lists. */
593 spin_lock(&dlm->spinlock);
594 while (!list_empty(&dlm->dirty_list)) {
595 int delay = 0;
596 res = list_entry(dlm->dirty_list.next,
597 struct dlm_lock_resource, dirty);
598
599 /* peel a lockres off, remove it from the list,
600 * unset the dirty flag and drop the dlm lock */
601 BUG_ON(!res);
602 dlm_lockres_get(res);
603
604 spin_lock(&res->spinlock);
605 res->state &= ~DLM_LOCK_RES_DIRTY;
606 list_del_init(&res->dirty);
607 spin_unlock(&res->spinlock);
608 spin_unlock(&dlm->spinlock);
609
610 /* lockres can be re-dirtied/re-added to the
611 * dirty_list in this gap, but that is ok */
612
613 spin_lock(&res->spinlock);
614 if (res->owner != dlm->node_num) {
615 __dlm_print_one_lock_resource(res);
616 mlog(ML_ERROR, "inprog:%s, mig:%s, reco:%s, dirty:%s\n",
617 res->state & DLM_LOCK_RES_IN_PROGRESS ? "yes" : "no",
618 res->state & DLM_LOCK_RES_MIGRATING ? "yes" : "no",
619 res->state & DLM_LOCK_RES_RECOVERING ? "yes" : "no",
620 res->state & DLM_LOCK_RES_DIRTY ? "yes" : "no");
621 }
622 BUG_ON(res->owner != dlm->node_num);
623
624 /* it is now ok to move lockreses in these states
625 * to the dirty list, assuming that they will only be
626 * dirty for a short while. */
627 if (res->state & (DLM_LOCK_RES_IN_PROGRESS |
628 DLM_LOCK_RES_MIGRATING |
629 DLM_LOCK_RES_RECOVERING)) {
630 /* move it to the tail and keep going */
631 spin_unlock(&res->spinlock);
632 mlog(0, "delaying list shuffling for in-"
633 "progress lockres %.*s, state=%d\n",
634 res->lockname.len, res->lockname.name,
635 res->state);
636 delay = 1;
637 goto in_progress;
638 }
639
640 /* at this point the lockres is not migrating/
641 * recovering/in-progress. we have the lockres
642 * spinlock and do NOT have the dlm lock.
643 * safe to reserve/queue asts and run the lists. */
644
645 mlog(0, "calling dlm_shuffle_lists with dlm=%p, "
646 "res=%p\n", dlm, res);
647
648 /* called while holding lockres lock */
649 dlm_shuffle_lists(dlm, res);
650 spin_unlock(&res->spinlock);
651
652 dlm_lockres_calc_usage(dlm, res);
653
654in_progress:
655
656 spin_lock(&dlm->spinlock);
657 /* if the lock was in-progress, stick
658 * it on the back of the list */
659 if (delay) {
660 spin_lock(&res->spinlock);
661 list_add_tail(&res->dirty, &dlm->dirty_list);
662 res->state |= DLM_LOCK_RES_DIRTY;
663 spin_unlock(&res->spinlock);
664 }
665 dlm_lockres_put(res);
666
667 /* unlikely, but we may need to give time to
668 * other tasks */
669 if (!--n) {
670 mlog(0, "throttling dlm_thread\n");
671 break;
672 }
673 }
674
675 spin_unlock(&dlm->spinlock);
676 dlm_flush_asts(dlm);
677
678 /* yield and continue right away if there is more work to do */
679 if (!n) {
680 yield();
681 continue;
682 }
683
684 wait_event_interruptible_timeout(dlm->dlm_thread_wq,
685 !dlm_dirty_list_empty(dlm) ||
686 kthread_should_stop(),
687 timeout);
688 }
689
690 mlog(0, "quitting DLM thread\n");
691 return 0;
692}
diff --git a/fs/ocfs2/dlm/dlmunlock.c b/fs/ocfs2/dlm/dlmunlock.c
new file mode 100644
index 000000000000..cec2ce1cd318
--- /dev/null
+++ b/fs/ocfs2/dlm/dlmunlock.c
@@ -0,0 +1,672 @@
1/* -*- mode: c; c-basic-offset: 8; -*-
2 * vim: noexpandtab sw=8 ts=8 sts=0:
3 *
4 * dlmunlock.c
5 *
6 * underlying calls for unlocking locks
7 *
8 * Copyright (C) 2004 Oracle. All rights reserved.
9 *
10 * This program is free software; you can redistribute it and/or
11 * modify it under the terms of the GNU General Public
12 * License as published by the Free Software Foundation; either
13 * version 2 of the License, or (at your option) any later version.
14 *
15 * This program is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
18 * General Public License for more details.
19 *
20 * You should have received a copy of the GNU General Public
21 * License along with this program; if not, write to the
22 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
23 * Boston, MA 02111-1307, USA.
24 *
25 */
26
27
28#include <linux/module.h>
29#include <linux/fs.h>
30#include <linux/types.h>
31#include <linux/slab.h>
32#include <linux/highmem.h>
33#include <linux/utsname.h>
34#include <linux/init.h>
35#include <linux/sysctl.h>
36#include <linux/random.h>
37#include <linux/blkdev.h>
38#include <linux/socket.h>
39#include <linux/inet.h>
40#include <linux/spinlock.h>
41#include <linux/delay.h>
42
43#include "cluster/heartbeat.h"
44#include "cluster/nodemanager.h"
45#include "cluster/tcp.h"
46
47#include "dlmapi.h"
48#include "dlmcommon.h"
49
50#define MLOG_MASK_PREFIX ML_DLM
51#include "cluster/masklog.h"
52
53#define DLM_UNLOCK_FREE_LOCK 0x00000001
54#define DLM_UNLOCK_CALL_AST 0x00000002
55#define DLM_UNLOCK_REMOVE_LOCK 0x00000004
56#define DLM_UNLOCK_REGRANT_LOCK 0x00000008
57#define DLM_UNLOCK_CLEAR_CONVERT_TYPE 0x00000010
58
59
60static enum dlm_status dlm_get_cancel_actions(struct dlm_ctxt *dlm,
61 struct dlm_lock_resource *res,
62 struct dlm_lock *lock,
63 struct dlm_lockstatus *lksb,
64 int *actions);
65static enum dlm_status dlm_get_unlock_actions(struct dlm_ctxt *dlm,
66 struct dlm_lock_resource *res,
67 struct dlm_lock *lock,
68 struct dlm_lockstatus *lksb,
69 int *actions);
70
71static enum dlm_status dlm_send_remote_unlock_request(struct dlm_ctxt *dlm,
72 struct dlm_lock_resource *res,
73 struct dlm_lock *lock,
74 struct dlm_lockstatus *lksb,
75 int flags,
76 u8 owner);
77
78
79/*
80 * according to the spec:
81 * http://opendlm.sourceforge.net/cvsmirror/opendlm/docs/dlmbook_final.pdf
82 *
83 * flags & LKM_CANCEL != 0: must be converting or blocked
84 * flags & LKM_CANCEL == 0: must be granted
85 *
86 * So to unlock a converting lock, you must first cancel the
87 * convert (passing LKM_CANCEL in flags), then call the unlock
88 * again (with no LKM_CANCEL in flags).
89 */
90
91
92/*
93 * locking:
94 * caller needs: none
95 * taken: res->spinlock and lock->spinlock taken and dropped
96 * held on exit: none
97 * returns: DLM_NORMAL, DLM_NOLOCKMGR, status from network
98 * all callers should have taken an extra ref on lock coming in
99 */
100static enum dlm_status dlmunlock_common(struct dlm_ctxt *dlm,
101 struct dlm_lock_resource *res,
102 struct dlm_lock *lock,
103 struct dlm_lockstatus *lksb,
104 int flags, int *call_ast,
105 int master_node)
106{
107 enum dlm_status status;
108 int actions = 0;
109 int in_use;
110 u8 owner;
111
112 mlog(0, "master_node = %d, valblk = %d\n", master_node,
113 flags & LKM_VALBLK);
114
115 if (master_node)
116 BUG_ON(res->owner != dlm->node_num);
117 else
118 BUG_ON(res->owner == dlm->node_num);
119
120 spin_lock(&dlm->spinlock);
121 /* We want to be sure that we're not freeing a lock
122 * that still has AST's pending... */
123 in_use = !list_empty(&lock->ast_list);
124 spin_unlock(&dlm->spinlock);
125 if (in_use) {
126 mlog(ML_ERROR, "lockres %.*s: Someone is calling dlmunlock "
127 "while waiting for an ast!", res->lockname.len,
128 res->lockname.name);
129 return DLM_BADPARAM;
130 }
131
132 spin_lock(&res->spinlock);
133 if (res->state & DLM_LOCK_RES_IN_PROGRESS) {
134 if (master_node) {
135 mlog(ML_ERROR, "lockres in progress!\n");
136 spin_unlock(&res->spinlock);
137 return DLM_FORWARD;
138 }
139 /* ok for this to sleep if not in a network handler */
140 __dlm_wait_on_lockres(res);
141 res->state |= DLM_LOCK_RES_IN_PROGRESS;
142 }
143 spin_lock(&lock->spinlock);
144
145 if (res->state & DLM_LOCK_RES_RECOVERING) {
146 status = DLM_RECOVERING;
147 goto leave;
148 }
149
150
151 /* see above for what the spec says about
152 * LKM_CANCEL and the lock queue state */
153 if (flags & LKM_CANCEL)
154 status = dlm_get_cancel_actions(dlm, res, lock, lksb, &actions);
155 else
156 status = dlm_get_unlock_actions(dlm, res, lock, lksb, &actions);
157
158 if (status != DLM_NORMAL)
159 goto leave;
160
161 /* By now this has been masked out of cancel requests. */
162 if (flags & LKM_VALBLK) {
163 /* make the final update to the lvb */
164 if (master_node)
165 memcpy(res->lvb, lksb->lvb, DLM_LVB_LEN);
166 else
167 flags |= LKM_PUT_LVB; /* let the send function
168 * handle it. */
169 }
170
171 if (!master_node) {
172 owner = res->owner;
173 /* drop locks and send message */
174 if (flags & LKM_CANCEL)
175 lock->cancel_pending = 1;
176 else
177 lock->unlock_pending = 1;
178 spin_unlock(&lock->spinlock);
179 spin_unlock(&res->spinlock);
180 status = dlm_send_remote_unlock_request(dlm, res, lock, lksb,
181 flags, owner);
182 spin_lock(&res->spinlock);
183 spin_lock(&lock->spinlock);
184 /* if the master told us the lock was already granted,
185 * let the ast handle all of these actions */
186 if (status == DLM_NORMAL &&
187 lksb->status == DLM_CANCELGRANT) {
188 actions &= ~(DLM_UNLOCK_REMOVE_LOCK|
189 DLM_UNLOCK_REGRANT_LOCK|
190 DLM_UNLOCK_CLEAR_CONVERT_TYPE);
191 }
192 if (flags & LKM_CANCEL)
193 lock->cancel_pending = 0;
194 else
195 lock->unlock_pending = 0;
196
197 }
198
199 /* get an extra ref on lock. if we are just switching
200 * lists here, we dont want the lock to go away. */
201 dlm_lock_get(lock);
202
203 if (actions & DLM_UNLOCK_REMOVE_LOCK) {
204 list_del_init(&lock->list);
205 dlm_lock_put(lock);
206 }
207 if (actions & DLM_UNLOCK_REGRANT_LOCK) {
208 dlm_lock_get(lock);
209 list_add_tail(&lock->list, &res->granted);
210 }
211 if (actions & DLM_UNLOCK_CLEAR_CONVERT_TYPE) {
212 mlog(0, "clearing convert_type at %smaster node\n",
213 master_node ? "" : "non-");
214 lock->ml.convert_type = LKM_IVMODE;
215 }
216
217 /* remove the extra ref on lock */
218 dlm_lock_put(lock);
219
220leave:
221 res->state &= ~DLM_LOCK_RES_IN_PROGRESS;
222 if (!dlm_lock_on_list(&res->converting, lock))
223 BUG_ON(lock->ml.convert_type != LKM_IVMODE);
224 else
225 BUG_ON(lock->ml.convert_type == LKM_IVMODE);
226 spin_unlock(&lock->spinlock);
227 spin_unlock(&res->spinlock);
228 wake_up(&res->wq);
229
230 /* let the caller's final dlm_lock_put handle the actual kfree */
231 if (actions & DLM_UNLOCK_FREE_LOCK) {
232 /* this should always be coupled with list removal */
233 BUG_ON(!(actions & DLM_UNLOCK_REMOVE_LOCK));
234 mlog(0, "lock %"MLFu64" should be gone now! refs=%d\n",
235 lock->ml.cookie, atomic_read(&lock->lock_refs.refcount)-1);
236 dlm_lock_put(lock);
237 }
238 if (actions & DLM_UNLOCK_CALL_AST)
239 *call_ast = 1;
240
241 /* if cancel or unlock succeeded, lvb work is done */
242 if (status == DLM_NORMAL)
243 lksb->flags &= ~(DLM_LKSB_PUT_LVB|DLM_LKSB_GET_LVB);
244
245 return status;
246}
247
248void dlm_commit_pending_unlock(struct dlm_lock_resource *res,
249 struct dlm_lock *lock)
250{
251 /* leave DLM_LKSB_PUT_LVB on the lksb so any final
252 * update of the lvb will be sent to the new master */
253 list_del_init(&lock->list);
254}
255
256void dlm_commit_pending_cancel(struct dlm_lock_resource *res,
257 struct dlm_lock *lock)
258{
259 list_del_init(&lock->list);
260 list_add_tail(&lock->list, &res->granted);
261 lock->ml.convert_type = LKM_IVMODE;
262}
263
264
265static inline enum dlm_status dlmunlock_master(struct dlm_ctxt *dlm,
266 struct dlm_lock_resource *res,
267 struct dlm_lock *lock,
268 struct dlm_lockstatus *lksb,
269 int flags,
270 int *call_ast)
271{
272 return dlmunlock_common(dlm, res, lock, lksb, flags, call_ast, 1);
273}
274
275static inline enum dlm_status dlmunlock_remote(struct dlm_ctxt *dlm,
276 struct dlm_lock_resource *res,
277 struct dlm_lock *lock,
278 struct dlm_lockstatus *lksb,
279 int flags, int *call_ast)
280{
281 return dlmunlock_common(dlm, res, lock, lksb, flags, call_ast, 0);
282}
283
284/*
285 * locking:
286 * caller needs: none
287 * taken: none
288 * held on exit: none
289 * returns: DLM_NORMAL, DLM_NOLOCKMGR, status from network
290 */
291static enum dlm_status dlm_send_remote_unlock_request(struct dlm_ctxt *dlm,
292 struct dlm_lock_resource *res,
293 struct dlm_lock *lock,
294 struct dlm_lockstatus *lksb,
295 int flags,
296 u8 owner)
297{
298 struct dlm_unlock_lock unlock;
299 int tmpret;
300 enum dlm_status ret;
301 int status = 0;
302 struct kvec vec[2];
303 size_t veclen = 1;
304
305 mlog_entry("%.*s\n", res->lockname.len, res->lockname.name);
306
307 memset(&unlock, 0, sizeof(unlock));
308 unlock.node_idx = dlm->node_num;
309 unlock.flags = cpu_to_be32(flags);
310 unlock.cookie = lock->ml.cookie;
311 unlock.namelen = res->lockname.len;
312 memcpy(unlock.name, res->lockname.name, unlock.namelen);
313
314 vec[0].iov_len = sizeof(struct dlm_unlock_lock);
315 vec[0].iov_base = &unlock;
316
317 if (flags & LKM_PUT_LVB) {
318 /* extra data to send if we are updating lvb */
319 vec[1].iov_len = DLM_LVB_LEN;
320 vec[1].iov_base = lock->lksb->lvb;
321 veclen++;
322 }
323
324 tmpret = o2net_send_message_vec(DLM_UNLOCK_LOCK_MSG, dlm->key,
325 vec, veclen, owner, &status);
326 if (tmpret >= 0) {
327 // successfully sent and received
328 if (status == DLM_CANCELGRANT)
329 ret = DLM_NORMAL;
330 else if (status == DLM_FORWARD) {
331 mlog(0, "master was in-progress. retry\n");
332 ret = DLM_FORWARD;
333 } else
334 ret = status;
335 lksb->status = status;
336 } else {
337 mlog_errno(tmpret);
338 if (dlm_is_host_down(tmpret)) {
339 /* NOTE: this seems strange, but it is what we want.
340 * when the master goes down during a cancel or
341 * unlock, the recovery code completes the operation
342 * as if the master had not died, then passes the
343 * updated state to the recovery master. this thread
344 * just needs to finish out the operation and call
345 * the unlockast. */
346 ret = DLM_NORMAL;
347 } else {
348 /* something bad. this will BUG in ocfs2 */
349 ret = dlm_err_to_dlm_status(tmpret);
350 }
351 lksb->status = ret;
352 }
353
354 return ret;
355}
356
357/*
358 * locking:
359 * caller needs: none
360 * taken: takes and drops res->spinlock
361 * held on exit: none
362 * returns: DLM_NORMAL, DLM_BADARGS, DLM_IVLOCKID,
363 * return value from dlmunlock_master
364 */
365int dlm_unlock_lock_handler(struct o2net_msg *msg, u32 len, void *data)
366{
367 struct dlm_ctxt *dlm = data;
368 struct dlm_unlock_lock *unlock = (struct dlm_unlock_lock *)msg->buf;
369 struct dlm_lock_resource *res = NULL;
370 struct list_head *iter;
371 struct dlm_lock *lock = NULL;
372 enum dlm_status status = DLM_NORMAL;
373 int found = 0, i;
374 struct dlm_lockstatus *lksb = NULL;
375 int ignore;
376 u32 flags;
377 struct list_head *queue;
378
379 flags = be32_to_cpu(unlock->flags);
380
381 if (flags & LKM_GET_LVB) {
382 mlog(ML_ERROR, "bad args! GET_LVB specified on unlock!\n");
383 return DLM_BADARGS;
384 }
385
386 if ((flags & (LKM_PUT_LVB|LKM_CANCEL)) == (LKM_PUT_LVB|LKM_CANCEL)) {
387 mlog(ML_ERROR, "bad args! cannot modify lvb on a CANCEL "
388 "request!\n");
389 return DLM_BADARGS;
390 }
391
392 if (unlock->namelen > DLM_LOCKID_NAME_MAX) {
393 mlog(ML_ERROR, "Invalid name length in unlock handler!\n");
394 return DLM_IVBUFLEN;
395 }
396
397 if (!dlm_grab(dlm))
398 return DLM_REJECTED;
399
400 mlog_bug_on_msg(!dlm_domain_fully_joined(dlm),
401 "Domain %s not fully joined!\n", dlm->name);
402
403 mlog(0, "lvb: %s\n", flags & LKM_PUT_LVB ? "put lvb" : "none");
404
405 res = dlm_lookup_lockres(dlm, unlock->name, unlock->namelen);
406 if (!res) {
407 /* We assume here that a no lock resource simply means
408 * it was migrated away and destroyed before the other
409 * node could detect it. */
410 mlog(0, "returning DLM_FORWARD -- res no longer exists\n");
411 status = DLM_FORWARD;
412 goto not_found;
413 }
414
415 queue=&res->granted;
416 found = 0;
417 spin_lock(&res->spinlock);
418 if (res->state & DLM_LOCK_RES_RECOVERING) {
419 spin_unlock(&res->spinlock);
420 mlog(0, "returning DLM_RECOVERING\n");
421 status = DLM_RECOVERING;
422 goto leave;
423 }
424
425 if (res->state & DLM_LOCK_RES_MIGRATING) {
426 spin_unlock(&res->spinlock);
427 mlog(0, "returning DLM_MIGRATING\n");
428 status = DLM_MIGRATING;
429 goto leave;
430 }
431
432 if (res->owner != dlm->node_num) {
433 spin_unlock(&res->spinlock);
434 mlog(0, "returning DLM_FORWARD -- not master\n");
435 status = DLM_FORWARD;
436 goto leave;
437 }
438
439 for (i=0; i<3; i++) {
440 list_for_each(iter, queue) {
441 lock = list_entry(iter, struct dlm_lock, list);
442 if (lock->ml.cookie == unlock->cookie &&
443 lock->ml.node == unlock->node_idx) {
444 dlm_lock_get(lock);
445 found = 1;
446 break;
447 }
448 }
449 if (found)
450 break;
451 /* scan granted -> converting -> blocked queues */
452 queue++;
453 }
454 spin_unlock(&res->spinlock);
455 if (!found) {
456 status = DLM_IVLOCKID;
457 goto not_found;
458 }
459
460 /* lock was found on queue */
461 lksb = lock->lksb;
462 /* unlockast only called on originating node */
463 if (flags & LKM_PUT_LVB) {
464 lksb->flags |= DLM_LKSB_PUT_LVB;
465 memcpy(&lksb->lvb[0], &unlock->lvb[0], DLM_LVB_LEN);
466 }
467
468 /* if this is in-progress, propagate the DLM_FORWARD
469 * all the way back out */
470 status = dlmunlock_master(dlm, res, lock, lksb, flags, &ignore);
471 if (status == DLM_FORWARD)
472 mlog(0, "lockres is in progress\n");
473
474 if (flags & LKM_PUT_LVB)
475 lksb->flags &= ~DLM_LKSB_PUT_LVB;
476
477 dlm_lockres_calc_usage(dlm, res);
478 dlm_kick_thread(dlm, res);
479
480not_found:
481 if (!found)
482 mlog(ML_ERROR, "failed to find lock to unlock! "
483 "cookie=%"MLFu64"\n",
484 unlock->cookie);
485 else {
486 /* send the lksb->status back to the other node */
487 status = lksb->status;
488 dlm_lock_put(lock);
489 }
490
491leave:
492 if (res)
493 dlm_lockres_put(res);
494
495 dlm_put(dlm);
496
497 return status;
498}
499
500
501static enum dlm_status dlm_get_cancel_actions(struct dlm_ctxt *dlm,
502 struct dlm_lock_resource *res,
503 struct dlm_lock *lock,
504 struct dlm_lockstatus *lksb,
505 int *actions)
506{
507 enum dlm_status status;
508
509 if (dlm_lock_on_list(&res->blocked, lock)) {
510 /* cancel this outright */
511 lksb->status = DLM_NORMAL;
512 status = DLM_NORMAL;
513 *actions = (DLM_UNLOCK_CALL_AST |
514 DLM_UNLOCK_REMOVE_LOCK);
515 } else if (dlm_lock_on_list(&res->converting, lock)) {
516 /* cancel the request, put back on granted */
517 lksb->status = DLM_NORMAL;
518 status = DLM_NORMAL;
519 *actions = (DLM_UNLOCK_CALL_AST |
520 DLM_UNLOCK_REMOVE_LOCK |
521 DLM_UNLOCK_REGRANT_LOCK |
522 DLM_UNLOCK_CLEAR_CONVERT_TYPE);
523 } else if (dlm_lock_on_list(&res->granted, lock)) {
524 /* too late, already granted. DLM_CANCELGRANT */
525 lksb->status = DLM_CANCELGRANT;
526 status = DLM_NORMAL;
527 *actions = DLM_UNLOCK_CALL_AST;
528 } else {
529 mlog(ML_ERROR, "lock to cancel is not on any list!\n");
530 lksb->status = DLM_IVLOCKID;
531 status = DLM_IVLOCKID;
532 *actions = 0;
533 }
534 return status;
535}
536
537static enum dlm_status dlm_get_unlock_actions(struct dlm_ctxt *dlm,
538 struct dlm_lock_resource *res,
539 struct dlm_lock *lock,
540 struct dlm_lockstatus *lksb,
541 int *actions)
542{
543 enum dlm_status status;
544
545 /* unlock request */
546 if (!dlm_lock_on_list(&res->granted, lock)) {
547 lksb->status = DLM_DENIED;
548 status = DLM_DENIED;
549 dlm_error(status);
550 *actions = 0;
551 } else {
552 /* unlock granted lock */
553 lksb->status = DLM_NORMAL;
554 status = DLM_NORMAL;
555 *actions = (DLM_UNLOCK_FREE_LOCK |
556 DLM_UNLOCK_CALL_AST |
557 DLM_UNLOCK_REMOVE_LOCK);
558 }
559 return status;
560}
561
562/* there seems to be no point in doing this async
563 * since (even for the remote case) there is really
564 * no work to queue up... so just do it and fire the
565 * unlockast by hand when done... */
566enum dlm_status dlmunlock(struct dlm_ctxt *dlm, struct dlm_lockstatus *lksb,
567 int flags, dlm_astunlockfunc_t *unlockast, void *data)
568{
569 enum dlm_status status;
570 struct dlm_lock_resource *res;
571 struct dlm_lock *lock = NULL;
572 int call_ast, is_master;
573
574 mlog_entry_void();
575
576 if (!lksb) {
577 dlm_error(DLM_BADARGS);
578 return DLM_BADARGS;
579 }
580
581 if (flags & ~(LKM_CANCEL | LKM_VALBLK | LKM_INVVALBLK)) {
582 dlm_error(DLM_BADPARAM);
583 return DLM_BADPARAM;
584 }
585
586 if ((flags & (LKM_VALBLK | LKM_CANCEL)) == (LKM_VALBLK | LKM_CANCEL)) {
587 mlog(0, "VALBLK given with CANCEL: ignoring VALBLK\n");
588 flags &= ~LKM_VALBLK;
589 }
590
591 if (!lksb->lockid || !lksb->lockid->lockres) {
592 dlm_error(DLM_BADPARAM);
593 return DLM_BADPARAM;
594 }
595
596 lock = lksb->lockid;
597 BUG_ON(!lock);
598 dlm_lock_get(lock);
599
600 res = lock->lockres;
601 BUG_ON(!res);
602 dlm_lockres_get(res);
603retry:
604 call_ast = 0;
605 /* need to retry up here because owner may have changed */
606 mlog(0, "lock=%p res=%p\n", lock, res);
607
608 spin_lock(&res->spinlock);
609 is_master = (res->owner == dlm->node_num);
610 spin_unlock(&res->spinlock);
611
612 if (is_master) {
613 status = dlmunlock_master(dlm, res, lock, lksb, flags,
614 &call_ast);
615 mlog(0, "done calling dlmunlock_master: returned %d, "
616 "call_ast is %d\n", status, call_ast);
617 } else {
618 status = dlmunlock_remote(dlm, res, lock, lksb, flags,
619 &call_ast);
620 mlog(0, "done calling dlmunlock_remote: returned %d, "
621 "call_ast is %d\n", status, call_ast);
622 }
623
624 if (status == DLM_RECOVERING ||
625 status == DLM_MIGRATING ||
626 status == DLM_FORWARD) {
627 /* We want to go away for a tiny bit to allow recovery
628 * / migration to complete on this resource. I don't
629 * know of any wait queue we could sleep on as this
630 * may be happening on another node. Perhaps the
631 * proper solution is to queue up requests on the
632 * other end? */
633
634 /* do we want to yield(); ?? */
635 msleep(50);
636
637 mlog(0, "retrying unlock due to pending recovery/"
638 "migration/in-progress\n");
639 goto retry;
640 }
641
642 if (call_ast) {
643 mlog(0, "calling unlockast(%p, %d)\n", data, lksb->status);
644 if (is_master) {
645 /* it is possible that there is one last bast
646 * pending. make sure it is flushed, then
647 * call the unlockast.
648 * not an issue if this is a mastered remotely,
649 * since this lock has been removed from the
650 * lockres queues and cannot be found. */
651 dlm_kick_thread(dlm, NULL);
652 wait_event(dlm->ast_wq,
653 dlm_lock_basts_flushed(dlm, lock));
654 }
655 (*unlockast)(data, lksb->status);
656 }
657
658 if (status == DLM_NORMAL) {
659 mlog(0, "kicking the thread\n");
660 dlm_kick_thread(dlm, res);
661 } else
662 dlm_error(status);
663
664 dlm_lockres_calc_usage(dlm, res);
665 dlm_lockres_put(res);
666 dlm_lock_put(lock);
667
668 mlog(0, "returning status=%d!\n", status);
669 return status;
670}
671EXPORT_SYMBOL_GPL(dlmunlock);
672
diff --git a/fs/ocfs2/dlm/dlmver.c b/fs/ocfs2/dlm/dlmver.c
new file mode 100644
index 000000000000..7ef2653f8f41
--- /dev/null
+++ b/fs/ocfs2/dlm/dlmver.c
@@ -0,0 +1,42 @@
1/* -*- mode: c; c-basic-offset: 8; -*-
2 * vim: noexpandtab sw=8 ts=8 sts=0:
3 *
4 * dlmver.c
5 *
6 * version string
7 *
8 * Copyright (C) 2002, 2005 Oracle. All rights reserved.
9 *
10 * This program is free software; you can redistribute it and/or
11 * modify it under the terms of the GNU General Public
12 * License as published by the Free Software Foundation; either
13 * version 2 of the License, or (at your option) any later version.
14 *
15 * This program is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
18 * General Public License for more details.
19 *
20 * You should have received a copy of the GNU General Public
21 * License along with this program; if not, write to the
22 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
23 * Boston, MA 02111-1307, USA.
24 */
25
26#include <linux/module.h>
27#include <linux/kernel.h>
28
29#include "dlmver.h"
30
31#define DLM_BUILD_VERSION "1.3.3"
32
33#define VERSION_STR "OCFS2 DLM " DLM_BUILD_VERSION
34
35void dlm_print_version(void)
36{
37 printk(KERN_INFO "%s\n", VERSION_STR);
38}
39
40MODULE_DESCRIPTION(VERSION_STR);
41
42MODULE_VERSION(DLM_BUILD_VERSION);
diff --git a/fs/ocfs2/dlm/dlmver.h b/fs/ocfs2/dlm/dlmver.h
new file mode 100644
index 000000000000..f674aee77a16
--- /dev/null
+++ b/fs/ocfs2/dlm/dlmver.h
@@ -0,0 +1,31 @@
1/* -*- mode: c; c-basic-offset: 8; -*-
2 * vim: noexpandtab sw=8 ts=8 sts=0:
3 *
4 * dlmver.h
5 *
6 * Function prototypes
7 *
8 * Copyright (C) 2005 Oracle. All rights reserved.
9 *
10 * This program is free software; you can redistribute it and/or
11 * modify it under the terms of the GNU General Public
12 * License as published by the Free Software Foundation; either
13 * version 2 of the License, or (at your option) any later version.
14 *
15 * This program is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
18 * General Public License for more details.
19 *
20 * You should have received a copy of the GNU General Public
21 * License along with this program; if not, write to the
22 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
23 * Boston, MA 02111-1307, USA.
24 */
25
#ifndef DLM_VER_H
#define DLM_VER_H

/* Log the DLM version banner; implemented in dlmver.c. */
void dlm_print_version(void);

#endif /* DLM_VER_H */
diff --git a/fs/ocfs2/dlm/userdlm.c b/fs/ocfs2/dlm/userdlm.c
new file mode 100644
index 000000000000..e1fdd288796e
--- /dev/null
+++ b/fs/ocfs2/dlm/userdlm.c
@@ -0,0 +1,658 @@
1/* -*- mode: c; c-basic-offset: 8; -*-
2 * vim: noexpandtab sw=8 ts=8 sts=0:
3 *
4 * userdlm.c
5 *
6 * Code which implements the kernel side of a minimal userspace
7 * interface to our DLM.
8 *
9 * Many of the functions here are pared down versions of dlmglue.c
10 * functions.
11 *
12 * Copyright (C) 2003, 2004 Oracle. All rights reserved.
13 *
14 * This program is free software; you can redistribute it and/or
15 * modify it under the terms of the GNU General Public
16 * License as published by the Free Software Foundation; either
17 * version 2 of the License, or (at your option) any later version.
18 *
19 * This program is distributed in the hope that it will be useful,
20 * but WITHOUT ANY WARRANTY; without even the implied warranty of
21 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
22 * General Public License for more details.
23 *
24 * You should have received a copy of the GNU General Public
25 * License along with this program; if not, write to the
26 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
27 * Boston, MA 02111-1307, USA.
28 */
29
30#include <asm/signal.h>
31
32#include <linux/module.h>
33#include <linux/fs.h>
34#include <linux/types.h>
35#include <linux/crc32.h>
36
37
38#include "cluster/nodemanager.h"
39#include "cluster/heartbeat.h"
40#include "cluster/tcp.h"
41
42#include "dlmapi.h"
43
44#include "userdlm.h"
45
46#define MLOG_MASK_PREFIX ML_DLMFS
47#include "cluster/masklog.h"
48
49static inline int user_check_wait_flag(struct user_lock_res *lockres,
50 int flag)
51{
52 int ret;
53
54 spin_lock(&lockres->l_lock);
55 ret = lockres->l_flags & flag;
56 spin_unlock(&lockres->l_lock);
57
58 return ret;
59}
60
61static inline void user_wait_on_busy_lock(struct user_lock_res *lockres)
62
63{
64 wait_event(lockres->l_event,
65 !user_check_wait_flag(lockres, USER_LOCK_BUSY));
66}
67
68static inline void user_wait_on_blocked_lock(struct user_lock_res *lockres)
69
70{
71 wait_event(lockres->l_event,
72 !user_check_wait_flag(lockres, USER_LOCK_BLOCKED));
73}
74
75/* I heart container_of... */
76static inline struct dlm_ctxt *
77dlm_ctxt_from_user_lockres(struct user_lock_res *lockres)
78{
79 struct dlmfs_inode_private *ip;
80
81 ip = container_of(lockres,
82 struct dlmfs_inode_private,
83 ip_lockres);
84 return ip->ip_dlm;
85}
86
87static struct inode *
88user_dlm_inode_from_user_lockres(struct user_lock_res *lockres)
89{
90 struct dlmfs_inode_private *ip;
91
92 ip = container_of(lockres,
93 struct dlmfs_inode_private,
94 ip_lockres);
95 return &ip->ip_vfs_inode;
96}
97
98static inline void user_recover_from_dlm_error(struct user_lock_res *lockres)
99{
100 spin_lock(&lockres->l_lock);
101 lockres->l_flags &= ~USER_LOCK_BUSY;
102 spin_unlock(&lockres->l_lock);
103}
104
/*
 * Log a dlm failure at ML_ERROR.
 *
 * _func:    name of the dlm call that failed (a C string)
 * _stat:    the dlm status it returned
 * _lockres: pointer to the user_lock_res involved
 *
 * Every argument is parenthesized in the expansion so that an arbitrary
 * expression (for example &ip->ip_lockres) can be passed safely.  Note
 * that _stat is evaluated twice.
 */
#define user_log_dlm_error(_func, _stat, _lockres) do {			\
	mlog(ML_ERROR, "Dlm error \"%s\" while calling %s on "		\
	     "resource %s: %s\n", dlm_errname((_stat)), (_func),	\
	     (_lockres)->l_name, dlm_errmsg((_stat)));			\
} while (0)
110
111/* WARNING: This function lives in a world where the only three lock
112 * levels are EX, PR, and NL. It *will* have to be adjusted when more
113 * lock types are added. */
114static inline int user_highest_compat_lock_level(int level)
115{
116 int new_level = LKM_EXMODE;
117
118 if (level == LKM_EXMODE)
119 new_level = LKM_NLMODE;
120 else if (level == LKM_PRMODE)
121 new_level = LKM_PRMODE;
122 return new_level;
123}
124
125static void user_ast(void *opaque)
126{
127 struct user_lock_res *lockres = opaque;
128 struct dlm_lockstatus *lksb;
129
130 mlog(0, "AST fired for lockres %s\n", lockres->l_name);
131
132 spin_lock(&lockres->l_lock);
133
134 lksb = &(lockres->l_lksb);
135 if (lksb->status != DLM_NORMAL) {
136 mlog(ML_ERROR, "lksb status value of %u on lockres %s\n",
137 lksb->status, lockres->l_name);
138 spin_unlock(&lockres->l_lock);
139 return;
140 }
141
142 /* we're downconverting. */
143 if (lockres->l_requested < lockres->l_level) {
144 if (lockres->l_requested <=
145 user_highest_compat_lock_level(lockres->l_blocking)) {
146 lockres->l_blocking = LKM_NLMODE;
147 lockres->l_flags &= ~USER_LOCK_BLOCKED;
148 }
149 }
150
151 lockres->l_level = lockres->l_requested;
152 lockres->l_requested = LKM_IVMODE;
153 lockres->l_flags |= USER_LOCK_ATTACHED;
154 lockres->l_flags &= ~USER_LOCK_BUSY;
155
156 spin_unlock(&lockres->l_lock);
157
158 wake_up(&lockres->l_event);
159}
160
161static inline void user_dlm_grab_inode_ref(struct user_lock_res *lockres)
162{
163 struct inode *inode;
164 inode = user_dlm_inode_from_user_lockres(lockres);
165 if (!igrab(inode))
166 BUG();
167}
168
169static void user_dlm_unblock_lock(void *opaque);
170
171static void __user_dlm_queue_lockres(struct user_lock_res *lockres)
172{
173 if (!(lockres->l_flags & USER_LOCK_QUEUED)) {
174 user_dlm_grab_inode_ref(lockres);
175
176 INIT_WORK(&lockres->l_work, user_dlm_unblock_lock,
177 lockres);
178
179 queue_work(user_dlm_worker, &lockres->l_work);
180 lockres->l_flags |= USER_LOCK_QUEUED;
181 }
182}
183
184static void __user_dlm_cond_queue_lockres(struct user_lock_res *lockres)
185{
186 int queue = 0;
187
188 if (!(lockres->l_flags & USER_LOCK_BLOCKED))
189 return;
190
191 switch (lockres->l_blocking) {
192 case LKM_EXMODE:
193 if (!lockres->l_ex_holders && !lockres->l_ro_holders)
194 queue = 1;
195 break;
196 case LKM_PRMODE:
197 if (!lockres->l_ex_holders)
198 queue = 1;
199 break;
200 default:
201 BUG();
202 }
203
204 if (queue)
205 __user_dlm_queue_lockres(lockres);
206}
207
208static void user_bast(void *opaque, int level)
209{
210 struct user_lock_res *lockres = opaque;
211
212 mlog(0, "Blocking AST fired for lockres %s. Blocking level %d\n",
213 lockres->l_name, level);
214
215 spin_lock(&lockres->l_lock);
216 lockres->l_flags |= USER_LOCK_BLOCKED;
217 if (level > lockres->l_blocking)
218 lockres->l_blocking = level;
219
220 __user_dlm_queue_lockres(lockres);
221 spin_unlock(&lockres->l_lock);
222
223 wake_up(&lockres->l_event);
224}
225
226static void user_unlock_ast(void *opaque, enum dlm_status status)
227{
228 struct user_lock_res *lockres = opaque;
229
230 mlog(0, "UNLOCK AST called on lock %s\n", lockres->l_name);
231
232 if (status != DLM_NORMAL)
233 mlog(ML_ERROR, "Dlm returns status %d\n", status);
234
235 spin_lock(&lockres->l_lock);
236 if (lockres->l_flags & USER_LOCK_IN_TEARDOWN)
237 lockres->l_level = LKM_IVMODE;
238 else {
239 lockres->l_requested = LKM_IVMODE; /* cancel an
240 * upconvert
241 * request. */
242 lockres->l_flags &= ~USER_LOCK_IN_CANCEL;
243 /* we want the unblock thread to look at it again
244 * now. */
245 __user_dlm_queue_lockres(lockres);
246 }
247
248 lockres->l_flags &= ~USER_LOCK_BUSY;
249 spin_unlock(&lockres->l_lock);
250
251 wake_up(&lockres->l_event);
252}
253
/* Release a reference on the inode that embeds this lockres. */
static inline void user_dlm_drop_inode_ref(struct user_lock_res *lockres)
{
	iput(user_dlm_inode_from_user_lockres(lockres));
}
260
261static void user_dlm_unblock_lock(void *opaque)
262{
263 int new_level, status;
264 struct user_lock_res *lockres = (struct user_lock_res *) opaque;
265 struct dlm_ctxt *dlm = dlm_ctxt_from_user_lockres(lockres);
266
267 mlog(0, "processing lockres %s\n", lockres->l_name);
268
269 spin_lock(&lockres->l_lock);
270
271 BUG_ON(!(lockres->l_flags & USER_LOCK_BLOCKED));
272 BUG_ON(!(lockres->l_flags & USER_LOCK_QUEUED));
273
274 /* notice that we don't clear USER_LOCK_BLOCKED here. That's
275 * for user_ast to do. */
276 lockres->l_flags &= ~USER_LOCK_QUEUED;
277
278 if (lockres->l_flags & USER_LOCK_IN_TEARDOWN) {
279 mlog(0, "lock is in teardown so we do nothing\n");
280 spin_unlock(&lockres->l_lock);
281 goto drop_ref;
282 }
283
284 if (lockres->l_flags & USER_LOCK_BUSY) {
285 mlog(0, "BUSY flag detected...\n");
286 if (lockres->l_flags & USER_LOCK_IN_CANCEL) {
287 spin_unlock(&lockres->l_lock);
288 goto drop_ref;
289 }
290
291 lockres->l_flags |= USER_LOCK_IN_CANCEL;
292 spin_unlock(&lockres->l_lock);
293
294 status = dlmunlock(dlm,
295 &lockres->l_lksb,
296 LKM_CANCEL,
297 user_unlock_ast,
298 lockres);
299 if (status == DLM_CANCELGRANT) {
300 /* If we got this, then the ast was fired
301 * before we could cancel. We cleanup our
302 * state, and restart the function. */
303 spin_lock(&lockres->l_lock);
304 lockres->l_flags &= ~USER_LOCK_IN_CANCEL;
305 spin_unlock(&lockres->l_lock);
306 } else if (status != DLM_NORMAL)
307 user_log_dlm_error("dlmunlock", status, lockres);
308 goto drop_ref;
309 }
310
311 /* If there are still incompat holders, we can exit safely
312 * without worrying about re-queueing this lock as that will
313 * happen on the last call to user_cluster_unlock. */
314 if ((lockres->l_blocking == LKM_EXMODE)
315 && (lockres->l_ex_holders || lockres->l_ro_holders)) {
316 spin_unlock(&lockres->l_lock);
317 mlog(0, "can't downconvert for ex: ro = %u, ex = %u\n",
318 lockres->l_ro_holders, lockres->l_ex_holders);
319 goto drop_ref;
320 }
321
322 if ((lockres->l_blocking == LKM_PRMODE)
323 && lockres->l_ex_holders) {
324 spin_unlock(&lockres->l_lock);
325 mlog(0, "can't downconvert for pr: ex = %u\n",
326 lockres->l_ex_holders);
327 goto drop_ref;
328 }
329
330 /* yay, we can downconvert now. */
331 new_level = user_highest_compat_lock_level(lockres->l_blocking);
332 lockres->l_requested = new_level;
333 lockres->l_flags |= USER_LOCK_BUSY;
334 mlog(0, "Downconvert lock from %d to %d\n",
335 lockres->l_level, new_level);
336 spin_unlock(&lockres->l_lock);
337
338 /* need lock downconvert request now... */
339 status = dlmlock(dlm,
340 new_level,
341 &lockres->l_lksb,
342 LKM_CONVERT|LKM_VALBLK,
343 lockres->l_name,
344 user_ast,
345 lockres,
346 user_bast);
347 if (status != DLM_NORMAL) {
348 user_log_dlm_error("dlmlock", status, lockres);
349 user_recover_from_dlm_error(lockres);
350 }
351
352drop_ref:
353 user_dlm_drop_inode_ref(lockres);
354}
355
356static inline void user_dlm_inc_holders(struct user_lock_res *lockres,
357 int level)
358{
359 switch(level) {
360 case LKM_EXMODE:
361 lockres->l_ex_holders++;
362 break;
363 case LKM_PRMODE:
364 lockres->l_ro_holders++;
365 break;
366 default:
367 BUG();
368 }
369}
370
371/* predict what lock level we'll be dropping down to on behalf
372 * of another node, and return true if the currently wanted
373 * level will be compatible with it. */
374static inline int
375user_may_continue_on_blocked_lock(struct user_lock_res *lockres,
376 int wanted)
377{
378 BUG_ON(!(lockres->l_flags & USER_LOCK_BLOCKED));
379
380 return wanted <= user_highest_compat_lock_level(lockres->l_blocking);
381}
382
383int user_dlm_cluster_lock(struct user_lock_res *lockres,
384 int level,
385 int lkm_flags)
386{
387 int status, local_flags;
388 struct dlm_ctxt *dlm = dlm_ctxt_from_user_lockres(lockres);
389
390 if (level != LKM_EXMODE &&
391 level != LKM_PRMODE) {
392 mlog(ML_ERROR, "lockres %s: invalid request!\n",
393 lockres->l_name);
394 status = -EINVAL;
395 goto bail;
396 }
397
398 mlog(0, "lockres %s: asking for %s lock, passed flags = 0x%x\n",
399 lockres->l_name,
400 (level == LKM_EXMODE) ? "LKM_EXMODE" : "LKM_PRMODE",
401 lkm_flags);
402
403again:
404 if (signal_pending(current)) {
405 status = -ERESTARTSYS;
406 goto bail;
407 }
408
409 spin_lock(&lockres->l_lock);
410
411 /* We only compare against the currently granted level
412 * here. If the lock is blocked waiting on a downconvert,
413 * we'll get caught below. */
414 if ((lockres->l_flags & USER_LOCK_BUSY) &&
415 (level > lockres->l_level)) {
416 /* is someone sitting in dlm_lock? If so, wait on
417 * them. */
418 spin_unlock(&lockres->l_lock);
419
420 user_wait_on_busy_lock(lockres);
421 goto again;
422 }
423
424 if ((lockres->l_flags & USER_LOCK_BLOCKED) &&
425 (!user_may_continue_on_blocked_lock(lockres, level))) {
426 /* is the lock is currently blocked on behalf of
427 * another node */
428 spin_unlock(&lockres->l_lock);
429
430 user_wait_on_blocked_lock(lockres);
431 goto again;
432 }
433
434 if (level > lockres->l_level) {
435 local_flags = lkm_flags | LKM_VALBLK;
436 if (lockres->l_level != LKM_IVMODE)
437 local_flags |= LKM_CONVERT;
438
439 lockres->l_requested = level;
440 lockres->l_flags |= USER_LOCK_BUSY;
441 spin_unlock(&lockres->l_lock);
442
443 BUG_ON(level == LKM_IVMODE);
444 BUG_ON(level == LKM_NLMODE);
445
446 mlog(0, "lock %s, get lock from %d to level = %d\n",
447 lockres->l_name, lockres->l_level, level);
448
449 /* call dlm_lock to upgrade lock now */
450 status = dlmlock(dlm,
451 level,
452 &lockres->l_lksb,
453 local_flags,
454 lockres->l_name,
455 user_ast,
456 lockres,
457 user_bast);
458 if (status != DLM_NORMAL) {
459 if ((lkm_flags & LKM_NOQUEUE) &&
460 (status == DLM_NOTQUEUED))
461 status = -EAGAIN;
462 else {
463 user_log_dlm_error("dlmlock", status, lockres);
464 status = -EINVAL;
465 }
466 user_recover_from_dlm_error(lockres);
467 goto bail;
468 }
469
470 mlog(0, "lock %s, successfull return from dlmlock\n",
471 lockres->l_name);
472
473 user_wait_on_busy_lock(lockres);
474 goto again;
475 }
476
477 user_dlm_inc_holders(lockres, level);
478 spin_unlock(&lockres->l_lock);
479
480 mlog(0, "lockres %s: Got %s lock!\n", lockres->l_name,
481 (level == LKM_EXMODE) ? "LKM_EXMODE" : "LKM_PRMODE");
482
483 status = 0;
484bail:
485 return status;
486}
487
488static inline void user_dlm_dec_holders(struct user_lock_res *lockres,
489 int level)
490{
491 switch(level) {
492 case LKM_EXMODE:
493 BUG_ON(!lockres->l_ex_holders);
494 lockres->l_ex_holders--;
495 break;
496 case LKM_PRMODE:
497 BUG_ON(!lockres->l_ro_holders);
498 lockres->l_ro_holders--;
499 break;
500 default:
501 BUG();
502 }
503}
504
505void user_dlm_cluster_unlock(struct user_lock_res *lockres,
506 int level)
507{
508 if (level != LKM_EXMODE &&
509 level != LKM_PRMODE) {
510 mlog(ML_ERROR, "lockres %s: invalid request!\n", lockres->l_name);
511 return;
512 }
513
514 mlog(0, "lockres %s: dropping %s lock\n", lockres->l_name,
515 (level == LKM_EXMODE) ? "LKM_EXMODE" : "LKM_PRMODE");
516
517 spin_lock(&lockres->l_lock);
518 user_dlm_dec_holders(lockres, level);
519 __user_dlm_cond_queue_lockres(lockres);
520 spin_unlock(&lockres->l_lock);
521}
522
523void user_dlm_write_lvb(struct inode *inode,
524 const char *val,
525 unsigned int len)
526{
527 struct user_lock_res *lockres = &DLMFS_I(inode)->ip_lockres;
528 char *lvb = lockres->l_lksb.lvb;
529
530 BUG_ON(len > DLM_LVB_LEN);
531
532 spin_lock(&lockres->l_lock);
533
534 BUG_ON(lockres->l_level < LKM_EXMODE);
535 memcpy(lvb, val, len);
536
537 spin_unlock(&lockres->l_lock);
538}
539
540void user_dlm_read_lvb(struct inode *inode,
541 char *val,
542 unsigned int len)
543{
544 struct user_lock_res *lockres = &DLMFS_I(inode)->ip_lockres;
545 char *lvb = lockres->l_lksb.lvb;
546
547 BUG_ON(len > DLM_LVB_LEN);
548
549 spin_lock(&lockres->l_lock);
550
551 BUG_ON(lockres->l_level < LKM_PRMODE);
552 memcpy(val, lvb, len);
553
554 spin_unlock(&lockres->l_lock);
555}
556
557void user_dlm_lock_res_init(struct user_lock_res *lockres,
558 struct dentry *dentry)
559{
560 memset(lockres, 0, sizeof(*lockres));
561
562 spin_lock_init(&lockres->l_lock);
563 init_waitqueue_head(&lockres->l_event);
564 lockres->l_level = LKM_IVMODE;
565 lockres->l_requested = LKM_IVMODE;
566 lockres->l_blocking = LKM_IVMODE;
567
568 /* should have been checked before getting here. */
569 BUG_ON(dentry->d_name.len >= USER_DLM_LOCK_ID_MAX_LEN);
570
571 memcpy(lockres->l_name,
572 dentry->d_name.name,
573 dentry->d_name.len);
574}
575
576int user_dlm_destroy_lock(struct user_lock_res *lockres)
577{
578 int status = -EBUSY;
579 struct dlm_ctxt *dlm = dlm_ctxt_from_user_lockres(lockres);
580
581 mlog(0, "asked to destroy %s\n", lockres->l_name);
582
583 spin_lock(&lockres->l_lock);
584 while (lockres->l_flags & USER_LOCK_BUSY) {
585 spin_unlock(&lockres->l_lock);
586
587 mlog(0, "lock %s is busy\n", lockres->l_name);
588
589 user_wait_on_busy_lock(lockres);
590
591 spin_lock(&lockres->l_lock);
592 }
593
594 if (lockres->l_ro_holders || lockres->l_ex_holders) {
595 spin_unlock(&lockres->l_lock);
596 mlog(0, "lock %s has holders\n", lockres->l_name);
597 goto bail;
598 }
599
600 status = 0;
601 if (!(lockres->l_flags & USER_LOCK_ATTACHED)) {
602 spin_unlock(&lockres->l_lock);
603 mlog(0, "lock %s is not attached\n", lockres->l_name);
604 goto bail;
605 }
606
607 lockres->l_flags &= ~USER_LOCK_ATTACHED;
608 lockres->l_flags |= USER_LOCK_BUSY;
609 lockres->l_flags |= USER_LOCK_IN_TEARDOWN;
610 spin_unlock(&lockres->l_lock);
611
612 mlog(0, "unlocking lockres %s\n", lockres->l_name);
613 status = dlmunlock(dlm,
614 &lockres->l_lksb,
615 LKM_VALBLK,
616 user_unlock_ast,
617 lockres);
618 if (status != DLM_NORMAL) {
619 user_log_dlm_error("dlmunlock", status, lockres);
620 status = -EINVAL;
621 goto bail;
622 }
623
624 user_wait_on_busy_lock(lockres);
625
626 status = 0;
627bail:
628 return status;
629}
630
631struct dlm_ctxt *user_dlm_register_context(struct qstr *name)
632{
633 struct dlm_ctxt *dlm;
634 u32 dlm_key;
635 char *domain;
636
637 domain = kmalloc(name->len + 1, GFP_KERNEL);
638 if (!domain) {
639 mlog_errno(-ENOMEM);
640 return ERR_PTR(-ENOMEM);
641 }
642
643 dlm_key = crc32_le(0, name->name, name->len);
644
645 snprintf(domain, name->len + 1, "%.*s", name->len, name->name);
646
647 dlm = dlm_register_domain(domain, dlm_key);
648 if (IS_ERR(dlm))
649 mlog_errno(PTR_ERR(dlm));
650
651 kfree(domain);
652 return dlm;
653}
654
/* Counterpart of user_dlm_register_context(): unregister the dlm domain. */
void user_dlm_unregister_context(struct dlm_ctxt *dlm)
{
	dlm_unregister_domain(dlm);
}
diff --git a/fs/ocfs2/dlm/userdlm.h b/fs/ocfs2/dlm/userdlm.h
new file mode 100644
index 000000000000..04178bc40b76
--- /dev/null
+++ b/fs/ocfs2/dlm/userdlm.h
@@ -0,0 +1,111 @@
1/* -*- mode: c; c-basic-offset: 8; -*-
2 * vim: noexpandtab sw=8 ts=8 sts=0:
3 *
4 * userdlm.h
5 *
6 * Userspace dlm defines
7 *
8 * Copyright (C) 2002, 2004 Oracle. All rights reserved.
9 *
10 * This program is free software; you can redistribute it and/or
11 * modify it under the terms of the GNU General Public
12 * License as published by the Free Software Foundation; either
13 * version 2 of the License, or (at your option) any later version.
14 *
15 * This program is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
18 * General Public License for more details.
19 *
20 * You should have received a copy of the GNU General Public
21 * License along with this program; if not, write to the
22 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
23 * Boston, MA 02111-1307, USA.
24 */
25
26
27#ifndef USERDLM_H
28#define USERDLM_H
29
30#include <linux/module.h>
31#include <linux/fs.h>
32#include <linux/types.h>
33#include <linux/workqueue.h>
34
/* Bits stored in user_lock_res->l_flags. */

/* have we initialized the lvb */
#define USER_LOCK_ATTACHED      (0x00000001)
/* we are currently in dlm_lock */
#define USER_LOCK_BUSY          (0x00000002)
/* blocked waiting to downconvert */
#define USER_LOCK_BLOCKED       (0x00000004)
/* we're currently destroying this lock. */
#define USER_LOCK_IN_TEARDOWN   (0x00000008)
/* lock is on the workqueue */
#define USER_LOCK_QUEUED        (0x00000010)
/* a cancel of the in-flight request has been issued */
#define USER_LOCK_IN_CANCEL     (0x00000020)
48
49struct user_lock_res {
50 spinlock_t l_lock;
51
52 int l_flags;
53
54#define USER_DLM_LOCK_ID_MAX_LEN 32
55 char l_name[USER_DLM_LOCK_ID_MAX_LEN];
56 int l_level;
57 unsigned int l_ro_holders;
58 unsigned int l_ex_holders;
59 struct dlm_lockstatus l_lksb;
60
61 int l_requested;
62 int l_blocking;
63
64 wait_queue_head_t l_event;
65
66 struct work_struct l_work;
67};
68
69extern struct workqueue_struct *user_dlm_worker;
70
71void user_dlm_lock_res_init(struct user_lock_res *lockres,
72 struct dentry *dentry);
73int user_dlm_destroy_lock(struct user_lock_res *lockres);
74int user_dlm_cluster_lock(struct user_lock_res *lockres,
75 int level,
76 int lkm_flags);
77void user_dlm_cluster_unlock(struct user_lock_res *lockres,
78 int level);
79void user_dlm_write_lvb(struct inode *inode,
80 const char *val,
81 unsigned int len);
82void user_dlm_read_lvb(struct inode *inode,
83 char *val,
84 unsigned int len);
85struct dlm_ctxt *user_dlm_register_context(struct qstr *name);
86void user_dlm_unregister_context(struct dlm_ctxt *dlm);
87
88struct dlmfs_inode_private {
89 struct dlm_ctxt *ip_dlm;
90
91 struct user_lock_res ip_lockres; /* unused for directories. */
92 struct inode *ip_parent;
93
94 struct inode ip_vfs_inode;
95};
96
97static inline struct dlmfs_inode_private *
98DLMFS_I(struct inode *inode)
99{
100 return container_of(inode,
101 struct dlmfs_inode_private,
102 ip_vfs_inode);
103}
104
/*
 * dlmfs per-open-file data.
 * NOTE(review): fp_lock_level presumably records the lock level taken
 * for this open file -- confirm against the users in dlmfs.c.
 */
struct dlmfs_filp_private {
	int                  fp_lock_level;
};
108
109#define DLMFS_MAGIC 0x76a9f425
110
111#endif /* USERDLM_H */
diff --git a/fs/ocfs2/dlmglue.c b/fs/ocfs2/dlmglue.c
new file mode 100644
index 000000000000..e971ec2f8407
--- /dev/null
+++ b/fs/ocfs2/dlmglue.c
@@ -0,0 +1,2904 @@
1/* -*- mode: c; c-basic-offset: 8; -*-
2 * vim: noexpandtab sw=8 ts=8 sts=0:
3 *
4 * dlmglue.c
5 *
6 * Code which implements an OCFS2 specific interface to our DLM.
7 *
8 * Copyright (C) 2003, 2004 Oracle. All rights reserved.
9 *
10 * This program is free software; you can redistribute it and/or
11 * modify it under the terms of the GNU General Public
12 * License as published by the Free Software Foundation; either
13 * version 2 of the License, or (at your option) any later version.
14 *
15 * This program is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
18 * General Public License for more details.
19 *
20 * You should have received a copy of the GNU General Public
21 * License along with this program; if not, write to the
22 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
23 * Boston, MA 021110-1307, USA.
24 */
25
26#include <linux/types.h>
27#include <linux/slab.h>
28#include <linux/highmem.h>
29#include <linux/mm.h>
30#include <linux/smp_lock.h>
31#include <linux/crc32.h>
32#include <linux/kthread.h>
33#include <linux/pagemap.h>
34#include <linux/debugfs.h>
35#include <linux/seq_file.h>
36
37#include <cluster/heartbeat.h>
38#include <cluster/nodemanager.h>
39#include <cluster/tcp.h>
40
41#include <dlm/dlmapi.h>
42
43#define MLOG_MASK_PREFIX ML_DLM_GLUE
44#include <cluster/masklog.h>
45
46#include "ocfs2.h"
47
48#include "alloc.h"
49#include "dlmglue.h"
50#include "extent_map.h"
51#include "heartbeat.h"
52#include "inode.h"
53#include "journal.h"
54#include "slot_map.h"
55#include "super.h"
56#include "uptodate.h"
57#include "vote.h"
58
59#include "buffer_head_io.h"
60
/*
 * A waiter for a particular state of a lockres' l_flags.  While queued on
 * the lockres' l_mask_waiters list it is completed by lockres_set_flags()
 * once (l_flags & mw_mask) == mw_goal.
 */
struct ocfs2_mask_waiter {
	struct list_head	mw_item;	/* link in l_mask_waiters */
	int			mw_status;	/* set to 0 on completion */
	struct completion	mw_complete;
	unsigned long		mw_mask;	/* flag bits being watched */
	unsigned long		mw_goal;	/* wanted value of those bits */
};
68
/*
 * Forward declarations of the AST/BAST callbacks and unblock handlers
 * that the ocfs2_lock_res_ops tables below refer to.
 */
static void ocfs2_inode_ast_func(void *opaque);
static void ocfs2_inode_bast_func(void *opaque,
				  int level);
static void ocfs2_super_ast_func(void *opaque);
static void ocfs2_super_bast_func(void *opaque,
				  int level);
static void ocfs2_rename_ast_func(void *opaque);
static void ocfs2_rename_bast_func(void *opaque,
				   int level);

/*  so far, all locks have gotten along with the same unlock ast */
static void ocfs2_unlock_ast_func(void *opaque,
				  enum dlm_status status);
static int ocfs2_do_unblock_meta(struct inode *inode,
				 int *requeue);
static int ocfs2_unblock_meta(struct ocfs2_lock_res *lockres,
			      int *requeue);
static int ocfs2_unblock_data(struct ocfs2_lock_res *lockres,
			      int *requeue);
static int ocfs2_unblock_inode_lock(struct ocfs2_lock_res *lockres,
				    int *requeue);
static int ocfs2_unblock_osb_lock(struct ocfs2_lock_res *lockres,
				  int *requeue);
/* Signature of the per-type worker handed to ocfs2_generic_unblock_lock(). */
typedef void (ocfs2_convert_worker_t)(struct ocfs2_lock_res *, int);
static int ocfs2_generic_unblock_lock(struct ocfs2_super *osb,
				      struct ocfs2_lock_res *lockres,
				      int *requeue,
				      ocfs2_convert_worker_t *worker);
97
/*
 * Per-lock-type callback table, stored in a lockres' l_ops.
 *
 * ast and bast are passed to dlmlock() together with the lockres as the
 * opaque argument.
 * NOTE(review): unlock_ast and unblock are presumably used on the unlock
 * and downconvert paths respectively -- their callers are further down
 * the file.
 */
struct ocfs2_lock_res_ops {
	void (*ast)(void *);
	void (*bast)(void *, int);
	void (*unlock_ast)(void *, enum dlm_status);
	int  (*unblock)(struct ocfs2_lock_res *, int *);
};
104
/*
 * Callback tables, one per lock type.  The three inode lock types share
 * their ast/bast handlers and differ only in the unblock handler; the
 * super and rename locks share ocfs2_unblock_osb_lock.  Every type uses
 * the same unlock ast.
 */

/* Inode read/write lock. */
static struct ocfs2_lock_res_ops ocfs2_inode_rw_lops = {
	.ast		= ocfs2_inode_ast_func,
	.bast		= ocfs2_inode_bast_func,
	.unlock_ast	= ocfs2_unlock_ast_func,
	.unblock	= ocfs2_unblock_inode_lock,
};

/* Inode metadata lock. */
static struct ocfs2_lock_res_ops ocfs2_inode_meta_lops = {
	.ast		= ocfs2_inode_ast_func,
	.bast		= ocfs2_inode_bast_func,
	.unlock_ast	= ocfs2_unlock_ast_func,
	.unblock	= ocfs2_unblock_meta,
};

static void ocfs2_data_convert_worker(struct ocfs2_lock_res *lockres,
				      int blocking);

/* Inode data lock. */
static struct ocfs2_lock_res_ops ocfs2_inode_data_lops = {
	.ast		= ocfs2_inode_ast_func,
	.bast		= ocfs2_inode_bast_func,
	.unlock_ast	= ocfs2_unlock_ast_func,
	.unblock	= ocfs2_unblock_data,
};

/* Superblock lock. */
static struct ocfs2_lock_res_ops ocfs2_super_lops = {
	.ast		= ocfs2_super_ast_func,
	.bast		= ocfs2_super_bast_func,
	.unlock_ast	= ocfs2_unlock_ast_func,
	.unblock	= ocfs2_unblock_osb_lock,
};

/* Rename lock. */
static struct ocfs2_lock_res_ops ocfs2_rename_lops = {
	.ast		= ocfs2_rename_ast_func,
	.bast		= ocfs2_rename_bast_func,
	.unlock_ast	= ocfs2_unlock_ast_func,
	.unblock	= ocfs2_unblock_osb_lock,
};
142
143static inline int ocfs2_is_inode_lock(struct ocfs2_lock_res *lockres)
144{
145 return lockres->l_type == OCFS2_LOCK_TYPE_META ||
146 lockres->l_type == OCFS2_LOCK_TYPE_DATA ||
147 lockres->l_type == OCFS2_LOCK_TYPE_RW;
148}
149
150static inline int ocfs2_is_super_lock(struct ocfs2_lock_res *lockres)
151{
152 return lockres->l_type == OCFS2_LOCK_TYPE_SUPER;
153}
154
155static inline int ocfs2_is_rename_lock(struct ocfs2_lock_res *lockres)
156{
157 return lockres->l_type == OCFS2_LOCK_TYPE_RENAME;
158}
159
160static inline struct ocfs2_super *ocfs2_lock_res_super(struct ocfs2_lock_res *lockres)
161{
162 BUG_ON(!ocfs2_is_super_lock(lockres)
163 && !ocfs2_is_rename_lock(lockres));
164
165 return (struct ocfs2_super *) lockres->l_priv;
166}
167
168static inline struct inode *ocfs2_lock_res_inode(struct ocfs2_lock_res *lockres)
169{
170 BUG_ON(!ocfs2_is_inode_lock(lockres));
171
172 return (struct inode *) lockres->l_priv;
173}
174
/* Forward declarations for helpers defined later in this file. */
static int ocfs2_lock_create(struct ocfs2_super *osb,
			     struct ocfs2_lock_res *lockres,
			     int level,
			     int dlm_flags);
static inline int ocfs2_may_continue_on_blocked_lock(struct ocfs2_lock_res *lockres,
						     int wanted);
static void ocfs2_cluster_unlock(struct ocfs2_super *osb,
				 struct ocfs2_lock_res *lockres,
				 int level);
static inline void ocfs2_generic_handle_downconvert_action(struct ocfs2_lock_res *lockres);
static inline void ocfs2_generic_handle_convert_action(struct ocfs2_lock_res *lockres);
static inline void ocfs2_generic_handle_attach_action(struct ocfs2_lock_res *lockres);
static int ocfs2_generic_handle_bast(struct ocfs2_lock_res *lockres, int level);
static void ocfs2_schedule_blocked_lock(struct ocfs2_super *osb,
					struct ocfs2_lock_res *lockres);
static inline void ocfs2_recover_from_dlm_error(struct ocfs2_lock_res *lockres,
						int convert);
/*
 * Log a failed dlm call at ML_ERROR.
 *   _func    - name of the dlm function that failed (a C string)
 *   _stat    - the enum dlm_status it returned; expanded twice, so pass
 *              an expression without side effects
 *   _lockres - pointer to the lock resource involved
 * The arguments are parenthesized so that any pointer expression can be
 * passed as _lockres.
 */
#define ocfs2_log_dlm_error(_func, _stat, _lockres) do {	\
	mlog(ML_ERROR, "Dlm error \"%s\" while calling %s on "	\
		"resource %s: %s\n", dlm_errname(_stat), (_func),	\
		(_lockres)->l_name, dlm_errmsg(_stat));		\
} while (0)
static void ocfs2_vote_on_unlock(struct ocfs2_super *osb,
				 struct ocfs2_lock_res *lockres);
static int ocfs2_meta_lock_update(struct inode *inode,
				  struct buffer_head **bh);
static void ocfs2_drop_osb_locks(struct ocfs2_super *osb);
static inline int ocfs2_highest_compat_lock_level(int level);
static inline int ocfs2_can_downconvert_meta_lock(struct inode *inode,
						  struct ocfs2_lock_res *lockres,
						  int new_level);
206
/* Human readable names for each lock type, indexed by enum ocfs2_lock_type. */
static char *ocfs2_lock_type_strings[] = {
	[OCFS2_LOCK_TYPE_META] = "Meta",
	[OCFS2_LOCK_TYPE_DATA] = "Data",
	[OCFS2_LOCK_TYPE_SUPER] = "Super",
	[OCFS2_LOCK_TYPE_RENAME] = "Rename",
	/* Need to differentiate from [R]ename.. serializing writes is the
	 * important job it does, anyway. */
	[OCFS2_LOCK_TYPE_RW] = "Write/Read",
};
216
217static char *ocfs2_lock_type_string(enum ocfs2_lock_type type)
218{
219 mlog_bug_on_msg(type >= OCFS2_NUM_LOCK_TYPES, "%d\n", type);
220 return ocfs2_lock_type_strings[type];
221}
222
223static void ocfs2_build_lock_name(enum ocfs2_lock_type type,
224 u64 blkno,
225 u32 generation,
226 char *name)
227{
228 int len;
229
230 mlog_entry_void();
231
232 BUG_ON(type >= OCFS2_NUM_LOCK_TYPES);
233
234 len = snprintf(name, OCFS2_LOCK_ID_MAX_LEN, "%c%s%016"MLFx64"%08x",
235 ocfs2_lock_type_char(type), OCFS2_LOCK_ID_PAD, blkno,
236 generation);
237
238 BUG_ON(len != (OCFS2_LOCK_ID_MAX_LEN - 1));
239
240 mlog(0, "built lock resource with name: %s\n", name);
241
242 mlog_exit_void();
243}
244
/*
 * Guards additions to and removals from the lockres debug tracking lists.
 * Declared with DEFINE_SPINLOCK rather than the deprecated
 * SPIN_LOCK_UNLOCKED static initializer.
 */
static DEFINE_SPINLOCK(ocfs2_dlm_tracking_lock);
246
247static void ocfs2_add_lockres_tracking(struct ocfs2_lock_res *res,
248 struct ocfs2_dlm_debug *dlm_debug)
249{
250 mlog(0, "Add tracking for lockres %s\n", res->l_name);
251
252 spin_lock(&ocfs2_dlm_tracking_lock);
253 list_add(&res->l_debug_list, &dlm_debug->d_lockres_tracking);
254 spin_unlock(&ocfs2_dlm_tracking_lock);
255}
256
257static void ocfs2_remove_lockres_tracking(struct ocfs2_lock_res *res)
258{
259 spin_lock(&ocfs2_dlm_tracking_lock);
260 if (!list_empty(&res->l_debug_list))
261 list_del_init(&res->l_debug_list);
262 spin_unlock(&ocfs2_dlm_tracking_lock);
263}
264
265static void ocfs2_lock_res_init_common(struct ocfs2_super *osb,
266 struct ocfs2_lock_res *res,
267 enum ocfs2_lock_type type,
268 u64 blkno,
269 u32 generation,
270 struct ocfs2_lock_res_ops *ops,
271 void *priv)
272{
273 ocfs2_build_lock_name(type, blkno, generation, res->l_name);
274
275 res->l_type = type;
276 res->l_ops = ops;
277 res->l_priv = priv;
278
279 res->l_level = LKM_IVMODE;
280 res->l_requested = LKM_IVMODE;
281 res->l_blocking = LKM_IVMODE;
282 res->l_action = OCFS2_AST_INVALID;
283 res->l_unlock_action = OCFS2_UNLOCK_INVALID;
284
285 res->l_flags = OCFS2_LOCK_INITIALIZED;
286
287 ocfs2_add_lockres_tracking(res, osb->osb_dlm_debug);
288}
289
290void ocfs2_lock_res_init_once(struct ocfs2_lock_res *res)
291{
292 /* This also clears out the lock status block */
293 memset(res, 0, sizeof(struct ocfs2_lock_res));
294 spin_lock_init(&res->l_lock);
295 init_waitqueue_head(&res->l_event);
296 INIT_LIST_HEAD(&res->l_blocked_list);
297 INIT_LIST_HEAD(&res->l_mask_waiters);
298}
299
300void ocfs2_inode_lock_res_init(struct ocfs2_lock_res *res,
301 enum ocfs2_lock_type type,
302 struct inode *inode)
303{
304 struct ocfs2_lock_res_ops *ops;
305
306 switch(type) {
307 case OCFS2_LOCK_TYPE_RW:
308 ops = &ocfs2_inode_rw_lops;
309 break;
310 case OCFS2_LOCK_TYPE_META:
311 ops = &ocfs2_inode_meta_lops;
312 break;
313 case OCFS2_LOCK_TYPE_DATA:
314 ops = &ocfs2_inode_data_lops;
315 break;
316 default:
317 mlog_bug_on_msg(1, "type: %d\n", type);
318 ops = NULL; /* thanks, gcc */
319 break;
320 };
321
322 ocfs2_lock_res_init_common(OCFS2_SB(inode->i_sb), res, type,
323 OCFS2_I(inode)->ip_blkno,
324 inode->i_generation, ops, inode);
325}
326
327static void ocfs2_super_lock_res_init(struct ocfs2_lock_res *res,
328 struct ocfs2_super *osb)
329{
330 /* Superblock lockres doesn't come from a slab so we call init
331 * once on it manually. */
332 ocfs2_lock_res_init_once(res);
333 ocfs2_lock_res_init_common(osb, res, OCFS2_LOCK_TYPE_SUPER,
334 OCFS2_SUPER_BLOCK_BLKNO, 0,
335 &ocfs2_super_lops, osb);
336}
337
338static void ocfs2_rename_lock_res_init(struct ocfs2_lock_res *res,
339 struct ocfs2_super *osb)
340{
341 /* Rename lockres doesn't come from a slab so we call init
342 * once on it manually. */
343 ocfs2_lock_res_init_once(res);
344 ocfs2_lock_res_init_common(osb, res, OCFS2_LOCK_TYPE_RENAME, 0, 0,
345 &ocfs2_rename_lops, osb);
346}
347
348void ocfs2_lock_res_free(struct ocfs2_lock_res *res)
349{
350 mlog_entry_void();
351
352 if (!(res->l_flags & OCFS2_LOCK_INITIALIZED))
353 return;
354
355 ocfs2_remove_lockres_tracking(res);
356
357 mlog_bug_on_msg(!list_empty(&res->l_blocked_list),
358 "Lockres %s is on the blocked list\n",
359 res->l_name);
360 mlog_bug_on_msg(!list_empty(&res->l_mask_waiters),
361 "Lockres %s has mask waiters pending\n",
362 res->l_name);
363 mlog_bug_on_msg(spin_is_locked(&res->l_lock),
364 "Lockres %s is locked\n",
365 res->l_name);
366 mlog_bug_on_msg(res->l_ro_holders,
367 "Lockres %s has %u ro holders\n",
368 res->l_name, res->l_ro_holders);
369 mlog_bug_on_msg(res->l_ex_holders,
370 "Lockres %s has %u ex holders\n",
371 res->l_name, res->l_ex_holders);
372
373 /* Need to clear out the lock status block for the dlm */
374 memset(&res->l_lksb, 0, sizeof(res->l_lksb));
375
376 res->l_flags = 0UL;
377 mlog_exit_void();
378}
379
380static inline void ocfs2_inc_holders(struct ocfs2_lock_res *lockres,
381 int level)
382{
383 mlog_entry_void();
384
385 BUG_ON(!lockres);
386
387 switch(level) {
388 case LKM_EXMODE:
389 lockres->l_ex_holders++;
390 break;
391 case LKM_PRMODE:
392 lockres->l_ro_holders++;
393 break;
394 default:
395 BUG();
396 }
397
398 mlog_exit_void();
399}
400
401static inline void ocfs2_dec_holders(struct ocfs2_lock_res *lockres,
402 int level)
403{
404 mlog_entry_void();
405
406 BUG_ON(!lockres);
407
408 switch(level) {
409 case LKM_EXMODE:
410 BUG_ON(!lockres->l_ex_holders);
411 lockres->l_ex_holders--;
412 break;
413 case LKM_PRMODE:
414 BUG_ON(!lockres->l_ro_holders);
415 lockres->l_ro_holders--;
416 break;
417 default:
418 BUG();
419 }
420 mlog_exit_void();
421}
422
423/* WARNING: This function lives in a world where the only three lock
424 * levels are EX, PR, and NL. It *will* have to be adjusted when more
425 * lock types are added. */
426static inline int ocfs2_highest_compat_lock_level(int level)
427{
428 int new_level = LKM_EXMODE;
429
430 if (level == LKM_EXMODE)
431 new_level = LKM_NLMODE;
432 else if (level == LKM_PRMODE)
433 new_level = LKM_PRMODE;
434 return new_level;
435}
436
437static void lockres_set_flags(struct ocfs2_lock_res *lockres,
438 unsigned long newflags)
439{
440 struct list_head *pos, *tmp;
441 struct ocfs2_mask_waiter *mw;
442
443 assert_spin_locked(&lockres->l_lock);
444
445 lockres->l_flags = newflags;
446
447 list_for_each_safe(pos, tmp, &lockres->l_mask_waiters) {
448 mw = list_entry(pos, struct ocfs2_mask_waiter, mw_item);
449 if ((lockres->l_flags & mw->mw_mask) != mw->mw_goal)
450 continue;
451
452 list_del_init(&mw->mw_item);
453 mw->mw_status = 0;
454 complete(&mw->mw_complete);
455 }
456}
457static void lockres_or_flags(struct ocfs2_lock_res *lockres, unsigned long or)
458{
459 lockres_set_flags(lockres, lockres->l_flags | or);
460}
461static void lockres_clear_flags(struct ocfs2_lock_res *lockres,
462 unsigned long clear)
463{
464 lockres_set_flags(lockres, lockres->l_flags & ~clear);
465}
466
467static inline void ocfs2_generic_handle_downconvert_action(struct ocfs2_lock_res *lockres)
468{
469 mlog_entry_void();
470
471 BUG_ON(!(lockres->l_flags & OCFS2_LOCK_BUSY));
472 BUG_ON(!(lockres->l_flags & OCFS2_LOCK_ATTACHED));
473 BUG_ON(!(lockres->l_flags & OCFS2_LOCK_BLOCKED));
474 BUG_ON(lockres->l_blocking <= LKM_NLMODE);
475
476 lockres->l_level = lockres->l_requested;
477 if (lockres->l_level <=
478 ocfs2_highest_compat_lock_level(lockres->l_blocking)) {
479 lockres->l_blocking = LKM_NLMODE;
480 lockres_clear_flags(lockres, OCFS2_LOCK_BLOCKED);
481 }
482 lockres_clear_flags(lockres, OCFS2_LOCK_BUSY);
483
484 mlog_exit_void();
485}
486
487static inline void ocfs2_generic_handle_convert_action(struct ocfs2_lock_res *lockres)
488{
489 mlog_entry_void();
490
491 BUG_ON(!(lockres->l_flags & OCFS2_LOCK_BUSY));
492 BUG_ON(!(lockres->l_flags & OCFS2_LOCK_ATTACHED));
493
494 /* Convert from RO to EX doesn't really need anything as our
495 * information is already up to data. Convert from NL to
496 * *anything* however should mark ourselves as needing an
497 * update */
498 if (lockres->l_level == LKM_NLMODE)
499 lockres_or_flags(lockres, OCFS2_LOCK_NEEDS_REFRESH);
500
501 lockres->l_level = lockres->l_requested;
502 lockres_clear_flags(lockres, OCFS2_LOCK_BUSY);
503
504 mlog_exit_void();
505}
506
507static inline void ocfs2_generic_handle_attach_action(struct ocfs2_lock_res *lockres)
508{
509 mlog_entry_void();
510
511 BUG_ON((!lockres->l_flags & OCFS2_LOCK_BUSY));
512 BUG_ON(lockres->l_flags & OCFS2_LOCK_ATTACHED);
513
514 if (lockres->l_requested > LKM_NLMODE &&
515 !(lockres->l_flags & OCFS2_LOCK_LOCAL))
516 lockres_or_flags(lockres, OCFS2_LOCK_NEEDS_REFRESH);
517
518 lockres->l_level = lockres->l_requested;
519 lockres_or_flags(lockres, OCFS2_LOCK_ATTACHED);
520 lockres_clear_flags(lockres, OCFS2_LOCK_BUSY);
521
522 mlog_exit_void();
523}
524
525static void ocfs2_inode_ast_func(void *opaque)
526{
527 struct ocfs2_lock_res *lockres = opaque;
528 struct inode *inode;
529 struct dlm_lockstatus *lksb;
530 unsigned long flags;
531
532 mlog_entry_void();
533
534 inode = ocfs2_lock_res_inode(lockres);
535
536 mlog(0, "AST fired for inode %"MLFu64", l_action = %u, type = %s\n",
537 OCFS2_I(inode)->ip_blkno, lockres->l_action,
538 ocfs2_lock_type_string(lockres->l_type));
539
540 BUG_ON(!ocfs2_is_inode_lock(lockres));
541
542 spin_lock_irqsave(&lockres->l_lock, flags);
543
544 lksb = &(lockres->l_lksb);
545 if (lksb->status != DLM_NORMAL) {
546 mlog(ML_ERROR, "ocfs2_inode_ast_func: lksb status value of %u "
547 "on inode %"MLFu64"\n", lksb->status,
548 OCFS2_I(inode)->ip_blkno);
549 spin_unlock_irqrestore(&lockres->l_lock, flags);
550 mlog_exit_void();
551 return;
552 }
553
554 switch(lockres->l_action) {
555 case OCFS2_AST_ATTACH:
556 ocfs2_generic_handle_attach_action(lockres);
557 lockres_clear_flags(lockres, OCFS2_LOCK_LOCAL);
558 break;
559 case OCFS2_AST_CONVERT:
560 ocfs2_generic_handle_convert_action(lockres);
561 break;
562 case OCFS2_AST_DOWNCONVERT:
563 ocfs2_generic_handle_downconvert_action(lockres);
564 break;
565 default:
566 mlog(ML_ERROR, "lockres %s: ast fired with invalid action: %u "
567 "lockres flags = 0x%lx, unlock action: %u\n",
568 lockres->l_name, lockres->l_action, lockres->l_flags,
569 lockres->l_unlock_action);
570
571 BUG();
572 }
573
574 /* data and rw locking ignores refresh flag for now. */
575 if (lockres->l_type != OCFS2_LOCK_TYPE_META)
576 lockres_clear_flags(lockres, OCFS2_LOCK_NEEDS_REFRESH);
577
578 /* set it to something invalid so if we get called again we
579 * can catch it. */
580 lockres->l_action = OCFS2_AST_INVALID;
581 spin_unlock_irqrestore(&lockres->l_lock, flags);
582 wake_up(&lockres->l_event);
583
584 mlog_exit_void();
585}
586
587static int ocfs2_generic_handle_bast(struct ocfs2_lock_res *lockres,
588 int level)
589{
590 int needs_downconvert = 0;
591 mlog_entry_void();
592
593 assert_spin_locked(&lockres->l_lock);
594
595 lockres_or_flags(lockres, OCFS2_LOCK_BLOCKED);
596
597 if (level > lockres->l_blocking) {
598 /* only schedule a downconvert if we haven't already scheduled
599 * one that goes low enough to satisfy the level we're
600 * blocking. this also catches the case where we get
601 * duplicate BASTs */
602 if (ocfs2_highest_compat_lock_level(level) <
603 ocfs2_highest_compat_lock_level(lockres->l_blocking))
604 needs_downconvert = 1;
605
606 lockres->l_blocking = level;
607 }
608
609 mlog_exit(needs_downconvert);
610 return needs_downconvert;
611}
612
613static void ocfs2_generic_bast_func(struct ocfs2_super *osb,
614 struct ocfs2_lock_res *lockres,
615 int level)
616{
617 int needs_downconvert;
618 unsigned long flags;
619
620 mlog_entry_void();
621
622 BUG_ON(level <= LKM_NLMODE);
623
624 spin_lock_irqsave(&lockres->l_lock, flags);
625 needs_downconvert = ocfs2_generic_handle_bast(lockres, level);
626 if (needs_downconvert)
627 ocfs2_schedule_blocked_lock(osb, lockres);
628 spin_unlock_irqrestore(&lockres->l_lock, flags);
629
630 ocfs2_kick_vote_thread(osb);
631
632 wake_up(&lockres->l_event);
633 mlog_exit_void();
634}
635
636static void ocfs2_inode_bast_func(void *opaque, int level)
637{
638 struct ocfs2_lock_res *lockres = opaque;
639 struct inode *inode;
640 struct ocfs2_super *osb;
641
642 mlog_entry_void();
643
644 BUG_ON(!ocfs2_is_inode_lock(lockres));
645
646 inode = ocfs2_lock_res_inode(lockres);
647 osb = OCFS2_SB(inode->i_sb);
648
649 mlog(0, "BAST fired for inode %"MLFu64", blocking = %d, level = %d "
650 "type = %s\n", OCFS2_I(inode)->ip_blkno, level,
651 lockres->l_level,
652 ocfs2_lock_type_string(lockres->l_type));
653
654 ocfs2_generic_bast_func(osb, lockres, level);
655
656 mlog_exit_void();
657}
658
659static void ocfs2_generic_ast_func(struct ocfs2_lock_res *lockres,
660 int ignore_refresh)
661{
662 struct dlm_lockstatus *lksb = &lockres->l_lksb;
663 unsigned long flags;
664
665 spin_lock_irqsave(&lockres->l_lock, flags);
666
667 if (lksb->status != DLM_NORMAL) {
668 mlog(ML_ERROR, "lockres %s: lksb status value of %u!\n",
669 lockres->l_name, lksb->status);
670 spin_unlock_irqrestore(&lockres->l_lock, flags);
671 return;
672 }
673
674 switch(lockres->l_action) {
675 case OCFS2_AST_ATTACH:
676 ocfs2_generic_handle_attach_action(lockres);
677 break;
678 case OCFS2_AST_CONVERT:
679 ocfs2_generic_handle_convert_action(lockres);
680 break;
681 case OCFS2_AST_DOWNCONVERT:
682 ocfs2_generic_handle_downconvert_action(lockres);
683 break;
684 default:
685 BUG();
686 }
687
688 if (ignore_refresh)
689 lockres_clear_flags(lockres, OCFS2_LOCK_NEEDS_REFRESH);
690
691 /* set it to something invalid so if we get called again we
692 * can catch it. */
693 lockres->l_action = OCFS2_AST_INVALID;
694 spin_unlock_irqrestore(&lockres->l_lock, flags);
695
696 wake_up(&lockres->l_event);
697}
698
/*
 * AST callback for the superblock lock; 'opaque' is the lockres.  The
 * refresh flag is honoured (ignore_refresh = 0).
 */
static void ocfs2_super_ast_func(void *opaque)
{
	struct ocfs2_lock_res *super_res = opaque;

	mlog_entry_void();
	mlog(0, "Superblock AST fired\n");

	BUG_ON(!ocfs2_is_super_lock(super_res));

	ocfs2_generic_ast_func(super_res, 0);

	mlog_exit_void();
}
711
/*
 * BAST callback for the superblock lock; 'opaque' is the lockres and
 * 'level' the level being blocked.
 */
static void ocfs2_super_bast_func(void *opaque,
				  int level)
{
	struct ocfs2_lock_res *super_res = opaque;

	mlog_entry_void();
	mlog(0, "Superblock BAST fired\n");

	BUG_ON(!ocfs2_is_super_lock(super_res));

	ocfs2_generic_bast_func(ocfs2_lock_res_super(super_res),
				super_res, level);

	mlog_exit_void();
}
727
/*
 * AST callback for the rename lock; 'opaque' is the lockres.  The refresh
 * flag is always cleared for this lock (ignore_refresh = 1).
 */
static void ocfs2_rename_ast_func(void *opaque)
{
	struct ocfs2_lock_res *rename_res = opaque;

	mlog_entry_void();

	mlog(0, "Rename AST fired\n");

	BUG_ON(!ocfs2_is_rename_lock(rename_res));

	ocfs2_generic_ast_func(rename_res, 1);

	mlog_exit_void();
}
742
/*
 * BAST callback for the rename lock; 'opaque' is the lockres and 'level'
 * the level being blocked.
 */
static void ocfs2_rename_bast_func(void *opaque,
				   int level)
{
	struct ocfs2_lock_res *rename_res = opaque;

	mlog_entry_void();

	mlog(0, "Rename BAST fired\n");

	BUG_ON(!ocfs2_is_rename_lock(rename_res));

	ocfs2_generic_bast_func(ocfs2_lock_res_super(rename_res),
				rename_res, level);

	mlog_exit_void();
}
760
761static inline void ocfs2_recover_from_dlm_error(struct ocfs2_lock_res *lockres,
762 int convert)
763{
764 unsigned long flags;
765
766 mlog_entry_void();
767 spin_lock_irqsave(&lockres->l_lock, flags);
768 lockres_clear_flags(lockres, OCFS2_LOCK_BUSY);
769 if (convert)
770 lockres->l_action = OCFS2_AST_INVALID;
771 else
772 lockres->l_unlock_action = OCFS2_UNLOCK_INVALID;
773 spin_unlock_irqrestore(&lockres->l_lock, flags);
774
775 wake_up(&lockres->l_event);
776 mlog_exit_void();
777}
778
779/* Note: If we detect another process working on the lock (i.e.,
780 * OCFS2_LOCK_BUSY), we'll bail out returning 0. It's up to the caller
781 * to do the right thing in that case.
782 */
783static int ocfs2_lock_create(struct ocfs2_super *osb,
784 struct ocfs2_lock_res *lockres,
785 int level,
786 int dlm_flags)
787{
788 int ret = 0;
789 enum dlm_status status;
790 unsigned long flags;
791
792 mlog_entry_void();
793
794 mlog(0, "lock %s, level = %d, flags = %d\n", lockres->l_name, level,
795 dlm_flags);
796
797 spin_lock_irqsave(&lockres->l_lock, flags);
798 if ((lockres->l_flags & OCFS2_LOCK_ATTACHED) ||
799 (lockres->l_flags & OCFS2_LOCK_BUSY)) {
800 spin_unlock_irqrestore(&lockres->l_lock, flags);
801 goto bail;
802 }
803
804 lockres->l_action = OCFS2_AST_ATTACH;
805 lockres->l_requested = level;
806 lockres_or_flags(lockres, OCFS2_LOCK_BUSY);
807 spin_unlock_irqrestore(&lockres->l_lock, flags);
808
809 status = dlmlock(osb->dlm,
810 level,
811 &lockres->l_lksb,
812 dlm_flags,
813 lockres->l_name,
814 lockres->l_ops->ast,
815 lockres,
816 lockres->l_ops->bast);
817 if (status != DLM_NORMAL) {
818 ocfs2_log_dlm_error("dlmlock", status, lockres);
819 ret = -EINVAL;
820 ocfs2_recover_from_dlm_error(lockres, 1);
821 }
822
823 mlog(0, "lock %s, successfull return from dlmlock\n", lockres->l_name);
824
825bail:
826 mlog_exit(ret);
827 return ret;
828}
829
830static inline int ocfs2_check_wait_flag(struct ocfs2_lock_res *lockres,
831 int flag)
832{
833 unsigned long flags;
834 int ret;
835
836 spin_lock_irqsave(&lockres->l_lock, flags);
837 ret = lockres->l_flags & flag;
838 spin_unlock_irqrestore(&lockres->l_lock, flags);
839
840 return ret;
841}
842
843static inline void ocfs2_wait_on_busy_lock(struct ocfs2_lock_res *lockres)
844
845{
846 wait_event(lockres->l_event,
847 !ocfs2_check_wait_flag(lockres, OCFS2_LOCK_BUSY));
848}
849
850static inline void ocfs2_wait_on_refreshing_lock(struct ocfs2_lock_res *lockres)
851
852{
853 wait_event(lockres->l_event,
854 !ocfs2_check_wait_flag(lockres, OCFS2_LOCK_REFRESHING));
855}
856
857/* predict what lock level we'll be dropping down to on behalf
858 * of another node, and return true if the currently wanted
859 * level will be compatible with it. */
860static inline int ocfs2_may_continue_on_blocked_lock(struct ocfs2_lock_res *lockres,
861 int wanted)
862{
863 BUG_ON(!(lockres->l_flags & OCFS2_LOCK_BLOCKED));
864
865 return wanted <= ocfs2_highest_compat_lock_level(lockres->l_blocking);
866}
867
868static void ocfs2_init_mask_waiter(struct ocfs2_mask_waiter *mw)
869{
870 INIT_LIST_HEAD(&mw->mw_item);
871 init_completion(&mw->mw_complete);
872}
873
874static int ocfs2_wait_for_mask(struct ocfs2_mask_waiter *mw)
875{
876 wait_for_completion(&mw->mw_complete);
877 /* Re-arm the completion in case we want to wait on it again */
878 INIT_COMPLETION(mw->mw_complete);
879 return mw->mw_status;
880}
881
882static void lockres_add_mask_waiter(struct ocfs2_lock_res *lockres,
883 struct ocfs2_mask_waiter *mw,
884 unsigned long mask,
885 unsigned long goal)
886{
887 BUG_ON(!list_empty(&mw->mw_item));
888
889 assert_spin_locked(&lockres->l_lock);
890
891 list_add_tail(&mw->mw_item, &lockres->l_mask_waiters);
892 mw->mw_mask = mask;
893 mw->mw_goal = goal;
894}
895
896/* returns 0 if the mw that was removed was already satisfied, -EBUSY
897 * if the mask still hadn't reached its goal */
898static int lockres_remove_mask_waiter(struct ocfs2_lock_res *lockres,
899 struct ocfs2_mask_waiter *mw)
900{
901 unsigned long flags;
902 int ret = 0;
903
904 spin_lock_irqsave(&lockres->l_lock, flags);
905 if (!list_empty(&mw->mw_item)) {
906 if ((lockres->l_flags & mw->mw_mask) != mw->mw_goal)
907 ret = -EBUSY;
908
909 list_del_init(&mw->mw_item);
910 init_completion(&mw->mw_complete);
911 }
912 spin_unlock_irqrestore(&lockres->l_lock, flags);
913
914 return ret;
915
916}
917
/*
 * Take a cluster lock on 'lockres' at 'level' and count the caller as a
 * holder.
 *
 * The function loops (label 'again') until the lock can be held at the
 * wanted level:
 *   - if another dlm request is in flight (BUSY) and we need a higher
 *     level than currently granted, wait for BUSY to clear;
 *   - if the dlm lock does not exist yet, create it at NL and retry;
 *   - if the lock is BLOCKED on behalf of another node and 'level' is not
 *     compatible with the level we will drop to, wait for BLOCKED to
 *     clear;
 *   - if the granted level is too low, issue a convert and retry.
 *
 * lkm_flags: extra flags for the dlmlock() convert call.
 * arg_flags: OCFS2_LOCK_NONBLOCK makes the function return -EAGAIN rather
 *            than sleep on a BUSY/BLOCKED lockres.
 *
 * Returns 0 on success.  Errors visible here: -ERESTARTSYS if a signal is
 * pending (unless the OCFS2_MOUNT_NOINTR mount option is set or a convert
 * has already been sent), -EAGAIN for a refused LKM_NOQUEUE convert or a
 * NONBLOCK back-off, -EINVAL for any other dlmlock() failure, or whatever
 * ocfs2_lock_create() returned.
 */
static int ocfs2_cluster_lock(struct ocfs2_super *osb,
			      struct ocfs2_lock_res *lockres,
			      int level,
			      int lkm_flags,
			      int arg_flags)
{
	struct ocfs2_mask_waiter mw;
	enum dlm_status status;
	int wait, catch_signals = !(osb->s_mount_opt & OCFS2_MOUNT_NOINTR);
	int ret = 0; /* gcc doesn't realize wait = 1 guarantees ret is set */
	unsigned long flags;

	mlog_entry_void();

	ocfs2_init_mask_waiter(&mw);

again:
	wait = 0;

	if (catch_signals && signal_pending(current)) {
		ret = -ERESTARTSYS;
		goto out;
	}

	spin_lock_irqsave(&lockres->l_lock, flags);

	mlog_bug_on_msg(lockres->l_flags & OCFS2_LOCK_FREEING,
			"Cluster lock called on freeing lockres %s! flags "
			"0x%lx\n", lockres->l_name, lockres->l_flags);

	/* We only compare against the currently granted level
	 * here. If the lock is blocked waiting on a downconvert,
	 * we'll get caught below. */
	if (lockres->l_flags & OCFS2_LOCK_BUSY &&
	    level > lockres->l_level) {
		/* is someone sitting in dlm_lock? If so, wait on
		 * them. */
		lockres_add_mask_waiter(lockres, &mw, OCFS2_LOCK_BUSY, 0);
		wait = 1;
		goto unlock;
	}

	if (!(lockres->l_flags & OCFS2_LOCK_ATTACHED)) {
		/* lock has not been created yet. */
		spin_unlock_irqrestore(&lockres->l_lock, flags);

		ret = ocfs2_lock_create(osb, lockres, LKM_NLMODE, 0);
		if (ret < 0) {
			mlog_errno(ret);
			goto out;
		}
		goto again;
	}

	if (lockres->l_flags & OCFS2_LOCK_BLOCKED &&
	    !ocfs2_may_continue_on_blocked_lock(lockres, level)) {
		/* the lock is currently blocked on behalf of
		 * another node */
		lockres_add_mask_waiter(lockres, &mw, OCFS2_LOCK_BLOCKED, 0);
		wait = 1;
		goto unlock;
	}

	if (level > lockres->l_level) {
		if (lockres->l_action != OCFS2_AST_INVALID)
			mlog(ML_ERROR, "lockres %s has action %u pending\n",
			     lockres->l_name, lockres->l_action);

		lockres->l_action = OCFS2_AST_CONVERT;
		lockres->l_requested = level;
		lockres_or_flags(lockres, OCFS2_LOCK_BUSY);
		spin_unlock_irqrestore(&lockres->l_lock, flags);

		BUG_ON(level == LKM_IVMODE);
		BUG_ON(level == LKM_NLMODE);

		mlog(0, "lock %s, convert from %d to level = %d\n",
		     lockres->l_name, lockres->l_level, level);

		/* call dlm_lock to upgrade lock now */
		status = dlmlock(osb->dlm,
				 level,
				 &lockres->l_lksb,
				 lkm_flags|LKM_CONVERT|LKM_VALBLK,
				 lockres->l_name,
				 lockres->l_ops->ast,
				 lockres,
				 lockres->l_ops->bast);
		if (status != DLM_NORMAL) {
			/* A refused no-queue request is an expected
			 * outcome and is reported as -EAGAIN without
			 * an error log. */
			if ((lkm_flags & LKM_NOQUEUE) &&
			    (status == DLM_NOTQUEUED))
				ret = -EAGAIN;
			else {
				ocfs2_log_dlm_error("dlmlock", status,
						    lockres);
				ret = -EINVAL;
			}
			ocfs2_recover_from_dlm_error(lockres, 1);
			goto out;
		}

		mlog(0, "lock %s, successfull return from dlmlock\n",
		     lockres->l_name);

		/* At this point we've gone inside the dlm and need to
		 * complete our work regardless. */
		catch_signals = 0;

		/* wait for busy to clear and carry on */
		goto again;
	}

	/* Ok, if we get here then we're good to go. */
	ocfs2_inc_holders(lockres, level);

	ret = 0;
unlock:
	spin_unlock_irqrestore(&lockres->l_lock, flags);
out:
	/*
	 * This is helping work around a lock inversion between the page lock
	 * and dlm locks.  One path holds the page lock while calling aops
	 * which block acquiring dlm locks.  The voting thread holds dlm
	 * locks while acquiring page locks while down converting data locks.
	 * This block is helping an aop path notice the inversion and back
	 * off to unlock its page lock before trying the dlm lock again.
	 */
	if (wait && arg_flags & OCFS2_LOCK_NONBLOCK &&
	    mw.mw_mask & (OCFS2_LOCK_BUSY|OCFS2_LOCK_BLOCKED)) {
		wait = 0;
		/* Non-zero means the flags had not reached their goal
		 * when the waiter was removed; otherwise just retry. */
		if (lockres_remove_mask_waiter(lockres, &mw))
			ret = -EAGAIN;
		else
			goto again;
	}
	if (wait) {
		/* Sleep until lockres_set_flags() completes our waiter. */
		ret = ocfs2_wait_for_mask(&mw);
		if (ret == 0)
			goto again;
		mlog_errno(ret);
	}

	mlog_exit(ret);
	return ret;
}
1063
1064static void ocfs2_cluster_unlock(struct ocfs2_super *osb,
1065 struct ocfs2_lock_res *lockres,
1066 int level)
1067{
1068 unsigned long flags;
1069
1070 mlog_entry_void();
1071 spin_lock_irqsave(&lockres->l_lock, flags);
1072 ocfs2_dec_holders(lockres, level);
1073 ocfs2_vote_on_unlock(osb, lockres);
1074 spin_unlock_irqrestore(&lockres->l_lock, flags);
1075 mlog_exit_void();
1076}
1077
1078static int ocfs2_create_new_inode_lock(struct inode *inode,
1079 struct ocfs2_lock_res *lockres)
1080{
1081 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
1082 unsigned long flags;
1083
1084 spin_lock_irqsave(&lockres->l_lock, flags);
1085 BUG_ON(lockres->l_flags & OCFS2_LOCK_ATTACHED);
1086 lockres_or_flags(lockres, OCFS2_LOCK_LOCAL);
1087 spin_unlock_irqrestore(&lockres->l_lock, flags);
1088
1089 return ocfs2_lock_create(osb, lockres, LKM_EXMODE, LKM_LOCAL);
1090}
1091
1092/* Grants us an EX lock on the data and metadata resources, skipping
1093 * the normal cluster directory lookup. Use this ONLY on newly created
1094 * inodes which other nodes can't possibly see, and which haven't been
1095 * hashed in the inode hash yet. This can give us a good performance
1096 * increase as it'll skip the network broadcast normally associated
1097 * with creating a new lock resource. */
1098int ocfs2_create_new_inode_locks(struct inode *inode)
1099{
1100 int ret;
1101
1102 BUG_ON(!inode);
1103 BUG_ON(!ocfs2_inode_is_new(inode));
1104
1105 mlog_entry_void();
1106
1107 mlog(0, "Inode %"MLFu64"\n", OCFS2_I(inode)->ip_blkno);
1108
1109 /* NOTE: That we don't increment any of the holder counts, nor
1110 * do we add anything to a journal handle. Since this is
1111 * supposed to be a new inode which the cluster doesn't know
1112 * about yet, there is no need to. As far as the LVB handling
1113 * is concerned, this is basically like acquiring an EX lock
1114 * on a resource which has an invalid one -- we'll set it
1115 * valid when we release the EX. */
1116
1117 ret = ocfs2_create_new_inode_lock(inode,
1118 &OCFS2_I(inode)->ip_rw_lockres);
1119 if (ret) {
1120 mlog_errno(ret);
1121 goto bail;
1122 }
1123
1124 ret = ocfs2_create_new_inode_lock(inode,
1125 &OCFS2_I(inode)->ip_meta_lockres);
1126 if (ret) {
1127 mlog_errno(ret);
1128 goto bail;
1129 }
1130
1131 ret = ocfs2_create_new_inode_lock(inode,
1132 &OCFS2_I(inode)->ip_data_lockres);
1133 if (ret) {
1134 mlog_errno(ret);
1135 goto bail;
1136 }
1137
1138bail:
1139 mlog_exit(ret);
1140 return ret;
1141}
1142
1143int ocfs2_rw_lock(struct inode *inode, int write)
1144{
1145 int status, level;
1146 struct ocfs2_lock_res *lockres;
1147
1148 BUG_ON(!inode);
1149
1150 mlog_entry_void();
1151
1152 mlog(0, "inode %"MLFu64" take %s RW lock\n",
1153 OCFS2_I(inode)->ip_blkno,
1154 write ? "EXMODE" : "PRMODE");
1155
1156 lockres = &OCFS2_I(inode)->ip_rw_lockres;
1157
1158 level = write ? LKM_EXMODE : LKM_PRMODE;
1159
1160 status = ocfs2_cluster_lock(OCFS2_SB(inode->i_sb), lockres, level, 0,
1161 0);
1162 if (status < 0)
1163 mlog_errno(status);
1164
1165 mlog_exit(status);
1166 return status;
1167}
1168
1169void ocfs2_rw_unlock(struct inode *inode, int write)
1170{
1171 int level = write ? LKM_EXMODE : LKM_PRMODE;
1172 struct ocfs2_lock_res *lockres = &OCFS2_I(inode)->ip_rw_lockres;
1173
1174 mlog_entry_void();
1175
1176 mlog(0, "inode %"MLFu64" drop %s RW lock\n",
1177 OCFS2_I(inode)->ip_blkno,
1178 write ? "EXMODE" : "PRMODE");
1179
1180 ocfs2_cluster_unlock(OCFS2_SB(inode->i_sb), lockres, level);
1181
1182 mlog_exit_void();
1183}
1184
1185int ocfs2_data_lock_full(struct inode *inode,
1186 int write,
1187 int arg_flags)
1188{
1189 int status = 0, level;
1190 struct ocfs2_lock_res *lockres;
1191
1192 BUG_ON(!inode);
1193
1194 mlog_entry_void();
1195
1196 mlog(0, "inode %"MLFu64" take %s DATA lock\n",
1197 OCFS2_I(inode)->ip_blkno,
1198 write ? "EXMODE" : "PRMODE");
1199
1200 /* We'll allow faking a readonly data lock for
1201 * rodevices. */
1202 if (ocfs2_is_hard_readonly(OCFS2_SB(inode->i_sb))) {
1203 if (write) {
1204 status = -EROFS;
1205 mlog_errno(status);
1206 }
1207 goto out;
1208 }
1209
1210 lockres = &OCFS2_I(inode)->ip_data_lockres;
1211
1212 level = write ? LKM_EXMODE : LKM_PRMODE;
1213
1214 status = ocfs2_cluster_lock(OCFS2_SB(inode->i_sb), lockres, level,
1215 0, arg_flags);
1216 if (status < 0 && status != -EAGAIN)
1217 mlog_errno(status);
1218
1219out:
1220 mlog_exit(status);
1221 return status;
1222}
1223
1224/* see ocfs2_meta_lock_with_page() */
1225int ocfs2_data_lock_with_page(struct inode *inode,
1226 int write,
1227 struct page *page)
1228{
1229 int ret;
1230
1231 ret = ocfs2_data_lock_full(inode, write, OCFS2_LOCK_NONBLOCK);
1232 if (ret == -EAGAIN) {
1233 unlock_page(page);
1234 if (ocfs2_data_lock(inode, write) == 0)
1235 ocfs2_data_unlock(inode, write);
1236 ret = AOP_TRUNCATED_PAGE;
1237 }
1238
1239 return ret;
1240}
1241
1242static void ocfs2_vote_on_unlock(struct ocfs2_super *osb,
1243 struct ocfs2_lock_res *lockres)
1244{
1245 int kick = 0;
1246
1247 mlog_entry_void();
1248
1249 /* If we know that another node is waiting on our lock, kick
1250 * the vote thread * pre-emptively when we reach a release
1251 * condition. */
1252 if (lockres->l_flags & OCFS2_LOCK_BLOCKED) {
1253 switch(lockres->l_blocking) {
1254 case LKM_EXMODE:
1255 if (!lockres->l_ex_holders && !lockres->l_ro_holders)
1256 kick = 1;
1257 break;
1258 case LKM_PRMODE:
1259 if (!lockres->l_ex_holders)
1260 kick = 1;
1261 break;
1262 default:
1263 BUG();
1264 }
1265 }
1266
1267 if (kick)
1268 ocfs2_kick_vote_thread(osb);
1269
1270 mlog_exit_void();
1271}
1272
1273void ocfs2_data_unlock(struct inode *inode,
1274 int write)
1275{
1276 int level = write ? LKM_EXMODE : LKM_PRMODE;
1277 struct ocfs2_lock_res *lockres = &OCFS2_I(inode)->ip_data_lockres;
1278
1279 mlog_entry_void();
1280
1281 mlog(0, "inode %"MLFu64" drop %s DATA lock\n",
1282 OCFS2_I(inode)->ip_blkno,
1283 write ? "EXMODE" : "PRMODE");
1284
1285 if (!ocfs2_is_hard_readonly(OCFS2_SB(inode->i_sb)))
1286 ocfs2_cluster_unlock(OCFS2_SB(inode->i_sb), lockres, level);
1287
1288 mlog_exit_void();
1289}
1290
/* Layout of a timespec packed into 64 bits: the upper OCFS2_SEC_BITS
 * bits hold the seconds and the remaining low bits the nanoseconds.
 * The shift is derived from the bit count so the two cannot drift
 * apart. */
#define OCFS2_SEC_BITS   34
#define OCFS2_SEC_SHIFT  (64 - OCFS2_SEC_BITS)
#define OCFS2_NSEC_MASK  ((1ULL << OCFS2_SEC_SHIFT) - 1)
1294
1295/* LVB only has room for 64 bits of time here so we pack it for
1296 * now. */
1297static u64 ocfs2_pack_timespec(struct timespec *spec)
1298{
1299 u64 res;
1300 u64 sec = spec->tv_sec;
1301 u32 nsec = spec->tv_nsec;
1302
1303 res = (sec << OCFS2_SEC_SHIFT) | (nsec & OCFS2_NSEC_MASK);
1304
1305 return res;
1306}
1307
1308/* Call this with the lockres locked. I am reasonably sure we don't
1309 * need ip_lock in this function as anyone who would be changing those
1310 * values is supposed to be blocked in ocfs2_meta_lock right now. */
1311static void __ocfs2_stuff_meta_lvb(struct inode *inode)
1312{
1313 struct ocfs2_inode_info *oi = OCFS2_I(inode);
1314 struct ocfs2_lock_res *lockres = &oi->ip_meta_lockres;
1315 struct ocfs2_meta_lvb *lvb;
1316
1317 mlog_entry_void();
1318
1319 lvb = (struct ocfs2_meta_lvb *) lockres->l_lksb.lvb;
1320
1321 lvb->lvb_version = cpu_to_be32(OCFS2_LVB_VERSION);
1322 lvb->lvb_isize = cpu_to_be64(i_size_read(inode));
1323 lvb->lvb_iclusters = cpu_to_be32(oi->ip_clusters);
1324 lvb->lvb_iuid = cpu_to_be32(inode->i_uid);
1325 lvb->lvb_igid = cpu_to_be32(inode->i_gid);
1326 lvb->lvb_imode = cpu_to_be16(inode->i_mode);
1327 lvb->lvb_inlink = cpu_to_be16(inode->i_nlink);
1328 lvb->lvb_iatime_packed =
1329 cpu_to_be64(ocfs2_pack_timespec(&inode->i_atime));
1330 lvb->lvb_ictime_packed =
1331 cpu_to_be64(ocfs2_pack_timespec(&inode->i_ctime));
1332 lvb->lvb_imtime_packed =
1333 cpu_to_be64(ocfs2_pack_timespec(&inode->i_mtime));
1334
1335 mlog_meta_lvb(0, lockres);
1336
1337 mlog_exit_void();
1338}
1339
1340static void ocfs2_unpack_timespec(struct timespec *spec,
1341 u64 packed_time)
1342{
1343 spec->tv_sec = packed_time >> OCFS2_SEC_SHIFT;
1344 spec->tv_nsec = packed_time & OCFS2_NSEC_MASK;
1345}
1346
1347static void ocfs2_refresh_inode_from_lvb(struct inode *inode)
1348{
1349 struct ocfs2_inode_info *oi = OCFS2_I(inode);
1350 struct ocfs2_lock_res *lockres = &oi->ip_meta_lockres;
1351 struct ocfs2_meta_lvb *lvb;
1352
1353 mlog_entry_void();
1354
1355 mlog_meta_lvb(0, lockres);
1356
1357 lvb = (struct ocfs2_meta_lvb *) lockres->l_lksb.lvb;
1358
1359 /* We're safe here without the lockres lock... */
1360 spin_lock(&oi->ip_lock);
1361 oi->ip_clusters = be32_to_cpu(lvb->lvb_iclusters);
1362 i_size_write(inode, be64_to_cpu(lvb->lvb_isize));
1363
1364 /* fast-symlinks are a special case */
1365 if (S_ISLNK(inode->i_mode) && !oi->ip_clusters)
1366 inode->i_blocks = 0;
1367 else
1368 inode->i_blocks =
1369 ocfs2_align_bytes_to_sectors(i_size_read(inode));
1370
1371 inode->i_uid = be32_to_cpu(lvb->lvb_iuid);
1372 inode->i_gid = be32_to_cpu(lvb->lvb_igid);
1373 inode->i_mode = be16_to_cpu(lvb->lvb_imode);
1374 inode->i_nlink = be16_to_cpu(lvb->lvb_inlink);
1375 ocfs2_unpack_timespec(&inode->i_atime,
1376 be64_to_cpu(lvb->lvb_iatime_packed));
1377 ocfs2_unpack_timespec(&inode->i_mtime,
1378 be64_to_cpu(lvb->lvb_imtime_packed));
1379 ocfs2_unpack_timespec(&inode->i_ctime,
1380 be64_to_cpu(lvb->lvb_ictime_packed));
1381 spin_unlock(&oi->ip_lock);
1382
1383 mlog_exit_void();
1384}
1385
1386static inline int ocfs2_meta_lvb_is_trustable(struct ocfs2_lock_res *lockres)
1387{
1388 struct ocfs2_meta_lvb *lvb = (struct ocfs2_meta_lvb *) lockres->l_lksb.lvb;
1389
1390 if (be32_to_cpu(lvb->lvb_version) == OCFS2_LVB_VERSION)
1391 return 1;
1392 return 0;
1393}
1394
1395/* Determine whether a lock resource needs to be refreshed, and
1396 * arbitrate who gets to refresh it.
1397 *
1398 * 0 means no refresh needed.
1399 *
1400 * > 0 means you need to refresh this and you MUST call
1401 * ocfs2_complete_lock_res_refresh afterwards. */
1402static int ocfs2_should_refresh_lock_res(struct ocfs2_lock_res *lockres)
1403{
1404 unsigned long flags;
1405 int status = 0;
1406
1407 mlog_entry_void();
1408
1409refresh_check:
1410 spin_lock_irqsave(&lockres->l_lock, flags);
1411 if (!(lockres->l_flags & OCFS2_LOCK_NEEDS_REFRESH)) {
1412 spin_unlock_irqrestore(&lockres->l_lock, flags);
1413 goto bail;
1414 }
1415
1416 if (lockres->l_flags & OCFS2_LOCK_REFRESHING) {
1417 spin_unlock_irqrestore(&lockres->l_lock, flags);
1418
1419 ocfs2_wait_on_refreshing_lock(lockres);
1420 goto refresh_check;
1421 }
1422
1423 /* Ok, I'll be the one to refresh this lock. */
1424 lockres_or_flags(lockres, OCFS2_LOCK_REFRESHING);
1425 spin_unlock_irqrestore(&lockres->l_lock, flags);
1426
1427 status = 1;
1428bail:
1429 mlog_exit(status);
1430 return status;
1431}
1432
1433/* If status is non zero, I'll mark it as not being in refresh
1434 * anymroe, but i won't clear the needs refresh flag. */
1435static inline void ocfs2_complete_lock_res_refresh(struct ocfs2_lock_res *lockres,
1436 int status)
1437{
1438 unsigned long flags;
1439 mlog_entry_void();
1440
1441 spin_lock_irqsave(&lockres->l_lock, flags);
1442 lockres_clear_flags(lockres, OCFS2_LOCK_REFRESHING);
1443 if (!status)
1444 lockres_clear_flags(lockres, OCFS2_LOCK_NEEDS_REFRESH);
1445 spin_unlock_irqrestore(&lockres->l_lock, flags);
1446
1447 wake_up(&lockres->l_event);
1448
1449 mlog_exit_void();
1450}
1451
/* Bring the in-memory inode up to date after its meta lock has been
 * taken.
 *
 * Returns -ENOENT if the inode is flagged OCFS2_INODE_DELETED, 0 if
 * no refresh was needed or the refresh succeeded, and a negative
 * error if the inode block could not be read or failed validation.
 *
 * May or may not return a bh in *bh: one is only read when the LVB
 * cannot be trusted and we have to go to disk. */
static int ocfs2_meta_lock_update(struct inode *inode,
				  struct buffer_head **bh)
{
	int status = 0;
	struct ocfs2_inode_info *oi = OCFS2_I(inode);
	struct ocfs2_lock_res *lockres;
	struct ocfs2_dinode *fe;

	mlog_entry_void();

	/* The deleted check is made under ip_lock. */
	spin_lock(&oi->ip_lock);
	if (oi->ip_flags & OCFS2_INODE_DELETED) {
		mlog(0, "Orphaned inode %"MLFu64" was deleted while we "
		     "were waiting on a lock. ip_flags = 0x%x\n",
		     oi->ip_blkno, oi->ip_flags);
		spin_unlock(&oi->ip_lock);
		status = -ENOENT;
		goto bail;
	}
	spin_unlock(&oi->ip_lock);

	lockres = &oi->ip_meta_lockres;

	/* Zero means nothing to refresh. A non-zero return makes us
	 * the refresher, so every exit below must go through
	 * ocfs2_complete_lock_res_refresh() (label bail_refresh). */
	if (!ocfs2_should_refresh_lock_res(lockres))
		goto bail;

	/* This will discard any caching information we might have had
	 * for the inode metadata. */
	ocfs2_metadata_cache_purge(inode);

	/* will do nothing for inode types that don't use the extent
	 * map (directories, bitmap files, etc) */
	ocfs2_extent_map_trunc(inode, 0);

	if (ocfs2_meta_lvb_is_trustable(lockres)) {
		mlog(0, "Trusting LVB on inode %"MLFu64"\n",
		     oi->ip_blkno);
		ocfs2_refresh_inode_from_lvb(inode);
	} else {
		/* Boo, we have to go to disk. */
		/* read bh, cast, ocfs2_refresh_inode */
		status = ocfs2_read_block(OCFS2_SB(inode->i_sb), oi->ip_blkno,
					  bh, OCFS2_BH_CACHED, inode);
		if (status < 0) {
			mlog_errno(status);
			goto bail_refresh;
		}
		fe = (struct ocfs2_dinode *) (*bh)->b_data;

		/* This is a good chance to make sure we're not
		 * locking an invalid object.
		 *
		 * We bug on a stale inode here because we checked
		 * above whether it was wiped from disk. The wiping
		 * node provides a guarantee that we receive that
		 * message and can mark the inode before dropping any
		 * locks associated with it. */
		if (!OCFS2_IS_VALID_DINODE(fe)) {
			OCFS2_RO_ON_INVALID_DINODE(inode->i_sb, fe);
			status = -EIO;
			goto bail_refresh;
		}
		mlog_bug_on_msg(inode->i_generation !=
				le32_to_cpu(fe->i_generation),
				"Invalid dinode %"MLFu64" disk generation: %u "
				"inode->i_generation: %u\n",
				oi->ip_blkno, le32_to_cpu(fe->i_generation),
				inode->i_generation);
		mlog_bug_on_msg(le64_to_cpu(fe->i_dtime) ||
				!(fe->i_flags & cpu_to_le32(OCFS2_VALID_FL)),
				"Stale dinode %"MLFu64" dtime: %"MLFu64" "
				"flags: 0x%x\n", oi->ip_blkno,
				le64_to_cpu(fe->i_dtime),
				le32_to_cpu(fe->i_flags));

		ocfs2_refresh_inode(inode, fe);
	}

	status = 0;
bail_refresh:
	/* A non-zero status leaves NEEDS_REFRESH set, so the refresh
	 * is attempted again by the next locker. */
	ocfs2_complete_lock_res_refresh(lockres, status);
bail:
	mlog_exit(status);
	return status;
}
1538
1539static int ocfs2_assign_bh(struct inode *inode,
1540 struct buffer_head **ret_bh,
1541 struct buffer_head *passed_bh)
1542{
1543 int status;
1544
1545 if (passed_bh) {
1546 /* Ok, the update went to disk for us, use the
1547 * returned bh. */
1548 *ret_bh = passed_bh;
1549 get_bh(*ret_bh);
1550
1551 return 0;
1552 }
1553
1554 status = ocfs2_read_block(OCFS2_SB(inode->i_sb),
1555 OCFS2_I(inode)->ip_blkno,
1556 ret_bh,
1557 OCFS2_BH_CACHED,
1558 inode);
1559 if (status < 0)
1560 mlog_errno(status);
1561
1562 return status;
1563}
1564
1565/*
1566 * returns < 0 error if the callback will never be called, otherwise
1567 * the result of the lock will be communicated via the callback.
1568 */
1569int ocfs2_meta_lock_full(struct inode *inode,
1570 struct ocfs2_journal_handle *handle,
1571 struct buffer_head **ret_bh,
1572 int ex,
1573 int arg_flags)
1574{
1575 int status, level, dlm_flags, acquired;
1576 struct ocfs2_lock_res *lockres;
1577 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
1578 struct buffer_head *local_bh = NULL;
1579
1580 BUG_ON(!inode);
1581
1582 mlog_entry_void();
1583
1584 mlog(0, "inode %"MLFu64", take %s META lock\n",
1585 OCFS2_I(inode)->ip_blkno,
1586 ex ? "EXMODE" : "PRMODE");
1587
1588 status = 0;
1589 acquired = 0;
1590 /* We'll allow faking a readonly metadata lock for
1591 * rodevices. */
1592 if (ocfs2_is_hard_readonly(osb)) {
1593 if (ex)
1594 status = -EROFS;
1595 goto bail;
1596 }
1597
1598 if (!(arg_flags & OCFS2_META_LOCK_RECOVERY))
1599 wait_event(osb->recovery_event,
1600 ocfs2_node_map_is_empty(osb, &osb->recovery_map));
1601
1602 acquired = 0;
1603 lockres = &OCFS2_I(inode)->ip_meta_lockres;
1604 level = ex ? LKM_EXMODE : LKM_PRMODE;
1605 dlm_flags = 0;
1606 if (arg_flags & OCFS2_META_LOCK_NOQUEUE)
1607 dlm_flags |= LKM_NOQUEUE;
1608
1609 status = ocfs2_cluster_lock(osb, lockres, level, dlm_flags, arg_flags);
1610 if (status < 0) {
1611 if (status != -EAGAIN && status != -EIOCBRETRY)
1612 mlog_errno(status);
1613 goto bail;
1614 }
1615
1616 /* Notify the error cleanup path to drop the cluster lock. */
1617 acquired = 1;
1618
1619 /* We wait twice because a node may have died while we were in
1620 * the lower dlm layers. The second time though, we've
1621 * committed to owning this lock so we don't allow signals to
1622 * abort the operation. */
1623 if (!(arg_flags & OCFS2_META_LOCK_RECOVERY))
1624 wait_event(osb->recovery_event,
1625 ocfs2_node_map_is_empty(osb, &osb->recovery_map));
1626
1627 /* This is fun. The caller may want a bh back, or it may
1628 * not. ocfs2_meta_lock_update definitely wants one in, but
1629 * may or may not read one, depending on what's in the
1630 * LVB. The result of all of this is that we've *only* gone to
1631 * disk if we have to, so the complexity is worthwhile. */
1632 status = ocfs2_meta_lock_update(inode, &local_bh);
1633 if (status < 0) {
1634 if (status != -ENOENT)
1635 mlog_errno(status);
1636 goto bail;
1637 }
1638
1639 if (ret_bh) {
1640 status = ocfs2_assign_bh(inode, ret_bh, local_bh);
1641 if (status < 0) {
1642 mlog_errno(status);
1643 goto bail;
1644 }
1645 }
1646
1647 if (handle) {
1648 status = ocfs2_handle_add_lock(handle, inode);
1649 if (status < 0)
1650 mlog_errno(status);
1651 }
1652
1653bail:
1654 if (status < 0) {
1655 if (ret_bh && (*ret_bh)) {
1656 brelse(*ret_bh);
1657 *ret_bh = NULL;
1658 }
1659 if (acquired)
1660 ocfs2_meta_unlock(inode, ex);
1661 }
1662
1663 if (local_bh)
1664 brelse(local_bh);
1665
1666 mlog_exit(status);
1667 return status;
1668}
1669
1670/*
1671 * This is working around a lock inversion between tasks acquiring DLM locks
1672 * while holding a page lock and the vote thread which blocks dlm lock acquiry
1673 * while acquiring page locks.
1674 *
1675 * ** These _with_page variantes are only intended to be called from aop
1676 * methods that hold page locks and return a very specific *positive* error
1677 * code that aop methods pass up to the VFS -- test for errors with != 0. **
1678 *
1679 * The DLM is called such that it returns -EAGAIN if it would have blocked
1680 * waiting for the vote thread. In that case we unlock our page so the vote
1681 * thread can make progress. Once we've done this we have to return
1682 * AOP_TRUNCATED_PAGE so the aop method that called us can bubble that back up
1683 * into the VFS who will then immediately retry the aop call.
1684 *
1685 * We do a blocking lock and immediate unlock before returning, though, so that
1686 * the lock has a great chance of being cached on this node by the time the VFS
1687 * calls back to retry the aop. This has a potential to livelock as nodes
1688 * ping locks back and forth, but that's a risk we're willing to take to avoid
1689 * the lock inversion simply.
1690 */
1691int ocfs2_meta_lock_with_page(struct inode *inode,
1692 struct ocfs2_journal_handle *handle,
1693 struct buffer_head **ret_bh,
1694 int ex,
1695 struct page *page)
1696{
1697 int ret;
1698
1699 ret = ocfs2_meta_lock_full(inode, handle, ret_bh, ex,
1700 OCFS2_LOCK_NONBLOCK);
1701 if (ret == -EAGAIN) {
1702 unlock_page(page);
1703 if (ocfs2_meta_lock(inode, handle, ret_bh, ex) == 0)
1704 ocfs2_meta_unlock(inode, ex);
1705 ret = AOP_TRUNCATED_PAGE;
1706 }
1707
1708 return ret;
1709}
1710
1711void ocfs2_meta_unlock(struct inode *inode,
1712 int ex)
1713{
1714 int level = ex ? LKM_EXMODE : LKM_PRMODE;
1715 struct ocfs2_lock_res *lockres = &OCFS2_I(inode)->ip_meta_lockres;
1716
1717 mlog_entry_void();
1718
1719 mlog(0, "inode %"MLFu64" drop %s META lock\n",
1720 OCFS2_I(inode)->ip_blkno,
1721 ex ? "EXMODE" : "PRMODE");
1722
1723 if (!ocfs2_is_hard_readonly(OCFS2_SB(inode->i_sb)))
1724 ocfs2_cluster_unlock(OCFS2_SB(inode->i_sb), lockres, level);
1725
1726 mlog_exit_void();
1727}
1728
1729int ocfs2_super_lock(struct ocfs2_super *osb,
1730 int ex)
1731{
1732 int status;
1733 int level = ex ? LKM_EXMODE : LKM_PRMODE;
1734 struct ocfs2_lock_res *lockres = &osb->osb_super_lockres;
1735 struct buffer_head *bh;
1736 struct ocfs2_slot_info *si = osb->slot_info;
1737
1738 mlog_entry_void();
1739
1740 if (ocfs2_is_hard_readonly(osb))
1741 return -EROFS;
1742
1743 status = ocfs2_cluster_lock(osb, lockres, level, 0, 0);
1744 if (status < 0) {
1745 mlog_errno(status);
1746 goto bail;
1747 }
1748
1749 /* The super block lock path is really in the best position to
1750 * know when resources covered by the lock need to be
1751 * refreshed, so we do it here. Of course, making sense of
1752 * everything is up to the caller :) */
1753 status = ocfs2_should_refresh_lock_res(lockres);
1754 if (status < 0) {
1755 mlog_errno(status);
1756 goto bail;
1757 }
1758 if (status) {
1759 bh = si->si_bh;
1760 status = ocfs2_read_block(osb, bh->b_blocknr, &bh, 0,
1761 si->si_inode);
1762 if (status == 0)
1763 ocfs2_update_slot_info(si);
1764
1765 ocfs2_complete_lock_res_refresh(lockres, status);
1766
1767 if (status < 0)
1768 mlog_errno(status);
1769 }
1770bail:
1771 mlog_exit(status);
1772 return status;
1773}
1774
1775void ocfs2_super_unlock(struct ocfs2_super *osb,
1776 int ex)
1777{
1778 int level = ex ? LKM_EXMODE : LKM_PRMODE;
1779 struct ocfs2_lock_res *lockres = &osb->osb_super_lockres;
1780
1781 ocfs2_cluster_unlock(osb, lockres, level);
1782}
1783
1784int ocfs2_rename_lock(struct ocfs2_super *osb)
1785{
1786 int status;
1787 struct ocfs2_lock_res *lockres = &osb->osb_rename_lockres;
1788
1789 if (ocfs2_is_hard_readonly(osb))
1790 return -EROFS;
1791
1792 status = ocfs2_cluster_lock(osb, lockres, LKM_EXMODE, 0, 0);
1793 if (status < 0)
1794 mlog_errno(status);
1795
1796 return status;
1797}
1798
1799void ocfs2_rename_unlock(struct ocfs2_super *osb)
1800{
1801 struct ocfs2_lock_res *lockres = &osb->osb_rename_lockres;
1802
1803 ocfs2_cluster_unlock(osb, lockres, LKM_EXMODE);
1804}
1805
1806/* Reference counting of the dlm debug structure. We want this because
1807 * open references on the debug inodes can live on after a mount, so
1808 * we can't rely on the ocfs2_super to always exist. */
1809static void ocfs2_dlm_debug_free(struct kref *kref)
1810{
1811 struct ocfs2_dlm_debug *dlm_debug;
1812
1813 dlm_debug = container_of(kref, struct ocfs2_dlm_debug, d_refcnt);
1814
1815 kfree(dlm_debug);
1816}
1817
1818void ocfs2_put_dlm_debug(struct ocfs2_dlm_debug *dlm_debug)
1819{
1820 if (dlm_debug)
1821 kref_put(&dlm_debug->d_refcnt, ocfs2_dlm_debug_free);
1822}
1823
1824static void ocfs2_get_dlm_debug(struct ocfs2_dlm_debug *debug)
1825{
1826 kref_get(&debug->d_refcnt);
1827}
1828
1829struct ocfs2_dlm_debug *ocfs2_new_dlm_debug(void)
1830{
1831 struct ocfs2_dlm_debug *dlm_debug;
1832
1833 dlm_debug = kmalloc(sizeof(struct ocfs2_dlm_debug), GFP_KERNEL);
1834 if (!dlm_debug) {
1835 mlog_errno(-ENOMEM);
1836 goto out;
1837 }
1838
1839 kref_init(&dlm_debug->d_refcnt);
1840 INIT_LIST_HEAD(&dlm_debug->d_lockres_tracking);
1841 dlm_debug->d_locking_state = NULL;
1842out:
1843 return dlm_debug;
1844}
1845
/* Per-open state of the locking_state debugfs file.
 * Access to this is arbitrated for us via seq_file->sem. */
struct ocfs2_dlm_seq_priv {
	/* Referenced in ocfs2_dlm_debug_open(), put in ..._release(). */
	struct ocfs2_dlm_debug *p_dlm_debug;
	/* Dummy lockres (NULL l_ops) linked into the tracking list to
	 * remember where this reader's iteration currently is. */
	struct ocfs2_lock_res p_iter_res;
	/* Copy of the lockres being shown, taken under the tracking
	 * lock so it can be printed after the lock is dropped. */
	struct ocfs2_lock_res p_tmp_res;
};
1852
1853static struct ocfs2_lock_res *ocfs2_dlm_next_res(struct ocfs2_lock_res *start,
1854 struct ocfs2_dlm_seq_priv *priv)
1855{
1856 struct ocfs2_lock_res *iter, *ret = NULL;
1857 struct ocfs2_dlm_debug *dlm_debug = priv->p_dlm_debug;
1858
1859 assert_spin_locked(&ocfs2_dlm_tracking_lock);
1860
1861 list_for_each_entry(iter, &start->l_debug_list, l_debug_list) {
1862 /* discover the head of the list */
1863 if (&iter->l_debug_list == &dlm_debug->d_lockres_tracking) {
1864 mlog(0, "End of list found, %p\n", ret);
1865 break;
1866 }
1867
1868 /* We track our "dummy" iteration lockres' by a NULL
1869 * l_ops field. */
1870 if (iter->l_ops != NULL) {
1871 ret = iter;
1872 break;
1873 }
1874 }
1875
1876 return ret;
1877}
1878
1879static void *ocfs2_dlm_seq_start(struct seq_file *m, loff_t *pos)
1880{
1881 struct ocfs2_dlm_seq_priv *priv = m->private;
1882 struct ocfs2_lock_res *iter;
1883
1884 spin_lock(&ocfs2_dlm_tracking_lock);
1885 iter = ocfs2_dlm_next_res(&priv->p_iter_res, priv);
1886 if (iter) {
1887 /* Since lockres' have the lifetime of their container
1888 * (which can be inodes, ocfs2_supers, etc) we want to
1889 * copy this out to a temporary lockres while still
1890 * under the spinlock. Obviously after this we can't
1891 * trust any pointers on the copy returned, but that's
1892 * ok as the information we want isn't typically held
1893 * in them. */
1894 priv->p_tmp_res = *iter;
1895 iter = &priv->p_tmp_res;
1896 }
1897 spin_unlock(&ocfs2_dlm_tracking_lock);
1898
1899 return iter;
1900}
1901
/* seq_file stop callback. Nothing to do: start/next drop the tracking
 * lock themselves before returning. */
static void ocfs2_dlm_seq_stop(struct seq_file *m, void *v)
{
}
1905
1906static void *ocfs2_dlm_seq_next(struct seq_file *m, void *v, loff_t *pos)
1907{
1908 struct ocfs2_dlm_seq_priv *priv = m->private;
1909 struct ocfs2_lock_res *iter = v;
1910 struct ocfs2_lock_res *dummy = &priv->p_iter_res;
1911
1912 spin_lock(&ocfs2_dlm_tracking_lock);
1913 iter = ocfs2_dlm_next_res(iter, priv);
1914 list_del_init(&dummy->l_debug_list);
1915 if (iter) {
1916 list_add(&dummy->l_debug_list, &iter->l_debug_list);
1917 priv->p_tmp_res = *iter;
1918 iter = &priv->p_tmp_res;
1919 }
1920 spin_unlock(&ocfs2_dlm_tracking_lock);
1921
1922 return iter;
1923}
1924
1925/* So that debugfs.ocfs2 can determine which format is being used */
1926#define OCFS2_DLM_DEBUG_STR_VERSION 1
1927static int ocfs2_dlm_seq_show(struct seq_file *m, void *v)
1928{
1929 int i;
1930 char *lvb;
1931 struct ocfs2_lock_res *lockres = v;
1932
1933 if (!lockres)
1934 return -EINVAL;
1935
1936 seq_printf(m, "0x%x\t"
1937 "%.*s\t"
1938 "%d\t"
1939 "0x%lx\t"
1940 "0x%x\t"
1941 "0x%x\t"
1942 "%u\t"
1943 "%u\t"
1944 "%d\t"
1945 "%d\t",
1946 OCFS2_DLM_DEBUG_STR_VERSION,
1947 OCFS2_LOCK_ID_MAX_LEN, lockres->l_name,
1948 lockres->l_level,
1949 lockres->l_flags,
1950 lockres->l_action,
1951 lockres->l_unlock_action,
1952 lockres->l_ro_holders,
1953 lockres->l_ex_holders,
1954 lockres->l_requested,
1955 lockres->l_blocking);
1956
1957 /* Dump the raw LVB */
1958 lvb = lockres->l_lksb.lvb;
1959 for(i = 0; i < DLM_LVB_LEN; i++)
1960 seq_printf(m, "0x%x\t", lvb[i]);
1961
1962 /* End the line */
1963 seq_printf(m, "\n");
1964 return 0;
1965}
1966
/* seq_file iterator for the locking_state debugfs file; installed by
 * ocfs2_dlm_debug_open() via seq_open(). */
static struct seq_operations ocfs2_dlm_seq_ops = {
	.start =	ocfs2_dlm_seq_start,
	.stop =		ocfs2_dlm_seq_stop,
	.next =		ocfs2_dlm_seq_next,
	.show =		ocfs2_dlm_seq_show,
};
1973
1974static int ocfs2_dlm_debug_release(struct inode *inode, struct file *file)
1975{
1976 struct seq_file *seq = (struct seq_file *) file->private_data;
1977 struct ocfs2_dlm_seq_priv *priv = seq->private;
1978 struct ocfs2_lock_res *res = &priv->p_iter_res;
1979
1980 ocfs2_remove_lockres_tracking(res);
1981 ocfs2_put_dlm_debug(priv->p_dlm_debug);
1982 return seq_release_private(inode, file);
1983}
1984
1985static int ocfs2_dlm_debug_open(struct inode *inode, struct file *file)
1986{
1987 int ret;
1988 struct ocfs2_dlm_seq_priv *priv;
1989 struct seq_file *seq;
1990 struct ocfs2_super *osb;
1991
1992 priv = kzalloc(sizeof(struct ocfs2_dlm_seq_priv), GFP_KERNEL);
1993 if (!priv) {
1994 ret = -ENOMEM;
1995 mlog_errno(ret);
1996 goto out;
1997 }
1998 osb = (struct ocfs2_super *) inode->u.generic_ip;
1999 ocfs2_get_dlm_debug(osb->osb_dlm_debug);
2000 priv->p_dlm_debug = osb->osb_dlm_debug;
2001 INIT_LIST_HEAD(&priv->p_iter_res.l_debug_list);
2002
2003 ret = seq_open(file, &ocfs2_dlm_seq_ops);
2004 if (ret) {
2005 kfree(priv);
2006 mlog_errno(ret);
2007 goto out;
2008 }
2009
2010 seq = (struct seq_file *) file->private_data;
2011 seq->private = priv;
2012
2013 ocfs2_add_lockres_tracking(&priv->p_iter_res,
2014 priv->p_dlm_debug);
2015
2016out:
2017 return ret;
2018}
2019
/* File operations of the locking_state debugfs file: our own
 * open/release around the generic seq_file read and llseek. */
static struct file_operations ocfs2_dlm_debug_fops = {
	.open =		ocfs2_dlm_debug_open,
	.release =	ocfs2_dlm_debug_release,
	.read =		seq_read,
	.llseek =	seq_lseek,
};
2026
2027static int ocfs2_dlm_init_debug(struct ocfs2_super *osb)
2028{
2029 int ret = 0;
2030 struct ocfs2_dlm_debug *dlm_debug = osb->osb_dlm_debug;
2031
2032 dlm_debug->d_locking_state = debugfs_create_file("locking_state",
2033 S_IFREG|S_IRUSR,
2034 osb->osb_debug_root,
2035 osb,
2036 &ocfs2_dlm_debug_fops);
2037 if (!dlm_debug->d_locking_state) {
2038 ret = -EINVAL;
2039 mlog(ML_ERROR,
2040 "Unable to create locking state debugfs file.\n");
2041 goto out;
2042 }
2043
2044 ocfs2_get_dlm_debug(dlm_debug);
2045out:
2046 return ret;
2047}
2048
2049static void ocfs2_dlm_shutdown_debug(struct ocfs2_super *osb)
2050{
2051 struct ocfs2_dlm_debug *dlm_debug = osb->osb_dlm_debug;
2052
2053 if (dlm_debug) {
2054 debugfs_remove(dlm_debug->d_locking_state);
2055 ocfs2_put_dlm_debug(dlm_debug);
2056 }
2057}
2058
2059int ocfs2_dlm_init(struct ocfs2_super *osb)
2060{
2061 int status;
2062 u32 dlm_key;
2063 struct dlm_ctxt *dlm;
2064
2065 mlog_entry_void();
2066
2067 status = ocfs2_dlm_init_debug(osb);
2068 if (status < 0) {
2069 mlog_errno(status);
2070 goto bail;
2071 }
2072
2073 /* launch vote thread */
2074 osb->vote_task = kthread_run(ocfs2_vote_thread, osb, "ocfs2vote-%d",
2075 osb->osb_id);
2076 if (IS_ERR(osb->vote_task)) {
2077 status = PTR_ERR(osb->vote_task);
2078 osb->vote_task = NULL;
2079 mlog_errno(status);
2080 goto bail;
2081 }
2082
2083 /* used by the dlm code to make message headers unique, each
2084 * node in this domain must agree on this. */
2085 dlm_key = crc32_le(0, osb->uuid_str, strlen(osb->uuid_str));
2086
2087 /* for now, uuid == domain */
2088 dlm = dlm_register_domain(osb->uuid_str, dlm_key);
2089 if (IS_ERR(dlm)) {
2090 status = PTR_ERR(dlm);
2091 mlog_errno(status);
2092 goto bail;
2093 }
2094
2095 ocfs2_super_lock_res_init(&osb->osb_super_lockres, osb);
2096 ocfs2_rename_lock_res_init(&osb->osb_rename_lockres, osb);
2097
2098 dlm_register_eviction_cb(dlm, &osb->osb_eviction_cb);
2099
2100 osb->dlm = dlm;
2101
2102 status = 0;
2103bail:
2104 if (status < 0) {
2105 ocfs2_dlm_shutdown_debug(osb);
2106 if (osb->vote_task)
2107 kthread_stop(osb->vote_task);
2108 }
2109
2110 mlog_exit(status);
2111 return status;
2112}
2113
/*
 * Tear down what ocfs2_dlm_init() set up: unregister the eviction
 * callback, drop the osb-level locks, stop the vote thread, free the
 * super and rename lock resources, leave the dlm domain and finally
 * shut down the debug state.
 */
void ocfs2_dlm_shutdown(struct ocfs2_super *osb)
{
	mlog_entry_void();

	dlm_unregister_eviction_cb(&osb->osb_eviction_cb);

	ocfs2_drop_osb_locks(osb);

	/* vote_task is cleared so the thread is stopped only once. */
	if (osb->vote_task) {
		kthread_stop(osb->vote_task);
		osb->vote_task = NULL;
	}

	ocfs2_lock_res_free(&osb->osb_super_lockres);
	ocfs2_lock_res_free(&osb->osb_rename_lockres);

	dlm_unregister_domain(osb->dlm);
	osb->dlm = NULL;

	ocfs2_dlm_shutdown_debug(osb);

	mlog_exit_void();
}
2137
/*
 * Unlock AST: called with the lockres as @opaque and the dlm's
 * @status for an unlock or cancel-convert request.
 *
 * On success the pending l_unlock_action is completed (a cancelled
 * convert resets l_action, a dropped lock resets l_level to
 * LKM_IVMODE), OCFS2_LOCK_BUSY is cleared and waiters on l_event are
 * woken. Any status other than DLM_NORMAL -- apart from the
 * DLM_CANCELGRANT case handled first -- is only logged: the function
 * then returns without touching BUSY or l_unlock_action and without
 * waking anyone.
 */
static void ocfs2_unlock_ast_func(void *opaque, enum dlm_status status)
{
	struct ocfs2_lock_res *lockres = opaque;
	unsigned long flags;

	mlog_entry_void();

	mlog(0, "UNLOCK AST called on lock %s, action = %d\n", lockres->l_name,
	     lockres->l_unlock_action);

	spin_lock_irqsave(&lockres->l_lock, flags);
	/* We tried to cancel a convert request, but it was already
	 * granted. All we want to do here is clear our unlock
	 * state. The wake_up call done at the bottom is redundant
	 * (ocfs2_prepare_cancel_convert doesn't sleep on this) but doesn't
	 * hurt anything anyway */
	if (status == DLM_CANCELGRANT &&
	    lockres->l_unlock_action == OCFS2_UNLOCK_CANCEL_CONVERT) {
		mlog(0, "Got cancelgrant for %s\n", lockres->l_name);

		/* We don't clear the busy flag in this case as it
		 * should have been cleared by the ast which the dlm
		 * has called. */
		goto complete_unlock;
	}

	if (status != DLM_NORMAL) {
		mlog(ML_ERROR, "Dlm passes status %d for lock %s, "
		     "unlock_action %d\n", status, lockres->l_name,
		     lockres->l_unlock_action);
		spin_unlock_irqrestore(&lockres->l_lock, flags);
		return;
	}

	switch(lockres->l_unlock_action) {
	case OCFS2_UNLOCK_CANCEL_CONVERT:
		mlog(0, "Cancel convert success for %s\n", lockres->l_name);
		lockres->l_action = OCFS2_AST_INVALID;
		break;
	case OCFS2_UNLOCK_DROP_LOCK:
		lockres->l_level = LKM_IVMODE;
		break;
	default:
		/* No other unlock action may be pending here. */
		BUG();
	}

	lockres_clear_flags(lockres, OCFS2_LOCK_BUSY);
complete_unlock:
	lockres->l_unlock_action = OCFS2_UNLOCK_INVALID;
	spin_unlock_irqrestore(&lockres->l_lock, flags);

	wake_up(&lockres->l_event);

	mlog_exit_void();
}
2193
/* Hook that ocfs2_drop_lock() calls on the lockres, with l_lock held
 * and after waiting out a busy lock, before it continues with the
 * drop. */
typedef void (ocfs2_pre_drop_cb_t)(struct ocfs2_lock_res *, void *);

/* Optional pre-drop hook for ocfs2_drop_lock(): drop_func is called
 * with the lockres and drop_data. */
struct drop_lock_cb {
	ocfs2_pre_drop_cb_t	*drop_func;
	void			*drop_data;
};
2200
/*
 * Drop the dlm lock behind @lockres.
 *
 * Does nothing for a lockres that was never initialized or is not
 * attached. Otherwise it waits for a busy lock to settle, runs the
 * optional pre-drop hook in @dcb under l_lock, issues dlmunlock()
 * with LKM_VALBLK and waits until the lockres stops being busy.
 * The lockres must already be flagged OCFS2_LOCK_FREEING.
 *
 * Always returns 0; a failing dlmunlock() is treated as fatal (BUG).
 */
static int ocfs2_drop_lock(struct ocfs2_super *osb,
			   struct ocfs2_lock_res *lockres,
			   struct drop_lock_cb *dcb)
{
	enum dlm_status status;
	unsigned long flags;

	/* We didn't get anywhere near actually using this lockres. */
	if (!(lockres->l_flags & OCFS2_LOCK_INITIALIZED))
		goto out;

	spin_lock_irqsave(&lockres->l_lock, flags);

	mlog_bug_on_msg(!(lockres->l_flags & OCFS2_LOCK_FREEING),
			"lockres %s, flags 0x%lx\n",
			lockres->l_name, lockres->l_flags);

	/* l_lock is dropped around each wait and re-taken before the
	 * flag is checked again. */
	while (lockres->l_flags & OCFS2_LOCK_BUSY) {
		mlog(0, "waiting on busy lock \"%s\": flags = %lx, action = "
		     "%u, unlock_action = %u\n",
		     lockres->l_name, lockres->l_flags, lockres->l_action,
		     lockres->l_unlock_action);

		spin_unlock_irqrestore(&lockres->l_lock, flags);

		/* XXX: Today we just wait on any busy
		 * locks... Perhaps we need to cancel converts in the
		 * future? */
		ocfs2_wait_on_busy_lock(lockres);

		spin_lock_irqsave(&lockres->l_lock, flags);
	}

	/* Optional pre-drop hook, called with l_lock held. */
	if (dcb)
		dcb->drop_func(lockres, dcb->drop_data);

	if (lockres->l_flags & OCFS2_LOCK_BUSY)
		mlog(ML_ERROR, "destroying busy lock: \"%s\"\n",
		     lockres->l_name);
	if (lockres->l_flags & OCFS2_LOCK_BLOCKED)
		mlog(0, "destroying blocked lock: \"%s\"\n", lockres->l_name);

	/* Not attached: there is nothing to unlock. */
	if (!(lockres->l_flags & OCFS2_LOCK_ATTACHED)) {
		spin_unlock_irqrestore(&lockres->l_lock, flags);
		goto out;
	}

	lockres_clear_flags(lockres, OCFS2_LOCK_ATTACHED);

	/* make sure we never get here while waiting for an ast to
	 * fire. */
	BUG_ON(lockres->l_action != OCFS2_AST_INVALID);

	/* is this necessary? */
	lockres_or_flags(lockres, OCFS2_LOCK_BUSY);
	lockres->l_unlock_action = OCFS2_UNLOCK_DROP_LOCK;
	spin_unlock_irqrestore(&lockres->l_lock, flags);

	mlog(0, "lock %s\n", lockres->l_name);

	status = dlmunlock(osb->dlm, &lockres->l_lksb, LKM_VALBLK,
			   lockres->l_ops->unlock_ast, lockres);
	if (status != DLM_NORMAL) {
		ocfs2_log_dlm_error("dlmunlock", status, lockres);
		mlog(ML_ERROR, "lockres flags: %lu\n", lockres->l_flags);
		dlm_print_one_lock(lockres->l_lksb.lockid);
		BUG();
	}
	mlog(0, "lock %s, successfull return from dlmunlock\n",
	     lockres->l_name);

	/* Wait for the BUSY flag set above to be cleared again. */
	ocfs2_wait_on_busy_lock(lockres);
out:
	mlog_exit(0);
	return 0;
}
2277
2278/* Mark the lockres as being dropped. It will no longer be
2279 * queued if blocking, but we still may have to wait on it
2280 * being dequeued from the vote thread before we can consider
2281 * it safe to drop.
2282 *
2283 * You can *not* attempt to call cluster_lock on this lockres anymore. */
2284void ocfs2_mark_lockres_freeing(struct ocfs2_lock_res *lockres)
2285{
2286 int status;
2287 struct ocfs2_mask_waiter mw;
2288 unsigned long flags;
2289
2290 ocfs2_init_mask_waiter(&mw);
2291
2292 spin_lock_irqsave(&lockres->l_lock, flags);
2293 lockres->l_flags |= OCFS2_LOCK_FREEING;
2294 while (lockres->l_flags & OCFS2_LOCK_QUEUED) {
2295 lockres_add_mask_waiter(lockres, &mw, OCFS2_LOCK_QUEUED, 0);
2296 spin_unlock_irqrestore(&lockres->l_lock, flags);
2297
2298 mlog(0, "Waiting on lockres %s\n", lockres->l_name);
2299
2300 status = ocfs2_wait_for_mask(&mw);
2301 if (status)
2302 mlog_errno(status);
2303
2304 spin_lock_irqsave(&lockres->l_lock, flags);
2305 }
2306 spin_unlock_irqrestore(&lockres->l_lock, flags);
2307}
2308
2309static void ocfs2_drop_osb_locks(struct ocfs2_super *osb)
2310{
2311 int status;
2312
2313 mlog_entry_void();
2314
2315 ocfs2_mark_lockres_freeing(&osb->osb_super_lockres);
2316
2317 status = ocfs2_drop_lock(osb, &osb->osb_super_lockres, NULL);
2318 if (status < 0)
2319 mlog_errno(status);
2320
2321 ocfs2_mark_lockres_freeing(&osb->osb_rename_lockres);
2322
2323 status = ocfs2_drop_lock(osb, &osb->osb_rename_lockres, NULL);
2324 if (status < 0)
2325 mlog_errno(status);
2326
2327 mlog_exit(status);
2328}
2329
2330static void ocfs2_meta_pre_drop(struct ocfs2_lock_res *lockres, void *data)
2331{
2332 struct inode *inode = data;
2333
2334 /* the metadata lock requires a bit more work as we have an
2335 * LVB to worry about. */
2336 if (lockres->l_flags & OCFS2_LOCK_ATTACHED &&
2337 lockres->l_level == LKM_EXMODE &&
2338 !(lockres->l_flags & OCFS2_LOCK_NEEDS_REFRESH))
2339 __ocfs2_stuff_meta_lvb(inode);
2340}
2341
2342int ocfs2_drop_inode_locks(struct inode *inode)
2343{
2344 int status, err;
2345 struct drop_lock_cb meta_dcb = { ocfs2_meta_pre_drop, inode, };
2346
2347 mlog_entry_void();
2348
2349 /* No need to call ocfs2_mark_lockres_freeing here -
2350 * ocfs2_clear_inode has done it for us. */
2351
2352 err = ocfs2_drop_lock(OCFS2_SB(inode->i_sb),
2353 &OCFS2_I(inode)->ip_data_lockres,
2354 NULL);
2355 if (err < 0)
2356 mlog_errno(err);
2357
2358 status = err;
2359
2360 err = ocfs2_drop_lock(OCFS2_SB(inode->i_sb),
2361 &OCFS2_I(inode)->ip_meta_lockres,
2362 &meta_dcb);
2363 if (err < 0)
2364 mlog_errno(err);
2365 if (err < 0 && !status)
2366 status = err;
2367
2368 err = ocfs2_drop_lock(OCFS2_SB(inode->i_sb),
2369 &OCFS2_I(inode)->ip_rw_lockres,
2370 NULL);
2371 if (err < 0)
2372 mlog_errno(err);
2373 if (err < 0 && !status)
2374 status = err;
2375
2376 mlog_exit(status);
2377 return status;
2378}
2379
2380static void ocfs2_prepare_downconvert(struct ocfs2_lock_res *lockres,
2381 int new_level)
2382{
2383 assert_spin_locked(&lockres->l_lock);
2384
2385 BUG_ON(lockres->l_blocking <= LKM_NLMODE);
2386
2387 if (lockres->l_level <= new_level) {
2388 mlog(ML_ERROR, "lockres->l_level (%u) <= new_level (%u)\n",
2389 lockres->l_level, new_level);
2390 BUG();
2391 }
2392
2393 mlog(0, "lock %s, new_level = %d, l_blocking = %d\n",
2394 lockres->l_name, new_level, lockres->l_blocking);
2395
2396 lockres->l_action = OCFS2_AST_DOWNCONVERT;
2397 lockres->l_requested = new_level;
2398 lockres_or_flags(lockres, OCFS2_LOCK_BUSY);
2399}
2400
2401static int ocfs2_downconvert_lock(struct ocfs2_super *osb,
2402 struct ocfs2_lock_res *lockres,
2403 int new_level,
2404 int lvb)
2405{
2406 int ret, dlm_flags = LKM_CONVERT;
2407 enum dlm_status status;
2408
2409 mlog_entry_void();
2410
2411 if (lvb)
2412 dlm_flags |= LKM_VALBLK;
2413
2414 status = dlmlock(osb->dlm,
2415 new_level,
2416 &lockres->l_lksb,
2417 dlm_flags,
2418 lockres->l_name,
2419 lockres->l_ops->ast,
2420 lockres,
2421 lockres->l_ops->bast);
2422 if (status != DLM_NORMAL) {
2423 ocfs2_log_dlm_error("dlmlock", status, lockres);
2424 ret = -EINVAL;
2425 ocfs2_recover_from_dlm_error(lockres, 1);
2426 goto bail;
2427 }
2428
2429 ret = 0;
2430bail:
2431 mlog_exit(ret);
2432 return ret;
2433}
2434
2435/* returns 1 when the caller should unlock and call dlmunlock */
2436static int ocfs2_prepare_cancel_convert(struct ocfs2_super *osb,
2437 struct ocfs2_lock_res *lockres)
2438{
2439 assert_spin_locked(&lockres->l_lock);
2440
2441 mlog_entry_void();
2442 mlog(0, "lock %s\n", lockres->l_name);
2443
2444 if (lockres->l_unlock_action == OCFS2_UNLOCK_CANCEL_CONVERT) {
2445 /* If we're already trying to cancel a lock conversion
2446 * then just drop the spinlock and allow the caller to
2447 * requeue this lock. */
2448
2449 mlog(0, "Lockres %s, skip convert\n", lockres->l_name);
2450 return 0;
2451 }
2452
2453 /* were we in a convert when we got the bast fire? */
2454 BUG_ON(lockres->l_action != OCFS2_AST_CONVERT &&
2455 lockres->l_action != OCFS2_AST_DOWNCONVERT);
2456 /* set things up for the unlockast to know to just
2457 * clear out the ast_action and unset busy, etc. */
2458 lockres->l_unlock_action = OCFS2_UNLOCK_CANCEL_CONVERT;
2459
2460 mlog_bug_on_msg(!(lockres->l_flags & OCFS2_LOCK_BUSY),
2461 "lock %s, invalid flags: 0x%lx\n",
2462 lockres->l_name, lockres->l_flags);
2463
2464 return 1;
2465}
2466
2467static int ocfs2_cancel_convert(struct ocfs2_super *osb,
2468 struct ocfs2_lock_res *lockres)
2469{
2470 int ret;
2471 enum dlm_status status;
2472
2473 mlog_entry_void();
2474 mlog(0, "lock %s\n", lockres->l_name);
2475
2476 ret = 0;
2477 status = dlmunlock(osb->dlm,
2478 &lockres->l_lksb,
2479 LKM_CANCEL,
2480 lockres->l_ops->unlock_ast,
2481 lockres);
2482 if (status != DLM_NORMAL) {
2483 ocfs2_log_dlm_error("dlmunlock", status, lockres);
2484 ret = -EINVAL;
2485 ocfs2_recover_from_dlm_error(lockres, 0);
2486 }
2487
2488 mlog(0, "lock %s return from dlmunlock\n", lockres->l_name);
2489
2490 mlog_exit(ret);
2491 return ret;
2492}
2493
2494static inline int ocfs2_can_downconvert_meta_lock(struct inode *inode,
2495 struct ocfs2_lock_res *lockres,
2496 int new_level)
2497{
2498 int ret;
2499
2500 mlog_entry_void();
2501
2502 BUG_ON(new_level != LKM_NLMODE && new_level != LKM_PRMODE);
2503
2504 if (lockres->l_flags & OCFS2_LOCK_REFRESHING) {
2505 ret = 0;
2506 mlog(0, "lockres %s currently being refreshed -- backing "
2507 "off!\n", lockres->l_name);
2508 } else if (new_level == LKM_PRMODE)
2509 ret = !lockres->l_ex_holders &&
2510 ocfs2_inode_fully_checkpointed(inode);
2511 else /* Must be NLMODE we're converting to. */
2512 ret = !lockres->l_ro_holders && !lockres->l_ex_holders &&
2513 ocfs2_inode_fully_checkpointed(inode);
2514
2515 mlog_exit(ret);
2516 return ret;
2517}
2518
/*
 * Try to downconvert the metadata lock of @inode, which must currently be
 * flagged OCFS2_LOCK_BLOCKED and held at PR or EX.
 *
 * *requeue is set to 1 when the lock is busy (a cancel of the in-flight
 * convert is issued if ocfs2_prepare_cancel_convert() asks for one) or
 * when the downconvert is not possible yet; in the latter case a
 * checkpoint is started if the inode is not fully checkpointed.
 * *requeue is not written on the path that actually downconverts.
 *
 * Returns 0, or the negative value from the cancel/downconvert call.
 */
2519static int ocfs2_do_unblock_meta(struct inode *inode,
2520 int *requeue)
2521{
2522 int new_level;
2523 int set_lvb = 0;
2524 int ret = 0;
2525 struct ocfs2_lock_res *lockres = &OCFS2_I(inode)->ip_meta_lockres;
2526 unsigned long flags;
2527
2528 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
2529
2530 mlog_entry_void();
2531
2532 spin_lock_irqsave(&lockres->l_lock, flags);
2533
2534 BUG_ON(!(lockres->l_flags & OCFS2_LOCK_BLOCKED));
2535
2536 mlog(0, "l_level=%d, l_blocking=%d\n", lockres->l_level,
2537 lockres->l_blocking);
2538
2539 BUG_ON(lockres->l_level != LKM_EXMODE &&
2540 lockres->l_level != LKM_PRMODE);
2541
/* A DLM operation is still in flight: requeue, and cancel it if needed. */
2542 if (lockres->l_flags & OCFS2_LOCK_BUSY) {
2543 *requeue = 1;
2544 ret = ocfs2_prepare_cancel_convert(osb, lockres);
2545 spin_unlock_irqrestore(&lockres->l_lock, flags);
2546 if (ret) {
2547 ret = ocfs2_cancel_convert(osb, lockres);
2548 if (ret < 0)
2549 mlog_errno(ret);
2550 }
2551 goto leave;
2552 }
2553
2554 new_level = ocfs2_highest_compat_lock_level(lockres->l_blocking);
2555
2556 mlog(0, "l_level=%d, l_blocking=%d, new_level=%d\n",
2557 lockres->l_level, lockres->l_blocking, new_level);
2558
2559 if (ocfs2_can_downconvert_meta_lock(inode, lockres, new_level)) {
/* Only a lock held at EX sends its LVB along with the downconvert. */
2560 if (lockres->l_level == LKM_EXMODE)
2561 set_lvb = 1;
2562
2563 /* If the lock hasn't been refreshed yet (rare), then
2564 * our memory inode values are old and we skip
2565 * stuffing the lvb. There's no need to actually clear
2566 * out the lvb here as its value is still valid. */
2567 if (!(lockres->l_flags & OCFS2_LOCK_NEEDS_REFRESH)) {
2568 if (set_lvb)
2569 __ocfs2_stuff_meta_lvb(inode);
2570 } else
2571 mlog(0, "lockres %s: downconverting stale lock!\n",
2572 lockres->l_name);
2573
2574 mlog(0, "calling ocfs2_downconvert_lock with l_level=%d, "
2575 "l_blocking=%d, new_level=%d\n",
2576 lockres->l_level, lockres->l_blocking, new_level);
2577
/* Mark the lockres busy under the spinlock, then call the DLM unlocked. */
2578 ocfs2_prepare_downconvert(lockres, new_level);
2579 spin_unlock_irqrestore(&lockres->l_lock, flags);
2580 ret = ocfs2_downconvert_lock(osb, lockres, new_level, set_lvb);
2581 goto leave;
2582 }
/* Cannot downconvert yet: kick a checkpoint if one is needed and retry later. */
2583 if (!ocfs2_inode_fully_checkpointed(inode))
2584 ocfs2_start_checkpoint(osb);
2585
2586 *requeue = 1;
2587 spin_unlock_irqrestore(&lockres->l_lock, flags);
2588 ret = 0;
2589leave:
2590 mlog_exit(ret);
2591 return ret;
2592}
2593
/*
 * Common downconvert path for a lockres flagged OCFS2_LOCK_BLOCKED.
 *
 * *requeue is set to 1 when the lock is busy (cancelling the in-flight
 * convert if ocfs2_prepare_cancel_convert() asks for it) or when
 * incompatible holders remain, and to 0 once the downconvert is issued.
 * @worker, if non-NULL, is called without the spinlock held before the
 * downconvert; if l_blocking changed meanwhile all checks are redone.
 *
 * Returns 0, or the negative value from the cancel/downconvert call.
 */
2594static int ocfs2_generic_unblock_lock(struct ocfs2_super *osb,
2595 struct ocfs2_lock_res *lockres,
2596 int *requeue,
2597 ocfs2_convert_worker_t *worker)
2598{
2599 unsigned long flags;
2600 int blocking;
2601 int new_level;
2602 int ret = 0;
2603
2604 mlog_entry_void();
2605
2606 spin_lock_irqsave(&lockres->l_lock, flags);
2607
2608 BUG_ON(!(lockres->l_flags & OCFS2_LOCK_BLOCKED));
2609
/* Re-entered with the spinlock held whenever the worker raced with a change. */
2610recheck:
2611 if (lockres->l_flags & OCFS2_LOCK_BUSY) {
2612 *requeue = 1;
2613 ret = ocfs2_prepare_cancel_convert(osb, lockres);
2614 spin_unlock_irqrestore(&lockres->l_lock, flags);
2615 if (ret) {
2616 ret = ocfs2_cancel_convert(osb, lockres);
2617 if (ret < 0)
2618 mlog_errno(ret);
2619 }
2620 goto leave;
2621 }
2622
2623 /* if we're blocking an exclusive and we have *any* holders,
2624 * then requeue. */
2625 if ((lockres->l_blocking == LKM_EXMODE)
2626 && (lockres->l_ex_holders || lockres->l_ro_holders)) {
2627 spin_unlock_irqrestore(&lockres->l_lock, flags);
2628 *requeue = 1;
2629 ret = 0;
2630 goto leave;
2631 }
2632
2633 /* If it's a PR we're blocking, then only
2634 * requeue if we've got any EX holders */
2635 if (lockres->l_blocking == LKM_PRMODE &&
2636 lockres->l_ex_holders) {
2637 spin_unlock_irqrestore(&lockres->l_lock, flags);
2638 *requeue = 1;
2639 ret = 0;
2640 goto leave;
2641 }
2642
2643 /* If we get here, then we know that there are no more
2644 * incompatible holders (and anyone asking for an incompatible
2645 * lock is blocked). We can now downconvert the lock */
2646 if (!worker)
2647 goto downconvert;
2648
2649 /* Some lockres types want to do a bit of work before
2650 * downconverting a lock. Allow that here. The worker function
2651 * may sleep, so we save off a copy of what we're blocking as
2652 * it may change while we're not holding the spin lock. */
2653 blocking = lockres->l_blocking;
2654 spin_unlock_irqrestore(&lockres->l_lock, flags);
2655
2656 worker(lockres, blocking);
2657
2658 spin_lock_irqsave(&lockres->l_lock, flags);
2659 if (blocking != lockres->l_blocking) {
2660 /* If this changed underneath us, then we can't drop
2661 * it just yet. */
2662 goto recheck;
2663 }
2664
2665downconvert:
2666 *requeue = 0;
2667 new_level = ocfs2_highest_compat_lock_level(lockres->l_blocking);
2668
/* Mark the lockres busy under the spinlock, then call the DLM unlocked. */
2669 ocfs2_prepare_downconvert(lockres, new_level);
2670 spin_unlock_irqrestore(&lockres->l_lock, flags);
2671 ret = ocfs2_downconvert_lock(osb, lockres, new_level, 0);
2672leave:
2673 mlog_exit(ret);
2674 return ret;
2675}
2676
2677static void ocfs2_data_convert_worker(struct ocfs2_lock_res *lockres,
2678 int blocking)
2679{
2680 struct inode *inode;
2681 struct address_space *mapping;
2682
2683 mlog_entry_void();
2684
2685 inode = ocfs2_lock_res_inode(lockres);
2686 mapping = inode->i_mapping;
2687
2688 if (filemap_fdatawrite(mapping)) {
2689 mlog(ML_ERROR, "Could not sync inode %"MLFu64" for downconvert!",
2690 OCFS2_I(inode)->ip_blkno);
2691 }
2692 sync_mapping_buffers(mapping);
2693 if (blocking == LKM_EXMODE) {
2694 truncate_inode_pages(mapping, 0);
2695 unmap_mapping_range(mapping, 0, 0, 0);
2696 } else {
2697 /* We only need to wait on the I/O if we're not also
2698 * truncating pages because truncate_inode_pages waits
2699 * for us above. We don't truncate pages if we're
2700 * blocking anything < EXMODE because we want to keep
2701 * them around in that case. */
2702 filemap_fdatawait(mapping);
2703 }
2704
2705 mlog_exit_void();
2706}
2707
2708int ocfs2_unblock_data(struct ocfs2_lock_res *lockres,
2709 int *requeue)
2710{
2711 int status;
2712 struct inode *inode;
2713 struct ocfs2_super *osb;
2714
2715 mlog_entry_void();
2716
2717 inode = ocfs2_lock_res_inode(lockres);
2718 osb = OCFS2_SB(inode->i_sb);
2719
2720 mlog(0, "unblock inode %"MLFu64"\n", OCFS2_I(inode)->ip_blkno);
2721
2722 status = ocfs2_generic_unblock_lock(osb,
2723 lockres,
2724 requeue,
2725 ocfs2_data_convert_worker);
2726 if (status < 0)
2727 mlog_errno(status);
2728
2729 mlog(0, "inode %"MLFu64", requeue = %d\n",
2730 OCFS2_I(inode)->ip_blkno, *requeue);
2731
2732 mlog_exit(status);
2733 return status;
2734}
2735
2736static int ocfs2_unblock_inode_lock(struct ocfs2_lock_res *lockres,
2737 int *requeue)
2738{
2739 int status;
2740 struct inode *inode;
2741
2742 mlog_entry_void();
2743
2744 mlog(0, "Unblock lockres %s\n", lockres->l_name);
2745
2746 inode = ocfs2_lock_res_inode(lockres);
2747
2748 status = ocfs2_generic_unblock_lock(OCFS2_SB(inode->i_sb),
2749 lockres,
2750 requeue,
2751 NULL);
2752 if (status < 0)
2753 mlog_errno(status);
2754
2755 mlog_exit(status);
2756 return status;
2757}
2758
2759
2760int ocfs2_unblock_meta(struct ocfs2_lock_res *lockres,
2761 int *requeue)
2762{
2763 int status;
2764 struct inode *inode;
2765
2766 mlog_entry_void();
2767
2768 inode = ocfs2_lock_res_inode(lockres);
2769
2770 mlog(0, "unblock inode %"MLFu64"\n", OCFS2_I(inode)->ip_blkno);
2771
2772 status = ocfs2_do_unblock_meta(inode, requeue);
2773 if (status < 0)
2774 mlog_errno(status);
2775
2776 mlog(0, "inode %"MLFu64", requeue = %d\n",
2777 OCFS2_I(inode)->ip_blkno, *requeue);
2778
2779 mlog_exit(status);
2780 return status;
2781}
2782
2783/* Generic unblock function for any lockres whose private data is an
2784 * ocfs2_super pointer. */
2785static int ocfs2_unblock_osb_lock(struct ocfs2_lock_res *lockres,
2786 int *requeue)
2787{
2788 int status;
2789 struct ocfs2_super *osb;
2790
2791 mlog_entry_void();
2792
2793 mlog(0, "Unblock lockres %s\n", lockres->l_name);
2794
2795 osb = ocfs2_lock_res_super(lockres);
2796
2797 status = ocfs2_generic_unblock_lock(osb,
2798 lockres,
2799 requeue,
2800 NULL);
2801 if (status < 0)
2802 mlog_errno(status);
2803
2804 mlog_exit(status);
2805 return status;
2806}
2807
2808void ocfs2_process_blocked_lock(struct ocfs2_super *osb,
2809 struct ocfs2_lock_res *lockres)
2810{
2811 int status;
2812 int requeue = 0;
2813 unsigned long flags;
2814
2815 /* Our reference to the lockres in this function can be
2816 * considered valid until we remove the OCFS2_LOCK_QUEUED
2817 * flag. */
2818
2819 mlog_entry_void();
2820
2821 BUG_ON(!lockres);
2822 BUG_ON(!lockres->l_ops);
2823 BUG_ON(!lockres->l_ops->unblock);
2824
2825 mlog(0, "lockres %s blocked.\n", lockres->l_name);
2826
2827 /* Detect whether a lock has been marked as going away while
2828 * the vote thread was processing other things. A lock can
2829 * still be marked with OCFS2_LOCK_FREEING after this check,
2830 * but short circuiting here will still save us some
2831 * performance. */
2832 spin_lock_irqsave(&lockres->l_lock, flags);
2833 if (lockres->l_flags & OCFS2_LOCK_FREEING)
2834 goto unqueue;
2835 spin_unlock_irqrestore(&lockres->l_lock, flags);
2836
2837 status = lockres->l_ops->unblock(lockres, &requeue);
2838 if (status < 0)
2839 mlog_errno(status);
2840
2841 spin_lock_irqsave(&lockres->l_lock, flags);
2842unqueue:
2843 if (lockres->l_flags & OCFS2_LOCK_FREEING || !requeue) {
2844 lockres_clear_flags(lockres, OCFS2_LOCK_QUEUED);
2845 } else
2846 ocfs2_schedule_blocked_lock(osb, lockres);
2847
2848 mlog(0, "lockres %s, requeue = %s.\n", lockres->l_name,
2849 requeue ? "yes" : "no");
2850 spin_unlock_irqrestore(&lockres->l_lock, flags);
2851
2852 mlog_exit_void();
2853}
2854
2855static void ocfs2_schedule_blocked_lock(struct ocfs2_super *osb,
2856 struct ocfs2_lock_res *lockres)
2857{
2858 mlog_entry_void();
2859
2860 assert_spin_locked(&lockres->l_lock);
2861
2862 if (lockres->l_flags & OCFS2_LOCK_FREEING) {
2863 /* Do not schedule a lock for downconvert when it's on
2864 * the way to destruction - any nodes wanting access
2865 * to the resource will get it soon. */
2866 mlog(0, "Lockres %s won't be scheduled: flags 0x%lx\n",
2867 lockres->l_name, lockres->l_flags);
2868 return;
2869 }
2870
2871 lockres_or_flags(lockres, OCFS2_LOCK_QUEUED);
2872
2873 spin_lock(&osb->vote_task_lock);
2874 if (list_empty(&lockres->l_blocked_list)) {
2875 list_add_tail(&lockres->l_blocked_list,
2876 &osb->blocked_lock_list);
2877 osb->blocked_lock_count++;
2878 }
2879 spin_unlock(&osb->vote_task_lock);
2880
2881 mlog_exit_void();
2882}
2883
2884/* This aids in debugging situations where a bad LVB might be involved. */
2885void ocfs2_dump_meta_lvb_info(u64 level,
2886 const char *function,
2887 unsigned int line,
2888 struct ocfs2_lock_res *lockres)
2889{
2890 struct ocfs2_meta_lvb *lvb = (struct ocfs2_meta_lvb *) lockres->l_lksb.lvb;
2891
2892 mlog(level, "LVB information for %s (called from %s:%u):\n",
2893 lockres->l_name, function, line);
2894 mlog(level, "version: %u, clusters: %u\n",
2895 be32_to_cpu(lvb->lvb_version), be32_to_cpu(lvb->lvb_iclusters));
2896 mlog(level, "size: %"MLFu64", uid %u, gid %u, mode 0x%x\n",
2897 be64_to_cpu(lvb->lvb_isize), be32_to_cpu(lvb->lvb_iuid),
2898 be32_to_cpu(lvb->lvb_igid), be16_to_cpu(lvb->lvb_imode));
2899 mlog(level, "nlink %u, atime_packed 0x%"MLFx64", "
2900 "ctime_packed 0x%"MLFx64", mtime_packed 0x%"MLFx64"\n",
2901 be16_to_cpu(lvb->lvb_inlink), be64_to_cpu(lvb->lvb_iatime_packed),
2902 be64_to_cpu(lvb->lvb_ictime_packed),
2903 be64_to_cpu(lvb->lvb_imtime_packed));
2904}
diff --git a/fs/ocfs2/dlmglue.h b/fs/ocfs2/dlmglue.h
new file mode 100644
index 000000000000..8f2d1db2d9ea
--- /dev/null
+++ b/fs/ocfs2/dlmglue.h
@@ -0,0 +1,111 @@
1/* -*- mode: c; c-basic-offset: 8; -*-
2 * vim: noexpandtab sw=8 ts=8 sts=0:
3 *
4 * dlmglue.h
5 *
6 * Public interface of the ocfs2 DLM glue layer: cluster lock prototypes and the metadata LVB layout.
7 *
8 * Copyright (C) 2002, 2004 Oracle. All rights reserved.
9 *
10 * This program is free software; you can redistribute it and/or
11 * modify it under the terms of the GNU General Public
12 * License as published by the Free Software Foundation; either
13 * version 2 of the License, or (at your option) any later version.
14 *
15 * This program is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
18 * General Public License for more details.
19 *
20 * You should have received a copy of the GNU General Public
21 * License along with this program; if not, write to the
22 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
23 * Boston, MA 021110-1307, USA.
24 */
25
26
27#ifndef DLMGLUE_H
28#define DLMGLUE_H
29
30#define OCFS2_LVB_VERSION 2
31
/* Inode metadata carried in the LVB of a metadata lock. Every field is
 * stored big-endian; ocfs2_dump_meta_lvb_info() shows how each one is
 * converted back with the matching be*_to_cpu() helper. */
32struct ocfs2_meta_lvb {
33 __be32 lvb_version;
34 __be32 lvb_iclusters;
35 __be32 lvb_iuid;
36 __be32 lvb_igid;
37 __be64 lvb_iatime_packed;
38 __be64 lvb_ictime_packed;
39 __be64 lvb_imtime_packed;
40 __be64 lvb_isize;
41 __be16 lvb_imode;
42 __be16 lvb_inlink;
43 __be32 lvb_reserved[3];
44};
45
46/* ocfs2_meta_lock_full() and ocfs2_data_lock_full() 'arg_flags' flags */
47/* don't wait on recovery. */
48#define OCFS2_META_LOCK_RECOVERY (0x01)
49/* Instruct the dlm not to queue ourselves on the other node. */
50#define OCFS2_META_LOCK_NOQUEUE (0x02)
51/* don't block waiting for the vote thread, instead return -EAGAIN */
52#define OCFS2_LOCK_NONBLOCK (0x04)
53
54int ocfs2_dlm_init(struct ocfs2_super *osb);
55void ocfs2_dlm_shutdown(struct ocfs2_super *osb);
56void ocfs2_lock_res_init_once(struct ocfs2_lock_res *res);
57void ocfs2_inode_lock_res_init(struct ocfs2_lock_res *res,
58 enum ocfs2_lock_type type,
59 struct inode *inode);
60void ocfs2_lock_res_free(struct ocfs2_lock_res *res);
61int ocfs2_create_new_inode_locks(struct inode *inode);
62int ocfs2_drop_inode_locks(struct inode *inode);
63int ocfs2_data_lock_full(struct inode *inode,
64 int write,
65 int arg_flags);
66#define ocfs2_data_lock(inode, write) ocfs2_data_lock_full(inode, write, 0)
67int ocfs2_data_lock_with_page(struct inode *inode,
68 int write,
69 struct page *page);
70void ocfs2_data_unlock(struct inode *inode,
71 int write);
72int ocfs2_rw_lock(struct inode *inode, int write);
73void ocfs2_rw_unlock(struct inode *inode, int write);
74int ocfs2_meta_lock_full(struct inode *inode,
75 struct ocfs2_journal_handle *handle,
76 struct buffer_head **ret_bh,
77 int ex,
78 int arg_flags);
79int ocfs2_meta_lock_with_page(struct inode *inode,
80 struct ocfs2_journal_handle *handle,
81 struct buffer_head **ret_bh,
82 int ex,
83 struct page *page);
84/* 99% of the time we don't want to supply any additional flags --
85 * those are for very specific cases only. */
86#define ocfs2_meta_lock(i, h, b, e) ocfs2_meta_lock_full(i, h, b, e, 0)
87void ocfs2_meta_unlock(struct inode *inode,
88 int ex);
89int ocfs2_super_lock(struct ocfs2_super *osb,
90 int ex);
91void ocfs2_super_unlock(struct ocfs2_super *osb,
92 int ex);
93int ocfs2_rename_lock(struct ocfs2_super *osb);
94void ocfs2_rename_unlock(struct ocfs2_super *osb);
95void ocfs2_mark_lockres_freeing(struct ocfs2_lock_res *lockres);
96
97/* for the vote thread */
98void ocfs2_process_blocked_lock(struct ocfs2_super *osb,
99 struct ocfs2_lock_res *lockres);
100
101struct ocfs2_dlm_debug *ocfs2_new_dlm_debug(void);
102void ocfs2_put_dlm_debug(struct ocfs2_dlm_debug *dlm_debug);
103
104/* aids in debugging and tracking lvbs */
105void ocfs2_dump_meta_lvb_info(u64 level,
106 const char *function,
107 unsigned int line,
108 struct ocfs2_lock_res *lockres);
109#define mlog_meta_lvb(__level, __lockres) ocfs2_dump_meta_lvb_info(__level, __PRETTY_FUNCTION__, __LINE__, __lockres)
110
111#endif /* DLMGLUE_H */
diff --git a/fs/ocfs2/endian.h b/fs/ocfs2/endian.h
new file mode 100644
index 000000000000..f226b2207628
--- /dev/null
+++ b/fs/ocfs2/endian.h
@@ -0,0 +1,45 @@
1/* -*- mode: c; c-basic-offset: 8; -*-
2 * vim: noexpandtab sw=8 ts=8 sts=0:
3 *
4 * Copyright (C) 2005 Oracle. All rights reserved.
5 *
6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public
8 * License as published by the Free Software Foundation; either
9 * version 2 of the License, or (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 * General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public
17 * License along with this program; if not, write to the
18 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
19 * Boston, MA 021110-1307, USA.
20 */
21
22#ifndef OCFS2_ENDIAN_H
23#define OCFS2_ENDIAN_H
24
25static inline void le16_add_cpu(__le16 *var, u16 val)
26{
27 *var = cpu_to_le16(le16_to_cpu(*var) + val);
28}
29
30static inline void le32_add_cpu(__le32 *var, u32 val)
31{
32 *var = cpu_to_le32(le32_to_cpu(*var) + val);
33}
34
35static inline void le32_and_cpu(__le32 *var, u32 val)
36{
37 *var = cpu_to_le32(le32_to_cpu(*var) & val);
38}
39
40static inline void be32_add_cpu(__be32 *var, u32 val)
41{
42 *var = cpu_to_be32(be32_to_cpu(*var) + val);
43}
44
45#endif /* OCFS2_ENDIAN_H */
diff --git a/fs/ocfs2/export.c b/fs/ocfs2/export.c
new file mode 100644
index 000000000000..5810160d92a8
--- /dev/null
+++ b/fs/ocfs2/export.c
@@ -0,0 +1,248 @@
1/* -*- mode: c; c-basic-offset: 8; -*-
2 * vim: noexpandtab sw=8 ts=8 sts=0:
3 *
4 * export.c
5 *
6 * Functions to facilitate NFS exporting
7 *
8 * Copyright (C) 2002, 2005 Oracle. All rights reserved.
9 *
10 * This program is free software; you can redistribute it and/or
11 * modify it under the terms of the GNU General Public
12 * License as published by the Free Software Foundation; either
13 * version 2 of the License, or (at your option) any later version.
14 *
15 * This program is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
18 * General Public License for more details.
19 *
20 * You should have received a copy of the GNU General Public
21 * License along with this program; if not, write to the
22 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
23 * Boston, MA 021110-1307, USA.
24 */
25
26#include <linux/fs.h>
27#include <linux/types.h>
28
29#define MLOG_MASK_PREFIX ML_EXPORT
30#include <cluster/masklog.h>
31
32#include "ocfs2.h"
33
34#include "dir.h"
35#include "dlmglue.h"
36#include "export.h"
37#include "inode.h"
38
39#include "buffer_head_io.h"
40
/* Decoded (block number, generation) pair identifying one inode in an NFS
 * file handle. ocfs2_decode_fh() fills it in and ocfs2_get_dentry()
 * consumes it; a zero ih_blkno is rejected there as stale. */
41struct ocfs2_inode_handle
42{
43 u64 ih_blkno;
44 u32 ih_generation;
45};
46
47static struct dentry *ocfs2_get_dentry(struct super_block *sb, void *vobjp)
48{
49 struct ocfs2_inode_handle *handle = vobjp;
50 struct inode *inode;
51 struct dentry *result;
52
53 mlog_entry("(0x%p, 0x%p)\n", sb, handle);
54
55 if (handle->ih_blkno == 0) {
56 mlog_errno(-ESTALE);
57 return ERR_PTR(-ESTALE);
58 }
59
60 inode = ocfs2_iget(OCFS2_SB(sb), handle->ih_blkno);
61
62 if (IS_ERR(inode)) {
63 mlog_errno(PTR_ERR(inode));
64 return (void *)inode;
65 }
66
67 if (handle->ih_generation != inode->i_generation) {
68 iput(inode);
69 mlog_errno(-ESTALE);
70 return ERR_PTR(-ESTALE);
71 }
72
73 result = d_alloc_anon(inode);
74
75 if (!result) {
76 iput(inode);
77 mlog_errno(-ENOMEM);
78 return ERR_PTR(-ENOMEM);
79 }
80
81 mlog_exit_ptr(result);
82 return result;
83}
84
85static struct dentry *ocfs2_get_parent(struct dentry *child)
86{
87 int status;
88 u64 blkno;
89 struct dentry *parent;
90 struct inode *inode;
91 struct inode *dir = child->d_inode;
92 struct buffer_head *dirent_bh = NULL;
93 struct ocfs2_dir_entry *dirent;
94
95 mlog_entry("(0x%p, '%.*s')\n", child,
96 child->d_name.len, child->d_name.name);
97
98 mlog(0, "find parent of directory %"MLFu64"\n",
99 OCFS2_I(dir)->ip_blkno);
100
101 status = ocfs2_meta_lock(dir, NULL, NULL, 0);
102 if (status < 0) {
103 if (status != -ENOENT)
104 mlog_errno(status);
105 parent = ERR_PTR(status);
106 goto bail;
107 }
108
109 status = ocfs2_find_files_on_disk("..", 2, &blkno, dir, &dirent_bh,
110 &dirent);
111 if (status < 0) {
112 parent = ERR_PTR(-ENOENT);
113 goto bail_unlock;
114 }
115
116 inode = ocfs2_iget(OCFS2_SB(dir->i_sb), blkno);
117 if (IS_ERR(inode)) {
118 mlog(ML_ERROR, "Unable to create inode %"MLFu64"\n", blkno);
119 parent = ERR_PTR(-EACCES);
120 goto bail_unlock;
121 }
122
123 parent = d_alloc_anon(inode);
124 if (!parent) {
125 iput(inode);
126 parent = ERR_PTR(-ENOMEM);
127 }
128
129bail_unlock:
130 ocfs2_meta_unlock(dir, 0);
131
132 if (dirent_bh)
133 brelse(dirent_bh);
134
135bail:
136 mlog_exit_ptr(parent);
137
138 return parent;
139}
140
141static int ocfs2_encode_fh(struct dentry *dentry, __be32 *fh, int *max_len,
142 int connectable)
143{
144 struct inode *inode = dentry->d_inode;
145 int len = *max_len;
146 int type = 1;
147 u64 blkno;
148 u32 generation;
149
150 mlog_entry("(0x%p, '%.*s', 0x%p, %d, %d)\n", dentry,
151 dentry->d_name.len, dentry->d_name.name,
152 fh, len, connectable);
153
154 if (len < 3 || (connectable && len < 6)) {
155 mlog(ML_ERROR, "fh buffer is too small for encoding\n");
156 type = 255;
157 goto bail;
158 }
159
160 blkno = OCFS2_I(inode)->ip_blkno;
161 generation = inode->i_generation;
162
163 mlog(0, "Encoding fh: blkno: %"MLFu64", generation: %u\n",
164 blkno, generation);
165
166 len = 3;
167 fh[0] = cpu_to_le32((u32)(blkno >> 32));
168 fh[1] = cpu_to_le32((u32)(blkno & 0xffffffff));
169 fh[2] = cpu_to_le32(generation);
170
171 if (connectable && !S_ISDIR(inode->i_mode)) {
172 struct inode *parent;
173
174 spin_lock(&dentry->d_lock);
175
176 parent = dentry->d_parent->d_inode;
177 blkno = OCFS2_I(parent)->ip_blkno;
178 generation = parent->i_generation;
179
180 fh[3] = cpu_to_le32((u32)(blkno >> 32));
181 fh[4] = cpu_to_le32((u32)(blkno & 0xffffffff));
182 fh[5] = cpu_to_le32(generation);
183
184 spin_unlock(&dentry->d_lock);
185
186 len = 6;
187 type = 2;
188
189 mlog(0, "Encoding parent: blkno: %"MLFu64", generation: %u\n",
190 blkno, generation);
191 }
192
193 *max_len = len;
194
195bail:
196 mlog_exit(type);
197 return type;
198}
199
200static struct dentry *ocfs2_decode_fh(struct super_block *sb, __be32 *fh,
201 int fh_len, int fileid_type,
202 int (*acceptable)(void *context,
203 struct dentry *de),
204 void *context)
205{
206 struct ocfs2_inode_handle handle, parent;
207 struct dentry *ret = NULL;
208
209 mlog_entry("(0x%p, 0x%p, %d, %d, 0x%p, 0x%p)\n",
210 sb, fh, fh_len, fileid_type, acceptable, context);
211
212 if (fh_len < 3 || fileid_type > 2)
213 goto bail;
214
215 if (fileid_type == 2) {
216 if (fh_len < 6)
217 goto bail;
218
219 parent.ih_blkno = (u64)le32_to_cpu(fh[3]) << 32;
220 parent.ih_blkno |= (u64)le32_to_cpu(fh[4]);
221 parent.ih_generation = le32_to_cpu(fh[5]);
222
223 mlog(0, "Decoding parent: blkno: %"MLFu64", generation: %u\n",
224 parent.ih_blkno, parent.ih_generation);
225 }
226
227 handle.ih_blkno = (u64)le32_to_cpu(fh[0]) << 32;
228 handle.ih_blkno |= (u64)le32_to_cpu(fh[1]);
229 handle.ih_generation = le32_to_cpu(fh[2]);
230
231 mlog(0, "Encoding fh: blkno: %"MLFu64", generation: %u\n",
232 handle.ih_blkno, handle.ih_generation);
233
234 ret = ocfs2_export_ops.find_exported_dentry(sb, &handle, &parent,
235 acceptable, context);
236
237bail:
238 mlog_exit_ptr(ret);
239 return ret;
240}
241
/* NFS export method table for ocfs2. find_exported_dentry is not set
 * here, although ocfs2_decode_fh() calls it through this table. */
242struct export_operations ocfs2_export_ops = {
243 .decode_fh = ocfs2_decode_fh,
244 .encode_fh = ocfs2_encode_fh,
245
246 .get_parent = ocfs2_get_parent,
247 .get_dentry = ocfs2_get_dentry,
248};
diff --git a/fs/ocfs2/export.h b/fs/ocfs2/export.h
new file mode 100644
index 000000000000..5b77ee7866ef
--- /dev/null
+++ b/fs/ocfs2/export.h
@@ -0,0 +1,31 @@
1/* -*- mode: c; c-basic-offset: 8; -*-
2 * vim: noexpandtab sw=8 ts=8 sts=0:
3 *
4 * export.h
5 *
6 * Function prototypes
7 *
8 * Copyright (C) 2002, 2005 Oracle. All rights reserved.
9 *
10 * This program is free software; you can redistribute it and/or
11 * modify it under the terms of the GNU General Public
12 * License as published by the Free Software Foundation; either
13 * version 2 of the License, or (at your option) any later version.
14 *
15 * This program is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
18 * General Public License for more details.
19 *
20 * You should have received a copy of the GNU General Public
21 * License along with this program; if not, write to the
22 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
23 * Boston, MA 02111-1307, USA.
24 */
25
26#ifndef OCFS2_EXPORT_H
27#define OCFS2_EXPORT_H
28
/* The OCFS2 export operations table (decode_fh, encode_fh, get_parent, get_dentry). */
29extern struct export_operations ocfs2_export_ops;
30
31#endif /* OCFS2_EXPORT_H */
diff --git a/fs/ocfs2/extent_map.c b/fs/ocfs2/extent_map.c
new file mode 100644
index 000000000000..f2fb40cd296a
--- /dev/null
+++ b/fs/ocfs2/extent_map.c
@@ -0,0 +1,994 @@
1/* -*- mode: c; c-basic-offset: 8; -*-
2 * vim: noexpandtab sw=8 ts=8 sts=0:
3 *
4 * extent_map.c
5 *
6 * In-memory extent map for OCFS2. Man, this code was prettier in
7 * the library.
8 *
9 * Copyright (C) 2004 Oracle. All rights reserved.
10 *
11 * This program is free software; you can redistribute it and/or
12 * modify it under the terms of the GNU General Public
13 * License, version 2, as published by the Free Software Foundation.
14 *
15 * This program is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
18 * General Public License for more details.
19 *
20 * You should have received a copy of the GNU General Public
21 * License along with this program; if not, write to the
22 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
23 * Boston, MA 02111-1307, USA.
24 */
25
26#include <linux/fs.h>
27#include <linux/init.h>
28#include <linux/types.h>
29#include <linux/slab.h>
30#include <linux/rbtree.h>
31
32#define MLOG_MASK_PREFIX ML_EXTENT_MAP
33#include <cluster/masklog.h>
34
35#include "ocfs2.h"
36
37#include "extent_map.h"
38#include "inode.h"
39#include "super.h"
40
41#include "buffer_head_io.h"
42
43
44/*
45 * SUCK SUCK SUCK
46 * Our headers are so bad that struct ocfs2_extent_map is in ocfs.h
47 */
48
/*
 * One cached extent record in an inode's in-memory extent map.
 * Entries are allocated from ocfs2_em_ent_cachep.
 */
49struct ocfs2_extent_map_entry {
/* Links the entry into the em_extents rbtree of the owning map. */
50 struct rb_node e_node;
/* Tree depth of the extent list the record was read from; 0 means leaf. */
51 int e_tree_depth;
/* Copy of the extent record; fields are read with le32/le64_to_cpu(). */
52 struct ocfs2_extent_rec e_rec;
53};
54
/*
 * Scratch state shared between ocfs2_extent_map_insert() and
 * ocfs2_extent_map_try_insert() across -EAGAIN retries.  Whatever is
 * still non-NULL when the insert finishes is freed by
 * ocfs2_extent_map_insert().
 */
55struct ocfs2_em_insert_context {
/* Set by try_insert when the existing entry starts before the new record. */
56 int need_left;
/* Set by try_insert when the existing entry ends after the new record. */
57 int need_right;
/* Entry being inserted; cleared once it is linked into the tree. */
58 struct ocfs2_extent_map_entry *new_ent;
/* Entry erased from the tree to make room; left here to be freed. */
59 struct ocfs2_extent_map_entry *old_ent;
/* Preallocated pieces for the uncovered left/right parts of old_ent. */
60 struct ocfs2_extent_map_entry *left_ent;
61 struct ocfs2_extent_map_entry *right_ent;
62};
63
/*
 * Slab cache for struct ocfs2_extent_map_entry.  Created in
 * init_ocfs2_extent_maps() and destroyed in exit_ocfs2_extent_maps().
 */
64static kmem_cache_t *ocfs2_em_ent_cachep = NULL;
65
66
/* Forward declarations of the static helpers defined later in this file. */
67static struct ocfs2_extent_map_entry *
68ocfs2_extent_map_lookup(struct ocfs2_extent_map *em,
69 u32 cpos, u32 clusters,
70 struct rb_node ***ret_p,
71 struct rb_node **ret_parent);
72static int ocfs2_extent_map_insert(struct inode *inode,
73 struct ocfs2_extent_rec *rec,
74 int tree_depth);
75static int ocfs2_extent_map_insert_entry(struct ocfs2_extent_map *em,
76 struct ocfs2_extent_map_entry *ent);
77static int ocfs2_extent_map_find_leaf(struct inode *inode,
78 u32 cpos, u32 clusters,
79 struct ocfs2_extent_list *el);
80static int ocfs2_extent_map_lookup_read(struct inode *inode,
81 u32 cpos, u32 clusters,
82 struct ocfs2_extent_map_entry **ret_ent);
83static int ocfs2_extent_map_try_insert(struct inode *inode,
84 struct ocfs2_extent_rec *rec,
85 int tree_depth,
86 struct ocfs2_em_insert_context *ctxt);
87
88/* returns 1 only if the rec contains all the given clusters -- that is that
89 * rec's cpos is <= the cluster cpos and that the rec endpoint (cpos +
90 * clusters) is >= the argument's endpoint */
91static int ocfs2_extent_rec_contains_clusters(struct ocfs2_extent_rec *rec,
92 u32 cpos, u32 clusters)
93{
94 if (le32_to_cpu(rec->e_cpos) > cpos)
95 return 0;
96 if (cpos + clusters > le32_to_cpu(rec->e_cpos) +
97 le32_to_cpu(rec->e_clusters))
98 return 0;
99 return 1;
100}
101
102
103/*
104 * Find an entry in the tree that intersects the region passed in.
105 * Note that this will find straddled intervals, it is up to the
106 * callers to enforce any boundary conditions.
107 *
108 * Callers must hold ip_lock. This lookup is not guaranteed to return
109 * a tree_depth 0 match, and as such can race inserts if the lock
110 * were not held.
111 *
112 * The rb_node garbage lets insertion share the search. Trivial
113 * callers pass NULL.
114 */
115static struct ocfs2_extent_map_entry *
116ocfs2_extent_map_lookup(struct ocfs2_extent_map *em,
117 u32 cpos, u32 clusters,
118 struct rb_node ***ret_p,
119 struct rb_node **ret_parent)
120{
121 struct rb_node **p = &em->em_extents.rb_node;
122 struct rb_node *parent = NULL;
123 struct ocfs2_extent_map_entry *ent = NULL;
124
125 while (*p)
126 {
127 parent = *p;
128 ent = rb_entry(parent, struct ocfs2_extent_map_entry,
129 e_node);
130 if ((cpos + clusters) <= le32_to_cpu(ent->e_rec.e_cpos)) {
131 p = &(*p)->rb_left;
132 ent = NULL;
133 } else if (cpos >= (le32_to_cpu(ent->e_rec.e_cpos) +
134 le32_to_cpu(ent->e_rec.e_clusters))) {
135 p = &(*p)->rb_right;
136 ent = NULL;
137 } else
138 break;
139 }
140
141 if (ret_p != NULL)
142 *ret_p = p;
143 if (ret_parent != NULL)
144 *ret_parent = parent;
145 return ent;
146}
147
148/*
149 * Find the leaf containing the interval we want. While we're on our
150 * way down the tree, fill in every record we see at any depth, because
151 * we might want it later.
152 *
153 * Note that this code is run without ip_lock. That's because it
154 * sleeps while reading. If someone is also filling the extent list at
155 * the same time we are, we might have to restart.
156 */
157static int ocfs2_extent_map_find_leaf(struct inode *inode,
158 u32 cpos, u32 clusters,
159 struct ocfs2_extent_list *el)
160{
161 int i, ret;
162 struct buffer_head *eb_bh = NULL;
163 u64 blkno;
164 u32 rec_end;
165 struct ocfs2_extent_block *eb;
166 struct ocfs2_extent_rec *rec;
167
168 /*
169 * The bh data containing the el cannot change here, because
170 * we hold alloc_sem. So we can do this without other
171 * locks.
172 */
173 while (el->l_tree_depth)
174 {
175 blkno = 0;
176 for (i = 0; i < le16_to_cpu(el->l_next_free_rec); i++) {
177 rec = &el->l_recs[i];
178 rec_end = (le32_to_cpu(rec->e_cpos) +
179 le32_to_cpu(rec->e_clusters));
180
181 ret = -EBADR;
182 if (rec_end > OCFS2_I(inode)->ip_clusters) {
183 mlog_errno(ret);
184 goto out_free;
185 }
186
187 if (rec_end <= cpos) {
188 ret = ocfs2_extent_map_insert(inode, rec,
189 le16_to_cpu(el->l_tree_depth));
190 if (ret && (ret != -EEXIST)) {
191 mlog_errno(ret);
192 goto out_free;
193 }
194 continue;
195 }
196 if ((cpos + clusters) <= le32_to_cpu(rec->e_cpos)) {
197 ret = ocfs2_extent_map_insert(inode, rec,
198 le16_to_cpu(el->l_tree_depth));
199 if (ret && (ret != -EEXIST)) {
200 mlog_errno(ret);
201 goto out_free;
202 }
203 continue;
204 }
205
206 /*
207 * We've found a record that matches our
208 * interval. We don't insert it because we're
209 * about to traverse it.
210 */
211
212 /* Check to see if we're stradling */
213 ret = -ESRCH;
214 if (!ocfs2_extent_rec_contains_clusters(rec,
215 cpos,
216 clusters)) {
217 mlog_errno(ret);
218 goto out_free;
219 }
220
221 /*
222 * If we've already found a record, the el has
223 * two records covering the same interval.
224 * EEEK!
225 */
226 ret = -EBADR;
227 if (blkno) {
228 mlog_errno(ret);
229 goto out_free;
230 }
231
232 blkno = le64_to_cpu(rec->e_blkno);
233 }
234
235 /*
236 * We don't support holes, and we're still up
237 * in the branches, so we'd better have found someone
238 */
239 ret = -EBADR;
240 if (!blkno) {
241 mlog_errno(ret);
242 goto out_free;
243 }
244
245 if (eb_bh) {
246 brelse(eb_bh);
247 eb_bh = NULL;
248 }
249 ret = ocfs2_read_block(OCFS2_SB(inode->i_sb),
250 blkno, &eb_bh, OCFS2_BH_CACHED,
251 inode);
252 if (ret) {
253 mlog_errno(ret);
254 goto out_free;
255 }
256 eb = (struct ocfs2_extent_block *)eb_bh->b_data;
257 if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) {
258 OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb, eb);
259 ret = -EIO;
260 goto out_free;
261 }
262 el = &eb->h_list;
263 }
264
265 if (el->l_tree_depth)
266 BUG();
267
268 for (i = 0; i < le16_to_cpu(el->l_next_free_rec); i++) {
269 rec = &el->l_recs[i];
270 ret = ocfs2_extent_map_insert(inode, rec,
271 le16_to_cpu(el->l_tree_depth));
272 if (ret) {
273 mlog_errno(ret);
274 goto out_free;
275 }
276 }
277
278 ret = 0;
279
280out_free:
281 if (eb_bh)
282 brelse(eb_bh);
283
284 return ret;
285}
286
287/*
288 * This lookup actually will read from disk. It has one invariant:
289 * It will never re-traverse blocks. This means that all inserts should
290 * be new regions or more granular regions (both allowed by insert).
291 */
292static int ocfs2_extent_map_lookup_read(struct inode *inode,
293 u32 cpos,
294 u32 clusters,
295 struct ocfs2_extent_map_entry **ret_ent)
296{
297 int ret;
298 u64 blkno;
299 struct ocfs2_extent_map *em = &OCFS2_I(inode)->ip_map;
300 struct ocfs2_extent_map_entry *ent;
301 struct buffer_head *bh = NULL;
302 struct ocfs2_extent_block *eb;
303 struct ocfs2_dinode *di;
304 struct ocfs2_extent_list *el;
305
306 spin_lock(&OCFS2_I(inode)->ip_lock);
307 ent = ocfs2_extent_map_lookup(em, cpos, clusters, NULL, NULL);
308 if (ent) {
309 if (!ent->e_tree_depth) {
310 spin_unlock(&OCFS2_I(inode)->ip_lock);
311 *ret_ent = ent;
312 return 0;
313 }
314 blkno = le64_to_cpu(ent->e_rec.e_blkno);
315 spin_unlock(&OCFS2_I(inode)->ip_lock);
316
317 ret = ocfs2_read_block(OCFS2_SB(inode->i_sb), blkno, &bh,
318 OCFS2_BH_CACHED, inode);
319 if (ret) {
320 mlog_errno(ret);
321 if (bh)
322 brelse(bh);
323 return ret;
324 }
325 eb = (struct ocfs2_extent_block *)bh->b_data;
326 if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) {
327 OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb, eb);
328 brelse(bh);
329 return -EIO;
330 }
331 el = &eb->h_list;
332 } else {
333 spin_unlock(&OCFS2_I(inode)->ip_lock);
334
335 ret = ocfs2_read_block(OCFS2_SB(inode->i_sb),
336 OCFS2_I(inode)->ip_blkno, &bh,
337 OCFS2_BH_CACHED, inode);
338 if (ret) {
339 mlog_errno(ret);
340 if (bh)
341 brelse(bh);
342 return ret;
343 }
344 di = (struct ocfs2_dinode *)bh->b_data;
345 if (!OCFS2_IS_VALID_DINODE(di)) {
346 brelse(bh);
347 OCFS2_RO_ON_INVALID_DINODE(inode->i_sb, di);
348 return -EIO;
349 }
350 el = &di->id2.i_list;
351 }
352
353 ret = ocfs2_extent_map_find_leaf(inode, cpos, clusters, el);
354 brelse(bh);
355 if (ret) {
356 mlog_errno(ret);
357 return ret;
358 }
359
360 ent = ocfs2_extent_map_lookup(em, cpos, clusters, NULL, NULL);
361 if (!ent) {
362 ret = -ESRCH;
363 mlog_errno(ret);
364 return ret;
365 }
366
367 if (ent->e_tree_depth)
368 BUG(); /* FIXME: Make sure this isn't a corruption */
369
370 *ret_ent = ent;
371
372 return 0;
373}
374
375/*
376 * Callers must hold ip_lock. This can insert pieces of the tree,
377 * thus racing lookup if the lock weren't held.
378 */
379static int ocfs2_extent_map_insert_entry(struct ocfs2_extent_map *em,
380 struct ocfs2_extent_map_entry *ent)
381{
382 struct rb_node **p, *parent;
383 struct ocfs2_extent_map_entry *old_ent;
384
385 old_ent = ocfs2_extent_map_lookup(em, le32_to_cpu(ent->e_rec.e_cpos),
386 le32_to_cpu(ent->e_rec.e_clusters),
387 &p, &parent);
388 if (old_ent)
389 return -EEXIST;
390
391 rb_link_node(&ent->e_node, parent, p);
392 rb_insert_color(&ent->e_node, &em->em_extents);
393
394 return 0;
395}
396
397
398/*
399 * Simple rule: on any return code other than -EAGAIN, anything left
400 * in the insert_context will be freed.
401 */
402static int ocfs2_extent_map_try_insert(struct inode *inode,
403 struct ocfs2_extent_rec *rec,
404 int tree_depth,
405 struct ocfs2_em_insert_context *ctxt)
406{
407 int ret;
408 struct ocfs2_extent_map *em = &OCFS2_I(inode)->ip_map;
409 struct ocfs2_extent_map_entry *old_ent;
410
411 ctxt->need_left = 0;
412 ctxt->need_right = 0;
413 ctxt->old_ent = NULL;
414
415 spin_lock(&OCFS2_I(inode)->ip_lock);
416 ret = ocfs2_extent_map_insert_entry(em, ctxt->new_ent);
417 if (!ret) {
418 ctxt->new_ent = NULL;
419 goto out_unlock;
420 }
421
422 old_ent = ocfs2_extent_map_lookup(em, le32_to_cpu(rec->e_cpos),
423 le32_to_cpu(rec->e_clusters), NULL,
424 NULL);
425
426 if (!old_ent)
427 BUG();
428
429 ret = -EEXIST;
430 if (old_ent->e_tree_depth < tree_depth)
431 goto out_unlock;
432
433 if (old_ent->e_tree_depth == tree_depth) {
434 if (!memcmp(rec, &old_ent->e_rec,
435 sizeof(struct ocfs2_extent_rec)))
436 ret = 0;
437
438 /* FIXME: Should this be ESRCH/EBADR??? */
439 goto out_unlock;
440 }
441
442 /*
443 * We do it in this order specifically so that no actual tree
444 * changes occur until we have all the pieces we need. We
445 * don't want malloc failures to leave an inconsistent tree.
446 * Whenever we drop the lock, another process could be
447 * inserting. Also note that, if another process just beat us
448 * to an insert, we might not need the same pieces we needed
449 * the first go round. In the end, the pieces we need will
450 * be used, and the pieces we don't will be freed.
451 */
452 ctxt->need_left = !!(le32_to_cpu(rec->e_cpos) >
453 le32_to_cpu(old_ent->e_rec.e_cpos));
454 ctxt->need_right = !!((le32_to_cpu(old_ent->e_rec.e_cpos) +
455 le32_to_cpu(old_ent->e_rec.e_clusters)) >
456 (le32_to_cpu(rec->e_cpos) + le32_to_cpu(rec->e_clusters)));
457 ret = -EAGAIN;
458 if (ctxt->need_left) {
459 if (!ctxt->left_ent)
460 goto out_unlock;
461 *(ctxt->left_ent) = *old_ent;
462 ctxt->left_ent->e_rec.e_clusters =
463 cpu_to_le32(le32_to_cpu(rec->e_cpos) -
464 le32_to_cpu(ctxt->left_ent->e_rec.e_cpos));
465 }
466 if (ctxt->need_right) {
467 if (!ctxt->right_ent)
468 goto out_unlock;
469 *(ctxt->right_ent) = *old_ent;
470 ctxt->right_ent->e_rec.e_cpos =
471 cpu_to_le32(le32_to_cpu(rec->e_cpos) +
472 le32_to_cpu(rec->e_clusters));
473 ctxt->right_ent->e_rec.e_clusters =
474 cpu_to_le32((le32_to_cpu(old_ent->e_rec.e_cpos) +
475 le32_to_cpu(old_ent->e_rec.e_clusters)) -
476 le32_to_cpu(ctxt->right_ent->e_rec.e_cpos));
477 }
478
479 rb_erase(&old_ent->e_node, &em->em_extents);
480 /* Now that he's erased, set him up for deletion */
481 ctxt->old_ent = old_ent;
482
483 if (ctxt->need_left) {
484 ret = ocfs2_extent_map_insert_entry(em,
485 ctxt->left_ent);
486 if (ret)
487 goto out_unlock;
488 ctxt->left_ent = NULL;
489 }
490
491 if (ctxt->need_right) {
492 ret = ocfs2_extent_map_insert_entry(em,
493 ctxt->right_ent);
494 if (ret)
495 goto out_unlock;
496 ctxt->right_ent = NULL;
497 }
498
499 ret = ocfs2_extent_map_insert_entry(em, ctxt->new_ent);
500
501 if (!ret)
502 ctxt->new_ent = NULL;
503
504out_unlock:
505 spin_unlock(&OCFS2_I(inode)->ip_lock);
506
507 return ret;
508}
509
510
511static int ocfs2_extent_map_insert(struct inode *inode,
512 struct ocfs2_extent_rec *rec,
513 int tree_depth)
514{
515 int ret;
516 struct ocfs2_em_insert_context ctxt = {0, };
517
518 if ((le32_to_cpu(rec->e_cpos) + le32_to_cpu(rec->e_clusters)) >
519 OCFS2_I(inode)->ip_map.em_clusters) {
520 ret = -EBADR;
521 mlog_errno(ret);
522 return ret;
523 }
524
525 /* Zero e_clusters means a truncated tail record. It better be EOF */
526 if (!rec->e_clusters) {
527 if ((le32_to_cpu(rec->e_cpos) + le32_to_cpu(rec->e_clusters)) !=
528 OCFS2_I(inode)->ip_map.em_clusters) {
529 ret = -EBADR;
530 mlog_errno(ret);
531 return ret;
532 }
533
534 /* Ignore the truncated tail */
535 return 0;
536 }
537
538 ret = -ENOMEM;
539 ctxt.new_ent = kmem_cache_alloc(ocfs2_em_ent_cachep,
540 GFP_KERNEL);
541 if (!ctxt.new_ent) {
542 mlog_errno(ret);
543 return ret;
544 }
545
546 ctxt.new_ent->e_rec = *rec;
547 ctxt.new_ent->e_tree_depth = tree_depth;
548
549 do {
550 ret = -ENOMEM;
551 if (ctxt.need_left && !ctxt.left_ent) {
552 ctxt.left_ent =
553 kmem_cache_alloc(ocfs2_em_ent_cachep,
554 GFP_KERNEL);
555 if (!ctxt.left_ent)
556 break;
557 }
558 if (ctxt.need_right && !ctxt.right_ent) {
559 ctxt.right_ent =
560 kmem_cache_alloc(ocfs2_em_ent_cachep,
561 GFP_KERNEL);
562 if (!ctxt.right_ent)
563 break;
564 }
565
566 ret = ocfs2_extent_map_try_insert(inode, rec,
567 tree_depth, &ctxt);
568 } while (ret == -EAGAIN);
569
570 if (ret < 0)
571 mlog_errno(ret);
572
573 if (ctxt.left_ent)
574 kmem_cache_free(ocfs2_em_ent_cachep, ctxt.left_ent);
575 if (ctxt.right_ent)
576 kmem_cache_free(ocfs2_em_ent_cachep, ctxt.right_ent);
577 if (ctxt.old_ent)
578 kmem_cache_free(ocfs2_em_ent_cachep, ctxt.old_ent);
579 if (ctxt.new_ent)
580 kmem_cache_free(ocfs2_em_ent_cachep, ctxt.new_ent);
581
582 return ret;
583}
584
585/*
586 * Append this record to the tail of the extent map. It must be
587 * tree_depth 0. The record might be an extension of an existing
588 * record, and as such that needs to be handled. eg:
589 *
590 * Existing record in the extent map:
591 *
592 * cpos = 10, len = 10
593 * |---------|
594 *
595 * New Record:
596 *
597 * cpos = 10, len = 20
598 * |------------------|
599 *
600 * The passed record is the new on-disk record. The new_clusters value
601 * is how many clusters were added to the file. If the append is a
602 * contiguous append, the new_clusters has been added to
603 * rec->e_clusters. If the append is an entirely new extent, then
604 * rec->e_clusters is == new_clusters.
605 */
606int ocfs2_extent_map_append(struct inode *inode,
607 struct ocfs2_extent_rec *rec,
608 u32 new_clusters)
609{
610 int ret;
611 struct ocfs2_extent_map *em = &OCFS2_I(inode)->ip_map;
612 struct ocfs2_extent_map_entry *ent;
613 struct ocfs2_extent_rec *old;
614
615 BUG_ON(!new_clusters);
616 BUG_ON(le32_to_cpu(rec->e_clusters) < new_clusters);
617
618 if (em->em_clusters < OCFS2_I(inode)->ip_clusters) {
619 /*
620 * Size changed underneath us on disk. Drop any
621 * straddling records and update our idea of
622 * i_clusters
623 */
624 ocfs2_extent_map_drop(inode, em->em_clusters - 1);
625 em->em_clusters = OCFS2_I(inode)->ip_clusters;
626 }
627
628 mlog_bug_on_msg((le32_to_cpu(rec->e_cpos) +
629 le32_to_cpu(rec->e_clusters)) !=
630 (em->em_clusters + new_clusters),
631 "Inode %"MLFu64":\n"
632 "rec->e_cpos = %u + rec->e_clusters = %u = %u\n"
633 "em->em_clusters = %u + new_clusters = %u = %u\n",
634 OCFS2_I(inode)->ip_blkno,
635 le32_to_cpu(rec->e_cpos), le32_to_cpu(rec->e_clusters),
636 le32_to_cpu(rec->e_cpos) + le32_to_cpu(rec->e_clusters),
637 em->em_clusters, new_clusters,
638 em->em_clusters + new_clusters);
639
640 em->em_clusters += new_clusters;
641
642 ret = -ENOENT;
643 if (le32_to_cpu(rec->e_clusters) > new_clusters) {
644 /* This is a contiguous append */
645 ent = ocfs2_extent_map_lookup(em, le32_to_cpu(rec->e_cpos), 1,
646 NULL, NULL);
647 if (ent) {
648 old = &ent->e_rec;
649 BUG_ON((le32_to_cpu(rec->e_cpos) +
650 le32_to_cpu(rec->e_clusters)) !=
651 (le32_to_cpu(old->e_cpos) +
652 le32_to_cpu(old->e_clusters) +
653 new_clusters));
654 if (ent->e_tree_depth == 0) {
655 BUG_ON(le32_to_cpu(old->e_cpos) !=
656 le32_to_cpu(rec->e_cpos));
657 BUG_ON(le64_to_cpu(old->e_blkno) !=
658 le64_to_cpu(rec->e_blkno));
659 ret = 0;
660 }
661 /*
662 * Let non-leafs fall through as -ENOENT to
663 * force insertion of the new leaf.
664 */
665 le32_add_cpu(&old->e_clusters, new_clusters);
666 }
667 }
668
669 if (ret == -ENOENT)
670 ret = ocfs2_extent_map_insert(inode, rec, 0);
671 if (ret < 0)
672 mlog_errno(ret);
673 return ret;
674}
675
676#if 0
677/* Code here is included but defined out as it completes the extent
678 * map api and may be used in the future. */
679
680/*
681 * Look up the record containing this cluster offset. This record is
682 * part of the extent map. Do not free it. Any changes you make to
683 * it will reflect in the extent map. So, if your last extent
684 * is (cpos = 10, clusters = 10) and you truncate the file by 5
685 * clusters, you can do:
686 *
687 * ret = ocfs2_extent_map_get_rec(inode, orig_size - 5, &rec, NULL);
688 * rec->e_clusters -= 5;
689 *
690 * The lookup does not read from disk. If the map isn't filled in for
691 * an entry, you won't find it.
692 *
693 * Also note that the returned record is valid until alloc_sem is
694 * dropped. After that, truncate and extend can happen. Caveat Emptor.
695 */
696int ocfs2_extent_map_get_rec(struct inode *inode, u32 cpos,
697 struct ocfs2_extent_rec **rec,
698 int *tree_depth)
699{
700 int ret = -ENOENT;
701 struct ocfs2_extent_map *em = &OCFS2_I(inode)->ip_map;
702 struct ocfs2_extent_map_entry *ent;
703
704 *rec = NULL;
705
706 if (cpos >= OCFS2_I(inode)->ip_clusters)
707 return -EINVAL;
708
709 if (cpos >= em->em_clusters) {
710 /*
711 * Size changed underneath us on disk. Drop any
712 * straddling records and update our idea of
713 * i_clusters
714 */
715 ocfs2_extent_map_drop(inode, em->em_clusters - 1);
716 em->em_clusters = OCFS2_I(inode)->ip_clusters ;
717 }
718
719 ent = ocfs2_extent_map_lookup(&OCFS2_I(inode)->ip_map, cpos, 1,
720 NULL, NULL);
721
722 if (ent) {
723 *rec = &ent->e_rec;
724 if (tree_depth)
725 *tree_depth = ent->e_tree_depth;
726 ret = 0;
727 }
728
729 return ret;
730}
731
732int ocfs2_extent_map_get_clusters(struct inode *inode,
733 u32 v_cpos, int count,
734 u32 *p_cpos, int *ret_count)
735{
736 int ret;
737 u32 coff, ccount;
738 struct ocfs2_extent_map *em = &OCFS2_I(inode)->ip_map;
739 struct ocfs2_extent_map_entry *ent = NULL;
740
741 *p_cpos = ccount = 0;
742
743 if ((v_cpos + count) > OCFS2_I(inode)->ip_clusters)
744 return -EINVAL;
745
746 if ((v_cpos + count) > em->em_clusters) {
747 /*
748 * Size changed underneath us on disk. Drop any
749 * straddling records and update our idea of
750 * i_clusters
751 */
752 ocfs2_extent_map_drop(inode, em->em_clusters - 1);
753 em->em_clusters = OCFS2_I(inode)->ip_clusters;
754 }
755
756
757 ret = ocfs2_extent_map_lookup_read(inode, v_cpos, count, &ent);
758 if (ret)
759 return ret;
760
761 if (ent) {
762 /* We should never find ourselves straddling an interval */
763 if (!ocfs2_extent_rec_contains_clusters(&ent->e_rec,
764 v_cpos,
765 count))
766 return -ESRCH;
767
768 coff = v_cpos - le32_to_cpu(ent->e_rec.e_cpos);
769 *p_cpos = ocfs2_blocks_to_clusters(inode->i_sb,
770 le64_to_cpu(ent->e_rec.e_blkno)) +
771 coff;
772
773 if (ret_count)
774 *ret_count = le32_to_cpu(ent->e_rec.e_clusters) - coff;
775
776 return 0;
777 }
778
779
780 return -ENOENT;
781}
782
783#endif /* 0 */
784
785int ocfs2_extent_map_get_blocks(struct inode *inode,
786 u64 v_blkno, int count,
787 u64 *p_blkno, int *ret_count)
788{
789 int ret;
790 u64 boff;
791 u32 cpos, clusters;
792 int bpc = ocfs2_clusters_to_blocks(inode->i_sb, 1);
793 struct ocfs2_extent_map_entry *ent = NULL;
794 struct ocfs2_extent_map *em = &OCFS2_I(inode)->ip_map;
795 struct ocfs2_extent_rec *rec;
796
797 *p_blkno = 0;
798
799 cpos = ocfs2_blocks_to_clusters(inode->i_sb, v_blkno);
800 clusters = ocfs2_blocks_to_clusters(inode->i_sb,
801 (u64)count + bpc - 1);
802 if ((cpos + clusters) > OCFS2_I(inode)->ip_clusters) {
803 ret = -EINVAL;
804 mlog_errno(ret);
805 return ret;
806 }
807
808 if ((cpos + clusters) > em->em_clusters) {
809 /*
810 * Size changed underneath us on disk. Drop any
811 * straddling records and update our idea of
812 * i_clusters
813 */
814 ocfs2_extent_map_drop(inode, em->em_clusters - 1);
815 em->em_clusters = OCFS2_I(inode)->ip_clusters;
816 }
817
818 ret = ocfs2_extent_map_lookup_read(inode, cpos, clusters, &ent);
819 if (ret) {
820 mlog_errno(ret);
821 return ret;
822 }
823
824 if (ent)
825 {
826 rec = &ent->e_rec;
827
828 /* We should never find ourselves straddling an interval */
829 if (!ocfs2_extent_rec_contains_clusters(rec, cpos, clusters)) {
830 ret = -ESRCH;
831 mlog_errno(ret);
832 return ret;
833 }
834
835 boff = ocfs2_clusters_to_blocks(inode->i_sb, cpos -
836 le32_to_cpu(rec->e_cpos));
837 boff += (v_blkno & (u64)(bpc - 1));
838 *p_blkno = le64_to_cpu(rec->e_blkno) + boff;
839
840 if (ret_count) {
841 *ret_count = ocfs2_clusters_to_blocks(inode->i_sb,
842 le32_to_cpu(rec->e_clusters)) - boff;
843 }
844
845 return 0;
846 }
847
848 return -ENOENT;
849}
850
851int ocfs2_extent_map_init(struct inode *inode)
852{
853 struct ocfs2_extent_map *em = &OCFS2_I(inode)->ip_map;
854
855 em->em_extents = RB_ROOT;
856 em->em_clusters = 0;
857
858 return 0;
859}
860
861/* Needs the lock */
862static void __ocfs2_extent_map_drop(struct inode *inode,
863 u32 new_clusters,
864 struct rb_node **free_head,
865 struct ocfs2_extent_map_entry **tail_ent)
866{
867 struct rb_node *node, *next;
868 struct ocfs2_extent_map *em = &OCFS2_I(inode)->ip_map;
869 struct ocfs2_extent_map_entry *ent;
870
871 *free_head = NULL;
872
873 ent = NULL;
874 node = rb_last(&em->em_extents);
875 while (node)
876 {
877 next = rb_prev(node);
878
879 ent = rb_entry(node, struct ocfs2_extent_map_entry,
880 e_node);
881 if (le32_to_cpu(ent->e_rec.e_cpos) < new_clusters)
882 break;
883
884 rb_erase(&ent->e_node, &em->em_extents);
885
886 node->rb_right = *free_head;
887 *free_head = node;
888
889 ent = NULL;
890 node = next;
891 }
892
893 /* Do we have an entry straddling new_clusters? */
894 if (tail_ent) {
895 if (ent &&
896 ((le32_to_cpu(ent->e_rec.e_cpos) +
897 le32_to_cpu(ent->e_rec.e_clusters)) > new_clusters))
898 *tail_ent = ent;
899 else
900 *tail_ent = NULL;
901 }
902}
903
904static void __ocfs2_extent_map_drop_cleanup(struct rb_node *free_head)
905{
906 struct rb_node *node;
907 struct ocfs2_extent_map_entry *ent;
908
909 while (free_head) {
910 node = free_head;
911 free_head = node->rb_right;
912
913 ent = rb_entry(node, struct ocfs2_extent_map_entry,
914 e_node);
915 kmem_cache_free(ocfs2_em_ent_cachep, ent);
916 }
917}
918
919/*
920 * Remove all entries past new_clusters, inclusive of an entry that
921 * contains new_clusters. This is effectively a cache forget.
922 *
923 * If you want to also clip the last extent by some number of clusters,
924 * you need to call ocfs2_extent_map_trunc().
925 * This code does not check or modify ip_clusters.
926 */
927int ocfs2_extent_map_drop(struct inode *inode, u32 new_clusters)
928{
929 struct rb_node *free_head = NULL;
930 struct ocfs2_extent_map *em = &OCFS2_I(inode)->ip_map;
931 struct ocfs2_extent_map_entry *ent;
932
933 spin_lock(&OCFS2_I(inode)->ip_lock);
934
935 __ocfs2_extent_map_drop(inode, new_clusters, &free_head, &ent);
936
937 if (ent) {
938 rb_erase(&ent->e_node, &em->em_extents);
939 ent->e_node.rb_right = free_head;
940 free_head = &ent->e_node;
941 }
942
943 spin_unlock(&OCFS2_I(inode)->ip_lock);
944
945 if (free_head)
946 __ocfs2_extent_map_drop_cleanup(free_head);
947
948 return 0;
949}
950
951/*
952 * Remove all entries past new_clusters and also clip any extent
953 * straddling new_clusters, if there is one. This does not check
954 * or modify ip_clusters
955 */
956int ocfs2_extent_map_trunc(struct inode *inode, u32 new_clusters)
957{
958 struct rb_node *free_head = NULL;
959 struct ocfs2_extent_map_entry *ent = NULL;
960
961 spin_lock(&OCFS2_I(inode)->ip_lock);
962
963 __ocfs2_extent_map_drop(inode, new_clusters, &free_head, &ent);
964
965 if (ent)
966 ent->e_rec.e_clusters = cpu_to_le32(new_clusters -
967 le32_to_cpu(ent->e_rec.e_cpos));
968
969 OCFS2_I(inode)->ip_map.em_clusters = new_clusters;
970
971 spin_unlock(&OCFS2_I(inode)->ip_lock);
972
973 if (free_head)
974 __ocfs2_extent_map_drop_cleanup(free_head);
975
976 return 0;
977}
978
979int __init init_ocfs2_extent_maps(void)
980{
981 ocfs2_em_ent_cachep =
982 kmem_cache_create("ocfs2_em_ent",
983 sizeof(struct ocfs2_extent_map_entry),
984 0, SLAB_HWCACHE_ALIGN, NULL, NULL);
985 if (!ocfs2_em_ent_cachep)
986 return -ENOMEM;
987
988 return 0;
989}
990
/*
 * Counterpart of init_ocfs2_extent_maps(): destroy the slab cache used
 * for extent map entries.
 */
991void __exit exit_ocfs2_extent_maps(void)
992{
993 kmem_cache_destroy(ocfs2_em_ent_cachep);
994}
diff --git a/fs/ocfs2/extent_map.h b/fs/ocfs2/extent_map.h
new file mode 100644
index 000000000000..fa3745efa886
--- /dev/null
+++ b/fs/ocfs2/extent_map.h
@@ -0,0 +1,46 @@
1/* -*- mode: c; c-basic-offset: 8; -*-
2 * vim: noexpandtab sw=8 ts=8 sts=0:
3 *
4 * extent_map.h
5 *
6 * In-memory file extent mappings for OCFS2.
7 *
8 * Copyright (C) 2004 Oracle. All rights reserved.
9 *
10 * This program is free software; you can redistribute it and/or
11 * modify it under the terms of the GNU General Public
12 * License, version 2, as published by the Free Software Foundation.
13 *
14 * This program is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 * General Public License for more details.
18 *
19 * You should have received a copy of the GNU General Public
20 * License along with this program; if not, write to the
21 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
22 * Boston, MA 02111-1307, USA.
23 */
24
25#ifndef _EXTENT_MAP_H
26#define _EXTENT_MAP_H
27
/* Create / destroy the slab cache that backs extent map entries. */
28int init_ocfs2_extent_maps(void);
29void exit_ocfs2_extent_maps(void);
30
31/*
32 * EVERY CALL here except _init, _trunc, and _drop expects alloc_sem
33 * to be held. The allocation cannot change at all while the map is
34 * in the process of being updated.
35 */
/* Reset the inode's map to an empty tree covering zero clusters. */
36int ocfs2_extent_map_init(struct inode *inode);
/* Record that new_clusters were appended; rec is the new on-disk record. */
37int ocfs2_extent_map_append(struct inode *inode,
38 struct ocfs2_extent_rec *rec,
39 u32 new_clusters);
/* Map virtual block v_blkno to a physical block, reading from disk if needed. */
40int ocfs2_extent_map_get_blocks(struct inode *inode,
41 u64 v_blkno, int count,
42 u64 *p_blkno, int *ret_count);
/* Forget all entries at or past new_clusters, including a straddling one. */
43int ocfs2_extent_map_drop(struct inode *inode, u32 new_clusters);
/* Like _drop, but clip a straddling entry and set em_clusters to new_clusters. */
44int ocfs2_extent_map_trunc(struct inode *inode, u32 new_clusters);
45
46#endif /* _EXTENT_MAP_H */
diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c
new file mode 100644
index 000000000000..72ae9e3306f4
--- /dev/null
+++ b/fs/ocfs2/file.c
@@ -0,0 +1,1237 @@
1/* -*- mode: c; c-basic-offset: 8; -*-
2 * vim: noexpandtab sw=8 ts=8 sts=0:
3 *
4 * file.c
5 *
6 * File open, close, extend, truncate
7 *
8 * Copyright (C) 2002, 2004 Oracle. All rights reserved.
9 *
10 * This program is free software; you can redistribute it and/or
11 * modify it under the terms of the GNU General Public
12 * License as published by the Free Software Foundation; either
13 * version 2 of the License, or (at your option) any later version.
14 *
15 * This program is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
18 * General Public License for more details.
19 *
20 * You should have received a copy of the GNU General Public
21 * License along with this program; if not, write to the
22 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
22 * Boston, MA 02111-1307, USA.
24 */
25
26#include <linux/fs.h>
27#include <linux/types.h>
28#include <linux/slab.h>
29#include <linux/highmem.h>
30#include <linux/pagemap.h>
31#include <linux/uio.h>
32
33#define MLOG_MASK_PREFIX ML_INODE
34#include <cluster/masklog.h>
35
36#include "ocfs2.h"
37
38#include "alloc.h"
39#include "aops.h"
40#include "dir.h"
41#include "dlmglue.h"
42#include "extent_map.h"
43#include "file.h"
44#include "sysfile.h"
45#include "inode.h"
46#include "journal.h"
47#include "mmap.h"
48#include "suballoc.h"
49#include "super.h"
50
51#include "buffer_head_io.h"
52
53static int ocfs2_sync_inode(struct inode *inode)
54{
55 filemap_fdatawrite(inode->i_mapping);
56 return sync_mapping_buffers(inode->i_mapping);
57}
58
59static int ocfs2_file_open(struct inode *inode, struct file *file)
60{
61 int status;
62 int mode = file->f_flags;
63 struct ocfs2_inode_info *oi = OCFS2_I(inode);
64
65 mlog_entry("(0x%p, 0x%p, '%.*s')\n", inode, file,
66 file->f_dentry->d_name.len, file->f_dentry->d_name.name);
67
68 spin_lock(&oi->ip_lock);
69
70 /* Check that the inode hasn't been wiped from disk by another
71 * node. If it hasn't then we're safe as long as we hold the
72 * spin lock until our increment of open count. */
73 if (OCFS2_I(inode)->ip_flags & OCFS2_INODE_DELETED) {
74 spin_unlock(&oi->ip_lock);
75
76 status = -ENOENT;
77 goto leave;
78 }
79
80 if (mode & O_DIRECT)
81 oi->ip_flags |= OCFS2_INODE_OPEN_DIRECT;
82
83 oi->ip_open_count++;
84 spin_unlock(&oi->ip_lock);
85 status = 0;
86leave:
87 mlog_exit(status);
88 return status;
89}
90
91static int ocfs2_file_release(struct inode *inode, struct file *file)
92{
93 struct ocfs2_inode_info *oi = OCFS2_I(inode);
94
95 mlog_entry("(0x%p, 0x%p, '%.*s')\n", inode, file,
96 file->f_dentry->d_name.len,
97 file->f_dentry->d_name.name);
98
99 spin_lock(&oi->ip_lock);
100 if (!--oi->ip_open_count)
101 oi->ip_flags &= ~OCFS2_INODE_OPEN_DIRECT;
102 spin_unlock(&oi->ip_lock);
103
104 mlog_exit(0);
105
106 return 0;
107}
108
109static int ocfs2_sync_file(struct file *file,
110 struct dentry *dentry,
111 int datasync)
112{
113 int err = 0;
114 journal_t *journal;
115 struct inode *inode = dentry->d_inode;
116 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
117
118 mlog_entry("(0x%p, 0x%p, %d, '%.*s')\n", file, dentry, datasync,
119 dentry->d_name.len, dentry->d_name.name);
120
121 err = ocfs2_sync_inode(dentry->d_inode);
122 if (err)
123 goto bail;
124
125 journal = osb->journal->j_journal;
126 err = journal_force_commit(journal);
127
128bail:
129 mlog_exit(err);
130
131 return (err < 0) ? -EIO : 0;
132}
133
134int ocfs2_set_inode_size(struct ocfs2_journal_handle *handle,
135 struct inode *inode,
136 struct buffer_head *fe_bh,
137 u64 new_i_size)
138{
139 int status;
140
141 mlog_entry_void();
142 i_size_write(inode, new_i_size);
143 inode->i_blocks = ocfs2_align_bytes_to_sectors(new_i_size);
144 inode->i_ctime = inode->i_mtime = CURRENT_TIME;
145
146 status = ocfs2_mark_inode_dirty(handle, inode, fe_bh);
147 if (status < 0) {
148 mlog_errno(status);
149 goto bail;
150 }
151
152bail:
153 mlog_exit(status);
154 return status;
155}
156
157static int ocfs2_simple_size_update(struct inode *inode,
158 struct buffer_head *di_bh,
159 u64 new_i_size)
160{
161 int ret;
162 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
163 struct ocfs2_journal_handle *handle = NULL;
164
165 handle = ocfs2_start_trans(osb, NULL,
166 OCFS2_INODE_UPDATE_CREDITS);
167 if (handle == NULL) {
168 ret = -ENOMEM;
169 mlog_errno(ret);
170 goto out;
171 }
172
173 ret = ocfs2_set_inode_size(handle, inode, di_bh,
174 new_i_size);
175 if (ret < 0)
176 mlog_errno(ret);
177
178 ocfs2_commit_trans(handle);
179out:
180 return ret;
181}
182
183static int ocfs2_orphan_for_truncate(struct ocfs2_super *osb,
184 struct inode *inode,
185 struct buffer_head *fe_bh,
186 u64 new_i_size)
187{
188 int status;
189 struct ocfs2_journal_handle *handle;
190
191 mlog_entry_void();
192
193 /* TODO: This needs to actually orphan the inode in this
194 * transaction. */
195
196 handle = ocfs2_start_trans(osb, NULL, OCFS2_INODE_UPDATE_CREDITS);
197 if (IS_ERR(handle)) {
198 status = PTR_ERR(handle);
199 mlog_errno(status);
200 goto out;
201 }
202
203 status = ocfs2_set_inode_size(handle, inode, fe_bh, new_i_size);
204 if (status < 0)
205 mlog_errno(status);
206
207 ocfs2_commit_trans(handle);
208out:
209 mlog_exit(status);
210 return status;
211}
212
213static int ocfs2_truncate_file(struct inode *inode,
214 struct buffer_head *di_bh,
215 u64 new_i_size)
216{
217 int status = 0;
218 struct ocfs2_dinode *fe = NULL;
219 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
220 struct ocfs2_truncate_context *tc = NULL;
221
222 mlog_entry("(inode = %"MLFu64", new_i_size = %"MLFu64"\n",
223 OCFS2_I(inode)->ip_blkno, new_i_size);
224
225 truncate_inode_pages(inode->i_mapping, new_i_size);
226
227 fe = (struct ocfs2_dinode *) di_bh->b_data;
228 if (!OCFS2_IS_VALID_DINODE(fe)) {
229 OCFS2_RO_ON_INVALID_DINODE(inode->i_sb, fe);
230 status = -EIO;
231 goto bail;
232 }
233
234 mlog_bug_on_msg(le64_to_cpu(fe->i_size) != i_size_read(inode),
235 "Inode %"MLFu64", inode i_size = %lld != di "
236 "i_size = %"MLFu64", i_flags = 0x%x\n",
237 OCFS2_I(inode)->ip_blkno,
238 i_size_read(inode),
239 le64_to_cpu(fe->i_size), le32_to_cpu(fe->i_flags));
240
241 if (new_i_size > le64_to_cpu(fe->i_size)) {
242 mlog(0, "asked to truncate file with size (%"MLFu64") "
243 "to size (%"MLFu64")!\n",
244 le64_to_cpu(fe->i_size), new_i_size);
245 status = -EINVAL;
246 mlog_errno(status);
247 goto bail;
248 }
249
250 mlog(0, "inode %"MLFu64", i_size = %"MLFu64", new_i_size = %"MLFu64"\n",
251 le64_to_cpu(fe->i_blkno), le64_to_cpu(fe->i_size), new_i_size);
252
253 /* lets handle the simple truncate cases before doing any more
254 * cluster locking. */
255 if (new_i_size == le64_to_cpu(fe->i_size))
256 goto bail;
257
258 if (le32_to_cpu(fe->i_clusters) ==
259 ocfs2_clusters_for_bytes(osb->sb, new_i_size)) {
260 mlog(0, "fe->i_clusters = %u, so we do a simple truncate\n",
261 fe->i_clusters);
262 /* No allocation change is required, so lets fast path
263 * this truncate. */
264 status = ocfs2_simple_size_update(inode, di_bh, new_i_size);
265 if (status < 0)
266 mlog_errno(status);
267 goto bail;
268 }
269
270 /* This forces other nodes to sync and drop their pages */
271 status = ocfs2_data_lock(inode, 1);
272 if (status < 0) {
273 mlog_errno(status);
274 goto bail;
275 }
276 ocfs2_data_unlock(inode, 1);
277
278 /* alright, we're going to need to do a full blown alloc size
279 * change. Orphan the inode so that recovery can complete the
280 * truncate if necessary. This does the task of marking
281 * i_size. */
282 status = ocfs2_orphan_for_truncate(osb, inode, di_bh, new_i_size);
283 if (status < 0) {
284 mlog_errno(status);
285 goto bail;
286 }
287
288 status = ocfs2_prepare_truncate(osb, inode, di_bh, &tc);
289 if (status < 0) {
290 mlog_errno(status);
291 goto bail;
292 }
293
294 status = ocfs2_commit_truncate(osb, inode, di_bh, tc);
295 if (status < 0) {
296 mlog_errno(status);
297 goto bail;
298 }
299
300 /* TODO: orphan dir cleanup here. */
301bail:
302
303 mlog_exit(status);
304 return status;
305}
306
307/*
308 * extend allocation only here.
309 * we'll update all the disk stuff, and oip->alloc_size
310 *
311 * expect stuff to be locked, a transaction started and enough data /
312 * metadata reservations in the contexts.
313 *
314 * Will return -EAGAIN, and a reason if a restart is needed.
315 * If passed in, *reason will always be set, even in error.
316 */
/*
 * Arguments:
 *  clusters_to_add - number of clusters still wanted; must be non-zero
 *                    (BUG otherwise).
 *  fe_bh           - buffer head whose data is the inode's dinode.
 *  handle          - journal handle used for all journal calls here.
 *  data_ac         - allocation context clusters are claimed from.
 *  meta_ac         - metadata allocation context; may be NULL, and is
 *                    only consulted when the dinode has no free extents.
 *  reason_ret      - optional; written on every exit path with
 *                    RESTART_NONE, RESTART_META or RESTART_TRANS.
 *
 * One call claims a single run of clusters and inserts a single extent
 * for it. When that run is shorter than clusters_to_add the function
 * returns -EAGAIN with RESTART_TRANS so the caller can call again.
 *
 * Returns 0, -EAGAIN (see reason_ret), or the error of a failing helper.
 */
317int ocfs2_do_extend_allocation(struct ocfs2_super *osb,
318 struct inode *inode,
319 u32 clusters_to_add,
320 struct buffer_head *fe_bh,
321 struct ocfs2_journal_handle *handle,
322 struct ocfs2_alloc_context *data_ac,
323 struct ocfs2_alloc_context *meta_ac,
324 enum ocfs2_alloc_restarted *reason_ret)
325{
326 int status = 0;
327 int free_extents;
328 struct ocfs2_dinode *fe = (struct ocfs2_dinode *) fe_bh->b_data;
329 enum ocfs2_alloc_restarted reason = RESTART_NONE;
330 u32 bit_off, num_bits;
331 u64 block;
332
333 BUG_ON(!clusters_to_add);
334
335 free_extents = ocfs2_num_free_extents(osb, inode, fe);
336 if (free_extents < 0) {
337 status = free_extents;
338 mlog_errno(status);
339 goto leave;
340 }
341
342 /* there are two cases which could cause us to EAGAIN in the
343 * we-need-more-metadata case:
344 * 1) we haven't reserved *any*
345 * 2) we are so fragmented, we've needed to add metadata too
346 * many times. */
347 if (!free_extents && !meta_ac) {
348 mlog(0, "we haven't reserved any metadata!\n");
349 status = -EAGAIN;
350 reason = RESTART_META;
351 goto leave;
352 } else if ((!free_extents)
353 && (ocfs2_alloc_context_bits_left(meta_ac)
354 < ocfs2_extend_meta_needed(fe))) {
355 mlog(0, "filesystem is really fragmented...\n");
356 status = -EAGAIN;
357 reason = RESTART_META;
358 goto leave;
359 }
360
 /* NOTE(review): the literal 1 is presumably the minimum acceptable
  * run length -- confirm against ocfs2_claim_clusters(). The run that
  * was actually claimed comes back in bit_off / num_bits. -ENOSPC is
  * returned to the caller without being logged. */
361 status = ocfs2_claim_clusters(osb, handle, data_ac, 1,
362 &bit_off, &num_bits);
363 if (status < 0) {
364 if (status != -ENOSPC)
365 mlog_errno(status);
366 goto leave;
367 }
368
369 BUG_ON(num_bits > clusters_to_add);
370
371 /* reserve our write early -- insert_extent may update the inode */
372 status = ocfs2_journal_access(handle, inode, fe_bh,
373 OCFS2_JOURNAL_ACCESS_WRITE);
374 if (status < 0) {
375 mlog_errno(status);
376 goto leave;
377 }
378
 /* NOTE(review): the message says "at block" but the value printed
  * is the cluster offset bit_off, not the converted 'block'. */
379 block = ocfs2_clusters_to_blocks(osb->sb, bit_off);
380 mlog(0, "Allocating %u clusters at block %u for inode %"MLFu64"\n",
381 num_bits, bit_off, OCFS2_I(inode)->ip_blkno);
382 status = ocfs2_insert_extent(osb, handle, inode, fe_bh, block,
383 num_bits, meta_ac);
384 if (status < 0) {
385 mlog_errno(status);
386 goto leave;
387 }
388
 /* Account for the new clusters in the dinode, then mirror the
  * total into the in-memory inode under ip_lock. */
389 le32_add_cpu(&fe->i_clusters, num_bits);
390 spin_lock(&OCFS2_I(inode)->ip_lock);
391 OCFS2_I(inode)->ip_clusters = le32_to_cpu(fe->i_clusters);
392 spin_unlock(&OCFS2_I(inode)->ip_lock);
393
394 status = ocfs2_journal_dirty(handle, fe_bh);
395 if (status < 0) {
396 mlog_errno(status);
397 goto leave;
398 }
399
400 clusters_to_add -= num_bits;
401
 /* NOTE(review): fe->i_clusters is printed below without
  * le32_to_cpu(), unlike the conversion a few lines above. */
402 if (clusters_to_add) {
403 mlog(0, "need to alloc once more, clusters = %u, wanted = "
404 "%u\n", fe->i_clusters, clusters_to_add);
405 status = -EAGAIN;
406 reason = RESTART_TRANS;
407 }
408
409leave:
410 mlog_exit(status);
411 if (reason_ret)
412 *reason_ret = reason;
413 return status;
414}
415
/*
 * Add clusters_to_add clusters to the inode's allocation.
 *
 * Reads the inode's dinode block, reserves metadata (only when the
 * dinode has no free extents) and data clusters, takes ip_alloc_sem
 * for writing, starts a transaction and then calls
 * ocfs2_do_extend_allocation() until everything has been added:
 *
 *  - RESTART_TRANS: extend the running transaction and jump back to
 *    restarted_transaction.
 *  - RESTART_META: drop everything at 'leave' (semaphore, handle,
 *    both allocation contexts) and start over from restart_all.
 *
 * i_size is not changed here.
 *
 * Returns 0 on success or a negative error. -ENOSPC is returned
 * without being logged.
 */
416static int ocfs2_extend_allocation(struct inode *inode,
417 u32 clusters_to_add)
418{
419 int status = 0;
420 int restart_func = 0;
421 int drop_alloc_sem = 0;
422 int credits, num_free_extents;
423 u32 prev_clusters;
424 struct buffer_head *bh = NULL;
425 struct ocfs2_dinode *fe = NULL;
426 struct ocfs2_journal_handle *handle = NULL;
427 struct ocfs2_alloc_context *data_ac = NULL;
428 struct ocfs2_alloc_context *meta_ac = NULL;
429 enum ocfs2_alloc_restarted why;
430 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
431
432 mlog_entry("(clusters_to_add = %u)\n", clusters_to_add);
433
434 status = ocfs2_read_block(osb, OCFS2_I(inode)->ip_blkno, &bh,
435 OCFS2_BH_CACHED, inode);
436 if (status < 0) {
437 mlog_errno(status);
438 goto leave;
439 }
440
441 fe = (struct ocfs2_dinode *) bh->b_data;
442 if (!OCFS2_IS_VALID_DINODE(fe)) {
443 OCFS2_RO_ON_INVALID_DINODE(inode->i_sb, fe);
444 status = -EIO;
445 goto leave;
446 }
447
 /* Re-entered from 'leave' after a RESTART_META; bh and fe stay
  * valid across the restart because bh is only released on the
  * final exit. */
448restart_all:
449 BUG_ON(le32_to_cpu(fe->i_clusters) != OCFS2_I(inode)->ip_clusters);
450
451 mlog(0, "extend inode %"MLFu64", i_size = %lld, fe->i_clusters = %u, "
452 "clusters_to_add = %u\n",
453 OCFS2_I(inode)->ip_blkno, i_size_read(inode),
454 fe->i_clusters, clusters_to_add);
455
456 handle = ocfs2_alloc_handle(osb);
457 if (handle == NULL) {
458 status = -ENOMEM;
459 mlog_errno(status);
460 goto leave;
461 }
462
463 num_free_extents = ocfs2_num_free_extents(osb,
464 inode,
465 fe);
466 if (num_free_extents < 0) {
467 status = num_free_extents;
468 mlog_errno(status);
469 goto leave;
470 }
471
 /* Metadata is only reserved when the dinode has no free extent
  * left; otherwise meta_ac stays NULL. */
472 if (!num_free_extents) {
473 status = ocfs2_reserve_new_metadata(osb,
474 handle,
475 fe,
476 &meta_ac);
477 if (status < 0) {
478 if (status != -ENOSPC)
479 mlog_errno(status);
480 goto leave;
481 }
482 }
483
484 status = ocfs2_reserve_clusters(osb,
485 handle,
486 clusters_to_add,
487 &data_ac);
488 if (status < 0) {
489 if (status != -ENOSPC)
490 mlog_errno(status);
491 goto leave;
492 }
493
494 /* blocks people in read/write from reading our allocation
495 * until we're done changing it. We depend on i_sem to block
496 * other extend/truncate calls while we're here. Ordering wrt
497 * start_trans is important here -- always do it before! */
498 down_write(&OCFS2_I(inode)->ip_alloc_sem);
499 drop_alloc_sem = 1;
500
501 credits = ocfs2_calc_extend_credits(osb->sb, fe, clusters_to_add);
502 handle = ocfs2_start_trans(osb, handle, credits);
503 if (IS_ERR(handle)) {
504 status = PTR_ERR(handle);
 /* clear the ERR_PTR so 'leave' does not try to commit it */
505 handle = NULL;
506 mlog_errno(status);
507 goto leave;
508 }
509
510restarted_transaction:
511 /* reserve a write to the file entry early on - that way if we
512 * run out of credits in the allocation path, we can still
513 * update i_size. */
514 status = ocfs2_journal_access(handle, inode, bh,
515 OCFS2_JOURNAL_ACCESS_WRITE);
516 if (status < 0) {
517 mlog_errno(status);
518 goto leave;
519 }
520
 /* Snapshot the cluster count so the amount actually added by the
  * call below can be computed afterwards. */
521 prev_clusters = OCFS2_I(inode)->ip_clusters;
522
523 status = ocfs2_do_extend_allocation(osb,
524 inode,
525 clusters_to_add,
526 bh,
527 handle,
528 data_ac,
529 meta_ac,
530 &why);
 /* -EAGAIN is not a failure here: 'why' says how to restart. */
531 if ((status < 0) && (status != -EAGAIN)) {
532 if (status != -ENOSPC)
533 mlog_errno(status);
534 goto leave;
535 }
536
 /* From here on 'status' holds the journal_dirty result, so a
  * preceding -EAGAIN is replaced and the restart decision below is
  * driven by 'why' alone. */
537 status = ocfs2_journal_dirty(handle, bh);
538 if (status < 0) {
539 mlog_errno(status);
540 goto leave;
541 }
542
543 spin_lock(&OCFS2_I(inode)->ip_lock);
544 clusters_to_add -= (OCFS2_I(inode)->ip_clusters - prev_clusters);
545 spin_unlock(&OCFS2_I(inode)->ip_lock);
546
547 if (why != RESTART_NONE && clusters_to_add) {
548 if (why == RESTART_META) {
549 mlog(0, "restarting function.\n");
550 restart_func = 1;
551 } else {
552 BUG_ON(why != RESTART_TRANS);
553
554 mlog(0, "restarting transaction.\n");
555 /* TODO: This can be more intelligent. */
556 credits = ocfs2_calc_extend_credits(osb->sb,
557 fe,
558 clusters_to_add);
559 status = ocfs2_extend_trans(handle, credits);
560 if (status < 0) {
561 /* handle still has to be committed at
562 * this point. */
 /* NOTE(review): whatever error ocfs2_extend_trans()
  * returned is replaced by -ENOMEM here. */
563 status = -ENOMEM;
564 mlog_errno(status);
565 goto leave;
566 }
567 goto restarted_transaction;
568 }
569 }
570
 /* NOTE(review): fe->i_clusters and fe->i_size are printed below
  * without le32_to_cpu()/le64_to_cpu(). */
571 mlog(0, "fe: i_clusters = %u, i_size=%"MLFu64"\n",
572 fe->i_clusters, fe->i_size);
573 mlog(0, "inode: ip_clusters=%u, i_size=%lld\n",
574 OCFS2_I(inode)->ip_clusters, i_size_read(inode));
575
 /* Common exit: each resource is released only if it was acquired,
  * and every pointer/flag is reset so that a jump back to
  * restart_all starts from a clean state. */
576leave:
577 if (drop_alloc_sem) {
578 up_write(&OCFS2_I(inode)->ip_alloc_sem);
579 drop_alloc_sem = 0;
580 }
581 if (handle) {
582 ocfs2_commit_trans(handle);
583 handle = NULL;
584 }
585 if (data_ac) {
586 ocfs2_free_alloc_context(data_ac);
587 data_ac = NULL;
588 }
589 if (meta_ac) {
590 ocfs2_free_alloc_context(meta_ac);
591 meta_ac = NULL;
592 }
 /* A function restart is only honoured when no error is pending. */
593 if ((!status) && restart_func) {
594 restart_func = 0;
595 goto restart_all;
596 }
597 if (bh) {
598 brelse(bh);
599 bh = NULL;
600 }
601
602 mlog_exit(status);
603 return status;
604}
605
606/* Some parts of this taken from generic_cont_expand, which turned out
607 * to be too fragile to do exactly what we need without us having to
608 * worry about recursive locking in ->commit_write(). */
/*
 * Run a zero-length prepare/commit write cycle on the page cache page
 * that contains byte offset 'size' of the file.
 *
 * The page is grabbed (locked) from the inode's mapping, passed to
 * ocfs2_prepare_write() and block_commit_write() with from == to, and
 * then unlocked and released. When ocfs2_should_order_data() is true
 * the commit is wrapped in a transaction obtained from
 * ocfs2_start_walk_page_trans(). i_size is deliberately left alone.
 *
 * Returns 0 on success, -ENOMEM if the page cannot be grabbed, or the
 * negative error from the failing helper.
 */
609static int ocfs2_write_zero_page(struct inode *inode,
610 u64 size)
611{
612 struct address_space *mapping = inode->i_mapping;
613 struct page *page;
614 unsigned long index;
615 unsigned int offset;
616 struct ocfs2_journal_handle *handle = NULL;
617 int ret;
618
619 offset = (size & (PAGE_CACHE_SIZE-1)); /* Within page */
620 /* ugh. in prepare/commit_write, if from==to==start of block, we
621 ** skip the prepare. make sure we never send an offset for the start
622 ** of a block
623 */
624 if ((offset & (inode->i_sb->s_blocksize - 1)) == 0) {
625 offset++;
626 }
 /* page cache index of the page containing 'size' */
627 index = size >> PAGE_CACHE_SHIFT;
628
629 page = grab_cache_page(mapping, index);
630 if (!page) {
631 ret = -ENOMEM;
632 mlog_errno(ret);
633 goto out;
634 }
635
 /* from == to == offset: a zero-length range inside the block */
636 ret = ocfs2_prepare_write(NULL, page, offset, offset);
637 if (ret < 0) {
638 mlog_errno(ret);
639 goto out_unlock;
640 }
641
642 if (ocfs2_should_order_data(inode)) {
643 handle = ocfs2_start_walk_page_trans(inode, page, offset,
644 offset);
645 if (IS_ERR(handle)) {
646 ret = PTR_ERR(handle);
 /* NOTE(review): unlike the other failures in this function,
  * this one is not logged with mlog_errno(). */
647 handle = NULL;
648 goto out_unlock;
649 }
650 }
651
652 /* must not update i_size! */
653 ret = block_commit_write(page, offset, offset);
654 if (ret < 0)
655 mlog_errno(ret);
656 else
657 ret = 0;
658
659 if (handle)
660 ocfs2_commit_trans(handle);
661out_unlock:
662 unlock_page(page);
663 page_cache_release(page);
664out:
665 return ret;
666}
667
668static int ocfs2_zero_extend(struct inode *inode,
669 u64 zero_to_size)
670{
671 int ret = 0;
672 u64 start_off;
673 struct super_block *sb = inode->i_sb;
674
675 start_off = ocfs2_align_bytes_to_blocks(sb, i_size_read(inode));
676 while (start_off < zero_to_size) {
677 ret = ocfs2_write_zero_page(inode, start_off);
678 if (ret < 0) {
679 mlog_errno(ret);
680 goto out;
681 }
682
683 start_off += sb->s_blocksize;
684 }
685
686out:
687 return ret;
688}
689
690static int ocfs2_extend_file(struct inode *inode,
691 struct buffer_head *di_bh,
692 u64 new_i_size)
693{
694 int ret = 0;
695 u32 clusters_to_add;
696
697 /* setattr sometimes calls us like this. */
698 if (new_i_size == 0)
699 goto out;
700
701 if (i_size_read(inode) == new_i_size)
702 goto out;
703 BUG_ON(new_i_size < i_size_read(inode));
704
705 clusters_to_add = ocfs2_clusters_for_bytes(inode->i_sb, new_i_size) -
706 OCFS2_I(inode)->ip_clusters;
707
708 if (clusters_to_add) {
709 ret = ocfs2_extend_allocation(inode, clusters_to_add);
710 if (ret < 0) {
711 mlog_errno(ret);
712 goto out;
713 }
714
715 ret = ocfs2_zero_extend(inode, new_i_size);
716 if (ret < 0) {
717 mlog_errno(ret);
718 goto out;
719 }
720 }
721
722 /* No allocation required, we just use this helper to
723 * do a trivial update of i_size. */
724 ret = ocfs2_simple_size_update(inode, di_bh, new_i_size);
725 if (ret < 0) {
726 mlog_errno(ret);
727 goto out;
728 }
729
730out:
731 return ret;
732}
733
/*
 * ->setattr() handler.
 *
 * Returns 0 without doing anything when none of the attribute bits in
 * OCFS2_VALID_ATTRS is requested, and returns the inode_change_ok()
 * error when the change is not permitted.
 *
 * Locking: for a size change on a regular file the rw lock is taken at
 * level 1 first, then the meta lock at level 1 (which also yields the
 * dinode buffer head in 'bh'). Both are dropped in reverse order on
 * the way out.
 *
 * A size change is carried out first (truncate or extend), then the
 * remaining attributes are applied with inode_setattr() and the inode
 * is marked dirty inside a separate transaction.
 *
 * Returns 0 on success or a negative error.
 */
734int ocfs2_setattr(struct dentry *dentry, struct iattr *attr)
735{
736 int status = 0, size_change;
737 struct inode *inode = dentry->d_inode;
738 struct super_block *sb = inode->i_sb;
739 struct ocfs2_super *osb = OCFS2_SB(sb);
740 struct buffer_head *bh = NULL;
741 struct ocfs2_journal_handle *handle = NULL;
742
743 mlog_entry("(0x%p, '%.*s')\n", dentry,
744 dentry->d_name.len, dentry->d_name.name);
745
746 if (attr->ia_valid & ATTR_MODE)
747 mlog(0, "mode change: %d\n", attr->ia_mode);
748 if (attr->ia_valid & ATTR_UID)
749 mlog(0, "uid change: %d\n", attr->ia_uid);
750 if (attr->ia_valid & ATTR_GID)
751 mlog(0, "gid change: %d\n", attr->ia_gid);
752 if (attr->ia_valid & ATTR_SIZE)
753 mlog(0, "size change...\n");
754 if (attr->ia_valid & (ATTR_ATIME | ATTR_MTIME | ATTR_CTIME))
755 mlog(0, "time change...\n");
756
757#define OCFS2_VALID_ATTRS (ATTR_ATIME | ATTR_MTIME | ATTR_CTIME | ATTR_SIZE \
758 | ATTR_GID | ATTR_UID | ATTR_MODE)
 /* The two early returns below skip mlog_exit(). */
759 if (!(attr->ia_valid & OCFS2_VALID_ATTRS)) {
760 mlog(0, "can't handle attrs: 0x%x\n", attr->ia_valid);
761 return 0;
762 }
763
764 status = inode_change_ok(inode, attr);
765 if (status)
766 return status;
767
 /* ATTR_SIZE is only acted on for regular files. */
768 size_change = S_ISREG(inode->i_mode) && attr->ia_valid & ATTR_SIZE;
769 if (size_change) {
770 status = ocfs2_rw_lock(inode, 1);
771 if (status < 0) {
772 mlog_errno(status);
773 goto bail;
774 }
775 }
776
777 status = ocfs2_meta_lock(inode, NULL, &bh, 1);
778 if (status < 0) {
779 if (status != -ENOENT)
780 mlog_errno(status);
781 goto bail_unlock_rw;
782 }
783
784 if (size_change && attr->ia_size != i_size_read(inode)) {
785 if (i_size_read(inode) > attr->ia_size)
786 status = ocfs2_truncate_file(inode, bh, attr->ia_size);
787 else
788 status = ocfs2_extend_file(inode, bh, attr->ia_size);
789 if (status < 0) {
790 if (status != -ENOSPC)
791 mlog_errno(status);
 /* NOTE(review): every truncate/extend failure is reported to
  * the caller as -ENOSPC, whatever the underlying error was. */
792 status = -ENOSPC;
793 goto bail_unlock;
794 }
795 }
796
797 handle = ocfs2_start_trans(osb, NULL, OCFS2_INODE_UPDATE_CREDITS);
798 if (IS_ERR(handle)) {
799 status = PTR_ERR(handle);
800 mlog_errno(status);
801 goto bail_unlock;
802 }
803
804 status = inode_setattr(inode, attr);
805 if (status < 0) {
806 mlog_errno(status);
807 goto bail_commit;
808 }
809
810 status = ocfs2_mark_inode_dirty(handle, inode, bh);
811 if (status < 0)
812 mlog_errno(status);
813
814bail_commit:
815 ocfs2_commit_trans(handle);
816bail_unlock:
817 ocfs2_meta_unlock(inode, 1);
818bail_unlock_rw:
819 if (size_change)
820 ocfs2_rw_unlock(inode, 1);
821bail:
822 if (bh)
823 brelse(bh);
824
825 mlog_exit(status);
826 return status;
827}
828
829int ocfs2_getattr(struct vfsmount *mnt,
830 struct dentry *dentry,
831 struct kstat *stat)
832{
833 struct inode *inode = dentry->d_inode;
834 struct super_block *sb = dentry->d_inode->i_sb;
835 struct ocfs2_super *osb = sb->s_fs_info;
836 int err;
837
838 mlog_entry_void();
839
840 err = ocfs2_inode_revalidate(dentry);
841 if (err) {
842 if (err != -ENOENT)
843 mlog_errno(err);
844 goto bail;
845 }
846
847 generic_fillattr(inode, stat);
848
849 /* We set the blksize from the cluster size for performance */
850 stat->blksize = osb->s_clustersize;
851
852bail:
853 mlog_exit(err);
854
855 return err;
856}
857
858static int ocfs2_write_remove_suid(struct inode *inode)
859{
860 int ret;
861 struct buffer_head *bh = NULL;
862 struct ocfs2_inode_info *oi = OCFS2_I(inode);
863 struct ocfs2_journal_handle *handle;
864 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
865 struct ocfs2_dinode *di;
866
867 mlog_entry("(Inode %"MLFu64", mode 0%o)\n", oi->ip_blkno,
868 inode->i_mode);
869
870 handle = ocfs2_start_trans(osb, NULL, OCFS2_INODE_UPDATE_CREDITS);
871 if (handle == NULL) {
872 ret = -ENOMEM;
873 mlog_errno(ret);
874 goto out;
875 }
876
877 ret = ocfs2_read_block(osb, oi->ip_blkno, &bh, OCFS2_BH_CACHED, inode);
878 if (ret < 0) {
879 mlog_errno(ret);
880 goto out_trans;
881 }
882
883 ret = ocfs2_journal_access(handle, inode, bh,
884 OCFS2_JOURNAL_ACCESS_WRITE);
885 if (ret < 0) {
886 mlog_errno(ret);
887 goto out_bh;
888 }
889
890 inode->i_mode &= ~S_ISUID;
891 if ((inode->i_mode & S_ISGID) && (inode->i_mode & S_IXGRP))
892 inode->i_mode &= ~S_ISGID;
893
894 di = (struct ocfs2_dinode *) bh->b_data;
895 di->i_mode = cpu_to_le16(inode->i_mode);
896
897 ret = ocfs2_journal_dirty(handle, bh);
898 if (ret < 0)
899 mlog_errno(ret);
900out_bh:
901 brelse(bh);
902out_trans:
903 ocfs2_commit_trans(handle);
904out:
905 mlog_exit(ret);
906 return ret;
907}
908
909static inline int ocfs2_write_should_remove_suid(struct inode *inode)
910{
911 mode_t mode = inode->i_mode;
912
913 if (!capable(CAP_FSETID)) {
914 if (unlikely(mode & S_ISUID))
915 return 1;
916
917 if (unlikely((mode & S_ISGID) && (mode & S_IXGRP)))
918 return 1;
919 }
920 return 0;
921}
922
/*
 * ->aio_write() handler.
 *
 * Lock order taken here: i_sem, then i_alloc_sem (read, O_DIRECT
 * only), then the rw lock (level 0 for O_DIRECT, level 1 otherwise),
 * then the meta lock. The meta lock starts at level 0 (level 1 for
 * O_APPEND) and is retaken at level 1 when suid/sgid bits must be
 * cleared or the write goes past i_size.
 *
 * While the meta lock is held the function clears suid/sgid if needed,
 * settles the write position (i_size for O_APPEND, iocb->ki_pos
 * otherwise), and, for writes past i_size that need more clusters,
 * extends the allocation and calls ocfs2_zero_extend() up to the write
 * position. The meta lock is then dropped and the data is written with
 * generic_file_aio_write_nolock().
 *
 * The 'pos' argument is not used; iocb->ki_pos is.
 *
 * Returns what generic_file_aio_write_nolock() returns, 0 for a
 * zero-length write, -EIO for a missing inode, or a negative error
 * from locking / allocation.
 */
923static ssize_t ocfs2_file_aio_write(struct kiocb *iocb,
924 const char __user *buf,
925 size_t count,
926 loff_t pos)
927{
928 struct iovec local_iov = { .iov_base = (void __user *)buf,
929 .iov_len = count };
930 int ret, rw_level = -1, meta_level = -1, have_alloc_sem = 0;
931 u32 clusters;
932 struct file *filp = iocb->ki_filp;
933 struct inode *inode = filp->f_dentry->d_inode;
934 loff_t newsize, saved_pos;
 /* NOTE(review): with OCFS2_ORACORE_WORKAROUNDS defined, inode->i_sb
  * is dereferenced here, before the !inode check further down. */
935#ifdef OCFS2_ORACORE_WORKAROUNDS
936 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
937#endif
938
939 mlog_entry("(0x%p, 0x%p, %u, '%.*s')\n", filp, buf,
940 (unsigned int)count,
941 filp->f_dentry->d_name.len,
942 filp->f_dentry->d_name.name);
943
 /* The two early returns below happen before any lock is taken and
  * skip mlog_exit(). */
944 /* happy write of zero bytes */
945 if (count == 0)
946 return 0;
947
948 if (!inode) {
949 mlog(0, "bad inode\n");
950 return -EIO;
951 }
952
953#ifdef OCFS2_ORACORE_WORKAROUNDS
954 /* ugh, work around some applications which open everything O_DIRECT +
955 * O_APPEND and really don't mean to use O_DIRECT. */
956 if (osb->s_mount_opt & OCFS2_MOUNT_COMPAT_OCFS &&
957 (filp->f_flags & O_APPEND) && (filp->f_flags & O_DIRECT))
958 filp->f_flags &= ~O_DIRECT;
959#endif
960
961 down(&inode->i_sem);
962 /* to match setattr's i_sem -> i_alloc_sem -> rw_lock ordering */
963 if (filp->f_flags & O_DIRECT) {
964 have_alloc_sem = 1;
965 down_read(&inode->i_alloc_sem);
966 }
967
968 /* concurrent O_DIRECT writes are allowed */
969 rw_level = (filp->f_flags & O_DIRECT) ? 0 : 1;
970 ret = ocfs2_rw_lock(inode, rw_level);
971 if (ret < 0) {
 /* -1 tells the cleanup code that the rw lock is not held */
972 rw_level = -1;
973 mlog_errno(ret);
974 goto out;
975 }
976
977 /*
978 * We sample i_size under a read level meta lock to see if our write
979 * is extending the file, if it is we back off and get a write level
980 * meta lock.
981 */
982 meta_level = (filp->f_flags & O_APPEND) ? 1 : 0;
983 for(;;) {
984 ret = ocfs2_meta_lock(inode, NULL, NULL, meta_level);
985 if (ret < 0) {
 /* -1 tells the cleanup code that the meta lock is not held */
986 meta_level = -1;
987 mlog_errno(ret);
988 goto out;
989 }
990
991 /* Clear suid / sgid if necessary. We do this here
992 * instead of later in the write path because
993 * remove_suid() calls ->setattr without any hint that
994 * we may have already done our cluster locking. Since
995 * ocfs2_setattr() *must* take cluster locks to
996 * proceed, this will lead us to recursively lock the
997 * inode. There's also the dinode i_size state which
998 * can be lost via setattr during extending writes (we
999 * set inode->i_size at the end of a write). */
1000 if (ocfs2_write_should_remove_suid(inode)) {
1001 if (meta_level == 0) {
1002 ocfs2_meta_unlock(inode, meta_level);
1003 meta_level = 1;
1004 continue;
1005 }
1006
1007 ret = ocfs2_write_remove_suid(inode);
1008 if (ret < 0) {
1009 mlog_errno(ret);
1010 goto out;
1011 }
1012 }
1013
1014 /* work on a copy of ppos until we're sure that we won't have
1015 * to recalculate it due to relocking. */
1016 if (filp->f_flags & O_APPEND) {
1017 saved_pos = i_size_read(inode);
1018 mlog(0, "O_APPEND: inode->i_size=%llu\n", saved_pos);
1019 } else {
1020 saved_pos = iocb->ki_pos;
1021 }
1022 newsize = count + saved_pos;
1023
1024 mlog(0, "pos=%lld newsize=%"MLFu64" cursize=%lld\n",
1025 saved_pos, newsize, i_size_read(inode));
1026
1027 /* No need for a higher level metadata lock if we're
1028 * never going past i_size. */
1029 if (newsize <= i_size_read(inode))
1030 break;
1031
1032 if (meta_level == 0) {
1033 ocfs2_meta_unlock(inode, meta_level);
1034 meta_level = 1;
1035 continue;
1036 }
1037
 /* number of additional clusters needed to hold newsize bytes */
1038 spin_lock(&OCFS2_I(inode)->ip_lock);
1039 clusters = ocfs2_clusters_for_bytes(inode->i_sb, newsize) -
1040 OCFS2_I(inode)->ip_clusters;
1041 spin_unlock(&OCFS2_I(inode)->ip_lock);
1042
1043 mlog(0, "Writing at EOF, may need more allocation: "
1044 "i_size = %lld, newsize = %"MLFu64", need %u clusters\n",
1045 i_size_read(inode), newsize, clusters);
1046
1047 /* We only want to continue the rest of this loop if
1048 * our extend will actually require more
1049 * allocation. */
1050 if (!clusters)
1051 break;
1052
1053 ret = ocfs2_extend_allocation(inode, clusters);
1054 if (ret < 0) {
1055 if (ret != -ENOSPC)
1056 mlog_errno(ret);
1057 goto out;
1058 }
1059
1060 /* Fill any holes which would've been created by this
1061 * write. If we're O_APPEND, this will wind up
1062 * (correctly) being a noop. */
1063 ret = ocfs2_zero_extend(inode, (u64) newsize - count);
1064 if (ret < 0) {
1065 mlog_errno(ret);
1066 goto out;
1067 }
1068 break;
1069 }
1070
1071 /* ok, we're done with i_size and alloc work */
1072 iocb->ki_pos = saved_pos;
1073 ocfs2_meta_unlock(inode, meta_level);
1074 meta_level = -1;
1075
1076 /* communicate with ocfs2_dio_end_io */
1077 ocfs2_iocb_set_rw_locked(iocb);
1078
1079#ifdef OCFS2_ORACORE_WORKAROUNDS
1080 if (osb->s_mount_opt & OCFS2_MOUNT_COMPAT_OCFS &&
1081 filp->f_flags & O_DIRECT) {
1082 unsigned int saved_flags = filp->f_flags;
1083 int sector_size = 1 << osb->s_sectsize_bits;
1084
 /* A write whose position, length or buffer address is not
  * sector aligned is issued as O_SYNC instead of O_DIRECT for
  * this call only; f_flags is restored afterwards. */
1085 if ((saved_pos & (sector_size - 1)) ||
1086 (count & (sector_size - 1)) ||
1087 ((unsigned long)buf & (sector_size - 1))) {
1088 filp->f_flags |= O_SYNC;
1089 filp->f_flags &= ~O_DIRECT;
1090 }
1091
1092 ret = generic_file_aio_write_nolock(iocb, &local_iov, 1,
1093 &iocb->ki_pos);
1094
1095 filp->f_flags = saved_flags;
1096 } else
1097#endif
1098 ret = generic_file_aio_write_nolock(iocb, &local_iov, 1,
1099 &iocb->ki_pos);
1100
1101 /* buffered aio wouldn't have proper lock coverage today */
1102 BUG_ON(ret == -EIOCBQUEUED && !(filp->f_flags & O_DIRECT));
1103
1104 /*
1105 * deep in g_f_a_w_n()->ocfs2_direct_IO we pass in a ocfs2_dio_end_io
1106 * function pointer which is called when o_direct io completes so that
1107 * it can unlock our rw lock. (it's the clustered equivalent of
1108 * i_alloc_sem; protects truncate from racing with pending ios).
1109 * Unfortunately there are error cases which call end_io and others
1110 * that don't. so we don't have to unlock the rw_lock if either an
1111 * async dio is going to do it in the future or an end_io after an
1112 * error has already done it.
1113 */
1114 if (ret == -EIOCBQUEUED || !ocfs2_iocb_is_rw_locked(iocb)) {
1115 rw_level = -1;
1116 have_alloc_sem = 0;
1117 }
1118
 /* Common exit: release only what is still marked as held. */
1119out:
1120 if (meta_level != -1)
1121 ocfs2_meta_unlock(inode, meta_level);
1122 if (have_alloc_sem)
1123 up_read(&inode->i_alloc_sem);
1124 if (rw_level != -1)
1125 ocfs2_rw_unlock(inode, rw_level);
1126 up(&inode->i_sem);
1127
1128 mlog_exit(ret);
1129 return ret;
1130}
1131
/*
 * ->aio_read() handler.
 *
 * For O_DIRECT reads i_alloc_sem is taken for reading and the rw lock
 * at level 0 before calling generic_file_aio_read(); buffered reads
 * take no lock here. The read itself always starts at iocb->ki_pos;
 * the 'pos' argument is only looked at by the alignment check in the
 * OCFS2_ORACORE_WORKAROUNDS block.
 *
 * Returns what generic_file_aio_read() returns, -EINVAL for a missing
 * inode, or the negative error from ocfs2_rw_lock().
 */
1132static ssize_t ocfs2_file_aio_read(struct kiocb *iocb,
1133 char __user *buf,
1134 size_t count,
1135 loff_t pos)
1136{
1137 int ret = 0, rw_level = -1, have_alloc_sem = 0;
1138 struct file *filp = iocb->ki_filp;
1139 struct inode *inode = filp->f_dentry->d_inode;
 /* NOTE(review): with OCFS2_ORACORE_WORKAROUNDS defined, inode->i_sb
  * is dereferenced here, before the !inode check further down. */
1140#ifdef OCFS2_ORACORE_WORKAROUNDS
1141 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
1142#endif
1143
1144 mlog_entry("(0x%p, 0x%p, %u, '%.*s')\n", filp, buf,
1145 (unsigned int)count,
1146 filp->f_dentry->d_name.len,
1147 filp->f_dentry->d_name.name);
1148
1149 if (!inode) {
1150 ret = -EINVAL;
1151 mlog_errno(ret);
1152 goto bail;
1153 }
1154
1155#ifdef OCFS2_ORACORE_WORKAROUNDS
1156 if (osb->s_mount_opt & OCFS2_MOUNT_COMPAT_OCFS) {
1157 if (filp->f_flags & O_DIRECT) {
1158 int sector_size = 1 << osb->s_sectsize_bits;
1159
 /* O_DIRECT is cleared on the file when the position, length,
  * buffer address or i_size is not sector aligned.
  * NOTE(review): unlike the write path, f_flags is not
  * restored after the read. */
1160 if ((pos & (sector_size - 1)) ||
1161 (count & (sector_size - 1)) ||
1162 ((unsigned long)buf & (sector_size - 1)) ||
1163 (i_size_read(inode) & (sector_size -1))) {
1164 filp->f_flags &= ~O_DIRECT;
1165 }
1166 }
1167 }
1168#endif
1169
1170 /*
1171 * buffered reads protect themselves in ->readpage(). O_DIRECT reads
1172 * need locks to protect pending reads from racing with truncate.
1173 */
1174 if (filp->f_flags & O_DIRECT) {
1175 down_read(&inode->i_alloc_sem);
1176 have_alloc_sem = 1;
1177
1178 ret = ocfs2_rw_lock(inode, 0);
1179 if (ret < 0) {
1180 mlog_errno(ret);
1181 goto bail;
1182 }
1183 rw_level = 0;
1184 /* communicate with ocfs2_dio_end_io */
1185 ocfs2_iocb_set_rw_locked(iocb);
1186 }
1187
1188 ret = generic_file_aio_read(iocb, buf, count, iocb->ki_pos);
1189 if (ret == -EINVAL)
1190 mlog(ML_ERROR, "generic_file_aio_read returned -EINVAL\n");
1191
1192 /* buffered aio wouldn't have proper lock coverage today */
1193 BUG_ON(ret == -EIOCBQUEUED && !(filp->f_flags & O_DIRECT));
1194
1195 /* see ocfs2_file_aio_write */
1196 if (ret == -EIOCBQUEUED || !ocfs2_iocb_is_rw_locked(iocb)) {
1197 rw_level = -1;
1198 have_alloc_sem = 0;
1199 }
1200
 /* Common exit: release only what is still marked as held. */
1201bail:
1202 if (have_alloc_sem)
1203 up_read(&inode->i_alloc_sem);
1204 if (rw_level != -1)
1205 ocfs2_rw_unlock(inode, rw_level);
1206 mlog_exit(ret);
1207
1208 return ret;
1209}
1210
/*
 * Inode operations table that routes attribute changes and queries to
 * ocfs2_setattr() and ocfs2_getattr(). Presumably installed on regular
 * file inodes -- confirm against the inode setup code.
 */
1211struct inode_operations ocfs2_file_iops = {
1212 .setattr = ocfs2_setattr,
1213 .getattr = ocfs2_getattr,
1214};
1215
/*
 * Inode operations table with the same two handlers as
 * ocfs2_file_iops. Presumably installed on special (non-regular,
 * non-directory) inodes -- confirm against the inode setup code.
 */
1216struct inode_operations ocfs2_special_file_iops = {
1217 .setattr = ocfs2_setattr,
1218 .getattr = ocfs2_getattr,
1219};
1220
/*
 * File operations table built around the OCFS2 open/release, fsync,
 * mmap and aio read/write handlers. The synchronous read/write entries
 * use the generic do_sync_read()/do_sync_write() helpers, which
 * presumably dispatch to the aio handlers listed here -- confirm.
 */
1221struct file_operations ocfs2_fops = {
1222 .read = do_sync_read,
1223 .write = do_sync_write,
1224 .sendfile = generic_file_sendfile,
1225 .mmap = ocfs2_mmap,
1226 .fsync = ocfs2_sync_file,
1227 .release = ocfs2_file_release,
1228 .open = ocfs2_file_open,
1229 .aio_read = ocfs2_file_aio_read,
1230 .aio_write = ocfs2_file_aio_write,
1231};
1232
/*
 * File operations table providing readdir via ocfs2_readdir(), the
 * generic generic_read_dir() for read, and the same fsync handler as
 * ocfs2_fops. Presumably installed on directories -- confirm against
 * the inode setup code.
 */
1233struct file_operations ocfs2_dops = {
1234 .read = generic_read_dir,
1235 .readdir = ocfs2_readdir,
1236 .fsync = ocfs2_sync_file,
1237};
diff --git a/fs/ocfs2/file.h b/fs/ocfs2/file.h
new file mode 100644
index 000000000000..a5ea33b24060
--- /dev/null
+++ b/fs/ocfs2/file.h
@@ -0,0 +1,57 @@
1/* -*- mode: c; c-basic-offset: 8; -*-
2 * vim: noexpandtab sw=8 ts=8 sts=0:
3 *
4 * file.h
5 *
6 * Function prototypes
7 *
8 * Copyright (C) 2002, 2004 Oracle. All rights reserved.
9 *
10 * This program is free software; you can redistribute it and/or
11 * modify it under the terms of the GNU General Public
12 * License as published by the Free Software Foundation; either
13 * version 2 of the License, or (at your option) any later version.
14 *
15 * This program is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
18 * General Public License for more details.
19 *
20 * You should have received a copy of the GNU General Public
21 * License along with this program; if not, write to the
22 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
23 * Boston, MA 021110-1307, USA.
24 */
25
26#ifndef OCFS2_FILE_H
27#define OCFS2_FILE_H
28
29extern struct file_operations ocfs2_fops;
30extern struct file_operations ocfs2_dops;
31extern struct inode_operations ocfs2_file_iops;
32extern struct inode_operations ocfs2_special_file_iops;
33struct ocfs2_alloc_context;
34
35enum ocfs2_alloc_restarted {
36 RESTART_NONE = 0,
37 RESTART_TRANS,
38 RESTART_META
39};
40int ocfs2_do_extend_allocation(struct ocfs2_super *osb,
41 struct inode *inode,
42 u32 clusters_to_add,
43 struct buffer_head *fe_bh,
44 struct ocfs2_journal_handle *handle,
45 struct ocfs2_alloc_context *data_ac,
46 struct ocfs2_alloc_context *meta_ac,
47 enum ocfs2_alloc_restarted *reason);
48int ocfs2_setattr(struct dentry *dentry, struct iattr *attr);
49int ocfs2_getattr(struct vfsmount *mnt, struct dentry *dentry,
50 struct kstat *stat);
51
52int ocfs2_set_inode_size(struct ocfs2_journal_handle *handle,
53 struct inode *inode,
54 struct buffer_head *fe_bh,
55 u64 new_i_size);
56
57#endif /* OCFS2_FILE_H */
diff --git a/fs/ocfs2/heartbeat.c b/fs/ocfs2/heartbeat.c
new file mode 100644
index 000000000000..0bbd22f46c80
--- /dev/null
+++ b/fs/ocfs2/heartbeat.c
@@ -0,0 +1,378 @@
1/* -*- mode: c; c-basic-offset: 8; -*-
2 * vim: noexpandtab sw=8 ts=8 sts=0:
3 *
4 * heartbeat.c
5 *
6 * Register ourselves with the heartbeat service, keep our node maps
7 * up to date, and fire off recovery when needed.
8 *
9 * Copyright (C) 2002, 2004 Oracle. All rights reserved.
10 *
11 * This program is free software; you can redistribute it and/or
12 * modify it under the terms of the GNU General Public
13 * License as published by the Free Software Foundation; either
14 * version 2 of the License, or (at your option) any later version.
15 *
16 * This program is distributed in the hope that it will be useful,
17 * but WITHOUT ANY WARRANTY; without even the implied warranty of
18 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
19 * General Public License for more details.
20 *
21 * You should have received a copy of the GNU General Public
22 * License along with this program; if not, write to the
23 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
24 * Boston, MA 021110-1307, USA.
25 */
26
27#include <linux/fs.h>
28#include <linux/types.h>
29#include <linux/slab.h>
30#include <linux/highmem.h>
31#include <linux/kmod.h>
32
33#include <cluster/heartbeat.h>
34#include <cluster/nodemanager.h>
35
36#include <dlm/dlmapi.h>
37
38#define MLOG_MASK_PREFIX ML_SUPER
39#include <cluster/masklog.h>
40
41#include "ocfs2.h"
42
43#include "alloc.h"
44#include "heartbeat.h"
45#include "inode.h"
46#include "journal.h"
47#include "vote.h"
48
49#include "buffer_head_io.h"
50
/*
 * Priorities passed to o2hb_setup_callback() for our node-down and
 * node-up heartbeat callbacks; both use the same value.
 */
#define OCFS2_HB_NODE_DOWN_PRI	(0x0000002)
#define OCFS2_HB_NODE_UP_PRI	OCFS2_HB_NODE_DOWN_PRI
53
54static inline void __ocfs2_node_map_set_bit(struct ocfs2_node_map *map,
55 int bit);
56static inline void __ocfs2_node_map_clear_bit(struct ocfs2_node_map *map,
57 int bit);
58static inline int __ocfs2_node_map_is_empty(struct ocfs2_node_map *map);
59static void __ocfs2_node_map_dup(struct ocfs2_node_map *target,
60 struct ocfs2_node_map *from);
61static void __ocfs2_node_map_set(struct ocfs2_node_map *target,
62 struct ocfs2_node_map *from);
63
64void ocfs2_init_node_maps(struct ocfs2_super *osb)
65{
66 spin_lock_init(&osb->node_map_lock);
67 ocfs2_node_map_init(&osb->mounted_map);
68 ocfs2_node_map_init(&osb->recovery_map);
69 ocfs2_node_map_init(&osb->umount_map);
70}
71
72static void ocfs2_do_node_down(int node_num,
73 struct ocfs2_super *osb)
74{
75 BUG_ON(osb->node_num == node_num);
76
77 mlog(0, "ocfs2: node down event for %d\n", node_num);
78
79 if (!osb->dlm) {
80 /*
81 * No DLM means we're not even ready to participate yet.
82 * We check the slots after the DLM comes up, so we will
83 * notice the node death then. We can safely ignore it
84 * here.
85 */
86 return;
87 }
88
89 if (ocfs2_node_map_test_bit(osb, &osb->umount_map, node_num)) {
90 /* If a node is in the umount map, then we've been
91 * expecting him to go down and we know ahead of time
92 * that recovery is not necessary. */
93 ocfs2_node_map_clear_bit(osb, &osb->umount_map, node_num);
94 return;
95 }
96
97 ocfs2_recovery_thread(osb, node_num);
98
99 ocfs2_remove_node_from_vote_queues(osb, node_num);
100}
101
/*
 * Heartbeat node-down callback.  @data is the ocfs2_super that was
 * supplied to o2hb_setup_callback(); @node is unused.
 */
static void ocfs2_hb_node_down_cb(struct o2nm_node *node,
				  int node_num,
				  void *data)
{
	struct ocfs2_super *osb = (struct ocfs2_super *) data;

	ocfs2_do_node_down(node_num, osb);
}
108
109/* Called from the dlm when it's about to evict a node. We may also
110 * get a heartbeat callback later. */
111static void ocfs2_dlm_eviction_cb(int node_num,
112 void *data)
113{
114 struct ocfs2_super *osb = (struct ocfs2_super *) data;
115 struct super_block *sb = osb->sb;
116
117 mlog(ML_NOTICE, "device (%u,%u): dlm has evicted node %d\n",
118 MAJOR(sb->s_dev), MINOR(sb->s_dev), node_num);
119
120 ocfs2_do_node_down(node_num, osb);
121}
122
123static void ocfs2_hb_node_up_cb(struct o2nm_node *node,
124 int node_num,
125 void *data)
126{
127 struct ocfs2_super *osb = data;
128
129 BUG_ON(osb->node_num == node_num);
130
131 mlog(0, "node up event for %d\n", node_num);
132 ocfs2_node_map_clear_bit(osb, &osb->umount_map, node_num);
133}
134
135void ocfs2_setup_hb_callbacks(struct ocfs2_super *osb)
136{
137 o2hb_setup_callback(&osb->osb_hb_down, O2HB_NODE_DOWN_CB,
138 ocfs2_hb_node_down_cb, osb,
139 OCFS2_HB_NODE_DOWN_PRI);
140
141 o2hb_setup_callback(&osb->osb_hb_up, O2HB_NODE_UP_CB,
142 ocfs2_hb_node_up_cb, osb, OCFS2_HB_NODE_UP_PRI);
143
144 /* Not exactly a heartbeat callback, but leads to essentially
145 * the same path so we set it up here. */
146 dlm_setup_eviction_cb(&osb->osb_eviction_cb,
147 ocfs2_dlm_eviction_cb,
148 osb);
149}
150
151/* Most functions here are just stubs for now... */
152int ocfs2_register_hb_callbacks(struct ocfs2_super *osb)
153{
154 int status;
155
156 status = o2hb_register_callback(&osb->osb_hb_down);
157 if (status < 0) {
158 mlog_errno(status);
159 goto bail;
160 }
161
162 status = o2hb_register_callback(&osb->osb_hb_up);
163 if (status < 0)
164 mlog_errno(status);
165
166bail:
167 return status;
168}
169
170void ocfs2_clear_hb_callbacks(struct ocfs2_super *osb)
171{
172 int status;
173
174 status = o2hb_unregister_callback(&osb->osb_hb_down);
175 if (status < 0)
176 mlog_errno(status);
177
178 status = o2hb_unregister_callback(&osb->osb_hb_up);
179 if (status < 0)
180 mlog_errno(status);
181}
182
183void ocfs2_stop_heartbeat(struct ocfs2_super *osb)
184{
185 int ret;
186 char *argv[5], *envp[3];
187
188 if (!osb->uuid_str) {
189 /* This can happen if we don't get far enough in mount... */
190 mlog(0, "No UUID with which to stop heartbeat!\n\n");
191 return;
192 }
193
194 argv[0] = (char *)o2nm_get_hb_ctl_path();
195 argv[1] = "-K";
196 argv[2] = "-u";
197 argv[3] = osb->uuid_str;
198 argv[4] = NULL;
199
200 mlog(0, "Run: %s %s %s %s\n", argv[0], argv[1], argv[2], argv[3]);
201
202 /* minimal command environment taken from cpu_run_sbin_hotplug */
203 envp[0] = "HOME=/";
204 envp[1] = "PATH=/sbin:/bin:/usr/sbin:/usr/bin";
205 envp[2] = NULL;
206
207 ret = call_usermodehelper(argv[0], argv, envp, 1);
208 if (ret < 0)
209 mlog_errno(ret);
210}
211
212/* special case -1 for now
213 * TODO: should *really* make sure the calling func never passes -1!! */
214void ocfs2_node_map_init(struct ocfs2_node_map *map)
215{
216 map->num_nodes = OCFS2_NODE_MAP_MAX_NODES;
217 memset(map->map, 0, BITS_TO_LONGS(OCFS2_NODE_MAP_MAX_NODES) *
218 sizeof(unsigned long));
219}
220
221static inline void __ocfs2_node_map_set_bit(struct ocfs2_node_map *map,
222 int bit)
223{
224 set_bit(bit, map->map);
225}
226
227void ocfs2_node_map_set_bit(struct ocfs2_super *osb,
228 struct ocfs2_node_map *map,
229 int bit)
230{
231 if (bit==-1)
232 return;
233 BUG_ON(bit >= map->num_nodes);
234 spin_lock(&osb->node_map_lock);
235 __ocfs2_node_map_set_bit(map, bit);
236 spin_unlock(&osb->node_map_lock);
237}
238
239static inline void __ocfs2_node_map_clear_bit(struct ocfs2_node_map *map,
240 int bit)
241{
242 clear_bit(bit, map->map);
243}
244
245void ocfs2_node_map_clear_bit(struct ocfs2_super *osb,
246 struct ocfs2_node_map *map,
247 int bit)
248{
249 if (bit==-1)
250 return;
251 BUG_ON(bit >= map->num_nodes);
252 spin_lock(&osb->node_map_lock);
253 __ocfs2_node_map_clear_bit(map, bit);
254 spin_unlock(&osb->node_map_lock);
255}
256
257int ocfs2_node_map_test_bit(struct ocfs2_super *osb,
258 struct ocfs2_node_map *map,
259 int bit)
260{
261 int ret;
262 if (bit >= map->num_nodes) {
263 mlog(ML_ERROR, "bit=%d map->num_nodes=%d\n", bit, map->num_nodes);
264 BUG();
265 }
266 spin_lock(&osb->node_map_lock);
267 ret = test_bit(bit, map->map);
268 spin_unlock(&osb->node_map_lock);
269 return ret;
270}
271
272static inline int __ocfs2_node_map_is_empty(struct ocfs2_node_map *map)
273{
274 int bit;
275 bit = find_next_bit(map->map, map->num_nodes, 0);
276 if (bit < map->num_nodes)
277 return 0;
278 return 1;
279}
280
281int ocfs2_node_map_is_empty(struct ocfs2_super *osb,
282 struct ocfs2_node_map *map)
283{
284 int ret;
285 BUG_ON(map->num_nodes == 0);
286 spin_lock(&osb->node_map_lock);
287 ret = __ocfs2_node_map_is_empty(map);
288 spin_unlock(&osb->node_map_lock);
289 return ret;
290}
291
292static void __ocfs2_node_map_dup(struct ocfs2_node_map *target,
293 struct ocfs2_node_map *from)
294{
295 BUG_ON(from->num_nodes == 0);
296 ocfs2_node_map_init(target);
297 __ocfs2_node_map_set(target, from);
298}
299
300/* returns 1 if bit is the only bit set in target, 0 otherwise */
301int ocfs2_node_map_is_only(struct ocfs2_super *osb,
302 struct ocfs2_node_map *target,
303 int bit)
304{
305 struct ocfs2_node_map temp;
306 int ret;
307
308 spin_lock(&osb->node_map_lock);
309 __ocfs2_node_map_dup(&temp, target);
310 __ocfs2_node_map_clear_bit(&temp, bit);
311 ret = __ocfs2_node_map_is_empty(&temp);
312 spin_unlock(&osb->node_map_lock);
313
314 return ret;
315}
316
317static void __ocfs2_node_map_set(struct ocfs2_node_map *target,
318 struct ocfs2_node_map *from)
319{
320 int num_longs, i;
321
322 BUG_ON(target->num_nodes != from->num_nodes);
323 BUG_ON(target->num_nodes == 0);
324
325 num_longs = BITS_TO_LONGS(target->num_nodes);
326 for (i = 0; i < num_longs; i++)
327 target->map[i] = from->map[i];
328}
329
330/* Returns whether the recovery bit was actually set - it may not be
331 * if a node is still marked as needing recovery */
332int ocfs2_recovery_map_set(struct ocfs2_super *osb,
333 int num)
334{
335 int set = 0;
336
337 spin_lock(&osb->node_map_lock);
338
339 __ocfs2_node_map_clear_bit(&osb->mounted_map, num);
340
341 if (!test_bit(num, osb->recovery_map.map)) {
342 __ocfs2_node_map_set_bit(&osb->recovery_map, num);
343 set = 1;
344 }
345
346 spin_unlock(&osb->node_map_lock);
347
348 return set;
349}
350
351void ocfs2_recovery_map_clear(struct ocfs2_super *osb,
352 int num)
353{
354 ocfs2_node_map_clear_bit(osb, &osb->recovery_map, num);
355}
356
357int ocfs2_node_map_iterate(struct ocfs2_super *osb,
358 struct ocfs2_node_map *map,
359 int idx)
360{
361 int i = idx;
362
363 idx = O2NM_INVALID_NODE_NUM;
364 spin_lock(&osb->node_map_lock);
365 if ((i != O2NM_INVALID_NODE_NUM) &&
366 (i >= 0) &&
367 (i < map->num_nodes)) {
368 while(i < map->num_nodes) {
369 if (test_bit(i, map->map)) {
370 idx = i;
371 break;
372 }
373 i++;
374 }
375 }
376 spin_unlock(&osb->node_map_lock);
377 return idx;
378}
diff --git a/fs/ocfs2/heartbeat.h b/fs/ocfs2/heartbeat.h
new file mode 100644
index 000000000000..e8fb079122e4
--- /dev/null
+++ b/fs/ocfs2/heartbeat.h
@@ -0,0 +1,67 @@
1/* -*- mode: c; c-basic-offset: 8; -*-
2 * vim: noexpandtab sw=8 ts=8 sts=0:
3 *
4 * heartbeat.h
5 *
6 * Function prototypes
7 *
8 * Copyright (C) 2002, 2004 Oracle. All rights reserved.
9 *
10 * This program is free software; you can redistribute it and/or
11 * modify it under the terms of the GNU General Public
12 * License as published by the Free Software Foundation; either
13 * version 2 of the License, or (at your option) any later version.
14 *
15 * This program is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
18 * General Public License for more details.
19 *
20 * You should have received a copy of the GNU General Public
21 * License along with this program; if not, write to the
22 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
23 * Boston, MA 021110-1307, USA.
24 */
25
26#ifndef OCFS2_HEARTBEAT_H
27#define OCFS2_HEARTBEAT_H
28
void ocfs2_init_node_maps(struct ocfs2_super *osb);

/* heartbeat callback setup / registration / teardown */
void ocfs2_setup_hb_callbacks(struct ocfs2_super *osb);
int ocfs2_register_hb_callbacks(struct ocfs2_super *osb);
void ocfs2_clear_hb_callbacks(struct ocfs2_super *osb);
void ocfs2_stop_heartbeat(struct ocfs2_super *osb);

/*
 * Node map functions - used to keep track of mounted and in-recovery
 * nodes.  The functions taking an osb do their work under
 * osb->node_map_lock.
 */
void ocfs2_node_map_init(struct ocfs2_node_map *map);
int ocfs2_node_map_is_empty(struct ocfs2_super *osb,
			    struct ocfs2_node_map *map);
void ocfs2_node_map_set_bit(struct ocfs2_super *osb,
			    struct ocfs2_node_map *map,
			    int bit);
void ocfs2_node_map_clear_bit(struct ocfs2_super *osb,
			      struct ocfs2_node_map *map,
			      int bit);
int ocfs2_node_map_test_bit(struct ocfs2_super *osb,
			    struct ocfs2_node_map *map,
			    int bit);

/* first set bit at position idx or above, else O2NM_INVALID_NODE_NUM */
int ocfs2_node_map_iterate(struct ocfs2_super *osb,
			   struct ocfs2_node_map *map,
			   int idx);

/* Lowest set bit in @map, i.e. an iteration started at position 0. */
static inline int ocfs2_node_map_first_set_bit(struct ocfs2_super *osb,
					       struct ocfs2_node_map *map)
{
	const int start = 0;

	return ocfs2_node_map_iterate(osb, map, start);
}

int ocfs2_recovery_map_set(struct ocfs2_super *osb,
			   int num);
void ocfs2_recovery_map_clear(struct ocfs2_super *osb,
			      int num);

/* returns 1 if no bit other than 'bit' is set in target, 0 otherwise */
int ocfs2_node_map_is_only(struct ocfs2_super *osb,
			   struct ocfs2_node_map *target,
			   int bit);
66
67#endif /* OCFS2_HEARTBEAT_H */
diff --git a/fs/ocfs2/inode.c b/fs/ocfs2/inode.c
new file mode 100644
index 000000000000..a91ba4dec936
--- /dev/null
+++ b/fs/ocfs2/inode.c
@@ -0,0 +1,1140 @@
1/* -*- mode: c; c-basic-offset: 8; -*-
2 * vim: noexpandtab sw=8 ts=8 sts=0:
3 *
4 * inode.c
5 *
6 * vfs' aops, fops, dops and iops
7 *
8 * Copyright (C) 2002, 2004 Oracle. All rights reserved.
9 *
10 * This program is free software; you can redistribute it and/or
11 * modify it under the terms of the GNU General Public
12 * License as published by the Free Software Foundation; either
13 * version 2 of the License, or (at your option) any later version.
14 *
15 * This program is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
18 * General Public License for more details.
19 *
20 * You should have received a copy of the GNU General Public
21 * License along with this program; if not, write to the
22 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
23 * Boston, MA 021110-1307, USA.
24 */
25
26#include <linux/fs.h>
27#include <linux/types.h>
28#include <linux/slab.h>
29#include <linux/highmem.h>
30#include <linux/pagemap.h>
31#include <linux/smp_lock.h>
32
33#include <asm/byteorder.h>
34
35#define MLOG_MASK_PREFIX ML_INODE
36#include <cluster/masklog.h>
37
38#include "ocfs2.h"
39
40#include "alloc.h"
41#include "dlmglue.h"
42#include "extent_map.h"
43#include "file.h"
44#include "inode.h"
45#include "journal.h"
46#include "namei.h"
47#include "suballoc.h"
48#include "super.h"
49#include "symlink.h"
50#include "sysfile.h"
51#include "uptodate.h"
52#include "vote.h"
53
54#include "buffer_head_io.h"
55
56#define OCFS2_FI_FLAG_NOWAIT 0x1
57#define OCFS2_FI_FLAG_DELETE 0x2
58struct ocfs2_find_inode_args
59{
60 u64 fi_blkno;
61 unsigned long fi_ino;
62 unsigned int fi_flags;
63};
64
65static int ocfs2_read_locked_inode(struct inode *inode,
66 struct ocfs2_find_inode_args *args);
67static int ocfs2_init_locked_inode(struct inode *inode, void *opaque);
68static int ocfs2_find_actor(struct inode *inode, void *opaque);
69static int ocfs2_truncate_for_delete(struct ocfs2_super *osb,
70 struct inode *inode,
71 struct buffer_head *fe_bh);
72
73struct inode *ocfs2_ilookup_for_vote(struct ocfs2_super *osb,
74 u64 blkno,
75 int delete_vote)
76{
77 struct ocfs2_find_inode_args args;
78
79 /* ocfs2_ilookup_for_vote should *only* be called from the
80 * vote thread */
81 BUG_ON(current != osb->vote_task);
82
83 args.fi_blkno = blkno;
84 args.fi_flags = OCFS2_FI_FLAG_NOWAIT;
85 if (delete_vote)
86 args.fi_flags |= OCFS2_FI_FLAG_DELETE;
87 args.fi_ino = ino_from_blkno(osb->sb, blkno);
88 return ilookup5(osb->sb, args.fi_ino, ocfs2_find_actor, &args);
89}
90
91struct inode *ocfs2_iget(struct ocfs2_super *osb, u64 blkno)
92{
93 struct inode *inode = NULL;
94 struct super_block *sb = osb->sb;
95 struct ocfs2_find_inode_args args;
96
97 mlog_entry("(blkno = %"MLFu64")\n", blkno);
98
99 /* Ok. By now we've either got the offsets passed to us by the
100 * caller, or we just pulled them off the bh. Lets do some
101 * sanity checks to make sure they're OK. */
102 if (blkno == 0) {
103 inode = ERR_PTR(-EINVAL);
104 mlog_errno(PTR_ERR(inode));
105 goto bail;
106 }
107
108 args.fi_blkno = blkno;
109 args.fi_flags = 0;
110 args.fi_ino = ino_from_blkno(sb, blkno);
111
112 inode = iget5_locked(sb, args.fi_ino, ocfs2_find_actor,
113 ocfs2_init_locked_inode, &args);
114 /* inode was *not* in the inode cache. 2.6.x requires
115 * us to do our own read_inode call and unlock it
116 * afterwards. */
117 if (inode && inode->i_state & I_NEW) {
118 mlog(0, "Inode was not in inode cache, reading it.\n");
119 ocfs2_read_locked_inode(inode, &args);
120 unlock_new_inode(inode);
121 }
122 if (inode == NULL) {
123 inode = ERR_PTR(-ENOMEM);
124 mlog_errno(PTR_ERR(inode));
125 goto bail;
126 }
127 if (is_bad_inode(inode)) {
128 iput(inode);
129 inode = ERR_PTR(-ESTALE);
130 mlog_errno(PTR_ERR(inode));
131 goto bail;
132 }
133
134bail:
135 if (!IS_ERR(inode)) {
136 mlog(0, "returning inode with number %"MLFu64"\n",
137 OCFS2_I(inode)->ip_blkno);
138 mlog_exit_ptr(inode);
139 } else
140 mlog_errno(PTR_ERR(inode));
141
142 return inode;
143}
144
145
146/*
147 * here's how inodes get read from disk:
148 * iget5_locked -> find_actor -> OCFS2_FIND_ACTOR
149 * found? : return the in-memory inode
150 * not found? : get_new_inode -> OCFS2_INIT_LOCKED_INODE
151 */
152
153static int ocfs2_find_actor(struct inode *inode, void *opaque)
154{
155 struct ocfs2_find_inode_args *args = NULL;
156 struct ocfs2_inode_info *oi = OCFS2_I(inode);
157 int ret = 0;
158
159 mlog_entry("(0x%p, %lu, 0x%p)\n", inode, inode->i_ino, opaque);
160
161 args = opaque;
162
163 mlog_bug_on_msg(!inode, "No inode in find actor!\n");
164
165 if (oi->ip_blkno != args->fi_blkno)
166 goto bail;
167
168 /* OCFS2_FI_FLAG_NOWAIT is *only* set from
169 * ocfs2_ilookup_for_vote which won't create an inode for one
170 * that isn't found. The vote thread which doesn't want to get
171 * an inode which is in the process of going away - otherwise
172 * the call to __wait_on_freeing_inode in find_inode_fast will
173 * cause it to deadlock on an inode which may be waiting on a
174 * vote (or lock release) in delete_inode */
175 if ((args->fi_flags & OCFS2_FI_FLAG_NOWAIT) &&
176 (inode->i_state & (I_FREEING|I_CLEAR))) {
177 /* As stated above, we're not going to return an
178 * inode. In the case of a delete vote, the voting
179 * code is going to signal the other node to go
180 * ahead. Mark that state here, so this freeing inode
181 * has the state when it gets to delete_inode. */
182 if (args->fi_flags & OCFS2_FI_FLAG_DELETE) {
183 spin_lock(&oi->ip_lock);
184 ocfs2_mark_inode_remotely_deleted(inode);
185 spin_unlock(&oi->ip_lock);
186 }
187 goto bail;
188 }
189
190 ret = 1;
191bail:
192 mlog_exit(ret);
193 return ret;
194}
195
196/*
197 * initialize the new inode, but don't do anything that would cause
198 * us to sleep.
199 * return 0 on success, 1 on failure
200 */
201static int ocfs2_init_locked_inode(struct inode *inode, void *opaque)
202{
203 struct ocfs2_find_inode_args *args = opaque;
204
205 mlog_entry("inode = %p, opaque = %p\n", inode, opaque);
206
207 inode->i_ino = args->fi_ino;
208 OCFS2_I(inode)->ip_blkno = args->fi_blkno;
209
210 mlog_exit(0);
211 return 0;
212}
213
214int ocfs2_populate_inode(struct inode *inode, struct ocfs2_dinode *fe,
215 int create_ino)
216{
217 struct super_block *sb;
218 struct ocfs2_super *osb;
219 int status = -EINVAL;
220
221 mlog_entry("(0x%p, size:%"MLFu64")\n", inode, fe->i_size);
222
223 sb = inode->i_sb;
224 osb = OCFS2_SB(sb);
225
226 /* this means that read_inode cannot create a superblock inode
227 * today. change if needed. */
228 if (!OCFS2_IS_VALID_DINODE(fe) ||
229 !(fe->i_flags & cpu_to_le32(OCFS2_VALID_FL))) {
230 mlog(ML_ERROR, "Invalid dinode: i_ino=%lu, i_blkno=%"MLFu64", "
231 "signature = %.*s, flags = 0x%x\n",
232 inode->i_ino, le64_to_cpu(fe->i_blkno), 7,
233 fe->i_signature, le32_to_cpu(fe->i_flags));
234 goto bail;
235 }
236
237 if (le32_to_cpu(fe->i_fs_generation) != osb->fs_generation) {
238 mlog(ML_ERROR, "file entry generation does not match "
239 "superblock! osb->fs_generation=%x, "
240 "fe->i_fs_generation=%x\n",
241 osb->fs_generation, le32_to_cpu(fe->i_fs_generation));
242 goto bail;
243 }
244
245 inode->i_version = 1;
246 inode->i_generation = le32_to_cpu(fe->i_generation);
247 inode->i_rdev = huge_decode_dev(le64_to_cpu(fe->id1.dev1.i_rdev));
248 inode->i_mode = le16_to_cpu(fe->i_mode);
249 inode->i_uid = le32_to_cpu(fe->i_uid);
250 inode->i_gid = le32_to_cpu(fe->i_gid);
251 inode->i_blksize = (u32)osb->s_clustersize;
252
253 /* Fast symlinks will have i_size but no allocated clusters. */
254 if (S_ISLNK(inode->i_mode) && !fe->i_clusters)
255 inode->i_blocks = 0;
256 else
257 inode->i_blocks =
258 ocfs2_align_bytes_to_sectors(le64_to_cpu(fe->i_size));
259 inode->i_mapping->a_ops = &ocfs2_aops;
260 inode->i_flags |= S_NOATIME;
261 inode->i_atime.tv_sec = le64_to_cpu(fe->i_atime);
262 inode->i_atime.tv_nsec = le32_to_cpu(fe->i_atime_nsec);
263 inode->i_mtime.tv_sec = le64_to_cpu(fe->i_mtime);
264 inode->i_mtime.tv_nsec = le32_to_cpu(fe->i_mtime_nsec);
265 inode->i_ctime.tv_sec = le64_to_cpu(fe->i_ctime);
266 inode->i_ctime.tv_nsec = le32_to_cpu(fe->i_ctime_nsec);
267
268 if (OCFS2_I(inode)->ip_blkno != le64_to_cpu(fe->i_blkno))
269 mlog(ML_ERROR,
270 "ip_blkno %"MLFu64" != i_blkno %"MLFu64"!\n",
271 OCFS2_I(inode)->ip_blkno, fe->i_blkno);
272
273 OCFS2_I(inode)->ip_clusters = le32_to_cpu(fe->i_clusters);
274 OCFS2_I(inode)->ip_orphaned_slot = OCFS2_INVALID_SLOT;
275
276 if (create_ino)
277 inode->i_ino = ino_from_blkno(inode->i_sb,
278 le64_to_cpu(fe->i_blkno));
279
280 mlog(0, "blkno = %"MLFu64", ino = %lu, create_ino = %s\n",
281 fe->i_blkno, inode->i_ino, create_ino ? "true" : "false");
282
283 inode->i_nlink = le16_to_cpu(fe->i_links_count);
284
285 if (fe->i_flags & cpu_to_le32(OCFS2_LOCAL_ALLOC_FL)) {
286 OCFS2_I(inode)->ip_flags |= OCFS2_INODE_BITMAP;
287 mlog(0, "local alloc inode: i_ino=%lu\n", inode->i_ino);
288 } else if (fe->i_flags & cpu_to_le32(OCFS2_BITMAP_FL)) {
289 OCFS2_I(inode)->ip_flags |= OCFS2_INODE_BITMAP;
290 } else if (fe->i_flags & cpu_to_le32(OCFS2_SUPER_BLOCK_FL)) {
291 mlog(0, "superblock inode: i_ino=%lu\n", inode->i_ino);
292 /* we can't actually hit this as read_inode can't
293 * handle superblocks today ;-) */
294 BUG();
295 }
296
297 switch (inode->i_mode & S_IFMT) {
298 case S_IFREG:
299 inode->i_fop = &ocfs2_fops;
300 inode->i_op = &ocfs2_file_iops;
301 i_size_write(inode, le64_to_cpu(fe->i_size));
302 break;
303 case S_IFDIR:
304 inode->i_op = &ocfs2_dir_iops;
305 inode->i_fop = &ocfs2_dops;
306 i_size_write(inode, le64_to_cpu(fe->i_size));
307 break;
308 case S_IFLNK:
309 if (ocfs2_inode_is_fast_symlink(inode))
310 inode->i_op = &ocfs2_fast_symlink_inode_operations;
311 else
312 inode->i_op = &ocfs2_symlink_inode_operations;
313 i_size_write(inode, le64_to_cpu(fe->i_size));
314 break;
315 default:
316 inode->i_op = &ocfs2_special_file_iops;
317 init_special_inode(inode, inode->i_mode,
318 inode->i_rdev);
319 break;
320 }
321
322 ocfs2_inode_lock_res_init(&OCFS2_I(inode)->ip_rw_lockres,
323 OCFS2_LOCK_TYPE_RW, inode);
324 ocfs2_inode_lock_res_init(&OCFS2_I(inode)->ip_meta_lockres,
325 OCFS2_LOCK_TYPE_META, inode);
326 ocfs2_inode_lock_res_init(&OCFS2_I(inode)->ip_data_lockres,
327 OCFS2_LOCK_TYPE_DATA, inode);
328
329 status = 0;
330bail:
331 mlog_exit(status);
332 return status;
333}
334
335static int ocfs2_read_locked_inode(struct inode *inode,
336 struct ocfs2_find_inode_args *args)
337{
338 struct super_block *sb;
339 struct ocfs2_super *osb;
340 struct ocfs2_dinode *fe;
341 struct buffer_head *bh = NULL;
342 int status;
343 int sysfile = 0;
344
345 mlog_entry("(0x%p, 0x%p)\n", inode, args);
346
347 status = -EINVAL;
348 if (inode == NULL || inode->i_sb == NULL) {
349 mlog(ML_ERROR, "bad inode\n");
350 goto bail;
351 }
352 sb = inode->i_sb;
353 osb = OCFS2_SB(sb);
354
355 if (!args) {
356 mlog(ML_ERROR, "bad inode args\n");
357 make_bad_inode(inode);
358 goto bail;
359 }
360
361 /* Read the FE off disk. This is safe because the kernel only
362 * does one read_inode2 for a new inode, and if it doesn't
363 * exist yet then nobody can be working on it! */
364 status = ocfs2_read_block(osb, args->fi_blkno, &bh, 0, NULL);
365 if (status < 0) {
366 mlog_errno(status);
367 make_bad_inode(inode);
368 goto bail;
369 }
370
371 fe = (struct ocfs2_dinode *) bh->b_data;
372 if (!OCFS2_IS_VALID_DINODE(fe)) {
373 mlog(ML_ERROR, "Invalid dinode #%"MLFu64": signature = %.*s\n",
374 fe->i_blkno, 7, fe->i_signature);
375 make_bad_inode(inode);
376 goto bail;
377 }
378
379 if (fe->i_flags & cpu_to_le32(OCFS2_SYSTEM_FL))
380 sysfile = 1;
381
382 if (S_ISCHR(le16_to_cpu(fe->i_mode)) ||
383 S_ISBLK(le16_to_cpu(fe->i_mode)))
384 inode->i_rdev = huge_decode_dev(le64_to_cpu(fe->id1.dev1.i_rdev));
385
386 status = -EINVAL;
387 if (ocfs2_populate_inode(inode, fe, 0) < 0) {
388 mlog(ML_ERROR, "populate inode failed! i_blkno=%"MLFu64", "
389 "i_ino=%lu\n", fe->i_blkno, inode->i_ino);
390 make_bad_inode(inode);
391 goto bail;
392 }
393
394 BUG_ON(args->fi_blkno != le64_to_cpu(fe->i_blkno));
395
396 if (sysfile)
397 OCFS2_I(inode)->ip_flags |= OCFS2_INODE_SYSTEM_FILE;
398
399 status = 0;
400
401bail:
402 if (args && bh)
403 brelse(bh);
404
405 mlog_exit(status);
406 return status;
407}
408
409void ocfs2_sync_blockdev(struct super_block *sb)
410{
411 sync_blockdev(sb->s_bdev);
412}
413
414static int ocfs2_truncate_for_delete(struct ocfs2_super *osb,
415 struct inode *inode,
416 struct buffer_head *fe_bh)
417{
418 int status = 0;
419 struct ocfs2_journal_handle *handle = NULL;
420 struct ocfs2_truncate_context *tc = NULL;
421 struct ocfs2_dinode *fe;
422
423 mlog_entry_void();
424
425 fe = (struct ocfs2_dinode *) fe_bh->b_data;
426
427 /* zero allocation, zero truncate :) */
428 if (!fe->i_clusters)
429 goto bail;
430
431 handle = ocfs2_start_trans(osb, handle, OCFS2_INODE_UPDATE_CREDITS);
432 if (IS_ERR(handle)) {
433 status = PTR_ERR(handle);
434 handle = NULL;
435 mlog_errno(status);
436 goto bail;
437 }
438
439 status = ocfs2_set_inode_size(handle, inode, fe_bh, 0ULL);
440 if (status < 0) {
441 mlog_errno(status);
442 goto bail;
443 }
444
445 ocfs2_commit_trans(handle);
446 handle = NULL;
447
448 status = ocfs2_prepare_truncate(osb, inode, fe_bh, &tc);
449 if (status < 0) {
450 mlog_errno(status);
451 goto bail;
452 }
453
454 status = ocfs2_commit_truncate(osb, inode, fe_bh, tc);
455 if (status < 0) {
456 mlog_errno(status);
457 goto bail;
458 }
459bail:
460 if (handle)
461 ocfs2_commit_trans(handle);
462
463 mlog_exit(status);
464 return status;
465}
466
467static int ocfs2_remove_inode(struct inode *inode,
468 struct buffer_head *di_bh,
469 struct inode *orphan_dir_inode,
470 struct buffer_head *orphan_dir_bh)
471{
472 int status;
473 struct inode *inode_alloc_inode = NULL;
474 struct buffer_head *inode_alloc_bh = NULL;
475 struct ocfs2_journal_handle *handle;
476 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
477 struct ocfs2_dinode *di = (struct ocfs2_dinode *) di_bh->b_data;
478
479 inode_alloc_inode =
480 ocfs2_get_system_file_inode(osb, INODE_ALLOC_SYSTEM_INODE,
481 le16_to_cpu(di->i_suballoc_slot));
482 if (!inode_alloc_inode) {
483 status = -EEXIST;
484 mlog_errno(status);
485 goto bail;
486 }
487
488 down(&inode_alloc_inode->i_sem);
489 status = ocfs2_meta_lock(inode_alloc_inode, NULL, &inode_alloc_bh, 1);
490 if (status < 0) {
491 up(&inode_alloc_inode->i_sem);
492
493 mlog_errno(status);
494 goto bail;
495 }
496
497 handle = ocfs2_start_trans(osb, NULL, OCFS2_DELETE_INODE_CREDITS);
498 if (IS_ERR(handle)) {
499 status = PTR_ERR(handle);
500 mlog_errno(status);
501 goto bail_unlock;
502 }
503
504 status = ocfs2_orphan_del(osb, handle, orphan_dir_inode, inode,
505 orphan_dir_bh);
506 if (status < 0) {
507 mlog_errno(status);
508 goto bail_commit;
509 }
510
511 /* set the inodes dtime */
512 status = ocfs2_journal_access(handle, inode, di_bh,
513 OCFS2_JOURNAL_ACCESS_WRITE);
514 if (status < 0) {
515 mlog_errno(status);
516 goto bail_commit;
517 }
518
519 di->i_dtime = cpu_to_le64(CURRENT_TIME.tv_sec);
520 le32_and_cpu(&di->i_flags, ~(OCFS2_VALID_FL | OCFS2_ORPHANED_FL));
521
522 status = ocfs2_journal_dirty(handle, di_bh);
523 if (status < 0) {
524 mlog_errno(status);
525 goto bail_commit;
526 }
527
528 ocfs2_remove_from_cache(inode, di_bh);
529
530 status = ocfs2_free_dinode(handle, inode_alloc_inode,
531 inode_alloc_bh, di);
532 if (status < 0)
533 mlog_errno(status);
534
535bail_commit:
536 ocfs2_commit_trans(handle);
537bail_unlock:
538 ocfs2_meta_unlock(inode_alloc_inode, 1);
539 up(&inode_alloc_inode->i_sem);
540 brelse(inode_alloc_bh);
541bail:
542 iput(inode_alloc_inode);
543
544 return status;
545}
546
547static int ocfs2_wipe_inode(struct inode *inode,
548 struct buffer_head *di_bh)
549{
550 int status, orphaned_slot;
551 struct inode *orphan_dir_inode = NULL;
552 struct buffer_head *orphan_dir_bh = NULL;
553 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
554
555 /* We've already voted on this so it should be readonly - no
556 * spinlock needed. */
557 orphaned_slot = OCFS2_I(inode)->ip_orphaned_slot;
558 orphan_dir_inode = ocfs2_get_system_file_inode(osb,
559 ORPHAN_DIR_SYSTEM_INODE,
560 orphaned_slot);
561 if (!orphan_dir_inode) {
562 status = -EEXIST;
563 mlog_errno(status);
564 goto bail;
565 }
566
567 /* Lock the orphan dir. The lock will be held for the entire
568 * delete_inode operation. We do this now to avoid races with
569 * recovery completion on other nodes. */
570 down(&orphan_dir_inode->i_sem);
571 status = ocfs2_meta_lock(orphan_dir_inode, NULL, &orphan_dir_bh, 1);
572 if (status < 0) {
573 up(&orphan_dir_inode->i_sem);
574
575 mlog_errno(status);
576 goto bail;
577 }
578
579 /* we do this while holding the orphan dir lock because we
580 * don't want recovery being run from another node to vote for
581 * an inode delete on us -- this will result in two nodes
582 * truncating the same file! */
583 status = ocfs2_truncate_for_delete(osb, inode, di_bh);
584 if (status < 0) {
585 mlog_errno(status);
586 goto bail_unlock_dir;
587 }
588
589 status = ocfs2_remove_inode(inode, di_bh, orphan_dir_inode,
590 orphan_dir_bh);
591 if (status < 0)
592 mlog_errno(status);
593
594bail_unlock_dir:
595 ocfs2_meta_unlock(orphan_dir_inode, 1);
596 up(&orphan_dir_inode->i_sem);
597 brelse(orphan_dir_bh);
598bail:
599 iput(orphan_dir_inode);
600
601 return status;
602}
603
604/* There is a series of simple checks that should be done before a
605 * vote is even considered. Encapsulate those in this function. */
606static int ocfs2_inode_is_valid_to_delete(struct inode *inode)
607{
608 int ret = 0;
609 struct ocfs2_inode_info *oi = OCFS2_I(inode);
610 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
611
612 /* We shouldn't be getting here for the root directory
613 * inode.. */
614 if (inode == osb->root_inode) {
615 mlog(ML_ERROR, "Skipping delete of root inode.\n");
616 goto bail;
617 }
618
619 /* If we're coming from process_vote we can't go into our own
620 * voting [hello, deadlock city!], so unforuntately we just
621 * have to skip deleting this guy. That's OK though because
622 * the node who's doing the actual deleting should handle it
623 * anyway. */
624 if (current == osb->vote_task) {
625 mlog(0, "Skipping delete of %lu because we're currently "
626 "in process_vote\n", inode->i_ino);
627 goto bail;
628 }
629
630 spin_lock(&oi->ip_lock);
631 /* OCFS2 *never* deletes system files. This should technically
632 * never get here as system file inodes should always have a
633 * positive link count. */
634 if (oi->ip_flags & OCFS2_INODE_SYSTEM_FILE) {
635 mlog(ML_ERROR, "Skipping delete of system file %"MLFu64".\n",
636 oi->ip_blkno);
637 goto bail_unlock;
638 }
639
640 /* If we have voted "yes" on the wipe of this inode for
641 * another node, it will be marked here so we can safely skip
642 * it. Recovery will cleanup any inodes we might inadvertantly
643 * skip here. */
644 if (oi->ip_flags & OCFS2_INODE_SKIP_DELETE) {
645 mlog(0, "Skipping delete of %lu because another node "
646 "has done this for us.\n", inode->i_ino);
647 goto bail_unlock;
648 }
649
650 ret = 1;
651bail_unlock:
652 spin_unlock(&oi->ip_lock);
653bail:
654 return ret;
655}
656
657/* Query the cluster to determine whether we should wipe an inode from
658 * disk or not.
659 *
660 * Requires the inode to have the cluster lock. */
661static int ocfs2_query_inode_wipe(struct inode *inode,
662 struct buffer_head *di_bh,
663 int *wipe)
664{
665 int status = 0;
666 struct ocfs2_inode_info *oi = OCFS2_I(inode);
667 struct ocfs2_dinode *di;
668
669 *wipe = 0;
670
671 /* While we were waiting for the cluster lock in
672 * ocfs2_delete_inode, another node might have asked to delete
673 * the inode. Recheck our flags to catch this. */
674 if (!ocfs2_inode_is_valid_to_delete(inode)) {
675 mlog(0, "Skipping delete of %"MLFu64" because flags changed\n",
676 oi->ip_blkno);
677 goto bail;
678 }
679
680 /* Now that we have an up to date inode, we can double check
681 * the link count. */
682 if (inode->i_nlink) {
683 mlog(0, "Skipping delete of %"MLFu64" because nlink = %u\n",
684 oi->ip_blkno, inode->i_nlink);
685 goto bail;
686 }
687
688 /* Do some basic inode verification... */
689 di = (struct ocfs2_dinode *) di_bh->b_data;
690 if (!(di->i_flags & cpu_to_le32(OCFS2_ORPHANED_FL))) {
691 /* for lack of a better error? */
692 status = -EEXIST;
693 mlog(ML_ERROR,
694 "Inode %"MLFu64" (on-disk %"MLFu64") not orphaned! "
695 "Disk flags 0x%x, inode flags 0x%x\n",
696 oi->ip_blkno, di->i_blkno, di->i_flags, oi->ip_flags);
697 goto bail;
698 }
699
700 /* has someone already deleted us?! baaad... */
701 if (di->i_dtime) {
702 status = -EEXIST;
703 mlog_errno(status);
704 goto bail;
705 }
706
707 status = ocfs2_request_delete_vote(inode);
708 /* -EBUSY means that other nodes are still using the
709 * inode. We're done here though, so avoid doing anything on
710 * disk and let them worry about deleting it. */
711 if (status == -EBUSY) {
712 status = 0;
713 mlog(0, "Skipping delete of %"MLFu64" because it is in use on"
714 "other nodes\n", oi->ip_blkno);
715 goto bail;
716 }
717 if (status < 0) {
718 mlog_errno(status);
719 goto bail;
720 }
721
722 spin_lock(&oi->ip_lock);
723 if (oi->ip_orphaned_slot == OCFS2_INVALID_SLOT) {
724 /* Nobody knew which slot this inode was orphaned
725 * into. This may happen during node death and
726 * recovery knows how to clean it up so we can safely
727 * ignore this inode for now on. */
728 mlog(0, "Nobody knew where inode %"MLFu64" was orphaned!\n",
729 oi->ip_blkno);
730 } else {
731 *wipe = 1;
732
733 mlog(0, "Inode %"MLFu64" is ok to wipe from orphan dir %d\n",
734 oi->ip_blkno, oi->ip_orphaned_slot);
735 }
736 spin_unlock(&oi->ip_lock);
737
738bail:
739 return status;
740}
741
742/* Support function for ocfs2_delete_inode. Will help us keep the
743 * inode data in a consistent state for clear_inode. Always truncates
744 * pages, optionally sync's them first. */
745static void ocfs2_cleanup_delete_inode(struct inode *inode,
746 int sync_data)
747{
748 mlog(0, "Cleanup inode %"MLFu64", sync = %d\n",
749 OCFS2_I(inode)->ip_blkno, sync_data);
750 if (sync_data)
751 write_inode_now(inode, 1);
752 truncate_inode_pages(&inode->i_data, 0);
753}
754
755void ocfs2_delete_inode(struct inode *inode)
756{
757 int wipe, status;
758 sigset_t blocked, oldset;
759 struct buffer_head *di_bh = NULL;
760
761 mlog_entry("(inode->i_ino = %lu)\n", inode->i_ino);
762
763 if (is_bad_inode(inode)) {
764 mlog(0, "Skipping delete of bad inode\n");
765 goto bail;
766 }
767
768 if (!ocfs2_inode_is_valid_to_delete(inode)) {
769 /* It's probably not necessary to truncate_inode_pages
770 * here but we do it for safety anyway (it will most
771 * likely be a no-op anyway) */
772 ocfs2_cleanup_delete_inode(inode, 0);
773 goto bail;
774 }
775
776 /* We want to block signals in delete_inode as the lock and
777 * messaging paths may return us -ERESTARTSYS. Which would
778 * cause us to exit early, resulting in inodes being orphaned
779 * forever. */
780 sigfillset(&blocked);
781 status = sigprocmask(SIG_BLOCK, &blocked, &oldset);
782 if (status < 0) {
783 mlog_errno(status);
784 ocfs2_cleanup_delete_inode(inode, 1);
785 goto bail;
786 }
787
788 /* Lock down the inode. This gives us an up to date view of
789 * it's metadata (for verification), and allows us to
790 * serialize delete_inode votes.
791 *
792 * Even though we might be doing a truncate, we don't take the
793 * allocation lock here as it won't be needed - nobody will
794 * have the file open.
795 */
796 status = ocfs2_meta_lock(inode, NULL, &di_bh, 1);
797 if (status < 0) {
798 if (status != -ENOENT)
799 mlog_errno(status);
800 ocfs2_cleanup_delete_inode(inode, 0);
801 goto bail_unblock;
802 }
803
804 /* Query the cluster. This will be the final decision made
805 * before we go ahead and wipe the inode. */
806 status = ocfs2_query_inode_wipe(inode, di_bh, &wipe);
807 if (!wipe || status < 0) {
808 /* Error and inode busy vote both mean we won't be
809 * removing the inode, so they take almost the same
810 * path. */
811 if (status < 0)
812 mlog_errno(status);
813
814 /* Someone in the cluster has voted to not wipe this
815 * inode, or it was never completely orphaned. Write
816 * out the pages and exit now. */
817 ocfs2_cleanup_delete_inode(inode, 1);
818 goto bail_unlock_inode;
819 }
820
821 ocfs2_cleanup_delete_inode(inode, 0);
822
823 status = ocfs2_wipe_inode(inode, di_bh);
824 if (status < 0) {
825 mlog_errno(status);
826 goto bail_unlock_inode;
827 }
828
829 /* Mark the inode as successfully deleted. This is important
830 * for ocfs2_clear_inode as it will check this flag and skip
831 * any checkpointing work */
832 OCFS2_I(inode)->ip_flags |= OCFS2_INODE_DELETED;
833
834bail_unlock_inode:
835 ocfs2_meta_unlock(inode, 1);
836 brelse(di_bh);
837bail_unblock:
838 status = sigprocmask(SIG_SETMASK, &oldset, NULL);
839 if (status < 0)
840 mlog_errno(status);
841bail:
842 clear_inode(inode);
843 mlog_exit_void();
844}
845
846void ocfs2_clear_inode(struct inode *inode)
847{
848 int status;
849 struct ocfs2_inode_info *oi = OCFS2_I(inode);
850
851 mlog_entry_void();
852
853 if (!inode)
854 goto bail;
855
856 mlog(0, "Clearing inode: %"MLFu64", nlink = %u\n",
857 OCFS2_I(inode)->ip_blkno, inode->i_nlink);
858
859 mlog_bug_on_msg(OCFS2_SB(inode->i_sb) == NULL,
860 "Inode=%lu\n", inode->i_ino);
861
862 /* Do these before all the other work so that we don't bounce
863 * the vote thread while waiting to destroy the locks. */
864 ocfs2_mark_lockres_freeing(&oi->ip_rw_lockres);
865 ocfs2_mark_lockres_freeing(&oi->ip_meta_lockres);
866 ocfs2_mark_lockres_freeing(&oi->ip_data_lockres);
867
868 /* We very well may get a clear_inode before all an inodes
869 * metadata has hit disk. Of course, we can't drop any cluster
870 * locks until the journal has finished with it. The only
871 * exception here are successfully wiped inodes - their
872 * metadata can now be considered to be part of the system
873 * inodes from which it came. */
874 if (!(OCFS2_I(inode)->ip_flags & OCFS2_INODE_DELETED))
875 ocfs2_checkpoint_inode(inode);
876
877 mlog_bug_on_msg(!list_empty(&oi->ip_io_markers),
878 "Clear inode of %"MLFu64", inode has io markers\n",
879 oi->ip_blkno);
880
881 ocfs2_extent_map_drop(inode, 0);
882 ocfs2_extent_map_init(inode);
883
884 status = ocfs2_drop_inode_locks(inode);
885 if (status < 0)
886 mlog_errno(status);
887
888 ocfs2_lock_res_free(&oi->ip_rw_lockres);
889 ocfs2_lock_res_free(&oi->ip_meta_lockres);
890 ocfs2_lock_res_free(&oi->ip_data_lockres);
891
892 ocfs2_metadata_cache_purge(inode);
893
894 mlog_bug_on_msg(oi->ip_metadata_cache.ci_num_cached,
895 "Clear inode of %"MLFu64", inode has %u cache items\n",
896 oi->ip_blkno, oi->ip_metadata_cache.ci_num_cached);
897
898 mlog_bug_on_msg(!(oi->ip_flags & OCFS2_INODE_CACHE_INLINE),
899 "Clear inode of %"MLFu64", inode has a bad flag\n",
900 oi->ip_blkno);
901
902 mlog_bug_on_msg(spin_is_locked(&oi->ip_lock),
903 "Clear inode of %"MLFu64", inode is locked\n",
904 oi->ip_blkno);
905
906 mlog_bug_on_msg(down_trylock(&oi->ip_io_sem),
907 "Clear inode of %"MLFu64", io_sem is locked\n",
908 oi->ip_blkno);
909 up(&oi->ip_io_sem);
910
911 /*
912 * down_trylock() returns 0, down_write_trylock() returns 1
913 * kernel 1, world 0
914 */
915 mlog_bug_on_msg(!down_write_trylock(&oi->ip_alloc_sem),
916 "Clear inode of %"MLFu64", alloc_sem is locked\n",
917 oi->ip_blkno);
918 up_write(&oi->ip_alloc_sem);
919
920 mlog_bug_on_msg(oi->ip_open_count,
921 "Clear inode of %"MLFu64" has open count %d\n",
922 oi->ip_blkno, oi->ip_open_count);
923 mlog_bug_on_msg(!list_empty(&oi->ip_handle_list),
924 "Clear inode of %"MLFu64" has non empty handle list\n",
925 oi->ip_blkno);
926 mlog_bug_on_msg(oi->ip_handle,
927 "Clear inode of %"MLFu64" has non empty handle pointer\n",
928 oi->ip_blkno);
929
930 /* Clear all other flags. */
931 oi->ip_flags = OCFS2_INODE_CACHE_INLINE;
932 oi->ip_created_trans = 0;
933 oi->ip_last_trans = 0;
934 oi->ip_dir_start_lookup = 0;
935 oi->ip_blkno = 0ULL;
936
937bail:
938 mlog_exit_void();
939}
940
941/* Called under inode_lock, with no more references on the
942 * struct inode, so it's safe here to check the flags field
943 * and to manipulate i_nlink without any other locks. */
944void ocfs2_drop_inode(struct inode *inode)
945{
946 struct ocfs2_inode_info *oi = OCFS2_I(inode);
947
948 mlog_entry_void();
949
950 mlog(0, "Drop inode %"MLFu64", nlink = %u, ip_flags = 0x%x\n",
951 oi->ip_blkno, inode->i_nlink, oi->ip_flags);
952
953 /* Testing ip_orphaned_slot here wouldn't work because we may
954 * not have gotten a delete_inode vote from any other nodes
955 * yet. */
956 if (oi->ip_flags & OCFS2_INODE_MAYBE_ORPHANED) {
957 mlog(0, "Inode was orphaned on another node, clearing nlink.\n");
958 inode->i_nlink = 0;
959 }
960
961 generic_drop_inode(inode);
962
963 mlog_exit_void();
964}
965
966/*
967 * TODO: this should probably be merged into ocfs2_get_block
968 *
969 * However, you now need to pay attention to the cont_prepare_write()
970 * stuff in ocfs2_get_block (that is, ocfs2_get_block pretty much
971 * expects never to extend).
972 */
973struct buffer_head *ocfs2_bread(struct inode *inode,
974 int block, int *err, int reada)
975{
976 struct buffer_head *bh = NULL;
977 int tmperr;
978 u64 p_blkno;
979 int readflags = OCFS2_BH_CACHED;
980
981#if 0
982 /* only turn this on if we know we can deal with read_block
983 * returning nothing */
984 if (reada)
985 readflags |= OCFS2_BH_READAHEAD;
986#endif
987
988 if (((u64)block << inode->i_sb->s_blocksize_bits) >=
989 i_size_read(inode)) {
990 BUG_ON(!reada);
991 return NULL;
992 }
993
994 tmperr = ocfs2_extent_map_get_blocks(inode, block, 1,
995 &p_blkno, NULL);
996 if (tmperr < 0) {
997 mlog_errno(tmperr);
998 goto fail;
999 }
1000
1001 tmperr = ocfs2_read_block(OCFS2_SB(inode->i_sb), p_blkno, &bh,
1002 readflags, inode);
1003 if (tmperr < 0)
1004 goto fail;
1005
1006 tmperr = 0;
1007
1008 *err = 0;
1009 return bh;
1010
1011fail:
1012 if (bh) {
1013 brelse(bh);
1014 bh = NULL;
1015 }
1016 *err = -EIO;
1017 return NULL;
1018}
1019
1020/*
1021 * This is called from our getattr.
1022 */
1023int ocfs2_inode_revalidate(struct dentry *dentry)
1024{
1025 struct inode *inode = dentry->d_inode;
1026 int status = 0;
1027
1028 mlog_entry("(inode = 0x%p, ino = %"MLFu64")\n", inode,
1029 inode ? OCFS2_I(inode)->ip_blkno : 0ULL);
1030
1031 if (!inode) {
1032 mlog(0, "eep, no inode!\n");
1033 status = -ENOENT;
1034 goto bail;
1035 }
1036
1037 spin_lock(&OCFS2_I(inode)->ip_lock);
1038 if (OCFS2_I(inode)->ip_flags & OCFS2_INODE_DELETED) {
1039 spin_unlock(&OCFS2_I(inode)->ip_lock);
1040 mlog(0, "inode deleted!\n");
1041 status = -ENOENT;
1042 goto bail;
1043 }
1044 spin_unlock(&OCFS2_I(inode)->ip_lock);
1045
1046 /* Let ocfs2_meta_lock do the work of updating our struct
1047 * inode for us. */
1048 status = ocfs2_meta_lock(inode, NULL, NULL, 0);
1049 if (status < 0) {
1050 if (status != -ENOENT)
1051 mlog_errno(status);
1052 goto bail;
1053 }
1054 ocfs2_meta_unlock(inode, 0);
1055bail:
1056 mlog_exit(status);
1057
1058 return status;
1059}
1060
1061/*
1062 * Updates a disk inode from a
1063 * struct inode.
1064 * Only takes ip_lock.
1065 */
1066int ocfs2_mark_inode_dirty(struct ocfs2_journal_handle *handle,
1067 struct inode *inode,
1068 struct buffer_head *bh)
1069{
1070 int status;
1071 struct ocfs2_dinode *fe = (struct ocfs2_dinode *) bh->b_data;
1072
1073 mlog_entry("(inode %"MLFu64")\n", OCFS2_I(inode)->ip_blkno);
1074
1075 status = ocfs2_journal_access(handle, inode, bh,
1076 OCFS2_JOURNAL_ACCESS_WRITE);
1077 if (status < 0) {
1078 mlog_errno(status);
1079 goto leave;
1080 }
1081
1082 spin_lock(&OCFS2_I(inode)->ip_lock);
1083 fe->i_clusters = cpu_to_le32(OCFS2_I(inode)->ip_clusters);
1084 spin_unlock(&OCFS2_I(inode)->ip_lock);
1085
1086 fe->i_size = cpu_to_le64(i_size_read(inode));
1087 fe->i_links_count = cpu_to_le16(inode->i_nlink);
1088 fe->i_uid = cpu_to_le32(inode->i_uid);
1089 fe->i_gid = cpu_to_le32(inode->i_gid);
1090 fe->i_mode = cpu_to_le16(inode->i_mode);
1091 fe->i_atime = cpu_to_le64(inode->i_atime.tv_sec);
1092 fe->i_atime_nsec = cpu_to_le32(inode->i_atime.tv_nsec);
1093 fe->i_ctime = cpu_to_le64(inode->i_ctime.tv_sec);
1094 fe->i_ctime_nsec = cpu_to_le32(inode->i_ctime.tv_nsec);
1095 fe->i_mtime = cpu_to_le64(inode->i_mtime.tv_sec);
1096 fe->i_mtime_nsec = cpu_to_le32(inode->i_mtime.tv_nsec);
1097
1098 status = ocfs2_journal_dirty(handle, bh);
1099 if (status < 0)
1100 mlog_errno(status);
1101
1102 status = 0;
1103leave:
1104
1105 mlog_exit(status);
1106 return status;
1107}
1108
1109/*
1110 *
1111 * Updates a struct inode from a disk inode.
1112 * does no i/o, only takes ip_lock.
1113 */
1114void ocfs2_refresh_inode(struct inode *inode,
1115 struct ocfs2_dinode *fe)
1116{
1117 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
1118
1119 spin_lock(&OCFS2_I(inode)->ip_lock);
1120
1121 OCFS2_I(inode)->ip_clusters = le32_to_cpu(fe->i_clusters);
1122 i_size_write(inode, le64_to_cpu(fe->i_size));
1123 inode->i_nlink = le16_to_cpu(fe->i_links_count);
1124 inode->i_uid = le32_to_cpu(fe->i_uid);
1125 inode->i_gid = le32_to_cpu(fe->i_gid);
1126 inode->i_mode = le16_to_cpu(fe->i_mode);
1127 inode->i_blksize = (u32) osb->s_clustersize;
1128 if (S_ISLNK(inode->i_mode) && le32_to_cpu(fe->i_clusters) == 0)
1129 inode->i_blocks = 0;
1130 else
1131 inode->i_blocks = ocfs2_align_bytes_to_sectors(i_size_read(inode));
1132 inode->i_atime.tv_sec = le64_to_cpu(fe->i_atime);
1133 inode->i_atime.tv_nsec = le32_to_cpu(fe->i_atime_nsec);
1134 inode->i_mtime.tv_sec = le64_to_cpu(fe->i_mtime);
1135 inode->i_mtime.tv_nsec = le32_to_cpu(fe->i_mtime_nsec);
1136 inode->i_ctime.tv_sec = le64_to_cpu(fe->i_ctime);
1137 inode->i_ctime.tv_nsec = le32_to_cpu(fe->i_ctime_nsec);
1138
1139 spin_unlock(&OCFS2_I(inode)->ip_lock);
1140}
diff --git a/fs/ocfs2/inode.h b/fs/ocfs2/inode.h
new file mode 100644
index 000000000000..9b0177433653
--- /dev/null
+++ b/fs/ocfs2/inode.h
@@ -0,0 +1,145 @@
1/* -*- mode: c; c-basic-offset: 8; -*-
2 * vim: noexpandtab sw=8 ts=8 sts=0:
3 *
4 * inode.h
5 *
6 * Function prototypes
7 *
8 * Copyright (C) 2002, 2004 Oracle. All rights reserved.
9 *
10 * This program is free software; you can redistribute it and/or
11 * modify it under the terms of the GNU General Public
12 * License as published by the Free Software Foundation; either
13 * version 2 of the License, or (at your option) any later version.
14 *
15 * This program is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
18 * General Public License for more details.
19 *
20 * You should have received a copy of the GNU General Public
21 * License along with this program; if not, write to the
22 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
23 * Boston, MA 02111-1307, USA.
24 */
25
26#ifndef OCFS2_INODE_H
27#define OCFS2_INODE_H
28
29/* OCFS2 Inode Private Data */
30struct ocfs2_inode_info
31{
32 u64 ip_blkno;
33
34 struct ocfs2_lock_res ip_rw_lockres;
35 struct ocfs2_lock_res ip_meta_lockres;
36 struct ocfs2_lock_res ip_data_lockres;
37
38 /* protects allocation changes on this inode. */
39 struct rw_semaphore ip_alloc_sem;
40
41 /* These fields are protected by ip_lock */
42 spinlock_t ip_lock;
43 u32 ip_open_count;
44 u32 ip_clusters;
45 struct ocfs2_extent_map ip_map;
46 struct list_head ip_io_markers;
47 int ip_orphaned_slot;
48
49 struct semaphore ip_io_sem;
50
51 /* Used by the journalling code to attach an inode to a
52 * handle. These are protected by ip_io_sem in order to lock
53 * out other I/O to the inode until we either commit or
54 * abort. */
55 struct list_head ip_handle_list;
56 struct ocfs2_journal_handle *ip_handle;
57
58 u32 ip_flags; /* see below */
59
60 /* protected by recovery_lock. */
61 struct inode *ip_next_orphan;
62
63 u32 ip_dir_start_lookup;
64
65 /* next two are protected by trans_inc_lock */
66 /* which transaction were we created on? Zero if none. */
67 unsigned long ip_created_trans;
68 /* last transaction we were a part of. */
69 unsigned long ip_last_trans;
70
71 struct ocfs2_caching_info ip_metadata_cache;
72
73 struct inode vfs_inode;
74};
75
76/*
77 * Flags for the ip_flags field
78 */
79/* System file inodes */
80#define OCFS2_INODE_SYSTEM_FILE 0x00000001
81#define OCFS2_INODE_JOURNAL 0x00000002
82#define OCFS2_INODE_BITMAP 0x00000004
83/* This inode has been wiped from disk */
84#define OCFS2_INODE_DELETED 0x00000008
85/* Another node is deleting, so our delete is a nop */
86#define OCFS2_INODE_SKIP_DELETE 0x00000010
87/* Has the inode been orphaned on another node?
88 *
89 * This hints to ocfs2_drop_inode that it should clear i_nlink before
90 * continuing.
91 *
92 * We *only* set this on unlink vote from another node. If the inode
93 * was locally orphaned, then we're sure of the state and don't need
94 * to twiddle i_nlink later - it's either zero or not depending on
95 * whether our unlink succeeded. Otherwise we got this from a node
96 * whose intention was to orphan the inode, however he may have
97 * crashed, failed etc, so we let ocfs2_drop_inode zero the value and
98 * rely on ocfs2_delete_inode to sort things out under the proper
99 * cluster locks.
100 */
101#define OCFS2_INODE_MAYBE_ORPHANED 0x00000020
102/* Does someone have the file open O_DIRECT */
103#define OCFS2_INODE_OPEN_DIRECT 0x00000040
104/* Indicates that the metadata cache should be used as an array. */
105#define OCFS2_INODE_CACHE_INLINE 0x00000080
106
107static inline struct ocfs2_inode_info *OCFS2_I(struct inode *inode)
108{
109 return container_of(inode, struct ocfs2_inode_info, vfs_inode);
110}
111
112#define INODE_JOURNAL(i) (OCFS2_I(i)->ip_flags & OCFS2_INODE_JOURNAL)
113#define SET_INODE_JOURNAL(i) (OCFS2_I(i)->ip_flags |= OCFS2_INODE_JOURNAL)
114
115extern kmem_cache_t *ocfs2_inode_cache;
116
117extern struct address_space_operations ocfs2_aops;
118
119struct buffer_head *ocfs2_bread(struct inode *inode, int block,
120 int *err, int reada);
121void ocfs2_clear_inode(struct inode *inode);
122void ocfs2_delete_inode(struct inode *inode);
123void ocfs2_drop_inode(struct inode *inode);
124struct inode *ocfs2_iget(struct ocfs2_super *osb, u64 feoff);
125struct inode *ocfs2_ilookup_for_vote(struct ocfs2_super *osb,
126 u64 blkno,
127 int delete_vote);
128int ocfs2_inode_init_private(struct inode *inode);
129int ocfs2_inode_revalidate(struct dentry *dentry);
130int ocfs2_populate_inode(struct inode *inode, struct ocfs2_dinode *fe,
131 int create_ino);
132void ocfs2_read_inode(struct inode *inode);
133void ocfs2_read_inode2(struct inode *inode, void *opaque);
134ssize_t ocfs2_rw_direct(int rw, struct file *filp, char *buf,
135 size_t size, loff_t *offp);
136void ocfs2_sync_blockdev(struct super_block *sb);
137void ocfs2_refresh_inode(struct inode *inode,
138 struct ocfs2_dinode *fe);
139int ocfs2_mark_inode_dirty(struct ocfs2_journal_handle *handle,
140 struct inode *inode,
141 struct buffer_head *bh);
142int ocfs2_aio_read(struct file *file, struct kiocb *req, struct iocb *iocb);
143int ocfs2_aio_write(struct file *file, struct kiocb *req, struct iocb *iocb);
144
145#endif /* OCFS2_INODE_H */
diff --git a/fs/ocfs2/journal.c b/fs/ocfs2/journal.c
new file mode 100644
index 000000000000..04428042e5e5
--- /dev/null
+++ b/fs/ocfs2/journal.c
@@ -0,0 +1,1652 @@
1/* -*- mode: c; c-basic-offset: 8; -*-
2 * vim: noexpandtab sw=8 ts=8 sts=0:
3 *
4 * journal.c
5 *
6 * Defines functions of journalling api
7 *
8 * Copyright (C) 2003, 2004 Oracle. All rights reserved.
9 *
10 * This program is free software; you can redistribute it and/or
11 * modify it under the terms of the GNU General Public
12 * License as published by the Free Software Foundation; either
13 * version 2 of the License, or (at your option) any later version.
14 *
15 * This program is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
18 * General Public License for more details.
19 *
20 * You should have received a copy of the GNU General Public
21 * License along with this program; if not, write to the
22 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
23 * Boston, MA 02111-1307, USA.
24 */
25
26#include <linux/fs.h>
27#include <linux/types.h>
28#include <linux/slab.h>
29#include <linux/highmem.h>
30#include <linux/kthread.h>
31
32#define MLOG_MASK_PREFIX ML_JOURNAL
33#include <cluster/masklog.h>
34
35#include "ocfs2.h"
36
37#include "alloc.h"
38#include "dlmglue.h"
39#include "extent_map.h"
40#include "heartbeat.h"
41#include "inode.h"
42#include "journal.h"
43#include "localalloc.h"
44#include "namei.h"
45#include "slot_map.h"
46#include "super.h"
47#include "vote.h"
48#include "sysfile.h"
49
50#include "buffer_head_io.h"
51
52spinlock_t trans_inc_lock = SPIN_LOCK_UNLOCKED;
53
54static int ocfs2_force_read_journal(struct inode *inode);
55static int ocfs2_recover_node(struct ocfs2_super *osb,
56 int node_num);
57static int __ocfs2_recovery_thread(void *arg);
58static int ocfs2_commit_cache(struct ocfs2_super *osb);
59static int ocfs2_wait_on_mount(struct ocfs2_super *osb);
60static void ocfs2_handle_cleanup_locks(struct ocfs2_journal *journal,
61 struct ocfs2_journal_handle *handle);
62static void ocfs2_commit_unstarted_handle(struct ocfs2_journal_handle *handle);
63static int ocfs2_journal_toggle_dirty(struct ocfs2_super *osb,
64 int dirty);
65static int ocfs2_trylock_journal(struct ocfs2_super *osb,
66 int slot_num);
67static int ocfs2_recover_orphans(struct ocfs2_super *osb,
68 int slot);
69static int ocfs2_commit_thread(void *arg);
70
71static int ocfs2_commit_cache(struct ocfs2_super *osb)
72{
73 int status = 0;
74 unsigned int flushed;
75 unsigned long old_id;
76 struct ocfs2_journal *journal = NULL;
77
78 mlog_entry_void();
79
80 journal = osb->journal;
81
82 /* Flush all pending commits and checkpoint the journal. */
83 down_write(&journal->j_trans_barrier);
84
85 if (atomic_read(&journal->j_num_trans) == 0) {
86 up_write(&journal->j_trans_barrier);
87 mlog(0, "No transactions for me to flush!\n");
88 goto finally;
89 }
90
91 journal_lock_updates(journal->j_journal);
92 status = journal_flush(journal->j_journal);
93 journal_unlock_updates(journal->j_journal);
94 if (status < 0) {
95 up_write(&journal->j_trans_barrier);
96 mlog_errno(status);
97 goto finally;
98 }
99
100 old_id = ocfs2_inc_trans_id(journal);
101
102 flushed = atomic_read(&journal->j_num_trans);
103 atomic_set(&journal->j_num_trans, 0);
104 up_write(&journal->j_trans_barrier);
105
106 mlog(0, "commit_thread: flushed transaction %lu (%u handles)\n",
107 journal->j_trans_id, flushed);
108
109 ocfs2_kick_vote_thread(osb);
110 wake_up(&journal->j_checkpointed);
111finally:
112 mlog_exit(status);
113 return status;
114}
115
116struct ocfs2_journal_handle *ocfs2_alloc_handle(struct ocfs2_super *osb)
117{
118 struct ocfs2_journal_handle *retval = NULL;
119
120 retval = kcalloc(1, sizeof(*retval), GFP_KERNEL);
121 if (!retval) {
122 mlog(ML_ERROR, "Failed to allocate memory for journal "
123 "handle!\n");
124 return NULL;
125 }
126
127 retval->max_buffs = 0;
128 retval->num_locks = 0;
129 retval->k_handle = NULL;
130
131 INIT_LIST_HEAD(&retval->locks);
132 INIT_LIST_HEAD(&retval->inode_list);
133 retval->journal = osb->journal;
134
135 return retval;
136}
137
138/* pass it NULL and it will allocate a new handle object for you. If
139 * you pass it a handle however, it may still return error, in which
140 * case it has free'd the passed handle for you. */
141struct ocfs2_journal_handle *ocfs2_start_trans(struct ocfs2_super *osb,
142 struct ocfs2_journal_handle *handle,
143 int max_buffs)
144{
145 int ret;
146 journal_t *journal = osb->journal->j_journal;
147
148 mlog_entry("(max_buffs = %d)\n", max_buffs);
149
150 if (!osb || !osb->journal->j_journal)
151 BUG();
152
153 if (ocfs2_is_hard_readonly(osb)) {
154 ret = -EROFS;
155 goto done_free;
156 }
157
158 BUG_ON(osb->journal->j_state == OCFS2_JOURNAL_FREE);
159 BUG_ON(max_buffs <= 0);
160
161 /* JBD might support this, but our journalling code doesn't yet. */
162 if (journal_current_handle()) {
163 mlog(ML_ERROR, "Recursive transaction attempted!\n");
164 BUG();
165 }
166
167 if (!handle)
168 handle = ocfs2_alloc_handle(osb);
169 if (!handle) {
170 ret = -ENOMEM;
171 mlog(ML_ERROR, "Failed to allocate memory for journal "
172 "handle!\n");
173 goto done_free;
174 }
175
176 handle->max_buffs = max_buffs;
177
178 down_read(&osb->journal->j_trans_barrier);
179
180 /* actually start the transaction now */
181 handle->k_handle = journal_start(journal, max_buffs);
182 if (IS_ERR(handle->k_handle)) {
183 up_read(&osb->journal->j_trans_barrier);
184
185 ret = PTR_ERR(handle->k_handle);
186 handle->k_handle = NULL;
187 mlog_errno(ret);
188
189 if (is_journal_aborted(journal)) {
190 ocfs2_abort(osb->sb, "Detected aborted journal");
191 ret = -EROFS;
192 }
193 goto done_free;
194 }
195
196 atomic_inc(&(osb->journal->j_num_trans));
197 handle->flags |= OCFS2_HANDLE_STARTED;
198
199 mlog_exit_ptr(handle);
200 return handle;
201
202done_free:
203 if (handle)
204 ocfs2_commit_unstarted_handle(handle); /* will kfree handle */
205
206 mlog_exit(ret);
207 return ERR_PTR(ret);
208}
209
210void ocfs2_handle_add_inode(struct ocfs2_journal_handle *handle,
211 struct inode *inode)
212{
213 BUG_ON(!handle);
214 BUG_ON(!inode);
215
216 atomic_inc(&inode->i_count);
217
218 /* we're obviously changing it... */
219 down(&inode->i_sem);
220
221 /* sanity check */
222 BUG_ON(OCFS2_I(inode)->ip_handle);
223 BUG_ON(!list_empty(&OCFS2_I(inode)->ip_handle_list));
224
225 OCFS2_I(inode)->ip_handle = handle;
226 list_del(&(OCFS2_I(inode)->ip_handle_list));
227 list_add_tail(&(OCFS2_I(inode)->ip_handle_list), &(handle->inode_list));
228}
229
230static void ocfs2_handle_unlock_inodes(struct ocfs2_journal_handle *handle)
231{
232 struct list_head *p, *n;
233 struct inode *inode;
234 struct ocfs2_inode_info *oi;
235
236 list_for_each_safe(p, n, &handle->inode_list) {
237 oi = list_entry(p, struct ocfs2_inode_info,
238 ip_handle_list);
239 inode = &oi->vfs_inode;
240
241 OCFS2_I(inode)->ip_handle = NULL;
242 list_del_init(&OCFS2_I(inode)->ip_handle_list);
243
244 up(&inode->i_sem);
245 iput(inode);
246 }
247}
248
249/* This is trivial so we do it out of the main commit
250 * paths. Beware, it can be called from start_trans too! */
251static void ocfs2_commit_unstarted_handle(struct ocfs2_journal_handle *handle)
252{
253 mlog_entry_void();
254
255 BUG_ON(handle->flags & OCFS2_HANDLE_STARTED);
256
257 ocfs2_handle_unlock_inodes(handle);
258 /* You are allowed to add journal locks before the transaction
259 * has started. */
260 ocfs2_handle_cleanup_locks(handle->journal, handle);
261
262 kfree(handle);
263
264 mlog_exit_void();
265}
266
267void ocfs2_commit_trans(struct ocfs2_journal_handle *handle)
268{
269 handle_t *jbd_handle;
270 int retval;
271 struct ocfs2_journal *journal = handle->journal;
272
273 mlog_entry_void();
274
275 BUG_ON(!handle);
276
277 if (!(handle->flags & OCFS2_HANDLE_STARTED)) {
278 ocfs2_commit_unstarted_handle(handle);
279 mlog_exit_void();
280 return;
281 }
282
283 /* release inode semaphores we took during this transaction */
284 ocfs2_handle_unlock_inodes(handle);
285
286 /* ocfs2_extend_trans may have had to call journal_restart
287 * which will always commit the transaction, but may return
288 * error for any number of reasons. If this is the case, we
289 * clear k_handle as it's not valid any more. */
290 if (handle->k_handle) {
291 jbd_handle = handle->k_handle;
292
293 if (handle->flags & OCFS2_HANDLE_SYNC)
294 jbd_handle->h_sync = 1;
295 else
296 jbd_handle->h_sync = 0;
297
298 /* actually stop the transaction. if we've set h_sync,
299 * it'll have been committed when we return */
300 retval = journal_stop(jbd_handle);
301 if (retval < 0) {
302 mlog_errno(retval);
303 mlog(ML_ERROR, "Could not commit transaction\n");
304 BUG();
305 }
306
307 handle->k_handle = NULL; /* it's been free'd in journal_stop */
308 }
309
310 ocfs2_handle_cleanup_locks(journal, handle);
311
312 up_read(&journal->j_trans_barrier);
313
314 kfree(handle);
315 mlog_exit_void();
316}
317
318/*
319 * 'nblocks' is what you want to add to the current
320 * transaction. extend_trans will either extend the current handle by
321 * nblocks, or commit it and start a new one with nblocks credits.
322 *
323 * WARNING: This will not release any semaphores or disk locks taken
324 * during the transaction, so make sure they were taken *before*
325 * start_trans or we'll have ordering deadlocks.
326 *
327 * WARNING2: Note that we do *not* drop j_trans_barrier here. This is
328 * good because transaction ids haven't yet been recorded on the
329 * cluster locks associated with this handle.
330 */
331int ocfs2_extend_trans(struct ocfs2_journal_handle *handle,
332 int nblocks)
333{
334 int status;
335
336 BUG_ON(!handle);
337 BUG_ON(!(handle->flags & OCFS2_HANDLE_STARTED));
338 BUG_ON(!nblocks);
339
340 mlog_entry_void();
341
342 mlog(0, "Trying to extend transaction by %d blocks\n", nblocks);
343
344 status = journal_extend(handle->k_handle, nblocks);
345 if (status < 0) {
346 mlog_errno(status);
347 goto bail;
348 }
349
350 if (status > 0) {
351 mlog(0, "journal_extend failed, trying journal_restart\n");
352 status = journal_restart(handle->k_handle, nblocks);
353 if (status < 0) {
354 handle->k_handle = NULL;
355 mlog_errno(status);
356 goto bail;
357 }
358 handle->max_buffs = nblocks;
359 } else
360 handle->max_buffs += nblocks;
361
362 status = 0;
363bail:
364
365 mlog_exit(status);
366 return status;
367}
368
369int ocfs2_journal_access(struct ocfs2_journal_handle *handle,
370 struct inode *inode,
371 struct buffer_head *bh,
372 int type)
373{
374 int status;
375
376 BUG_ON(!inode);
377 BUG_ON(!handle);
378 BUG_ON(!bh);
379 BUG_ON(!(handle->flags & OCFS2_HANDLE_STARTED));
380
381 mlog_entry("bh->b_blocknr=%llu, type=%d (\"%s\"), bh->b_size = %hu\n",
382 (unsigned long long)bh->b_blocknr, type,
383 (type == OCFS2_JOURNAL_ACCESS_CREATE) ?
384 "OCFS2_JOURNAL_ACCESS_CREATE" :
385 "OCFS2_JOURNAL_ACCESS_WRITE",
386 bh->b_size);
387
388 /* we can safely remove this assertion after testing. */
389 if (!buffer_uptodate(bh)) {
390 mlog(ML_ERROR, "giving me a buffer that's not uptodate!\n");
391 mlog(ML_ERROR, "b_blocknr=%llu\n",
392 (unsigned long long)bh->b_blocknr);
393 BUG();
394 }
395
396 /* Set the current transaction information on the inode so
397 * that the locking code knows whether it can drop it's locks
398 * on this inode or not. We're protected from the commit
399 * thread updating the current transaction id until
400 * ocfs2_commit_trans() because ocfs2_start_trans() took
401 * j_trans_barrier for us. */
402 ocfs2_set_inode_lock_trans(OCFS2_SB(inode->i_sb)->journal, inode);
403
404 down(&OCFS2_I(inode)->ip_io_sem);
405 switch (type) {
406 case OCFS2_JOURNAL_ACCESS_CREATE:
407 case OCFS2_JOURNAL_ACCESS_WRITE:
408 status = journal_get_write_access(handle->k_handle, bh);
409 break;
410
411 case OCFS2_JOURNAL_ACCESS_UNDO:
412 status = journal_get_undo_access(handle->k_handle, bh);
413 break;
414
415 default:
416 status = -EINVAL;
417 mlog(ML_ERROR, "Uknown access type!\n");
418 }
419 up(&OCFS2_I(inode)->ip_io_sem);
420
421 if (status < 0)
422 mlog(ML_ERROR, "Error %d getting %d access to buffer!\n",
423 status, type);
424
425 mlog_exit(status);
426 return status;
427}
428
429int ocfs2_journal_dirty(struct ocfs2_journal_handle *handle,
430 struct buffer_head *bh)
431{
432 int status;
433
434 BUG_ON(!(handle->flags & OCFS2_HANDLE_STARTED));
435
436 mlog_entry("(bh->b_blocknr=%llu)\n",
437 (unsigned long long)bh->b_blocknr);
438
439 status = journal_dirty_metadata(handle->k_handle, bh);
440 if (status < 0)
441 mlog(ML_ERROR, "Could not dirty metadata buffer. "
442 "(bh->b_blocknr=%llu)\n",
443 (unsigned long long)bh->b_blocknr);
444
445 mlog_exit(status);
446 return status;
447}
448
449int ocfs2_journal_dirty_data(handle_t *handle,
450 struct buffer_head *bh)
451{
452 int err = journal_dirty_data(handle, bh);
453 if (err)
454 mlog_errno(err);
455 /* TODO: When we can handle it, abort the handle and go RO on
456 * error here. */
457
458 return err;
459}
460
461/* We always assume you're adding a metadata lock at level 'ex' */
462int ocfs2_handle_add_lock(struct ocfs2_journal_handle *handle,
463 struct inode *inode)
464{
465 int status;
466 struct ocfs2_journal_lock *lock;
467
468 BUG_ON(!inode);
469
470 lock = kmem_cache_alloc(ocfs2_lock_cache, GFP_NOFS);
471 if (!lock) {
472 status = -ENOMEM;
473 mlog_errno(-ENOMEM);
474 goto bail;
475 }
476
477 if (!igrab(inode))
478 BUG();
479 lock->jl_inode = inode;
480
481 list_add_tail(&(lock->jl_lock_list), &(handle->locks));
482 handle->num_locks++;
483
484 status = 0;
485bail:
486 mlog_exit(status);
487 return status;
488}
489
490static void ocfs2_handle_cleanup_locks(struct ocfs2_journal *journal,
491 struct ocfs2_journal_handle *handle)
492{
493 struct list_head *p, *n;
494 struct ocfs2_journal_lock *lock;
495 struct inode *inode;
496
497 list_for_each_safe(p, n, &(handle->locks)) {
498 lock = list_entry(p, struct ocfs2_journal_lock,
499 jl_lock_list);
500 list_del(&lock->jl_lock_list);
501 handle->num_locks--;
502
503 inode = lock->jl_inode;
504 ocfs2_meta_unlock(inode, 1);
505 if (atomic_read(&inode->i_count) == 1)
506 mlog(ML_ERROR,
507 "Inode %"MLFu64", I'm doing a last iput for!",
508 OCFS2_I(inode)->ip_blkno);
509 iput(inode);
510 kmem_cache_free(ocfs2_lock_cache, lock);
511 }
512}
513
514#define OCFS2_DEFAULT_COMMIT_INTERVAL (HZ * 5)
515
516void ocfs2_set_journal_params(struct ocfs2_super *osb)
517{
518 journal_t *journal = osb->journal->j_journal;
519
520 spin_lock(&journal->j_state_lock);
521 journal->j_commit_interval = OCFS2_DEFAULT_COMMIT_INTERVAL;
522 if (osb->s_mount_opt & OCFS2_MOUNT_BARRIER)
523 journal->j_flags |= JFS_BARRIER;
524 else
525 journal->j_flags &= ~JFS_BARRIER;
526 spin_unlock(&journal->j_state_lock);
527}
528
529int ocfs2_journal_init(struct ocfs2_journal *journal, int *dirty)
530{
531 int status = -1;
532 struct inode *inode = NULL; /* the journal inode */
533 journal_t *j_journal = NULL;
534 struct ocfs2_dinode *di = NULL;
535 struct buffer_head *bh = NULL;
536 struct ocfs2_super *osb;
537 int meta_lock = 0;
538
539 mlog_entry_void();
540
541 BUG_ON(!journal);
542
543 osb = journal->j_osb;
544
545 /* already have the inode for our journal */
546 inode = ocfs2_get_system_file_inode(osb, JOURNAL_SYSTEM_INODE,
547 osb->slot_num);
548 if (inode == NULL) {
549 status = -EACCES;
550 mlog_errno(status);
551 goto done;
552 }
553 if (is_bad_inode(inode)) {
554 mlog(ML_ERROR, "access error (bad inode)\n");
555 iput(inode);
556 inode = NULL;
557 status = -EACCES;
558 goto done;
559 }
560
561 SET_INODE_JOURNAL(inode);
562 OCFS2_I(inode)->ip_open_count++;
563
564 status = ocfs2_meta_lock(inode, NULL, &bh, 1);
565 if (status < 0) {
566 if (status != -ERESTARTSYS)
567 mlog(ML_ERROR, "Could not get lock on journal!\n");
568 goto done;
569 }
570
571 meta_lock = 1;
572 di = (struct ocfs2_dinode *)bh->b_data;
573
574 if (inode->i_size < OCFS2_MIN_JOURNAL_SIZE) {
575 mlog(ML_ERROR, "Journal file size (%lld) is too small!\n",
576 inode->i_size);
577 status = -EINVAL;
578 goto done;
579 }
580
581 mlog(0, "inode->i_size = %lld\n", inode->i_size);
582 mlog(0, "inode->i_blocks = %lu\n", inode->i_blocks);
583 mlog(0, "inode->ip_clusters = %u\n", OCFS2_I(inode)->ip_clusters);
584
585 /* call the kernels journal init function now */
586 j_journal = journal_init_inode(inode);
587 if (j_journal == NULL) {
588 mlog(ML_ERROR, "Linux journal layer error\n");
589 status = -EINVAL;
590 goto done;
591 }
592
593 mlog(0, "Returned from journal_init_inode\n");
594 mlog(0, "j_journal->j_maxlen = %u\n", j_journal->j_maxlen);
595
596 *dirty = (le32_to_cpu(di->id1.journal1.ij_flags) &
597 OCFS2_JOURNAL_DIRTY_FL);
598
599 journal->j_journal = j_journal;
600 journal->j_inode = inode;
601 journal->j_bh = bh;
602
603 ocfs2_set_journal_params(osb);
604
605 journal->j_state = OCFS2_JOURNAL_LOADED;
606
607 status = 0;
608done:
609 if (status < 0) {
610 if (meta_lock)
611 ocfs2_meta_unlock(inode, 1);
612 if (bh != NULL)
613 brelse(bh);
614 if (inode) {
615 OCFS2_I(inode)->ip_open_count--;
616 iput(inode);
617 }
618 }
619
620 mlog_exit(status);
621 return status;
622}
623
624static int ocfs2_journal_toggle_dirty(struct ocfs2_super *osb,
625 int dirty)
626{
627 int status;
628 unsigned int flags;
629 struct ocfs2_journal *journal = osb->journal;
630 struct buffer_head *bh = journal->j_bh;
631 struct ocfs2_dinode *fe;
632
633 mlog_entry_void();
634
635 fe = (struct ocfs2_dinode *)bh->b_data;
636 if (!OCFS2_IS_VALID_DINODE(fe)) {
637 /* This is called from startup/shutdown which will
638 * handle the errors in a specific manner, so no need
639 * to call ocfs2_error() here. */
640 mlog(ML_ERROR, "Journal dinode %"MLFu64" has invalid "
641 "signature: %.*s", fe->i_blkno, 7, fe->i_signature);
642 status = -EIO;
643 goto out;
644 }
645
646 flags = le32_to_cpu(fe->id1.journal1.ij_flags);
647 if (dirty)
648 flags |= OCFS2_JOURNAL_DIRTY_FL;
649 else
650 flags &= ~OCFS2_JOURNAL_DIRTY_FL;
651 fe->id1.journal1.ij_flags = cpu_to_le32(flags);
652
653 status = ocfs2_write_block(osb, bh, journal->j_inode);
654 if (status < 0)
655 mlog_errno(status);
656
657out:
658 mlog_exit(status);
659 return status;
660}
661
662/*
663 * If the journal has been kmalloc'd it needs to be freed after this
664 * call.
665 */
666void ocfs2_journal_shutdown(struct ocfs2_super *osb)
667{
668 struct ocfs2_journal *journal = NULL;
669 int status = 0;
670 struct inode *inode = NULL;
671 int num_running_trans = 0;
672
673 mlog_entry_void();
674
675 if (!osb)
676 BUG();
677
678 journal = osb->journal;
679 if (!journal)
680 goto done;
681
682 inode = journal->j_inode;
683
684 if (journal->j_state != OCFS2_JOURNAL_LOADED)
685 goto done;
686
687 /* need to inc inode use count as journal_destroy will iput. */
688 if (!igrab(inode))
689 BUG();
690
691 num_running_trans = atomic_read(&(osb->journal->j_num_trans));
692 if (num_running_trans > 0)
693 mlog(0, "Shutting down journal: must wait on %d "
694 "running transactions!\n",
695 num_running_trans);
696
697 /* Do a commit_cache here. It will flush our journal, *and*
698 * release any locks that are still held.
699 * set the SHUTDOWN flag and release the trans lock.
700 * the commit thread will take the trans lock for us below. */
701 journal->j_state = OCFS2_JOURNAL_IN_SHUTDOWN;
702
703 /* The OCFS2_JOURNAL_IN_SHUTDOWN will signal to commit_cache to not
704 * drop the trans_lock (which we want to hold until we
705 * completely destroy the journal. */
706 if (osb->commit_task) {
707 /* Wait for the commit thread */
708 mlog(0, "Waiting for ocfs2commit to exit....\n");
709 kthread_stop(osb->commit_task);
710 osb->commit_task = NULL;
711 }
712
713 BUG_ON(atomic_read(&(osb->journal->j_num_trans)) != 0);
714
715 status = ocfs2_journal_toggle_dirty(osb, 0);
716 if (status < 0)
717 mlog_errno(status);
718
719 /* Shutdown the kernel journal system */
720 journal_destroy(journal->j_journal);
721
722 OCFS2_I(inode)->ip_open_count--;
723
724 /* unlock our journal */
725 ocfs2_meta_unlock(inode, 1);
726
727 brelse(journal->j_bh);
728 journal->j_bh = NULL;
729
730 journal->j_state = OCFS2_JOURNAL_FREE;
731
732// up_write(&journal->j_trans_barrier);
733done:
734 if (inode)
735 iput(inode);
736 mlog_exit_void();
737}
738
739static void ocfs2_clear_journal_error(struct super_block *sb,
740 journal_t *journal,
741 int slot)
742{
743 int olderr;
744
745 olderr = journal_errno(journal);
746 if (olderr) {
747 mlog(ML_ERROR, "File system error %d recorded in "
748 "journal %u.\n", olderr, slot);
749 mlog(ML_ERROR, "File system on device %s needs checking.\n",
750 sb->s_id);
751
752 journal_ack_err(journal);
753 journal_clear_err(journal);
754 }
755}
756
757int ocfs2_journal_load(struct ocfs2_journal *journal)
758{
759 int status = 0;
760 struct ocfs2_super *osb;
761
762 mlog_entry_void();
763
764 if (!journal)
765 BUG();
766
767 osb = journal->j_osb;
768
769 status = journal_load(journal->j_journal);
770 if (status < 0) {
771 mlog(ML_ERROR, "Failed to load journal!\n");
772 goto done;
773 }
774
775 ocfs2_clear_journal_error(osb->sb, journal->j_journal, osb->slot_num);
776
777 status = ocfs2_journal_toggle_dirty(osb, 1);
778 if (status < 0) {
779 mlog_errno(status);
780 goto done;
781 }
782
783 /* Launch the commit thread */
784 osb->commit_task = kthread_run(ocfs2_commit_thread, osb, "ocfs2cmt-%d",
785 osb->osb_id);
786 if (IS_ERR(osb->commit_task)) {
787 status = PTR_ERR(osb->commit_task);
788 osb->commit_task = NULL;
789 mlog(ML_ERROR, "unable to launch ocfs2commit thread, error=%d",
790 status);
791 goto done;
792 }
793
794done:
795 mlog_exit(status);
796 return status;
797}
798
799
800/* 'full' flag tells us whether we clear out all blocks or if we just
801 * mark the journal clean */
802int ocfs2_journal_wipe(struct ocfs2_journal *journal, int full)
803{
804 int status;
805
806 mlog_entry_void();
807
808 if (!journal)
809 BUG();
810
811 status = journal_wipe(journal->j_journal, full);
812 if (status < 0) {
813 mlog_errno(status);
814 goto bail;
815 }
816
817 status = ocfs2_journal_toggle_dirty(journal->j_osb, 0);
818 if (status < 0)
819 mlog_errno(status);
820
821bail:
822 mlog_exit(status);
823 return status;
824}
825
826/*
827 * JBD Might read a cached version of another nodes journal file. We
828 * don't want this as this file changes often and we get no
829 * notification on those changes. The only way to be sure that we've
830 * got the most up to date version of those blocks then is to force
831 * read them off disk. Just searching through the buffer cache won't
832 * work as there may be pages backing this file which are still marked
833 * up to date. We know things can't change on this file underneath us
834 * as we have the lock by now :)
835 */
836static int ocfs2_force_read_journal(struct inode *inode)
837{
838 int status = 0;
839 int i, p_blocks;
840 u64 v_blkno, p_blkno;
841#define CONCURRENT_JOURNAL_FILL 32
842 struct buffer_head *bhs[CONCURRENT_JOURNAL_FILL];
843
844 mlog_entry_void();
845
846 BUG_ON(inode->i_blocks !=
847 ocfs2_align_bytes_to_sectors(i_size_read(inode)));
848
849 memset(bhs, 0, sizeof(struct buffer_head *) * CONCURRENT_JOURNAL_FILL);
850
851 mlog(0, "Force reading %lu blocks\n",
852 (inode->i_blocks >> (inode->i_sb->s_blocksize_bits - 9)));
853
854 v_blkno = 0;
855 while (v_blkno <
856 (inode->i_blocks >> (inode->i_sb->s_blocksize_bits - 9))) {
857
858 status = ocfs2_extent_map_get_blocks(inode, v_blkno,
859 1, &p_blkno,
860 &p_blocks);
861 if (status < 0) {
862 mlog_errno(status);
863 goto bail;
864 }
865
866 if (p_blocks > CONCURRENT_JOURNAL_FILL)
867 p_blocks = CONCURRENT_JOURNAL_FILL;
868
869 status = ocfs2_read_blocks(OCFS2_SB(inode->i_sb),
870 p_blkno, p_blocks, bhs, 0,
871 inode);
872 if (status < 0) {
873 mlog_errno(status);
874 goto bail;
875 }
876
877 for(i = 0; i < p_blocks; i++) {
878 brelse(bhs[i]);
879 bhs[i] = NULL;
880 }
881
882 v_blkno += p_blocks;
883 }
884
885bail:
886 for(i = 0; i < CONCURRENT_JOURNAL_FILL; i++)
887 if (bhs[i])
888 brelse(bhs[i]);
889 mlog_exit(status);
890 return status;
891}
892
893struct ocfs2_la_recovery_item {
894 struct list_head lri_list;
895 int lri_slot;
896 struct ocfs2_dinode *lri_la_dinode;
897 struct ocfs2_dinode *lri_tl_dinode;
898};
899
900/* Does the second half of the recovery process. By this point, the
901 * node is marked clean and can actually be considered recovered,
902 * hence it's no longer in the recovery map, but there's still some
903 * cleanup we can do which shouldn't happen within the recovery thread
904 * as locking in that context becomes very difficult if we are to take
905 * recovering nodes into account.
906 *
907 * NOTE: This function can and will sleep on recovery of other nodes
908 * during cluster locking, just like any other ocfs2 process.
909 */
910void ocfs2_complete_recovery(void *data)
911{
912 int ret;
913 struct ocfs2_super *osb = data;
914 struct ocfs2_journal *journal = osb->journal;
915 struct ocfs2_dinode *la_dinode, *tl_dinode;
916 struct ocfs2_la_recovery_item *item;
917 struct list_head *p, *n;
918 LIST_HEAD(tmp_la_list);
919
920 mlog_entry_void();
921
922 mlog(0, "completing recovery from keventd\n");
923
924 spin_lock(&journal->j_lock);
925 list_splice_init(&journal->j_la_cleanups, &tmp_la_list);
926 spin_unlock(&journal->j_lock);
927
928 list_for_each_safe(p, n, &tmp_la_list) {
929 item = list_entry(p, struct ocfs2_la_recovery_item, lri_list);
930 list_del_init(&item->lri_list);
931
932 mlog(0, "Complete recovery for slot %d\n", item->lri_slot);
933
934 la_dinode = item->lri_la_dinode;
935 if (la_dinode) {
936 mlog(0, "Clean up local alloc %"MLFu64"\n",
937 la_dinode->i_blkno);
938
939 ret = ocfs2_complete_local_alloc_recovery(osb,
940 la_dinode);
941 if (ret < 0)
942 mlog_errno(ret);
943
944 kfree(la_dinode);
945 }
946
947 tl_dinode = item->lri_tl_dinode;
948 if (tl_dinode) {
949 mlog(0, "Clean up truncate log %"MLFu64"\n",
950 tl_dinode->i_blkno);
951
952 ret = ocfs2_complete_truncate_log_recovery(osb,
953 tl_dinode);
954 if (ret < 0)
955 mlog_errno(ret);
956
957 kfree(tl_dinode);
958 }
959
960 ret = ocfs2_recover_orphans(osb, item->lri_slot);
961 if (ret < 0)
962 mlog_errno(ret);
963
964 kfree(item);
965 }
966
967 mlog(0, "Recovery completion\n");
968 mlog_exit_void();
969}
970
971/* NOTE: This function always eats your references to la_dinode and
972 * tl_dinode, either manually on error, or by passing them to
973 * ocfs2_complete_recovery */
974static void ocfs2_queue_recovery_completion(struct ocfs2_journal *journal,
975 int slot_num,
976 struct ocfs2_dinode *la_dinode,
977 struct ocfs2_dinode *tl_dinode)
978{
979 struct ocfs2_la_recovery_item *item;
980
981 item = kmalloc(sizeof(struct ocfs2_la_recovery_item), GFP_KERNEL);
982 if (!item) {
983 /* Though we wish to avoid it, we are in fact safe in
984 * skipping local alloc cleanup as fsck.ocfs2 is more
985 * than capable of reclaiming unused space. */
986 if (la_dinode)
987 kfree(la_dinode);
988
989 if (tl_dinode)
990 kfree(tl_dinode);
991
992 mlog_errno(-ENOMEM);
993 return;
994 }
995
996 INIT_LIST_HEAD(&item->lri_list);
997 item->lri_la_dinode = la_dinode;
998 item->lri_slot = slot_num;
999 item->lri_tl_dinode = tl_dinode;
1000
1001 spin_lock(&journal->j_lock);
1002 list_add_tail(&item->lri_list, &journal->j_la_cleanups);
1003 queue_work(ocfs2_wq, &journal->j_recovery_work);
1004 spin_unlock(&journal->j_lock);
1005}
1006
1007/* Called by the mount code to queue recovery the last part of
1008 * recovery for it's own slot. */
1009void ocfs2_complete_mount_recovery(struct ocfs2_super *osb)
1010{
1011 struct ocfs2_journal *journal = osb->journal;
1012
1013 if (osb->dirty) {
1014 /* No need to queue up our truncate_log as regular
1015 * cleanup will catch that. */
1016 ocfs2_queue_recovery_completion(journal,
1017 osb->slot_num,
1018 osb->local_alloc_copy,
1019 NULL);
1020 ocfs2_schedule_truncate_log_flush(osb, 0);
1021
1022 osb->local_alloc_copy = NULL;
1023 osb->dirty = 0;
1024 }
1025}
1026
1027static int __ocfs2_recovery_thread(void *arg)
1028{
1029 int status, node_num;
1030 struct ocfs2_super *osb = arg;
1031
1032 mlog_entry_void();
1033
1034 status = ocfs2_wait_on_mount(osb);
1035 if (status < 0) {
1036 goto bail;
1037 }
1038
1039restart:
1040 status = ocfs2_super_lock(osb, 1);
1041 if (status < 0) {
1042 mlog_errno(status);
1043 goto bail;
1044 }
1045
1046 while(!ocfs2_node_map_is_empty(osb, &osb->recovery_map)) {
1047 node_num = ocfs2_node_map_first_set_bit(osb,
1048 &osb->recovery_map);
1049 if (node_num == O2NM_INVALID_NODE_NUM) {
1050 mlog(0, "Out of nodes to recover.\n");
1051 break;
1052 }
1053
1054 status = ocfs2_recover_node(osb, node_num);
1055 if (status < 0) {
1056 mlog(ML_ERROR,
1057 "Error %d recovering node %d on device (%u,%u)!\n",
1058 status, node_num,
1059 MAJOR(osb->sb->s_dev), MINOR(osb->sb->s_dev));
1060 mlog(ML_ERROR, "Volume requires unmount.\n");
1061 continue;
1062 }
1063
1064 ocfs2_recovery_map_clear(osb, node_num);
1065 }
1066 ocfs2_super_unlock(osb, 1);
1067
1068 /* We always run recovery on our own orphan dir - the dead
1069 * node(s) may have voted "no" on an inode delete earlier. A
1070 * revote is therefore required. */
1071 ocfs2_queue_recovery_completion(osb->journal, osb->slot_num, NULL,
1072 NULL);
1073
1074bail:
1075 down(&osb->recovery_lock);
1076 if (!status &&
1077 !ocfs2_node_map_is_empty(osb, &osb->recovery_map)) {
1078 up(&osb->recovery_lock);
1079 goto restart;
1080 }
1081
1082 osb->recovery_thread_task = NULL;
1083 mb(); /* sync with ocfs2_recovery_thread_running */
1084 wake_up(&osb->recovery_event);
1085
1086 up(&osb->recovery_lock);
1087
1088 mlog_exit(status);
1089 /* no one is callint kthread_stop() for us so the kthread() api
1090 * requires that we call do_exit(). And it isn't exported, but
1091 * complete_and_exit() seems to be a minimal wrapper around it. */
1092 complete_and_exit(NULL, status);
1093 return status;
1094}
1095
1096void ocfs2_recovery_thread(struct ocfs2_super *osb, int node_num)
1097{
1098 mlog_entry("(node_num=%d, osb->node_num = %d)\n",
1099 node_num, osb->node_num);
1100
1101 down(&osb->recovery_lock);
1102 if (osb->disable_recovery)
1103 goto out;
1104
1105 /* People waiting on recovery will wait on
1106 * the recovery map to empty. */
1107 if (!ocfs2_recovery_map_set(osb, node_num))
1108 mlog(0, "node %d already be in recovery.\n", node_num);
1109
1110 mlog(0, "starting recovery thread...\n");
1111
1112 if (osb->recovery_thread_task)
1113 goto out;
1114
1115 osb->recovery_thread_task = kthread_run(__ocfs2_recovery_thread, osb,
1116 "ocfs2rec-%d", osb->osb_id);
1117 if (IS_ERR(osb->recovery_thread_task)) {
1118 mlog_errno((int)PTR_ERR(osb->recovery_thread_task));
1119 osb->recovery_thread_task = NULL;
1120 }
1121
1122out:
1123 up(&osb->recovery_lock);
1124 wake_up(&osb->recovery_event);
1125
1126 mlog_exit_void();
1127}
1128
1129/* Does the actual journal replay and marks the journal inode as
1130 * clean. Will only replay if the journal inode is marked dirty. */
1131static int ocfs2_replay_journal(struct ocfs2_super *osb,
1132 int node_num,
1133 int slot_num)
1134{
1135 int status;
1136 int got_lock = 0;
1137 unsigned int flags;
1138 struct inode *inode = NULL;
1139 struct ocfs2_dinode *fe;
1140 journal_t *journal = NULL;
1141 struct buffer_head *bh = NULL;
1142
1143 inode = ocfs2_get_system_file_inode(osb, JOURNAL_SYSTEM_INODE,
1144 slot_num);
1145 if (inode == NULL) {
1146 status = -EACCES;
1147 mlog_errno(status);
1148 goto done;
1149 }
1150 if (is_bad_inode(inode)) {
1151 status = -EACCES;
1152 iput(inode);
1153 inode = NULL;
1154 mlog_errno(status);
1155 goto done;
1156 }
1157 SET_INODE_JOURNAL(inode);
1158
1159 status = ocfs2_meta_lock_full(inode, NULL, &bh, 1,
1160 OCFS2_META_LOCK_RECOVERY);
1161 if (status < 0) {
1162 mlog(0, "status returned from ocfs2_meta_lock=%d\n", status);
1163 if (status != -ERESTARTSYS)
1164 mlog(ML_ERROR, "Could not lock journal!\n");
1165 goto done;
1166 }
1167 got_lock = 1;
1168
1169 fe = (struct ocfs2_dinode *) bh->b_data;
1170
1171 flags = le32_to_cpu(fe->id1.journal1.ij_flags);
1172
1173 if (!(flags & OCFS2_JOURNAL_DIRTY_FL)) {
1174 mlog(0, "No recovery required for node %d\n", node_num);
1175 goto done;
1176 }
1177
1178 mlog(ML_NOTICE, "Recovering node %d from slot %d on device (%u,%u)\n",
1179 node_num, slot_num,
1180 MAJOR(osb->sb->s_dev), MINOR(osb->sb->s_dev));
1181
1182 OCFS2_I(inode)->ip_clusters = le32_to_cpu(fe->i_clusters);
1183
1184 status = ocfs2_force_read_journal(inode);
1185 if (status < 0) {
1186 mlog_errno(status);
1187 goto done;
1188 }
1189
1190 mlog(0, "calling journal_init_inode\n");
1191 journal = journal_init_inode(inode);
1192 if (journal == NULL) {
1193 mlog(ML_ERROR, "Linux journal layer error\n");
1194 status = -EIO;
1195 goto done;
1196 }
1197
1198 status = journal_load(journal);
1199 if (status < 0) {
1200 mlog_errno(status);
1201 if (!igrab(inode))
1202 BUG();
1203 journal_destroy(journal);
1204 goto done;
1205 }
1206
1207 ocfs2_clear_journal_error(osb->sb, journal, slot_num);
1208
1209 /* wipe the journal */
1210 mlog(0, "flushing the journal.\n");
1211 journal_lock_updates(journal);
1212 status = journal_flush(journal);
1213 journal_unlock_updates(journal);
1214 if (status < 0)
1215 mlog_errno(status);
1216
1217 /* This will mark the node clean */
1218 flags = le32_to_cpu(fe->id1.journal1.ij_flags);
1219 flags &= ~OCFS2_JOURNAL_DIRTY_FL;
1220 fe->id1.journal1.ij_flags = cpu_to_le32(flags);
1221
1222 status = ocfs2_write_block(osb, bh, inode);
1223 if (status < 0)
1224 mlog_errno(status);
1225
1226 if (!igrab(inode))
1227 BUG();
1228
1229 journal_destroy(journal);
1230
1231done:
1232 /* drop the lock on this nodes journal */
1233 if (got_lock)
1234 ocfs2_meta_unlock(inode, 1);
1235
1236 if (inode)
1237 iput(inode);
1238
1239 if (bh)
1240 brelse(bh);
1241
1242 mlog_exit(status);
1243 return status;
1244}
1245
1246/*
1247 * Do the most important parts of node recovery:
1248 * - Replay it's journal
1249 * - Stamp a clean local allocator file
1250 * - Stamp a clean truncate log
1251 * - Mark the node clean
1252 *
1253 * If this function completes without error, a node in OCFS2 can be
1254 * said to have been safely recovered. As a result, failure during the
1255 * second part of a nodes recovery process (local alloc recovery) is
1256 * far less concerning.
1257 */
1258static int ocfs2_recover_node(struct ocfs2_super *osb,
1259 int node_num)
1260{
1261 int status = 0;
1262 int slot_num;
1263 struct ocfs2_slot_info *si = osb->slot_info;
1264 struct ocfs2_dinode *la_copy = NULL;
1265 struct ocfs2_dinode *tl_copy = NULL;
1266
1267 mlog_entry("(node_num=%d, osb->node_num = %d)\n",
1268 node_num, osb->node_num);
1269
1270 mlog(0, "checking node %d\n", node_num);
1271
1272 /* Should not ever be called to recover ourselves -- in that
1273 * case we should've called ocfs2_journal_load instead. */
1274 if (osb->node_num == node_num)
1275 BUG();
1276
1277 slot_num = ocfs2_node_num_to_slot(si, node_num);
1278 if (slot_num == OCFS2_INVALID_SLOT) {
1279 status = 0;
1280 mlog(0, "no slot for this node, so no recovery required.\n");
1281 goto done;
1282 }
1283
1284 mlog(0, "node %d was using slot %d\n", node_num, slot_num);
1285
1286 status = ocfs2_replay_journal(osb, node_num, slot_num);
1287 if (status < 0) {
1288 mlog_errno(status);
1289 goto done;
1290 }
1291
1292 /* Stamp a clean local alloc file AFTER recovering the journal... */
1293 status = ocfs2_begin_local_alloc_recovery(osb, slot_num, &la_copy);
1294 if (status < 0) {
1295 mlog_errno(status);
1296 goto done;
1297 }
1298
1299 /* An error from begin_truncate_log_recovery is not
1300 * serious enough to warrant halting the rest of
1301 * recovery. */
1302 status = ocfs2_begin_truncate_log_recovery(osb, slot_num, &tl_copy);
1303 if (status < 0)
1304 mlog_errno(status);
1305
1306 /* Likewise, this would be a strange but ultimately not so
1307 * harmful place to get an error... */
1308 ocfs2_clear_slot(si, slot_num);
1309 status = ocfs2_update_disk_slots(osb, si);
1310 if (status < 0)
1311 mlog_errno(status);
1312
1313 /* This will kfree the memory pointed to by la_copy and tl_copy */
1314 ocfs2_queue_recovery_completion(osb->journal, slot_num, la_copy,
1315 tl_copy);
1316
1317 status = 0;
1318done:
1319
1320 mlog_exit(status);
1321 return status;
1322}
1323
1324/* Test node liveness by trylocking his journal. If we get the lock,
1325 * we drop it here. Return 0 if we got the lock, -EAGAIN if node is
1326 * still alive (we couldn't get the lock) and < 0 on error. */
1327static int ocfs2_trylock_journal(struct ocfs2_super *osb,
1328 int slot_num)
1329{
1330 int status, flags;
1331 struct inode *inode = NULL;
1332
1333 inode = ocfs2_get_system_file_inode(osb, JOURNAL_SYSTEM_INODE,
1334 slot_num);
1335 if (inode == NULL) {
1336 mlog(ML_ERROR, "access error\n");
1337 status = -EACCES;
1338 goto bail;
1339 }
1340 if (is_bad_inode(inode)) {
1341 mlog(ML_ERROR, "access error (bad inode)\n");
1342 iput(inode);
1343 inode = NULL;
1344 status = -EACCES;
1345 goto bail;
1346 }
1347 SET_INODE_JOURNAL(inode);
1348
1349 flags = OCFS2_META_LOCK_RECOVERY | OCFS2_META_LOCK_NOQUEUE;
1350 status = ocfs2_meta_lock_full(inode, NULL, NULL, 1, flags);
1351 if (status < 0) {
1352 if (status != -EAGAIN)
1353 mlog_errno(status);
1354 goto bail;
1355 }
1356
1357 ocfs2_meta_unlock(inode, 1);
1358bail:
1359 if (inode)
1360 iput(inode);
1361
1362 return status;
1363}
1364
1365/* Call this underneath ocfs2_super_lock. It also assumes that the
1366 * slot info struct has been updated from disk. */
1367int ocfs2_mark_dead_nodes(struct ocfs2_super *osb)
1368{
1369 int status, i, node_num;
1370 struct ocfs2_slot_info *si = osb->slot_info;
1371
1372 /* This is called with the super block cluster lock, so we
1373 * know that the slot map can't change underneath us. */
1374
1375 spin_lock(&si->si_lock);
1376 for(i = 0; i < si->si_num_slots; i++) {
1377 if (i == osb->slot_num)
1378 continue;
1379 if (ocfs2_is_empty_slot(si, i))
1380 continue;
1381
1382 node_num = si->si_global_node_nums[i];
1383 if (ocfs2_node_map_test_bit(osb, &osb->recovery_map, node_num))
1384 continue;
1385 spin_unlock(&si->si_lock);
1386
1387 /* Ok, we have a slot occupied by another node which
1388 * is not in the recovery map. We trylock his journal
1389 * file here to test if he's alive. */
1390 status = ocfs2_trylock_journal(osb, i);
1391 if (!status) {
1392 /* Since we're called from mount, we know that
1393 * the recovery thread can't race us on
1394 * setting / checking the recovery bits. */
1395 ocfs2_recovery_thread(osb, node_num);
1396 } else if ((status < 0) && (status != -EAGAIN)) {
1397 mlog_errno(status);
1398 goto bail;
1399 }
1400
1401 spin_lock(&si->si_lock);
1402 }
1403 spin_unlock(&si->si_lock);
1404
1405 status = 0;
1406bail:
1407 mlog_exit(status);
1408 return status;
1409}
1410
1411static int ocfs2_recover_orphans(struct ocfs2_super *osb,
1412 int slot)
1413{
1414 int status = 0;
1415 int have_disk_lock = 0;
1416 struct inode *inode = NULL;
1417 struct inode *iter;
1418 struct inode *orphan_dir_inode = NULL;
1419 unsigned long offset, blk, local;
1420 struct buffer_head *bh = NULL;
1421 struct ocfs2_dir_entry *de;
1422 struct super_block *sb = osb->sb;
1423 struct ocfs2_inode_info *oi;
1424
1425 mlog(0, "Recover inodes from orphan dir in slot %d\n", slot);
1426
1427 orphan_dir_inode = ocfs2_get_system_file_inode(osb,
1428 ORPHAN_DIR_SYSTEM_INODE,
1429 slot);
1430 if (!orphan_dir_inode) {
1431 status = -ENOENT;
1432 mlog_errno(status);
1433 goto out;
1434 }
1435
1436 down(&orphan_dir_inode->i_sem);
1437 status = ocfs2_meta_lock(orphan_dir_inode, NULL, NULL, 0);
1438 if (status < 0) {
1439 up(&orphan_dir_inode->i_sem);
1440 mlog_errno(status);
1441 goto out;
1442 }
1443 have_disk_lock = 1;
1444
1445 offset = 0;
1446 iter = NULL;
1447 while(offset < i_size_read(orphan_dir_inode)) {
1448 blk = offset >> sb->s_blocksize_bits;
1449
1450 bh = ocfs2_bread(orphan_dir_inode, blk, &status, 0);
1451 if (!bh)
1452 status = -EINVAL;
1453 if (status < 0) {
1454 up(&orphan_dir_inode->i_sem);
1455 if (bh)
1456 brelse(bh);
1457 mlog_errno(status);
1458 goto out;
1459 }
1460
1461 local = 0;
1462 while(offset < i_size_read(orphan_dir_inode)
1463 && local < sb->s_blocksize) {
1464 de = (struct ocfs2_dir_entry *) (bh->b_data + local);
1465
1466 if (!ocfs2_check_dir_entry(orphan_dir_inode,
1467 de, bh, local)) {
1468 up(&orphan_dir_inode->i_sem);
1469 status = -EINVAL;
1470 mlog_errno(status);
1471 brelse(bh);
1472 goto out;
1473 }
1474
1475 local += le16_to_cpu(de->rec_len);
1476 offset += le16_to_cpu(de->rec_len);
1477
1478 /* I guess we silently fail on no inode? */
1479 if (!le64_to_cpu(de->inode))
1480 continue;
1481 if (de->file_type > OCFS2_FT_MAX) {
1482 mlog(ML_ERROR,
1483 "block %llu contains invalid de: "
1484 "inode = %"MLFu64", rec_len = %u, "
1485 "name_len = %u, file_type = %u, "
1486 "name='%.*s'\n",
1487 (unsigned long long)bh->b_blocknr,
1488 le64_to_cpu(de->inode),
1489 le16_to_cpu(de->rec_len),
1490 de->name_len,
1491 de->file_type,
1492 de->name_len,
1493 de->name);
1494 continue;
1495 }
1496 if (de->name_len == 1 && !strncmp(".", de->name, 1))
1497 continue;
1498 if (de->name_len == 2 && !strncmp("..", de->name, 2))
1499 continue;
1500
1501 iter = ocfs2_iget(osb, le64_to_cpu(de->inode));
1502 if (IS_ERR(iter))
1503 continue;
1504
1505 mlog(0, "queue orphan %"MLFu64"\n",
1506 OCFS2_I(iter)->ip_blkno);
1507 OCFS2_I(iter)->ip_next_orphan = inode;
1508 inode = iter;
1509 }
1510 brelse(bh);
1511 }
1512 up(&orphan_dir_inode->i_sem);
1513
1514 ocfs2_meta_unlock(orphan_dir_inode, 0);
1515 have_disk_lock = 0;
1516
1517 iput(orphan_dir_inode);
1518 orphan_dir_inode = NULL;
1519
1520 while (inode) {
1521 oi = OCFS2_I(inode);
1522 mlog(0, "iput orphan %"MLFu64"\n", oi->ip_blkno);
1523
1524 iter = oi->ip_next_orphan;
1525
1526 spin_lock(&oi->ip_lock);
1527 /* Delete voting may have set these on the assumption
1528 * that the other node would wipe them successfully.
1529 * If they are still in the node's orphan dir, we need
1530 * to reset that state. */
1531 oi->ip_flags &= ~(OCFS2_INODE_DELETED|OCFS2_INODE_SKIP_DELETE);
1532
1533 /* Set the proper information to get us going into
1534 * ocfs2_delete_inode. */
1535 oi->ip_flags |= OCFS2_INODE_MAYBE_ORPHANED;
1536 oi->ip_orphaned_slot = slot;
1537 spin_unlock(&oi->ip_lock);
1538
1539 iput(inode);
1540
1541 inode = iter;
1542 }
1543
1544out:
1545 if (have_disk_lock)
1546 ocfs2_meta_unlock(orphan_dir_inode, 0);
1547
1548 if (orphan_dir_inode)
1549 iput(orphan_dir_inode);
1550
1551 return status;
1552}
1553
1554static int ocfs2_wait_on_mount(struct ocfs2_super *osb)
1555{
1556 /* This check is good because ocfs2 will wait on our recovery
1557 * thread before changing it to something other than MOUNTED
1558 * or DISABLED. */
1559 wait_event(osb->osb_mount_event,
1560 atomic_read(&osb->vol_state) == VOLUME_MOUNTED ||
1561 atomic_read(&osb->vol_state) == VOLUME_DISABLED);
1562
1563 /* If there's an error on mount, then we may never get to the
1564 * MOUNTED flag, but this is set right before
1565 * dismount_volume() so we can trust it. */
1566 if (atomic_read(&osb->vol_state) == VOLUME_DISABLED) {
1567 mlog(0, "mount error, exiting!\n");
1568 return -EBUSY;
1569 }
1570
1571 return 0;
1572}
1573
1574static int ocfs2_commit_thread(void *arg)
1575{
1576 int status;
1577 struct ocfs2_super *osb = arg;
1578 struct ocfs2_journal *journal = osb->journal;
1579
1580 /* we can trust j_num_trans here because _should_stop() is only set in
1581 * shutdown and nobody other than ourselves should be able to start
1582 * transactions. committing on shutdown might take a few iterations
1583 * as final transactions put deleted inodes on the list */
1584 while (!(kthread_should_stop() &&
1585 atomic_read(&journal->j_num_trans) == 0)) {
1586
1587 wait_event_interruptible_timeout(osb->checkpoint_event,
1588 atomic_read(&journal->j_num_trans)
1589 || kthread_should_stop(),
1590 OCFS2_CHECKPOINT_INTERVAL);
1591
1592 status = ocfs2_commit_cache(osb);
1593 if (status < 0)
1594 mlog_errno(status);
1595
1596 if (kthread_should_stop() && atomic_read(&journal->j_num_trans)){
1597 mlog(ML_KTHREAD,
1598 "commit_thread: %u transactions pending on "
1599 "shutdown\n",
1600 atomic_read(&journal->j_num_trans));
1601 }
1602 }
1603
1604 return 0;
1605}
1606
1607/* Look for a dirty journal without taking any cluster locks. Used for
1608 * hard readonly access to determine whether the file system journals
1609 * require recovery. */
1610int ocfs2_check_journals_nolocks(struct ocfs2_super *osb)
1611{
1612 int ret = 0;
1613 unsigned int slot;
1614 struct buffer_head *di_bh;
1615 struct ocfs2_dinode *di;
1616 struct inode *journal = NULL;
1617
1618 for(slot = 0; slot < osb->max_slots; slot++) {
1619 journal = ocfs2_get_system_file_inode(osb,
1620 JOURNAL_SYSTEM_INODE,
1621 slot);
1622 if (!journal || is_bad_inode(journal)) {
1623 ret = -EACCES;
1624 mlog_errno(ret);
1625 goto out;
1626 }
1627
1628 di_bh = NULL;
1629 ret = ocfs2_read_block(osb, OCFS2_I(journal)->ip_blkno, &di_bh,
1630 0, journal);
1631 if (ret < 0) {
1632 mlog_errno(ret);
1633 goto out;
1634 }
1635
1636 di = (struct ocfs2_dinode *) di_bh->b_data;
1637
1638 if (le32_to_cpu(di->id1.journal1.ij_flags) &
1639 OCFS2_JOURNAL_DIRTY_FL)
1640 ret = -EROFS;
1641
1642 brelse(di_bh);
1643 if (ret)
1644 break;
1645 }
1646
1647out:
1648 if (journal)
1649 iput(journal);
1650
1651 return ret;
1652}
diff --git a/fs/ocfs2/journal.h b/fs/ocfs2/journal.h
new file mode 100644
index 000000000000..7d0a816184fa
--- /dev/null
+++ b/fs/ocfs2/journal.h
@@ -0,0 +1,457 @@
1/* -*- mode: c; c-basic-offset: 8; -*-
2 * vim: noexpandtab sw=8 ts=8 sts=0:
3 *
4 * journal.h
5 *
6 * Defines journalling api and structures.
7 *
8 * Copyright (C) 2003, 2005 Oracle. All rights reserved.
9 *
10 * This program is free software; you can redistribute it and/or
11 * modify it under the terms of the GNU General Public
12 * License as published by the Free Software Foundation; either
13 * version 2 of the License, or (at your option) any later version.
14 *
15 * This program is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
18 * General Public License for more details.
19 *
20 * You should have received a copy of the GNU General Public
21 * License along with this program; if not, write to the
22 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
23 * Boston, MA 021110-1307, USA.
24 */
25
26#ifndef OCFS2_JOURNAL_H
27#define OCFS2_JOURNAL_H
28
29#include <linux/fs.h>
30#include <linux/jbd.h>
31
/* Timeout, in jiffies, that the commit thread passes to
 * wait_event_interruptible_timeout() while waiting on
 * osb->checkpoint_event. */
#define OCFS2_CHECKPOINT_INTERVAL (8 * HZ)

/* Values stored in ocfs2_journal.j_state. */
enum ocfs2_journal_state {
	OCFS2_JOURNAL_FREE = 0,
	OCFS2_JOURNAL_LOADED,
	OCFS2_JOURNAL_IN_SHUTDOWN,
};
39
40struct ocfs2_super;
41struct ocfs2_dinode;
42struct ocfs2_journal_handle;
43
/* In-memory bookkeeping for one ocfs2 journal. */
struct ocfs2_journal {
	enum ocfs2_journal_state j_state; /* Journals current state */

	journal_t *j_journal; /* The kernels journal type */
	struct inode *j_inode; /* Kernel inode pointing to
				* this journal */
	struct ocfs2_super *j_osb; /* pointer to the super
				    * block for the node
				    * we're currently
				    * running on -- not
				    * necessarily the super
				    * block from the node
				    * which we usually run
				    * from (recovery,
				    * etc) */
	struct buffer_head *j_bh; /* Journal disk inode block */
	atomic_t j_num_trans; /* Number of transactions
			       * currently in the system. */
	/* Current transaction id. Advanced by ocfs2_inc_trans_id(), which
	 * never leaves it at zero; the helpers in this header read and
	 * write it while holding trans_inc_lock. */
	unsigned long j_trans_id;
	struct rw_semaphore j_trans_barrier;
	/* ocfs2_checkpoint_inode() sleeps on this queue until the inode
	 * reports as fully checkpointed. */
	wait_queue_head_t j_checkpointed;

	spinlock_t j_lock;
	struct list_head j_la_cleanups;
	struct work_struct j_recovery_work;
};
70
71extern spinlock_t trans_inc_lock;
72
73/* wrap j_trans_id so we never have it equal to zero. */
74static inline unsigned long ocfs2_inc_trans_id(struct ocfs2_journal *j)
75{
76 unsigned long old_id;
77 spin_lock(&trans_inc_lock);
78 old_id = j->j_trans_id++;
79 if (unlikely(!j->j_trans_id))
80 j->j_trans_id = 1;
81 spin_unlock(&trans_inc_lock);
82 return old_id;
83}
84
85static inline void ocfs2_set_inode_lock_trans(struct ocfs2_journal *journal,
86 struct inode *inode)
87{
88 spin_lock(&trans_inc_lock);
89 OCFS2_I(inode)->ip_last_trans = journal->j_trans_id;
90 spin_unlock(&trans_inc_lock);
91}
92
93/* Used to figure out whether it's safe to drop a metadata lock on an
94 * inode. Returns true if all the inodes changes have been
95 * checkpointed to disk. You should be holding the spinlock on the
96 * metadata lock while calling this to be sure that nobody can take
97 * the lock and put it on another transaction. */
98static inline int ocfs2_inode_fully_checkpointed(struct inode *inode)
99{
100 int ret;
101 struct ocfs2_journal *journal = OCFS2_SB(inode->i_sb)->journal;
102
103 spin_lock(&trans_inc_lock);
104 ret = time_after(journal->j_trans_id, OCFS2_I(inode)->ip_last_trans);
105 spin_unlock(&trans_inc_lock);
106 return ret;
107}
108
109/* convenience function to check if an inode is still new (has never
110 * hit disk) Will do you a favor and set created_trans = 0 when you've
111 * been checkpointed. returns '1' if the inode is still new. */
112static inline int ocfs2_inode_is_new(struct inode *inode)
113{
114 int ret;
115
116 /* System files are never "new" as they're written out by
117 * mkfs. This helps us early during mount, before we have the
118 * journal open and j_trans_id could be junk. */
119 if (OCFS2_I(inode)->ip_flags & OCFS2_INODE_SYSTEM_FILE)
120 return 0;
121 spin_lock(&trans_inc_lock);
122 ret = !(time_after(OCFS2_SB(inode->i_sb)->journal->j_trans_id,
123 OCFS2_I(inode)->ip_created_trans));
124 if (!ret)
125 OCFS2_I(inode)->ip_created_trans = 0;
126 spin_unlock(&trans_inc_lock);
127 return ret;
128}
129
130static inline void ocfs2_inode_set_new(struct ocfs2_super *osb,
131 struct inode *inode)
132{
133 spin_lock(&trans_inc_lock);
134 OCFS2_I(inode)->ip_created_trans = osb->journal->j_trans_id;
135 spin_unlock(&trans_inc_lock);
136}
137
/* NOTE(review): presumably the slab cache that struct ocfs2_journal_lock
 * entries are allocated from -- confirm against journal.c. */
extern kmem_cache_t *ocfs2_lock_cache;

/* List node pairing an inode with a position on a lock list. */
struct ocfs2_journal_lock {
	struct inode *jl_inode;
	struct list_head jl_lock_list;
};
144
/* Per-transaction handle wrapping the kernel journal handle. */
struct ocfs2_journal_handle {
	handle_t *k_handle; /* kernel handle. */
	struct ocfs2_journal *journal;
	u32 flags; /* OCFS2_HANDLE_* bits, see flags below. */
	int max_buffs; /* Buffs reserved by this handle */

	/* The following two fields are for ocfs2_handle_add_lock */
	int num_locks;
	struct list_head locks; /* A bunch of locks to
				 * release on commit. */

	/* Inodes attached with ocfs2_handle_add_inode. */
	struct list_head inode_list;
};
159
160#define OCFS2_HANDLE_STARTED 1
161/* should we sync-commit this handle? */
162#define OCFS2_HANDLE_SYNC 2
163static inline int ocfs2_handle_started(struct ocfs2_journal_handle *handle)
164{
165 return handle->flags & OCFS2_HANDLE_STARTED;
166}
167
168static inline void ocfs2_handle_set_sync(struct ocfs2_journal_handle *handle, int sync)
169{
170 if (sync)
171 handle->flags |= OCFS2_HANDLE_SYNC;
172 else
173 handle->flags &= ~OCFS2_HANDLE_SYNC;
174}
175
176/* Exported only for the journal struct init code in super.c. Do not call. */
177void ocfs2_complete_recovery(void *data);
178
179/*
180 * Journal Control:
181 * Initialize, Load, Shutdown, Wipe a journal.
182 *
183 * ocfs2_journal_init - Initialize journal structures in the OSB.
184 * ocfs2_journal_load - Load the given journal off disk. Replay it if
185 * there's transactions still in there.
186 * ocfs2_journal_shutdown - Shutdown a journal, this will flush all
187 * uncommitted, uncheckpointed transactions.
188 * ocfs2_journal_wipe - Wipe transactions from a journal. Optionally
189 * zero out each block.
190 * ocfs2_recovery_thread - Perform recovery on a node. osb is our own osb.
191 * ocfs2_mark_dead_nodes - Start recovery on nodes we won't get a heartbeat
192 * event on.
193 * ocfs2_start_checkpoint - Kick the commit thread to do a checkpoint.
194 */
195void ocfs2_set_journal_params(struct ocfs2_super *osb);
196int ocfs2_journal_init(struct ocfs2_journal *journal,
197 int *dirty);
198void ocfs2_journal_shutdown(struct ocfs2_super *osb);
199int ocfs2_journal_wipe(struct ocfs2_journal *journal,
200 int full);
201int ocfs2_journal_load(struct ocfs2_journal *journal);
202int ocfs2_check_journals_nolocks(struct ocfs2_super *osb);
203void ocfs2_recovery_thread(struct ocfs2_super *osb,
204 int node_num);
205int ocfs2_mark_dead_nodes(struct ocfs2_super *osb);
206void ocfs2_complete_mount_recovery(struct ocfs2_super *osb);
207
/*
 * Kick the commit thread to do a checkpoint: flag needs_checkpoint on
 * the osb and wake anyone sleeping on osb->checkpoint_event. Does not
 * wait for the checkpoint to happen.
 */
static inline void ocfs2_start_checkpoint(struct ocfs2_super *osb)
{
	atomic_set(&osb->needs_checkpoint, 1);
	wake_up(&osb->checkpoint_event);
}
213
214static inline void ocfs2_checkpoint_inode(struct inode *inode)
215{
216 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
217
218 if (!ocfs2_inode_fully_checkpointed(inode)) {
219 /* WARNING: This only kicks off a single
220 * checkpoint. If someone races you and adds more
221 * metadata to the journal, you won't know, and will
222 * wind up waiting *alot* longer than necessary. Right
223 * now we only use this in clear_inode so that's
224 * OK. */
225 ocfs2_start_checkpoint(osb);
226
227 wait_event(osb->journal->j_checkpointed,
228 ocfs2_inode_fully_checkpointed(inode));
229 }
230}
231
232/*
233 * Transaction Handling:
234 * Manage the lifetime of a transaction handle.
235 *
236 * ocfs2_alloc_handle - Only allocate a handle so we can start putting
237 * cluster locks on it. To actually change blocks,
238 * call ocfs2_start_trans with the handle returned
239 * from this function. You may call ocfs2_commit_trans
240 * at any time in the lifetime of a handle.
241 * ocfs2_start_trans - Begin a transaction. Give it an upper estimate of
242 * the number of blocks that will be changed during
243 * this handle.
244 * ocfs2_commit_trans - Complete a handle.
245 * ocfs2_extend_trans - Extend a handle by nblocks credits. This may
246 * commit the handle to disk in the process, but will
247 * not release any locks taken during the transaction.
248 * ocfs2_journal_access - Notify the handle that we want to journal this
249 * buffer. Will have to call ocfs2_journal_dirty once
250 * we've actually dirtied it. Type is one of the OCFS2_JOURNAL_ACCESS_* values.
251 * ocfs2_journal_dirty - Mark a journalled buffer as having dirty data.
252 * ocfs2_journal_dirty_data - Indicate that a data buffer should go out before
253 * the current handle commits.
254 * ocfs2_handle_add_lock - Sometimes we need to delay lock release
255 * until after a transaction has been completed. Use
256 * ocfs2_handle_add_lock to indicate that a lock needs
257 * to be released at the end of that handle. Locks
258 * will be released in the order that they are added.
259 * ocfs2_handle_add_inode - Add a locked inode to a transaction.
260 */
261
262/* You must always start_trans with a number of buffs > 0, but it's
263 * perfectly legal to go through an entire transaction without having
264 * dirtied any buffers. */
265struct ocfs2_journal_handle *ocfs2_alloc_handle(struct ocfs2_super *osb);
266struct ocfs2_journal_handle *ocfs2_start_trans(struct ocfs2_super *osb,
267 struct ocfs2_journal_handle *handle,
268 int max_buffs);
269void ocfs2_commit_trans(struct ocfs2_journal_handle *handle);
270int ocfs2_extend_trans(struct ocfs2_journal_handle *handle,
271 int nblocks);
272
273/*
274 * Create access is for when we get a newly created buffer and we're
275 * not gonna read it off disk, but rather fill it ourselves. Right
276 * now, we don't do anything special with this (it turns into a write
277 * request), but this is a good placeholder in case we do...
278 *
279 * Write access is for when we read a block off disk and are going to
280 * modify it. This way the journalling layer knows it may need to make
281 * a copy of that block (if it's part of another, uncommitted
282 * transaction) before we do so.
283 */
284#define OCFS2_JOURNAL_ACCESS_CREATE 0
285#define OCFS2_JOURNAL_ACCESS_WRITE 1
286#define OCFS2_JOURNAL_ACCESS_UNDO 2
287
288int ocfs2_journal_access(struct ocfs2_journal_handle *handle,
289 struct inode *inode,
290 struct buffer_head *bh,
291 int type);
292/*
293 * A word about the journal_access/journal_dirty "dance". It is
294 * entirely legal to journal_access a buffer more than once (as long
295 * as the access type is the same -- I'm not sure what will happen if
296 * access type is different but this should never happen anyway) It is
297 * also legal to journal_dirty a buffer more than once. In fact, you
298 * can even journal_access a buffer after you've done a
299 * journal_access/journal_dirty pair. The only thing you cannot do
300 * however, is journal_dirty a buffer which you haven't yet passed to
301 * journal_access at least once.
302 *
303 * That said, 99% of the time this doesn't matter and this is what the
304 * path looks like:
305 *
306 * <read a bh>
307 * ocfs2_journal_access(handle, inode, bh, OCFS2_JOURNAL_ACCESS_WRITE);
308 * <modify the bh>
309 * ocfs2_journal_dirty(handle, bh);
310 */
311int ocfs2_journal_dirty(struct ocfs2_journal_handle *handle,
312 struct buffer_head *bh);
313int ocfs2_journal_dirty_data(handle_t *handle,
314 struct buffer_head *bh);
315int ocfs2_handle_add_lock(struct ocfs2_journal_handle *handle,
316 struct inode *inode);
317/*
318 * Use this to protect from other processes reading buffer state while
319 * it's in flight.
320 */
321void ocfs2_handle_add_inode(struct ocfs2_journal_handle *handle,
322 struct inode *inode);
323
324/*
325 * Credit Macros:
326 * Convenience macros to calculate number of credits needed.
327 *
328 * For convenience sake, I have a set of macros here which calculate
329 * the *maximum* number of sectors which will be changed for various
330 * metadata updates.
331 */
332
/* simple file updates like chmod, etc. */
#define OCFS2_INODE_UPDATE_CREDITS 1

/* get one bit out of a suballocator: dinode + group descriptor +
 * prev. group desc. if we relink. */
#define OCFS2_SUBALLOC_ALLOC (3)

/* dinode + group descriptor update. We don't relink on free yet. */
#define OCFS2_SUBALLOC_FREE (2)

/* Same cost as a plain inode update (1). */
#define OCFS2_TRUNCATE_LOG_UPDATE OCFS2_INODE_UPDATE_CREDITS
/* One suballocator free plus the truncate log update (2 + 1 = 3). */
#define OCFS2_TRUNCATE_LOG_FLUSH_ONE_REC (OCFS2_SUBALLOC_FREE \
					  + OCFS2_TRUNCATE_LOG_UPDATE)

/* data block for new dir/symlink, 2 for bitmap updates (bitmap fe +
 * bitmap block for the new bit) */
#define OCFS2_DIR_LINK_ADDITIONAL_CREDITS (1 + 2)

/* parent fe, parent block, new file entry, inode alloc fe, inode alloc
 * group descriptor + mkdir/symlink blocks (3 + 3 + 3 = 9) */
#define OCFS2_MKNOD_CREDITS (3 + OCFS2_SUBALLOC_ALLOC \
			     + OCFS2_DIR_LINK_ADDITIONAL_CREDITS)

/* local alloc metadata change + main bitmap updates (1 + 3 + 2 = 6) */
#define OCFS2_WINDOW_MOVE_CREDITS (OCFS2_INODE_UPDATE_CREDITS \
				   + OCFS2_SUBALLOC_ALLOC + OCFS2_SUBALLOC_FREE)

/* used when we don't need an allocation change for a dir extend. One
 * for the dinode, one for the new block. */
#define OCFS2_SIMPLE_DIR_EXTEND_CREDITS (2)

/* file update (nlink, etc) + dir entry block (1 + 1 = 2) */
#define OCFS2_LINK_CREDITS (OCFS2_INODE_UPDATE_CREDITS + 1)

/* inode + dir inode (if we unlink a dir), + dir entry block + orphan
 * dir inode link (2 + 1 + 2 = 5) */
#define OCFS2_UNLINK_CREDITS (2 * OCFS2_INODE_UPDATE_CREDITS + 1 \
			      + OCFS2_LINK_CREDITS)

/* dinode + orphan dir dinode + inode alloc dinode + orphan dir entry +
 * inode alloc group descriptor (3 + 1 + 1 = 5) */
#define OCFS2_DELETE_INODE_CREDITS (3 * OCFS2_INODE_UPDATE_CREDITS + 1 + 1)

/* dinode update, old dir dinode update, new dir dinode update, old
 * dir dir entry, new dir dir entry, dir entry update for renaming
 * directory + target unlink (3 + 3 + 5 = 11) */
#define OCFS2_RENAME_CREDITS (3 * OCFS2_INODE_UPDATE_CREDITS + 3 \
			      + OCFS2_UNLINK_CREDITS)
381
382static inline int ocfs2_calc_extend_credits(struct super_block *sb,
383 struct ocfs2_dinode *fe,
384 u32 bits_wanted)
385{
386 int bitmap_blocks, sysfile_bitmap_blocks, dinode_blocks;
387
388 /* bitmap dinode, group desc. + relinked group. */
389 bitmap_blocks = OCFS2_SUBALLOC_ALLOC;
390
391 /* we might need to shift tree depth so lets assume an
392 * absolute worst case of complete fragmentation. Even with
393 * that, we only need one update for the dinode, and then
394 * however many metadata chunks needed * a remaining suballoc
395 * alloc. */
396 sysfile_bitmap_blocks = 1 +
397 (OCFS2_SUBALLOC_ALLOC - 1) * ocfs2_extend_meta_needed(fe);
398
399 /* this does not include *new* metadata blocks, which are
400 * accounted for in sysfile_bitmap_blocks. fe +
401 * prev. last_eb_blk + blocks along edge of tree.
402 * calc_symlink_credits passes because we just need 1
403 * credit for the dinode there. */
404 dinode_blocks = 1 + 1 + le16_to_cpu(fe->id2.i_list.l_tree_depth);
405
406 return bitmap_blocks + sysfile_bitmap_blocks + dinode_blocks;
407}
408
409static inline int ocfs2_calc_symlink_credits(struct super_block *sb)
410{
411 int blocks = OCFS2_MKNOD_CREDITS;
412
413 /* links can be longer than one block so we may update many
414 * within our single allocated extent. */
415 blocks += ocfs2_clusters_to_blocks(sb, 1);
416
417 return blocks;
418}
419
420static inline int ocfs2_calc_group_alloc_credits(struct super_block *sb,
421 unsigned int cpg)
422{
423 int blocks;
424 int bitmap_blocks = OCFS2_SUBALLOC_ALLOC + 1;
425 /* parent inode update + new block group header + bitmap inode update
426 + bitmap blocks affected */
427 blocks = 1 + 1 + 1 + bitmap_blocks;
428 return blocks;
429}
430
431static inline int ocfs2_calc_tree_trunc_credits(struct super_block *sb,
432 unsigned int clusters_to_del,
433 struct ocfs2_dinode *fe,
434 struct ocfs2_extent_list *last_el)
435{
436 /* for dinode + all headers in this pass + update to next leaf */
437 u16 next_free = le16_to_cpu(last_el->l_next_free_rec);
438 u16 tree_depth = le16_to_cpu(fe->id2.i_list.l_tree_depth);
439 int credits = 1 + tree_depth + 1;
440 int i;
441
442 i = next_free - 1;
443 BUG_ON(i < 0);
444
445 /* We may be deleting metadata blocks, so metadata alloc dinode +
446 one desc. block for each possible delete. */
447 if (tree_depth && next_free == 1 &&
448 le32_to_cpu(last_el->l_recs[i].e_clusters) == clusters_to_del)
449 credits += 1 + tree_depth;
450
451 /* update to the truncate log. */
452 credits += OCFS2_TRUNCATE_LOG_UPDATE;
453
454 return credits;
455}
456
457#endif /* OCFS2_JOURNAL_H */
diff --git a/fs/ocfs2/localalloc.c b/fs/ocfs2/localalloc.c
new file mode 100644
index 000000000000..fe373a2101d9
--- /dev/null
+++ b/fs/ocfs2/localalloc.c
@@ -0,0 +1,983 @@
1/* -*- mode: c; c-basic-offset: 8; -*-
2 * vim: noexpandtab sw=8 ts=8 sts=0:
3 *
4 * localalloc.c
5 *
6 * Node local data allocation
7 *
8 * Copyright (C) 2002, 2004 Oracle. All rights reserved.
9 *
10 * This program is free software; you can redistribute it and/or
11 * modify it under the terms of the GNU General Public
12 * License as published by the Free Software Foundation; either
13 * version 2 of the License, or (at your option) any later version.
14 *
15 * This program is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
18 * General Public License for more details.
19 *
20 * You should have received a copy of the GNU General Public
21 * License along with this program; if not, write to the
22 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
23 * Boston, MA 021110-1307, USA.
24 */
25
26#include <linux/fs.h>
27#include <linux/types.h>
28#include <linux/slab.h>
29#include <linux/highmem.h>
30#include <linux/bitops.h>
31
32#define MLOG_MASK_PREFIX ML_DISK_ALLOC
33#include <cluster/masklog.h>
34
35#include "ocfs2.h"
36
37#include "alloc.h"
38#include "dlmglue.h"
39#include "inode.h"
40#include "journal.h"
41#include "localalloc.h"
42#include "suballoc.h"
43#include "super.h"
44#include "sysfile.h"
45
46#include "buffer_head_io.h"
47
48#define OCFS2_LOCAL_ALLOC(dinode) (&((dinode)->id2.i_lab))
49
50static inline int ocfs2_local_alloc_window_bits(struct ocfs2_super *osb);
51
52static u32 ocfs2_local_alloc_count_bits(struct ocfs2_dinode *alloc);
53
54static int ocfs2_local_alloc_find_clear_bits(struct ocfs2_super *osb,
55 struct ocfs2_dinode *alloc,
56 u32 numbits);
57
58static void ocfs2_clear_local_alloc(struct ocfs2_dinode *alloc);
59
60static int ocfs2_sync_local_to_main(struct ocfs2_super *osb,
61 struct ocfs2_journal_handle *handle,
62 struct ocfs2_dinode *alloc,
63 struct inode *main_bm_inode,
64 struct buffer_head *main_bm_bh);
65
66static int ocfs2_local_alloc_reserve_for_window(struct ocfs2_super *osb,
67 struct ocfs2_journal_handle *handle,
68 struct ocfs2_alloc_context **ac,
69 struct inode **bitmap_inode,
70 struct buffer_head **bitmap_bh);
71
72static int ocfs2_local_alloc_new_window(struct ocfs2_super *osb,
73 struct ocfs2_journal_handle *handle,
74 struct ocfs2_alloc_context *ac);
75
76static int ocfs2_local_alloc_slide_window(struct ocfs2_super *osb,
77 struct inode *local_alloc_inode);
78
79/*
80 * Determine how large our local alloc window should be, in bits.
81 *
82 * These values (and the behavior in ocfs2_alloc_should_use_local) have
83 * been chosen so that most allocations, including new block groups go
84 * through local alloc.
85 */
86static inline int ocfs2_local_alloc_window_bits(struct ocfs2_super *osb)
87{
88 BUG_ON(osb->s_clustersize_bits < 12);
89
90 return 2048 >> (osb->s_clustersize_bits - 12);
91}
92
93/*
94 * Tell us whether a given allocation should use the local alloc
95 * file. Otherwise, it has to go to the main bitmap.
96 */
97int ocfs2_alloc_should_use_local(struct ocfs2_super *osb, u64 bits)
98{
99 int la_bits = ocfs2_local_alloc_window_bits(osb);
100
101 if (osb->local_alloc_state != OCFS2_LA_ENABLED)
102 return 0;
103
104 /* la_bits should be at least twice the size (in clusters) of
105 * a new block group. We want to be sure block group
106 * allocations go through the local alloc, so allow an
107 * allocation to take up to half the bitmap. */
108 if (bits > (la_bits / 2))
109 return 0;
110
111 return 1;
112}
113
114int ocfs2_load_local_alloc(struct ocfs2_super *osb)
115{
116 int status = 0;
117 struct ocfs2_dinode *alloc = NULL;
118 struct buffer_head *alloc_bh = NULL;
119 u32 num_used;
120 struct inode *inode = NULL;
121 struct ocfs2_local_alloc *la;
122
123 mlog_entry_void();
124
125 /* read the alloc off disk */
126 inode = ocfs2_get_system_file_inode(osb, LOCAL_ALLOC_SYSTEM_INODE,
127 osb->slot_num);
128 if (!inode) {
129 status = -EINVAL;
130 mlog_errno(status);
131 goto bail;
132 }
133
134 status = ocfs2_read_block(osb, OCFS2_I(inode)->ip_blkno,
135 &alloc_bh, 0, inode);
136 if (status < 0) {
137 mlog_errno(status);
138 goto bail;
139 }
140
141 alloc = (struct ocfs2_dinode *) alloc_bh->b_data;
142 la = OCFS2_LOCAL_ALLOC(alloc);
143
144 if (!(le32_to_cpu(alloc->i_flags) &
145 (OCFS2_LOCAL_ALLOC_FL|OCFS2_BITMAP_FL))) {
146 mlog(ML_ERROR, "Invalid local alloc inode, %"MLFu64"\n",
147 OCFS2_I(inode)->ip_blkno);
148 status = -EINVAL;
149 goto bail;
150 }
151
152 if ((la->la_size == 0) ||
153 (le16_to_cpu(la->la_size) > ocfs2_local_alloc_size(inode->i_sb))) {
154 mlog(ML_ERROR, "Local alloc size is invalid (la_size = %u)\n",
155 le16_to_cpu(la->la_size));
156 status = -EINVAL;
157 goto bail;
158 }
159
160 /* do a little verification. */
161 num_used = ocfs2_local_alloc_count_bits(alloc);
162
163 /* hopefully the local alloc has always been recovered before
164 * we load it. */
165 if (num_used
166 || alloc->id1.bitmap1.i_used
167 || alloc->id1.bitmap1.i_total
168 || la->la_bm_off)
169 mlog(ML_ERROR, "Local alloc hasn't been recovered!\n"
170 "found = %u, set = %u, taken = %u, off = %u\n",
171 num_used, le32_to_cpu(alloc->id1.bitmap1.i_used),
172 le32_to_cpu(alloc->id1.bitmap1.i_total),
173 OCFS2_LOCAL_ALLOC(alloc)->la_bm_off);
174
175 osb->local_alloc_bh = alloc_bh;
176 osb->local_alloc_state = OCFS2_LA_ENABLED;
177
178bail:
179 if (status < 0)
180 if (alloc_bh)
181 brelse(alloc_bh);
182 if (inode)
183 iput(inode);
184
185 mlog_exit(status);
186 return status;
187}
188
189/*
190 * return any unused bits to the bitmap and write out a clean
191 * local_alloc.
192 *
193 * local_alloc_bh is optional. If not passed, we will simply use the
194 * one off osb. If you do pass it however, be warned that it *will* be
195 * returned brelse'd and NULL'd out.*/
196void ocfs2_shutdown_local_alloc(struct ocfs2_super *osb)
197{
198 int status;
199 struct ocfs2_journal_handle *handle = NULL;
200 struct inode *local_alloc_inode = NULL;
201 struct buffer_head *bh = NULL;
202 struct buffer_head *main_bm_bh = NULL;
203 struct inode *main_bm_inode = NULL;
204 struct ocfs2_dinode *alloc_copy = NULL;
205 struct ocfs2_dinode *alloc = NULL;
206
207 mlog_entry_void();
208
209 if (osb->local_alloc_state == OCFS2_LA_UNUSED)
210 goto bail;
211
212 local_alloc_inode =
213 ocfs2_get_system_file_inode(osb,
214 LOCAL_ALLOC_SYSTEM_INODE,
215 osb->slot_num);
216 if (!local_alloc_inode) {
217 status = -ENOENT;
218 mlog_errno(status);
219 goto bail;
220 }
221
222 osb->local_alloc_state = OCFS2_LA_DISABLED;
223
224 handle = ocfs2_alloc_handle(osb);
225 if (!handle) {
226 status = -ENOMEM;
227 mlog_errno(status);
228 goto bail;
229 }
230
231 main_bm_inode = ocfs2_get_system_file_inode(osb,
232 GLOBAL_BITMAP_SYSTEM_INODE,
233 OCFS2_INVALID_SLOT);
234 if (!main_bm_inode) {
235 status = -EINVAL;
236 mlog_errno(status);
237 goto bail;
238 }
239
240 ocfs2_handle_add_inode(handle, main_bm_inode);
241 status = ocfs2_meta_lock(main_bm_inode, handle, &main_bm_bh, 1);
242 if (status < 0) {
243 mlog_errno(status);
244 goto bail;
245 }
246
247 /* WINDOW_MOVE_CREDITS is a bit heavy... */
248 handle = ocfs2_start_trans(osb, handle, OCFS2_WINDOW_MOVE_CREDITS);
249 if (IS_ERR(handle)) {
250 mlog_errno(PTR_ERR(handle));
251 handle = NULL;
252 goto bail;
253 }
254
255 bh = osb->local_alloc_bh;
256 alloc = (struct ocfs2_dinode *) bh->b_data;
257
258 alloc_copy = kmalloc(bh->b_size, GFP_KERNEL);
259 if (!alloc_copy) {
260 status = -ENOMEM;
261 goto bail;
262 }
263 memcpy(alloc_copy, alloc, bh->b_size);
264
265 status = ocfs2_journal_access(handle, local_alloc_inode, bh,
266 OCFS2_JOURNAL_ACCESS_WRITE);
267 if (status < 0) {
268 mlog_errno(status);
269 goto bail;
270 }
271
272 ocfs2_clear_local_alloc(alloc);
273
274 status = ocfs2_journal_dirty(handle, bh);
275 if (status < 0) {
276 mlog_errno(status);
277 goto bail;
278 }
279
280 brelse(bh);
281 osb->local_alloc_bh = NULL;
282 osb->local_alloc_state = OCFS2_LA_UNUSED;
283
284 status = ocfs2_sync_local_to_main(osb, handle, alloc_copy,
285 main_bm_inode, main_bm_bh);
286 if (status < 0)
287 mlog_errno(status);
288
289bail:
290 if (handle)
291 ocfs2_commit_trans(handle);
292
293 if (main_bm_bh)
294 brelse(main_bm_bh);
295
296 if (main_bm_inode)
297 iput(main_bm_inode);
298
299 if (local_alloc_inode)
300 iput(local_alloc_inode);
301
302 if (alloc_copy)
303 kfree(alloc_copy);
304
305 mlog_exit_void();
306}
307
308/*
309 * We want to free the bitmap bits outside of any recovery context as
310 * we'll need a cluster lock to do so, but we must clear the local
311 * alloc before giving up the recovered nodes journal. To solve this,
312 * we kmalloc a copy of the local alloc before it's change for the
313 * caller to process with ocfs2_complete_local_alloc_recovery
314 */
315int ocfs2_begin_local_alloc_recovery(struct ocfs2_super *osb,
316 int slot_num,
317 struct ocfs2_dinode **alloc_copy)
318{
319 int status = 0;
320 struct buffer_head *alloc_bh = NULL;
321 struct inode *inode = NULL;
322 struct ocfs2_dinode *alloc;
323
324 mlog_entry("(slot_num = %d)\n", slot_num);
325
326 *alloc_copy = NULL;
327
328 inode = ocfs2_get_system_file_inode(osb,
329 LOCAL_ALLOC_SYSTEM_INODE,
330 slot_num);
331 if (!inode) {
332 status = -EINVAL;
333 mlog_errno(status);
334 goto bail;
335 }
336
337 down(&inode->i_sem);
338
339 status = ocfs2_read_block(osb, OCFS2_I(inode)->ip_blkno,
340 &alloc_bh, 0, inode);
341 if (status < 0) {
342 mlog_errno(status);
343 goto bail;
344 }
345
346 *alloc_copy = kmalloc(alloc_bh->b_size, GFP_KERNEL);
347 if (!(*alloc_copy)) {
348 status = -ENOMEM;
349 goto bail;
350 }
351 memcpy((*alloc_copy), alloc_bh->b_data, alloc_bh->b_size);
352
353 alloc = (struct ocfs2_dinode *) alloc_bh->b_data;
354 ocfs2_clear_local_alloc(alloc);
355
356 status = ocfs2_write_block(osb, alloc_bh, inode);
357 if (status < 0)
358 mlog_errno(status);
359
360bail:
361 if ((status < 0) && (*alloc_copy)) {
362 kfree(*alloc_copy);
363 *alloc_copy = NULL;
364 }
365
366 if (alloc_bh)
367 brelse(alloc_bh);
368
369 if (inode) {
370 up(&inode->i_sem);
371 iput(inode);
372 }
373
374 mlog_exit(status);
375 return status;
376}
377
378/*
379 * Step 2: By now, we've completed the journal recovery, we've stamped
380 * a clean local alloc on disk and dropped the node out of the
381 * recovery map. Dlm locks will no longer stall, so lets clear out the
382 * main bitmap.
383 */
384int ocfs2_complete_local_alloc_recovery(struct ocfs2_super *osb,
385 struct ocfs2_dinode *alloc)
386{
387 int status;
388 struct ocfs2_journal_handle *handle = NULL;
389 struct buffer_head *main_bm_bh = NULL;
390 struct inode *main_bm_inode = NULL;
391
392 mlog_entry_void();
393
394 handle = ocfs2_alloc_handle(osb);
395 if (!handle) {
396 status = -ENOMEM;
397 mlog_errno(status);
398 goto bail;
399 }
400
401 main_bm_inode = ocfs2_get_system_file_inode(osb,
402 GLOBAL_BITMAP_SYSTEM_INODE,
403 OCFS2_INVALID_SLOT);
404 if (!main_bm_inode) {
405 status = -EINVAL;
406 mlog_errno(status);
407 goto bail;
408 }
409
410 ocfs2_handle_add_inode(handle, main_bm_inode);
411 status = ocfs2_meta_lock(main_bm_inode, handle, &main_bm_bh, 1);
412 if (status < 0) {
413 mlog_errno(status);
414 goto bail;
415 }
416
417 handle = ocfs2_start_trans(osb, handle, OCFS2_WINDOW_MOVE_CREDITS);
418 if (IS_ERR(handle)) {
419 status = PTR_ERR(handle);
420 handle = NULL;
421 mlog_errno(status);
422 goto bail;
423 }
424
425 /* we want the bitmap change to be recorded on disk asap */
426 ocfs2_handle_set_sync(handle, 1);
427
428 status = ocfs2_sync_local_to_main(osb, handle, alloc,
429 main_bm_inode, main_bm_bh);
430 if (status < 0)
431 mlog_errno(status);
432
433bail:
434 if (handle)
435 ocfs2_commit_trans(handle);
436
437 if (main_bm_bh)
438 brelse(main_bm_bh);
439
440 if (main_bm_inode)
441 iput(main_bm_inode);
442
443 mlog_exit(status);
444 return status;
445}
446
447/*
448 * make sure we've got at least bitswanted contiguous bits in the
449 * local alloc. You lose them when you drop i_sem.
450 *
451 * We will add ourselves to the transaction passed in, but may start
452 * our own in order to shift windows.
453 */
454int ocfs2_reserve_local_alloc_bits(struct ocfs2_super *osb,
455 struct ocfs2_journal_handle *passed_handle,
456 u32 bits_wanted,
457 struct ocfs2_alloc_context *ac)
458{
459 int status;
460 struct ocfs2_dinode *alloc;
461 struct inode *local_alloc_inode;
462 unsigned int free_bits;
463
464 mlog_entry_void();
465
466 BUG_ON(!passed_handle);
467 BUG_ON(!ac);
468 BUG_ON(passed_handle->flags & OCFS2_HANDLE_STARTED);
469
470 local_alloc_inode =
471 ocfs2_get_system_file_inode(osb,
472 LOCAL_ALLOC_SYSTEM_INODE,
473 osb->slot_num);
474 if (!local_alloc_inode) {
475 status = -ENOENT;
476 mlog_errno(status);
477 goto bail;
478 }
479 ocfs2_handle_add_inode(passed_handle, local_alloc_inode);
480
481 if (osb->local_alloc_state != OCFS2_LA_ENABLED) {
482 status = -ENOSPC;
483 goto bail;
484 }
485
486 if (bits_wanted > ocfs2_local_alloc_window_bits(osb)) {
487 mlog(0, "Asking for more than my max window size!\n");
488 status = -ENOSPC;
489 goto bail;
490 }
491
492 alloc = (struct ocfs2_dinode *) osb->local_alloc_bh->b_data;
493
494 if (le32_to_cpu(alloc->id1.bitmap1.i_used) !=
495 ocfs2_local_alloc_count_bits(alloc)) {
496 ocfs2_error(osb->sb, "local alloc inode %"MLFu64" says it has "
497 "%u free bits, but a count shows %u",
498 le64_to_cpu(alloc->i_blkno),
499 le32_to_cpu(alloc->id1.bitmap1.i_used),
500 ocfs2_local_alloc_count_bits(alloc));
501 status = -EIO;
502 goto bail;
503 }
504
505 free_bits = le32_to_cpu(alloc->id1.bitmap1.i_total) -
506 le32_to_cpu(alloc->id1.bitmap1.i_used);
507 if (bits_wanted > free_bits) {
508 /* uhoh, window change time. */
509 status =
510 ocfs2_local_alloc_slide_window(osb, local_alloc_inode);
511 if (status < 0) {
512 if (status != -ENOSPC)
513 mlog_errno(status);
514 goto bail;
515 }
516 }
517
518 ac->ac_inode = igrab(local_alloc_inode);
519 get_bh(osb->local_alloc_bh);
520 ac->ac_bh = osb->local_alloc_bh;
521 ac->ac_which = OCFS2_AC_USE_LOCAL;
522 status = 0;
523bail:
524 if (local_alloc_inode)
525 iput(local_alloc_inode);
526
527 mlog_exit(status);
528 return status;
529}
530
531int ocfs2_claim_local_alloc_bits(struct ocfs2_super *osb,
532 struct ocfs2_journal_handle *handle,
533 struct ocfs2_alloc_context *ac,
534 u32 min_bits,
535 u32 *bit_off,
536 u32 *num_bits)
537{
538 int status, start;
539 struct inode *local_alloc_inode;
540 u32 bits_wanted;
541 void *bitmap;
542 struct ocfs2_dinode *alloc;
543 struct ocfs2_local_alloc *la;
544
545 mlog_entry_void();
546 BUG_ON(ac->ac_which != OCFS2_AC_USE_LOCAL);
547
548 bits_wanted = ac->ac_bits_wanted - ac->ac_bits_given;
549 local_alloc_inode = ac->ac_inode;
550 alloc = (struct ocfs2_dinode *) osb->local_alloc_bh->b_data;
551 la = OCFS2_LOCAL_ALLOC(alloc);
552
553 start = ocfs2_local_alloc_find_clear_bits(osb, alloc, bits_wanted);
554 if (start == -1) {
555 /* TODO: Shouldn't we just BUG here? */
556 status = -ENOSPC;
557 mlog_errno(status);
558 goto bail;
559 }
560
561 bitmap = la->la_bitmap;
562 *bit_off = le32_to_cpu(la->la_bm_off) + start;
563 /* local alloc is always contiguous by nature -- we never
564 * delete bits from it! */
565 *num_bits = bits_wanted;
566
567 status = ocfs2_journal_access(handle, local_alloc_inode,
568 osb->local_alloc_bh,
569 OCFS2_JOURNAL_ACCESS_WRITE);
570 if (status < 0) {
571 mlog_errno(status);
572 goto bail;
573 }
574
575 while(bits_wanted--)
576 ocfs2_set_bit(start++, bitmap);
577
578 alloc->id1.bitmap1.i_used = cpu_to_le32(*num_bits +
579 le32_to_cpu(alloc->id1.bitmap1.i_used));
580
581 status = ocfs2_journal_dirty(handle, osb->local_alloc_bh);
582 if (status < 0) {
583 mlog_errno(status);
584 goto bail;
585 }
586
587 status = 0;
588bail:
589 mlog_exit(status);
590 return status;
591}
592
593static u32 ocfs2_local_alloc_count_bits(struct ocfs2_dinode *alloc)
594{
595 int i;
596 u8 *buffer;
597 u32 count = 0;
598 struct ocfs2_local_alloc *la = OCFS2_LOCAL_ALLOC(alloc);
599
600 mlog_entry_void();
601
602 buffer = la->la_bitmap;
603 for (i = 0; i < le16_to_cpu(la->la_size); i++)
604 count += hweight8(buffer[i]);
605
606 mlog_exit(count);
607 return count;
608}
609
610static int ocfs2_local_alloc_find_clear_bits(struct ocfs2_super *osb,
611 struct ocfs2_dinode *alloc,
612 u32 numbits)
613{
614 int numfound, bitoff, left, startoff, lastzero;
615 void *bitmap = NULL;
616
617 mlog_entry("(numbits wanted = %u)\n", numbits);
618
619 if (!alloc->id1.bitmap1.i_total) {
620 mlog(0, "No bits in my window!\n");
621 bitoff = -1;
622 goto bail;
623 }
624
625 bitmap = OCFS2_LOCAL_ALLOC(alloc)->la_bitmap;
626
627 numfound = bitoff = startoff = 0;
628 lastzero = -1;
629 left = le32_to_cpu(alloc->id1.bitmap1.i_total);
630 while ((bitoff = ocfs2_find_next_zero_bit(bitmap, left, startoff)) != -1) {
631 if (bitoff == left) {
632 /* mlog(0, "bitoff (%d) == left", bitoff); */
633 break;
634 }
635 /* mlog(0, "Found a zero: bitoff = %d, startoff = %d, "
636 "numfound = %d\n", bitoff, startoff, numfound);*/
637
638 /* Ok, we found a zero bit... is it contig. or do we
639 * start over?*/
640 if (bitoff == startoff) {
641 /* we found a zero */
642 numfound++;
643 startoff++;
644 } else {
645 /* got a zero after some ones */
646 numfound = 1;
647 startoff = bitoff+1;
648 }
649 /* we got everything we needed */
650 if (numfound == numbits) {
651 /* mlog(0, "Found it all!\n"); */
652 break;
653 }
654 }
655
656 mlog(0, "Exiting loop, bitoff = %d, numfound = %d\n", bitoff,
657 numfound);
658
659 if (numfound == numbits)
660 bitoff = startoff - numfound;
661 else
662 bitoff = -1;
663
664bail:
665 mlog_exit(bitoff);
666 return bitoff;
667}
668
669static void ocfs2_clear_local_alloc(struct ocfs2_dinode *alloc)
670{
671 struct ocfs2_local_alloc *la = OCFS2_LOCAL_ALLOC(alloc);
672 int i;
673 mlog_entry_void();
674
675 alloc->id1.bitmap1.i_total = 0;
676 alloc->id1.bitmap1.i_used = 0;
677 la->la_bm_off = 0;
678 for(i = 0; i < le16_to_cpu(la->la_size); i++)
679 la->la_bitmap[i] = 0;
680
681 mlog_exit_void();
682}
683
#if 0
/* turn this on and uncomment below to aid debugging window shifts. */
/*
 * BUG() if any of the @count bits starting at @start is set in @bitmap.
 * Bits are checked from the highest down to @start.
 */
static void ocfs2_verify_zero_bits(unsigned long *bitmap,
				   unsigned int start,
				   unsigned int count)
{
	unsigned int i, bit;

	for (i = count; i > 0; i--) {
		bit = start + i - 1;
		if (!ocfs2_test_bit(bit, bitmap))
			continue;

		printk("ocfs2_verify_zero_bits: start = %u, count = "
		       "%u\n", start, count);
		printk("ocfs2_verify_zero_bits: bit %u is set!",
		       bit);
		BUG();
	}
}
#endif
702
703/*
704 * sync the local alloc to main bitmap.
705 *
706 * assumes you've already locked the main bitmap -- the bitmap inode
707 * passed is used for caching.
708 */
709static int ocfs2_sync_local_to_main(struct ocfs2_super *osb,
710 struct ocfs2_journal_handle *handle,
711 struct ocfs2_dinode *alloc,
712 struct inode *main_bm_inode,
713 struct buffer_head *main_bm_bh)
714{
715 int status = 0;
716 int bit_off, left, count, start;
717 u64 la_start_blk;
718 u64 blkno;
719 void *bitmap;
720 struct ocfs2_local_alloc *la = OCFS2_LOCAL_ALLOC(alloc);
721
722 mlog_entry("total = %u, COUNT = %u, used = %u\n",
723 le32_to_cpu(alloc->id1.bitmap1.i_total),
724 ocfs2_local_alloc_count_bits(alloc),
725 le32_to_cpu(alloc->id1.bitmap1.i_used));
726
727 if (!alloc->id1.bitmap1.i_total) {
728 mlog(0, "nothing to sync!\n");
729 goto bail;
730 }
731
732 if (le32_to_cpu(alloc->id1.bitmap1.i_used) ==
733 le32_to_cpu(alloc->id1.bitmap1.i_total)) {
734 mlog(0, "all bits were taken!\n");
735 goto bail;
736 }
737
738 la_start_blk = ocfs2_clusters_to_blocks(osb->sb,
739 le32_to_cpu(la->la_bm_off));
740 bitmap = la->la_bitmap;
741 start = count = bit_off = 0;
742 left = le32_to_cpu(alloc->id1.bitmap1.i_total);
743
744 while ((bit_off = ocfs2_find_next_zero_bit(bitmap, left, start))
745 != -1) {
746 if ((bit_off < left) && (bit_off == start)) {
747 count++;
748 start++;
749 continue;
750 }
751 if (count) {
752 blkno = la_start_blk +
753 ocfs2_clusters_to_blocks(osb->sb,
754 start - count);
755
756 mlog(0, "freeing %u bits starting at local "
757 "alloc bit %u (la_start_blk = %"MLFu64", "
758 "blkno = %"MLFu64")\n", count, start - count,
759 la_start_blk, blkno);
760
761 status = ocfs2_free_clusters(handle, main_bm_inode,
762 main_bm_bh, blkno, count);
763 if (status < 0) {
764 mlog_errno(status);
765 goto bail;
766 }
767 }
768 if (bit_off >= left)
769 break;
770 count = 1;
771 start = bit_off + 1;
772 }
773
774bail:
775 mlog_exit(status);
776 return status;
777}
778
779static int ocfs2_local_alloc_reserve_for_window(struct ocfs2_super *osb,
780 struct ocfs2_journal_handle *handle,
781 struct ocfs2_alloc_context **ac,
782 struct inode **bitmap_inode,
783 struct buffer_head **bitmap_bh)
784{
785 int status;
786
787 *ac = kcalloc(1, sizeof(struct ocfs2_alloc_context), GFP_KERNEL);
788 if (!(*ac)) {
789 status = -ENOMEM;
790 mlog_errno(status);
791 goto bail;
792 }
793
794 (*ac)->ac_handle = handle;
795 (*ac)->ac_bits_wanted = ocfs2_local_alloc_window_bits(osb);
796
797 status = ocfs2_reserve_cluster_bitmap_bits(osb, *ac);
798 if (status < 0) {
799 if (status != -ENOSPC)
800 mlog_errno(status);
801 goto bail;
802 }
803
804 *bitmap_inode = (*ac)->ac_inode;
805 igrab(*bitmap_inode);
806 *bitmap_bh = (*ac)->ac_bh;
807 get_bh(*bitmap_bh);
808 status = 0;
809bail:
810 if ((status < 0) && *ac) {
811 ocfs2_free_alloc_context(*ac);
812 *ac = NULL;
813 }
814
815 mlog_exit(status);
816 return status;
817}
818
819/*
820 * pass it the bitmap lock in lock_bh if you have it.
821 */
822static int ocfs2_local_alloc_new_window(struct ocfs2_super *osb,
823 struct ocfs2_journal_handle *handle,
824 struct ocfs2_alloc_context *ac)
825{
826 int status = 0;
827 u32 cluster_off, cluster_count;
828 struct ocfs2_dinode *alloc = NULL;
829 struct ocfs2_local_alloc *la;
830
831 mlog_entry_void();
832
833 alloc = (struct ocfs2_dinode *) osb->local_alloc_bh->b_data;
834 la = OCFS2_LOCAL_ALLOC(alloc);
835
836 if (alloc->id1.bitmap1.i_total)
837 mlog(0, "asking me to alloc a new window over a non-empty "
838 "one\n");
839
840 mlog(0, "Allocating %u clusters for a new window.\n",
841 ocfs2_local_alloc_window_bits(osb));
842 /* we used the generic suballoc reserve function, but we set
843 * everything up nicely, so there's no reason why we can't use
844 * the more specific cluster api to claim bits. */
845 status = ocfs2_claim_clusters(osb, handle, ac,
846 ocfs2_local_alloc_window_bits(osb),
847 &cluster_off, &cluster_count);
848 if (status < 0) {
849 if (status != -ENOSPC)
850 mlog_errno(status);
851 goto bail;
852 }
853
854 la->la_bm_off = cpu_to_le32(cluster_off);
855 alloc->id1.bitmap1.i_total = cpu_to_le32(cluster_count);
856 /* just in case... In the future when we find space ourselves,
857 * we don't have to get all contiguous -- but we'll have to
858 * set all previously used bits in bitmap and update
859 * la_bits_set before setting the bits in the main bitmap. */
860 alloc->id1.bitmap1.i_used = 0;
861 memset(OCFS2_LOCAL_ALLOC(alloc)->la_bitmap, 0,
862 le16_to_cpu(la->la_size));
863
864 mlog(0, "New window allocated:\n");
865 mlog(0, "window la_bm_off = %u\n",
866 OCFS2_LOCAL_ALLOC(alloc)->la_bm_off);
867 mlog(0, "window bits = %u\n", le32_to_cpu(alloc->id1.bitmap1.i_total));
868
869bail:
870 mlog_exit(status);
871 return status;
872}
873
874/* Note that we do *NOT* lock the local alloc inode here as
875 * it's been locked already for us. */
876static int ocfs2_local_alloc_slide_window(struct ocfs2_super *osb,
877 struct inode *local_alloc_inode)
878{
879 int status = 0;
880 struct buffer_head *main_bm_bh = NULL;
881 struct inode *main_bm_inode = NULL;
882 struct ocfs2_journal_handle *handle = NULL;
883 struct ocfs2_dinode *alloc;
884 struct ocfs2_dinode *alloc_copy = NULL;
885 struct ocfs2_alloc_context *ac = NULL;
886
887 mlog_entry_void();
888
889 handle = ocfs2_alloc_handle(osb);
890 if (!handle) {
891 status = -ENOMEM;
892 mlog_errno(status);
893 goto bail;
894 }
895
896 /* This will lock the main bitmap for us. */
897 status = ocfs2_local_alloc_reserve_for_window(osb,
898 handle,
899 &ac,
900 &main_bm_inode,
901 &main_bm_bh);
902 if (status < 0) {
903 if (status != -ENOSPC)
904 mlog_errno(status);
905 goto bail;
906 }
907
908 handle = ocfs2_start_trans(osb, handle, OCFS2_WINDOW_MOVE_CREDITS);
909 if (IS_ERR(handle)) {
910 status = PTR_ERR(handle);
911 handle = NULL;
912 mlog_errno(status);
913 goto bail;
914 }
915
916 alloc = (struct ocfs2_dinode *) osb->local_alloc_bh->b_data;
917
918 /* We want to clear the local alloc before doing anything
919 * else, so that if we error later during this operation,
920 * local alloc shutdown won't try to double free main bitmap
921 * bits. Make a copy so the sync function knows which bits to
922 * free. */
923 alloc_copy = kmalloc(osb->local_alloc_bh->b_size, GFP_KERNEL);
924 if (!alloc_copy) {
925 status = -ENOMEM;
926 mlog_errno(status);
927 goto bail;
928 }
929 memcpy(alloc_copy, alloc, osb->local_alloc_bh->b_size);
930
931 status = ocfs2_journal_access(handle, local_alloc_inode,
932 osb->local_alloc_bh,
933 OCFS2_JOURNAL_ACCESS_WRITE);
934 if (status < 0) {
935 mlog_errno(status);
936 goto bail;
937 }
938
939 ocfs2_clear_local_alloc(alloc);
940
941 status = ocfs2_journal_dirty(handle, osb->local_alloc_bh);
942 if (status < 0) {
943 mlog_errno(status);
944 goto bail;
945 }
946
947 status = ocfs2_sync_local_to_main(osb, handle, alloc_copy,
948 main_bm_inode, main_bm_bh);
949 if (status < 0) {
950 mlog_errno(status);
951 goto bail;
952 }
953
954 status = ocfs2_local_alloc_new_window(osb, handle, ac);
955 if (status < 0) {
956 if (status != -ENOSPC)
957 mlog_errno(status);
958 goto bail;
959 }
960
961 atomic_inc(&osb->alloc_stats.moves);
962
963 status = 0;
964bail:
965 if (handle)
966 ocfs2_commit_trans(handle);
967
968 if (main_bm_bh)
969 brelse(main_bm_bh);
970
971 if (main_bm_inode)
972 iput(main_bm_inode);
973
974 if (alloc_copy)
975 kfree(alloc_copy);
976
977 if (ac)
978 ocfs2_free_alloc_context(ac);
979
980 mlog_exit(status);
981 return status;
982}
983
diff --git a/fs/ocfs2/localalloc.h b/fs/ocfs2/localalloc.h
new file mode 100644
index 000000000000..30f88ce14e46
--- /dev/null
+++ b/fs/ocfs2/localalloc.h
@@ -0,0 +1,56 @@
1/* -*- mode: c; c-basic-offset: 8; -*-
2 * vim: noexpandtab sw=8 ts=8 sts=0:
3 *
4 * localalloc.h
5 *
6 * Function prototypes
7 *
8 * Copyright (C) 2002, 2004 Oracle. All rights reserved.
9 *
10 * This program is free software; you can redistribute it and/or
11 * modify it under the terms of the GNU General Public
12 * License as published by the Free Software Foundation; either
13 * version 2 of the License, or (at your option) any later version.
14 *
15 * This program is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
18 * General Public License for more details.
19 *
20 * You should have received a copy of the GNU General Public
21 * License along with this program; if not, write to the
22 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
23 * Boston, MA 021110-1307, USA.
24 */
25
26#ifndef OCFS2_LOCALALLOC_H
27#define OCFS2_LOCALALLOC_H
28
29int ocfs2_load_local_alloc(struct ocfs2_super *osb);
30
31void ocfs2_shutdown_local_alloc(struct ocfs2_super *osb);
32
33int ocfs2_begin_local_alloc_recovery(struct ocfs2_super *osb,
34 int node_num,
35 struct ocfs2_dinode **alloc_copy);
36
37int ocfs2_complete_local_alloc_recovery(struct ocfs2_super *osb,
38 struct ocfs2_dinode *alloc);
39
40int ocfs2_alloc_should_use_local(struct ocfs2_super *osb,
41 u64 bits);
42
43struct ocfs2_alloc_context;
44int ocfs2_reserve_local_alloc_bits(struct ocfs2_super *osb,
45 struct ocfs2_journal_handle *passed_handle,
46 u32 bits_wanted,
47 struct ocfs2_alloc_context *ac);
48
49int ocfs2_claim_local_alloc_bits(struct ocfs2_super *osb,
50 struct ocfs2_journal_handle *handle,
51 struct ocfs2_alloc_context *ac,
52 u32 min_bits,
53 u32 *bit_off,
54 u32 *num_bits);
55
56#endif /* OCFS2_LOCALALLOC_H */
diff --git a/fs/ocfs2/mmap.c b/fs/ocfs2/mmap.c
new file mode 100644
index 000000000000..afdeec4b0eef
--- /dev/null
+++ b/fs/ocfs2/mmap.c
@@ -0,0 +1,102 @@
1/* -*- mode: c; c-basic-offset: 8; -*-
2 * vim: noexpandtab sw=8 ts=8 sts=0:
3 *
4 * mmap.c
5 *
6 * Code to deal with the mess that is clustered mmap.
7 *
8 * Copyright (C) 2002, 2004 Oracle. All rights reserved.
9 *
10 * This program is free software; you can redistribute it and/or
11 * modify it under the terms of the GNU General Public
12 * License as published by the Free Software Foundation; either
13 * version 2 of the License, or (at your option) any later version.
14 *
15 * This program is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
18 * General Public License for more details.
19 *
20 * You should have received a copy of the GNU General Public
21 * License along with this program; if not, write to the
22 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
23 * Boston, MA 021110-1307, USA.
24 */
25
26#include <linux/fs.h>
27#include <linux/types.h>
28#include <linux/slab.h>
29#include <linux/highmem.h>
30#include <linux/pagemap.h>
31#include <linux/uio.h>
32#include <linux/signal.h>
33#include <linux/rbtree.h>
34
35#define MLOG_MASK_PREFIX ML_FILE_IO
36#include <cluster/masklog.h>
37
38#include "ocfs2.h"
39
40#include "dlmglue.h"
41#include "file.h"
42#include "inode.h"
43#include "mmap.h"
44
45static struct page *ocfs2_nopage(struct vm_area_struct * area,
46 unsigned long address,
47 int *type)
48{
49 struct inode *inode = area->vm_file->f_dentry->d_inode;
50 struct page *page = NOPAGE_SIGBUS;
51 sigset_t blocked, oldset;
52 int ret;
53
54 mlog_entry("(inode %lu, address %lu)\n", inode->i_ino, address);
55
56 /* The best way to deal with signals in this path is
57 * to block them upfront, rather than allowing the
58 * locking paths to return -ERESTARTSYS. */
59 sigfillset(&blocked);
60
61 /* We should technically never get a bad ret return
62 * from sigprocmask */
63 ret = sigprocmask(SIG_BLOCK, &blocked, &oldset);
64 if (ret < 0) {
65 mlog_errno(ret);
66 goto out;
67 }
68
69 page = filemap_nopage(area, address, type);
70
71 ret = sigprocmask(SIG_SETMASK, &oldset, NULL);
72 if (ret < 0)
73 mlog_errno(ret);
74out:
75 mlog_exit_ptr(page);
76 return page;
77}
78
79static struct vm_operations_struct ocfs2_file_vm_ops = {
80 .nopage = ocfs2_nopage,
81};
82
83int ocfs2_mmap(struct file *file,
84 struct vm_area_struct *vma)
85{
86 struct address_space *mapping = file->f_dentry->d_inode->i_mapping;
87 struct inode *inode = mapping->host;
88
89 /* We don't want to support shared writable mappings yet. */
90 if (((vma->vm_flags & VM_SHARED) || (vma->vm_flags & VM_MAYSHARE))
91 && ((vma->vm_flags & VM_WRITE) || (vma->vm_flags & VM_MAYWRITE))) {
92 mlog(0, "disallow shared writable mmaps %lx\n", vma->vm_flags);
93 /* This is -EINVAL because generic_file_readonly_mmap
94 * returns it in a similar situation. */
95 return -EINVAL;
96 }
97
98 update_atime(inode);
99 vma->vm_ops = &ocfs2_file_vm_ops;
100 return 0;
101}
102
diff --git a/fs/ocfs2/mmap.h b/fs/ocfs2/mmap.h
new file mode 100644
index 000000000000..1274ee0f1fe2
--- /dev/null
+++ b/fs/ocfs2/mmap.h
@@ -0,0 +1,6 @@
1#ifndef OCFS2_MMAP_H
2#define OCFS2_MMAP_H
3
4int ocfs2_mmap(struct file *file, struct vm_area_struct *vma);
5
6#endif /* OCFS2_MMAP_H */
diff --git a/fs/ocfs2/namei.c b/fs/ocfs2/namei.c
new file mode 100644
index 000000000000..f6b77ff1d2bf
--- /dev/null
+++ b/fs/ocfs2/namei.c
@@ -0,0 +1,2264 @@
1/* -*- mode: c; c-basic-offset: 8; -*-
2 * vim: noexpandtab sw=8 ts=8 sts=0:
3 *
4 * namei.c
5 *
6 * Create and rename file, directory, symlinks
7 *
8 * Copyright (C) 2002, 2004 Oracle. All rights reserved.
9 *
10 * Portions of this code from linux/fs/ext3/dir.c
11 *
12 * Copyright (C) 1992, 1993, 1994, 1995
13 * Remy Card (card@masi.ibp.fr)
14 * Laboratoire MASI - Institut Blaise pascal
15 * Universite Pierre et Marie Curie (Paris VI)
16 *
17 * from
18 *
19 * linux/fs/minix/dir.c
20 *
21 * Copyright (C) 1991, 1992 Linus Torvalds
22 *
23 * This program is free software; you can redistribute it and/or
24 * modify it under the terms of the GNU General Public
25 * License as published by the Free Software Foundation; either
26 * version 2 of the License, or (at your option) any later version.
27 *
28 * This program is distributed in the hope that it will be useful,
29 * but WITHOUT ANY WARRANTY; without even the implied warranty of
30 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
31 * General Public License for more details.
32 *
33 * You should have received a copy of the GNU General Public
34 * License along with this program; if not, write to the
35 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
36 * Boston, MA 02111-1307, USA.
37 */
38
39#include <linux/fs.h>
40#include <linux/types.h>
41#include <linux/slab.h>
42#include <linux/highmem.h>
43
44#define MLOG_MASK_PREFIX ML_NAMEI
45#include <cluster/masklog.h>
46
47#include "ocfs2.h"
48
49#include "alloc.h"
50#include "dcache.h"
51#include "dir.h"
52#include "dlmglue.h"
53#include "extent_map.h"
54#include "file.h"
55#include "inode.h"
56#include "journal.h"
57#include "namei.h"
58#include "suballoc.h"
59#include "symlink.h"
60#include "sysfile.h"
61#include "uptodate.h"
62#include "vote.h"
63
64#include "buffer_head_io.h"
65
66#define NAMEI_RA_CHUNKS 2
67#define NAMEI_RA_BLOCKS 4
68#define NAMEI_RA_SIZE (NAMEI_RA_CHUNKS * NAMEI_RA_BLOCKS)
69#define NAMEI_RA_INDEX(c,b) (((c) * NAMEI_RA_BLOCKS) + (b))
70
71static int inline ocfs2_search_dirblock(struct buffer_head *bh,
72 struct inode *dir,
73 const char *name, int namelen,
74 unsigned long offset,
75 struct ocfs2_dir_entry **res_dir);
76
77static int ocfs2_delete_entry(struct ocfs2_journal_handle *handle,
78 struct inode *dir,
79 struct ocfs2_dir_entry *de_del,
80 struct buffer_head *bh);
81
82static int __ocfs2_add_entry(struct ocfs2_journal_handle *handle,
83 struct inode *dir,
84 const char *name, int namelen,
85 struct inode *inode, u64 blkno,
86 struct buffer_head *parent_fe_bh,
87 struct buffer_head *insert_bh);
88
89static int ocfs2_mknod_locked(struct ocfs2_super *osb,
90 struct inode *dir,
91 struct dentry *dentry, int mode,
92 dev_t dev,
93 struct buffer_head **new_fe_bh,
94 struct buffer_head *parent_fe_bh,
95 struct ocfs2_journal_handle *handle,
96 struct inode **ret_inode,
97 struct ocfs2_alloc_context *inode_ac);
98
99static int ocfs2_fill_new_dir(struct ocfs2_super *osb,
100 struct ocfs2_journal_handle *handle,
101 struct inode *parent,
102 struct inode *inode,
103 struct buffer_head *fe_bh,
104 struct ocfs2_alloc_context *data_ac);
105
106static int ocfs2_double_lock(struct ocfs2_super *osb,
107 struct ocfs2_journal_handle *handle,
108 struct buffer_head **bh1,
109 struct inode *inode1,
110 struct buffer_head **bh2,
111 struct inode *inode2);
112
113static int ocfs2_prepare_orphan_dir(struct ocfs2_super *osb,
114 struct ocfs2_journal_handle *handle,
115 struct inode *inode,
116 char *name,
117 struct buffer_head **de_bh);
118
119static int ocfs2_orphan_add(struct ocfs2_super *osb,
120 struct ocfs2_journal_handle *handle,
121 struct inode *inode,
122 struct ocfs2_dinode *fe,
123 char *name,
124 struct buffer_head *de_bh);
125
126static int ocfs2_create_symlink_data(struct ocfs2_super *osb,
127 struct ocfs2_journal_handle *handle,
128 struct inode *inode,
129 const char *symname);
130
131static inline int ocfs2_add_entry(struct ocfs2_journal_handle *handle,
132 struct dentry *dentry,
133 struct inode *inode, u64 blkno,
134 struct buffer_head *parent_fe_bh,
135 struct buffer_head *insert_bh)
136{
137 return __ocfs2_add_entry(handle, dentry->d_parent->d_inode,
138 dentry->d_name.name, dentry->d_name.len,
139 inode, blkno, parent_fe_bh, insert_bh);
140}
141
142/* An orphan dir name is an 8 byte value, printed as a hex string */
143#define OCFS2_ORPHAN_NAMELEN ((int)(2 * sizeof(u64)))
144
145static struct dentry *ocfs2_lookup(struct inode *dir, struct dentry *dentry,
146 struct nameidata *nd)
147{
148 int status;
149 u64 blkno;
150 struct buffer_head *dirent_bh = NULL;
151 struct inode *inode = NULL;
152 struct dentry *ret;
153 struct ocfs2_dir_entry *dirent;
154 struct ocfs2_inode_info *oi;
155
156 mlog_entry("(0x%p, 0x%p, '%.*s')\n", dir, dentry,
157 dentry->d_name.len, dentry->d_name.name);
158
159 if (dentry->d_name.len > OCFS2_MAX_FILENAME_LEN) {
160 ret = ERR_PTR(-ENAMETOOLONG);
161 goto bail;
162 }
163
164 mlog(0, "find name %.*s in directory %"MLFu64"\n", dentry->d_name.len,
165 dentry->d_name.name, OCFS2_I(dir)->ip_blkno);
166
167 status = ocfs2_meta_lock(dir, NULL, NULL, 0);
168 if (status < 0) {
169 if (status != -ENOENT)
170 mlog_errno(status);
171 ret = ERR_PTR(status);
172 goto bail;
173 }
174
175 status = ocfs2_find_files_on_disk(dentry->d_name.name,
176 dentry->d_name.len, &blkno,
177 dir, &dirent_bh, &dirent);
178 if (status < 0)
179 goto bail_add;
180
181 inode = ocfs2_iget(OCFS2_SB(dir->i_sb), blkno);
182 if (IS_ERR(inode)) {
183 mlog(ML_ERROR, "Unable to create inode %"MLFu64"\n", blkno);
184 ret = ERR_PTR(-EACCES);
185 goto bail_unlock;
186 }
187
188 oi = OCFS2_I(inode);
189 /* Clear any orphaned state... If we were able to look up the
190 * inode from a directory, it certainly can't be orphaned. We
191 * might have the bad state from a node which intended to
192 * orphan this inode but crashed before it could commit the
193 * unlink. */
194 spin_lock(&oi->ip_lock);
195 oi->ip_flags &= ~OCFS2_INODE_MAYBE_ORPHANED;
196 oi->ip_orphaned_slot = OCFS2_INVALID_SLOT;
197 spin_unlock(&oi->ip_lock);
198
199bail_add:
200
201 dentry->d_op = &ocfs2_dentry_ops;
202 ret = d_splice_alias(inode, dentry);
203
204bail_unlock:
205 /* Don't drop the cluster lock until *after* the d_add --
206 * unlink on another node will message us to remove that
207 * dentry under this lock so otherwise we can race this with
208 * the vote thread and have a stale dentry. */
209 ocfs2_meta_unlock(dir, 0);
210
211bail:
212 if (dirent_bh)
213 brelse(dirent_bh);
214
215 mlog_exit_ptr(ret);
216
217 return ret;
218}
219
220static int ocfs2_fill_new_dir(struct ocfs2_super *osb,
221 struct ocfs2_journal_handle *handle,
222 struct inode *parent,
223 struct inode *inode,
224 struct buffer_head *fe_bh,
225 struct ocfs2_alloc_context *data_ac)
226{
227 int status;
228 struct buffer_head *new_bh = NULL;
229 struct ocfs2_dir_entry *de = NULL;
230
231 mlog_entry_void();
232
233 status = ocfs2_do_extend_dir(osb->sb, handle, inode, fe_bh,
234 data_ac, NULL, &new_bh);
235 if (status < 0) {
236 mlog_errno(status);
237 goto bail;
238 }
239
240 ocfs2_set_new_buffer_uptodate(inode, new_bh);
241
242 status = ocfs2_journal_access(handle, inode, new_bh,
243 OCFS2_JOURNAL_ACCESS_CREATE);
244 if (status < 0) {
245 mlog_errno(status);
246 goto bail;
247 }
248 memset(new_bh->b_data, 0, osb->sb->s_blocksize);
249
250 de = (struct ocfs2_dir_entry *) new_bh->b_data;
251 de->inode = cpu_to_le64(OCFS2_I(inode)->ip_blkno);
252 de->name_len = 1;
253 de->rec_len =
254 cpu_to_le16(OCFS2_DIR_REC_LEN(de->name_len));
255 strcpy(de->name, ".");
256 ocfs2_set_de_type(de, S_IFDIR);
257 de = (struct ocfs2_dir_entry *) ((char *)de + le16_to_cpu(de->rec_len));
258 de->inode = cpu_to_le64(OCFS2_I(parent)->ip_blkno);
259 de->rec_len = cpu_to_le16(inode->i_sb->s_blocksize -
260 OCFS2_DIR_REC_LEN(1));
261 de->name_len = 2;
262 strcpy(de->name, "..");
263 ocfs2_set_de_type(de, S_IFDIR);
264
265 status = ocfs2_journal_dirty(handle, new_bh);
266 if (status < 0) {
267 mlog_errno(status);
268 goto bail;
269 }
270
271 i_size_write(inode, inode->i_sb->s_blocksize);
272 inode->i_nlink = 2;
273 inode->i_blocks = ocfs2_align_bytes_to_sectors(inode->i_sb->s_blocksize);
274 status = ocfs2_mark_inode_dirty(handle, inode, fe_bh);
275 if (status < 0) {
276 mlog_errno(status);
277 goto bail;
278 }
279
280 status = 0;
281bail:
282 if (new_bh)
283 brelse(new_bh);
284
285 mlog_exit(status);
286 return status;
287}
288
289static int ocfs2_mknod(struct inode *dir,
290 struct dentry *dentry,
291 int mode,
292 dev_t dev)
293{
294 int status = 0;
295 struct buffer_head *parent_fe_bh = NULL;
296 struct ocfs2_journal_handle *handle = NULL;
297 struct ocfs2_super *osb;
298 struct ocfs2_dinode *dirfe;
299 struct buffer_head *new_fe_bh = NULL;
300 struct buffer_head *de_bh = NULL;
301 struct inode *inode = NULL;
302 struct ocfs2_alloc_context *inode_ac = NULL;
303 struct ocfs2_alloc_context *data_ac = NULL;
304
305 mlog_entry("(0x%p, 0x%p, %d, %lu, '%.*s')\n", dir, dentry, mode,
306 (unsigned long)dev, dentry->d_name.len,
307 dentry->d_name.name);
308
309 /* get our super block */
310 osb = OCFS2_SB(dir->i_sb);
311
312 if (S_ISDIR(mode) && (dir->i_nlink >= OCFS2_LINK_MAX)) {
313 mlog(ML_ERROR, "inode %"MLFu64" has i_nlink of %u\n",
314 OCFS2_I(dir)->ip_blkno, dir->i_nlink);
315 status = -EMLINK;
316 goto leave;
317 }
318
319 handle = ocfs2_alloc_handle(osb);
320 if (handle == NULL) {
321 status = -ENOMEM;
322 mlog_errno(status);
323 goto leave;
324 }
325
326 status = ocfs2_meta_lock(dir, handle, &parent_fe_bh, 1);
327 if (status < 0) {
328 if (status != -ENOENT)
329 mlog_errno(status);
330 goto leave;
331 }
332
333 dirfe = (struct ocfs2_dinode *) parent_fe_bh->b_data;
334 if (!dirfe->i_links_count) {
335 /* can't make a file in a deleted directory. */
336 status = -ENOENT;
337 goto leave;
338 }
339
340 status = ocfs2_check_dir_for_entry(dir, dentry->d_name.name,
341 dentry->d_name.len);
342 if (status)
343 goto leave;
344
345 /* get a spot inside the dir. */
346 status = ocfs2_prepare_dir_for_insert(osb, dir, parent_fe_bh,
347 dentry->d_name.name,
348 dentry->d_name.len, &de_bh);
349 if (status < 0) {
350 mlog_errno(status);
351 goto leave;
352 }
353
354 /* reserve an inode spot */
355 status = ocfs2_reserve_new_inode(osb, handle, &inode_ac);
356 if (status < 0) {
357 if (status != -ENOSPC)
358 mlog_errno(status);
359 goto leave;
360 }
361
362 /* are we making a directory? If so, reserve a cluster for his
363 * 1st extent. */
364 if (S_ISDIR(mode)) {
365 status = ocfs2_reserve_clusters(osb, handle, 1, &data_ac);
366 if (status < 0) {
367 if (status != -ENOSPC)
368 mlog_errno(status);
369 goto leave;
370 }
371 }
372
373 handle = ocfs2_start_trans(osb, handle, OCFS2_MKNOD_CREDITS);
374 if (IS_ERR(handle)) {
375 status = PTR_ERR(handle);
376 handle = NULL;
377 mlog_errno(status);
378 goto leave;
379 }
380
381 /* do the real work now. */
382 status = ocfs2_mknod_locked(osb, dir, dentry, mode, dev,
383 &new_fe_bh, parent_fe_bh, handle,
384 &inode, inode_ac);
385 if (status < 0) {
386 mlog_errno(status);
387 goto leave;
388 }
389
390 if (S_ISDIR(mode)) {
391 status = ocfs2_fill_new_dir(osb, handle, dir, inode,
392 new_fe_bh, data_ac);
393 if (status < 0) {
394 mlog_errno(status);
395 goto leave;
396 }
397
398 status = ocfs2_journal_access(handle, dir, parent_fe_bh,
399 OCFS2_JOURNAL_ACCESS_WRITE);
400 if (status < 0) {
401 mlog_errno(status);
402 goto leave;
403 }
404 le16_add_cpu(&dirfe->i_links_count, 1);
405 status = ocfs2_journal_dirty(handle, parent_fe_bh);
406 if (status < 0) {
407 mlog_errno(status);
408 goto leave;
409 }
410 dir->i_nlink++;
411 }
412
413 status = ocfs2_add_entry(handle, dentry, inode,
414 OCFS2_I(inode)->ip_blkno, parent_fe_bh,
415 de_bh);
416 if (status < 0) {
417 mlog_errno(status);
418 goto leave;
419 }
420
421 insert_inode_hash(inode);
422 dentry->d_op = &ocfs2_dentry_ops;
423 d_instantiate(dentry, inode);
424 status = 0;
425leave:
426 if (handle)
427 ocfs2_commit_trans(handle);
428
429 if (status == -ENOSPC)
430 mlog(0, "Disk is full\n");
431
432 if (new_fe_bh)
433 brelse(new_fe_bh);
434
435 if (de_bh)
436 brelse(de_bh);
437
438 if (parent_fe_bh)
439 brelse(parent_fe_bh);
440
441 if ((status < 0) && inode)
442 iput(inode);
443
444 if (inode_ac)
445 ocfs2_free_alloc_context(inode_ac);
446
447 if (data_ac)
448 ocfs2_free_alloc_context(data_ac);
449
450 mlog_exit(status);
451
452 return status;
453}
454
455static int ocfs2_mknod_locked(struct ocfs2_super *osb,
456 struct inode *dir,
457 struct dentry *dentry, int mode,
458 dev_t dev,
459 struct buffer_head **new_fe_bh,
460 struct buffer_head *parent_fe_bh,
461 struct ocfs2_journal_handle *handle,
462 struct inode **ret_inode,
463 struct ocfs2_alloc_context *inode_ac)
464{
465 int status = 0;
466 struct ocfs2_dinode *fe = NULL;
467 struct ocfs2_extent_list *fel;
468 u64 fe_blkno = 0;
469 u16 suballoc_bit;
470 struct inode *inode = NULL;
471
472 mlog_entry("(0x%p, 0x%p, %d, %lu, '%.*s')\n", dir, dentry, mode,
473 (unsigned long)dev, dentry->d_name.len,
474 dentry->d_name.name);
475
476 *new_fe_bh = NULL;
477 *ret_inode = NULL;
478
479 status = ocfs2_claim_new_inode(osb, handle, inode_ac, &suballoc_bit,
480 &fe_blkno);
481 if (status < 0) {
482 mlog_errno(status);
483 goto leave;
484 }
485
486 inode = new_inode(dir->i_sb);
487 if (IS_ERR(inode)) {
488 status = PTR_ERR(inode);
489 mlog(ML_ERROR, "new_inode failed!\n");
490 goto leave;
491 }
492
493 /* populate as many fields early on as possible - many of
494 * these are used by the support functions here and in
495 * callers. */
496 inode->i_ino = ino_from_blkno(osb->sb, fe_blkno);
497 OCFS2_I(inode)->ip_blkno = fe_blkno;
498 if (S_ISDIR(mode))
499 inode->i_nlink = 2;
500 else
501 inode->i_nlink = 1;
502 inode->i_mode = mode;
503 spin_lock(&osb->osb_lock);
504 inode->i_generation = osb->s_next_generation++;
505 spin_unlock(&osb->osb_lock);
506
507 *new_fe_bh = sb_getblk(osb->sb, fe_blkno);
508 if (!*new_fe_bh) {
509 status = -EIO;
510 mlog_errno(status);
511 goto leave;
512 }
513 ocfs2_set_new_buffer_uptodate(inode, *new_fe_bh);
514
515 status = ocfs2_journal_access(handle, inode, *new_fe_bh,
516 OCFS2_JOURNAL_ACCESS_CREATE);
517 if (status < 0) {
518 mlog_errno(status);
519 goto leave;
520 }
521
522 fe = (struct ocfs2_dinode *) (*new_fe_bh)->b_data;
523 memset(fe, 0, osb->sb->s_blocksize);
524
525 fe->i_generation = cpu_to_le32(inode->i_generation);
526 fe->i_fs_generation = cpu_to_le32(osb->fs_generation);
527 fe->i_blkno = cpu_to_le64(fe_blkno);
528 fe->i_suballoc_bit = cpu_to_le16(suballoc_bit);
529 fe->i_suballoc_slot = cpu_to_le16(osb->slot_num);
530 fe->i_uid = cpu_to_le32(current->fsuid);
531 if (dir->i_mode & S_ISGID) {
532 fe->i_gid = cpu_to_le32(dir->i_gid);
533 if (S_ISDIR(mode))
534 mode |= S_ISGID;
535 } else
536 fe->i_gid = cpu_to_le32(current->fsgid);
537 fe->i_mode = cpu_to_le16(mode);
538 if (S_ISCHR(mode) || S_ISBLK(mode))
539 fe->id1.dev1.i_rdev = cpu_to_le64(huge_encode_dev(dev));
540
541 fe->i_links_count = cpu_to_le16(inode->i_nlink);
542
543 fe->i_last_eb_blk = 0;
544 strcpy(fe->i_signature, OCFS2_INODE_SIGNATURE);
545 le32_add_cpu(&fe->i_flags, OCFS2_VALID_FL);
546 fe->i_atime = fe->i_ctime = fe->i_mtime =
547 cpu_to_le64(CURRENT_TIME.tv_sec);
548 fe->i_mtime_nsec = fe->i_ctime_nsec = fe->i_atime_nsec =
549 cpu_to_le32(CURRENT_TIME.tv_nsec);
550 fe->i_dtime = 0;
551
552 fel = &fe->id2.i_list;
553 fel->l_tree_depth = 0;
554 fel->l_next_free_rec = 0;
555 fel->l_count = cpu_to_le16(ocfs2_extent_recs_per_inode(osb->sb));
556
557 status = ocfs2_journal_dirty(handle, *new_fe_bh);
558 if (status < 0) {
559 mlog_errno(status);
560 goto leave;
561 }
562
563 if (ocfs2_populate_inode(inode, fe, 1) < 0) {
564 mlog(ML_ERROR, "populate inode failed! bh->b_blocknr=%llu, "
565 "i_blkno=%"MLFu64", i_ino=%lu\n",
566 (unsigned long long) (*new_fe_bh)->b_blocknr,
567 fe->i_blkno, inode->i_ino);
568 BUG();
569 }
570
571 ocfs2_inode_set_new(osb, inode);
572 status = ocfs2_create_new_inode_locks(inode);
573 if (status < 0)
574 mlog_errno(status);
575
576 status = 0; /* error in ocfs2_create_new_inode_locks is not
577 * critical */
578
579 *ret_inode = inode;
580leave:
581 if (status < 0) {
582 if (*new_fe_bh) {
583 brelse(*new_fe_bh);
584 *new_fe_bh = NULL;
585 }
586 if (inode)
587 iput(inode);
588 }
589
590 mlog_exit(status);
591 return status;
592}
593
594static int ocfs2_mkdir(struct inode *dir,
595 struct dentry *dentry,
596 int mode)
597{
598 int ret;
599
600 mlog_entry("(0x%p, 0x%p, %d, '%.*s')\n", dir, dentry, mode,
601 dentry->d_name.len, dentry->d_name.name);
602 ret = ocfs2_mknod(dir, dentry, mode | S_IFDIR, 0);
603 mlog_exit(ret);
604
605 return ret;
606}
607
608static int ocfs2_create(struct inode *dir,
609 struct dentry *dentry,
610 int mode,
611 struct nameidata *nd)
612{
613 int ret;
614
615 mlog_entry("(0x%p, 0x%p, %d, '%.*s')\n", dir, dentry, mode,
616 dentry->d_name.len, dentry->d_name.name);
617 ret = ocfs2_mknod(dir, dentry, mode | S_IFREG, 0);
618 mlog_exit(ret);
619
620 return ret;
621}
622
623static int ocfs2_link(struct dentry *old_dentry,
624 struct inode *dir,
625 struct dentry *dentry)
626{
627 struct ocfs2_journal_handle *handle = NULL;
628 struct inode *inode = old_dentry->d_inode;
629 int err;
630 struct buffer_head *fe_bh = NULL;
631 struct buffer_head *parent_fe_bh = NULL;
632 struct buffer_head *de_bh = NULL;
633 struct ocfs2_dinode *fe = NULL;
634 struct ocfs2_super *osb = OCFS2_SB(dir->i_sb);
635
636 mlog_entry("(inode=%lu, old='%.*s' new='%.*s')\n", inode->i_ino,
637 old_dentry->d_name.len, old_dentry->d_name.name,
638 dentry->d_name.len, dentry->d_name.name);
639
640 if (S_ISDIR(inode->i_mode)) {
641 err = -EPERM;
642 goto bail;
643 }
644
645 if (inode->i_nlink >= OCFS2_LINK_MAX) {
646 err = -EMLINK;
647 goto bail;
648 }
649
650 handle = ocfs2_alloc_handle(osb);
651 if (handle == NULL) {
652 err = -ENOMEM;
653 goto bail;
654 }
655
656 err = ocfs2_meta_lock(dir, handle, &parent_fe_bh, 1);
657 if (err < 0) {
658 if (err != -ENOENT)
659 mlog_errno(err);
660 goto bail;
661 }
662
663 err = ocfs2_check_dir_for_entry(dir, dentry->d_name.name,
664 dentry->d_name.len);
665 if (err)
666 goto bail;
667
668 err = ocfs2_prepare_dir_for_insert(osb, dir, parent_fe_bh,
669 dentry->d_name.name,
670 dentry->d_name.len, &de_bh);
671 if (err < 0) {
672 mlog_errno(err);
673 goto bail;
674 }
675
676 err = ocfs2_meta_lock(inode, handle, &fe_bh, 1);
677 if (err < 0) {
678 if (err != -ENOENT)
679 mlog_errno(err);
680 goto bail;
681 }
682
683 fe = (struct ocfs2_dinode *) fe_bh->b_data;
684 if (le16_to_cpu(fe->i_links_count) >= OCFS2_LINK_MAX) {
685 err = -EMLINK;
686 goto bail;
687 }
688
689 handle = ocfs2_start_trans(osb, handle, OCFS2_LINK_CREDITS);
690 if (IS_ERR(handle)) {
691 err = PTR_ERR(handle);
692 handle = NULL;
693 mlog_errno(err);
694 goto bail;
695 }
696
697 err = ocfs2_journal_access(handle, inode, fe_bh,
698 OCFS2_JOURNAL_ACCESS_WRITE);
699 if (err < 0) {
700 mlog_errno(err);
701 goto bail;
702 }
703
704 inode->i_nlink++;
705 inode->i_ctime = CURRENT_TIME;
706 fe->i_links_count = cpu_to_le16(inode->i_nlink);
707 fe->i_ctime = cpu_to_le64(inode->i_ctime.tv_sec);
708 fe->i_ctime_nsec = cpu_to_le32(inode->i_ctime.tv_nsec);
709
710 err = ocfs2_journal_dirty(handle, fe_bh);
711 if (err < 0) {
712 le16_add_cpu(&fe->i_links_count, -1);
713 inode->i_nlink--;
714 mlog_errno(err);
715 goto bail;
716 }
717
718 err = ocfs2_add_entry(handle, dentry, inode,
719 OCFS2_I(inode)->ip_blkno,
720 parent_fe_bh, de_bh);
721 if (err) {
722 le16_add_cpu(&fe->i_links_count, -1);
723 inode->i_nlink--;
724 mlog_errno(err);
725 goto bail;
726 }
727
728 atomic_inc(&inode->i_count);
729 dentry->d_op = &ocfs2_dentry_ops;
730 d_instantiate(dentry, inode);
731bail:
732 if (handle)
733 ocfs2_commit_trans(handle);
734 if (de_bh)
735 brelse(de_bh);
736 if (fe_bh)
737 brelse(fe_bh);
738 if (parent_fe_bh)
739 brelse(parent_fe_bh);
740
741 mlog_exit(err);
742
743 return err;
744}
745
746static int ocfs2_unlink(struct inode *dir,
747 struct dentry *dentry)
748{
749 int status;
750 unsigned int saved_nlink = 0;
751 struct inode *inode = dentry->d_inode;
752 struct ocfs2_super *osb = OCFS2_SB(dir->i_sb);
753 u64 blkno;
754 struct ocfs2_dinode *fe = NULL;
755 struct buffer_head *fe_bh = NULL;
756 struct buffer_head *parent_node_bh = NULL;
757 struct ocfs2_journal_handle *handle = NULL;
758 struct ocfs2_dir_entry *dirent = NULL;
759 struct buffer_head *dirent_bh = NULL;
760 char orphan_name[OCFS2_ORPHAN_NAMELEN + 1];
761 struct buffer_head *orphan_entry_bh = NULL;
762
763 mlog_entry("(0x%p, 0x%p, '%.*s')\n", dir, dentry,
764 dentry->d_name.len, dentry->d_name.name);
765
766 BUG_ON(dentry->d_parent->d_inode != dir);
767
768 mlog(0, "ino = %"MLFu64"\n", OCFS2_I(inode)->ip_blkno);
769
770 if (inode == osb->root_inode) {
771 mlog(0, "Cannot delete the root directory\n");
772 status = -EPERM;
773 goto leave;
774 }
775
776 handle = ocfs2_alloc_handle(osb);
777 if (handle == NULL) {
778 status = -ENOMEM;
779 mlog_errno(status);
780 goto leave;
781 }
782
783 status = ocfs2_meta_lock(dir, handle, &parent_node_bh, 1);
784 if (status < 0) {
785 if (status != -ENOENT)
786 mlog_errno(status);
787 goto leave;
788 }
789
790 status = ocfs2_find_files_on_disk(dentry->d_name.name,
791 dentry->d_name.len, &blkno,
792 dir, &dirent_bh, &dirent);
793 if (status < 0) {
794 if (status != -ENOENT)
795 mlog_errno(status);
796 goto leave;
797 }
798
799 if (OCFS2_I(inode)->ip_blkno != blkno) {
800 status = -ENOENT;
801
802 mlog(0, "ip_blkno (%"MLFu64") != dirent blkno (%"MLFu64") "
803 "ip_flags = %x\n", OCFS2_I(inode)->ip_blkno, blkno,
804 OCFS2_I(inode)->ip_flags);
805 goto leave;
806 }
807
808 status = ocfs2_meta_lock(inode, handle, &fe_bh, 1);
809 if (status < 0) {
810 if (status != -ENOENT)
811 mlog_errno(status);
812 goto leave;
813 }
814
815 if (S_ISDIR(inode->i_mode)) {
816 if (!ocfs2_empty_dir(inode)) {
817 status = -ENOTEMPTY;
818 goto leave;
819 } else if (inode->i_nlink != 2) {
820 status = -ENOTEMPTY;
821 goto leave;
822 }
823 }
824
825 /* There are still a few steps left until we can consider the
826 * unlink to have succeeded. Save off nlink here before
827 * modification so we can set it back in case we hit an issue
828 * before commit. */
829 saved_nlink = inode->i_nlink;
830 if (S_ISDIR(inode->i_mode))
831 inode->i_nlink = 0;
832 else
833 inode->i_nlink--;
834
835 status = ocfs2_request_unlink_vote(inode, dentry,
836 (unsigned int) inode->i_nlink);
837 if (status < 0) {
838 /* This vote should succeed under all normal
839 * circumstances. */
840 mlog_errno(status);
841 goto leave;
842 }
843
844 if (!inode->i_nlink) {
845 status = ocfs2_prepare_orphan_dir(osb, handle, inode,
846 orphan_name,
847 &orphan_entry_bh);
848 if (status < 0) {
849 mlog_errno(status);
850 goto leave;
851 }
852 }
853
854 handle = ocfs2_start_trans(osb, handle, OCFS2_UNLINK_CREDITS);
855 if (IS_ERR(handle)) {
856 status = PTR_ERR(handle);
857 handle = NULL;
858 mlog_errno(status);
859 goto leave;
860 }
861
862 status = ocfs2_journal_access(handle, inode, fe_bh,
863 OCFS2_JOURNAL_ACCESS_WRITE);
864 if (status < 0) {
865 mlog_errno(status);
866 goto leave;
867 }
868
869 fe = (struct ocfs2_dinode *) fe_bh->b_data;
870
871 if (!inode->i_nlink) {
872 status = ocfs2_orphan_add(osb, handle, inode, fe, orphan_name,
873 orphan_entry_bh);
874 if (status < 0) {
875 mlog_errno(status);
876 goto leave;
877 }
878 }
879
880 /* delete the name from the parent dir */
881 status = ocfs2_delete_entry(handle, dir, dirent, dirent_bh);
882 if (status < 0) {
883 mlog_errno(status);
884 goto leave;
885 }
886
887 /* We can set nlink on the dinode now. clear the saved version
888 * so that it doesn't get set later. */
889 fe->i_links_count = cpu_to_le16(inode->i_nlink);
890 saved_nlink = 0;
891
892 status = ocfs2_journal_dirty(handle, fe_bh);
893 if (status < 0) {
894 mlog_errno(status);
895 goto leave;
896 }
897
898 if (S_ISDIR(inode->i_mode)) {
899 dir->i_nlink--;
900 status = ocfs2_mark_inode_dirty(handle, dir,
901 parent_node_bh);
902 if (status < 0) {
903 mlog_errno(status);
904 dir->i_nlink++;
905 }
906 }
907
908leave:
909 if (status < 0 && saved_nlink)
910 inode->i_nlink = saved_nlink;
911
912 if (handle)
913 ocfs2_commit_trans(handle);
914
915 if (fe_bh)
916 brelse(fe_bh);
917
918 if (dirent_bh)
919 brelse(dirent_bh);
920
921 if (parent_node_bh)
922 brelse(parent_node_bh);
923
924 if (orphan_entry_bh)
925 brelse(orphan_entry_bh);
926
927 mlog_exit(status);
928
929 return status;
930}
931
932/*
933 * The only place this should be used is rename!
934 * if they have the same id, then the 1st one is the only one locked.
935 */
936static int ocfs2_double_lock(struct ocfs2_super *osb,
937 struct ocfs2_journal_handle *handle,
938 struct buffer_head **bh1,
939 struct inode *inode1,
940 struct buffer_head **bh2,
941 struct inode *inode2)
942{
943 int status;
944 struct ocfs2_inode_info *oi1 = OCFS2_I(inode1);
945 struct ocfs2_inode_info *oi2 = OCFS2_I(inode2);
946 struct buffer_head **tmpbh;
947 struct inode *tmpinode;
948
949 mlog_entry("(inode1 = %"MLFu64", inode2 = %"MLFu64")\n",
950 oi1->ip_blkno, oi2->ip_blkno);
951
952 BUG_ON(!handle);
953
954 if (*bh1)
955 *bh1 = NULL;
956 if (*bh2)
957 *bh2 = NULL;
958
959 /* we always want to lock the one with the lower lockid first. */
960 if (oi1->ip_blkno != oi2->ip_blkno) {
961 if (oi1->ip_blkno < oi2->ip_blkno) {
962 /* switch id1 and id2 around */
963 mlog(0, "switching them around...\n");
964 tmpbh = bh2;
965 bh2 = bh1;
966 bh1 = tmpbh;
967
968 tmpinode = inode2;
969 inode2 = inode1;
970 inode1 = tmpinode;
971 }
972 /* lock id2 */
973 status = ocfs2_meta_lock(inode2, handle, bh2, 1);
974 if (status < 0) {
975 if (status != -ENOENT)
976 mlog_errno(status);
977 goto bail;
978 }
979 }
980 /* lock id1 */
981 status = ocfs2_meta_lock(inode1, handle, bh1, 1);
982 if (status < 0) {
983 if (status != -ENOENT)
984 mlog_errno(status);
985 goto bail;
986 }
987bail:
988 mlog_exit(status);
989 return status;
990}
991
/*
 * PARENT_INO(buffer) - lvalue for the inode field of the second
 * directory entry in a directory block.
 *
 * @buffer points at the first ocfs2_dir_entry of the block; the second
 * entry is found by stepping over the first one's rec_len.  The result
 * is the raw on-disk field, so callers convert with le64_to_cpu() /
 * cpu_to_le64() themselves.
 *
 * @buffer is fully parenthesized so that any pointer expression can be
 * passed safely; note that it is evaluated twice.
 */
#define PARENT_INO(buffer) \
	(((struct ocfs2_dir_entry *) \
	  ((char *)(buffer) + \
	   le16_to_cpu(((struct ocfs2_dir_entry *)(buffer))->rec_len)))->inode)
996
997static int ocfs2_rename(struct inode *old_dir,
998 struct dentry *old_dentry,
999 struct inode *new_dir,
1000 struct dentry *new_dentry)
1001{
1002 int status = 0, rename_lock = 0;
1003 struct inode *old_inode = old_dentry->d_inode;
1004 struct inode *new_inode = new_dentry->d_inode;
1005 struct ocfs2_dinode *newfe = NULL;
1006 char orphan_name[OCFS2_ORPHAN_NAMELEN + 1];
1007 struct buffer_head *orphan_entry_bh = NULL;
1008 struct buffer_head *newfe_bh = NULL;
1009 struct buffer_head *insert_entry_bh = NULL;
1010 struct ocfs2_super *osb = NULL;
1011 u64 newfe_blkno;
1012 struct ocfs2_journal_handle *handle = NULL;
1013 struct buffer_head *old_dir_bh = NULL;
1014 struct buffer_head *new_dir_bh = NULL;
1015 struct ocfs2_dir_entry *old_de = NULL, *new_de = NULL; // dirent for old_dentry
1016 // and new_dentry
1017 struct buffer_head *new_de_bh = NULL, *old_de_bh = NULL; // bhs for above
1018 struct buffer_head *old_inode_de_bh = NULL; // if old_dentry is a dir,
1019 // this is the 1st dirent bh
1020 nlink_t old_dir_nlink = old_dir->i_nlink, new_dir_nlink = new_dir->i_nlink;
1021 unsigned int links_count;
1022
1023 /* At some point it might be nice to break this function up a
1024 * bit. */
1025
1026 mlog_entry("(0x%p, 0x%p, 0x%p, 0x%p, from='%.*s' to='%.*s')\n",
1027 old_dir, old_dentry, new_dir, new_dentry,
1028 old_dentry->d_name.len, old_dentry->d_name.name,
1029 new_dentry->d_name.len, new_dentry->d_name.name);
1030
1031 osb = OCFS2_SB(old_dir->i_sb);
1032
1033 if (new_inode) {
1034 if (!igrab(new_inode))
1035 BUG();
1036 }
1037
1038 if (atomic_read(&old_dentry->d_count) > 2) {
1039 shrink_dcache_parent(old_dentry);
1040 if (atomic_read(&old_dentry->d_count) > 2) {
1041 status = -EBUSY;
1042 goto bail;
1043 }
1044 }
1045
1046 /* Assume a directory heirarchy thusly:
1047 * a/b/c
1048 * a/d
1049 * a,b,c, and d are all directories.
1050 *
1051 * from cwd of 'a' on both nodes:
1052 * node1: mv b/c d
1053 * node2: mv d b/c
1054 *
1055 * And that's why, just like the VFS, we need a file system
1056 * rename lock. */
1057 if (old_dentry != new_dentry) {
1058 status = ocfs2_rename_lock(osb);
1059 if (status < 0) {
1060 mlog_errno(status);
1061 goto bail;
1062 }
1063 rename_lock = 1;
1064 }
1065
1066 handle = ocfs2_alloc_handle(osb);
1067 if (handle == NULL) {
1068 status = -ENOMEM;
1069 mlog_errno(status);
1070 goto bail;
1071 }
1072
1073 /* if old and new are the same, this'll just do one lock. */
1074 status = ocfs2_double_lock(osb, handle,
1075 &old_dir_bh, old_dir,
1076 &new_dir_bh, new_dir);
1077 if (status < 0) {
1078 mlog_errno(status);
1079 goto bail;
1080 }
1081
1082 /* make sure both dirs have bhs
1083 * get an extra ref on old_dir_bh if old==new */
1084 if (!new_dir_bh) {
1085 if (old_dir_bh) {
1086 new_dir_bh = old_dir_bh;
1087 get_bh(new_dir_bh);
1088 } else {
1089 mlog(ML_ERROR, "no old_dir_bh!\n");
1090 status = -EIO;
1091 goto bail;
1092 }
1093 }
1094
1095 if (S_ISDIR(old_inode->i_mode)) {
1096 /* Directories actually require metadata updates to
1097 * the directory info so we can't get away with not
1098 * doing node locking on it. */
1099 status = ocfs2_meta_lock(old_inode, handle, NULL, 1);
1100 if (status < 0) {
1101 if (status != -ENOENT)
1102 mlog_errno(status);
1103 goto bail;
1104 }
1105
1106 status = ocfs2_request_rename_vote(old_inode, old_dentry);
1107 if (status < 0) {
1108 mlog_errno(status);
1109 goto bail;
1110 }
1111
1112 status = -EIO;
1113 old_inode_de_bh = ocfs2_bread(old_inode, 0, &status, 0);
1114 if (!old_inode_de_bh)
1115 goto bail;
1116
1117 status = -EIO;
1118 if (le64_to_cpu(PARENT_INO(old_inode_de_bh->b_data)) !=
1119 OCFS2_I(old_dir)->ip_blkno)
1120 goto bail;
1121 status = -EMLINK;
1122 if (!new_inode && new_dir!=old_dir &&
1123 new_dir->i_nlink >= OCFS2_LINK_MAX)
1124 goto bail;
1125 } else {
1126 /* Ah, the simple case - we're a file so just send a
1127 * message. */
1128 status = ocfs2_request_rename_vote(old_inode, old_dentry);
1129 if (status < 0) {
1130 mlog_errno(status);
1131 goto bail;
1132 }
1133 }
1134
1135 status = -ENOENT;
1136 old_de_bh = ocfs2_find_entry(old_dentry->d_name.name,
1137 old_dentry->d_name.len,
1138 old_dir, &old_de);
1139 if (!old_de_bh)
1140 goto bail;
1141
1142 /*
1143 * Check for inode number is _not_ due to possible IO errors.
1144 * We might rmdir the source, keep it as pwd of some process
1145 * and merrily kill the link to whatever was created under the
1146 * same name. Goodbye sticky bit ;-<
1147 */
1148 if (le64_to_cpu(old_de->inode) != OCFS2_I(old_inode)->ip_blkno)
1149 goto bail;
1150
1151 /* check if the target already exists (in which case we need
1152 * to delete it */
1153 status = ocfs2_find_files_on_disk(new_dentry->d_name.name,
1154 new_dentry->d_name.len,
1155 &newfe_blkno, new_dir, &new_de_bh,
1156 &new_de);
1157 /* The only error we allow here is -ENOENT because the new
1158 * file not existing is perfectly valid. */
1159 if ((status < 0) && (status != -ENOENT)) {
1160 /* If we cannot find the file specified we should just */
1161 /* return the error... */
1162 mlog_errno(status);
1163 goto bail;
1164 }
1165
1166 if (!new_de && new_inode)
1167 mlog(ML_ERROR, "inode %lu does not exist in it's parent "
1168 "directory!", new_inode->i_ino);
1169
1170 /* In case we need to overwrite an existing file, we blow it
1171 * away first */
1172 if (new_de) {
1173 /* VFS didn't think there existed an inode here, but
1174 * someone else in the cluster must have raced our
1175 * rename to create one. Today we error cleanly, in
1176 * the future we should consider calling iget to build
1177 * a new struct inode for this entry. */
1178 if (!new_inode) {
1179 status = -EACCES;
1180
1181 mlog(0, "We found an inode for name %.*s but VFS "
1182 "didn't give us one.\n", new_dentry->d_name.len,
1183 new_dentry->d_name.name);
1184 goto bail;
1185 }
1186
1187 if (OCFS2_I(new_inode)->ip_blkno != newfe_blkno) {
1188 status = -EACCES;
1189
1190 mlog(0, "Inode blkno (%"MLFu64") and dir (%"MLFu64") "
1191 "disagree. ip_flags = %x\n",
1192 OCFS2_I(new_inode)->ip_blkno, newfe_blkno,
1193 OCFS2_I(new_inode)->ip_flags);
1194 goto bail;
1195 }
1196
1197 status = ocfs2_meta_lock(new_inode, handle, &newfe_bh, 1);
1198 if (status < 0) {
1199 if (status != -ENOENT)
1200 mlog_errno(status);
1201 goto bail;
1202 }
1203
1204 if (S_ISDIR(new_inode->i_mode))
1205 links_count = 0;
1206 else
1207 links_count = (unsigned int) (new_inode->i_nlink - 1);
1208
1209 status = ocfs2_request_unlink_vote(new_inode, new_dentry,
1210 links_count);
1211 if (status < 0) {
1212 mlog_errno(status);
1213 goto bail;
1214 }
1215
1216 newfe = (struct ocfs2_dinode *) newfe_bh->b_data;
1217
1218 mlog(0, "aha rename over existing... new_de=%p "
1219 "new_blkno=%"MLFu64" newfebh=%p bhblocknr=%llu\n",
1220 new_de, newfe_blkno, newfe_bh, newfe_bh ?
1221 (unsigned long long)newfe_bh->b_blocknr : 0ULL);
1222
1223 if (S_ISDIR(new_inode->i_mode) || (new_inode->i_nlink == 1)) {
1224 status = ocfs2_prepare_orphan_dir(osb, handle,
1225 new_inode,
1226 orphan_name,
1227 &orphan_entry_bh);
1228 if (status < 0) {
1229 mlog_errno(status);
1230 goto bail;
1231 }
1232 }
1233 } else {
1234 BUG_ON(new_dentry->d_parent->d_inode != new_dir);
1235
1236 status = ocfs2_check_dir_for_entry(new_dir,
1237 new_dentry->d_name.name,
1238 new_dentry->d_name.len);
1239 if (status)
1240 goto bail;
1241
1242 status = ocfs2_prepare_dir_for_insert(osb, new_dir, new_dir_bh,
1243 new_dentry->d_name.name,
1244 new_dentry->d_name.len,
1245 &insert_entry_bh);
1246 if (status < 0) {
1247 mlog_errno(status);
1248 goto bail;
1249 }
1250 }
1251
1252 handle = ocfs2_start_trans(osb, handle, OCFS2_RENAME_CREDITS);
1253 if (IS_ERR(handle)) {
1254 status = PTR_ERR(handle);
1255 handle = NULL;
1256 mlog_errno(status);
1257 goto bail;
1258 }
1259
1260 if (new_de) {
1261 if (S_ISDIR(new_inode->i_mode)) {
1262 if (!ocfs2_empty_dir(new_inode) ||
1263 new_inode->i_nlink != 2) {
1264 status = -ENOTEMPTY;
1265 goto bail;
1266 }
1267 }
1268 status = ocfs2_journal_access(handle, new_inode, newfe_bh,
1269 OCFS2_JOURNAL_ACCESS_WRITE);
1270 if (status < 0) {
1271 mlog_errno(status);
1272 goto bail;
1273 }
1274
1275 if (S_ISDIR(new_inode->i_mode) ||
1276 (newfe->i_links_count == cpu_to_le16(1))){
1277 status = ocfs2_orphan_add(osb, handle, new_inode,
1278 newfe, orphan_name,
1279 orphan_entry_bh);
1280 if (status < 0) {
1281 mlog_errno(status);
1282 goto bail;
1283 }
1284 }
1285
1286 /* change the dirent to point to the correct inode */
1287 status = ocfs2_journal_access(handle, new_dir, new_de_bh,
1288 OCFS2_JOURNAL_ACCESS_WRITE);
1289 if (status < 0) {
1290 mlog_errno(status);
1291 goto bail;
1292 }
1293 new_de->inode = cpu_to_le64(OCFS2_I(old_inode)->ip_blkno);
1294 new_de->file_type = old_de->file_type;
1295 new_dir->i_version++;
1296 status = ocfs2_journal_dirty(handle, new_de_bh);
1297 if (status < 0) {
1298 mlog_errno(status);
1299 goto bail;
1300 }
1301
1302 if (S_ISDIR(new_inode->i_mode))
1303 newfe->i_links_count = 0;
1304 else
1305 le16_add_cpu(&newfe->i_links_count, -1);
1306
1307 status = ocfs2_journal_dirty(handle, newfe_bh);
1308 if (status < 0) {
1309 mlog_errno(status);
1310 goto bail;
1311 }
1312 } else {
1313 /* if the name was not found in new_dir, add it now */
1314 status = ocfs2_add_entry(handle, new_dentry, old_inode,
1315 OCFS2_I(old_inode)->ip_blkno,
1316 new_dir_bh, insert_entry_bh);
1317 }
1318
1319 old_inode->i_ctime = CURRENT_TIME;
1320 mark_inode_dirty(old_inode);
1321
1322 /* now that the name has been added to new_dir, remove the old name */
1323 status = ocfs2_delete_entry(handle, old_dir, old_de, old_de_bh);
1324 if (status < 0) {
1325 mlog_errno(status);
1326 goto bail;
1327 }
1328
1329 if (new_inode) {
1330 new_inode->i_nlink--;
1331 new_inode->i_ctime = CURRENT_TIME;
1332 }
1333 old_dir->i_ctime = old_dir->i_mtime = CURRENT_TIME;
1334 if (old_inode_de_bh) {
1335 status = ocfs2_journal_access(handle, old_inode,
1336 old_inode_de_bh,
1337 OCFS2_JOURNAL_ACCESS_WRITE);
1338 PARENT_INO(old_inode_de_bh->b_data) =
1339 cpu_to_le64(OCFS2_I(new_dir)->ip_blkno);
1340 status = ocfs2_journal_dirty(handle, old_inode_de_bh);
1341 old_dir->i_nlink--;
1342 if (new_inode) {
1343 new_inode->i_nlink--;
1344 } else {
1345 new_dir->i_nlink++;
1346 mark_inode_dirty(new_dir);
1347 }
1348 }
1349 mark_inode_dirty(old_dir);
1350 if (new_inode)
1351 mark_inode_dirty(new_inode);
1352
1353 if (old_dir != new_dir)
1354 if (new_dir_nlink != new_dir->i_nlink) {
1355 if (!new_dir_bh) {
1356 mlog(ML_ERROR, "need to change nlink for new "
1357 "dir %"MLFu64" from %d to %d but bh is "
1358 "NULL\n", OCFS2_I(new_dir)->ip_blkno,
1359 (int)new_dir_nlink, new_dir->i_nlink);
1360 } else {
1361 struct ocfs2_dinode *fe;
1362 status = ocfs2_journal_access(handle,
1363 new_dir,
1364 new_dir_bh,
1365 OCFS2_JOURNAL_ACCESS_WRITE);
1366 fe = (struct ocfs2_dinode *) new_dir_bh->b_data;
1367 fe->i_links_count = cpu_to_le16(new_dir->i_nlink);
1368 status = ocfs2_journal_dirty(handle, new_dir_bh);
1369 }
1370 }
1371
1372 if (old_dir_nlink != old_dir->i_nlink) {
1373 if (!old_dir_bh) {
1374 mlog(ML_ERROR, "need to change nlink for old dir "
1375 "%"MLFu64" from %d to %d but bh is NULL!\n",
1376 OCFS2_I(old_dir)->ip_blkno,
1377 (int)old_dir_nlink,
1378 old_dir->i_nlink);
1379 } else {
1380 struct ocfs2_dinode *fe;
1381 status = ocfs2_journal_access(handle, old_dir,
1382 old_dir_bh,
1383 OCFS2_JOURNAL_ACCESS_WRITE);
1384 fe = (struct ocfs2_dinode *) old_dir_bh->b_data;
1385 fe->i_links_count = cpu_to_le16(old_dir->i_nlink);
1386 status = ocfs2_journal_dirty(handle, old_dir_bh);
1387 }
1388 }
1389
1390 status = 0;
1391bail:
1392 if (rename_lock)
1393 ocfs2_rename_unlock(osb);
1394
1395 if (handle)
1396 ocfs2_commit_trans(handle);
1397
1398 if (new_inode)
1399 sync_mapping_buffers(old_inode->i_mapping);
1400
1401 if (new_inode)
1402 iput(new_inode);
1403 if (newfe_bh)
1404 brelse(newfe_bh);
1405 if (old_dir_bh)
1406 brelse(old_dir_bh);
1407 if (new_dir_bh)
1408 brelse(new_dir_bh);
1409 if (new_de_bh)
1410 brelse(new_de_bh);
1411 if (old_de_bh)
1412 brelse(old_de_bh);
1413 if (old_inode_de_bh)
1414 brelse(old_inode_de_bh);
1415 if (orphan_entry_bh)
1416 brelse(orphan_entry_bh);
1417 if (insert_entry_bh)
1418 brelse(insert_entry_bh);
1419
1420 mlog_exit(status);
1421
1422 return status;
1423}
1424
1425/*
1426 * we expect i_size = strlen(symname). Copy symname into the file
1427 * data, including the null terminator.
1428 */
1429static int ocfs2_create_symlink_data(struct ocfs2_super *osb,
1430 struct ocfs2_journal_handle *handle,
1431 struct inode *inode,
1432 const char *symname)
1433{
1434 struct buffer_head **bhs = NULL;
1435 const char *c;
1436 struct super_block *sb = osb->sb;
1437 u64 p_blkno;
1438 int p_blocks;
1439 int virtual, blocks, status, i, bytes_left;
1440
1441 bytes_left = i_size_read(inode) + 1;
1442 /* we can't trust i_blocks because we're actually going to
1443 * write i_size + 1 bytes. */
1444 blocks = (bytes_left + sb->s_blocksize - 1) >> sb->s_blocksize_bits;
1445
1446 mlog_entry("i_blocks = %lu, i_size = %llu, blocks = %d\n",
1447 inode->i_blocks, i_size_read(inode), blocks);
1448
1449 /* Sanity check -- make sure we're going to fit. */
1450 if (bytes_left >
1451 ocfs2_clusters_to_bytes(sb, OCFS2_I(inode)->ip_clusters)) {
1452 status = -EIO;
1453 mlog_errno(status);
1454 goto bail;
1455 }
1456
1457 bhs = kcalloc(blocks, sizeof(struct buffer_head *), GFP_KERNEL);
1458 if (!bhs) {
1459 status = -ENOMEM;
1460 mlog_errno(status);
1461 goto bail;
1462 }
1463
1464 status = ocfs2_extent_map_get_blocks(inode, 0, 1, &p_blkno,
1465 &p_blocks);
1466 if (status < 0) {
1467 mlog_errno(status);
1468 goto bail;
1469 }
1470
1471 /* links can never be larger than one cluster so we know this
1472 * is all going to be contiguous, but do a sanity check
1473 * anyway. */
1474 if ((p_blocks << sb->s_blocksize_bits) < bytes_left) {
1475 status = -EIO;
1476 mlog_errno(status);
1477 goto bail;
1478 }
1479
1480 virtual = 0;
1481 while(bytes_left > 0) {
1482 c = &symname[virtual * sb->s_blocksize];
1483
1484 bhs[virtual] = sb_getblk(sb, p_blkno);
1485 if (!bhs[virtual]) {
1486 status = -ENOMEM;
1487 mlog_errno(status);
1488 goto bail;
1489 }
1490 ocfs2_set_new_buffer_uptodate(inode, bhs[virtual]);
1491
1492 status = ocfs2_journal_access(handle, inode, bhs[virtual],
1493 OCFS2_JOURNAL_ACCESS_CREATE);
1494 if (status < 0) {
1495 mlog_errno(status);
1496 goto bail;
1497 }
1498
1499 memset(bhs[virtual]->b_data, 0, sb->s_blocksize);
1500
1501 memcpy(bhs[virtual]->b_data, c,
1502 (bytes_left > sb->s_blocksize) ? sb->s_blocksize :
1503 bytes_left);
1504
1505 status = ocfs2_journal_dirty(handle, bhs[virtual]);
1506 if (status < 0) {
1507 mlog_errno(status);
1508 goto bail;
1509 }
1510
1511 virtual++;
1512 p_blkno++;
1513 bytes_left -= sb->s_blocksize;
1514 }
1515
1516 status = 0;
1517bail:
1518
1519 if (bhs) {
1520 for(i = 0; i < blocks; i++)
1521 if (bhs[i])
1522 brelse(bhs[i]);
1523 kfree(bhs);
1524 }
1525
1526 mlog_exit(status);
1527 return status;
1528}
1529
1530static int ocfs2_symlink(struct inode *dir,
1531 struct dentry *dentry,
1532 const char *symname)
1533{
1534 int status, l, credits;
1535 u64 newsize;
1536 struct ocfs2_super *osb = NULL;
1537 struct inode *inode = NULL;
1538 struct super_block *sb;
1539 struct buffer_head *new_fe_bh = NULL;
1540 struct buffer_head *de_bh = NULL;
1541 struct buffer_head *parent_fe_bh = NULL;
1542 struct ocfs2_dinode *fe = NULL;
1543 struct ocfs2_dinode *dirfe;
1544 struct ocfs2_journal_handle *handle = NULL;
1545 struct ocfs2_alloc_context *inode_ac = NULL;
1546 struct ocfs2_alloc_context *data_ac = NULL;
1547
1548 mlog_entry("(0x%p, 0x%p, symname='%s' actual='%.*s')\n", dir,
1549 dentry, symname, dentry->d_name.len, dentry->d_name.name);
1550
1551 sb = dir->i_sb;
1552 osb = OCFS2_SB(sb);
1553
1554 l = strlen(symname) + 1;
1555
1556 credits = ocfs2_calc_symlink_credits(sb);
1557
1558 handle = ocfs2_alloc_handle(osb);
1559 if (handle == NULL) {
1560 status = -ENOMEM;
1561 mlog_errno(status);
1562 goto bail;
1563 }
1564
1565 /* lock the parent directory */
1566 status = ocfs2_meta_lock(dir, handle, &parent_fe_bh, 1);
1567 if (status < 0) {
1568 if (status != -ENOENT)
1569 mlog_errno(status);
1570 goto bail;
1571 }
1572
1573 dirfe = (struct ocfs2_dinode *) parent_fe_bh->b_data;
1574 if (!dirfe->i_links_count) {
1575 /* can't make a file in a deleted directory. */
1576 status = -ENOENT;
1577 goto bail;
1578 }
1579
1580 status = ocfs2_check_dir_for_entry(dir, dentry->d_name.name,
1581 dentry->d_name.len);
1582 if (status)
1583 goto bail;
1584
1585 status = ocfs2_prepare_dir_for_insert(osb, dir, parent_fe_bh,
1586 dentry->d_name.name,
1587 dentry->d_name.len, &de_bh);
1588 if (status < 0) {
1589 mlog_errno(status);
1590 goto bail;
1591 }
1592
1593 status = ocfs2_reserve_new_inode(osb, handle, &inode_ac);
1594 if (status < 0) {
1595 if (status != -ENOSPC)
1596 mlog_errno(status);
1597 goto bail;
1598 }
1599
1600 /* don't reserve bitmap space for fast symlinks. */
1601 if (l > ocfs2_fast_symlink_chars(sb)) {
1602 status = ocfs2_reserve_clusters(osb, handle, 1, &data_ac);
1603 if (status < 0) {
1604 if (status != -ENOSPC)
1605 mlog_errno(status);
1606 goto bail;
1607 }
1608 }
1609
1610 handle = ocfs2_start_trans(osb, handle, credits);
1611 if (IS_ERR(handle)) {
1612 status = PTR_ERR(handle);
1613 handle = NULL;
1614 mlog_errno(status);
1615 goto bail;
1616 }
1617
1618 status = ocfs2_mknod_locked(osb, dir, dentry,
1619 S_IFLNK | S_IRWXUGO, 0,
1620 &new_fe_bh, parent_fe_bh, handle,
1621 &inode, inode_ac);
1622 if (status < 0) {
1623 mlog_errno(status);
1624 goto bail;
1625 }
1626
1627 fe = (struct ocfs2_dinode *) new_fe_bh->b_data;
1628 inode->i_rdev = 0;
1629 newsize = l - 1;
1630 if (l > ocfs2_fast_symlink_chars(sb)) {
1631 inode->i_op = &ocfs2_symlink_inode_operations;
1632 status = ocfs2_do_extend_allocation(osb, inode, 1, new_fe_bh,
1633 handle, data_ac, NULL,
1634 NULL);
1635 if (status < 0) {
1636 if (status != -ENOSPC && status != -EINTR) {
1637 mlog(ML_ERROR, "Failed to extend file to "
1638 "%"MLFu64"\n",
1639 newsize);
1640 mlog_errno(status);
1641 status = -ENOSPC;
1642 }
1643 goto bail;
1644 }
1645 i_size_write(inode, newsize);
1646 inode->i_blocks = ocfs2_align_bytes_to_sectors(newsize);
1647 } else {
1648 inode->i_op = &ocfs2_fast_symlink_inode_operations;
1649 memcpy((char *) fe->id2.i_symlink, symname, l);
1650 i_size_write(inode, newsize);
1651 inode->i_blocks = 0;
1652 }
1653
1654 status = ocfs2_mark_inode_dirty(handle, inode, new_fe_bh);
1655 if (status < 0) {
1656 mlog_errno(status);
1657 goto bail;
1658 }
1659
1660 if (!ocfs2_inode_is_fast_symlink(inode)) {
1661 status = ocfs2_create_symlink_data(osb, handle, inode,
1662 symname);
1663 if (status < 0) {
1664 mlog_errno(status);
1665 goto bail;
1666 }
1667 }
1668
1669 status = ocfs2_add_entry(handle, dentry, inode,
1670 le64_to_cpu(fe->i_blkno), parent_fe_bh,
1671 de_bh);
1672 if (status < 0) {
1673 mlog_errno(status);
1674 goto bail;
1675 }
1676
1677 insert_inode_hash(inode);
1678 dentry->d_op = &ocfs2_dentry_ops;
1679 d_instantiate(dentry, inode);
1680bail:
1681 if (handle)
1682 ocfs2_commit_trans(handle);
1683 if (new_fe_bh)
1684 brelse(new_fe_bh);
1685 if (parent_fe_bh)
1686 brelse(parent_fe_bh);
1687 if (de_bh)
1688 brelse(de_bh);
1689 if (inode_ac)
1690 ocfs2_free_alloc_context(inode_ac);
1691 if (data_ac)
1692 ocfs2_free_alloc_context(data_ac);
1693 if ((status < 0) && inode)
1694 iput(inode);
1695
1696 mlog_exit(status);
1697
1698 return status;
1699}
1700
1701int ocfs2_check_dir_entry(struct inode * dir,
1702 struct ocfs2_dir_entry * de,
1703 struct buffer_head * bh,
1704 unsigned long offset)
1705{
1706 const char *error_msg = NULL;
1707 const int rlen = le16_to_cpu(de->rec_len);
1708
1709 if (rlen < OCFS2_DIR_REC_LEN(1))
1710 error_msg = "rec_len is smaller than minimal";
1711 else if (rlen % 4 != 0)
1712 error_msg = "rec_len % 4 != 0";
1713 else if (rlen < OCFS2_DIR_REC_LEN(de->name_len))
1714 error_msg = "rec_len is too small for name_len";
1715 else if (((char *) de - bh->b_data) + rlen > dir->i_sb->s_blocksize)
1716 error_msg = "directory entry across blocks";
1717
1718 if (error_msg != NULL)
1719 mlog(ML_ERROR, "bad entry in directory #%"MLFu64": %s - "
1720 "offset=%lu, inode=%"MLFu64", rec_len=%d, name_len=%d\n",
1721 OCFS2_I(dir)->ip_blkno, error_msg, offset,
1722 le64_to_cpu(de->inode), rlen, de->name_len);
1723 return error_msg == NULL ? 1 : 0;
1724}
1725
1726/* we don't always have a dentry for what we want to add, so people
1727 * like orphan dir can call this instead.
1728 *
1729 * If you pass me insert_bh, I'll skip the search of the other dir
1730 * blocks and put the record in there.
1731 */
1732static int __ocfs2_add_entry(struct ocfs2_journal_handle *handle,
1733 struct inode *dir,
1734 const char *name, int namelen,
1735 struct inode *inode, u64 blkno,
1736 struct buffer_head *parent_fe_bh,
1737 struct buffer_head *insert_bh)
1738{
1739 unsigned long offset;
1740 unsigned short rec_len;
1741 struct ocfs2_dir_entry *de, *de1;
1742 struct super_block *sb;
1743 int retval, status;
1744
1745 mlog_entry_void();
1746
1747 sb = dir->i_sb;
1748
1749 if (!namelen)
1750 return -EINVAL;
1751
1752 rec_len = OCFS2_DIR_REC_LEN(namelen);
1753 offset = 0;
1754 de = (struct ocfs2_dir_entry *) insert_bh->b_data;
1755 while (1) {
1756 BUG_ON((char *)de >= sb->s_blocksize + insert_bh->b_data);
1757 /* These checks should've already been passed by the
1758 * prepare function, but I guess we can leave them
1759 * here anyway. */
1760 if (!ocfs2_check_dir_entry(dir, de, insert_bh, offset)) {
1761 retval = -ENOENT;
1762 goto bail;
1763 }
1764 if (ocfs2_match(namelen, name, de)) {
1765 retval = -EEXIST;
1766 goto bail;
1767 }
1768 if (((le64_to_cpu(de->inode) == 0) &&
1769 (le16_to_cpu(de->rec_len) >= rec_len)) ||
1770 (le16_to_cpu(de->rec_len) >=
1771 (OCFS2_DIR_REC_LEN(de->name_len) + rec_len))) {
1772 status = ocfs2_journal_access(handle, dir, insert_bh,
1773 OCFS2_JOURNAL_ACCESS_WRITE);
1774 /* By now the buffer is marked for journaling */
1775 offset += le16_to_cpu(de->rec_len);
1776 if (le64_to_cpu(de->inode)) {
1777 de1 = (struct ocfs2_dir_entry *)((char *) de +
1778 OCFS2_DIR_REC_LEN(de->name_len));
1779 de1->rec_len =
1780 cpu_to_le16(le16_to_cpu(de->rec_len) -
1781 OCFS2_DIR_REC_LEN(de->name_len));
1782 de->rec_len = cpu_to_le16(OCFS2_DIR_REC_LEN(de->name_len));
1783 de = de1;
1784 }
1785 de->file_type = OCFS2_FT_UNKNOWN;
1786 if (blkno) {
1787 de->inode = cpu_to_le64(blkno);
1788 ocfs2_set_de_type(de, inode->i_mode);
1789 } else
1790 de->inode = 0;
1791 de->name_len = namelen;
1792 memcpy(de->name, name, namelen);
1793
1794 dir->i_mtime = dir->i_ctime = CURRENT_TIME;
1795 dir->i_version++;
1796 status = ocfs2_journal_dirty(handle, insert_bh);
1797 retval = 0;
1798 goto bail;
1799 }
1800 offset += le16_to_cpu(de->rec_len);
1801 de = (struct ocfs2_dir_entry *) ((char *) de + le16_to_cpu(de->rec_len));
1802 }
1803
1804 /* when you think about it, the assert above should prevent us
1805 * from ever getting here. */
1806 retval = -ENOSPC;
1807bail:
1808
1809 mlog_exit(retval);
1810 return retval;
1811}
1812
1813
1814/*
1815 * ocfs2_delete_entry deletes a directory entry by merging it with the
1816 * previous entry
1817 */
1818static int ocfs2_delete_entry(struct ocfs2_journal_handle *handle,
1819 struct inode *dir,
1820 struct ocfs2_dir_entry *de_del,
1821 struct buffer_head *bh)
1822{
1823 struct ocfs2_dir_entry *de, *pde;
1824 int i, status = -ENOENT;
1825
1826 mlog_entry("(0x%p, 0x%p, 0x%p, 0x%p)\n", handle, dir, de_del, bh);
1827
1828 i = 0;
1829 pde = NULL;
1830 de = (struct ocfs2_dir_entry *) bh->b_data;
1831 while (i < bh->b_size) {
1832 if (!ocfs2_check_dir_entry(dir, de, bh, i)) {
1833 status = -EIO;
1834 mlog_errno(status);
1835 goto bail;
1836 }
1837 if (de == de_del) {
1838 status = ocfs2_journal_access(handle, dir, bh,
1839 OCFS2_JOURNAL_ACCESS_WRITE);
1840 if (status < 0) {
1841 status = -EIO;
1842 mlog_errno(status);
1843 goto bail;
1844 }
1845 if (pde)
1846 pde->rec_len =
1847 cpu_to_le16(le16_to_cpu(pde->rec_len) +
1848 le16_to_cpu(de->rec_len));
1849 else
1850 de->inode = 0;
1851 dir->i_version++;
1852 status = ocfs2_journal_dirty(handle, bh);
1853 goto bail;
1854 }
1855 i += le16_to_cpu(de->rec_len);
1856 pde = de;
1857 de = (struct ocfs2_dir_entry *)((char *)de + le16_to_cpu(de->rec_len));
1858 }
1859bail:
1860 mlog_exit(status);
1861 return status;
1862}
1863
1864/*
1865 * Returns 0 if not found, -1 on failure, and 1 on success
1866 */
1867static int inline ocfs2_search_dirblock(struct buffer_head *bh,
1868 struct inode *dir,
1869 const char *name, int namelen,
1870 unsigned long offset,
1871 struct ocfs2_dir_entry **res_dir)
1872{
1873 struct ocfs2_dir_entry *de;
1874 char *dlimit, *de_buf;
1875 int de_len;
1876 int ret = 0;
1877
1878 mlog_entry_void();
1879
1880 de_buf = bh->b_data;
1881 dlimit = de_buf + dir->i_sb->s_blocksize;
1882
1883 while (de_buf < dlimit) {
1884 /* this code is executed quadratically often */
1885 /* do minimal checking `by hand' */
1886
1887 de = (struct ocfs2_dir_entry *) de_buf;
1888
1889 if (de_buf + namelen <= dlimit &&
1890 ocfs2_match(namelen, name, de)) {
1891 /* found a match - just to be sure, do a full check */
1892 if (!ocfs2_check_dir_entry(dir, de, bh, offset)) {
1893 ret = -1;
1894 goto bail;
1895 }
1896 *res_dir = de;
1897 ret = 1;
1898 goto bail;
1899 }
1900
1901 /* prevent looping on a bad block */
1902 de_len = le16_to_cpu(de->rec_len);
1903 if (de_len <= 0) {
1904 ret = -1;
1905 goto bail;
1906 }
1907
1908 de_buf += de_len;
1909 offset += de_len;
1910 }
1911
1912bail:
1913 mlog_exit(ret);
1914 return ret;
1915}
1916
1917struct buffer_head *ocfs2_find_entry(const char *name, int namelen,
1918 struct inode *dir,
1919 struct ocfs2_dir_entry **res_dir)
1920{
1921 struct super_block *sb;
1922 struct buffer_head *bh_use[NAMEI_RA_SIZE];
1923 struct buffer_head *bh, *ret = NULL;
1924 unsigned long start, block, b;
1925 int ra_max = 0; /* Number of bh's in the readahead
1926 buffer, bh_use[] */
1927 int ra_ptr = 0; /* Current index into readahead
1928 buffer */
1929 int num = 0;
1930 int nblocks, i, err;
1931
1932 mlog_entry_void();
1933
1934 *res_dir = NULL;
1935 sb = dir->i_sb;
1936
1937 nblocks = i_size_read(dir) >> sb->s_blocksize_bits;
1938 start = OCFS2_I(dir)->ip_dir_start_lookup;
1939 if (start >= nblocks)
1940 start = 0;
1941 block = start;
1942
1943restart:
1944 do {
1945 /*
1946 * We deal with the read-ahead logic here.
1947 */
1948 if (ra_ptr >= ra_max) {
1949 /* Refill the readahead buffer */
1950 ra_ptr = 0;
1951 b = block;
1952 for (ra_max = 0; ra_max < NAMEI_RA_SIZE; ra_max++) {
1953 /*
1954 * Terminate if we reach the end of the
1955 * directory and must wrap, or if our
1956 * search has finished at this block.
1957 */
1958 if (b >= nblocks || (num && block == start)) {
1959 bh_use[ra_max] = NULL;
1960 break;
1961 }
1962 num++;
1963
1964 /* XXX: questionable readahead stuff here */
1965 bh = ocfs2_bread(dir, b++, &err, 1);
1966 bh_use[ra_max] = bh;
1967#if 0 // ???
1968 if (bh)
1969 ll_rw_block(READ, 1, &bh);
1970#endif
1971 }
1972 }
1973 if ((bh = bh_use[ra_ptr++]) == NULL)
1974 goto next;
1975 wait_on_buffer(bh);
1976 if (!buffer_uptodate(bh)) {
1977 /* read error, skip block & hope for the best */
1978 brelse(bh);
1979 goto next;
1980 }
1981 i = ocfs2_search_dirblock(bh, dir, name, namelen,
1982 block << sb->s_blocksize_bits,
1983 res_dir);
1984 if (i == 1) {
1985 OCFS2_I(dir)->ip_dir_start_lookup = block;
1986 ret = bh;
1987 goto cleanup_and_exit;
1988 } else {
1989 brelse(bh);
1990 if (i < 0)
1991 goto cleanup_and_exit;
1992 }
1993 next:
1994 if (++block >= nblocks)
1995 block = 0;
1996 } while (block != start);
1997
1998 /*
1999 * If the directory has grown while we were searching, then
2000 * search the last part of the directory before giving up.
2001 */
2002 block = nblocks;
2003 nblocks = i_size_read(dir) >> sb->s_blocksize_bits;
2004 if (block < nblocks) {
2005 start = 0;
2006 goto restart;
2007 }
2008
2009cleanup_and_exit:
2010 /* Clean up the read-ahead blocks */
2011 for (; ra_ptr < ra_max; ra_ptr++)
2012 brelse(bh_use[ra_ptr]);
2013
2014 mlog_exit_ptr(ret);
2015 return ret;
2016}
2017
2018static int ocfs2_blkno_stringify(u64 blkno, char *name)
2019{
2020 int status, namelen;
2021
2022 mlog_entry_void();
2023
2024 namelen = snprintf(name, OCFS2_ORPHAN_NAMELEN + 1, "%016"MLFx64,
2025 blkno);
2026 if (namelen <= 0) {
2027 if (namelen)
2028 status = namelen;
2029 else
2030 status = -EINVAL;
2031 mlog_errno(status);
2032 goto bail;
2033 }
2034 if (namelen != OCFS2_ORPHAN_NAMELEN) {
2035 status = -EINVAL;
2036 mlog_errno(status);
2037 goto bail;
2038 }
2039
2040 mlog(0, "built filename '%s' for orphan dir (len=%d)\n", name,
2041 namelen);
2042
2043 status = 0;
2044bail:
2045 mlog_exit(status);
2046 return status;
2047}
2048
2049static int ocfs2_prepare_orphan_dir(struct ocfs2_super *osb,
2050 struct ocfs2_journal_handle *handle,
2051 struct inode *inode,
2052 char *name,
2053 struct buffer_head **de_bh)
2054{
2055 struct inode *orphan_dir_inode = NULL;
2056 struct buffer_head *orphan_dir_bh = NULL;
2057 int status = 0;
2058
2059 status = ocfs2_blkno_stringify(OCFS2_I(inode)->ip_blkno, name);
2060 if (status < 0) {
2061 mlog_errno(status);
2062 goto leave;
2063 }
2064
2065 orphan_dir_inode = ocfs2_get_system_file_inode(osb,
2066 ORPHAN_DIR_SYSTEM_INODE,
2067 osb->slot_num);
2068 if (!orphan_dir_inode) {
2069 status = -ENOENT;
2070 mlog_errno(status);
2071 goto leave;
2072 }
2073
2074 ocfs2_handle_add_inode(handle, orphan_dir_inode);
2075 status = ocfs2_meta_lock(orphan_dir_inode, handle, &orphan_dir_bh, 1);
2076 if (status < 0) {
2077 mlog_errno(status);
2078 goto leave;
2079 }
2080
2081 status = ocfs2_prepare_dir_for_insert(osb, orphan_dir_inode,
2082 orphan_dir_bh, name,
2083 OCFS2_ORPHAN_NAMELEN, de_bh);
2084 if (status < 0) {
2085 mlog_errno(status);
2086 goto leave;
2087 }
2088
2089leave:
2090 if (orphan_dir_inode)
2091 iput(orphan_dir_inode);
2092
2093 if (orphan_dir_bh)
2094 brelse(orphan_dir_bh);
2095
2096 mlog_exit(status);
2097 return status;
2098}
2099
2100static int ocfs2_orphan_add(struct ocfs2_super *osb,
2101 struct ocfs2_journal_handle *handle,
2102 struct inode *inode,
2103 struct ocfs2_dinode *fe,
2104 char *name,
2105 struct buffer_head *de_bh)
2106{
2107 struct inode *orphan_dir_inode = NULL;
2108 struct buffer_head *orphan_dir_bh = NULL;
2109 int status = 0;
2110 struct ocfs2_dinode *orphan_fe;
2111
2112 mlog_entry("(inode->i_ino = %lu)\n", inode->i_ino);
2113
2114 orphan_dir_inode = ocfs2_get_system_file_inode(osb,
2115 ORPHAN_DIR_SYSTEM_INODE,
2116 osb->slot_num);
2117 if (!orphan_dir_inode) {
2118 status = -ENOENT;
2119 mlog_errno(status);
2120 goto leave;
2121 }
2122
2123 status = ocfs2_read_block(osb,
2124 OCFS2_I(orphan_dir_inode)->ip_blkno,
2125 &orphan_dir_bh, OCFS2_BH_CACHED,
2126 orphan_dir_inode);
2127 if (status < 0) {
2128 mlog_errno(status);
2129 goto leave;
2130 }
2131
2132 status = ocfs2_journal_access(handle, orphan_dir_inode, orphan_dir_bh,
2133 OCFS2_JOURNAL_ACCESS_WRITE);
2134 if (status < 0) {
2135 mlog_errno(status);
2136 goto leave;
2137 }
2138
2139 /* we're a cluster, and nlink can change on disk from
2140 * underneath us... */
2141 orphan_fe = (struct ocfs2_dinode *) orphan_dir_bh->b_data;
2142 if (S_ISDIR(inode->i_mode))
2143 le16_add_cpu(&orphan_fe->i_links_count, 1);
2144 orphan_dir_inode->i_nlink = le16_to_cpu(orphan_fe->i_links_count);
2145
2146 status = ocfs2_journal_dirty(handle, orphan_dir_bh);
2147 if (status < 0) {
2148 mlog_errno(status);
2149 goto leave;
2150 }
2151
2152 status = __ocfs2_add_entry(handle, orphan_dir_inode, name,
2153 OCFS2_ORPHAN_NAMELEN, inode,
2154 OCFS2_I(inode)->ip_blkno,
2155 orphan_dir_bh, de_bh);
2156 if (status < 0) {
2157 mlog_errno(status);
2158 goto leave;
2159 }
2160
2161 le32_add_cpu(&fe->i_flags, OCFS2_ORPHANED_FL);
2162
2163 /* Record which orphan dir our inode now resides
2164 * in. delete_inode will use this to determine which orphan
2165 * dir to lock. */
2166 spin_lock(&OCFS2_I(inode)->ip_lock);
2167 OCFS2_I(inode)->ip_orphaned_slot = osb->slot_num;
2168 spin_unlock(&OCFS2_I(inode)->ip_lock);
2169
2170 mlog(0, "Inode %"MLFu64" orphaned in slot %d\n",
2171 OCFS2_I(inode)->ip_blkno, osb->slot_num);
2172
2173leave:
2174 if (orphan_dir_inode)
2175 iput(orphan_dir_inode);
2176
2177 if (orphan_dir_bh)
2178 brelse(orphan_dir_bh);
2179
2180 mlog_exit(status);
2181 return status;
2182}
2183
2184/* unlike orphan_add, we expect the orphan dir to already be locked here. */
2185int ocfs2_orphan_del(struct ocfs2_super *osb,
2186 struct ocfs2_journal_handle *handle,
2187 struct inode *orphan_dir_inode,
2188 struct inode *inode,
2189 struct buffer_head *orphan_dir_bh)
2190{
2191 char name[OCFS2_ORPHAN_NAMELEN + 1];
2192 struct ocfs2_dinode *orphan_fe;
2193 int status = 0;
2194 struct buffer_head *target_de_bh = NULL;
2195 struct ocfs2_dir_entry *target_de = NULL;
2196
2197 mlog_entry_void();
2198
2199 status = ocfs2_blkno_stringify(OCFS2_I(inode)->ip_blkno, name);
2200 if (status < 0) {
2201 mlog_errno(status);
2202 goto leave;
2203 }
2204
2205 mlog(0, "removing '%s' from orphan dir %"MLFu64" (namelen=%d)\n",
2206 name, OCFS2_I(orphan_dir_inode)->ip_blkno, OCFS2_ORPHAN_NAMELEN);
2207
2208 /* find it's spot in the orphan directory */
2209 target_de_bh = ocfs2_find_entry(name, OCFS2_ORPHAN_NAMELEN,
2210 orphan_dir_inode, &target_de);
2211 if (!target_de_bh) {
2212 status = -ENOENT;
2213 mlog_errno(status);
2214 goto leave;
2215 }
2216
2217 /* remove it from the orphan directory */
2218 status = ocfs2_delete_entry(handle, orphan_dir_inode, target_de,
2219 target_de_bh);
2220 if (status < 0) {
2221 mlog_errno(status);
2222 goto leave;
2223 }
2224
2225 status = ocfs2_journal_access(handle,orphan_dir_inode, orphan_dir_bh,
2226 OCFS2_JOURNAL_ACCESS_WRITE);
2227 if (status < 0) {
2228 mlog_errno(status);
2229 goto leave;
2230 }
2231
2232 /* do the i_nlink dance! :) */
2233 orphan_fe = (struct ocfs2_dinode *) orphan_dir_bh->b_data;
2234 if (S_ISDIR(inode->i_mode))
2235 le16_add_cpu(&orphan_fe->i_links_count, -1);
2236 orphan_dir_inode->i_nlink = le16_to_cpu(orphan_fe->i_links_count);
2237
2238 status = ocfs2_journal_dirty(handle, orphan_dir_bh);
2239 if (status < 0) {
2240 mlog_errno(status);
2241 goto leave;
2242 }
2243
2244leave:
2245 if (target_de_bh)
2246 brelse(target_de_bh);
2247
2248 mlog_exit(status);
2249 return status;
2250}
2251
/*
 * Inode operations table for directories (going by its name); each slot
 * is wired to the matching ocfs2_* handler.
 */
2252struct inode_operations ocfs2_dir_iops = {
2253 .create = ocfs2_create,
2254 .lookup = ocfs2_lookup,
2255 .link = ocfs2_link,
2256 .unlink = ocfs2_unlink,
/* rmdir is served by the same handler as unlink */
2257 .rmdir = ocfs2_unlink,
2258 .symlink = ocfs2_symlink,
2259 .mkdir = ocfs2_mkdir,
2260 .mknod = ocfs2_mknod,
2261 .rename = ocfs2_rename,
2262 .setattr = ocfs2_setattr,
2263 .getattr = ocfs2_getattr,
2264};
diff --git a/fs/ocfs2/namei.h b/fs/ocfs2/namei.h
new file mode 100644
index 000000000000..deaaa97dbf0b
--- /dev/null
+++ b/fs/ocfs2/namei.h
@@ -0,0 +1,58 @@
1/* -*- mode: c; c-basic-offset: 8; -*-
2 * vim: noexpandtab sw=8 ts=8 sts=0:
3 *
4 * namei.h
5 *
6 * Function prototypes
7 *
8 * Copyright (C) 2002, 2004 Oracle. All rights reserved.
9 *
10 * This program is free software; you can redistribute it and/or
11 * modify it under the terms of the GNU General Public
12 * License as published by the Free Software Foundation; either
13 * version 2 of the License, or (at your option) any later version.
14 *
15 * This program is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
18 * General Public License for more details.
19 *
20 * You should have received a copy of the GNU General Public
21 * License along with this program; if not, write to the
22 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
23 * Boston, MA 02111-1307, USA.
24 */
25
26#ifndef OCFS2_NAMEI_H
27#define OCFS2_NAMEI_H
28
/* Inode operations table used for directories (per its name). */
29extern struct inode_operations ocfs2_dir_iops;
30
31struct dentry *ocfs2_get_parent(struct dentry *child);
32
/*
 * Sanity-check a directory entry's rec_len against its name length and
 * the block size.  Returns 1 if the entry is good, 0 (after logging an
 * error) otherwise.
 */
33int ocfs2_check_dir_entry (struct inode *dir,
34 struct ocfs2_dir_entry *de,
35 struct buffer_head *bh,
36 unsigned long offset);
/*
 * Look up name in dir.  Returns the buffer_head containing the entry and
 * points *res_dir at it, or returns NULL with *res_dir set to NULL.
 */
37struct buffer_head *ocfs2_find_entry(const char *name,
38 int namelen,
39 struct inode *dir,
40 struct ocfs2_dir_entry **res_dir);
/*
 * Remove inode's entry from the given orphan directory, which is
 * expected to be locked already.
 */
41int ocfs2_orphan_del(struct ocfs2_super *osb,
42 struct ocfs2_journal_handle *handle,
43 struct inode *orphan_dir_inode,
44 struct inode *inode,
45 struct buffer_head *orphan_dir_bh);
46
47static inline int ocfs2_match(int len,
48 const char * const name,
49 struct ocfs2_dir_entry *de)
50{
51 if (len != de->name_len)
52 return 0;
53 if (!de->inode)
54 return 0;
55 return !memcmp(name, de->name, len);
56}
57
58#endif /* OCFS2_NAMEI_H */
diff --git a/fs/ocfs2/ocfs1_fs_compat.h b/fs/ocfs2/ocfs1_fs_compat.h
new file mode 100644
index 000000000000..0b499bccec5a
--- /dev/null
+++ b/fs/ocfs2/ocfs1_fs_compat.h
@@ -0,0 +1,109 @@
1/* -*- mode: c; c-basic-offset: 8; -*-
2 * vim: noexpandtab sw=8 ts=8 sts=0:
3 *
4 * ocfs1_fs_compat.h
5 *
6 * OCFS1 volume header definitions. OCFS2 creates valid but unmountable
7 * OCFS1 volume headers on the first two sectors of an OCFS2 volume.
8 * This allows an OCFS1 volume to see the partition and cleanly fail to
9 * mount it.
10 *
11 * Copyright (C) 2002, 2004 Oracle. All rights reserved.
12 *
13 * This program is free software; you can redistribute it and/or
14 * modify it under the terms of the GNU General Public
15 * License, version 2, as published by the Free Software Foundation.
16 *
17 * This program is distributed in the hope that it will be useful,
18 * but WITHOUT ANY WARRANTY; without even the implied warranty of
19 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
20 * General Public License for more details.
21 *
22 * You should have received a copy of the GNU General Public
23 * License along with this program; if not, write to the
24 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
25 * Boston, MA 02111-1307, USA.
26 */
27
28#ifndef _OCFS1_FS_COMPAT_H
29#define _OCFS1_FS_COMPAT_H
30
/* Sizes, in bytes, of the fixed-length byte arrays in the OCFS1 headers */
31#define OCFS1_MAX_VOL_SIGNATURE_LEN 128
32#define OCFS1_MAX_MOUNT_POINT_LEN 128
33#define OCFS1_MAX_VOL_ID_LENGTH 16
34#define OCFS1_MAX_VOL_LABEL_LEN 64
35#define OCFS1_MAX_CLUSTER_NAME_LEN 64
36
/* Version numbers and signature string for the OCFS1-style header */
37#define OCFS1_MAJOR_VERSION (2)
38#define OCFS1_MINOR_VERSION (0)
39#define OCFS1_VOLUME_SIGNATURE "OracleCFS"
40
41/*
42 * OCFS1 superblock. Lives at sector 0.
 *
 * The hex comments in the left margin are byte offsets of the field on
 * that line; the structure ends at offset 0x1B0.
43 */
44struct ocfs1_vol_disk_hdr
45{
46/*00*/ __u32 minor_version;
47 __u32 major_version;
48/*08*/ __u8 signature[OCFS1_MAX_VOL_SIGNATURE_LEN];
49/*88*/ __u8 mount_point[OCFS1_MAX_MOUNT_POINT_LEN];
50/*108*/ __u64 serial_num;
51/*110*/ __u64 device_size;
52 __u64 start_off;
53/*120*/ __u64 bitmap_off;
54 __u64 publ_off;
55/*130*/ __u64 vote_off;
56 __u64 root_bitmap_off;
57/*140*/ __u64 data_start_off;
58 __u64 root_bitmap_size;
59/*150*/ __u64 root_off;
60 __u64 root_size;
61/*160*/ __u64 cluster_size;
62 __u64 num_nodes;
63/*170*/ __u64 num_clusters;
64 __u64 dir_node_size;
65/*180*/ __u64 file_node_size;
66 __u64 internal_off;
67/*190*/ __u64 node_cfg_off;
68 __u64 node_cfg_size;
69/*1A0*/ __u64 new_cfg_off;
70 __u32 prot_bits;
71 __s32 excl_mount;
72/*1B0*/
73};
74
75
/*
 * OCFS1 on-disk lock record; embedded at the start of
 * struct ocfs1_vol_label.  Margin comments are byte offsets; the
 * structure is 0x30 bytes long.
 */
76struct ocfs1_disk_lock
77{
78/*00*/ __u32 curr_master;
79 __u8 file_lock;
80 __u8 compat_pad[3]; /* Not in original definition. Used to
81 make the already existing alignment
82 explicit */
83 __u64 last_write_time;
84/*10*/ __u64 last_read_time;
85 __u32 writer_node_num;
86 __u32 reader_node_num;
87/*20*/ __u64 oin_node_map;
88 __u64 dlock_seq_num;
89/*30*/
90};
91
92/*
93 * OCFS1 volume label. Lives at sector 1.
 *
 * Margin comments are byte offsets.  cluster_name starts at 0x84 and is
 * OCFS1_MAX_CLUSTER_NAME_LEN (64 = 0x40) bytes long, so cluster_name_len
 * sits at 0xC4 and the structure ends at 0xC6.
94 */
95struct ocfs1_vol_label
96{
97/*00*/ struct ocfs1_disk_lock disk_lock;
98/*30*/ __u8 label[OCFS1_MAX_VOL_LABEL_LEN];
99/*70*/ __u16 label_len;
100/*72*/ __u8 vol_id[OCFS1_MAX_VOL_ID_LENGTH];
101/*82*/ __u16 vol_id_len;
102/*84*/ __u8 cluster_name[OCFS1_MAX_CLUSTER_NAME_LEN];
103/*C4*/ __u16 cluster_name_len;
104/*C6*/
105};
106
107
108#endif /* _OCFS1_FS_COMPAT_H */
109
diff --git a/fs/ocfs2/ocfs2.h b/fs/ocfs2/ocfs2.h
new file mode 100644
index 000000000000..f468c600cf92
--- /dev/null
+++ b/fs/ocfs2/ocfs2.h
@@ -0,0 +1,464 @@
1/* -*- mode: c; c-basic-offset: 8; -*-
2 * vim: noexpandtab sw=8 ts=8 sts=0:
3 *
4 * ocfs2.h
5 *
6 * Defines macros and structures used in OCFS2
7 *
8 * Copyright (C) 2002, 2004 Oracle. All rights reserved.
9 *
10 * This program is free software; you can redistribute it and/or
11 * modify it under the terms of the GNU General Public
12 * License as published by the Free Software Foundation; either
13 * version 2 of the License, or (at your option) any later version.
14 *
15 * This program is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
18 * General Public License for more details.
19 *
20 * You should have received a copy of the GNU General Public
21 * License along with this program; if not, write to the
22 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
23 * Boston, MA 02111-1307, USA.
24 */
25
26#ifndef OCFS2_H
27#define OCFS2_H
28
29#include <linux/spinlock.h>
30#include <linux/sched.h>
31#include <linux/wait.h>
32#include <linux/list.h>
33#include <linux/rbtree.h>
34#include <linux/workqueue.h>
35#include <linux/kref.h>
36
37#include "cluster/nodemanager.h"
38#include "cluster/heartbeat.h"
39#include "cluster/tcp.h"
40
41#include "dlm/dlmapi.h"
42
43#include "ocfs2_fs.h"
44#include "endian.h"
45#include "ocfs2_lockid.h"
46
/*
 * Extent map: a cluster count plus the root of an rb-tree of extents.
 * NOTE(review): presumably a per-inode cache of extent records -
 * confirm against the users of this structure.
 */
47struct ocfs2_extent_map {
48 u32 em_clusters;
49 struct rb_root em_extents;
50};
51
52/* Most user visible OCFS2 inodes will have very few pieces of
53 * metadata, but larger files (including bitmaps, etc) must be taken
54 * into account when designing an access scheme. We allow a small
55 * amount of inlined blocks to be stored on an array and grow the
56 * structure into a rb tree when necessary. */
57#define OCFS2_INODE_MAX_CACHE_ARRAY 2
58
/*
 * ci_cache is a union: the same storage is either the small inline
 * array of block numbers or the rb-tree root, never both.
 * ci_num_cached counts the cached items.
 */
59struct ocfs2_caching_info {
60 unsigned int ci_num_cached;
61 union {
62 sector_t ci_array[OCFS2_INODE_MAX_CACHE_ARRAY];
63 struct rb_root ci_tree;
64 } ci_cache;
65};
66
67/* this limits us to 256 nodes
68 * if we need more, we can do a kmalloc for the map */
69#define OCFS2_NODE_MAP_MAX_NODES 256
/* Bitmap with one bit per possible node, stored as an array of longs. */
70struct ocfs2_node_map {
71 u16 num_nodes;
72 unsigned long map[BITS_TO_LONGS(OCFS2_NODE_MAP_MAX_NODES)];
73};
74
/*
 * Action recorded in ocfs2_lock_res.l_action, a field that the lock
 * resource marks as used from the AST/BAST functions.  Zero is reserved
 * for "invalid".
 */
75enum ocfs2_ast_action {
76 OCFS2_AST_INVALID = 0,
77 OCFS2_AST_ATTACH,
78 OCFS2_AST_CONVERT,
79 OCFS2_AST_DOWNCONVERT,
80};
81
82/* actions for an unlockast function to take. */
/* Stored in ocfs2_lock_res.l_unlock_action; zero is reserved for "invalid". */
83enum ocfs2_unlock_action {
84 OCFS2_UNLOCK_INVALID = 0,
85 OCFS2_UNLOCK_CANCEL_CONVERT,
86 OCFS2_UNLOCK_DROP_LOCK,
87};
88
89/* ocfs2_lock_res->l_flags flags. */
/* Each value is a distinct single bit, so the flags can be OR'd together. */
90#define OCFS2_LOCK_ATTACHED (0x00000001) /* have we initialized
91 * the lvb */
92#define OCFS2_LOCK_BUSY (0x00000002) /* we are currently in
93 * dlm_lock */
94#define OCFS2_LOCK_BLOCKED (0x00000004) /* blocked waiting to
95 * downconvert*/
96#define OCFS2_LOCK_LOCAL (0x00000008) /* newly created inode */
97#define OCFS2_LOCK_NEEDS_REFRESH (0x00000010)
98#define OCFS2_LOCK_REFRESHING (0x00000020)
99#define OCFS2_LOCK_INITIALIZED (0x00000040) /* track initialization
100 * for shutdown paths */
101#define OCFS2_LOCK_FREEING (0x00000080) /* help dlmglue track
102 * when to skip queueing
103 * a lock because it's
104 * about to be
105 * dropped. */
106#define OCFS2_LOCK_QUEUED (0x00000100) /* queued for downconvert */
107
108struct ocfs2_lock_res_ops;
109
/* Callback type taking a status code and an opaque unsigned long. */
110typedef void (*ocfs2_lock_callback)(int status, unsigned long data);
111
/*
 * One cluster lock resource.  l_flags holds the OCFS2_LOCK_* bits and
 * l_lock is the spinlock embedded alongside them.
 * NOTE(review): presumably l_lock protects the other fields - confirm
 * against dlmglue.
 */
112struct ocfs2_lock_res {
113 void *l_priv;
114 struct ocfs2_lock_res_ops *l_ops;
115 spinlock_t l_lock;
116
117 struct list_head l_blocked_list;
118 struct list_head l_mask_waiters;
119
120 enum ocfs2_lock_type l_type;
121 unsigned long l_flags;
122 char l_name[OCFS2_LOCK_ID_MAX_LEN];
123 int l_level;
124 unsigned int l_ro_holders;
125 unsigned int l_ex_holders;
126 struct dlm_lockstatus l_lksb;
127
128 /* used from AST/BAST funcs. */
129 enum ocfs2_ast_action l_action;
130 enum ocfs2_unlock_action l_unlock_action;
131 int l_requested;
132 int l_blocking;
133
134 wait_queue_head_t l_event;
135
136 struct list_head l_debug_list;
137};
138
/*
 * Reference-counted (kref) debug state: a dentry and a list head.
 * NOTE(review): d_lockres_tracking presumably chains the l_debug_list
 * members of ocfs2_lock_res - confirm.
 */
139struct ocfs2_dlm_debug {
140 struct kref d_refcnt;
141 struct dentry *d_locking_state;
142 struct list_head d_lockres_tracking;
143};
144
/*
 * Volume states.
 * NOTE(review): presumably the values kept in ocfs2_super.vol_state -
 * confirm.
 */
145enum ocfs2_vol_state
146{
147 VOLUME_INIT = 0,
148 VOLUME_MOUNTED,
149 VOLUME_DISMOUNTED,
150 VOLUME_DISABLED
151};
152
/* Allocation statistics; each member is an atomic counter. */
153struct ocfs2_alloc_stats
154{
155 atomic_t moves;
156 atomic_t local_data;
157 atomic_t bitmap_data;
158 atomic_t bg_allocs;
159 atomic_t bg_extends;
160};
161
/* State of the local allocator; the type of ocfs2_super.local_alloc_state. */
162enum ocfs2_local_alloc_state
163{
164 OCFS2_LA_UNUSED = 0,
165 OCFS2_LA_ENABLED,
166 OCFS2_LA_DISABLED
167};
168
/*
 * Mount option bits.  Each option is a distinct single bit; they are
 * tested against ocfs2_super.s_mount_opt.
 */
169enum ocfs2_mount_options
170{
171 OCFS2_MOUNT_HB_LOCAL = 1 << 0, /* Heartbeat started in local mode */
172 OCFS2_MOUNT_BARRIER = 1 << 1, /* Use block barriers */
173 OCFS2_MOUNT_NOINTR = 1 << 2, /* Don't catch signals */
174 OCFS2_MOUNT_ERRORS_PANIC = 1 << 3, /* Panic on errors */
175 OCFS2_MOUNT_DATA_WRITEBACK = 1 << 4, /* No data ordering */
176#ifdef OCFS2_ORACORE_WORKAROUNDS
177 OCFS2_MOUNT_COMPAT_OCFS = 1 << 30, /* ocfs1 compatibility mode */
178#endif
179};
180
/* Bits kept in ocfs2_super.osb_flags; modify them only under osb_lock. */
181#define OCFS2_OSB_SOFT_RO 0x0001
182#define OCFS2_OSB_HARD_RO 0x0002
183#define OCFS2_OSB_ERROR_FS 0x0004
184
185struct ocfs2_journal;
186struct ocfs2_journal_handle;
/*
 * In-memory state of one OCFS2 volume.  OCFS2_SB() obtains it from a
 * VFS super_block by casting sb->s_fs_info.
 */
187struct ocfs2_super
188{
189 u32 osb_id; /* id used by the proc interface */
190 struct task_struct *commit_task;
191 struct super_block *sb;
192 struct inode *root_inode;
193 struct inode *sys_root_inode;
194 struct inode *system_inodes[NUM_SYSTEM_INODES];
195
196 struct ocfs2_slot_info *slot_info;
197
198 spinlock_t node_map_lock;
199 struct ocfs2_node_map mounted_map;
200 struct ocfs2_node_map recovery_map;
201 struct ocfs2_node_map umount_map;
202
203 u32 num_clusters;
204 u64 root_blkno;
205 u64 system_dir_blkno;
206 u64 bitmap_blkno;
207 u32 bitmap_cpg;
208 u8 *uuid;
209 char *uuid_str;
210 u8 *vol_label;
211 u64 first_cluster_group_blkno;
212 u32 fs_generation;
213
214 u32 s_feature_compat;
215 u32 s_feature_incompat;
216 u32 s_feature_ro_compat;
217
218 /* Protects s_next_generation, osb_flags. Could protect more on
219 * osb as it's very short lived. */
220 spinlock_t osb_lock;
221 u32 s_next_generation;
222 unsigned long osb_flags;
223
 /* OCFS2_MOUNT_* bits (enum ocfs2_mount_options) */
224 unsigned long s_mount_opt;
225
226 u16 max_slots;
227 u16 num_nodes;
228 s16 node_num;
229 s16 slot_num;
230 int s_sectsize_bits;
231 int s_clustersize;
232 int s_clustersize_bits;
233 struct proc_dir_entry *proc_sub_dir; /* points to /proc/fs/ocfs2/<maj_min> */
234
235 atomic_t vol_state;
236 struct semaphore recovery_lock;
237 struct task_struct *recovery_thread_task;
238 int disable_recovery;
239 wait_queue_head_t checkpoint_event;
240 atomic_t needs_checkpoint;
241 struct ocfs2_journal *journal;
242
243 enum ocfs2_local_alloc_state local_alloc_state;
244 struct buffer_head *local_alloc_bh;
245
246 /* Next two fields are for local node slot recovery during
247 * mount. */
248 int dirty;
249 struct ocfs2_dinode *local_alloc_copy;
250
251 struct ocfs2_alloc_stats alloc_stats;
252 char dev_str[20]; /* "major,minor" of the device */
253
254 struct dlm_ctxt *dlm;
255 struct ocfs2_lock_res osb_super_lockres;
256 struct ocfs2_lock_res osb_rename_lockres;
257 struct dlm_eviction_cb osb_eviction_cb;
258 struct ocfs2_dlm_debug *osb_dlm_debug;
259
260 struct dentry *osb_debug_root;
261
262 wait_queue_head_t recovery_event;
263
264 spinlock_t vote_task_lock;
265 struct task_struct *vote_task;
266 wait_queue_head_t vote_event;
267 unsigned long vote_wake_sequence;
268 unsigned long vote_work_sequence;
269
270 struct list_head blocked_lock_list;
271 unsigned long blocked_lock_count;
272
273 struct list_head vote_list;
274 int vote_count;
275
276 u32 net_key;
277 spinlock_t net_response_lock;
278 unsigned int net_response_ids;
279 struct list_head net_response_list;
280
281 struct o2hb_callback_func osb_hb_up;
282 struct o2hb_callback_func osb_hb_down;
283
284 struct list_head osb_net_handlers;
285
286 wait_queue_head_t osb_mount_event;
287
288 /* Truncate log info */
289 struct inode *osb_tl_inode;
290 struct buffer_head *osb_tl_bh;
291 struct work_struct osb_truncate_log_wq;
292};
293
/* Fetch the ocfs2_super stored in a VFS super_block's s_fs_info. */
294#define OCFS2_SB(sb) ((struct ocfs2_super *)(sb)->s_fs_info)
295#define OCFS2_MAX_OSB_ID 65536
296
297static inline int ocfs2_should_order_data(struct inode *inode)
298{
299 if (!S_ISREG(inode->i_mode))
300 return 0;
301 if (OCFS2_SB(inode->i_sb)->s_mount_opt & OCFS2_MOUNT_DATA_WRITEBACK)
302 return 0;
303 return 1;
304}
305
306/* set / clear functions because cluster events can make these happen
307 * in parallel so we want the transitions to be atomic. this also
308 * means that any future flags osb_flags must be protected by spinlock
309 * too! */
310static inline void ocfs2_set_osb_flag(struct ocfs2_super *osb,
311 unsigned long flag)
312{
313 spin_lock(&osb->osb_lock);
314 osb->osb_flags |= flag;
315 spin_unlock(&osb->osb_lock);
316}
317
318static inline void ocfs2_set_ro_flag(struct ocfs2_super *osb,
319 int hard)
320{
321 spin_lock(&osb->osb_lock);
322 osb->osb_flags &= ~(OCFS2_OSB_SOFT_RO|OCFS2_OSB_HARD_RO);
323 if (hard)
324 osb->osb_flags |= OCFS2_OSB_HARD_RO;
325 else
326 osb->osb_flags |= OCFS2_OSB_SOFT_RO;
327 spin_unlock(&osb->osb_lock);
328}
329
330static inline int ocfs2_is_hard_readonly(struct ocfs2_super *osb)
331{
332 int ret;
333
334 spin_lock(&osb->osb_lock);
335 ret = osb->osb_flags & OCFS2_OSB_HARD_RO;
336 spin_unlock(&osb->osb_lock);
337
338 return ret;
339}
340
341static inline int ocfs2_is_soft_readonly(struct ocfs2_super *osb)
342{
343 int ret;
344
345 spin_lock(&osb->osb_lock);
346 ret = osb->osb_flags & OCFS2_OSB_SOFT_RO;
347 spin_unlock(&osb->osb_lock);
348
349 return ret;
350}
351
/* True when the signature field of the dinode equals OCFS2_INODE_SIGNATURE. */
#define OCFS2_IS_VALID_DINODE(ptr)					\
	(!strcmp((ptr)->i_signature, OCFS2_INODE_SIGNATURE))

/*
 * Report a dinode with a bad signature through ocfs2_error(), printing
 * its block number and up to 7 characters of the signature.
 *
 * Expands to exactly one statement: the caller writes the trailing ';',
 * which keeps the macro usable as the body of an unbraced if/else.
 */
#define OCFS2_RO_ON_INVALID_DINODE(__sb, __di)	do {			\
	typeof(__di) ____di = (__di);					\
	ocfs2_error((__sb), 						\
		"Dinode # %"MLFu64" has bad signature %.*s",		\
		(____di)->i_blkno, 7,					\
		(____di)->i_signature);					\
} while (0)
362
/* True when the block's signature equals OCFS2_EXTENT_BLOCK_SIGNATURE. */
#define OCFS2_IS_VALID_EXTENT_BLOCK(ptr)				\
	(!strcmp((ptr)->h_signature, OCFS2_EXTENT_BLOCK_SIGNATURE))

/*
 * Report an extent block with a bad signature through ocfs2_error(),
 * printing its block number and up to 7 characters of the signature.
 *
 * Expands to exactly one statement: the caller writes the trailing ';',
 * which keeps the macro usable as the body of an unbraced if/else.
 */
#define OCFS2_RO_ON_INVALID_EXTENT_BLOCK(__sb, __eb)	do {		\
	typeof(__eb) ____eb = (__eb);					\
	ocfs2_error((__sb), 						\
		"Extent Block # %"MLFu64" has bad signature %.*s",	\
		(____eb)->h_blkno, 7,					\
		(____eb)->h_signature);					\
} while (0)
373
/* True when the descriptor's signature equals OCFS2_GROUP_DESC_SIGNATURE. */
#define OCFS2_IS_VALID_GROUP_DESC(ptr)					\
	(!strcmp((ptr)->bg_signature, OCFS2_GROUP_DESC_SIGNATURE))

/*
 * Report a group descriptor with a bad signature through ocfs2_error(),
 * printing its block number and up to 7 characters of the signature.
 *
 * Expands to exactly one statement: the caller writes the trailing ';',
 * which keeps the macro usable as the body of an unbraced if/else.
 */
#define OCFS2_RO_ON_INVALID_GROUP_DESC(__sb, __gd)	do {		\
	typeof(__gd) ____gd = (__gd);					\
	ocfs2_error((__sb), 						\
		"Group Descriptor # %"MLFu64" has bad signature %.*s",	\
		(____gd)->bg_blkno, 7,					\
		(____gd)->bg_signature);				\
} while (0)
384
385static inline unsigned long ino_from_blkno(struct super_block *sb,
386 u64 blkno)
387{
388 return (unsigned long)(blkno & (u64)ULONG_MAX);
389}
390
391static inline u64 ocfs2_clusters_to_blocks(struct super_block *sb,
392 u32 clusters)
393{
394 int c_to_b_bits = OCFS2_SB(sb)->s_clustersize_bits -
395 sb->s_blocksize_bits;
396
397 return (u64)clusters << c_to_b_bits;
398}
399
400static inline u32 ocfs2_blocks_to_clusters(struct super_block *sb,
401 u64 blocks)
402{
403 int b_to_c_bits = OCFS2_SB(sb)->s_clustersize_bits -
404 sb->s_blocksize_bits;
405
406 return (u32)(blocks >> b_to_c_bits);
407}
408
409static inline unsigned int ocfs2_clusters_for_bytes(struct super_block *sb,
410 u64 bytes)
411{
412 int cl_bits = OCFS2_SB(sb)->s_clustersize_bits;
413 unsigned int clusters;
414
415 bytes += OCFS2_SB(sb)->s_clustersize - 1;
416 /* OCFS2 just cannot have enough clusters to overflow this */
417 clusters = (unsigned int)(bytes >> cl_bits);
418
419 return clusters;
420}
421
422static inline u64 ocfs2_blocks_for_bytes(struct super_block *sb,
423 u64 bytes)
424{
425 bytes += sb->s_blocksize - 1;
426 return bytes >> sb->s_blocksize_bits;
427}
428
429static inline u64 ocfs2_clusters_to_bytes(struct super_block *sb,
430 u32 clusters)
431{
432 return (u64)clusters << OCFS2_SB(sb)->s_clustersize_bits;
433}
434
435static inline u64 ocfs2_align_bytes_to_clusters(struct super_block *sb,
436 u64 bytes)
437{
438 int cl_bits = OCFS2_SB(sb)->s_clustersize_bits;
439 unsigned int clusters;
440
441 clusters = ocfs2_clusters_for_bytes(sb, bytes);
442 return (u64)clusters << cl_bits;
443}
444
445static inline u64 ocfs2_align_bytes_to_blocks(struct super_block *sb,
446 u64 bytes)
447{
448 u64 blocks;
449
450 blocks = ocfs2_blocks_for_bytes(sb, bytes);
451 return blocks << sb->s_blocksize_bits;
452}
453
454static inline unsigned long ocfs2_align_bytes_to_sectors(u64 bytes)
455{
456 return (unsigned long)((bytes + 511) >> 9);
457}
458
/* OCFS2 bitmap helpers are plain aliases for the ext2 bit operations. */
459#define ocfs2_set_bit ext2_set_bit
460#define ocfs2_clear_bit ext2_clear_bit
461#define ocfs2_test_bit ext2_test_bit
462#define ocfs2_find_next_zero_bit ext2_find_next_zero_bit
463#endif /* OCFS2_H */
464
diff --git a/fs/ocfs2/ocfs2_fs.h b/fs/ocfs2/ocfs2_fs.h
new file mode 100644
index 000000000000..dfb8a5bedfc8
--- /dev/null
+++ b/fs/ocfs2/ocfs2_fs.h
@@ -0,0 +1,638 @@
1/* -*- mode: c; c-basic-offset: 8; -*-
2 * vim: noexpandtab sw=8 ts=8 sts=0:
3 *
4 * ocfs2_fs.h
5 *
6 * On-disk structures for OCFS2.
7 *
8 * Copyright (C) 2002, 2004 Oracle. All rights reserved.
9 *
10 * This program is free software; you can redistribute it and/or
11 * modify it under the terms of the GNU General Public
12 * License, version 2, as published by the Free Software Foundation.
13 *
14 * This program is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 * General Public License for more details.
18 *
19 * You should have received a copy of the GNU General Public
20 * License along with this program; if not, write to the
21 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
22 * Boston, MA 02111-1307, USA.
23 */
24
25#ifndef _OCFS2_FS_H
26#define _OCFS2_FS_H
27
28/* Version */
29#define OCFS2_MAJOR_REV_LEVEL 0
30#define OCFS2_MINOR_REV_LEVEL 90
31
32/*
33 * An OCFS2 volume starts this way:
34 * Sector 0: Valid ocfs1_vol_disk_hdr that cleanly fails to mount OCFS.
35 * Sector 1: Valid ocfs1_vol_label that cleanly fails to mount OCFS.
36 * Block OCFS2_SUPER_BLOCK_BLKNO: OCFS2 superblock.
37 *
38 * All other structures are found from the superblock information.
39 *
40 * OCFS2_SUPER_BLOCK_BLKNO is in blocks, not sectors. eg, for a
41 * blocksize of 2K, it is 4096 bytes into disk.
42 */
43#define OCFS2_SUPER_BLOCK_BLKNO 2
44
45/*
46 * Cluster size limits. The maximum is kept arbitrarily at 1 MB, and could
47 * grow if needed.
48 */
49#define OCFS2_MIN_CLUSTERSIZE 4096
50#define OCFS2_MAX_CLUSTERSIZE 1048576
51
52/*
53 * Blocks cannot be bigger than clusters, so the maximum blocksize is the
54 * minimum cluster size.
55 */
56#define OCFS2_MIN_BLOCKSIZE 512
57#define OCFS2_MAX_BLOCKSIZE OCFS2_MIN_CLUSTERSIZE
58
59/* Filesystem magic number */
60#define OCFS2_SUPER_MAGIC 0x7461636f
61
62/* Object signatures */
63#define OCFS2_SUPER_BLOCK_SIGNATURE "OCFSV2"
64#define OCFS2_INODE_SIGNATURE "INODE01"
65#define OCFS2_EXTENT_BLOCK_SIGNATURE "EXBLK01"
66#define OCFS2_GROUP_DESC_SIGNATURE "GROUP01"
67
68/* Compatibility flags */
/*
 * All of these operate on the feature words reached through
 * OCFS2_SB(sb).
 *
 * OCFS2_HAS_*_FEATURE evaluates to the bits of the feature word selected
 * by mask, so it is non-zero when any bit in mask is set.
 *
 * OCFS2_SET_*_FEATURE and OCFS2_CLEAR_*_FEATURE expand to a bare
 * compound assignment that is not wrapped in parentheses; use them as
 * standalone statements rather than inside larger expressions.
 */
69#define OCFS2_HAS_COMPAT_FEATURE(sb,mask) \
70 ( OCFS2_SB(sb)->s_feature_compat & (mask) )
71#define OCFS2_HAS_RO_COMPAT_FEATURE(sb,mask) \
72 ( OCFS2_SB(sb)->s_feature_ro_compat & (mask) )
73#define OCFS2_HAS_INCOMPAT_FEATURE(sb,mask) \
74 ( OCFS2_SB(sb)->s_feature_incompat & (mask) )
75#define OCFS2_SET_COMPAT_FEATURE(sb,mask) \
76 OCFS2_SB(sb)->s_feature_compat |= (mask)
77#define OCFS2_SET_RO_COMPAT_FEATURE(sb,mask) \
78 OCFS2_SB(sb)->s_feature_ro_compat |= (mask)
79#define OCFS2_SET_INCOMPAT_FEATURE(sb,mask) \
80 OCFS2_SB(sb)->s_feature_incompat |= (mask)
81#define OCFS2_CLEAR_COMPAT_FEATURE(sb,mask) \
82 OCFS2_SB(sb)->s_feature_compat &= ~(mask)
83#define OCFS2_CLEAR_RO_COMPAT_FEATURE(sb,mask) \
84 OCFS2_SB(sb)->s_feature_ro_compat &= ~(mask)
85#define OCFS2_CLEAR_INCOMPAT_FEATURE(sb,mask) \
86 OCFS2_SB(sb)->s_feature_incompat &= ~(mask)
87
88#define OCFS2_FEATURE_COMPAT_SUPP 0
89#define OCFS2_FEATURE_INCOMPAT_SUPP 0
90#define OCFS2_FEATURE_RO_COMPAT_SUPP 0
91
92/*
93 * Heartbeat-only devices are missing journals and other files. The
94 * filesystem driver can't load them, but the library can. Never put
95 * this in OCFS2_FEATURE_INCOMPAT_SUPP, *ever*.
96 */
97#define OCFS2_FEATURE_INCOMPAT_HEARTBEAT_DEV 0x0002
98
99
100/*
101 * Flags on ocfs2_dinode.i_flags
102 */
103#define OCFS2_VALID_FL (0x00000001) /* Inode is valid */
104#define OCFS2_UNUSED2_FL (0x00000002)
105#define OCFS2_ORPHANED_FL (0x00000004) /* On the orphan list */
106#define OCFS2_UNUSED3_FL (0x00000008)
107/* System inode flags */
108#define OCFS2_SYSTEM_FL (0x00000010) /* System inode */
109#define OCFS2_SUPER_BLOCK_FL (0x00000020) /* Super block */
110#define OCFS2_LOCAL_ALLOC_FL (0x00000040) /* Slot local alloc bitmap */
111#define OCFS2_BITMAP_FL (0x00000080) /* Allocation bitmap */
112#define OCFS2_JOURNAL_FL (0x00000100) /* Slot local journal */
113#define OCFS2_HEARTBEAT_FL (0x00000200) /* Heartbeat area */
114#define OCFS2_CHAIN_FL (0x00000400) /* Chain allocator */
115#define OCFS2_DEALLOC_FL (0x00000800) /* Truncate log */
116
117/*
118 * Journal Flags (ocfs2_dinode.id1.journal1.i_flags)
119 */
120#define OCFS2_JOURNAL_DIRTY_FL (0x00000001) /* Journal needs recovery */
121
122/*
123 * superblock s_state flags
124 */
125#define OCFS2_ERROR_FS (0x00000001) /* FS saw errors */
126
127/* Limit of space in ocfs2_dir_entry */
128#define OCFS2_MAX_FILENAME_LEN 255
129
130/* Maximum slots on an ocfs2 file system */
131#define OCFS2_MAX_SLOTS 255
132
133/* Slot map indicator for an empty slot */
134#define OCFS2_INVALID_SLOT -1
135
136#define OCFS2_VOL_UUID_LEN 16
137#define OCFS2_MAX_VOL_LABEL_LEN 64
138
139/* Journal limits (in bytes) */
140#define OCFS2_MIN_JOURNAL_SIZE (4 * 1024 * 1024)
141#define OCFS2_MAX_JOURNAL_SIZE (500 * 1024 * 1024)
142
/*
 * Static description of one system inode; see the ocfs2_system_inodes
 * table.
 *
 * si_name:   file name.  For the per-slot entries in the table this is a
 *            printf-style format with a single integer conversion that
 *            receives the slot number.
 * si_iflags: OCFS2_*_FL flags for the inode (0 when none apply).
 * si_mode:   file type (S_IF*) or'ed with the permission bits.
 */
143struct ocfs2_system_inode_info {
144 char *si_name;
145 int si_iflags;
146 int si_mode;
147};
148
149/* System file index */
/*
 * The enumerators index ocfs2_system_inodes[], which NUM_SYSTEM_INODES
 * sizes.  Entries up to and including OCFS2_LAST_GLOBAL_SYSTEM_INODE
 * exist once per volume; every entry after it has one copy per slot.
 * OCFS2_FIRST_ONLINE_SYSTEM_INODE marks the first entry used by the
 * running filesystem; the two entries before it are only used from
 * userspace.
 */
150enum {
151 BAD_BLOCK_SYSTEM_INODE = 0,
152 GLOBAL_INODE_ALLOC_SYSTEM_INODE,
153 SLOT_MAP_SYSTEM_INODE,
154#define OCFS2_FIRST_ONLINE_SYSTEM_INODE SLOT_MAP_SYSTEM_INODE
155 HEARTBEAT_SYSTEM_INODE,
156 GLOBAL_BITMAP_SYSTEM_INODE,
157#define OCFS2_LAST_GLOBAL_SYSTEM_INODE GLOBAL_BITMAP_SYSTEM_INODE
158 ORPHAN_DIR_SYSTEM_INODE,
159 EXTENT_ALLOC_SYSTEM_INODE,
160 INODE_ALLOC_SYSTEM_INODE,
161 JOURNAL_SYSTEM_INODE,
162 LOCAL_ALLOC_SYSTEM_INODE,
163 TRUNCATE_LOG_SYSTEM_INODE,
164 NUM_SYSTEM_INODES
165};
166
167static struct ocfs2_system_inode_info ocfs2_system_inodes[NUM_SYSTEM_INODES] = {
168 /* Global system inodes (single copy) */
169 /* The first two are only used from userspace mfks/tunefs */
170 [BAD_BLOCK_SYSTEM_INODE] = { "bad_blocks", 0, S_IFREG | 0644 },
171 [GLOBAL_INODE_ALLOC_SYSTEM_INODE] = { "global_inode_alloc", OCFS2_BITMAP_FL | OCFS2_CHAIN_FL, S_IFREG | 0644 },
172
173 /* These are used by the running filesystem */
174 [SLOT_MAP_SYSTEM_INODE] = { "slot_map", 0, S_IFREG | 0644 },
175 [HEARTBEAT_SYSTEM_INODE] = { "heartbeat", OCFS2_HEARTBEAT_FL, S_IFREG | 0644 },
176 [GLOBAL_BITMAP_SYSTEM_INODE] = { "global_bitmap", 0, S_IFREG | 0644 },
177
178 /* Slot-specific system inodes (one copy per slot) */
179 [ORPHAN_DIR_SYSTEM_INODE] = { "orphan_dir:%04d", 0, S_IFDIR | 0755 },
180 [EXTENT_ALLOC_SYSTEM_INODE] = { "extent_alloc:%04d", OCFS2_BITMAP_FL | OCFS2_CHAIN_FL, S_IFREG | 0644 },
181 [INODE_ALLOC_SYSTEM_INODE] = { "inode_alloc:%04d", OCFS2_BITMAP_FL | OCFS2_CHAIN_FL, S_IFREG | 0644 },
182 [JOURNAL_SYSTEM_INODE] = { "journal:%04d", OCFS2_JOURNAL_FL, S_IFREG | 0644 },
183 [LOCAL_ALLOC_SYSTEM_INODE] = { "local_alloc:%04d", OCFS2_BITMAP_FL | OCFS2_LOCAL_ALLOC_FL, S_IFREG | 0644 },
184 [TRUNCATE_LOG_SYSTEM_INODE] = { "truncate_log:%04d", OCFS2_DEALLOC_FL, S_IFREG | 0644 }
185};
186
187/* Parameter passed from mount.ocfs2 to module */
188#define OCFS2_HB_NONE "heartbeat=none"
189#define OCFS2_HB_LOCAL "heartbeat=local"
190
191/*
192 * OCFS2 directory file types. Only the low 3 bits are used. The
193 * other bits are reserved for now.
194 */
195#define OCFS2_FT_UNKNOWN 0
196#define OCFS2_FT_REG_FILE 1
197#define OCFS2_FT_DIR 2
198#define OCFS2_FT_CHRDEV 3
199#define OCFS2_FT_BLKDEV 4
200#define OCFS2_FT_FIFO 5
201#define OCFS2_FT_SOCK 6
202#define OCFS2_FT_SYMLINK 7
203
204#define OCFS2_FT_MAX 8
205
206/*
207 * OCFS2_DIR_PAD defines the directory entries boundaries
208 *
209 * NOTE: It must be a multiple of 4
210 */
211#define OCFS2_DIR_PAD 4
212#define OCFS2_DIR_ROUND (OCFS2_DIR_PAD - 1)
213#define OCFS2_DIR_MEMBER_LEN offsetof(struct ocfs2_dir_entry, name)
214#define OCFS2_DIR_REC_LEN(name_len) (((name_len) + OCFS2_DIR_MEMBER_LEN + \
215 OCFS2_DIR_ROUND) & \
216 ~OCFS2_DIR_ROUND)
217
218#define OCFS2_LINK_MAX 32000
219
220#define S_SHIFT 12
221static unsigned char ocfs2_type_by_mode[S_IFMT >> S_SHIFT] = {
222 [S_IFREG >> S_SHIFT] = OCFS2_FT_REG_FILE,
223 [S_IFDIR >> S_SHIFT] = OCFS2_FT_DIR,
224 [S_IFCHR >> S_SHIFT] = OCFS2_FT_CHRDEV,
225 [S_IFBLK >> S_SHIFT] = OCFS2_FT_BLKDEV,
226 [S_IFIFO >> S_SHIFT] = OCFS2_FT_FIFO,
227 [S_IFSOCK >> S_SHIFT] = OCFS2_FT_SOCK,
228 [S_IFLNK >> S_SHIFT] = OCFS2_FT_SYMLINK,
229};
230
231
232/*
233 * Convenience casts
234 */
235#define OCFS2_RAW_SB(dinode) (&((dinode)->id2.i_super))
236
237/*
238 * On disk extent record for OCFS2
239 * It describes a range of clusters on disk.
 *
 * All fields are stored little-endian.  Per the offset markers the
 * record occupies 0x10 bytes.  It is the element type of
 * ocfs2_extent_list.l_recs.
240 */
241struct ocfs2_extent_rec {
242/*00*/ __le32 e_cpos; /* Offset into the file, in clusters */
243 __le32 e_clusters; /* Clusters covered by this extent */
244 __le64 e_blkno; /* Physical disk offset, in blocks */
245/*10*/
246};
247
/*
 * On disk record describing one allocation chain.  All fields are stored
 * little-endian.  It is the element type of ocfs2_chain_list.cl_recs.
 */
248struct ocfs2_chain_rec {
249 __le32 c_free; /* Number of free bits in this chain. */
250 __le32 c_total; /* Number of total bits in this chain */
251 __le64 c_blkno; /* Physical disk offset (blocks) of 1st group */
252};
253
/*
 * On disk record describing one run of clusters in the truncate log.
 * All fields are stored little-endian.  It is the element type of
 * ocfs2_truncate_log.tl_recs.
 */
254struct ocfs2_truncate_rec {
255 __le32 t_start; /* 1st cluster in this log */
256 __le32 t_clusters; /* Number of total clusters covered */
257};
258
259/*
260 * On disk extent list for OCFS2 (node in the tree). Note that this
261 * is contained inside ocfs2_dinode or ocfs2_extent_block, so the
262 * offsets are relative to ocfs2_dinode.id2.i_list or
263 * ocfs2_extent_block.h_list, respectively.
264 */
265struct ocfs2_extent_list {
266/*00*/ __le16 l_tree_depth; /* Extent tree depth from this
267 point. 0 means data extents
268 hang directly off this
269 header (a leaf) */
270 __le16 l_count; /* Number of extent records */
271 __le16 l_next_free_rec; /* Next unused extent slot */
272 __le16 l_reserved1;
273 __le64 l_reserved2; /* Pad to
274 sizeof(ocfs2_extent_rec) */
275/*10*/ struct ocfs2_extent_rec l_recs[0]; /* Extent records */
276};
277
278/*
279 * On disk allocation chain list for OCFS2. Note that this is
280 * contained inside ocfs2_dinode, so the offsets are relative to
281 * ocfs2_dinode.id2.i_chain.
282 */
283struct ocfs2_chain_list {
284/*00*/ __le16 cl_cpg; /* Clusters per Block Group */
285 __le16 cl_bpc; /* Bits per cluster */
286 __le16 cl_count; /* Total chains in this list */
287 __le16 cl_next_free_rec; /* Next unused chain slot */
288 __le64 cl_reserved1;
289/*10*/ struct ocfs2_chain_rec cl_recs[0]; /* Chain records */
290};
291
292/*
293 * On disk deallocation log for OCFS2. Note that this is
294 * contained inside ocfs2_dinode, so the offsets are relative to
295 * ocfs2_dinode.id2.i_dealloc.
296 */
297struct ocfs2_truncate_log {
298/*00*/ __le16 tl_count; /* Total records in this log */
299 __le16 tl_used; /* Number of records in use */
300 __le32 tl_reserved1;
301/*08*/ struct ocfs2_truncate_rec tl_recs[0]; /* Truncate records */
302};
303
304/*
305 * On disk extent block (indirect block) for OCFS2
306 */
307struct ocfs2_extent_block
308{
309/*00*/ __u8 h_signature[8]; /* Signature for verification */
310 __le64 h_reserved1;
311/*10*/ __le16 h_suballoc_slot; /* Slot suballocator this
312 extent_header belongs to */
313 __le16 h_suballoc_bit; /* Bit offset in suballocator
314 block group */
315 __le32 h_fs_generation; /* Must match super block */
316 __le64 h_blkno; /* Offset on disk, in blocks */
317/*20*/ __le64 h_reserved3;
318 __le64 h_next_leaf_blk; /* Offset on disk, in blocks,
319 of next leaf header pointing
320 to data */
321/*30*/ struct ocfs2_extent_list h_list; /* Extent record list */
322/* Actual on-disk size is one block */
323};
324
325/*
326 * On disk superblock for OCFS2
327 * Note that it is contained inside an ocfs2_dinode, so all offsets
328 * are relative to the start of ocfs2_dinode.id2.
329 */
330struct ocfs2_super_block {
331/*00*/ __le16 s_major_rev_level;
332 __le16 s_minor_rev_level;
333 __le16 s_mnt_count;
334 __le16 s_max_mnt_count;
335 __le16 s_state; /* File system state */
336 __le16 s_errors; /* Behaviour when detecting errors */
337 __le32 s_checkinterval; /* Max time between checks */
338/*10*/ __le64 s_lastcheck; /* Time of last check */
339 __le32 s_creator_os; /* OS */
340 __le32 s_feature_compat; /* Compatible feature set */
341/*20*/ __le32 s_feature_incompat; /* Incompatible feature set */
342 __le32 s_feature_ro_compat; /* Readonly-compatible feature set */
343 __le64 s_root_blkno; /* Offset, in blocks, of root directory
344 dinode */
345/*30*/ __le64 s_system_dir_blkno; /* Offset, in blocks, of system
346 directory dinode */
347 __le32 s_blocksize_bits; /* Blocksize for this fs */
348 __le32 s_clustersize_bits; /* Clustersize for this fs */
349/*40*/ __le16 s_max_slots; /* Max number of simultaneous mounts
350 before tunefs required */
351 __le16 s_reserved1;
352 __le32 s_reserved2;
353 __le64 s_first_cluster_group; /* Block offset of 1st cluster
354 * group header */
355/*50*/ __u8 s_label[OCFS2_MAX_VOL_LABEL_LEN]; /* Label for mounting, etc. */
356/*90*/ __u8 s_uuid[OCFS2_VOL_UUID_LEN]; /* 128-bit uuid */
357/*A0*/
358};
359
360/*
361 * Local allocation bitmap for OCFS2 slots
362 * Note that it exists inside an ocfs2_dinode, so all offsets are
363 * relative to the start of ocfs2_dinode.id2.
364 */
365struct ocfs2_local_alloc
366{
367/*00*/ __le32 la_bm_off; /* Starting bit offset in main bitmap */
368 __le16 la_size; /* Size of included bitmap, in bytes */
369 __le16 la_reserved1;
370 __le64 la_reserved2;
371/*10*/ __u8 la_bitmap[0];
372};
373
374/*
375 * On disk inode for OCFS2
376 */
377struct ocfs2_dinode {
378/*00*/ __u8 i_signature[8]; /* Signature for validation */
379 __le32 i_generation; /* Generation number */
380 __le16 i_suballoc_slot; /* Slot suballocator this inode
381 belongs to */
382 __le16 i_suballoc_bit; /* Bit offset in suballocator
383 block group */
384/*10*/ __le32 i_reserved0;
385 __le32 i_clusters; /* Cluster count */
386 __le32 i_uid; /* Owner UID */
387 __le32 i_gid; /* Owning GID */
388/*20*/ __le64 i_size; /* Size in bytes */
389 __le16 i_mode; /* File mode */
390 __le16 i_links_count; /* Links count */
391 __le32 i_flags; /* File flags */
392/*30*/ __le64 i_atime; /* Access time */
393 __le64 i_ctime; /* Creation time */
394/*40*/ __le64 i_mtime; /* Modification time */
395 __le64 i_dtime; /* Deletion time */
396/*50*/ __le64 i_blkno; /* Offset on disk, in blocks */
397 __le64 i_last_eb_blk; /* Pointer to last extent
398 block */
399/*60*/ __le32 i_fs_generation; /* Generation per fs-instance */
400 __le32 i_atime_nsec;
401 __le32 i_ctime_nsec;
402 __le32 i_mtime_nsec;
403/*70*/ __le64 i_reserved1[9];
404/*B8*/ union {
405 __le64 i_pad1; /* Generic way to refer to this
406 64bit union */
407 struct {
408 __le64 i_rdev; /* Device number */
409 } dev1;
410 struct { /* Info for bitmap system
411 inodes */
412 __le32 i_used; /* Bits (ie, clusters) used */
413 __le32 i_total; /* Total bits (clusters)
414 available */
415 } bitmap1;
416 struct { /* Info for journal system
417 inodes */
418 __le32 ij_flags; /* Mounted, version, etc. */
419 __le32 ij_pad;
420 } journal1;
421 } id1; /* Inode type dependant 1 */
422/*C0*/ union {
423 struct ocfs2_super_block i_super;
424 struct ocfs2_local_alloc i_lab;
425 struct ocfs2_chain_list i_chain;
426 struct ocfs2_extent_list i_list;
427 struct ocfs2_truncate_log i_dealloc;
428 __u8 i_symlink[0];
429 } id2;
430/* Actual on-disk size is one block */
431};
432
433/*
434 * On-disk directory entry structure for OCFS2
435 *
436 * Packed as this structure could be accessed unaligned on 64-bit platforms
 *
 * The fixed fields in front of name occupy 0x0C bytes.  The name array
 * is declared at its maximum length; the space an entry really takes on
 * disk is given by rec_len.  OCFS2_DIR_REC_LEN() computes the padded
 * length needed for a given name length.
437 */
438struct ocfs2_dir_entry {
439/*00*/ __le64 inode; /* Inode number */
440 __le16 rec_len; /* Directory entry length */
441 __u8 name_len; /* Name length */
442 __u8 file_type;
443/*0C*/ char name[OCFS2_MAX_FILENAME_LEN]; /* File name */
444/* Actual on-disk length specified by rec_len */
445} __attribute__ ((packed));
446
447/*
448 * On disk allocator group structure for OCFS2
449 */
450struct ocfs2_group_desc
451{
452/*00*/ __u8 bg_signature[8]; /* Signature for validation */
453 __le16 bg_size; /* Size of included bitmap in
454 bytes. */
455 __le16 bg_bits; /* Bits represented by this
456 group. */
457 __le16 bg_free_bits_count; /* Free bits count */
458 __le16 bg_chain; /* What chain I am in. */
459/*10*/ __le32 bg_generation;
460 __le32 bg_reserved1;
461 __le64 bg_next_group; /* Next group in my list, in
462 blocks */
463/*20*/ __le64 bg_parent_dinode; /* dinode which owns me, in
464 blocks */
465 __le64 bg_blkno; /* Offset on disk, in blocks */
466/*30*/ __le64 bg_reserved2[2];
467/*40*/ __u8 bg_bitmap[0];
468};
469
470#ifdef __KERNEL__
471static inline int ocfs2_fast_symlink_chars(struct super_block *sb)
472{
473 return sb->s_blocksize -
474 offsetof(struct ocfs2_dinode, id2.i_symlink);
475}
476
477static inline int ocfs2_extent_recs_per_inode(struct super_block *sb)
478{
479 int size;
480
481 size = sb->s_blocksize -
482 offsetof(struct ocfs2_dinode, id2.i_list.l_recs);
483
484 return size / sizeof(struct ocfs2_extent_rec);
485}
486
487static inline int ocfs2_chain_recs_per_inode(struct super_block *sb)
488{
489 int size;
490
491 size = sb->s_blocksize -
492 offsetof(struct ocfs2_dinode, id2.i_chain.cl_recs);
493
494 return size / sizeof(struct ocfs2_chain_rec);
495}
496
497static inline u16 ocfs2_extent_recs_per_eb(struct super_block *sb)
498{
499 int size;
500
501 size = sb->s_blocksize -
502 offsetof(struct ocfs2_extent_block, h_list.l_recs);
503
504 return size / sizeof(struct ocfs2_extent_rec);
505}
506
507static inline u16 ocfs2_local_alloc_size(struct super_block *sb)
508{
509 u16 size;
510
511 size = sb->s_blocksize -
512 offsetof(struct ocfs2_dinode, id2.i_lab.la_bitmap);
513
514 return size;
515}
516
517static inline int ocfs2_group_bitmap_size(struct super_block *sb)
518{
519 int size;
520
521 size = sb->s_blocksize -
522 offsetof(struct ocfs2_group_desc, bg_bitmap);
523
524 return size;
525}
526
527static inline int ocfs2_truncate_recs_per_inode(struct super_block *sb)
528{
529 int size;
530
531 size = sb->s_blocksize -
532 offsetof(struct ocfs2_dinode, id2.i_dealloc.tl_recs);
533
534 return size / sizeof(struct ocfs2_truncate_rec);
535}
536#else
537static inline int ocfs2_fast_symlink_chars(int blocksize)
538{
539 return blocksize - offsetof(struct ocfs2_dinode, id2.i_symlink);
540}
541
542static inline int ocfs2_extent_recs_per_inode(int blocksize)
543{
544 int size;
545
546 size = blocksize -
547 offsetof(struct ocfs2_dinode, id2.i_list.l_recs);
548
549 return size / sizeof(struct ocfs2_extent_rec);
550}
551
552static inline int ocfs2_chain_recs_per_inode(int blocksize)
553{
554 int size;
555
556 size = blocksize -
557 offsetof(struct ocfs2_dinode, id2.i_chain.cl_recs);
558
559 return size / sizeof(struct ocfs2_chain_rec);
560}
561
562static inline int ocfs2_extent_recs_per_eb(int blocksize)
563{
564 int size;
565
566 size = blocksize -
567 offsetof(struct ocfs2_extent_block, h_list.l_recs);
568
569 return size / sizeof(struct ocfs2_extent_rec);
570}
571
572static inline int ocfs2_local_alloc_size(int blocksize)
573{
574 int size;
575
576 size = blocksize -
577 offsetof(struct ocfs2_dinode, id2.i_lab.la_bitmap);
578
579 return size;
580}
581
582static inline int ocfs2_group_bitmap_size(int blocksize)
583{
584 int size;
585
586 size = blocksize -
587 offsetof(struct ocfs2_group_desc, bg_bitmap);
588
589 return size;
590}
591
592static inline int ocfs2_truncate_recs_per_inode(int blocksize)
593{
594 int size;
595
596 size = blocksize -
597 offsetof(struct ocfs2_dinode, id2.i_dealloc.tl_recs);
598
599 return size / sizeof(struct ocfs2_truncate_rec);
600}
601#endif /* __KERNEL__ */
602
603
604static inline int ocfs2_system_inode_is_global(int type)
605{
606 return ((type >= 0) &&
607 (type <= OCFS2_LAST_GLOBAL_SYSTEM_INODE));
608}
609
610static inline int ocfs2_sprintf_system_inode_name(char *buf, int len,
611 int type, int slot)
612{
613 int chars;
614
615 /*
616 * Global system inodes can only have one copy. Everything
617 * after OCFS2_LAST_GLOBAL_SYSTEM_INODE in the system inode
618 * list has a copy per slot.
619 */
620 if (type <= OCFS2_LAST_GLOBAL_SYSTEM_INODE)
621 chars = snprintf(buf, len,
622 ocfs2_system_inodes[type].si_name);
623 else
624 chars = snprintf(buf, len,
625 ocfs2_system_inodes[type].si_name,
626 slot);
627
628 return chars;
629}
630
631static inline void ocfs2_set_de_type(struct ocfs2_dir_entry *de,
632 umode_t mode)
633{
634 de->file_type = ocfs2_type_by_mode[(mode & S_IFMT)>>S_SHIFT];
635}
636
637#endif /* _OCFS2_FS_H */
638
diff --git a/fs/ocfs2/ocfs2_lockid.h b/fs/ocfs2/ocfs2_lockid.h
new file mode 100644
index 000000000000..7dd9e1e705b0
--- /dev/null
+++ b/fs/ocfs2/ocfs2_lockid.h
@@ -0,0 +1,73 @@
1/* -*- mode: c; c-basic-offset: 8; -*-
2 * vim: noexpandtab sw=8 ts=8 sts=0:
3 *
4 * ocfs2_lockid.h
5 *
6 * Defines OCFS2 lockid bits.
7 *
8 * Copyright (C) 2002, 2005 Oracle. All rights reserved.
9 *
10 * This program is free software; you can redistribute it and/or
11 * modify it under the terms of the GNU General Public
12 * License as published by the Free Software Foundation; either
13 * version 2 of the License, or (at your option) any later version.
14 *
15 * This program is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
18 * General Public License for more details.
19 *
20 * You should have received a copy of the GNU General Public
21 * License along with this program; if not, write to the
22 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
22 * Boston, MA 02111-1307, USA.
24 */
25
26#ifndef OCFS2_LOCKID_H
27#define OCFS2_LOCKID_H
28
29/* lock ids are made up in the following manner:
30 * name[0] --> type
31 * name[1-6] --> 6 pad characters, reserved for now
32 * name[7-22] --> block number, expressed in hex as 16 chars
33 * name[23-30] --> i_generation, expressed in hex 8 chars
34 * name[31] --> '\0' */
35#define OCFS2_LOCK_ID_MAX_LEN 32
36#define OCFS2_LOCK_ID_PAD "000000"
37
/*
 * Kinds of lock.  ocfs2_lock_type_char() maps each one to the character
 * stored in name[0] of a lock id.  OCFS2_NUM_LOCK_TYPES comes last and
 * so equals the number of lock types; it has no character of its own.
 */
38enum ocfs2_lock_type {
39 OCFS2_LOCK_TYPE_META = 0,
40 OCFS2_LOCK_TYPE_DATA,
41 OCFS2_LOCK_TYPE_SUPER,
42 OCFS2_LOCK_TYPE_RENAME,
43 OCFS2_LOCK_TYPE_RW,
44 OCFS2_NUM_LOCK_TYPES
45};
46
47static inline char ocfs2_lock_type_char(enum ocfs2_lock_type type)
48{
49 char c;
50 switch (type) {
51 case OCFS2_LOCK_TYPE_META:
52 c = 'M';
53 break;
54 case OCFS2_LOCK_TYPE_DATA:
55 c = 'D';
56 break;
57 case OCFS2_LOCK_TYPE_SUPER:
58 c = 'S';
59 break;
60 case OCFS2_LOCK_TYPE_RENAME:
61 c = 'R';
62 break;
63 case OCFS2_LOCK_TYPE_RW:
64 c = 'W';
65 break;
66 default:
67 c = '\0';
68 }
69
70 return c;
71}
72
73#endif /* OCFS2_LOCKID_H */
diff --git a/fs/ocfs2/slot_map.c b/fs/ocfs2/slot_map.c
new file mode 100644
index 000000000000..871627961d6d
--- /dev/null
+++ b/fs/ocfs2/slot_map.c
@@ -0,0 +1,303 @@
1/* -*- mode: c; c-basic-offset: 8; -*-
2 * vim: noexpandtab sw=8 ts=8 sts=0:
3 *
4 * slot_map.c
5 *
6 *
7 *
8 * Copyright (C) 2002, 2004 Oracle. All rights reserved.
9 *
10 * This program is free software; you can redistribute it and/or
11 * modify it under the terms of the GNU General Public
12 * License as published by the Free Software Foundation; either
13 * version 2 of the License, or (at your option) any later version.
14 *
15 * This program is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
18 * General Public License for more details.
19 *
20 * You should have received a copy of the GNU General Public
21 * License along with this program; if not, write to the
22 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
22 * Boston, MA 02111-1307, USA.
24 */
25
26#include <linux/types.h>
27#include <linux/slab.h>
28#include <linux/highmem.h>
29#include <linux/smp_lock.h>
30
31#define MLOG_MASK_PREFIX ML_SUPER
32#include <cluster/masklog.h>
33
34#include "ocfs2.h"
35
36#include "dlmglue.h"
37#include "extent_map.h"
38#include "heartbeat.h"
39#include "inode.h"
40#include "slot_map.h"
41#include "super.h"
42#include "sysfile.h"
43
44#include "buffer_head_io.h"
45
46static s16 __ocfs2_node_num_to_slot(struct ocfs2_slot_info *si,
47 s16 global);
48static void __ocfs2_fill_slot(struct ocfs2_slot_info *si,
49 s16 slot_num,
50 s16 node_num);
51
52/* Use the slot information we've collected to create a map of mounted
53 * nodes. Should be holding an EX on super block. assumes slot info is
54 * up to date. Note that we call this *after* we find a slot, so our
55 * own node should be set in the map too... */
56void ocfs2_populate_mounted_map(struct ocfs2_super *osb)
57{
58 int i;
59 struct ocfs2_slot_info *si = osb->slot_info;
60
61 spin_lock(&si->si_lock);
62
63 for (i = 0; i < si->si_size; i++)
64 if (si->si_global_node_nums[i] != OCFS2_INVALID_SLOT)
65 ocfs2_node_map_set_bit(osb, &osb->mounted_map,
66 si->si_global_node_nums[i]);
67
68 spin_unlock(&si->si_lock);
69}
70
71/* post the slot information on disk into our slot_info struct. */
72void ocfs2_update_slot_info(struct ocfs2_slot_info *si)
73{
74 int i;
75 __le16 *disk_info;
76
77 /* we don't read the slot block here as ocfs2_super_lock
78 * should've made sure we have the most recent copy. */
79 spin_lock(&si->si_lock);
80 disk_info = (__le16 *) si->si_bh->b_data;
81
82 for (i = 0; i < si->si_size; i++)
83 si->si_global_node_nums[i] = le16_to_cpu(disk_info[i]);
84
85 spin_unlock(&si->si_lock);
86}
87
88/* post the our slot info stuff into it's destination bh and write it
89 * out. */
90int ocfs2_update_disk_slots(struct ocfs2_super *osb,
91 struct ocfs2_slot_info *si)
92{
93 int status, i;
94 __le16 *disk_info = (__le16 *) si->si_bh->b_data;
95
96 spin_lock(&si->si_lock);
97 for (i = 0; i < si->si_size; i++)
98 disk_info[i] = cpu_to_le16(si->si_global_node_nums[i]);
99 spin_unlock(&si->si_lock);
100
101 status = ocfs2_write_block(osb, si->si_bh, si->si_inode);
102 if (status < 0)
103 mlog_errno(status);
104
105 return status;
106}
107
108/* try to find global node in the slot info. Returns
109 * OCFS2_INVALID_SLOT if nothing is found. */
110static s16 __ocfs2_node_num_to_slot(struct ocfs2_slot_info *si,
111 s16 global)
112{
113 int i;
114 s16 ret = OCFS2_INVALID_SLOT;
115
116 for(i = 0; i < si->si_num_slots; i++) {
117 if (global == si->si_global_node_nums[i]) {
118 ret = (s16) i;
119 break;
120 }
121 }
122 return ret;
123}
124
125static s16 __ocfs2_find_empty_slot(struct ocfs2_slot_info *si)
126{
127 int i;
128 s16 ret = OCFS2_INVALID_SLOT;
129
130 for(i = 0; i < si->si_num_slots; i++) {
131 if (OCFS2_INVALID_SLOT == si->si_global_node_nums[i]) {
132 ret = (s16) i;
133 break;
134 }
135 }
136 return ret;
137}
138
139s16 ocfs2_node_num_to_slot(struct ocfs2_slot_info *si,
140 s16 global)
141{
142 s16 ret;
143
144 spin_lock(&si->si_lock);
145 ret = __ocfs2_node_num_to_slot(si, global);
146 spin_unlock(&si->si_lock);
147 return ret;
148}
149
150static void __ocfs2_fill_slot(struct ocfs2_slot_info *si,
151 s16 slot_num,
152 s16 node_num)
153{
154 BUG_ON(slot_num == OCFS2_INVALID_SLOT);
155 BUG_ON(slot_num >= si->si_num_slots);
156 BUG_ON((node_num != O2NM_INVALID_NODE_NUM) &&
157 (node_num >= O2NM_MAX_NODES));
158
159 si->si_global_node_nums[slot_num] = node_num;
160}
161
162void ocfs2_clear_slot(struct ocfs2_slot_info *si,
163 s16 slot_num)
164{
165 spin_lock(&si->si_lock);
166 __ocfs2_fill_slot(si, slot_num, OCFS2_INVALID_SLOT);
167 spin_unlock(&si->si_lock);
168}
169
170int ocfs2_init_slot_info(struct ocfs2_super *osb)
171{
172 int status, i;
173 u64 blkno;
174 struct inode *inode = NULL;
175 struct buffer_head *bh = NULL;
176 struct ocfs2_slot_info *si;
177
178 si = kcalloc(1, sizeof(struct ocfs2_slot_info), GFP_KERNEL);
179 if (!si) {
180 status = -ENOMEM;
181 mlog_errno(status);
182 goto bail;
183 }
184
185 spin_lock_init(&si->si_lock);
186 si->si_num_slots = osb->max_slots;
187 si->si_size = OCFS2_MAX_SLOTS;
188
189 for(i = 0; i < si->si_num_slots; i++)
190 si->si_global_node_nums[i] = OCFS2_INVALID_SLOT;
191
192 inode = ocfs2_get_system_file_inode(osb, SLOT_MAP_SYSTEM_INODE,
193 OCFS2_INVALID_SLOT);
194 if (!inode) {
195 status = -EINVAL;
196 mlog_errno(status);
197 goto bail;
198 }
199
200 status = ocfs2_extent_map_get_blocks(inode, 0ULL, 1, &blkno, NULL);
201 if (status < 0) {
202 mlog_errno(status);
203 goto bail;
204 }
205
206 status = ocfs2_read_block(osb, blkno, &bh, 0, inode);
207 if (status < 0) {
208 mlog_errno(status);
209 goto bail;
210 }
211
212 si->si_inode = inode;
213 si->si_bh = bh;
214 osb->slot_info = si;
215bail:
216 if (status < 0 && si)
217 ocfs2_free_slot_info(si);
218
219 return status;
220}
221
222void ocfs2_free_slot_info(struct ocfs2_slot_info *si)
223{
224 if (si->si_inode)
225 iput(si->si_inode);
226 if (si->si_bh)
227 brelse(si->si_bh);
228 kfree(si);
229}
230
231int ocfs2_find_slot(struct ocfs2_super *osb)
232{
233 int status;
234 s16 slot;
235 struct ocfs2_slot_info *si;
236
237 mlog_entry_void();
238
239 si = osb->slot_info;
240
241 ocfs2_update_slot_info(si);
242
243 spin_lock(&si->si_lock);
244 /* search for ourselves first and take the slot if it already
245 * exists. Perhaps we need to mark this in a variable for our
246 * own journal recovery? Possibly not, though we certainly
247 * need to warn to the user */
248 slot = __ocfs2_node_num_to_slot(si, osb->node_num);
249 if (slot == OCFS2_INVALID_SLOT) {
250 /* if no slot yet, then just take 1st available
251 * one. */
252 slot = __ocfs2_find_empty_slot(si);
253 if (slot == OCFS2_INVALID_SLOT) {
254 spin_unlock(&si->si_lock);
255 mlog(ML_ERROR, "no free slots available!\n");
256 status = -EINVAL;
257 goto bail;
258 }
259 } else
260 mlog(ML_NOTICE, "slot %d is already allocated to this node!\n",
261 slot);
262
263 __ocfs2_fill_slot(si, slot, osb->node_num);
264 osb->slot_num = slot;
265 spin_unlock(&si->si_lock);
266
267 mlog(ML_NOTICE, "taking node slot %d\n", osb->slot_num);
268
269 status = ocfs2_update_disk_slots(osb, si);
270 if (status < 0)
271 mlog_errno(status);
272
273bail:
274 mlog_exit(status);
275 return status;
276}
277
278void ocfs2_put_slot(struct ocfs2_super *osb)
279{
280 int status;
281 struct ocfs2_slot_info *si = osb->slot_info;
282
283 if (!si)
284 return;
285
286 ocfs2_update_slot_info(si);
287
288 spin_lock(&si->si_lock);
289 __ocfs2_fill_slot(si, osb->slot_num, OCFS2_INVALID_SLOT);
290 osb->slot_num = OCFS2_INVALID_SLOT;
291 spin_unlock(&si->si_lock);
292
293 status = ocfs2_update_disk_slots(osb, si);
294 if (status < 0) {
295 mlog_errno(status);
296 goto bail;
297 }
298
299bail:
300 osb->slot_info = NULL;
301 ocfs2_free_slot_info(si);
302}
303
diff --git a/fs/ocfs2/slot_map.h b/fs/ocfs2/slot_map.h
new file mode 100644
index 000000000000..d8c8ceed031b
--- /dev/null
+++ b/fs/ocfs2/slot_map.h
@@ -0,0 +1,66 @@
1/* -*- mode: c; c-basic-offset: 8; -*-
2 * vim: noexpandtab sw=8 ts=8 sts=0:
3 *
4 * slot_map.h
5 *
6 * Slot map state and the interface for claiming and releasing node slots.
7 *
8 * Copyright (C) 2002, 2004 Oracle. All rights reserved.
9 *
10 * This program is free software; you can redistribute it and/or
11 * modify it under the terms of the GNU General Public
12 * License as published by the Free Software Foundation; either
13 * version 2 of the License, or (at your option) any later version.
14 *
15 * This program is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
18 * General Public License for more details.
19 *
20 * You should have received a copy of the GNU General Public
21 * License along with this program; if not, write to the
22 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
22 * Boston, MA 02111-1307, USA.
24 */
25
26
27#ifndef SLOTMAP_H
28#define SLOTMAP_H
29
/*
 * In-memory copy of the slot map.
 *
 * si_lock:             taken around accesses to si_global_node_nums.
 * si_inode:            slot map system inode; released by
 *                      ocfs2_free_slot_info().
 * si_bh:               buffer head holding the on-disk slot map block;
 *                      released by ocfs2_free_slot_info().
 * si_num_slots:        number of usable slots, set from osb->max_slots.
 * si_size:             number of entries copied to and from disk, set to
 *                      OCFS2_MAX_SLOTS.
 * si_global_node_nums: node number occupying each slot, or
 *                      OCFS2_INVALID_SLOT for an empty slot.
 */
30struct ocfs2_slot_info {
31 spinlock_t si_lock;
32
33 struct inode *si_inode;
34 struct buffer_head *si_bh;
35 unsigned int si_num_slots;
36 unsigned int si_size;
37 s16 si_global_node_nums[OCFS2_MAX_SLOTS];
38};
39
40int ocfs2_init_slot_info(struct ocfs2_super *osb);
41void ocfs2_free_slot_info(struct ocfs2_slot_info *si);
42
43int ocfs2_find_slot(struct ocfs2_super *osb);
44void ocfs2_put_slot(struct ocfs2_super *osb);
45
46void ocfs2_update_slot_info(struct ocfs2_slot_info *si);
47int ocfs2_update_disk_slots(struct ocfs2_super *osb,
48 struct ocfs2_slot_info *si);
49
50s16 ocfs2_node_num_to_slot(struct ocfs2_slot_info *si,
51 s16 global);
52void ocfs2_clear_slot(struct ocfs2_slot_info *si,
53 s16 slot_num);
54
55void ocfs2_populate_mounted_map(struct ocfs2_super *osb);
56
57static inline int ocfs2_is_empty_slot(struct ocfs2_slot_info *si,
58 int slot_num)
59{
60 BUG_ON(slot_num == OCFS2_INVALID_SLOT);
61 assert_spin_locked(&si->si_lock);
62
63 return si->si_global_node_nums[slot_num] == OCFS2_INVALID_SLOT;
64}
65
66#endif
diff --git a/fs/ocfs2/suballoc.c b/fs/ocfs2/suballoc.c
new file mode 100644
index 000000000000..c46c164aefbb
--- /dev/null
+++ b/fs/ocfs2/suballoc.c
@@ -0,0 +1,1651 @@
1/* -*- mode: c; c-basic-offset: 8; -*-
2 * vim: noexpandtab sw=8 ts=8 sts=0:
3 *
4 * suballoc.c
5 *
6 * metadata alloc and free
7 * Inspired by ext3 block groups.
8 *
9 * Copyright (C) 2002, 2004 Oracle. All rights reserved.
10 *
11 * This program is free software; you can redistribute it and/or
12 * modify it under the terms of the GNU General Public
13 * License as published by the Free Software Foundation; either
14 * version 2 of the License, or (at your option) any later version.
15 *
16 * This program is distributed in the hope that it will be useful,
17 * but WITHOUT ANY WARRANTY; without even the implied warranty of
18 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
19 * General Public License for more details.
20 *
21 * You should have received a copy of the GNU General Public
22 * License along with this program; if not, write to the
23 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
24 * Boston, MA 02111-1307, USA.
25 */
26
27#include <linux/fs.h>
28#include <linux/types.h>
29#include <linux/slab.h>
30#include <linux/highmem.h>
31
32#define MLOG_MASK_PREFIX ML_DISK_ALLOC
33#include <cluster/masklog.h>
34
35#include "ocfs2.h"
36
37#include "alloc.h"
38#include "dlmglue.h"
39#include "inode.h"
40#include "journal.h"
41#include "localalloc.h"
42#include "suballoc.h"
43#include "super.h"
44#include "sysfile.h"
45#include "uptodate.h"
46
47#include "buffer_head_io.h"
48
49static inline void ocfs2_debug_bg(struct ocfs2_group_desc *bg);
50static inline void ocfs2_debug_suballoc_inode(struct ocfs2_dinode *fe);
51static inline u16 ocfs2_find_victim_chain(struct ocfs2_chain_list *cl);
52static int ocfs2_block_group_fill(struct ocfs2_journal_handle *handle,
53 struct inode *alloc_inode,
54 struct buffer_head *bg_bh,
55 u64 group_blkno,
56 u16 my_chain,
57 struct ocfs2_chain_list *cl);
58static int ocfs2_block_group_alloc(struct ocfs2_super *osb,
59 struct inode *alloc_inode,
60 struct buffer_head *bh);
61
62static int ocfs2_reserve_suballoc_bits(struct ocfs2_super *osb,
63 struct ocfs2_alloc_context *ac);
64
65static int ocfs2_cluster_group_search(struct inode *inode,
66 struct buffer_head *group_bh,
67 u32 bits_wanted, u32 min_bits,
68 u16 *bit_off, u16 *bits_found);
69static int ocfs2_block_group_search(struct inode *inode,
70 struct buffer_head *group_bh,
71 u32 bits_wanted, u32 min_bits,
72 u16 *bit_off, u16 *bits_found);
73static int ocfs2_search_chain(struct ocfs2_alloc_context *ac,
74 u32 bits_wanted,
75 u32 min_bits,
76 u16 *bit_off,
77 unsigned int *num_bits,
78 u64 *bg_blkno);
79static int ocfs2_claim_suballoc_bits(struct ocfs2_super *osb,
80 struct ocfs2_alloc_context *ac,
81 u32 bits_wanted,
82 u32 min_bits,
83 u16 *bit_off,
84 unsigned int *num_bits,
85 u64 *bg_blkno);
86static int ocfs2_test_bg_bit_allocatable(struct buffer_head *bg_bh,
87 int nr);
88static int ocfs2_block_group_find_clear_bits(struct ocfs2_super *osb,
89 struct buffer_head *bg_bh,
90 unsigned int bits_wanted,
91 u16 *bit_off,
92 u16 *bits_found);
93static inline int ocfs2_block_group_set_bits(struct ocfs2_journal_handle *handle,
94 struct inode *alloc_inode,
95 struct ocfs2_group_desc *bg,
96 struct buffer_head *group_bh,
97 unsigned int bit_off,
98 unsigned int num_bits);
99static inline int ocfs2_block_group_clear_bits(struct ocfs2_journal_handle *handle,
100 struct inode *alloc_inode,
101 struct ocfs2_group_desc *bg,
102 struct buffer_head *group_bh,
103 unsigned int bit_off,
104 unsigned int num_bits);
105
106static int ocfs2_relink_block_group(struct ocfs2_journal_handle *handle,
107 struct inode *alloc_inode,
108 struct buffer_head *fe_bh,
109 struct buffer_head *bg_bh,
110 struct buffer_head *prev_bg_bh,
111 u16 chain);
112static inline int ocfs2_block_group_reasonably_empty(struct ocfs2_group_desc *bg,
113 u32 wanted);
114static int ocfs2_free_suballoc_bits(struct ocfs2_journal_handle *handle,
115 struct inode *alloc_inode,
116 struct buffer_head *alloc_bh,
117 unsigned int start_bit,
118 u64 bg_blkno,
119 unsigned int count);
120static inline u64 ocfs2_which_suballoc_group(u64 block,
121 unsigned int bit);
122static inline u32 ocfs2_desc_bitmap_to_cluster_off(struct inode *inode,
123 u64 bg_blkno,
124 u16 bg_bit_off);
125static inline u64 ocfs2_which_cluster_group(struct inode *inode,
126 u32 cluster);
127static inline void ocfs2_block_to_cluster_group(struct inode *inode,
128 u64 data_blkno,
129 u64 *bg_blkno,
130 u16 *bg_bit_off);
131
132void ocfs2_free_alloc_context(struct ocfs2_alloc_context *ac)
133{
134 if (ac->ac_inode)
135 iput(ac->ac_inode);
136 if (ac->ac_bh)
137 brelse(ac->ac_bh);
138 kfree(ac);
139}
140
141static u32 ocfs2_bits_per_group(struct ocfs2_chain_list *cl)
142{
143 return (u32)le16_to_cpu(cl->cl_cpg) * (u32)le16_to_cpu(cl->cl_bpc);
144}
145
146static int ocfs2_block_group_fill(struct ocfs2_journal_handle *handle,
147 struct inode *alloc_inode,
148 struct buffer_head *bg_bh,
149 u64 group_blkno,
150 u16 my_chain,
151 struct ocfs2_chain_list *cl)
152{
153 int status = 0;
154 struct ocfs2_group_desc *bg = (struct ocfs2_group_desc *) bg_bh->b_data;
155 struct super_block * sb = alloc_inode->i_sb;
156
157 mlog_entry_void();
158
159 if (((unsigned long long) bg_bh->b_blocknr) != group_blkno) {
160 ocfs2_error(alloc_inode->i_sb, "group block (%"MLFu64") "
161 "!= b_blocknr (%llu)", group_blkno,
162 (unsigned long long) bg_bh->b_blocknr);
163 status = -EIO;
164 goto bail;
165 }
166
167 status = ocfs2_journal_access(handle,
168 alloc_inode,
169 bg_bh,
170 OCFS2_JOURNAL_ACCESS_CREATE);
171 if (status < 0) {
172 mlog_errno(status);
173 goto bail;
174 }
175
176 memset(bg, 0, sb->s_blocksize);
177 strcpy(bg->bg_signature, OCFS2_GROUP_DESC_SIGNATURE);
178 bg->bg_generation = cpu_to_le32(OCFS2_SB(sb)->fs_generation);
179 bg->bg_size = cpu_to_le16(ocfs2_group_bitmap_size(sb));
180 bg->bg_bits = cpu_to_le16(ocfs2_bits_per_group(cl));
181 bg->bg_chain = cpu_to_le16(my_chain);
182 bg->bg_next_group = cl->cl_recs[my_chain].c_blkno;
183 bg->bg_parent_dinode = cpu_to_le64(OCFS2_I(alloc_inode)->ip_blkno);
184 bg->bg_blkno = cpu_to_le64(group_blkno);
185 /* set the 1st bit in the bitmap to account for the descriptor block */
186 ocfs2_set_bit(0, (unsigned long *)bg->bg_bitmap);
187 bg->bg_free_bits_count = cpu_to_le16(le16_to_cpu(bg->bg_bits) - 1);
188
189 status = ocfs2_journal_dirty(handle, bg_bh);
190 if (status < 0)
191 mlog_errno(status);
192
193 /* There is no need to zero out or otherwise initialize the
194 * other blocks in a group - All valid FS metadata in a block
195 * group stores the superblock fs_generation value at
196 * allocation time. */
197
198bail:
199 mlog_exit(status);
200 return status;
201}
202
203static inline u16 ocfs2_find_smallest_chain(struct ocfs2_chain_list *cl)
204{
205 u16 curr, best;
206
207 best = curr = 0;
208 while (curr < le16_to_cpu(cl->cl_count)) {
209 if (le32_to_cpu(cl->cl_recs[best].c_total) >
210 le32_to_cpu(cl->cl_recs[curr].c_total))
211 best = curr;
212 curr++;
213 }
214 return best;
215}
216
217/*
218 * We expect the block group allocator to already be locked.
219 */
220static int ocfs2_block_group_alloc(struct ocfs2_super *osb,
221 struct inode *alloc_inode,
222 struct buffer_head *bh)
223{
224 int status, credits;
225 struct ocfs2_dinode *fe = (struct ocfs2_dinode *) bh->b_data;
226 struct ocfs2_chain_list *cl;
227 struct ocfs2_alloc_context *ac = NULL;
228 struct ocfs2_journal_handle *handle = NULL;
229 u32 bit_off, num_bits;
230 u16 alloc_rec;
231 u64 bg_blkno;
232 struct buffer_head *bg_bh = NULL;
233 struct ocfs2_group_desc *bg;
234
235 BUG_ON(ocfs2_is_cluster_bitmap(alloc_inode));
236
237 mlog_entry_void();
238
239 handle = ocfs2_alloc_handle(osb);
240 if (!handle) {
241 status = -ENOMEM;
242 mlog_errno(status);
243 goto bail;
244 }
245
246 cl = &fe->id2.i_chain;
247 status = ocfs2_reserve_clusters(osb,
248 handle,
249 le16_to_cpu(cl->cl_cpg),
250 &ac);
251 if (status < 0) {
252 if (status != -ENOSPC)
253 mlog_errno(status);
254 goto bail;
255 }
256
257 credits = ocfs2_calc_group_alloc_credits(osb->sb,
258 le16_to_cpu(cl->cl_cpg));
259 handle = ocfs2_start_trans(osb, handle, credits);
260 if (IS_ERR(handle)) {
261 status = PTR_ERR(handle);
262 handle = NULL;
263 mlog_errno(status);
264 goto bail;
265 }
266
267 status = ocfs2_claim_clusters(osb,
268 handle,
269 ac,
270 le16_to_cpu(cl->cl_cpg),
271 &bit_off,
272 &num_bits);
273 if (status < 0) {
274 if (status != -ENOSPC)
275 mlog_errno(status);
276 goto bail;
277 }
278
279 alloc_rec = ocfs2_find_smallest_chain(cl);
280
281 /* setup the group */
282 bg_blkno = ocfs2_clusters_to_blocks(osb->sb, bit_off);
283 mlog(0, "new descriptor, record %u, at block %"MLFu64"\n",
284 alloc_rec, bg_blkno);
285
286 bg_bh = sb_getblk(osb->sb, bg_blkno);
287 if (!bg_bh) {
288 status = -EIO;
289 mlog_errno(status);
290 goto bail;
291 }
292 ocfs2_set_new_buffer_uptodate(alloc_inode, bg_bh);
293
294 status = ocfs2_block_group_fill(handle,
295 alloc_inode,
296 bg_bh,
297 bg_blkno,
298 alloc_rec,
299 cl);
300 if (status < 0) {
301 mlog_errno(status);
302 goto bail;
303 }
304
305 bg = (struct ocfs2_group_desc *) bg_bh->b_data;
306
307 status = ocfs2_journal_access(handle, alloc_inode,
308 bh, OCFS2_JOURNAL_ACCESS_WRITE);
309 if (status < 0) {
310 mlog_errno(status);
311 goto bail;
312 }
313
314 le32_add_cpu(&cl->cl_recs[alloc_rec].c_free,
315 le16_to_cpu(bg->bg_free_bits_count));
316 le32_add_cpu(&cl->cl_recs[alloc_rec].c_total, le16_to_cpu(bg->bg_bits));
317 cl->cl_recs[alloc_rec].c_blkno = cpu_to_le64(bg_blkno);
318 if (le16_to_cpu(cl->cl_next_free_rec) < le16_to_cpu(cl->cl_count))
319 le16_add_cpu(&cl->cl_next_free_rec, 1);
320
321 le32_add_cpu(&fe->id1.bitmap1.i_used, le16_to_cpu(bg->bg_bits) -
322 le16_to_cpu(bg->bg_free_bits_count));
323 le32_add_cpu(&fe->id1.bitmap1.i_total, le16_to_cpu(bg->bg_bits));
324 le32_add_cpu(&fe->i_clusters, le16_to_cpu(cl->cl_cpg));
325
326 status = ocfs2_journal_dirty(handle, bh);
327 if (status < 0) {
328 mlog_errno(status);
329 goto bail;
330 }
331
332 spin_lock(&OCFS2_I(alloc_inode)->ip_lock);
333 OCFS2_I(alloc_inode)->ip_clusters = le32_to_cpu(fe->i_clusters);
334 fe->i_size = cpu_to_le64(ocfs2_clusters_to_bytes(alloc_inode->i_sb,
335 le32_to_cpu(fe->i_clusters)));
336 spin_unlock(&OCFS2_I(alloc_inode)->ip_lock);
337 i_size_write(alloc_inode, le64_to_cpu(fe->i_size));
338 alloc_inode->i_blocks =
339 ocfs2_align_bytes_to_sectors(i_size_read(alloc_inode));
340
341 status = 0;
342bail:
343 if (handle)
344 ocfs2_commit_trans(handle);
345
346 if (ac)
347 ocfs2_free_alloc_context(ac);
348
349 if (bg_bh)
350 brelse(bg_bh);
351
352 mlog_exit(status);
353 return status;
354}
355
356static int ocfs2_reserve_suballoc_bits(struct ocfs2_super *osb,
357 struct ocfs2_alloc_context *ac)
358{
359 int status;
360 u32 bits_wanted = ac->ac_bits_wanted;
361 struct inode *alloc_inode = ac->ac_inode;
362 struct buffer_head *bh = NULL;
363 struct ocfs2_journal_handle *handle = ac->ac_handle;
364 struct ocfs2_dinode *fe;
365 u32 free_bits;
366
367 mlog_entry_void();
368
369 BUG_ON(handle->flags & OCFS2_HANDLE_STARTED);
370
371 ocfs2_handle_add_inode(handle, alloc_inode);
372 status = ocfs2_meta_lock(alloc_inode, handle, &bh, 1);
373 if (status < 0) {
374 mlog_errno(status);
375 goto bail;
376 }
377
378 fe = (struct ocfs2_dinode *) bh->b_data;
379 if (!OCFS2_IS_VALID_DINODE(fe)) {
380 OCFS2_RO_ON_INVALID_DINODE(alloc_inode->i_sb, fe);
381 status = -EIO;
382 goto bail;
383 }
384 if (!(fe->i_flags & cpu_to_le32(OCFS2_CHAIN_FL))) {
385 ocfs2_error(alloc_inode->i_sb, "Invalid chain allocator "
386 "# %"MLFu64, le64_to_cpu(fe->i_blkno));
387 status = -EIO;
388 goto bail;
389 }
390
391 free_bits = le32_to_cpu(fe->id1.bitmap1.i_total) -
392 le32_to_cpu(fe->id1.bitmap1.i_used);
393
394 if (bits_wanted > free_bits) {
395 /* cluster bitmap never grows */
396 if (ocfs2_is_cluster_bitmap(alloc_inode)) {
397 mlog(0, "Disk Full: wanted=%u, free_bits=%u\n",
398 bits_wanted, free_bits);
399 status = -ENOSPC;
400 goto bail;
401 }
402
403 status = ocfs2_block_group_alloc(osb, alloc_inode, bh);
404 if (status < 0) {
405 if (status != -ENOSPC)
406 mlog_errno(status);
407 goto bail;
408 }
409 atomic_inc(&osb->alloc_stats.bg_extends);
410
411 /* You should never ask for this much metadata */
412 BUG_ON(bits_wanted >
413 (le32_to_cpu(fe->id1.bitmap1.i_total)
414 - le32_to_cpu(fe->id1.bitmap1.i_used)));
415 }
416
417 get_bh(bh);
418 ac->ac_bh = bh;
419bail:
420 if (bh)
421 brelse(bh);
422
423 mlog_exit(status);
424 return status;
425}
426
427int ocfs2_reserve_new_metadata(struct ocfs2_super *osb,
428 struct ocfs2_journal_handle *handle,
429 struct ocfs2_dinode *fe,
430 struct ocfs2_alloc_context **ac)
431{
432 int status;
433 struct inode *alloc_inode = NULL;
434
435 *ac = kcalloc(1, sizeof(struct ocfs2_alloc_context), GFP_KERNEL);
436 if (!(*ac)) {
437 status = -ENOMEM;
438 mlog_errno(status);
439 goto bail;
440 }
441
442 (*ac)->ac_bits_wanted = ocfs2_extend_meta_needed(fe);
443 (*ac)->ac_handle = handle;
444 (*ac)->ac_which = OCFS2_AC_USE_META;
445
446#ifndef OCFS2_USE_ALL_METADATA_SUBALLOCATORS
447 alloc_inode = ocfs2_get_system_file_inode(osb,
448 EXTENT_ALLOC_SYSTEM_INODE,
449 0);
450#else
451 alloc_inode = ocfs2_get_system_file_inode(osb,
452 EXTENT_ALLOC_SYSTEM_INODE,
453 osb->slot_num);
454#endif
455 if (!alloc_inode) {
456 status = -ENOMEM;
457 mlog_errno(status);
458 goto bail;
459 }
460
461 (*ac)->ac_inode = igrab(alloc_inode);
462 (*ac)->ac_group_search = ocfs2_block_group_search;
463
464 status = ocfs2_reserve_suballoc_bits(osb, (*ac));
465 if (status < 0) {
466 if (status != -ENOSPC)
467 mlog_errno(status);
468 goto bail;
469 }
470
471 status = 0;
472bail:
473 if ((status < 0) && *ac) {
474 ocfs2_free_alloc_context(*ac);
475 *ac = NULL;
476 }
477
478 if (alloc_inode)
479 iput(alloc_inode);
480
481 mlog_exit(status);
482 return status;
483}
484
485int ocfs2_reserve_new_inode(struct ocfs2_super *osb,
486 struct ocfs2_journal_handle *handle,
487 struct ocfs2_alloc_context **ac)
488{
489 int status;
490 struct inode *alloc_inode = NULL;
491
492 *ac = kcalloc(1, sizeof(struct ocfs2_alloc_context), GFP_KERNEL);
493 if (!(*ac)) {
494 status = -ENOMEM;
495 mlog_errno(status);
496 goto bail;
497 }
498
499 (*ac)->ac_bits_wanted = 1;
500 (*ac)->ac_handle = handle;
501 (*ac)->ac_which = OCFS2_AC_USE_INODE;
502
503 alloc_inode = ocfs2_get_system_file_inode(osb,
504 INODE_ALLOC_SYSTEM_INODE,
505 osb->slot_num);
506 if (!alloc_inode) {
507 status = -ENOMEM;
508 mlog_errno(status);
509 goto bail;
510 }
511
512 (*ac)->ac_inode = igrab(alloc_inode);
513 (*ac)->ac_group_search = ocfs2_block_group_search;
514
515 status = ocfs2_reserve_suballoc_bits(osb, *ac);
516 if (status < 0) {
517 if (status != -ENOSPC)
518 mlog_errno(status);
519 goto bail;
520 }
521
522 status = 0;
523bail:
524 if ((status < 0) && *ac) {
525 ocfs2_free_alloc_context(*ac);
526 *ac = NULL;
527 }
528
529 if (alloc_inode)
530 iput(alloc_inode);
531
532 mlog_exit(status);
533 return status;
534}
535
536/* local alloc code has to do the same thing, so rather than do this
537 * twice.. */
538int ocfs2_reserve_cluster_bitmap_bits(struct ocfs2_super *osb,
539 struct ocfs2_alloc_context *ac)
540{
541 int status;
542
543 ac->ac_inode = ocfs2_get_system_file_inode(osb,
544 GLOBAL_BITMAP_SYSTEM_INODE,
545 OCFS2_INVALID_SLOT);
546 if (!ac->ac_inode) {
547 status = -EINVAL;
548 mlog(ML_ERROR, "Could not get bitmap inode!\n");
549 goto bail;
550 }
551 ac->ac_which = OCFS2_AC_USE_MAIN;
552 ac->ac_group_search = ocfs2_cluster_group_search;
553
554 status = ocfs2_reserve_suballoc_bits(osb, ac);
555 if (status < 0 && status != -ENOSPC)
556 mlog_errno(status);
557bail:
558 return status;
559}
560
561/* Callers don't need to care which bitmap (local alloc or main) to
562 * use so we figure it out for them, but unfortunately this clutters
563 * things a bit. */
564int ocfs2_reserve_clusters(struct ocfs2_super *osb,
565 struct ocfs2_journal_handle *handle,
566 u32 bits_wanted,
567 struct ocfs2_alloc_context **ac)
568{
569 int status;
570
571 mlog_entry_void();
572
573 BUG_ON(!handle);
574
575 *ac = kcalloc(1, sizeof(struct ocfs2_alloc_context), GFP_KERNEL);
576 if (!(*ac)) {
577 status = -ENOMEM;
578 mlog_errno(status);
579 goto bail;
580 }
581
582 (*ac)->ac_bits_wanted = bits_wanted;
583 (*ac)->ac_handle = handle;
584
585 status = -ENOSPC;
586 if (ocfs2_alloc_should_use_local(osb, bits_wanted)) {
587 status = ocfs2_reserve_local_alloc_bits(osb,
588 handle,
589 bits_wanted,
590 *ac);
591 if ((status < 0) && (status != -ENOSPC)) {
592 mlog_errno(status);
593 goto bail;
594 } else if (status == -ENOSPC) {
595 /* reserve_local_bits will return enospc with
596 * the local alloc inode still locked, so we
597 * can change this safely here. */
598 mlog(0, "Disabling local alloc\n");
599 /* We set to OCFS2_LA_DISABLED so that umount
600 * can clean up what's left of the local
601 * allocation */
602 osb->local_alloc_state = OCFS2_LA_DISABLED;
603 }
604 }
605
606 if (status == -ENOSPC) {
607 status = ocfs2_reserve_cluster_bitmap_bits(osb, *ac);
608 if (status < 0) {
609 if (status != -ENOSPC)
610 mlog_errno(status);
611 goto bail;
612 }
613 }
614
615 status = 0;
616bail:
617 if ((status < 0) && *ac) {
618 ocfs2_free_alloc_context(*ac);
619 *ac = NULL;
620 }
621
622 mlog_exit(status);
623 return status;
624}
625
626/*
627 * More or less lifted from ext3. I'll leave their description below:
628 *
629 * "For ext3 allocations, we must not reuse any blocks which are
630 * allocated in the bitmap buffer's "last committed data" copy. This
631 * prevents deletes from freeing up the page for reuse until we have
632 * committed the delete transaction.
633 *
634 * If we didn't do this, then deleting something and reallocating it as
635 * data would allow the old block to be overwritten before the
636 * transaction committed (because we force data to disk before commit).
637 * This would lead to corruption if we crashed between overwriting the
638 * data and committing the delete.
639 *
640 * @@@ We may want to make this allocation behaviour conditional on
641 * data-writes at some point, and disable it for metadata allocations or
642 * sync-data inodes."
643 *
644 * Note: OCFS2 already does this differently for metadata vs data
645 * allocations, as those bitmaps are separate and undo access is never
646 * called on a metadata group descriptor.
647 */
648static int ocfs2_test_bg_bit_allocatable(struct buffer_head *bg_bh,
649 int nr)
650{
651 struct ocfs2_group_desc *bg = (struct ocfs2_group_desc *) bg_bh->b_data;
652
653 if (ocfs2_test_bit(nr, (unsigned long *)bg->bg_bitmap))
654 return 0;
655 if (!buffer_jbd(bg_bh) || !bh2jh(bg_bh)->b_committed_data)
656 return 1;
657
658 bg = (struct ocfs2_group_desc *) bh2jh(bg_bh)->b_committed_data;
659 return !ocfs2_test_bit(nr, (unsigned long *)bg->bg_bitmap);
660}
661
662static int ocfs2_block_group_find_clear_bits(struct ocfs2_super *osb,
663 struct buffer_head *bg_bh,
664 unsigned int bits_wanted,
665 u16 *bit_off,
666 u16 *bits_found)
667{
668 void *bitmap;
669 u16 best_offset, best_size;
670 int offset, start, found, status = 0;
671 struct ocfs2_group_desc *bg = (struct ocfs2_group_desc *) bg_bh->b_data;
672
673 if (!OCFS2_IS_VALID_GROUP_DESC(bg)) {
674 OCFS2_RO_ON_INVALID_GROUP_DESC(osb->sb, bg);
675 return -EIO;
676 }
677
678 found = start = best_offset = best_size = 0;
679 bitmap = bg->bg_bitmap;
680
681 while((offset = ocfs2_find_next_zero_bit(bitmap,
682 le16_to_cpu(bg->bg_bits),
683 start)) != -1) {
684 if (offset == le16_to_cpu(bg->bg_bits))
685 break;
686
687 if (!ocfs2_test_bg_bit_allocatable(bg_bh, offset)) {
688 /* We found a zero, but we can't use it as it
689 * hasn't been put to disk yet! */
690 found = 0;
691 start = offset + 1;
692 } else if (offset == start) {
693 /* we found a zero */
694 found++;
695 /* move start to the next bit to test */
696 start++;
697 } else {
698 /* got a zero after some ones */
699 found = 1;
700 start = offset + 1;
701 }
702 if (found > best_size) {
703 best_size = found;
704 best_offset = start - found;
705 }
706 /* we got everything we needed */
707 if (found == bits_wanted) {
708 /* mlog(0, "Found it all!\n"); */
709 break;
710 }
711 }
712
713 /* XXX: I think the first clause is equivalent to the second
714 * - jlbec */
715 if (found == bits_wanted) {
716 *bit_off = start - found;
717 *bits_found = found;
718 } else if (best_size) {
719 *bit_off = best_offset;
720 *bits_found = best_size;
721 } else {
722 status = -ENOSPC;
723 /* No error log here -- see the comment above
724 * ocfs2_test_bg_bit_allocatable */
725 }
726
727 return status;
728}
729
730static inline int ocfs2_block_group_set_bits(struct ocfs2_journal_handle *handle,
731 struct inode *alloc_inode,
732 struct ocfs2_group_desc *bg,
733 struct buffer_head *group_bh,
734 unsigned int bit_off,
735 unsigned int num_bits)
736{
737 int status;
738 void *bitmap = bg->bg_bitmap;
739 int journal_type = OCFS2_JOURNAL_ACCESS_WRITE;
740
741 mlog_entry_void();
742
743 if (!OCFS2_IS_VALID_GROUP_DESC(bg)) {
744 OCFS2_RO_ON_INVALID_GROUP_DESC(alloc_inode->i_sb, bg);
745 status = -EIO;
746 goto bail;
747 }
748 BUG_ON(le16_to_cpu(bg->bg_free_bits_count) < num_bits);
749
750 mlog(0, "block_group_set_bits: off = %u, num = %u\n", bit_off,
751 num_bits);
752
753 if (ocfs2_is_cluster_bitmap(alloc_inode))
754 journal_type = OCFS2_JOURNAL_ACCESS_UNDO;
755
756 status = ocfs2_journal_access(handle,
757 alloc_inode,
758 group_bh,
759 journal_type);
760 if (status < 0) {
761 mlog_errno(status);
762 goto bail;
763 }
764
765 le16_add_cpu(&bg->bg_free_bits_count, -num_bits);
766
767 while(num_bits--)
768 ocfs2_set_bit(bit_off++, bitmap);
769
770 status = ocfs2_journal_dirty(handle,
771 group_bh);
772 if (status < 0) {
773 mlog_errno(status);
774 goto bail;
775 }
776
777bail:
778 mlog_exit(status);
779 return status;
780}
781
782/* find the one with the most empty bits */
783static inline u16 ocfs2_find_victim_chain(struct ocfs2_chain_list *cl)
784{
785 u16 curr, best;
786
787 BUG_ON(!cl->cl_next_free_rec);
788
789 best = curr = 0;
790 while (curr < le16_to_cpu(cl->cl_next_free_rec)) {
791 if (le32_to_cpu(cl->cl_recs[curr].c_free) >
792 le32_to_cpu(cl->cl_recs[best].c_free))
793 best = curr;
794 curr++;
795 }
796
797 BUG_ON(best >= le16_to_cpu(cl->cl_next_free_rec));
798 return best;
799}
800
801static int ocfs2_relink_block_group(struct ocfs2_journal_handle *handle,
802 struct inode *alloc_inode,
803 struct buffer_head *fe_bh,
804 struct buffer_head *bg_bh,
805 struct buffer_head *prev_bg_bh,
806 u16 chain)
807{
808 int status;
809 /* there is a really tiny chance the journal calls could fail,
810 * but we wouldn't want inconsistent blocks in *any* case. */
811 u64 fe_ptr, bg_ptr, prev_bg_ptr;
812 struct ocfs2_dinode *fe = (struct ocfs2_dinode *) fe_bh->b_data;
813 struct ocfs2_group_desc *bg = (struct ocfs2_group_desc *) bg_bh->b_data;
814 struct ocfs2_group_desc *prev_bg = (struct ocfs2_group_desc *) prev_bg_bh->b_data;
815
816 if (!OCFS2_IS_VALID_DINODE(fe)) {
817 OCFS2_RO_ON_INVALID_DINODE(alloc_inode->i_sb, fe);
818 status = -EIO;
819 goto out;
820 }
821 if (!OCFS2_IS_VALID_GROUP_DESC(bg)) {
822 OCFS2_RO_ON_INVALID_GROUP_DESC(alloc_inode->i_sb, bg);
823 status = -EIO;
824 goto out;
825 }
826 if (!OCFS2_IS_VALID_GROUP_DESC(prev_bg)) {
827 OCFS2_RO_ON_INVALID_GROUP_DESC(alloc_inode->i_sb, prev_bg);
828 status = -EIO;
829 goto out;
830 }
831
832 mlog(0, "In suballoc %"MLFu64", chain %u, move group %"MLFu64" to "
833 "top, prev = %"MLFu64"\n",
834 fe->i_blkno, chain, bg->bg_blkno, prev_bg->bg_blkno);
835
836 fe_ptr = le64_to_cpu(fe->id2.i_chain.cl_recs[chain].c_blkno);
837 bg_ptr = le64_to_cpu(bg->bg_next_group);
838 prev_bg_ptr = le64_to_cpu(prev_bg->bg_next_group);
839
840 status = ocfs2_journal_access(handle, alloc_inode, prev_bg_bh,
841 OCFS2_JOURNAL_ACCESS_WRITE);
842 if (status < 0) {
843 mlog_errno(status);
844 goto out_rollback;
845 }
846
847 prev_bg->bg_next_group = bg->bg_next_group;
848
849 status = ocfs2_journal_dirty(handle, prev_bg_bh);
850 if (status < 0) {
851 mlog_errno(status);
852 goto out_rollback;
853 }
854
855 status = ocfs2_journal_access(handle, alloc_inode, bg_bh,
856 OCFS2_JOURNAL_ACCESS_WRITE);
857 if (status < 0) {
858 mlog_errno(status);
859 goto out_rollback;
860 }
861
862 bg->bg_next_group = fe->id2.i_chain.cl_recs[chain].c_blkno;
863
864 status = ocfs2_journal_dirty(handle, bg_bh);
865 if (status < 0) {
866 mlog_errno(status);
867 goto out_rollback;
868 }
869
870 status = ocfs2_journal_access(handle, alloc_inode, fe_bh,
871 OCFS2_JOURNAL_ACCESS_WRITE);
872 if (status < 0) {
873 mlog_errno(status);
874 goto out_rollback;
875 }
876
877 fe->id2.i_chain.cl_recs[chain].c_blkno = bg->bg_blkno;
878
879 status = ocfs2_journal_dirty(handle, fe_bh);
880 if (status < 0) {
881 mlog_errno(status);
882 goto out_rollback;
883 }
884
885 status = 0;
886out_rollback:
887 if (status < 0) {
888 fe->id2.i_chain.cl_recs[chain].c_blkno = cpu_to_le64(fe_ptr);
889 bg->bg_next_group = cpu_to_le64(bg_ptr);
890 prev_bg->bg_next_group = cpu_to_le64(prev_bg_ptr);
891 }
892out:
893 mlog_exit(status);
894 return status;
895}
896
897static inline int ocfs2_block_group_reasonably_empty(struct ocfs2_group_desc *bg,
898 u32 wanted)
899{
900 return le16_to_cpu(bg->bg_free_bits_count) > wanted;
901}
902
903/* return 0 on success, -ENOSPC to keep searching and any other < 0
904 * value on error. */
905static int ocfs2_cluster_group_search(struct inode *inode,
906 struct buffer_head *group_bh,
907 u32 bits_wanted, u32 min_bits,
908 u16 *bit_off, u16 *bits_found)
909{
910 int search = -ENOSPC;
911 int ret;
912 struct ocfs2_group_desc *bg = (struct ocfs2_group_desc *) group_bh->b_data;
913 u16 tmp_off, tmp_found;
914
915 BUG_ON(!ocfs2_is_cluster_bitmap(inode));
916
917 if (bg->bg_free_bits_count) {
918 ret = ocfs2_block_group_find_clear_bits(OCFS2_SB(inode->i_sb),
919 group_bh, bits_wanted,
920 &tmp_off, &tmp_found);
921 if (ret)
922 return ret;
923
924 /* ocfs2_block_group_find_clear_bits() might
925 * return success, but we still want to return
926 * -ENOSPC unless it found the minimum number
927 * of bits. */
928 if (min_bits <= tmp_found) {
929 *bit_off = tmp_off;
930 *bits_found = tmp_found;
931 search = 0; /* success */
932 }
933 }
934
935 return search;
936}
937
938static int ocfs2_block_group_search(struct inode *inode,
939 struct buffer_head *group_bh,
940 u32 bits_wanted, u32 min_bits,
941 u16 *bit_off, u16 *bits_found)
942{
943 int ret = -ENOSPC;
944 struct ocfs2_group_desc *bg = (struct ocfs2_group_desc *) group_bh->b_data;
945
946 BUG_ON(min_bits != 1);
947 BUG_ON(ocfs2_is_cluster_bitmap(inode));
948
949 if (bg->bg_free_bits_count)
950 ret = ocfs2_block_group_find_clear_bits(OCFS2_SB(inode->i_sb),
951 group_bh, bits_wanted,
952 bit_off, bits_found);
953
954 return ret;
955}
956
957static int ocfs2_search_chain(struct ocfs2_alloc_context *ac,
958 u32 bits_wanted,
959 u32 min_bits,
960 u16 *bit_off,
961 unsigned int *num_bits,
962 u64 *bg_blkno)
963{
964 int status;
965 u16 chain, tmp_bits;
966 u32 tmp_used;
967 u64 next_group;
968 struct ocfs2_journal_handle *handle = ac->ac_handle;
969 struct inode *alloc_inode = ac->ac_inode;
970 struct buffer_head *group_bh = NULL;
971 struct buffer_head *prev_group_bh = NULL;
972 struct ocfs2_dinode *fe = (struct ocfs2_dinode *) ac->ac_bh->b_data;
973 struct ocfs2_chain_list *cl = (struct ocfs2_chain_list *) &fe->id2.i_chain;
974 struct ocfs2_group_desc *bg;
975
976 chain = ac->ac_chain;
977 mlog(0, "trying to alloc %u bits from chain %u, inode %"MLFu64"\n",
978 bits_wanted, chain, OCFS2_I(alloc_inode)->ip_blkno);
979
980 status = ocfs2_read_block(OCFS2_SB(alloc_inode->i_sb),
981 le64_to_cpu(cl->cl_recs[chain].c_blkno),
982 &group_bh, OCFS2_BH_CACHED, alloc_inode);
983 if (status < 0) {
984 mlog_errno(status);
985 goto bail;
986 }
987 bg = (struct ocfs2_group_desc *) group_bh->b_data;
988 if (!OCFS2_IS_VALID_GROUP_DESC(bg)) {
989 OCFS2_RO_ON_INVALID_GROUP_DESC(alloc_inode->i_sb, bg);
990 status = -EIO;
991 goto bail;
992 }
993
994 status = -ENOSPC;
995 /* for now, the chain search is a bit simplistic. We just use
996 * the 1st group with any empty bits. */
997 while ((status = ac->ac_group_search(alloc_inode, group_bh,
998 bits_wanted, min_bits, bit_off,
999 &tmp_bits)) == -ENOSPC) {
1000 if (!bg->bg_next_group)
1001 break;
1002
1003 if (prev_group_bh) {
1004 brelse(prev_group_bh);
1005 prev_group_bh = NULL;
1006 }
1007 next_group = le64_to_cpu(bg->bg_next_group);
1008 prev_group_bh = group_bh;
1009 group_bh = NULL;
1010 status = ocfs2_read_block(OCFS2_SB(alloc_inode->i_sb),
1011 next_group, &group_bh,
1012 OCFS2_BH_CACHED, alloc_inode);
1013 if (status < 0) {
1014 mlog_errno(status);
1015 goto bail;
1016 }
1017 bg = (struct ocfs2_group_desc *) group_bh->b_data;
1018 if (!OCFS2_IS_VALID_GROUP_DESC(bg)) {
1019 OCFS2_RO_ON_INVALID_GROUP_DESC(alloc_inode->i_sb, bg);
1020 status = -EIO;
1021 goto bail;
1022 }
1023 }
1024 if (status < 0) {
1025 if (status != -ENOSPC)
1026 mlog_errno(status);
1027 goto bail;
1028 }
1029
1030 mlog(0, "alloc succeeds: we give %u bits from block group %"MLFu64"\n",
1031 tmp_bits, bg->bg_blkno);
1032
1033 *num_bits = tmp_bits;
1034
1035 BUG_ON(*num_bits == 0);
1036
1037 /*
1038 * Keep track of previous block descriptor read. When
1039 * we find a target, if we have read more than X
1040 * number of descriptors, and the target is reasonably
1041 * empty, relink him to top of his chain.
1042 *
1043 * We've read 0 extra blocks and only send one more to
1044 * the transaction, yet the next guy to search has a
1045 * much easier time.
1046 *
1047 * Do this *after* figuring out how many bits we're taking out
1048 * of our target group.
1049 */
1050 if (ac->ac_allow_chain_relink &&
1051 (prev_group_bh) &&
1052 (ocfs2_block_group_reasonably_empty(bg, *num_bits))) {
1053 status = ocfs2_relink_block_group(handle, alloc_inode,
1054 ac->ac_bh, group_bh,
1055 prev_group_bh, chain);
1056 if (status < 0) {
1057 mlog_errno(status);
1058 goto bail;
1059 }
1060 }
1061
1062 /* Ok, claim our bits now: set the info on dinode, chainlist
1063 * and then the group */
1064 status = ocfs2_journal_access(handle,
1065 alloc_inode,
1066 ac->ac_bh,
1067 OCFS2_JOURNAL_ACCESS_WRITE);
1068 if (status < 0) {
1069 mlog_errno(status);
1070 goto bail;
1071 }
1072
1073 tmp_used = le32_to_cpu(fe->id1.bitmap1.i_used);
1074 fe->id1.bitmap1.i_used = cpu_to_le32(*num_bits + tmp_used);
1075 le32_add_cpu(&cl->cl_recs[chain].c_free, -(*num_bits));
1076
1077 status = ocfs2_journal_dirty(handle,
1078 ac->ac_bh);
1079 if (status < 0) {
1080 mlog_errno(status);
1081 goto bail;
1082 }
1083
1084 status = ocfs2_block_group_set_bits(handle,
1085 alloc_inode,
1086 bg,
1087 group_bh,
1088 *bit_off,
1089 *num_bits);
1090 if (status < 0) {
1091 mlog_errno(status);
1092 goto bail;
1093 }
1094
1095 mlog(0, "Allocated %u bits from suballocator %"MLFu64"\n",
1096 *num_bits, fe->i_blkno);
1097
1098 *bg_blkno = le64_to_cpu(bg->bg_blkno);
1099bail:
1100 if (group_bh)
1101 brelse(group_bh);
1102 if (prev_group_bh)
1103 brelse(prev_group_bh);
1104
1105 mlog_exit(status);
1106 return status;
1107}
1108
1109/* will give out up to bits_wanted contiguous bits. */
1110static int ocfs2_claim_suballoc_bits(struct ocfs2_super *osb,
1111 struct ocfs2_alloc_context *ac,
1112 u32 bits_wanted,
1113 u32 min_bits,
1114 u16 *bit_off,
1115 unsigned int *num_bits,
1116 u64 *bg_blkno)
1117{
1118 int status;
1119 u16 victim, i;
1120 struct ocfs2_chain_list *cl;
1121 struct ocfs2_dinode *fe;
1122
1123 mlog_entry_void();
1124
1125 BUG_ON(ac->ac_bits_given >= ac->ac_bits_wanted);
1126 BUG_ON(bits_wanted > (ac->ac_bits_wanted - ac->ac_bits_given));
1127 BUG_ON(!ac->ac_bh);
1128
1129 fe = (struct ocfs2_dinode *) ac->ac_bh->b_data;
1130 if (!OCFS2_IS_VALID_DINODE(fe)) {
1131 OCFS2_RO_ON_INVALID_DINODE(osb->sb, fe);
1132 status = -EIO;
1133 goto bail;
1134 }
1135 if (le32_to_cpu(fe->id1.bitmap1.i_used) >=
1136 le32_to_cpu(fe->id1.bitmap1.i_total)) {
1137 ocfs2_error(osb->sb, "Chain allocator dinode %"MLFu64" has %u"
1138 "used bits but only %u total.",
1139 le64_to_cpu(fe->i_blkno),
1140 le32_to_cpu(fe->id1.bitmap1.i_used),
1141 le32_to_cpu(fe->id1.bitmap1.i_total));
1142 status = -EIO;
1143 goto bail;
1144 }
1145
1146 cl = (struct ocfs2_chain_list *) &fe->id2.i_chain;
1147
1148 victim = ocfs2_find_victim_chain(cl);
1149 ac->ac_chain = victim;
1150 ac->ac_allow_chain_relink = 1;
1151
1152 status = ocfs2_search_chain(ac, bits_wanted, min_bits, bit_off,
1153 num_bits, bg_blkno);
1154 if (!status)
1155 goto bail;
1156 if (status < 0 && status != -ENOSPC) {
1157 mlog_errno(status);
1158 goto bail;
1159 }
1160
1161 mlog(0, "Search of victim chain %u came up with nothing, "
1162 "trying all chains now.\n", victim);
1163
1164 /* If we didn't pick a good victim, then just default to
1165 * searching each chain in order. Don't allow chain relinking
1166 * because we only calculate enough journal credits for one
1167 * relink per alloc. */
1168 ac->ac_allow_chain_relink = 0;
1169 for (i = 0; i < le16_to_cpu(cl->cl_next_free_rec); i ++) {
1170 if (i == victim)
1171 continue;
1172 if (!cl->cl_recs[i].c_free)
1173 continue;
1174
1175 ac->ac_chain = i;
1176 status = ocfs2_search_chain(ac, bits_wanted, min_bits,
1177 bit_off, num_bits,
1178 bg_blkno);
1179 if (!status)
1180 break;
1181 if (status < 0 && status != -ENOSPC) {
1182 mlog_errno(status);
1183 goto bail;
1184 }
1185 }
1186bail:
1187
1188 mlog_exit(status);
1189 return status;
1190}
1191
1192int ocfs2_claim_metadata(struct ocfs2_super *osb,
1193 struct ocfs2_journal_handle *handle,
1194 struct ocfs2_alloc_context *ac,
1195 u32 bits_wanted,
1196 u16 *suballoc_bit_start,
1197 unsigned int *num_bits,
1198 u64 *blkno_start)
1199{
1200 int status;
1201 u64 bg_blkno;
1202
1203 BUG_ON(!ac);
1204 BUG_ON(ac->ac_bits_wanted < (ac->ac_bits_given + bits_wanted));
1205 BUG_ON(ac->ac_which != OCFS2_AC_USE_META);
1206 BUG_ON(ac->ac_handle != handle);
1207
1208 status = ocfs2_claim_suballoc_bits(osb,
1209 ac,
1210 bits_wanted,
1211 1,
1212 suballoc_bit_start,
1213 num_bits,
1214 &bg_blkno);
1215 if (status < 0) {
1216 mlog_errno(status);
1217 goto bail;
1218 }
1219 atomic_inc(&osb->alloc_stats.bg_allocs);
1220
1221 *blkno_start = bg_blkno + (u64) *suballoc_bit_start;
1222 ac->ac_bits_given += (*num_bits);
1223 status = 0;
1224bail:
1225 mlog_exit(status);
1226 return status;
1227}
1228
1229int ocfs2_claim_new_inode(struct ocfs2_super *osb,
1230 struct ocfs2_journal_handle *handle,
1231 struct ocfs2_alloc_context *ac,
1232 u16 *suballoc_bit,
1233 u64 *fe_blkno)
1234{
1235 int status;
1236 unsigned int num_bits;
1237 u64 bg_blkno;
1238
1239 mlog_entry_void();
1240
1241 BUG_ON(!ac);
1242 BUG_ON(ac->ac_bits_given != 0);
1243 BUG_ON(ac->ac_bits_wanted != 1);
1244 BUG_ON(ac->ac_which != OCFS2_AC_USE_INODE);
1245 BUG_ON(ac->ac_handle != handle);
1246
1247 status = ocfs2_claim_suballoc_bits(osb,
1248 ac,
1249 1,
1250 1,
1251 suballoc_bit,
1252 &num_bits,
1253 &bg_blkno);
1254 if (status < 0) {
1255 mlog_errno(status);
1256 goto bail;
1257 }
1258 atomic_inc(&osb->alloc_stats.bg_allocs);
1259
1260 BUG_ON(num_bits != 1);
1261
1262 *fe_blkno = bg_blkno + (u64) (*suballoc_bit);
1263 ac->ac_bits_given++;
1264 status = 0;
1265bail:
1266 mlog_exit(status);
1267 return status;
1268}
1269
1270/* translate a group desc. blkno and it's bitmap offset into
1271 * disk cluster offset. */
1272static inline u32 ocfs2_desc_bitmap_to_cluster_off(struct inode *inode,
1273 u64 bg_blkno,
1274 u16 bg_bit_off)
1275{
1276 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
1277 u32 cluster = 0;
1278
1279 BUG_ON(!ocfs2_is_cluster_bitmap(inode));
1280
1281 if (bg_blkno != osb->first_cluster_group_blkno)
1282 cluster = ocfs2_blocks_to_clusters(inode->i_sb, bg_blkno);
1283 cluster += (u32) bg_bit_off;
1284 return cluster;
1285}
1286
1287/* given a cluster offset, calculate which block group it belongs to
1288 * and return that block offset. */
1289static inline u64 ocfs2_which_cluster_group(struct inode *inode,
1290 u32 cluster)
1291{
1292 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
1293 u32 group_no;
1294
1295 BUG_ON(!ocfs2_is_cluster_bitmap(inode));
1296
1297 group_no = cluster / osb->bitmap_cpg;
1298 if (!group_no)
1299 return osb->first_cluster_group_blkno;
1300 return ocfs2_clusters_to_blocks(inode->i_sb,
1301 group_no * osb->bitmap_cpg);
1302}
1303
1304/* given the block number of a cluster start, calculate which cluster
1305 * group and descriptor bitmap offset that corresponds to. */
1306static inline void ocfs2_block_to_cluster_group(struct inode *inode,
1307 u64 data_blkno,
1308 u64 *bg_blkno,
1309 u16 *bg_bit_off)
1310{
1311 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
1312 u32 data_cluster = ocfs2_blocks_to_clusters(osb->sb, data_blkno);
1313
1314 BUG_ON(!ocfs2_is_cluster_bitmap(inode));
1315
1316 *bg_blkno = ocfs2_which_cluster_group(inode,
1317 data_cluster);
1318
1319 if (*bg_blkno == osb->first_cluster_group_blkno)
1320 *bg_bit_off = (u16) data_cluster;
1321 else
1322 *bg_bit_off = (u16) ocfs2_blocks_to_clusters(osb->sb,
1323 data_blkno - *bg_blkno);
1324}
1325
1326/*
1327 * min_bits - minimum contiguous chunk from this total allocation we
1328 * can handle. set to what we asked for originally for a full
1329 * contig. allocation, set to '1' to indicate we can deal with extents
1330 * of any size.
1331 */
1332int ocfs2_claim_clusters(struct ocfs2_super *osb,
1333 struct ocfs2_journal_handle *handle,
1334 struct ocfs2_alloc_context *ac,
1335 u32 min_clusters,
1336 u32 *cluster_start,
1337 u32 *num_clusters)
1338{
1339 int status;
1340 unsigned int bits_wanted = ac->ac_bits_wanted - ac->ac_bits_given;
1341 u64 bg_blkno;
1342 u16 bg_bit_off;
1343
1344 mlog_entry_void();
1345
1346 BUG_ON(!ac);
1347 BUG_ON(ac->ac_bits_given >= ac->ac_bits_wanted);
1348
1349 BUG_ON(ac->ac_which != OCFS2_AC_USE_LOCAL
1350 && ac->ac_which != OCFS2_AC_USE_MAIN);
1351 BUG_ON(ac->ac_handle != handle);
1352
1353 if (ac->ac_which == OCFS2_AC_USE_LOCAL) {
1354 status = ocfs2_claim_local_alloc_bits(osb,
1355 handle,
1356 ac,
1357 bits_wanted,
1358 cluster_start,
1359 num_clusters);
1360 if (!status)
1361 atomic_inc(&osb->alloc_stats.local_data);
1362 } else {
1363 if (min_clusters > (osb->bitmap_cpg - 1)) {
1364 /* The only paths asking for contiguousness
1365 * should know about this already. */
1366 mlog(ML_ERROR, "minimum allocation requested exceeds "
1367 "group bitmap size!");
1368 status = -ENOSPC;
1369 goto bail;
1370 }
1371 /* clamp the current request down to a realistic size. */
1372 if (bits_wanted > (osb->bitmap_cpg - 1))
1373 bits_wanted = osb->bitmap_cpg - 1;
1374
1375 status = ocfs2_claim_suballoc_bits(osb,
1376 ac,
1377 bits_wanted,
1378 min_clusters,
1379 &bg_bit_off,
1380 num_clusters,
1381 &bg_blkno);
1382 if (!status) {
1383 *cluster_start =
1384 ocfs2_desc_bitmap_to_cluster_off(ac->ac_inode,
1385 bg_blkno,
1386 bg_bit_off);
1387 atomic_inc(&osb->alloc_stats.bitmap_data);
1388 }
1389 }
1390 if (status < 0) {
1391 if (status != -ENOSPC)
1392 mlog_errno(status);
1393 goto bail;
1394 }
1395
1396 ac->ac_bits_given += *num_clusters;
1397
1398bail:
1399 mlog_exit(status);
1400 return status;
1401}
1402
1403static inline int ocfs2_block_group_clear_bits(struct ocfs2_journal_handle *handle,
1404 struct inode *alloc_inode,
1405 struct ocfs2_group_desc *bg,
1406 struct buffer_head *group_bh,
1407 unsigned int bit_off,
1408 unsigned int num_bits)
1409{
1410 int status;
1411 unsigned int tmp;
1412 int journal_type = OCFS2_JOURNAL_ACCESS_WRITE;
1413 struct ocfs2_group_desc *undo_bg = NULL;
1414
1415 mlog_entry_void();
1416
1417 if (!OCFS2_IS_VALID_GROUP_DESC(bg)) {
1418 OCFS2_RO_ON_INVALID_GROUP_DESC(alloc_inode->i_sb, bg);
1419 status = -EIO;
1420 goto bail;
1421 }
1422
1423 mlog(0, "off = %u, num = %u\n", bit_off, num_bits);
1424
1425 if (ocfs2_is_cluster_bitmap(alloc_inode))
1426 journal_type = OCFS2_JOURNAL_ACCESS_UNDO;
1427
1428 status = ocfs2_journal_access(handle, alloc_inode, group_bh,
1429 journal_type);
1430 if (status < 0) {
1431 mlog_errno(status);
1432 goto bail;
1433 }
1434
1435 if (ocfs2_is_cluster_bitmap(alloc_inode))
1436 undo_bg = (struct ocfs2_group_desc *) bh2jh(group_bh)->b_committed_data;
1437
1438 tmp = num_bits;
1439 while(tmp--) {
1440 ocfs2_clear_bit((bit_off + tmp),
1441 (unsigned long *) bg->bg_bitmap);
1442 if (ocfs2_is_cluster_bitmap(alloc_inode))
1443 ocfs2_set_bit(bit_off + tmp,
1444 (unsigned long *) undo_bg->bg_bitmap);
1445 }
1446 le16_add_cpu(&bg->bg_free_bits_count, num_bits);
1447
1448 status = ocfs2_journal_dirty(handle, group_bh);
1449 if (status < 0)
1450 mlog_errno(status);
1451bail:
1452 return status;
1453}
1454
1455/*
1456 * expects the suballoc inode to already be locked.
1457 */
1458static int ocfs2_free_suballoc_bits(struct ocfs2_journal_handle *handle,
1459 struct inode *alloc_inode,
1460 struct buffer_head *alloc_bh,
1461 unsigned int start_bit,
1462 u64 bg_blkno,
1463 unsigned int count)
1464{
1465 int status = 0;
1466 u32 tmp_used;
1467 struct ocfs2_super *osb = OCFS2_SB(alloc_inode->i_sb);
1468 struct ocfs2_dinode *fe = (struct ocfs2_dinode *) alloc_bh->b_data;
1469 struct ocfs2_chain_list *cl = &fe->id2.i_chain;
1470 struct buffer_head *group_bh = NULL;
1471 struct ocfs2_group_desc *group;
1472
1473 mlog_entry_void();
1474
1475 if (!OCFS2_IS_VALID_DINODE(fe)) {
1476 OCFS2_RO_ON_INVALID_DINODE(alloc_inode->i_sb, fe);
1477 status = -EIO;
1478 goto bail;
1479 }
1480 BUG_ON((count + start_bit) > ocfs2_bits_per_group(cl));
1481
1482 mlog(0, "suballocator %"MLFu64": freeing %u bits from group %"MLFu64
1483 ", starting at %u\n",
1484 OCFS2_I(alloc_inode)->ip_blkno, count, bg_blkno,
1485 start_bit);
1486
1487 status = ocfs2_read_block(osb, bg_blkno, &group_bh, OCFS2_BH_CACHED,
1488 alloc_inode);
1489 if (status < 0) {
1490 mlog_errno(status);
1491 goto bail;
1492 }
1493
1494 group = (struct ocfs2_group_desc *) group_bh->b_data;
1495 if (!OCFS2_IS_VALID_GROUP_DESC(group)) {
1496 OCFS2_RO_ON_INVALID_GROUP_DESC(alloc_inode->i_sb, group);
1497 status = -EIO;
1498 goto bail;
1499 }
1500 BUG_ON((count + start_bit) > le16_to_cpu(group->bg_bits));
1501
1502 status = ocfs2_block_group_clear_bits(handle, alloc_inode,
1503 group, group_bh,
1504 start_bit, count);
1505 if (status < 0) {
1506 mlog_errno(status);
1507 goto bail;
1508 }
1509
1510 status = ocfs2_journal_access(handle, alloc_inode, alloc_bh,
1511 OCFS2_JOURNAL_ACCESS_WRITE);
1512 if (status < 0) {
1513 mlog_errno(status);
1514 goto bail;
1515 }
1516
1517 le32_add_cpu(&cl->cl_recs[le16_to_cpu(group->bg_chain)].c_free,
1518 count);
1519 tmp_used = le32_to_cpu(fe->id1.bitmap1.i_used);
1520 fe->id1.bitmap1.i_used = cpu_to_le32(tmp_used - count);
1521
1522 status = ocfs2_journal_dirty(handle, alloc_bh);
1523 if (status < 0) {
1524 mlog_errno(status);
1525 goto bail;
1526 }
1527
1528bail:
1529 if (group_bh)
1530 brelse(group_bh);
1531
1532 mlog_exit(status);
1533 return status;
1534}
1535
1536static inline u64 ocfs2_which_suballoc_group(u64 block, unsigned int bit)
1537{
1538 u64 group = block - (u64) bit;
1539
1540 return group;
1541}
1542
1543int ocfs2_free_dinode(struct ocfs2_journal_handle *handle,
1544 struct inode *inode_alloc_inode,
1545 struct buffer_head *inode_alloc_bh,
1546 struct ocfs2_dinode *di)
1547{
1548 u64 blk = le64_to_cpu(di->i_blkno);
1549 u16 bit = le16_to_cpu(di->i_suballoc_bit);
1550 u64 bg_blkno = ocfs2_which_suballoc_group(blk, bit);
1551
1552 return ocfs2_free_suballoc_bits(handle, inode_alloc_inode,
1553 inode_alloc_bh, bit, bg_blkno, 1);
1554}
1555
1556int ocfs2_free_extent_block(struct ocfs2_journal_handle *handle,
1557 struct inode *eb_alloc_inode,
1558 struct buffer_head *eb_alloc_bh,
1559 struct ocfs2_extent_block *eb)
1560{
1561 u64 blk = le64_to_cpu(eb->h_blkno);
1562 u16 bit = le16_to_cpu(eb->h_suballoc_bit);
1563 u64 bg_blkno = ocfs2_which_suballoc_group(blk, bit);
1564
1565 return ocfs2_free_suballoc_bits(handle, eb_alloc_inode, eb_alloc_bh,
1566 bit, bg_blkno, 1);
1567}
1568
1569int ocfs2_free_clusters(struct ocfs2_journal_handle *handle,
1570 struct inode *bitmap_inode,
1571 struct buffer_head *bitmap_bh,
1572 u64 start_blk,
1573 unsigned int num_clusters)
1574{
1575 int status;
1576 u16 bg_start_bit;
1577 u64 bg_blkno;
1578 struct ocfs2_dinode *fe;
1579
1580 /* You can't ever have a contiguous set of clusters
1581 * bigger than a block group bitmap so we never have to worry
1582 * about looping on them. */
1583
1584 mlog_entry_void();
1585
1586 /* This is expensive. We can safely remove once this stuff has
1587 * gotten tested really well. */
1588 BUG_ON(start_blk != ocfs2_clusters_to_blocks(bitmap_inode->i_sb, ocfs2_blocks_to_clusters(bitmap_inode->i_sb, start_blk)));
1589
1590 fe = (struct ocfs2_dinode *) bitmap_bh->b_data;
1591
1592 ocfs2_block_to_cluster_group(bitmap_inode, start_blk, &bg_blkno,
1593 &bg_start_bit);
1594
1595 mlog(0, "want to free %u clusters starting at block %"MLFu64"\n",
1596 num_clusters, start_blk);
1597 mlog(0, "bg_blkno = %"MLFu64", bg_start_bit = %u\n",
1598 bg_blkno, bg_start_bit);
1599
1600 status = ocfs2_free_suballoc_bits(handle, bitmap_inode, bitmap_bh,
1601 bg_start_bit, bg_blkno,
1602 num_clusters);
1603 if (status < 0)
1604 mlog_errno(status);
1605
1606 mlog_exit(status);
1607 return status;
1608}
1609
1610static inline void ocfs2_debug_bg(struct ocfs2_group_desc *bg)
1611{
1612 printk("Block Group:\n");
1613 printk("bg_signature: %s\n", bg->bg_signature);
1614 printk("bg_size: %u\n", bg->bg_size);
1615 printk("bg_bits: %u\n", bg->bg_bits);
1616 printk("bg_free_bits_count: %u\n", bg->bg_free_bits_count);
1617 printk("bg_chain: %u\n", bg->bg_chain);
1618 printk("bg_generation: %u\n", le32_to_cpu(bg->bg_generation));
1619 printk("bg_next_group: %"MLFu64"\n", bg->bg_next_group);
1620 printk("bg_parent_dinode: %"MLFu64"\n", bg->bg_parent_dinode);
1621 printk("bg_blkno: %"MLFu64"\n", bg->bg_blkno);
1622}
1623
1624static inline void ocfs2_debug_suballoc_inode(struct ocfs2_dinode *fe)
1625{
1626 int i;
1627
1628 printk("Suballoc Inode %"MLFu64":\n", fe->i_blkno);
1629 printk("i_signature: %s\n", fe->i_signature);
1630 printk("i_size: %"MLFu64"\n", fe->i_size);
1631 printk("i_clusters: %u\n", fe->i_clusters);
1632 printk("i_generation: %u\n",
1633 le32_to_cpu(fe->i_generation));
1634 printk("id1.bitmap1.i_used: %u\n",
1635 le32_to_cpu(fe->id1.bitmap1.i_used));
1636 printk("id1.bitmap1.i_total: %u\n",
1637 le32_to_cpu(fe->id1.bitmap1.i_total));
1638 printk("id2.i_chain.cl_cpg: %u\n", fe->id2.i_chain.cl_cpg);
1639 printk("id2.i_chain.cl_bpc: %u\n", fe->id2.i_chain.cl_bpc);
1640 printk("id2.i_chain.cl_count: %u\n", fe->id2.i_chain.cl_count);
1641 printk("id2.i_chain.cl_next_free_rec: %u\n",
1642 fe->id2.i_chain.cl_next_free_rec);
1643 for(i = 0; i < fe->id2.i_chain.cl_next_free_rec; i++) {
1644 printk("fe->id2.i_chain.cl_recs[%d].c_free: %u\n", i,
1645 fe->id2.i_chain.cl_recs[i].c_free);
1646 printk("fe->id2.i_chain.cl_recs[%d].c_total: %u\n", i,
1647 fe->id2.i_chain.cl_recs[i].c_total);
1648 printk("fe->id2.i_chain.cl_recs[%d].c_blkno: %"MLFu64"\n", i,
1649 fe->id2.i_chain.cl_recs[i].c_blkno);
1650 }
1651}
diff --git a/fs/ocfs2/suballoc.h b/fs/ocfs2/suballoc.h
new file mode 100644
index 000000000000..a76c82a7ceac
--- /dev/null
+++ b/fs/ocfs2/suballoc.h
@@ -0,0 +1,132 @@
1/* -*- mode: c; c-basic-offset: 8; -*-
2 * vim: noexpandtab sw=8 ts=8 sts=0:
3 *
4 * suballoc.h
5 *
6 * Defines sub allocator api
7 *
8 * Copyright (C) 2003, 2004 Oracle. All rights reserved.
9 *
10 * This program is free software; you can redistribute it and/or
11 * modify it under the terms of the GNU General Public
12 * License as published by the Free Software Foundation; either
13 * version 2 of the License, or (at your option) any later version.
14 *
15 * This program is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
18 * General Public License for more details.
19 *
20 * You should have received a copy of the GNU General Public
21 * License along with this program; if not, write to the
22 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
23 * Boston, MA 02111-1307, USA.
24 */
25
26#ifndef _CHAINALLOC_H_
27#define _CHAINALLOC_H_
28
/*
 * Function type of the group search routine stored in
 * ocfs2_alloc_context.ac_group_search.
 *
 * NOTE(review): the parameters are unnamed here.  Presumably they are
 * (allocator inode, group descriptor bh, bits wanted, minimum bits,
 * out: bit offset, out: bits found) - confirm against the
 * implementations in suballoc.c.
 */
typedef int (group_search_t)(struct inode *,
			     struct buffer_head *,
			     u32,
			     u32,
			     u16 *,
			     u16 *);
35
/*
 * State of one allocation reservation.  The claim functions check
 * ac_which and ac_handle against their caller and add what they hand
 * out to ac_bits_given.
 */
struct ocfs2_alloc_context {
	struct inode *ac_inode; /* which bitmap are we allocating from? */
	struct buffer_head *ac_bh; /* file entry bh */
	u32 ac_bits_wanted; /* total bits reserved by this context */
	u32 ac_bits_given; /* bits claimed from it so far */
/* Values for ac_which: the kind of allocator this context draws on. */
#define OCFS2_AC_USE_LOCAL 1
#define OCFS2_AC_USE_MAIN 2
#define OCFS2_AC_USE_INODE 3
#define OCFS2_AC_USE_META 4
	u32 ac_which;
	/* journal handle the claim functions insist on being called with */
	struct ocfs2_journal_handle *ac_handle;

	/* these are used by the chain search */
	u16 ac_chain; /* index of the chain to search */
	int ac_allow_chain_relink; /* nonzero: the search may relink */
	group_search_t *ac_group_search;
};
53
54void ocfs2_free_alloc_context(struct ocfs2_alloc_context *ac);
55static inline int ocfs2_alloc_context_bits_left(struct ocfs2_alloc_context *ac)
56{
57 return ac->ac_bits_wanted - ac->ac_bits_given;
58}
59
60int ocfs2_reserve_new_metadata(struct ocfs2_super *osb,
61 struct ocfs2_journal_handle *handle,
62 struct ocfs2_dinode *fe,
63 struct ocfs2_alloc_context **ac);
64int ocfs2_reserve_new_inode(struct ocfs2_super *osb,
65 struct ocfs2_journal_handle *handle,
66 struct ocfs2_alloc_context **ac);
67int ocfs2_reserve_clusters(struct ocfs2_super *osb,
68 struct ocfs2_journal_handle *handle,
69 u32 bits_wanted,
70 struct ocfs2_alloc_context **ac);
71
72int ocfs2_claim_metadata(struct ocfs2_super *osb,
73 struct ocfs2_journal_handle *handle,
74 struct ocfs2_alloc_context *ac,
75 u32 bits_wanted,
76 u16 *suballoc_bit_start,
77 u32 *num_bits,
78 u64 *blkno_start);
79int ocfs2_claim_new_inode(struct ocfs2_super *osb,
80 struct ocfs2_journal_handle *handle,
81 struct ocfs2_alloc_context *ac,
82 u16 *suballoc_bit,
83 u64 *fe_blkno);
84int ocfs2_claim_clusters(struct ocfs2_super *osb,
85 struct ocfs2_journal_handle *handle,
86 struct ocfs2_alloc_context *ac,
87 u32 min_clusters,
88 u32 *cluster_start,
89 u32 *num_clusters);
90
91int ocfs2_free_dinode(struct ocfs2_journal_handle *handle,
92 struct inode *inode_alloc_inode,
93 struct buffer_head *inode_alloc_bh,
94 struct ocfs2_dinode *di);
95int ocfs2_free_extent_block(struct ocfs2_journal_handle *handle,
96 struct inode *eb_alloc_inode,
97 struct buffer_head *eb_alloc_bh,
98 struct ocfs2_extent_block *eb);
99int ocfs2_free_clusters(struct ocfs2_journal_handle *handle,
100 struct inode *bitmap_inode,
101 struct buffer_head *bitmap_bh,
102 u64 start_blk,
103 unsigned int num_clusters);
104
105static inline u32 ocfs2_cluster_from_desc(struct ocfs2_super *osb,
106 u64 bg_blkno)
107{
108 /* This should work for all block group descriptors as only
109 * the 1st group descriptor of the cluster bitmap is
110 * different. */
111
112 if (bg_blkno == osb->first_cluster_group_blkno)
113 return 0;
114
115 /* the rest of the block groups are located at the beginning
116 * of their 1st cluster, so a direct translation just
117 * works. */
118 return ocfs2_blocks_to_clusters(osb->sb, bg_blkno);
119}
120
121static inline int ocfs2_is_cluster_bitmap(struct inode *inode)
122{
123 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
124 return osb->bitmap_blkno == OCFS2_I(inode)->ip_blkno;
125}
126
127/* This is for local alloc ONLY. Others should use the task-specific
128 * apis above. */
129int ocfs2_reserve_cluster_bitmap_bits(struct ocfs2_super *osb,
130 struct ocfs2_alloc_context *ac);
131
132#endif /* _CHAINALLOC_H_ */
diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c
new file mode 100644
index 000000000000..48bf7f0ce544
--- /dev/null
+++ b/fs/ocfs2/super.c
@@ -0,0 +1,1733 @@
1/* -*- mode: c; c-basic-offset: 8; -*-
2 * vim: noexpandtab sw=8 ts=8 sts=0:
3 *
4 * super.c
5 *
6 * load/unload driver, mount/dismount volumes
7 *
8 * Copyright (C) 2002, 2004 Oracle. All rights reserved.
9 *
10 * This program is free software; you can redistribute it and/or
11 * modify it under the terms of the GNU General Public
12 * License as published by the Free Software Foundation; either
13 * version 2 of the License, or (at your option) any later version.
14 *
15 * This program is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
18 * General Public License for more details.
19 *
20 * You should have received a copy of the GNU General Public
21 * License along with this program; if not, write to the
22 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
23 * Boston, MA 02111-1307, USA.
24 */
25
26#include <linux/module.h>
27#include <linux/fs.h>
28#include <linux/types.h>
29#include <linux/slab.h>
30#include <linux/highmem.h>
31#include <linux/utsname.h>
32#include <linux/init.h>
33#include <linux/random.h>
34#include <linux/statfs.h>
35#include <linux/moduleparam.h>
36#include <linux/blkdev.h>
37#include <linux/socket.h>
38#include <linux/inet.h>
39#include <linux/parser.h>
40#include <linux/crc32.h>
41#include <linux/debugfs.h>
42
43#include <cluster/nodemanager.h>
44
45#define MLOG_MASK_PREFIX ML_SUPER
46#include <cluster/masklog.h>
47
48#include "ocfs2.h"
49
50/* this should be the only file to include a version 1 header */
51#include "ocfs1_fs_compat.h"
52
53#include "alloc.h"
54#include "dlmglue.h"
55#include "export.h"
56#include "extent_map.h"
57#include "heartbeat.h"
58#include "inode.h"
59#include "journal.h"
60#include "localalloc.h"
61#include "namei.h"
62#include "slot_map.h"
63#include "super.h"
64#include "sysfile.h"
65#include "uptodate.h"
66#include "ver.h"
67#include "vote.h"
68
69#include "buffer_head_io.h"
70
/*
 * Globals
 */
/* NOTE(review): what this lock protects is not visible in this part of
 * the file; presumably the globals below - confirm. */
static spinlock_t ocfs2_globals_lock = SPIN_LOCK_UNLOCKED;

static u32 osb_id; /* Keeps track of next available OSB Id */

/* Slab cache that ocfs2_alloc_inode()/ocfs2_destroy_inode() take
 * struct ocfs2_inode_info objects from and return them to. */
static kmem_cache_t *ocfs2_inode_cachep = NULL;

kmem_cache_t *ocfs2_lock_cache = NULL;

/* OCFS2 needs to schedule several different types of work which
 * require cluster locking, disk I/O, recovery waits, etc. Since these
 * types of work tend to be heavy we avoid using the kernel events
 * workqueue and schedule on our own. */
struct workqueue_struct *ocfs2_wq = NULL;

static struct dentry *ocfs2_debugfs_root = NULL;
89
90MODULE_AUTHOR("Oracle");
91MODULE_LICENSE("GPL");
92
93static int ocfs2_parse_options(struct super_block *sb, char *options,
94 unsigned long *mount_opt, int is_remount);
95static void ocfs2_put_super(struct super_block *sb);
96static int ocfs2_mount_volume(struct super_block *sb);
97static int ocfs2_remount(struct super_block *sb, int *flags, char *data);
98static void ocfs2_dismount_volume(struct super_block *sb, int mnt_err);
99static int ocfs2_initialize_mem_caches(void);
100static void ocfs2_free_mem_caches(void);
101static void ocfs2_delete_osb(struct ocfs2_super *osb);
102
103static int ocfs2_statfs(struct super_block *sb, struct kstatfs *buf);
104
105static int ocfs2_sync_fs(struct super_block *sb, int wait);
106
107static int ocfs2_init_global_system_inodes(struct ocfs2_super *osb);
108static int ocfs2_init_local_system_inodes(struct ocfs2_super *osb);
109static int ocfs2_release_system_inodes(struct ocfs2_super *osb);
110static int ocfs2_fill_local_node_info(struct ocfs2_super *osb);
111static int ocfs2_check_volume(struct ocfs2_super *osb);
112static int ocfs2_verify_volume(struct ocfs2_dinode *di,
113 struct buffer_head *bh,
114 u32 sectsize);
115static int ocfs2_initialize_super(struct super_block *sb,
116 struct buffer_head *bh,
117 int sector_size);
118static int ocfs2_get_sector(struct super_block *sb,
119 struct buffer_head **bh,
120 int block,
121 int sect_size);
122static void ocfs2_write_super(struct super_block *sb);
123static struct inode *ocfs2_alloc_inode(struct super_block *sb);
124static void ocfs2_destroy_inode(struct inode *inode);
125
126static unsigned long long ocfs2_max_file_offset(unsigned int blockshift);
127
/* Superblock operations for ocfs2.  The handlers that have a static
 * prototype earlier in this file are defined here; NOTE(review):
 * drop_inode/clear_inode/delete_inode presumably come from inode.h -
 * confirm. */
static struct super_operations ocfs2_sops = {
	.statfs = ocfs2_statfs,
	.alloc_inode = ocfs2_alloc_inode,
	.destroy_inode = ocfs2_destroy_inode,
	.drop_inode = ocfs2_drop_inode,
	.clear_inode = ocfs2_clear_inode,
	.delete_inode = ocfs2_delete_inode,
	.sync_fs = ocfs2_sync_fs,
	.write_super = ocfs2_write_super,
	.put_super = ocfs2_put_super,
	.remount_fs = ocfs2_remount,
};
140
/* Identifiers for the mount options listed in the tokens table.
 * Opt_err is the value carried by the table's terminating entry. */
enum {
	Opt_barrier,
	Opt_err_panic,
	Opt_err_ro,
	Opt_intr,
	Opt_nointr,
	Opt_hb_none,
	Opt_hb_local,
	Opt_data_ordered,
	Opt_data_writeback,
	Opt_err,
};
153
/* Mount option patterns and the Opt_* value each one maps to.  The
 * {Opt_err, NULL} entry terminates the table.  NOTE(review):
 * presumably consumed by ocfs2_parse_options() - confirm. */
static match_table_t tokens = {
	{Opt_barrier, "barrier=%u"},
	{Opt_err_panic, "errors=panic"},
	{Opt_err_ro, "errors=remount-ro"},
	{Opt_intr, "intr"},
	{Opt_nointr, "nointr"},
	{Opt_hb_none, OCFS2_HB_NONE},
	{Opt_hb_local, OCFS2_HB_LOCAL},
	{Opt_data_ordered, "data=ordered"},
	{Opt_data_writeback, "data=writeback"},
	{Opt_err, NULL}
};
166
167/*
168 * write_super and sync_fs ripped right out of ext3.
169 */
170static void ocfs2_write_super(struct super_block *sb)
171{
172 if (down_trylock(&sb->s_lock) == 0)
173 BUG();
174 sb->s_dirt = 0;
175}
176
177static int ocfs2_sync_fs(struct super_block *sb, int wait)
178{
179 int status = 0;
180 tid_t target;
181 struct ocfs2_super *osb = OCFS2_SB(sb);
182
183 sb->s_dirt = 0;
184
185 if (ocfs2_is_hard_readonly(osb))
186 return -EROFS;
187
188 if (wait) {
189 status = ocfs2_flush_truncate_log(osb);
190 if (status < 0)
191 mlog_errno(status);
192 } else {
193 ocfs2_schedule_truncate_log_flush(osb, 0);
194 }
195
196 if (journal_start_commit(OCFS2_SB(sb)->journal->j_journal, &target)) {
197 if (wait)
198 log_wait_commit(OCFS2_SB(sb)->journal->j_journal,
199 target);
200 }
201 return 0;
202}
203
204static int ocfs2_init_global_system_inodes(struct ocfs2_super *osb)
205{
206 struct inode *new = NULL;
207 int status = 0;
208 int i;
209
210 mlog_entry_void();
211
212 new = ocfs2_iget(osb, osb->root_blkno);
213 if (IS_ERR(new)) {
214 status = PTR_ERR(new);
215 mlog_errno(status);
216 goto bail;
217 }
218 osb->root_inode = new;
219
220 new = ocfs2_iget(osb, osb->system_dir_blkno);
221 if (IS_ERR(new)) {
222 status = PTR_ERR(new);
223 mlog_errno(status);
224 goto bail;
225 }
226 osb->sys_root_inode = new;
227
228 for (i = OCFS2_FIRST_ONLINE_SYSTEM_INODE;
229 i <= OCFS2_LAST_GLOBAL_SYSTEM_INODE; i++) {
230 new = ocfs2_get_system_file_inode(osb, i, osb->slot_num);
231 if (!new) {
232 ocfs2_release_system_inodes(osb);
233 status = -EINVAL;
234 mlog_errno(status);
235 /* FIXME: Should ERROR_RO_FS */
236 mlog(ML_ERROR, "Unable to load system inode %d, "
237 "possibly corrupt fs?", i);
238 goto bail;
239 }
240 // the array now has one ref, so drop this one
241 iput(new);
242 }
243
244bail:
245 mlog_exit(status);
246 return status;
247}
248
249static int ocfs2_init_local_system_inodes(struct ocfs2_super *osb)
250{
251 struct inode *new = NULL;
252 int status = 0;
253 int i;
254
255 mlog_entry_void();
256
257 for (i = OCFS2_LAST_GLOBAL_SYSTEM_INODE + 1;
258 i < NUM_SYSTEM_INODES;
259 i++) {
260 new = ocfs2_get_system_file_inode(osb, i, osb->slot_num);
261 if (!new) {
262 ocfs2_release_system_inodes(osb);
263 status = -EINVAL;
264 mlog(ML_ERROR, "status=%d, sysfile=%d, slot=%d\n",
265 status, i, osb->slot_num);
266 goto bail;
267 }
268 /* the array now has one ref, so drop this one */
269 iput(new);
270 }
271
272bail:
273 mlog_exit(status);
274 return status;
275}
276
277static int ocfs2_release_system_inodes(struct ocfs2_super *osb)
278{
279 int status = 0, i;
280 struct inode *inode;
281
282 mlog_entry_void();
283
284 for (i = 0; i < NUM_SYSTEM_INODES; i++) {
285 inode = osb->system_inodes[i];
286 if (inode) {
287 iput(inode);
288 osb->system_inodes[i] = NULL;
289 }
290 }
291
292 inode = osb->sys_root_inode;
293 if (inode) {
294 iput(inode);
295 osb->sys_root_inode = NULL;
296 }
297
298 inode = osb->root_inode;
299 if (inode) {
300 iput(inode);
301 osb->root_inode = NULL;
302 }
303
304 mlog_exit(status);
305 return status;
306}
307
308/* We're allocating fs objects, use GFP_NOFS */
309static struct inode *ocfs2_alloc_inode(struct super_block *sb)
310{
311 struct ocfs2_inode_info *oi;
312
313 oi = kmem_cache_alloc(ocfs2_inode_cachep, SLAB_NOFS);
314 if (!oi)
315 return NULL;
316
317 return &oi->vfs_inode;
318}
319
320static void ocfs2_destroy_inode(struct inode *inode)
321{
322 kmem_cache_free(ocfs2_inode_cachep, OCFS2_I(inode));
323}
324
325/* From xfs_super.c:xfs_max_file_offset
326 * Copyright (c) 2000-2004 Silicon Graphics, Inc.
327 */
328static unsigned long long ocfs2_max_file_offset(unsigned int blockshift)
329{
330 unsigned int pagefactor = 1;
331 unsigned int bitshift = BITS_PER_LONG - 1;
332
333 /* Figure out maximum filesize, on Linux this can depend on
334 * the filesystem blocksize (on 32 bit platforms).
335 * __block_prepare_write does this in an [unsigned] long...
336 * page->index << (PAGE_CACHE_SHIFT - bbits)
337 * So, for page sized blocks (4K on 32 bit platforms),
338 * this wraps at around 8Tb (hence MAX_LFS_FILESIZE which is
339 * (((u64)PAGE_CACHE_SIZE << (BITS_PER_LONG-1))-1)
340 * but for smaller blocksizes it is less (bbits = log2 bsize).
341 * Note1: get_block_t takes a long (implicit cast from above)
342 * Note2: The Large Block Device (LBD and HAVE_SECTOR_T) patch
343 * can optionally convert the [unsigned] long from above into
344 * an [unsigned] long long.
345 */
346
347#if BITS_PER_LONG == 32
348# if defined(CONFIG_LBD)
349 BUG_ON(sizeof(sector_t) != 8);
350 pagefactor = PAGE_CACHE_SIZE;
351 bitshift = BITS_PER_LONG;
352# else
353 pagefactor = PAGE_CACHE_SIZE >> (PAGE_CACHE_SHIFT - blockshift);
354# endif
355#endif
356
357 return (((unsigned long long)pagefactor) << bitshift) - 1;
358}
359
/*
 * remount_fs handler.  Parses the new option string, refuses any
 * change of heartbeat mode or data mode, and switches the volume
 * between read-only and read-write when *flags asks for it.
 *
 * Returns 0 on success, -EINVAL for unparsable or disallowed options
 * or unsupported RO-compat features, and -EROFS when the volume is
 * hard read-only or is being made writeable after a previous error.
 * The parsed options are stored in osb only when 0 is returned.
 */
static int ocfs2_remount(struct super_block *sb, int *flags, char *data)
{
	int incompat_features;
	int ret = 0;
	unsigned long parsed_options;
	struct ocfs2_super *osb = OCFS2_SB(sb);

	/* The final argument (is_remount) is 1 here. */
	if (!ocfs2_parse_options(sb, data, &parsed_options, 1)) {
		ret = -EINVAL;
		goto out;
	}

	if ((osb->s_mount_opt & OCFS2_MOUNT_HB_LOCAL) !=
	    (parsed_options & OCFS2_MOUNT_HB_LOCAL)) {
		ret = -EINVAL;
		mlog(ML_ERROR, "Cannot change heartbeat mode on remount\n");
		goto out;
	}

	if ((osb->s_mount_opt & OCFS2_MOUNT_DATA_WRITEBACK) !=
	    (parsed_options & OCFS2_MOUNT_DATA_WRITEBACK)) {
		ret = -EINVAL;
		mlog(ML_ERROR, "Cannot change data mode on remount\n");
		goto out;
	}

	/* We're going to/from readonly mode. */
	if ((*flags & MS_RDONLY) != (sb->s_flags & MS_RDONLY)) {
		/* Lock here so the check of HARD_RO and the potential
		 * setting of SOFT_RO is atomic. */
		spin_lock(&osb->osb_lock);
		if (osb->osb_flags & OCFS2_OSB_HARD_RO) {
			mlog(ML_ERROR, "Remount on readonly device is forbidden.\n");
			ret = -EROFS;
			goto unlock_osb;
		}

		if (*flags & MS_RDONLY) {
			mlog(0, "Going to ro mode.\n");
			sb->s_flags |= MS_RDONLY;
			osb->osb_flags |= OCFS2_OSB_SOFT_RO;
		} else {
			mlog(0, "Making ro filesystem writeable.\n");

			if (osb->osb_flags & OCFS2_OSB_ERROR_FS) {
				mlog(ML_ERROR, "Cannot remount RDWR "
				     "filesystem due to previous errors.\n");
				ret = -EROFS;
				goto unlock_osb;
			}
			/* Any RO-compat feature bit outside the
			 * supported set blocks read-write access. */
			incompat_features = OCFS2_HAS_RO_COMPAT_FEATURE(sb, ~OCFS2_FEATURE_RO_COMPAT_SUPP);
			if (incompat_features) {
				mlog(ML_ERROR, "Cannot remount RDWR because "
				     "of unsupported optional features "
				     "(%x).\n", incompat_features);
				ret = -EINVAL;
				goto unlock_osb;
			}
			sb->s_flags &= ~MS_RDONLY;
			osb->osb_flags &= ~OCFS2_OSB_SOFT_RO;
		}
/* Reached both on success and from the failure paths above, all of
 * which hold osb_lock. */
unlock_osb:
		spin_unlock(&osb->osb_lock);
	}

	if (!ret) {
		if (!ocfs2_is_hard_readonly(osb))
			ocfs2_set_journal_params(osb);

		/* Only save off the new mount options in case of a successful
		 * remount. */
		osb->s_mount_opt = parsed_options;
	}
out:
	return ret;
}
436
437static int ocfs2_sb_probe(struct super_block *sb,
438 struct buffer_head **bh,
439 int *sector_size)
440{
441 int status = 0, tmpstat;
442 struct ocfs1_vol_disk_hdr *hdr;
443 struct ocfs2_dinode *di;
444 int blksize;
445
446 *bh = NULL;
447
448 /* may be > 512 */
449 *sector_size = bdev_hardsect_size(sb->s_bdev);
450 if (*sector_size > OCFS2_MAX_BLOCKSIZE) {
451 mlog(ML_ERROR, "Hardware sector size too large: %d (max=%d)\n",
452 *sector_size, OCFS2_MAX_BLOCKSIZE);
453 status = -EINVAL;
454 goto bail;
455 }
456
457 /* Can this really happen? */
458 if (*sector_size < OCFS2_MIN_BLOCKSIZE)
459 *sector_size = OCFS2_MIN_BLOCKSIZE;
460
461 /* check block zero for old format */
462 status = ocfs2_get_sector(sb, bh, 0, *sector_size);
463 if (status < 0) {
464 mlog_errno(status);
465 goto bail;
466 }
467 hdr = (struct ocfs1_vol_disk_hdr *) (*bh)->b_data;
468 if (hdr->major_version == OCFS1_MAJOR_VERSION) {
469 mlog(ML_ERROR, "incompatible version: %u.%u\n",
470 hdr->major_version, hdr->minor_version);
471 status = -EINVAL;
472 }
473 if (memcmp(hdr->signature, OCFS1_VOLUME_SIGNATURE,
474 strlen(OCFS1_VOLUME_SIGNATURE)) == 0) {
475 mlog(ML_ERROR, "incompatible volume signature: %8s\n",
476 hdr->signature);
477 status = -EINVAL;
478 }
479 brelse(*bh);
480 *bh = NULL;
481 if (status < 0) {
482 mlog(ML_ERROR, "This is an ocfs v1 filesystem which must be "
483 "upgraded before mounting with ocfs v2\n");
484 goto bail;
485 }
486
487 /*
488 * Now check at magic offset for 512, 1024, 2048, 4096
489 * blocksizes. 4096 is the maximum blocksize because it is
490 * the minimum clustersize.
491 */
492 status = -EINVAL;
493 for (blksize = *sector_size;
494 blksize <= OCFS2_MAX_BLOCKSIZE;
495 blksize <<= 1) {
496 tmpstat = ocfs2_get_sector(sb, bh,
497 OCFS2_SUPER_BLOCK_BLKNO,
498 blksize);
499 if (tmpstat < 0) {
500 status = tmpstat;
501 mlog_errno(status);
502 goto bail;
503 }
504 di = (struct ocfs2_dinode *) (*bh)->b_data;
505 status = ocfs2_verify_volume(di, *bh, blksize);
506 if (status >= 0)
507 goto bail;
508 brelse(*bh);
509 *bh = NULL;
510 if (status != -EAGAIN)
511 break;
512 }
513
514bail:
515 return status;
516}
517
518static int ocfs2_fill_super(struct super_block *sb, void *data, int silent)
519{
520 struct dentry *root;
521 int status, sector_size;
522 unsigned long parsed_opt;
523 struct inode *inode = NULL;
524 struct ocfs2_super *osb = NULL;
525 struct buffer_head *bh = NULL;
526
527 mlog_entry("%p, %p, %i", sb, data, silent);
528
529 /* for now we only have one cluster/node, make sure we see it
530 * in the heartbeat universe */
531 if (!o2hb_check_local_node_heartbeating()) {
532 status = -EINVAL;
533 goto read_super_error;
534 }
535
536 /* probe for superblock */
537 status = ocfs2_sb_probe(sb, &bh, &sector_size);
538 if (status < 0) {
539 mlog(ML_ERROR, "superblock probe failed!\n");
540 goto read_super_error;
541 }
542
543 status = ocfs2_initialize_super(sb, bh, sector_size);
544 osb = OCFS2_SB(sb);
545 if (status < 0) {
546 mlog_errno(status);
547 goto read_super_error;
548 }
549 brelse(bh);
550 bh = NULL;
551
552 if (!ocfs2_parse_options(sb, data, &parsed_opt, 0)) {
553 status = -EINVAL;
554 goto read_super_error;
555 }
556 osb->s_mount_opt = parsed_opt;
557
558 sb->s_magic = OCFS2_SUPER_MAGIC;
559
560 /* Hard readonly mode only if: bdev_read_only, MS_RDONLY,
561 * heartbeat=none */
562 if (bdev_read_only(sb->s_bdev)) {
563 if (!(sb->s_flags & MS_RDONLY)) {
564 status = -EACCES;
565 mlog(ML_ERROR, "Readonly device detected but readonly "
566 "mount was not specified.\n");
567 goto read_super_error;
568 }
569
570 /* You should not be able to start a local heartbeat
571 * on a readonly device. */
572 if (osb->s_mount_opt & OCFS2_MOUNT_HB_LOCAL) {
573 status = -EROFS;
574 mlog(ML_ERROR, "Local heartbeat specified on readonly "
575 "device.\n");
576 goto read_super_error;
577 }
578
579 status = ocfs2_check_journals_nolocks(osb);
580 if (status < 0) {
581 if (status == -EROFS)
582 mlog(ML_ERROR, "Recovery required on readonly "
583 "file system, but write access is "
584 "unavailable.\n");
585 else
586 mlog_errno(status);
587 goto read_super_error;
588 }
589
590 ocfs2_set_ro_flag(osb, 1);
591
592 printk(KERN_NOTICE "Readonly device detected. No cluster "
593 "services will be utilized for this mount. Recovery "
594 "will be skipped.\n");
595 }
596
597 if (!ocfs2_is_hard_readonly(osb)) {
598 /* If this isn't a hard readonly mount, then we need
599 * to make sure that heartbeat is in a valid state,
600 * and that we mark ourselves soft readonly is -oro
601 * was specified. */
602 if (!(osb->s_mount_opt & OCFS2_MOUNT_HB_LOCAL)) {
603 mlog(ML_ERROR, "No heartbeat for device (%s)\n",
604 sb->s_id);
605 status = -EINVAL;
606 goto read_super_error;
607 }
608
609 if (sb->s_flags & MS_RDONLY)
610 ocfs2_set_ro_flag(osb, 0);
611 }
612
613 osb->osb_debug_root = debugfs_create_dir(osb->uuid_str,
614 ocfs2_debugfs_root);
615 if (!osb->osb_debug_root) {
616 status = -EINVAL;
617 mlog(ML_ERROR, "Unable to create per-mount debugfs root.\n");
618 goto read_super_error;
619 }
620
621 status = ocfs2_mount_volume(sb);
622 if (osb->root_inode)
623 inode = igrab(osb->root_inode);
624
625 if (status < 0)
626 goto read_super_error;
627
628 if (!inode) {
629 status = -EIO;
630 mlog_errno(status);
631 goto read_super_error;
632 }
633
634 root = d_alloc_root(inode);
635 if (!root) {
636 status = -ENOMEM;
637 mlog_errno(status);
638 goto read_super_error;
639 }
640
641 sb->s_root = root;
642
643 ocfs2_complete_mount_recovery(osb);
644
645 printk("ocfs2: Mounting device (%u,%u) on (node %d, slot %d) with %s "
646 "data mode.\n",
647 MAJOR(sb->s_dev), MINOR(sb->s_dev), osb->node_num,
648 osb->slot_num,
649 osb->s_mount_opt & OCFS2_MOUNT_DATA_WRITEBACK ? "writeback" :
650 "ordered");
651
652 atomic_set(&osb->vol_state, VOLUME_MOUNTED);
653 wake_up(&osb->osb_mount_event);
654
655 mlog_exit(status);
656 return status;
657
658read_super_error:
659 if (bh != NULL)
660 brelse(bh);
661
662 if (inode)
663 iput(inode);
664
665 if (osb) {
666 atomic_set(&osb->vol_state, VOLUME_DISABLED);
667 wake_up(&osb->osb_mount_event);
668 ocfs2_dismount_volume(sb, 1);
669 }
670
671 mlog_exit(status);
672 return status;
673}
674
675static struct super_block *ocfs2_get_sb(struct file_system_type *fs_type,
676 int flags,
677 const char *dev_name,
678 void *data)
679{
680 return get_sb_bdev(fs_type, flags, dev_name, data, ocfs2_fill_super);
681}
682
683static struct file_system_type ocfs2_fs_type = {
684 .owner = THIS_MODULE,
685 .name = "ocfs2",
686 .get_sb = ocfs2_get_sb, /* is this called when we mount
687 * the fs? */
688 .kill_sb = kill_block_super, /* set to the generic one
689 * right now, but do we
690 * need to change that? */
691 .fs_flags = FS_REQUIRES_DEV,
692 .next = NULL
693};
694
695static int ocfs2_parse_options(struct super_block *sb,
696 char *options,
697 unsigned long *mount_opt,
698 int is_remount)
699{
700 int status;
701 char *p;
702
703 mlog_entry("remount: %d, options: \"%s\"\n", is_remount,
704 options ? options : "(none)");
705
706 *mount_opt = 0;
707
708 if (!options) {
709 status = 1;
710 goto bail;
711 }
712
713 while ((p = strsep(&options, ",")) != NULL) {
714 int token, option;
715 substring_t args[MAX_OPT_ARGS];
716
717 if (!*p)
718 continue;
719
720 token = match_token(p, tokens, args);
721 switch (token) {
722 case Opt_hb_local:
723 *mount_opt |= OCFS2_MOUNT_HB_LOCAL;
724 break;
725 case Opt_hb_none:
726 *mount_opt &= ~OCFS2_MOUNT_HB_LOCAL;
727 break;
728 case Opt_barrier:
729 if (match_int(&args[0], &option)) {
730 status = 0;
731 goto bail;
732 }
733 if (option)
734 *mount_opt |= OCFS2_MOUNT_BARRIER;
735 else
736 *mount_opt &= ~OCFS2_MOUNT_BARRIER;
737 break;
738 case Opt_intr:
739 *mount_opt &= ~OCFS2_MOUNT_NOINTR;
740 break;
741 case Opt_nointr:
742 *mount_opt |= OCFS2_MOUNT_NOINTR;
743 break;
744 case Opt_err_panic:
745 *mount_opt |= OCFS2_MOUNT_ERRORS_PANIC;
746 break;
747 case Opt_err_ro:
748 *mount_opt &= ~OCFS2_MOUNT_ERRORS_PANIC;
749 break;
750 case Opt_data_ordered:
751 *mount_opt &= ~OCFS2_MOUNT_DATA_WRITEBACK;
752 break;
753 case Opt_data_writeback:
754 *mount_opt |= OCFS2_MOUNT_DATA_WRITEBACK;
755 break;
756 default:
757 mlog(ML_ERROR,
758 "Unrecognized mount option \"%s\" "
759 "or missing value\n", p);
760 status = 0;
761 goto bail;
762 }
763 }
764
765 status = 1;
766
767bail:
768 mlog_exit(status);
769 return status;
770}
771
772static int __init ocfs2_init(void)
773{
774 int status;
775
776 mlog_entry_void();
777
778 ocfs2_print_version();
779
780 if (init_ocfs2_extent_maps())
781 return -ENOMEM;
782
783 status = init_ocfs2_uptodate_cache();
784 if (status < 0) {
785 mlog_errno(status);
786 goto leave;
787 }
788
789 status = ocfs2_initialize_mem_caches();
790 if (status < 0) {
791 mlog_errno(status);
792 goto leave;
793 }
794
795 ocfs2_wq = create_singlethread_workqueue("ocfs2_wq");
796 if (!ocfs2_wq) {
797 status = -ENOMEM;
798 goto leave;
799 }
800
801 spin_lock(&ocfs2_globals_lock);
802 osb_id = 0;
803 spin_unlock(&ocfs2_globals_lock);
804
805 ocfs2_debugfs_root = debugfs_create_dir("ocfs2", NULL);
806 if (!ocfs2_debugfs_root) {
807 status = -EFAULT;
808 mlog(ML_ERROR, "Unable to create ocfs2 debugfs root.\n");
809 }
810
811leave:
812 if (status < 0) {
813 ocfs2_free_mem_caches();
814 exit_ocfs2_uptodate_cache();
815 exit_ocfs2_extent_maps();
816 }
817
818 mlog_exit(status);
819
820 if (status >= 0) {
821 return register_filesystem(&ocfs2_fs_type);
822 } else
823 return -1;
824}
825
826static void __exit ocfs2_exit(void)
827{
828 mlog_entry_void();
829
830 if (ocfs2_wq) {
831 flush_workqueue(ocfs2_wq);
832 destroy_workqueue(ocfs2_wq);
833 }
834
835 debugfs_remove(ocfs2_debugfs_root);
836
837 ocfs2_free_mem_caches();
838
839 unregister_filesystem(&ocfs2_fs_type);
840
841 exit_ocfs2_extent_maps();
842
843 exit_ocfs2_uptodate_cache();
844
845 mlog_exit_void();
846}
847
/*
 * Release @sb: sync the block device, then dismount the volume
 * (mnt_err == 0, i.e. this is a regular unmount rather than a failed
 * mount).
 */
static void ocfs2_put_super(struct super_block *sb)
{
	mlog_entry("(0x%p)\n", sb);

	/* Sync first, then tear the volume down. */
	ocfs2_sync_blockdev(sb);
	ocfs2_dismount_volume(sb, 0);

	mlog_exit_void();
}
857
858static int ocfs2_statfs(struct super_block *sb, struct kstatfs *buf)
859{
860 struct ocfs2_super *osb;
861 u32 numbits, freebits;
862 int status;
863 struct ocfs2_dinode *bm_lock;
864 struct buffer_head *bh = NULL;
865 struct inode *inode = NULL;
866
867 mlog_entry("(%p, %p)\n", sb, buf);
868
869 osb = OCFS2_SB(sb);
870
871 inode = ocfs2_get_system_file_inode(osb,
872 GLOBAL_BITMAP_SYSTEM_INODE,
873 OCFS2_INVALID_SLOT);
874 if (!inode) {
875 mlog(ML_ERROR, "failed to get bitmap inode\n");
876 status = -EIO;
877 goto bail;
878 }
879
880 status = ocfs2_meta_lock(inode, NULL, &bh, 0);
881 if (status < 0) {
882 mlog_errno(status);
883 goto bail;
884 }
885
886 bm_lock = (struct ocfs2_dinode *) bh->b_data;
887
888 numbits = le32_to_cpu(bm_lock->id1.bitmap1.i_total);
889 freebits = numbits - le32_to_cpu(bm_lock->id1.bitmap1.i_used);
890
891 buf->f_type = OCFS2_SUPER_MAGIC;
892 buf->f_bsize = sb->s_blocksize;
893 buf->f_namelen = OCFS2_MAX_FILENAME_LEN;
894 buf->f_blocks = ((sector_t) numbits) *
895 (osb->s_clustersize >> osb->sb->s_blocksize_bits);
896 buf->f_bfree = ((sector_t) freebits) *
897 (osb->s_clustersize >> osb->sb->s_blocksize_bits);
898 buf->f_bavail = buf->f_bfree;
899 buf->f_files = numbits;
900 buf->f_ffree = freebits;
901
902 brelse(bh);
903
904 ocfs2_meta_unlock(inode, 0);
905 status = 0;
906bail:
907 if (inode)
908 iput(inode);
909
910 mlog_exit(status);
911
912 return status;
913}
914
915static void ocfs2_inode_init_once(void *data,
916 kmem_cache_t *cachep,
917 unsigned long flags)
918{
919 struct ocfs2_inode_info *oi = data;
920
921 if ((flags & (SLAB_CTOR_VERIFY|SLAB_CTOR_CONSTRUCTOR)) ==
922 SLAB_CTOR_CONSTRUCTOR) {
923 oi->ip_flags = 0;
924 oi->ip_open_count = 0;
925 spin_lock_init(&oi->ip_lock);
926 ocfs2_extent_map_init(&oi->vfs_inode);
927 INIT_LIST_HEAD(&oi->ip_handle_list);
928 INIT_LIST_HEAD(&oi->ip_io_markers);
929 oi->ip_handle = NULL;
930 oi->ip_created_trans = 0;
931 oi->ip_last_trans = 0;
932 oi->ip_dir_start_lookup = 0;
933
934 init_rwsem(&oi->ip_alloc_sem);
935 init_MUTEX(&(oi->ip_io_sem));
936
937 oi->ip_blkno = 0ULL;
938 oi->ip_clusters = 0;
939
940 ocfs2_lock_res_init_once(&oi->ip_rw_lockres);
941 ocfs2_lock_res_init_once(&oi->ip_meta_lockres);
942 ocfs2_lock_res_init_once(&oi->ip_data_lockres);
943
944 ocfs2_metadata_cache_init(&oi->vfs_inode);
945
946 inode_init_once(&oi->vfs_inode);
947 }
948}
949
950static int ocfs2_initialize_mem_caches(void)
951{
952 ocfs2_inode_cachep = kmem_cache_create("ocfs2_inode_cache",
953 sizeof(struct ocfs2_inode_info),
954 0, SLAB_HWCACHE_ALIGN|SLAB_RECLAIM_ACCOUNT,
955 ocfs2_inode_init_once, NULL);
956 if (!ocfs2_inode_cachep)
957 return -ENOMEM;
958
959 ocfs2_lock_cache = kmem_cache_create("ocfs2_lock",
960 sizeof(struct ocfs2_journal_lock),
961 0,
962 SLAB_NO_REAP|SLAB_HWCACHE_ALIGN,
963 NULL, NULL);
964 if (!ocfs2_lock_cache)
965 return -ENOMEM;
966
967 return 0;
968}
969
970static void ocfs2_free_mem_caches(void)
971{
972 if (ocfs2_inode_cachep)
973 kmem_cache_destroy(ocfs2_inode_cachep);
974 if (ocfs2_lock_cache)
975 kmem_cache_destroy(ocfs2_lock_cache);
976
977 ocfs2_inode_cachep = NULL;
978 ocfs2_lock_cache = NULL;
979}
980
981static int ocfs2_get_sector(struct super_block *sb,
982 struct buffer_head **bh,
983 int block,
984 int sect_size)
985{
986 if (!sb_set_blocksize(sb, sect_size)) {
987 mlog(ML_ERROR, "unable to set blocksize\n");
988 return -EIO;
989 }
990
991 *bh = sb_getblk(sb, block);
992 if (!*bh) {
993 mlog_errno(-EIO);
994 return -EIO;
995 }
996 lock_buffer(*bh);
997 if (!buffer_dirty(*bh))
998 clear_buffer_uptodate(*bh);
999 unlock_buffer(*bh);
1000 ll_rw_block(READ, 1, bh);
1001 wait_on_buffer(*bh);
1002 return 0;
1003}
1004
1005/* ocfs2 1.0 only allows one cluster and node identity per kernel image. */
1006static int ocfs2_fill_local_node_info(struct ocfs2_super *osb)
1007{
1008 int status;
1009
1010 /* XXX hold a ref on the node while mounte? easy enough, if
1011 * desirable. */
1012 osb->node_num = o2nm_this_node();
1013 if (osb->node_num == O2NM_MAX_NODES) {
1014 mlog(ML_ERROR, "could not find this host's node number\n");
1015 status = -ENOENT;
1016 goto bail;
1017 }
1018
1019 mlog(ML_NOTICE, "I am node %d\n", osb->node_num);
1020
1021 status = 0;
1022bail:
1023 return status;
1024}
1025
/*
 * Bring the cluster side of a mount up, in order: local node info,
 * heartbeat callbacks, dlm, net handlers, then -- under the super
 * block lock -- slot selection, node-local system inodes, journal
 * and local alloc checks, truncate log and finally the mount vote.
 *
 * A hard readonly mount skips all of this and returns 0.  Otherwise
 * returns 0 on success or the negative errno of the first step that
 * failed; the super block lock is always dropped again before
 * returning.
 */
static int ocfs2_mount_volume(struct super_block *sb)
{
	struct ocfs2_super *osb = OCFS2_SB(sb);
	int status = 0;

	mlog_entry_void();

	if (ocfs2_is_hard_readonly(osb))
		goto leave;

	status = ocfs2_fill_local_node_info(osb);
	if (status < 0) {
		mlog_errno(status);
		goto leave;
	}

	status = ocfs2_register_hb_callbacks(osb);
	if (status < 0) {
		mlog_errno(status);
		goto leave;
	}

	status = ocfs2_dlm_init(osb);
	if (status < 0) {
		mlog_errno(status);
		goto leave;
	}

	/* requires vote_thread to be running. */
	status = ocfs2_register_net_handlers(osb);
	if (status < 0) {
		mlog_errno(status);
		goto leave;
	}

	status = ocfs2_super_lock(osb, 1);
	if (status < 0) {
		mlog_errno(status);
		goto leave;
	}

	/* From here on every exit goes through "unlock". */

	/* This will load up the node map and add ourselves to it. */
	status = ocfs2_find_slot(osb);
	if (status < 0) {
		mlog_errno(status);
		goto unlock;
	}

	ocfs2_populate_mounted_map(osb);

	/* load all node-local system inodes */
	status = ocfs2_init_local_system_inodes(osb);
	if (status < 0) {
		mlog_errno(status);
		goto unlock;
	}

	status = ocfs2_check_volume(osb);
	if (status < 0) {
		mlog_errno(status);
		goto unlock;
	}

	status = ocfs2_truncate_log_init(osb);
	if (status < 0) {
		mlog_errno(status);
		goto unlock;
	}

	/* This should be sent *after* we recovered our journal as it
	 * will cause other nodes to unmark us as needing
	 * recovery. However, we need to send it *before* dropping the
	 * super block lock as otherwise their recovery threads might
	 * try to clean us up while we're live! */
	status = ocfs2_request_mount_vote(osb);
	if (status < 0)
		mlog_errno(status);

unlock:
	ocfs2_super_unlock(osb, 1);

leave:
	mlog_exit(status);
	return status;
}
1113
1114/* we can't grab the goofy sem lock from inside wait_event, so we use
1115 * memory barriers to make sure that we'll see the null task before
1116 * being woken up */
1117static int ocfs2_recovery_thread_running(struct ocfs2_super *osb)
1118{
1119 mb();
1120 return osb->recovery_thread_task != NULL;
1121}
1122
1123static void ocfs2_dismount_volume(struct super_block *sb, int mnt_err)
1124{
1125 int tmp;
1126 struct ocfs2_super *osb = NULL;
1127
1128 mlog_entry("(0x%p)\n", sb);
1129
1130 BUG_ON(!sb);
1131 osb = OCFS2_SB(sb);
1132 BUG_ON(!osb);
1133
1134 ocfs2_shutdown_local_alloc(osb);
1135
1136 ocfs2_truncate_log_shutdown(osb);
1137
1138 /* disable any new recovery threads and wait for any currently
1139 * running ones to exit. Do this before setting the vol_state. */
1140 down(&osb->recovery_lock);
1141 osb->disable_recovery = 1;
1142 up(&osb->recovery_lock);
1143 wait_event(osb->recovery_event, !ocfs2_recovery_thread_running(osb));
1144
1145 /* At this point, we know that no more recovery threads can be
1146 * launched, so wait for any recovery completion work to
1147 * complete. */
1148 flush_workqueue(ocfs2_wq);
1149
1150 ocfs2_journal_shutdown(osb);
1151
1152 ocfs2_sync_blockdev(sb);
1153
1154 /* No dlm means we've failed during mount, so skip all the
1155 * steps which depended on that to complete. */
1156 if (osb->dlm) {
1157 tmp = ocfs2_super_lock(osb, 1);
1158 if (tmp < 0) {
1159 mlog_errno(tmp);
1160 return;
1161 }
1162
1163 tmp = ocfs2_request_umount_vote(osb);
1164 if (tmp < 0)
1165 mlog_errno(tmp);
1166
1167 if (osb->slot_num != OCFS2_INVALID_SLOT)
1168 ocfs2_put_slot(osb);
1169
1170 ocfs2_super_unlock(osb, 1);
1171 }
1172
1173 ocfs2_release_system_inodes(osb);
1174
1175 if (osb->dlm) {
1176 ocfs2_unregister_net_handlers(osb);
1177
1178 ocfs2_dlm_shutdown(osb);
1179 }
1180
1181 ocfs2_clear_hb_callbacks(osb);
1182
1183 debugfs_remove(osb->osb_debug_root);
1184
1185 if (!mnt_err)
1186 ocfs2_stop_heartbeat(osb);
1187
1188 atomic_set(&osb->vol_state, VOLUME_DISMOUNTED);
1189
1190 printk("ocfs2: Unmounting device (%u,%u) on (node %d)\n",
1191 MAJOR(osb->sb->s_dev), MINOR(osb->sb->s_dev), osb->node_num);
1192
1193 ocfs2_delete_osb(osb);
1194 kfree(osb);
1195 sb->s_dev = 0;
1196 sb->s_fs_info = NULL;
1197}
1198
1199static int ocfs2_setup_osb_uuid(struct ocfs2_super *osb, const unsigned char *uuid,
1200 unsigned uuid_bytes)
1201{
1202 int i, ret;
1203 char *ptr;
1204
1205 BUG_ON(uuid_bytes != OCFS2_VOL_UUID_LEN);
1206
1207 osb->uuid_str = kcalloc(1, OCFS2_VOL_UUID_LEN * 2 + 1, GFP_KERNEL);
1208 if (osb->uuid_str == NULL)
1209 return -ENOMEM;
1210
1211 memcpy(osb->uuid, uuid, OCFS2_VOL_UUID_LEN);
1212
1213 for (i = 0, ptr = osb->uuid_str; i < OCFS2_VOL_UUID_LEN; i++) {
1214 /* print with null */
1215 ret = snprintf(ptr, 3, "%02X", uuid[i]);
1216 if (ret != 2) /* drop super cleans up */
1217 return -EINVAL;
1218 /* then only advance past the last char */
1219 ptr += 2;
1220 }
1221
1222 return 0;
1223}
1224
1225static int ocfs2_initialize_super(struct super_block *sb,
1226 struct buffer_head *bh,
1227 int sector_size)
1228{
1229 int status = 0;
1230 int i;
1231 struct ocfs2_dinode *di = NULL;
1232 struct inode *inode = NULL;
1233 struct buffer_head *bitmap_bh = NULL;
1234 struct ocfs2_journal *journal;
1235 __le32 uuid_net_key;
1236 struct ocfs2_super *osb;
1237
1238 mlog_entry_void();
1239
1240 osb = kcalloc(1, sizeof(struct ocfs2_super), GFP_KERNEL);
1241 if (!osb) {
1242 status = -ENOMEM;
1243 mlog_errno(status);
1244 goto bail;
1245 }
1246
1247 sb->s_fs_info = osb;
1248 sb->s_op = &ocfs2_sops;
1249 sb->s_export_op = &ocfs2_export_ops;
1250 sb->s_flags |= MS_NOATIME;
1251 /* this is needed to support O_LARGEFILE */
1252 sb->s_maxbytes = ocfs2_max_file_offset(sb->s_blocksize_bits);
1253
1254 osb->sb = sb;
1255 /* Save off for ocfs2_rw_direct */
1256 osb->s_sectsize_bits = blksize_bits(sector_size);
1257 if (!osb->s_sectsize_bits)
1258 BUG();
1259
1260 osb->net_response_ids = 0;
1261 spin_lock_init(&osb->net_response_lock);
1262 INIT_LIST_HEAD(&osb->net_response_list);
1263
1264 INIT_LIST_HEAD(&osb->osb_net_handlers);
1265 init_waitqueue_head(&osb->recovery_event);
1266 spin_lock_init(&osb->vote_task_lock);
1267 init_waitqueue_head(&osb->vote_event);
1268 osb->vote_work_sequence = 0;
1269 osb->vote_wake_sequence = 0;
1270 INIT_LIST_HEAD(&osb->blocked_lock_list);
1271 osb->blocked_lock_count = 0;
1272 INIT_LIST_HEAD(&osb->vote_list);
1273 spin_lock_init(&osb->osb_lock);
1274
1275 atomic_set(&osb->alloc_stats.moves, 0);
1276 atomic_set(&osb->alloc_stats.local_data, 0);
1277 atomic_set(&osb->alloc_stats.bitmap_data, 0);
1278 atomic_set(&osb->alloc_stats.bg_allocs, 0);
1279 atomic_set(&osb->alloc_stats.bg_extends, 0);
1280
1281 ocfs2_init_node_maps(osb);
1282
1283 snprintf(osb->dev_str, sizeof(osb->dev_str), "%u,%u",
1284 MAJOR(osb->sb->s_dev), MINOR(osb->sb->s_dev));
1285
1286 init_MUTEX(&osb->recovery_lock);
1287
1288 osb->disable_recovery = 0;
1289 osb->recovery_thread_task = NULL;
1290
1291 init_waitqueue_head(&osb->checkpoint_event);
1292 atomic_set(&osb->needs_checkpoint, 0);
1293
1294 osb->node_num = O2NM_INVALID_NODE_NUM;
1295 osb->slot_num = OCFS2_INVALID_SLOT;
1296
1297 osb->local_alloc_state = OCFS2_LA_UNUSED;
1298 osb->local_alloc_bh = NULL;
1299
1300 ocfs2_setup_hb_callbacks(osb);
1301
1302 init_waitqueue_head(&osb->osb_mount_event);
1303
1304 osb->vol_label = kmalloc(OCFS2_MAX_VOL_LABEL_LEN, GFP_KERNEL);
1305 if (!osb->vol_label) {
1306 mlog(ML_ERROR, "unable to alloc vol label\n");
1307 status = -ENOMEM;
1308 goto bail;
1309 }
1310
1311 osb->uuid = kmalloc(OCFS2_VOL_UUID_LEN, GFP_KERNEL);
1312 if (!osb->uuid) {
1313 mlog(ML_ERROR, "unable to alloc uuid\n");
1314 status = -ENOMEM;
1315 goto bail;
1316 }
1317
1318 di = (struct ocfs2_dinode *)bh->b_data;
1319
1320 osb->max_slots = le16_to_cpu(di->id2.i_super.s_max_slots);
1321 if (osb->max_slots > OCFS2_MAX_SLOTS || osb->max_slots == 0) {
1322 mlog(ML_ERROR, "Invalid number of node slots (%u)\n",
1323 osb->max_slots);
1324 status = -EINVAL;
1325 goto bail;
1326 }
1327 mlog(ML_NOTICE, "max_slots for this device: %u\n", osb->max_slots);
1328
1329 osb->s_feature_compat =
1330 le32_to_cpu(OCFS2_RAW_SB(di)->s_feature_compat);
1331 osb->s_feature_ro_compat =
1332 le32_to_cpu(OCFS2_RAW_SB(di)->s_feature_ro_compat);
1333 osb->s_feature_incompat =
1334 le32_to_cpu(OCFS2_RAW_SB(di)->s_feature_incompat);
1335
1336 if ((i = OCFS2_HAS_INCOMPAT_FEATURE(osb->sb, ~OCFS2_FEATURE_INCOMPAT_SUPP))) {
1337 mlog(ML_ERROR, "couldn't mount because of unsupported "
1338 "optional features (%x).\n", i);
1339 status = -EINVAL;
1340 goto bail;
1341 }
1342 if (!(osb->sb->s_flags & MS_RDONLY) &&
1343 (i = OCFS2_HAS_RO_COMPAT_FEATURE(osb->sb, ~OCFS2_FEATURE_RO_COMPAT_SUPP))) {
1344 mlog(ML_ERROR, "couldn't mount RDWR because of "
1345 "unsupported optional features (%x).\n", i);
1346 status = -EINVAL;
1347 goto bail;
1348 }
1349
1350 get_random_bytes(&osb->s_next_generation, sizeof(u32));
1351
1352 /* FIXME
1353 * This should be done in ocfs2_journal_init(), but unknown
1354 * ordering issues will cause the filesystem to crash.
1355 * If anyone wants to figure out what part of the code
1356 * refers to osb->journal before ocfs2_journal_init() is run,
1357 * be my guest.
1358 */
1359 /* initialize our journal structure */
1360
1361 journal = kcalloc(1, sizeof(struct ocfs2_journal), GFP_KERNEL);
1362 if (!journal) {
1363 mlog(ML_ERROR, "unable to alloc journal\n");
1364 status = -ENOMEM;
1365 goto bail;
1366 }
1367 osb->journal = journal;
1368 journal->j_osb = osb;
1369
1370 atomic_set(&journal->j_num_trans, 0);
1371 init_rwsem(&journal->j_trans_barrier);
1372 init_waitqueue_head(&journal->j_checkpointed);
1373 spin_lock_init(&journal->j_lock);
1374 journal->j_trans_id = (unsigned long) 1;
1375 INIT_LIST_HEAD(&journal->j_la_cleanups);
1376 INIT_WORK(&journal->j_recovery_work, ocfs2_complete_recovery, osb);
1377 journal->j_state = OCFS2_JOURNAL_FREE;
1378
1379 /* get some pseudo constants for clustersize bits */
1380 osb->s_clustersize_bits =
1381 le32_to_cpu(di->id2.i_super.s_clustersize_bits);
1382 osb->s_clustersize = 1 << osb->s_clustersize_bits;
1383 mlog(0, "clusterbits=%d\n", osb->s_clustersize_bits);
1384
1385 if (osb->s_clustersize < OCFS2_MIN_CLUSTERSIZE ||
1386 osb->s_clustersize > OCFS2_MAX_CLUSTERSIZE) {
1387 mlog(ML_ERROR, "Volume has invalid cluster size (%d)\n",
1388 osb->s_clustersize);
1389 status = -EINVAL;
1390 goto bail;
1391 }
1392
1393 if (ocfs2_clusters_to_blocks(osb->sb, le32_to_cpu(di->i_clusters) - 1)
1394 > (u32)~0UL) {
1395 mlog(ML_ERROR, "Volume might try to write to blocks beyond "
1396 "what jbd can address in 32 bits.\n");
1397 status = -EINVAL;
1398 goto bail;
1399 }
1400
1401 if (ocfs2_setup_osb_uuid(osb, di->id2.i_super.s_uuid,
1402 sizeof(di->id2.i_super.s_uuid))) {
1403 mlog(ML_ERROR, "Out of memory trying to setup our uuid.\n");
1404 status = -ENOMEM;
1405 goto bail;
1406 }
1407
1408 memcpy(&uuid_net_key, &osb->uuid[i], sizeof(osb->net_key));
1409 osb->net_key = le32_to_cpu(uuid_net_key);
1410
1411 strncpy(osb->vol_label, di->id2.i_super.s_label, 63);
1412 osb->vol_label[63] = '\0';
1413 osb->root_blkno = le64_to_cpu(di->id2.i_super.s_root_blkno);
1414 osb->system_dir_blkno = le64_to_cpu(di->id2.i_super.s_system_dir_blkno);
1415 osb->first_cluster_group_blkno =
1416 le64_to_cpu(di->id2.i_super.s_first_cluster_group);
1417 osb->fs_generation = le32_to_cpu(di->i_fs_generation);
1418 mlog(0, "vol_label: %s\n", osb->vol_label);
1419 mlog(0, "uuid: %s\n", osb->uuid_str);
1420 mlog(0, "root_blkno=%"MLFu64", system_dir_blkno=%"MLFu64"\n",
1421 osb->root_blkno, osb->system_dir_blkno);
1422
1423 osb->osb_dlm_debug = ocfs2_new_dlm_debug();
1424 if (!osb->osb_dlm_debug) {
1425 status = -ENOMEM;
1426 mlog_errno(status);
1427 goto bail;
1428 }
1429
1430 atomic_set(&osb->vol_state, VOLUME_INIT);
1431
1432 /* load root, system_dir, and all global system inodes */
1433 status = ocfs2_init_global_system_inodes(osb);
1434 if (status < 0) {
1435 mlog_errno(status);
1436 goto bail;
1437 }
1438
1439 /*
1440 * global bitmap
1441 */
1442 inode = ocfs2_get_system_file_inode(osb, GLOBAL_BITMAP_SYSTEM_INODE,
1443 OCFS2_INVALID_SLOT);
1444 if (!inode) {
1445 status = -EINVAL;
1446 mlog_errno(status);
1447 goto bail;
1448 }
1449
1450 osb->bitmap_blkno = OCFS2_I(inode)->ip_blkno;
1451
1452 status = ocfs2_read_block(osb, osb->bitmap_blkno, &bitmap_bh, 0,
1453 inode);
1454 iput(inode);
1455 if (status < 0) {
1456 mlog_errno(status);
1457 goto bail;
1458 }
1459
1460 di = (struct ocfs2_dinode *) bitmap_bh->b_data;
1461 osb->bitmap_cpg = le16_to_cpu(di->id2.i_chain.cl_cpg);
1462 osb->num_clusters = le32_to_cpu(di->id1.bitmap1.i_total);
1463 brelse(bitmap_bh);
1464 mlog(0, "cluster bitmap inode: %"MLFu64", clusters per group: %u\n",
1465 osb->bitmap_blkno, osb->bitmap_cpg);
1466
1467 status = ocfs2_init_slot_info(osb);
1468 if (status < 0) {
1469 mlog_errno(status);
1470 goto bail;
1471 }
1472
1473 /* Link this osb onto the global linked list of all osb structures. */
1474 /* The Global Link List is mainted for the whole driver . */
1475 spin_lock(&ocfs2_globals_lock);
1476 osb->osb_id = osb_id;
1477 if (osb_id < OCFS2_MAX_OSB_ID)
1478 osb_id++;
1479 else {
1480 mlog(ML_ERROR, "Too many volumes mounted\n");
1481 status = -ENOMEM;
1482 }
1483 spin_unlock(&ocfs2_globals_lock);
1484
1485bail:
1486 mlog_exit(status);
1487 return status;
1488}
1489
1490/*
1491 * will return: -EAGAIN if it is ok to keep searching for superblocks
1492 * -EINVAL if there is a bad superblock
1493 * 0 on success
1494 */
1495static int ocfs2_verify_volume(struct ocfs2_dinode *di,
1496 struct buffer_head *bh,
1497 u32 blksz)
1498{
1499 int status = -EAGAIN;
1500
1501 mlog_entry_void();
1502
1503 if (memcmp(di->i_signature, OCFS2_SUPER_BLOCK_SIGNATURE,
1504 strlen(OCFS2_SUPER_BLOCK_SIGNATURE)) == 0) {
1505 status = -EINVAL;
1506 if ((1 << le32_to_cpu(di->id2.i_super.s_blocksize_bits)) != blksz) {
1507 mlog(ML_ERROR, "found superblock with incorrect block "
1508 "size: found %u, should be %u\n",
1509 1 << le32_to_cpu(di->id2.i_super.s_blocksize_bits),
1510 blksz);
1511 } else if (le16_to_cpu(di->id2.i_super.s_major_rev_level) !=
1512 OCFS2_MAJOR_REV_LEVEL ||
1513 le16_to_cpu(di->id2.i_super.s_minor_rev_level) !=
1514 OCFS2_MINOR_REV_LEVEL) {
1515 mlog(ML_ERROR, "found superblock with bad version: "
1516 "found %u.%u, should be %u.%u\n",
1517 le16_to_cpu(di->id2.i_super.s_major_rev_level),
1518 le16_to_cpu(di->id2.i_super.s_minor_rev_level),
1519 OCFS2_MAJOR_REV_LEVEL,
1520 OCFS2_MINOR_REV_LEVEL);
1521 } else if (bh->b_blocknr != le64_to_cpu(di->i_blkno)) {
1522 mlog(ML_ERROR, "bad block number on superblock: "
1523 "found %"MLFu64", should be %llu\n",
1524 di->i_blkno, (unsigned long long)bh->b_blocknr);
1525 } else if (le32_to_cpu(di->id2.i_super.s_clustersize_bits) < 12 ||
1526 le32_to_cpu(di->id2.i_super.s_clustersize_bits) > 20) {
1527 mlog(ML_ERROR, "bad cluster size found: %u\n",
1528 1 << le32_to_cpu(di->id2.i_super.s_clustersize_bits));
1529 } else if (!le64_to_cpu(di->id2.i_super.s_root_blkno)) {
1530 mlog(ML_ERROR, "bad root_blkno: 0\n");
1531 } else if (!le64_to_cpu(di->id2.i_super.s_system_dir_blkno)) {
1532 mlog(ML_ERROR, "bad system_dir_blkno: 0\n");
1533 } else if (le16_to_cpu(di->id2.i_super.s_max_slots) > OCFS2_MAX_SLOTS) {
1534 mlog(ML_ERROR,
1535 "Superblock slots found greater than file system "
1536 "maximum: found %u, max %u\n",
1537 le16_to_cpu(di->id2.i_super.s_max_slots),
1538 OCFS2_MAX_SLOTS);
1539 } else {
1540 /* found it! */
1541 status = 0;
1542 }
1543 }
1544
1545 mlog_exit(status);
1546 return status;
1547}
1548
1549static int ocfs2_check_volume(struct ocfs2_super *osb)
1550{
1551 int status = 0;
1552 int dirty;
1553 struct ocfs2_dinode *local_alloc = NULL; /* only used if we
1554 * recover
1555 * ourselves. */
1556
1557 mlog_entry_void();
1558
1559 /* Init our journal object. */
1560 status = ocfs2_journal_init(osb->journal, &dirty);
1561 if (status < 0) {
1562 mlog(ML_ERROR, "Could not initialize journal!\n");
1563 goto finally;
1564 }
1565
1566 /* If the journal was unmounted cleanly then we don't want to
1567 * recover anything. Otherwise, journal_load will do that
1568 * dirty work for us :) */
1569 if (!dirty) {
1570 status = ocfs2_journal_wipe(osb->journal, 0);
1571 if (status < 0) {
1572 mlog_errno(status);
1573 goto finally;
1574 }
1575 } else {
1576 mlog(ML_NOTICE, "File system was not unmounted cleanly, "
1577 "recovering volume.\n");
1578 }
1579
1580 /* will play back anything left in the journal. */
1581 ocfs2_journal_load(osb->journal);
1582
1583 if (dirty) {
1584 /* recover my local alloc if we didn't unmount cleanly. */
1585 status = ocfs2_begin_local_alloc_recovery(osb,
1586 osb->slot_num,
1587 &local_alloc);
1588 if (status < 0) {
1589 mlog_errno(status);
1590 goto finally;
1591 }
1592 /* we complete the recovery process after we've marked
1593 * ourselves as mounted. */
1594 }
1595
1596 mlog(0, "Journal loaded.\n");
1597
1598 status = ocfs2_load_local_alloc(osb);
1599 if (status < 0) {
1600 mlog_errno(status);
1601 goto finally;
1602 }
1603
1604 if (dirty) {
1605 /* Recovery will be completed after we've mounted the
1606 * rest of the volume. */
1607 osb->dirty = 1;
1608 osb->local_alloc_copy = local_alloc;
1609 local_alloc = NULL;
1610 }
1611
1612 /* go through each journal, trylock it and if you get the
1613 * lock, and it's marked as dirty, set the bit in the recover
1614 * map and launch a recovery thread for it. */
1615 status = ocfs2_mark_dead_nodes(osb);
1616 if (status < 0)
1617 mlog_errno(status);
1618
1619finally:
1620 if (local_alloc)
1621 kfree(local_alloc);
1622
1623 mlog_exit(status);
1624 return status;
1625}
1626
1627/*
1628 * The routine gets called from dismount or close whenever a dismount on
1629 * volume is requested and the osb open count becomes 1.
1630 * It will remove the osb from the global list and also free up all the
1631 * initialized resources and fileobject.
1632 */
1633static void ocfs2_delete_osb(struct ocfs2_super *osb)
1634{
1635 mlog_entry_void();
1636
1637 /* This function assumes that the caller has the main osb resource */
1638
1639 if (osb->slot_info)
1640 ocfs2_free_slot_info(osb->slot_info);
1641
1642 /* FIXME
1643 * This belongs in journal shutdown, but because we have to
1644 * allocate osb->journal at the start of ocfs2_initalize_osb(),
1645 * we free it here.
1646 */
1647 kfree(osb->journal);
1648 if (osb->local_alloc_copy)
1649 kfree(osb->local_alloc_copy);
1650 kfree(osb->uuid_str);
1651 ocfs2_put_dlm_debug(osb->osb_dlm_debug);
1652 memset(osb, 0, sizeof(struct ocfs2_super));
1653
1654 mlog_exit_void();
1655}
1656
1657/* Put OCFS2 into a readonly state, or (if the user specifies it),
1658 * panic(). We do not support continue-on-error operation. */
1659static void ocfs2_handle_error(struct super_block *sb)
1660{
1661 struct ocfs2_super *osb = OCFS2_SB(sb);
1662
1663 if (osb->s_mount_opt & OCFS2_MOUNT_ERRORS_PANIC)
1664 panic("OCFS2: (device %s): panic forced after error\n",
1665 sb->s_id);
1666
1667 ocfs2_set_osb_flag(osb, OCFS2_OSB_ERROR_FS);
1668
1669 if (sb->s_flags & MS_RDONLY &&
1670 (ocfs2_is_soft_readonly(osb) ||
1671 ocfs2_is_hard_readonly(osb)))
1672 return;
1673
1674 printk(KERN_CRIT "File system is now read-only due to the potential "
1675 "of on-disk corruption. Please run fsck.ocfs2 once the file "
1676 "system is unmounted.\n");
1677 sb->s_flags |= MS_RDONLY;
1678 ocfs2_set_ro_flag(osb, 0);
1679}
1680
1681static char error_buf[1024];
1682
1683void __ocfs2_error(struct super_block *sb,
1684 const char *function,
1685 const char *fmt, ...)
1686{
1687 va_list args;
1688
1689 va_start(args, fmt);
1690 vsprintf(error_buf, fmt, args);
1691 va_end(args);
1692
1693 /* Not using mlog here because we want to show the actual
1694 * function the error came from. */
1695 printk(KERN_CRIT "OCFS2: ERROR (device %s): %s: %s\n",
1696 sb->s_id, function, error_buf);
1697
1698 ocfs2_handle_error(sb);
1699}
1700
1701/* Handle critical errors. This is intentionally more drastic than
1702 * ocfs2_handle_error, so we only use for things like journal errors,
1703 * etc. */
1704void __ocfs2_abort(struct super_block* sb,
1705 const char *function,
1706 const char *fmt, ...)
1707{
1708 va_list args;
1709
1710 va_start(args, fmt);
1711 vsprintf(error_buf, fmt, args);
1712 va_end(args);
1713
1714 printk(KERN_CRIT "OCFS2: abort (device %s): %s: %s\n",
1715 sb->s_id, function, error_buf);
1716
1717 /* We don't have the cluster support yet to go straight to
1718 * hard readonly in here. Until then, we want to keep
1719 * ocfs2_abort() so that we can at least mark critical
1720 * errors.
1721 *
1722 * TODO: This should abort the journal and alert other nodes
1723 * that our slot needs recovery. */
1724
1725 /* Force a panic(). This stinks, but it's better than letting
1726 * things continue without having a proper hard readonly
1727 * here. */
1728 OCFS2_SB(sb)->s_mount_opt |= OCFS2_MOUNT_ERRORS_PANIC;
1729 ocfs2_handle_error(sb);
1730}
1731
1732module_init(ocfs2_init);
1733module_exit(ocfs2_exit);
diff --git a/fs/ocfs2/super.h b/fs/ocfs2/super.h
new file mode 100644
index 000000000000..c564177dfbdc
--- /dev/null
+++ b/fs/ocfs2/super.h
@@ -0,0 +1,44 @@
1/* -*- mode: c; c-basic-offset: 8; -*-
2 * vim: noexpandtab sw=8 ts=8 sts=0:
3 *
4 * super.h
5 *
6 * Function prototypes
7 *
8 * Copyright (C) 2002, 2004 Oracle. All rights reserved.
9 *
10 * This program is free software; you can redistribute it and/or
11 * modify it under the terms of the GNU General Public
12 * License as published by the Free Software Foundation; either
13 * version 2 of the License, or (at your option) any later version.
14 *
15 * This program is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
18 * General Public License for more details.
19 *
20 * You should have received a copy of the GNU General Public
21 * License along with this program; if not, write to the
22 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
23 * Boston, MA 021110-1307, USA.
24 */
25
26#ifndef OCFS2_SUPER_H
27#define OCFS2_SUPER_H
28
29extern struct workqueue_struct *ocfs2_wq;
30
31int ocfs2_publish_get_mount_state(struct ocfs2_super *osb,
32 int node_num);
33
/*
 * printf-style error reporting. Callers use the ocfs2_error() and
 * ocfs2_abort() wrappers, which pass __PRETTY_FUNCTION__ as the
 * reporting function's name.
 *
 * The format attribute lets the compiler check each caller's format
 * string against its arguments (fmt is parameter 3, the variadic
 * arguments start at parameter 4).
 */
void __ocfs2_error(struct super_block *sb,
		   const char *function,
		   const char *fmt, ...)
	__attribute__ ((format (printf, 3, 4)));
#define ocfs2_error(sb, fmt, args...) __ocfs2_error(sb, __PRETTY_FUNCTION__, fmt, ##args)

void __ocfs2_abort(struct super_block *sb,
		   const char *function,
		   const char *fmt, ...)
	__attribute__ ((format (printf, 3, 4)));
#define ocfs2_abort(sb, fmt, args...) __ocfs2_abort(sb, __PRETTY_FUNCTION__, fmt, ##args)
43
44#endif /* OCFS2_SUPER_H */
diff --git a/fs/ocfs2/symlink.c b/fs/ocfs2/symlink.c
new file mode 100644
index 000000000000..f6986bd79e75
--- /dev/null
+++ b/fs/ocfs2/symlink.c
@@ -0,0 +1,180 @@
1/* -*- mode: c; c-basic-offset: 8; -*-
2 * vim: noexpandtab sw=8 ts=8 sts=0:
3 *
4 * linux/cluster/ssi/cfs/symlink.c
5 *
6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public License as
8 * published by the Free Software Foundation; either version 2 of
9 * the License, or (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE
14 * or NON INFRINGEMENT. See the GNU General Public License for more
15 * details.
16 *
17 * You should have received a copy of the GNU General Public License
18 * along with this program; if not, write to the Free Software
19 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
20 *
21 * Questions/Comments/Bugfixes to ssic-linux-devel@lists.sourceforge.net
22 *
23 * Copyright (C) 1992 Rick Sladkey
24 *
25 * Optimization changes Copyright (C) 1994 Florian La Roche
26 *
27 * Jun 7 1999, cache symlink lookups in the page cache. -DaveM
28 *
29 * Portions Copyright (C) 2001 Compaq Computer Corporation
30 *
31 * ocfs2 symlink handling code.
32 *
33 * Copyright (C) 2004, 2005 Oracle.
34 *
35 */
36
37#include <linux/fs.h>
38#include <linux/types.h>
39#include <linux/slab.h>
40#include <linux/pagemap.h>
41#include <linux/utsname.h>
42
43#define MLOG_MASK_PREFIX ML_NAMEI
44#include <cluster/masklog.h>
45
46#include "ocfs2.h"
47
48#include "alloc.h"
49#include "file.h"
50#include "inode.h"
51#include "journal.h"
52#include "symlink.h"
53
54#include "buffer_head_io.h"
55
56static char *ocfs2_page_getlink(struct dentry * dentry,
57 struct page **ppage);
58static char *ocfs2_fast_symlink_getlink(struct inode *inode,
59 struct buffer_head **bh);
60
61/* get the link contents into pagecache */
62static char *ocfs2_page_getlink(struct dentry * dentry,
63 struct page **ppage)
64{
65 struct page * page;
66 struct address_space *mapping = dentry->d_inode->i_mapping;
67 page = read_cache_page(mapping, 0,
68 (filler_t *)mapping->a_ops->readpage, NULL);
69 if (IS_ERR(page))
70 goto sync_fail;
71 wait_on_page_locked(page);
72 if (!PageUptodate(page))
73 goto async_fail;
74 *ppage = page;
75 return kmap(page);
76
77async_fail:
78 page_cache_release(page);
79 return ERR_PTR(-EIO);
80
81sync_fail:
82 return (char*)page;
83}
84
85static char *ocfs2_fast_symlink_getlink(struct inode *inode,
86 struct buffer_head **bh)
87{
88 int status;
89 char *link = NULL;
90 struct ocfs2_dinode *fe;
91
92 mlog_entry_void();
93
94 status = ocfs2_read_block(OCFS2_SB(inode->i_sb),
95 OCFS2_I(inode)->ip_blkno,
96 bh,
97 OCFS2_BH_CACHED,
98 inode);
99 if (status < 0) {
100 mlog_errno(status);
101 link = ERR_PTR(status);
102 goto bail;
103 }
104
105 fe = (struct ocfs2_dinode *) (*bh)->b_data;
106 link = (char *) fe->id2.i_symlink;
107bail:
108 mlog_exit(status);
109
110 return link;
111}
112
113static int ocfs2_readlink(struct dentry *dentry,
114 char __user *buffer,
115 int buflen)
116{
117 int ret;
118 char *link;
119 struct buffer_head *bh = NULL;
120 struct inode *inode = dentry->d_inode;
121
122 mlog_entry_void();
123
124 link = ocfs2_fast_symlink_getlink(inode, &bh);
125 if (IS_ERR(link)) {
126 ret = PTR_ERR(link);
127 goto out;
128 }
129
130 ret = vfs_readlink(dentry, buffer, buflen, link);
131
132 brelse(bh);
133out:
134 mlog_exit(ret);
135 return ret;
136}
137
138static void *ocfs2_follow_link(struct dentry *dentry,
139 struct nameidata *nd)
140{
141 int status;
142 char *link;
143 struct inode *inode = dentry->d_inode;
144 struct page *page = NULL;
145 struct buffer_head *bh = NULL;
146
147 if (ocfs2_inode_is_fast_symlink(inode))
148 link = ocfs2_fast_symlink_getlink(inode, &bh);
149 else
150 link = ocfs2_page_getlink(dentry, &page);
151 if (IS_ERR(link)) {
152 status = PTR_ERR(link);
153 mlog_errno(status);
154 goto bail;
155 }
156
157 status = vfs_follow_link(nd, link);
158 if (status)
159 mlog_errno(status);
160bail:
161 if (page) {
162 kunmap(page);
163 page_cache_release(page);
164 }
165 if (bh)
166 brelse(bh);
167
168 return ERR_PTR(status);
169}
170
171struct inode_operations ocfs2_symlink_inode_operations = {
172 .readlink = page_readlink,
173 .follow_link = ocfs2_follow_link,
174 .getattr = ocfs2_getattr,
175};
176struct inode_operations ocfs2_fast_symlink_inode_operations = {
177 .readlink = ocfs2_readlink,
178 .follow_link = ocfs2_follow_link,
179 .getattr = ocfs2_getattr,
180};
diff --git a/fs/ocfs2/symlink.h b/fs/ocfs2/symlink.h
new file mode 100644
index 000000000000..1ea9e4d9e9eb
--- /dev/null
+++ b/fs/ocfs2/symlink.h
@@ -0,0 +1,42 @@
1/* -*- mode: c; c-basic-offset: 8; -*-
2 * vim: noexpandtab sw=8 ts=8 sts=0:
3 *
4 * symlink.h
5 *
6 * Function prototypes
7 *
8 * Copyright (C) 2002, 2004 Oracle. All rights reserved.
9 *
10 * This program is free software; you can redistribute it and/or
11 * modify it under the terms of the GNU General Public
12 * License as published by the Free Software Foundation; either
13 * version 2 of the License, or (at your option) any later version.
14 *
15 * This program is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
18 * General Public License for more details.
19 *
20 * You should have received a copy of the GNU General Public
21 * License along with this program; if not, write to the
22 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
23 * Boston, MA 021110-1307, USA.
24 */
25
26#ifndef OCFS2_SYMLINK_H
27#define OCFS2_SYMLINK_H
28
29extern struct inode_operations ocfs2_symlink_inode_operations;
30extern struct inode_operations ocfs2_fast_symlink_inode_operations;
31
32/*
33 * Test whether an inode is a fast symlink.
34 */
35static inline int ocfs2_inode_is_fast_symlink(struct inode *inode)
36{
37 return (S_ISLNK(inode->i_mode) &&
38 inode->i_blocks == 0);
39}
40
41
42#endif /* OCFS2_SYMLINK_H */
diff --git a/fs/ocfs2/sysfile.c b/fs/ocfs2/sysfile.c
new file mode 100644
index 000000000000..600a8bc5b541
--- /dev/null
+++ b/fs/ocfs2/sysfile.c
@@ -0,0 +1,131 @@
1/* -*- mode: c; c-basic-offset: 8; -*-
2 * vim: noexpandtab sw=8 ts=8 sts=0:
3 *
4 * sysfile.c
5 *
6 * Initialize, read, write, etc. system files.
7 *
8 * Copyright (C) 2002, 2004 Oracle. All rights reserved.
9 *
10 * This program is free software; you can redistribute it and/or
11 * modify it under the terms of the GNU General Public
12 * License as published by the Free Software Foundation; either
13 * version 2 of the License, or (at your option) any later version.
14 *
15 * This program is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
18 * General Public License for more details.
19 *
20 * You should have received a copy of the GNU General Public
21 * License along with this program; if not, write to the
22 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
23 * Boston, MA 021110-1307, USA.
24 */
25
26#include <linux/fs.h>
27#include <linux/types.h>
28#include <linux/slab.h>
29#include <linux/highmem.h>
30
31#include "ocfs2.h"
32
33#define MLOG_MASK_PREFIX ML_INODE
34#include <cluster/masklog.h>
35
36#include "alloc.h"
37#include "dir.h"
38#include "inode.h"
39#include "journal.h"
40#include "sysfile.h"
41
42#include "buffer_head_io.h"
43
44static struct inode * _ocfs2_get_system_file_inode(struct ocfs2_super *osb,
45 int type,
46 u32 slot);
47
48static inline int is_global_system_inode(int type);
49static inline int is_in_system_inode_array(struct ocfs2_super *osb,
50 int type,
51 u32 slot);
52
53static inline int is_global_system_inode(int type)
54{
55 return type >= OCFS2_FIRST_ONLINE_SYSTEM_INODE &&
56 type <= OCFS2_LAST_GLOBAL_SYSTEM_INODE;
57}
58
59static inline int is_in_system_inode_array(struct ocfs2_super *osb,
60 int type,
61 u32 slot)
62{
63 return slot == osb->slot_num || is_global_system_inode(type);
64}
65
66struct inode *ocfs2_get_system_file_inode(struct ocfs2_super *osb,
67 int type,
68 u32 slot)
69{
70 struct inode *inode = NULL;
71 struct inode **arr = NULL;
72
73 /* avoid the lookup if cached in local system file array */
74 if (is_in_system_inode_array(osb, type, slot))
75 arr = &(osb->system_inodes[type]);
76
77 if (arr && ((inode = *arr) != NULL)) {
78 /* get a ref in addition to the array ref */
79 inode = igrab(inode);
80 if (!inode)
81 BUG();
82
83 return inode;
84 }
85
86 /* this gets one ref thru iget */
87 inode = _ocfs2_get_system_file_inode(osb, type, slot);
88
89 /* add one more if putting into array for first time */
90 if (arr && inode) {
91 *arr = igrab(inode);
92 if (!*arr)
93 BUG();
94 }
95 return inode;
96}
97
98static struct inode * _ocfs2_get_system_file_inode(struct ocfs2_super *osb,
99 int type,
100 u32 slot)
101{
102 char namebuf[40];
103 struct inode *inode = NULL;
104 u64 blkno;
105 struct buffer_head *dirent_bh = NULL;
106 struct ocfs2_dir_entry *de = NULL;
107 int status = 0;
108
109 ocfs2_sprintf_system_inode_name(namebuf,
110 sizeof(namebuf),
111 type, slot);
112
113 status = ocfs2_find_files_on_disk(namebuf, strlen(namebuf),
114 &blkno, osb->sys_root_inode,
115 &dirent_bh, &de);
116 if (status < 0) {
117 goto bail;
118 }
119
120 inode = ocfs2_iget(osb, blkno);
121 if (IS_ERR(inode)) {
122 mlog_errno(PTR_ERR(inode));
123 inode = NULL;
124 goto bail;
125 }
126bail:
127 if (dirent_bh)
128 brelse(dirent_bh);
129 return inode;
130}
131
diff --git a/fs/ocfs2/sysfile.h b/fs/ocfs2/sysfile.h
new file mode 100644
index 000000000000..cc9ea661ffc1
--- /dev/null
+++ b/fs/ocfs2/sysfile.h
@@ -0,0 +1,33 @@
1/* -*- mode: c; c-basic-offset: 8; -*-
2 * vim: noexpandtab sw=8 ts=8 sts=0:
3 *
4 * sysfile.h
5 *
6 * Function prototypes
7 *
8 * Copyright (C) 2002, 2004 Oracle. All rights reserved.
9 *
10 * This program is free software; you can redistribute it and/or
11 * modify it under the terms of the GNU General Public
12 * License as published by the Free Software Foundation; either
13 * version 2 of the License, or (at your option) any later version.
14 *
15 * This program is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
18 * General Public License for more details.
19 *
20 * You should have received a copy of the GNU General Public
21 * License along with this program; if not, write to the
22 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
23 * Boston, MA 021110-1307, USA.
24 */
25
26#ifndef OCFS2_SYSFILE_H
27#define OCFS2_SYSFILE_H
28
29struct inode * ocfs2_get_system_file_inode(struct ocfs2_super *osb,
30 int type,
31 u32 slot);
32
33#endif /* OCFS2_SYSFILE_H */
diff --git a/fs/ocfs2/uptodate.c b/fs/ocfs2/uptodate.c
new file mode 100644
index 000000000000..3a0458fd3e1b
--- /dev/null
+++ b/fs/ocfs2/uptodate.c
@@ -0,0 +1,544 @@
1/* -*- mode: c; c-basic-offset: 8; -*-
2 * vim: noexpandtab sw=8 ts=8 sts=0:
3 *
4 * uptodate.c
5 *
6 * Tracking the up-to-date-ness of a local buffer_head with respect to
7 * the cluster.
8 *
9 * Copyright (C) 2002, 2004, 2005 Oracle. All rights reserved.
10 *
11 * This program is free software; you can redistribute it and/or
12 * modify it under the terms of the GNU General Public
13 * License as published by the Free Software Foundation; either
14 * version 2 of the License, or (at your option) any later version.
15 *
16 * This program is distributed in the hope that it will be useful,
17 * but WITHOUT ANY WARRANTY; without even the implied warranty of
18 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
19 * General Public License for more details.
20 *
21 * You should have received a copy of the GNU General Public
22 * License along with this program; if not, write to the
23 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
24 * Boston, MA 021110-1307, USA.
25 *
26 * Standard buffer head caching flags (uptodate, etc) are insufficient
27 * in a clustered environment - a buffer may be marked up to date on
28 * our local node but could have been modified by another cluster
29 * member. As a result an additional (and performant) caching scheme
30 * is required. A further requirement is that we consume as little
31 * memory as possible - we never pin buffer_head structures in order
32 * to cache them.
33 *
34 * We track the existence of up to date buffers on the inodes which
35 * are associated with them. Because we don't want to pin
36 * buffer_heads, this is only a (strong) hint and several other checks
37 * are made in the I/O path to ensure that we don't use a stale or
38 * invalid buffer without going to disk:
39 * - buffer_jbd is used liberally - if a bh is in the journal on
40 * this node then it *must* be up to date.
41 * - the standard buffer_uptodate() macro is used to detect buffers
42 * which may be invalid (even if we have an up to date tracking
43 * item for them)
44 *
45 * For a full understanding of how this code works together, one
46 * should read the callers in dlmglue.c, the I/O functions in
47 * buffer_head_io.c and ocfs2_journal_access in journal.c
48 */
49
50#include <linux/fs.h>
51#include <linux/types.h>
52#include <linux/slab.h>
53#include <linux/highmem.h>
54#include <linux/buffer_head.h>
55#include <linux/rbtree.h>
56#include <linux/jbd.h>
57
58#define MLOG_MASK_PREFIX ML_UPTODATE
59
60#include <cluster/masklog.h>
61
62#include "ocfs2.h"
63
64#include "inode.h"
65#include "uptodate.h"
66
67struct ocfs2_meta_cache_item {
68 struct rb_node c_node;
69 sector_t c_block;
70};
71
72static kmem_cache_t *ocfs2_uptodate_cachep = NULL;
73
74void ocfs2_metadata_cache_init(struct inode *inode)
75{
76 struct ocfs2_inode_info *oi = OCFS2_I(inode);
77 struct ocfs2_caching_info *ci = &oi->ip_metadata_cache;
78
79 oi->ip_flags |= OCFS2_INODE_CACHE_INLINE;
80 ci->ci_num_cached = 0;
81}
82
83/* No lock taken here as 'root' is not expected to be visible to other
84 * processes. */
85static unsigned int ocfs2_purge_copied_metadata_tree(struct rb_root *root)
86{
87 unsigned int purged = 0;
88 struct rb_node *node;
89 struct ocfs2_meta_cache_item *item;
90
91 while ((node = rb_last(root)) != NULL) {
92 item = rb_entry(node, struct ocfs2_meta_cache_item, c_node);
93
94 mlog(0, "Purge item %llu\n",
95 (unsigned long long) item->c_block);
96
97 rb_erase(&item->c_node, root);
98 kmem_cache_free(ocfs2_uptodate_cachep, item);
99
100 purged++;
101 }
102 return purged;
103}
104
105/* Called from locking and called from ocfs2_clear_inode. Dump the
106 * cache for a given inode.
107 *
108 * This function is a few more lines longer than necessary due to some
109 * accounting done here, but I think it's worth tracking down those
110 * bugs sooner -- Mark */
111void ocfs2_metadata_cache_purge(struct inode *inode)
112{
113 struct ocfs2_inode_info *oi = OCFS2_I(inode);
114 unsigned int tree, to_purge, purged;
115 struct ocfs2_caching_info *ci = &oi->ip_metadata_cache;
116 struct rb_root root = RB_ROOT;
117
118 spin_lock(&oi->ip_lock);
119 tree = !(oi->ip_flags & OCFS2_INODE_CACHE_INLINE);
120 to_purge = ci->ci_num_cached;
121
122 mlog(0, "Purge %u %s items from Inode %"MLFu64"\n", to_purge,
123 tree ? "array" : "tree", oi->ip_blkno);
124
125 /* If we're a tree, save off the root so that we can safely
126 * initialize the cache. We do the work to free tree members
127 * without the spinlock. */
128 if (tree)
129 root = ci->ci_cache.ci_tree;
130
131 ocfs2_metadata_cache_init(inode);
132 spin_unlock(&oi->ip_lock);
133
134 purged = ocfs2_purge_copied_metadata_tree(&root);
135 /* If possible, track the number wiped so that we can more
136 * easily detect counting errors. Unfortunately, this is only
137 * meaningful for trees. */
138 if (tree && purged != to_purge)
139 mlog(ML_ERROR, "Inode %"MLFu64", count = %u, purged = %u\n",
140 oi->ip_blkno, to_purge, purged);
141}
142
143/* Returns the index in the cache array, -1 if not found.
144 * Requires ip_lock. */
145static int ocfs2_search_cache_array(struct ocfs2_caching_info *ci,
146 sector_t item)
147{
148 int i;
149
150 for (i = 0; i < ci->ci_num_cached; i++) {
151 if (item == ci->ci_cache.ci_array[i])
152 return i;
153 }
154
155 return -1;
156}
157
158/* Returns the cache item if found, otherwise NULL.
159 * Requires ip_lock. */
160static struct ocfs2_meta_cache_item *
161ocfs2_search_cache_tree(struct ocfs2_caching_info *ci,
162 sector_t block)
163{
164 struct rb_node * n = ci->ci_cache.ci_tree.rb_node;
165 struct ocfs2_meta_cache_item *item = NULL;
166
167 while (n) {
168 item = rb_entry(n, struct ocfs2_meta_cache_item, c_node);
169
170 if (block < item->c_block)
171 n = n->rb_left;
172 else if (block > item->c_block)
173 n = n->rb_right;
174 else
175 return item;
176 }
177
178 return NULL;
179}
180
181static int ocfs2_buffer_cached(struct ocfs2_inode_info *oi,
182 struct buffer_head *bh)
183{
184 int index = -1;
185 struct ocfs2_meta_cache_item *item = NULL;
186
187 spin_lock(&oi->ip_lock);
188
189 mlog(0, "Inode %"MLFu64", query block %llu (inline = %u)\n",
190 oi->ip_blkno, (unsigned long long) bh->b_blocknr,
191 !!(oi->ip_flags & OCFS2_INODE_CACHE_INLINE));
192
193 if (oi->ip_flags & OCFS2_INODE_CACHE_INLINE)
194 index = ocfs2_search_cache_array(&oi->ip_metadata_cache,
195 bh->b_blocknr);
196 else
197 item = ocfs2_search_cache_tree(&oi->ip_metadata_cache,
198 bh->b_blocknr);
199
200 spin_unlock(&oi->ip_lock);
201
202 mlog(0, "index = %d, item = %p\n", index, item);
203
204 return (index != -1) || (item != NULL);
205}
206
/* Decide whether bh can be trusted without going to disk.
 *
 * Warning: a true return does *not* guarantee that the block is
 * stored in our inode metadata cache - a buffer that is in the
 * journal is accepted without consulting the cache. */
int ocfs2_buffer_uptodate(struct inode *inode,
			  struct buffer_head *bh)
{
	/* Not marked uptodate locally: it can't have correct data,
	 * whether or not we are tracking it. */
	if (!buffer_uptodate(bh))
		return 0;

	/* OCFS2 does not allow multiple nodes to be changing the same
	 * block at the same time, so a journalled buffer is good.
	 * Otherwise our cache decides whether the local uptodate bit
	 * can be trusted. */
	return buffer_jbd(bh) ? 1 : ocfs2_buffer_cached(OCFS2_I(inode), bh);
}
227
228/* Requires ip_lock */
229static void ocfs2_append_cache_array(struct ocfs2_caching_info *ci,
230 sector_t block)
231{
232 BUG_ON(ci->ci_num_cached >= OCFS2_INODE_MAX_CACHE_ARRAY);
233
234 mlog(0, "block %llu takes position %u\n", (unsigned long long) block,
235 ci->ci_num_cached);
236
237 ci->ci_cache.ci_array[ci->ci_num_cached] = block;
238 ci->ci_num_cached++;
239}
240
241/* By now the caller should have checked that the item does *not*
242 * exist in the tree.
243 * Requires ip_lock. */
244static void __ocfs2_insert_cache_tree(struct ocfs2_caching_info *ci,
245 struct ocfs2_meta_cache_item *new)
246{
247 sector_t block = new->c_block;
248 struct rb_node *parent = NULL;
249 struct rb_node **p = &ci->ci_cache.ci_tree.rb_node;
250 struct ocfs2_meta_cache_item *tmp;
251
252 mlog(0, "Insert block %llu num = %u\n", (unsigned long long) block,
253 ci->ci_num_cached);
254
255 while(*p) {
256 parent = *p;
257
258 tmp = rb_entry(parent, struct ocfs2_meta_cache_item, c_node);
259
260 if (block < tmp->c_block)
261 p = &(*p)->rb_left;
262 else if (block > tmp->c_block)
263 p = &(*p)->rb_right;
264 else {
265 /* This should never happen! */
266 mlog(ML_ERROR, "Duplicate block %llu cached!\n",
267 (unsigned long long) block);
268 BUG();
269 }
270 }
271
272 rb_link_node(&new->c_node, parent, p);
273 rb_insert_color(&new->c_node, &ci->ci_cache.ci_tree);
274 ci->ci_num_cached++;
275}
276
277static inline int ocfs2_insert_can_use_array(struct ocfs2_inode_info *oi,
278 struct ocfs2_caching_info *ci)
279{
280 assert_spin_locked(&oi->ip_lock);
281
282 return (oi->ip_flags & OCFS2_INODE_CACHE_INLINE) &&
283 (ci->ci_num_cached < OCFS2_INODE_MAX_CACHE_ARRAY);
284}
285
286/* tree should be exactly OCFS2_INODE_MAX_CACHE_ARRAY wide. NULL the
287 * pointers in tree after we use them - this allows caller to detect
288 * when to free in case of error. */
289static void ocfs2_expand_cache(struct ocfs2_inode_info *oi,
290 struct ocfs2_meta_cache_item **tree)
291{
292 int i;
293 struct ocfs2_caching_info *ci = &oi->ip_metadata_cache;
294
295 mlog_bug_on_msg(ci->ci_num_cached != OCFS2_INODE_MAX_CACHE_ARRAY,
296 "Inode %"MLFu64", num cached = %u, should be %u\n",
297 oi->ip_blkno, ci->ci_num_cached,
298 OCFS2_INODE_MAX_CACHE_ARRAY);
299 mlog_bug_on_msg(!(oi->ip_flags & OCFS2_INODE_CACHE_INLINE),
300 "Inode %"MLFu64" not marked as inline anymore!\n",
301 oi->ip_blkno);
302 assert_spin_locked(&oi->ip_lock);
303
304 /* Be careful to initialize the tree members *first* because
305 * once the ci_tree is used, the array is junk... */
306 for(i = 0; i < OCFS2_INODE_MAX_CACHE_ARRAY; i++)
307 tree[i]->c_block = ci->ci_cache.ci_array[i];
308
309 oi->ip_flags &= ~OCFS2_INODE_CACHE_INLINE;
310 ci->ci_cache.ci_tree = RB_ROOT;
311 /* this will be set again by __ocfs2_insert_cache_tree */
312 ci->ci_num_cached = 0;
313
314 for(i = 0; i < OCFS2_INODE_MAX_CACHE_ARRAY; i++) {
315 __ocfs2_insert_cache_tree(ci, tree[i]);
316 tree[i] = NULL;
317 }
318
319 mlog(0, "Expanded %"MLFu64" to a tree cache: flags 0x%x, num = %u\n",
320 oi->ip_blkno, oi->ip_flags, ci->ci_num_cached);
321}
322
323/* Slow path function - memory allocation is necessary. See the
324 * comment above ocfs2_set_buffer_uptodate for more information. */
325static void __ocfs2_set_buffer_uptodate(struct ocfs2_inode_info *oi,
326 sector_t block,
327 int expand_tree)
328{
329 int i;
330 struct ocfs2_caching_info *ci = &oi->ip_metadata_cache;
331 struct ocfs2_meta_cache_item *new = NULL;
332 struct ocfs2_meta_cache_item *tree[OCFS2_INODE_MAX_CACHE_ARRAY] =
333 { NULL, };
334
335 mlog(0, "Inode %"MLFu64", block %llu, expand = %d\n",
336 oi->ip_blkno, (unsigned long long) block, expand_tree);
337
338 new = kmem_cache_alloc(ocfs2_uptodate_cachep, GFP_KERNEL);
339 if (!new) {
340 mlog_errno(-ENOMEM);
341 return;
342 }
343 new->c_block = block;
344
345 if (expand_tree) {
346 /* Do *not* allocate an array here - the removal code
347 * has no way of tracking that. */
348 for(i = 0; i < OCFS2_INODE_MAX_CACHE_ARRAY; i++) {
349 tree[i] = kmem_cache_alloc(ocfs2_uptodate_cachep,
350 GFP_KERNEL);
351 if (!tree[i]) {
352 mlog_errno(-ENOMEM);
353 goto out_free;
354 }
355
356 /* These are initialized in ocfs2_expand_cache! */
357 }
358 }
359
360 spin_lock(&oi->ip_lock);
361 if (ocfs2_insert_can_use_array(oi, ci)) {
362 mlog(0, "Someone cleared the tree underneath us\n");
363 /* Ok, items were removed from the cache in between
364 * locks. Detect this and revert back to the fast path */
365 ocfs2_append_cache_array(ci, block);
366 spin_unlock(&oi->ip_lock);
367 goto out_free;
368 }
369
370 if (expand_tree)
371 ocfs2_expand_cache(oi, tree);
372
373 __ocfs2_insert_cache_tree(ci, new);
374 spin_unlock(&oi->ip_lock);
375
376 new = NULL;
377out_free:
378 if (new)
379 kmem_cache_free(ocfs2_uptodate_cachep, new);
380
381 /* If these were used, then ocfs2_expand_cache re-set them to
382 * NULL for us. */
383 if (tree[0]) {
384 for(i = 0; i < OCFS2_INODE_MAX_CACHE_ARRAY; i++)
385 if (tree[i])
386 kmem_cache_free(ocfs2_uptodate_cachep,
387 tree[i]);
388 }
389}
390
391/* Item insertion is guarded by ip_io_sem, so the insertion path takes
392 * advantage of this by not rechecking for a duplicate insert during
393 * the slow case. Additionally, if the cache needs to be bumped up to
394 * a tree, the code will not recheck after acquiring the lock --
395 * multiple paths cannot be expanding to a tree at the same time.
396 *
397 * The slow path takes into account that items can be removed
398 * (including the whole tree wiped and reset) when this process it out
399 * allocating memory. In those cases, it reverts back to the fast
400 * path.
401 *
402 * Note that this function may actually fail to insert the block if
403 * memory cannot be allocated. This is not fatal however (but may
404 * result in a performance penalty) */
405void ocfs2_set_buffer_uptodate(struct inode *inode,
406 struct buffer_head *bh)
407{
408 int expand;
409 struct ocfs2_inode_info *oi = OCFS2_I(inode);
410 struct ocfs2_caching_info *ci = &oi->ip_metadata_cache;
411
412 /* The block may very well exist in our cache already, so avoid
413 * doing any more work in that case. */
414 if (ocfs2_buffer_cached(oi, bh))
415 return;
416
417 mlog(0, "Inode %"MLFu64", inserting block %llu\n", oi->ip_blkno,
418 (unsigned long long) bh->b_blocknr);
419
420 /* No need to recheck under spinlock - insertion is guarded by
421 * ip_io_sem */
422 spin_lock(&oi->ip_lock);
423 if (ocfs2_insert_can_use_array(oi, ci)) {
424 /* Fast case - it's an array and there's a free
425 * spot. */
426 ocfs2_append_cache_array(ci, bh->b_blocknr);
427 spin_unlock(&oi->ip_lock);
428 return;
429 }
430
431 expand = 0;
432 if (oi->ip_flags & OCFS2_INODE_CACHE_INLINE) {
433 /* We need to bump things up to a tree. */
434 expand = 1;
435 }
436 spin_unlock(&oi->ip_lock);
437
438 __ocfs2_set_buffer_uptodate(oi, bh->b_blocknr, expand);
439}
440
441/* Called against a newly allocated buffer. Most likely nobody should
442 * be able to read this sort of metadata while it's still being
443 * allocated, but this is careful to take ip_io_sem anyway. */
444void ocfs2_set_new_buffer_uptodate(struct inode *inode,
445 struct buffer_head *bh)
446{
447 struct ocfs2_inode_info *oi = OCFS2_I(inode);
448
449 /* This should definitely *not* exist in our cache */
450 BUG_ON(ocfs2_buffer_cached(oi, bh));
451
452 set_buffer_uptodate(bh);
453
454 down(&oi->ip_io_sem);
455 ocfs2_set_buffer_uptodate(inode, bh);
456 up(&oi->ip_io_sem);
457}
458
459/* Requires ip_lock. */
460static void ocfs2_remove_metadata_array(struct ocfs2_caching_info *ci,
461 int index)
462{
463 sector_t *array = ci->ci_cache.ci_array;
464 int bytes;
465
466 BUG_ON(index < 0 || index >= OCFS2_INODE_MAX_CACHE_ARRAY);
467 BUG_ON(index >= ci->ci_num_cached);
468 BUG_ON(!ci->ci_num_cached);
469
470 mlog(0, "remove index %d (num_cached = %u\n", index,
471 ci->ci_num_cached);
472
473 ci->ci_num_cached--;
474
475 /* don't need to copy if the array is now empty, or if we
476 * removed at the tail */
477 if (ci->ci_num_cached && index < ci->ci_num_cached) {
478 bytes = sizeof(sector_t) * (ci->ci_num_cached - index);
479 memmove(&array[index], &array[index + 1], bytes);
480 }
481}
482
483/* Requires ip_lock. */
484static void ocfs2_remove_metadata_tree(struct ocfs2_caching_info *ci,
485 struct ocfs2_meta_cache_item *item)
486{
487 mlog(0, "remove block %llu from tree\n",
488 (unsigned long long) item->c_block);
489
490 rb_erase(&item->c_node, &ci->ci_cache.ci_tree);
491 ci->ci_num_cached--;
492}
493
494/* Called when we remove a chunk of metadata from an inode. We don't
495 * bother reverting things to an inlined array in the case of a remove
496 * which moves us back under the limit. */
497void ocfs2_remove_from_cache(struct inode *inode,
498 struct buffer_head *bh)
499{
500 int index;
501 sector_t block = bh->b_blocknr;
502 struct ocfs2_meta_cache_item *item = NULL;
503 struct ocfs2_inode_info *oi = OCFS2_I(inode);
504 struct ocfs2_caching_info *ci = &oi->ip_metadata_cache;
505
506 spin_lock(&oi->ip_lock);
507 mlog(0, "Inode %"MLFu64", remove %llu, items = %u, array = %u\n",
508 oi->ip_blkno, (unsigned long long) block, ci->ci_num_cached,
509 oi->ip_flags & OCFS2_INODE_CACHE_INLINE);
510
511 if (oi->ip_flags & OCFS2_INODE_CACHE_INLINE) {
512 index = ocfs2_search_cache_array(ci, block);
513 if (index != -1)
514 ocfs2_remove_metadata_array(ci, index);
515 } else {
516 item = ocfs2_search_cache_tree(ci, block);
517 if (item)
518 ocfs2_remove_metadata_tree(ci, item);
519 }
520 spin_unlock(&oi->ip_lock);
521
522 if (item)
523 kmem_cache_free(ocfs2_uptodate_cachep, item);
524}
525
526int __init init_ocfs2_uptodate_cache(void)
527{
528 ocfs2_uptodate_cachep = kmem_cache_create("ocfs2_uptodate",
529 sizeof(struct ocfs2_meta_cache_item),
530 0, SLAB_HWCACHE_ALIGN, NULL, NULL);
531 if (!ocfs2_uptodate_cachep)
532 return -ENOMEM;
533
534 mlog(0, "%u inlined cache items per inode.\n",
535 OCFS2_INODE_MAX_CACHE_ARRAY);
536
537 return 0;
538}
539
540void __exit exit_ocfs2_uptodate_cache(void)
541{
542 if (ocfs2_uptodate_cachep)
543 kmem_cache_destroy(ocfs2_uptodate_cachep);
544}
diff --git a/fs/ocfs2/uptodate.h b/fs/ocfs2/uptodate.h
new file mode 100644
index 000000000000..e5aacdf4eabf
--- /dev/null
+++ b/fs/ocfs2/uptodate.h
@@ -0,0 +1,44 @@
1/* -*- mode: c; c-basic-offset: 8; -*-
2 * vim: noexpandtab sw=8 ts=8 sts=0:
3 *
4 * uptodate.h
5 *
6 * Cluster uptodate tracking
7 *
8 * Copyright (C) 2002, 2004, 2005 Oracle. All rights reserved.
9 *
10 * This program is free software; you can redistribute it and/or
11 * modify it under the terms of the GNU General Public
12 * License as published by the Free Software Foundation; either
13 * version 2 of the License, or (at your option) any later version.
14 *
15 * This program is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
18 * General Public License for more details.
19 *
20 * You should have received a copy of the GNU General Public
21 * License along with this program; if not, write to the
22 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
23 * Boston, MA 021110-1307, USA.
24 */
25
26#ifndef OCFS2_UPTODATE_H
27#define OCFS2_UPTODATE_H
28
29int __init init_ocfs2_uptodate_cache(void);
30void __exit exit_ocfs2_uptodate_cache(void);
31
32void ocfs2_metadata_cache_init(struct inode *inode);
33void ocfs2_metadata_cache_purge(struct inode *inode);
34
35int ocfs2_buffer_uptodate(struct inode *inode,
36 struct buffer_head *bh);
37void ocfs2_set_buffer_uptodate(struct inode *inode,
38 struct buffer_head *bh);
39void ocfs2_set_new_buffer_uptodate(struct inode *inode,
40 struct buffer_head *bh);
41void ocfs2_remove_from_cache(struct inode *inode,
42 struct buffer_head *bh);
43
44#endif /* OCFS2_UPTODATE_H */
diff --git a/fs/ocfs2/ver.c b/fs/ocfs2/ver.c
new file mode 100644
index 000000000000..5405ce121c99
--- /dev/null
+++ b/fs/ocfs2/ver.c
@@ -0,0 +1,43 @@
1/* -*- mode: c; c-basic-offset: 8; -*-
2 * vim: noexpandtab sw=8 ts=8 sts=0:
3 *
4 * ver.c
5 *
6 * version string
7 *
8 * Copyright (C) 2002, 2005 Oracle. All rights reserved.
9 *
10 * This program is free software; you can redistribute it and/or
11 * modify it under the terms of the GNU General Public
12 * License as published by the Free Software Foundation; either
13 * version 2 of the License, or (at your option) any later version.
14 *
15 * This program is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
18 * General Public License for more details.
19 *
20 * You should have received a copy of the GNU General Public
21 * License along with this program; if not, write to the
22 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
23 * Boston, MA 02111-1307, USA.
24 */
25
26#include <linux/module.h>
27#include <linux/string.h>
28#include <linux/kernel.h>
29
30#include "ver.h"
31
32#define OCFS2_BUILD_VERSION "1.3.3"
33
34#define VERSION_STR "OCFS2 " OCFS2_BUILD_VERSION
35
36void ocfs2_print_version(void)
37{
38 printk(KERN_INFO "%s\n", VERSION_STR);
39}
40
41MODULE_DESCRIPTION(VERSION_STR);
42
43MODULE_VERSION(OCFS2_BUILD_VERSION);
diff --git a/fs/ocfs2/ver.h b/fs/ocfs2/ver.h
new file mode 100644
index 000000000000..d7395cb91d2f
--- /dev/null
+++ b/fs/ocfs2/ver.h
@@ -0,0 +1,31 @@
1/* -*- mode: c; c-basic-offset: 8; -*-
2 * vim: noexpandtab sw=8 ts=8 sts=0:
3 *
4 * ver.h
5 *
6 * Function prototypes
7 *
8 * Copyright (C) 2002, 2004 Oracle. All rights reserved.
9 *
10 * This program is free software; you can redistribute it and/or
11 * modify it under the terms of the GNU General Public
12 * License as published by the Free Software Foundation; either
13 * version 2 of the License, or (at your option) any later version.
14 *
15 * This program is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
18 * General Public License for more details.
19 *
20 * You should have received a copy of the GNU General Public
21 * License along with this program; if not, write to the
22 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
23 * Boston, MA 02111-1307, USA.
24 */
25
#ifndef OCFS2_VER_H
#define OCFS2_VER_H

/* Print the ocfs2 version banner to the kernel log (see ver.c). */
void ocfs2_print_version(void);

#endif /* OCFS2_VER_H */
diff --git a/fs/ocfs2/vote.c b/fs/ocfs2/vote.c
new file mode 100644
index 000000000000..021978e0576b
--- /dev/null
+++ b/fs/ocfs2/vote.c
@@ -0,0 +1,1202 @@
1/* -*- mode: c; c-basic-offset: 8; -*-
2 * vim: noexpandtab sw=8 ts=8 sts=0:
3 *
4 * vote.c
5 *
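6 * Inter-node vote messaging: sends delete, unlink, rename, mount and umount vote requests to the other mounted nodes and processes the votes received from them.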
7 *
8 * Copyright (C) 2003, 2004 Oracle. All rights reserved.
9 *
10 * This program is free software; you can redistribute it and/or
11 * modify it under the terms of the GNU General Public
12 * License as published by the Free Software Foundation; either
13 * version 2 of the License, or (at your option) any later version.
14 *
15 * This program is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
18 * General Public License for more details.
19 *
20 * You should have received a copy of the GNU General Public
21 * License along with this program; if not, write to the
22 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
23 * Boston, MA 02111-1307, USA.
24 */
25
26#include <linux/types.h>
27#include <linux/slab.h>
28#include <linux/highmem.h>
29#include <linux/smp_lock.h>
30#include <linux/kthread.h>
31
32#include <cluster/heartbeat.h>
33#include <cluster/nodemanager.h>
34#include <cluster/tcp.h>
35
36#include <dlm/dlmapi.h>
37
38#define MLOG_MASK_PREFIX ML_VOTE
39#include <cluster/masklog.h>
40
41#include "ocfs2.h"
42
43#include "alloc.h"
44#include "dlmglue.h"
45#include "extent_map.h"
46#include "heartbeat.h"
47#include "inode.h"
48#include "journal.h"
49#include "slot_map.h"
50#include "vote.h"
51
52#include "buffer_head_io.h"
53
54#define OCFS2_MESSAGE_TYPE_VOTE (0x1)
55#define OCFS2_MESSAGE_TYPE_RESPONSE (0x2)
56struct ocfs2_msg_hdr
57{
58 __be32 h_response_id; /* used to lookup message handle on sending
59 * node. */
60 __be32 h_request;
61 __be64 h_blkno;
62 __be32 h_generation;
63 __be32 h_node_num; /* node sending this particular message. */
64};
65
66/* OCFS2_MAX_FILENAME_LEN is 255 characters, but we want to align this
67 * for the network. */
68#define OCFS2_VOTE_FILENAME_LEN 256
69struct ocfs2_vote_msg
70{
71 struct ocfs2_msg_hdr v_hdr;
72 union {
73 __be32 v_generic1;
74 __be32 v_orphaned_slot; /* Used during delete votes */
75 __be32 v_nlink; /* Used during unlink votes */
76 } md1; /* Message type dependant 1 */
77 __be32 v_unlink_namelen;
78 __be64 v_unlink_parent;
79 u8 v_unlink_dirent[OCFS2_VOTE_FILENAME_LEN];
80};
81
82/* Responses are given these values to maintain backwards
83 * compatibility with older ocfs2 versions */
84#define OCFS2_RESPONSE_OK (0)
85#define OCFS2_RESPONSE_BUSY (-16)
86#define OCFS2_RESPONSE_BAD_MSG (-22)
87
88struct ocfs2_response_msg
89{
90 struct ocfs2_msg_hdr r_hdr;
91 __be32 r_response;
92 __be32 r_orphaned_slot;
93};
94
95struct ocfs2_vote_work {
96 struct list_head w_list;
97 struct ocfs2_vote_msg w_msg;
98};
99
/*
 * Vote request types carried in h_request.  INVALID and LAST are
 * sentinels that bracket the usable values; they are never valid
 * requests themselves.
 */
enum ocfs2_vote_request {
	OCFS2_VOTE_REQ_INVALID = 0,
	OCFS2_VOTE_REQ_DELETE,
	OCFS2_VOTE_REQ_UNLINK,
	OCFS2_VOTE_REQ_RENAME,
	OCFS2_VOTE_REQ_MOUNT,
	OCFS2_VOTE_REQ_UMOUNT,
	OCFS2_VOTE_REQ_LAST
};

/*
 * Return non-zero when @request lies strictly between the INVALID and
 * LAST sentinels, zero otherwise.
 */
static inline int ocfs2_is_valid_vote_request(int request)
{
	if (request <= OCFS2_VOTE_REQ_INVALID)
		return 0;

	return request < OCFS2_VOTE_REQ_LAST;
}
115
/* Signature of a per-response hook: @priv is the rc_priv value stored
 * next to the function pointer, @resp the response message. */
typedef void (*ocfs2_net_response_callback)(void *priv,
					    struct ocfs2_response_msg *resp);

/* A response hook together with the private pointer handed to it. */
struct ocfs2_net_response_cb {
	ocfs2_net_response_callback	rc_cb;
	void				*rc_priv;
};
122
123struct ocfs2_net_wait_ctxt {
124 struct list_head n_list;
125 u32 n_response_id;
126 wait_queue_head_t n_event;
127 struct ocfs2_node_map n_node_map;
128 int n_response; /* an agreggate response. 0 if
129 * all nodes are go, < 0 on any
130 * negative response from any
131 * node or network error. */
132 struct ocfs2_net_response_cb *n_callback;
133};
134
135static void ocfs2_process_mount_request(struct ocfs2_super *osb,
136 unsigned int node_num)
137{
138 mlog(0, "MOUNT vote from node %u\n", node_num);
139 /* The other node only sends us this message when he has an EX
140 * on the superblock, so our recovery threads (if having been
141 * launched) are waiting on it.*/
142 ocfs2_recovery_map_clear(osb, node_num);
143 ocfs2_node_map_set_bit(osb, &osb->mounted_map, node_num);
144
145 /* We clear the umount map here because a node may have been
146 * previously mounted, safely unmounted but never stopped
147 * heartbeating - in which case we'd have a stale entry. */
148 ocfs2_node_map_clear_bit(osb, &osb->umount_map, node_num);
149}
150
151static void ocfs2_process_umount_request(struct ocfs2_super *osb,
152 unsigned int node_num)
153{
154 mlog(0, "UMOUNT vote from node %u\n", node_num);
155 ocfs2_node_map_clear_bit(osb, &osb->mounted_map, node_num);
156 ocfs2_node_map_set_bit(osb, &osb->umount_map, node_num);
157}
158
159void ocfs2_mark_inode_remotely_deleted(struct inode *inode)
160{
161 struct ocfs2_inode_info *oi = OCFS2_I(inode);
162
163 assert_spin_locked(&oi->ip_lock);
164 /* We set the SKIP_DELETE flag on the inode so we don't try to
165 * delete it in delete_inode ourselves, thus avoiding
166 * unecessary lock pinging. If the other node failed to wipe
167 * the inode as a result of a crash, then recovery will pick
168 * up the slack. */
169 oi->ip_flags |= OCFS2_INODE_DELETED|OCFS2_INODE_SKIP_DELETE;
170}
171
172static int ocfs2_process_delete_request(struct inode *inode,
173 int *orphaned_slot)
174{
175 int response = OCFS2_RESPONSE_BUSY;
176
177 mlog(0, "DELETE vote on inode %lu, read lnk_cnt = %u, slot = %d\n",
178 inode->i_ino, inode->i_nlink, *orphaned_slot);
179
180 spin_lock(&OCFS2_I(inode)->ip_lock);
181
182 /* Whatever our vote response is, we want to make sure that
183 * the orphaned slot is recorded properly on this node *and*
184 * on the requesting node. Technically, if the requesting node
185 * did not know which slot the inode is orphaned in but we
186 * respond with BUSY he doesn't actually need the orphaned
187 * slot, but it doesn't hurt to do it here anyway. */
188 if ((*orphaned_slot) != OCFS2_INVALID_SLOT) {
189 mlog_bug_on_msg(OCFS2_I(inode)->ip_orphaned_slot !=
190 OCFS2_INVALID_SLOT &&
191 OCFS2_I(inode)->ip_orphaned_slot !=
192 (*orphaned_slot),
193 "Inode %"MLFu64": This node thinks it's "
194 "orphaned in slot %d, messaged it's in %d\n",
195 OCFS2_I(inode)->ip_blkno,
196 OCFS2_I(inode)->ip_orphaned_slot,
197 *orphaned_slot);
198
199 mlog(0, "Setting orphaned slot for inode %"MLFu64" to %d\n",
200 OCFS2_I(inode)->ip_blkno, *orphaned_slot);
201
202 OCFS2_I(inode)->ip_orphaned_slot = *orphaned_slot;
203 } else {
204 mlog(0, "Sending back orphaned slot %d for inode %"MLFu64"\n",
205 OCFS2_I(inode)->ip_orphaned_slot,
206 OCFS2_I(inode)->ip_blkno);
207
208 *orphaned_slot = OCFS2_I(inode)->ip_orphaned_slot;
209 }
210
211 /* vote no if the file is still open. */
212 if (OCFS2_I(inode)->ip_open_count) {
213 mlog(0, "open count = %u\n",
214 OCFS2_I(inode)->ip_open_count);
215 spin_unlock(&OCFS2_I(inode)->ip_lock);
216 goto done;
217 }
218 spin_unlock(&OCFS2_I(inode)->ip_lock);
219
220 /* directories are a bit ugly... What if someone is sitting in
221 * it? We want to make sure the inode is removed completely as
222 * a result of the iput in process_vote. */
223 if (S_ISDIR(inode->i_mode) && (atomic_read(&inode->i_count) != 1)) {
224 mlog(0, "i_count = %u\n", atomic_read(&inode->i_count));
225 goto done;
226 }
227
228 if (filemap_fdatawrite(inode->i_mapping)) {
229 mlog(ML_ERROR, "Could not sync inode %"MLFu64" for delete!\n",
230 OCFS2_I(inode)->ip_blkno);
231 goto done;
232 }
233 sync_mapping_buffers(inode->i_mapping);
234 truncate_inode_pages(inode->i_mapping, 0);
235 ocfs2_extent_map_trunc(inode, 0);
236
237 spin_lock(&OCFS2_I(inode)->ip_lock);
238 /* double check open count - someone might have raced this
239 * thread into ocfs2_file_open while we were writing out
240 * data. If we're to allow a wipe of this inode now, we *must*
241 * hold the spinlock until we've marked it. */
242 if (OCFS2_I(inode)->ip_open_count) {
243 mlog(0, "Raced to wipe! open count = %u\n",
244 OCFS2_I(inode)->ip_open_count);
245 spin_unlock(&OCFS2_I(inode)->ip_lock);
246 goto done;
247 }
248
249 /* Mark the inode as being wiped from disk. */
250 ocfs2_mark_inode_remotely_deleted(inode);
251 spin_unlock(&OCFS2_I(inode)->ip_lock);
252
253 /* Not sure this is necessary anymore. */
254 d_prune_aliases(inode);
255
256 /* If we get here, then we're voting 'yes', so commit the
257 * delete on our side. */
258 response = OCFS2_RESPONSE_OK;
259done:
260 return response;
261}
262
263static int ocfs2_match_dentry(struct dentry *dentry,
264 u64 parent_blkno,
265 unsigned int namelen,
266 const char *name)
267{
268 struct inode *parent;
269
270 if (!dentry->d_parent) {
271 mlog(0, "Detached from parent.\n");
272 return 0;
273 }
274
275 parent = dentry->d_parent->d_inode;
276 /* Negative parent dentry? */
277 if (!parent)
278 return 0;
279
280 /* Name is in a different directory. */
281 if (OCFS2_I(parent)->ip_blkno != parent_blkno)
282 return 0;
283
284 if (dentry->d_name.len != namelen)
285 return 0;
286
287 /* comparison above guarantees this is safe. */
288 if (memcmp(dentry->d_name.name, name, namelen))
289 return 0;
290
291 return 1;
292}
293
294static void ocfs2_process_dentry_request(struct inode *inode,
295 int rename,
296 unsigned int new_nlink,
297 u64 parent_blkno,
298 unsigned int namelen,
299 const char *name)
300{
301 struct dentry *dentry = NULL;
302 struct list_head *p;
303 struct ocfs2_inode_info *oi = OCFS2_I(inode);
304
305 mlog(0, "parent %"MLFu64", namelen = %u, name = %.*s\n", parent_blkno,
306 namelen, namelen, name);
307
308 spin_lock(&dcache_lock);
309
310 /* Another node is removing this name from the system. It is
311 * up to us to find the corresponding dentry and if it exists,
312 * unhash it from the dcache. */
313 list_for_each(p, &inode->i_dentry) {
314 dentry = list_entry(p, struct dentry, d_alias);
315
316 if (ocfs2_match_dentry(dentry, parent_blkno, namelen, name)) {
317 mlog(0, "dentry found: %.*s\n",
318 dentry->d_name.len, dentry->d_name.name);
319
320 dget_locked(dentry);
321 break;
322 }
323
324 dentry = NULL;
325 }
326
327 spin_unlock(&dcache_lock);
328
329 if (dentry) {
330 d_delete(dentry);
331 dput(dentry);
332 }
333
334 /* rename votes don't send link counts */
335 if (!rename) {
336 mlog(0, "new_nlink = %u\n", new_nlink);
337
338 /* We don't have the proper locks here to directly
339 * change i_nlink and besides, the vote is sent
340 * *before* the operation so it may have failed on the
341 * other node. This passes a hint to ocfs2_drop_inode
342 * to force ocfs2_delete_inode, who will take the
343 * proper cluster locks to sort things out. */
344 if (new_nlink == 0) {
345 spin_lock(&oi->ip_lock);
346 oi->ip_flags |= OCFS2_INODE_MAYBE_ORPHANED;
347 spin_unlock(&OCFS2_I(inode)->ip_lock);
348 }
349 }
350}
351
352static void ocfs2_process_vote(struct ocfs2_super *osb,
353 struct ocfs2_vote_msg *msg)
354{
355 int net_status, vote_response;
356 int orphaned_slot = 0;
357 int rename = 0;
358 unsigned int node_num, generation, new_nlink, namelen;
359 u64 blkno, parent_blkno;
360 enum ocfs2_vote_request request;
361 struct inode *inode = NULL;
362 struct ocfs2_msg_hdr *hdr = &msg->v_hdr;
363 struct ocfs2_response_msg response;
364
365 /* decode the network mumbo jumbo into local variables. */
366 request = be32_to_cpu(hdr->h_request);
367 blkno = be64_to_cpu(hdr->h_blkno);
368 generation = be32_to_cpu(hdr->h_generation);
369 node_num = be32_to_cpu(hdr->h_node_num);
370 if (request == OCFS2_VOTE_REQ_DELETE)
371 orphaned_slot = be32_to_cpu(msg->md1.v_orphaned_slot);
372
373 mlog(0, "processing vote: request = %u, blkno = %"MLFu64", "
374 "generation = %u, node_num = %u, priv1 = %u\n", request,
375 blkno, generation, node_num, be32_to_cpu(msg->md1.v_generic1));
376
377 if (!ocfs2_is_valid_vote_request(request)) {
378 mlog(ML_ERROR, "Invalid vote request %d from node %u\n",
379 request, node_num);
380 vote_response = OCFS2_RESPONSE_BAD_MSG;
381 goto respond;
382 }
383
384 vote_response = OCFS2_RESPONSE_OK;
385
386 switch (request) {
387 case OCFS2_VOTE_REQ_UMOUNT:
388 ocfs2_process_umount_request(osb, node_num);
389 goto respond;
390 case OCFS2_VOTE_REQ_MOUNT:
391 ocfs2_process_mount_request(osb, node_num);
392 goto respond;
393 default:
394 /* avoids a gcc warning */
395 break;
396 }
397
398 /* We cannot process the remaining message types before we're
399 * fully mounted. It's perfectly safe however to send a 'yes'
400 * response as we can't possibly have any of the state they're
401 * asking us to modify yet. */
402 if (atomic_read(&osb->vol_state) == VOLUME_INIT)
403 goto respond;
404
405 /* If we get here, then the request is against an inode. */
406 inode = ocfs2_ilookup_for_vote(osb, blkno,
407 request == OCFS2_VOTE_REQ_DELETE);
408
409 /* Not finding the inode is perfectly valid - it means we're
410 * not interested in what the other node is about to do to it
411 * so in those cases we automatically respond with an
412 * affirmative. Cluster locking ensures that we won't race
413 * interest in the inode with this vote request. */
414 if (!inode)
415 goto respond;
416
417 /* Check generation values. It's possible for us to get a
418 * request against a stale inode. If so then we proceed as if
419 * we had not found an inode in the first place. */
420 if (inode->i_generation != generation) {
421 mlog(0, "generation passed %u != inode generation = %u, "
422 "ip_flags = %x, ip_blkno = %"MLFu64", msg %"MLFu64", "
423 "i_count = %u, message type = %u\n",
424 generation, inode->i_generation, OCFS2_I(inode)->ip_flags,
425 OCFS2_I(inode)->ip_blkno, blkno,
426 atomic_read(&inode->i_count), request);
427 iput(inode);
428 inode = NULL;
429 goto respond;
430 }
431
432 switch (request) {
433 case OCFS2_VOTE_REQ_DELETE:
434 vote_response = ocfs2_process_delete_request(inode,
435 &orphaned_slot);
436 break;
437 case OCFS2_VOTE_REQ_RENAME:
438 rename = 1;
439 /* fall through */
440 case OCFS2_VOTE_REQ_UNLINK:
441 parent_blkno = be64_to_cpu(msg->v_unlink_parent);
442 namelen = be32_to_cpu(msg->v_unlink_namelen);
443 /* new_nlink will be ignored in case of a rename vote */
444 new_nlink = be32_to_cpu(msg->md1.v_nlink);
445 ocfs2_process_dentry_request(inode, rename, new_nlink,
446 parent_blkno, namelen,
447 msg->v_unlink_dirent);
448 break;
449 default:
450 mlog(ML_ERROR, "node %u, invalid request: %u\n",
451 node_num, request);
452 vote_response = OCFS2_RESPONSE_BAD_MSG;
453 }
454
455respond:
456 /* Response struture is small so we just put it on the stack
457 * and stuff it inline. */
458 memset(&response, 0, sizeof(struct ocfs2_response_msg));
459 response.r_hdr.h_response_id = hdr->h_response_id;
460 response.r_hdr.h_blkno = hdr->h_blkno;
461 response.r_hdr.h_generation = hdr->h_generation;
462 response.r_hdr.h_node_num = cpu_to_be32(osb->node_num);
463 response.r_response = cpu_to_be32(vote_response);
464 response.r_orphaned_slot = cpu_to_be32(orphaned_slot);
465
466 net_status = o2net_send_message(OCFS2_MESSAGE_TYPE_RESPONSE,
467 osb->net_key,
468 &response,
469 sizeof(struct ocfs2_response_msg),
470 node_num,
471 NULL);
472 /* We still want to error print for ENOPROTOOPT here. The
473 * sending node shouldn't have unregistered his net handler
474 * without sending an unmount vote 1st */
475 if (net_status < 0
476 && net_status != -ETIMEDOUT
477 && net_status != -ENOTCONN)
478 mlog(ML_ERROR, "message to node %u fails with error %d!\n",
479 node_num, net_status);
480
481 if (inode)
482 iput(inode);
483}
484
485static void ocfs2_vote_thread_do_work(struct ocfs2_super *osb)
486{
487 unsigned long processed;
488 struct ocfs2_lock_res *lockres;
489 struct ocfs2_vote_work *work;
490
491 mlog_entry_void();
492
493 spin_lock(&osb->vote_task_lock);
494 /* grab this early so we know to try again if a state change and
495 * wake happens part-way through our work */
496 osb->vote_work_sequence = osb->vote_wake_sequence;
497
498 processed = osb->blocked_lock_count;
499 while (processed) {
500 BUG_ON(list_empty(&osb->blocked_lock_list));
501
502 lockres = list_entry(osb->blocked_lock_list.next,
503 struct ocfs2_lock_res, l_blocked_list);
504 list_del_init(&lockres->l_blocked_list);
505 osb->blocked_lock_count--;
506 spin_unlock(&osb->vote_task_lock);
507
508 BUG_ON(!processed);
509 processed--;
510
511 ocfs2_process_blocked_lock(osb, lockres);
512
513 spin_lock(&osb->vote_task_lock);
514 }
515
516 while (osb->vote_count) {
517 BUG_ON(list_empty(&osb->vote_list));
518 work = list_entry(osb->vote_list.next,
519 struct ocfs2_vote_work, w_list);
520 list_del(&work->w_list);
521 osb->vote_count--;
522 spin_unlock(&osb->vote_task_lock);
523
524 ocfs2_process_vote(osb, &work->w_msg);
525 kfree(work);
526
527 spin_lock(&osb->vote_task_lock);
528 }
529 spin_unlock(&osb->vote_task_lock);
530
531 mlog_exit_void();
532}
533
534static int ocfs2_vote_thread_lists_empty(struct ocfs2_super *osb)
535{
536 int empty = 0;
537
538 spin_lock(&osb->vote_task_lock);
539 if (list_empty(&osb->blocked_lock_list) &&
540 list_empty(&osb->vote_list))
541 empty = 1;
542
543 spin_unlock(&osb->vote_task_lock);
544 return empty;
545}
546
547static int ocfs2_vote_thread_should_wake(struct ocfs2_super *osb)
548{
549 int should_wake = 0;
550
551 spin_lock(&osb->vote_task_lock);
552 if (osb->vote_work_sequence != osb->vote_wake_sequence)
553 should_wake = 1;
554 spin_unlock(&osb->vote_task_lock);
555
556 return should_wake;
557}
558
559int ocfs2_vote_thread(void *arg)
560{
561 int status = 0;
562 struct ocfs2_super *osb = arg;
563
564 /* only quit once we've been asked to stop and there is no more
565 * work available */
566 while (!(kthread_should_stop() &&
567 ocfs2_vote_thread_lists_empty(osb))) {
568
569 wait_event_interruptible(osb->vote_event,
570 ocfs2_vote_thread_should_wake(osb) ||
571 kthread_should_stop());
572
573 mlog(0, "vote_thread: awoken\n");
574
575 ocfs2_vote_thread_do_work(osb);
576 }
577
578 osb->vote_task = NULL;
579 return status;
580}
581
582static struct ocfs2_net_wait_ctxt *ocfs2_new_net_wait_ctxt(unsigned int response_id)
583{
584 struct ocfs2_net_wait_ctxt *w;
585
586 w = kcalloc(1, sizeof(*w), GFP_KERNEL);
587 if (!w) {
588 mlog_errno(-ENOMEM);
589 goto bail;
590 }
591
592 INIT_LIST_HEAD(&w->n_list);
593 init_waitqueue_head(&w->n_event);
594 ocfs2_node_map_init(&w->n_node_map);
595 w->n_response_id = response_id;
596 w->n_callback = NULL;
597bail:
598 return w;
599}
600
601static unsigned int ocfs2_new_response_id(struct ocfs2_super *osb)
602{
603 unsigned int ret;
604
605 spin_lock(&osb->net_response_lock);
606 ret = ++osb->net_response_ids;
607 spin_unlock(&osb->net_response_lock);
608
609 return ret;
610}
611
612static void ocfs2_dequeue_net_wait_ctxt(struct ocfs2_super *osb,
613 struct ocfs2_net_wait_ctxt *w)
614{
615 spin_lock(&osb->net_response_lock);
616 list_del(&w->n_list);
617 spin_unlock(&osb->net_response_lock);
618}
619
620static void ocfs2_queue_net_wait_ctxt(struct ocfs2_super *osb,
621 struct ocfs2_net_wait_ctxt *w)
622{
623 spin_lock(&osb->net_response_lock);
624 list_add_tail(&w->n_list,
625 &osb->net_response_list);
626 spin_unlock(&osb->net_response_lock);
627}
628
629static void __ocfs2_mark_node_responded(struct ocfs2_super *osb,
630 struct ocfs2_net_wait_ctxt *w,
631 int node_num)
632{
633 assert_spin_locked(&osb->net_response_lock);
634
635 ocfs2_node_map_clear_bit(osb, &w->n_node_map, node_num);
636 if (ocfs2_node_map_is_empty(osb, &w->n_node_map))
637 wake_up(&w->n_event);
638}
639
640/* Intended to be called from the node down callback, we fake remove
641 * the node from all our response contexts */
642void ocfs2_remove_node_from_vote_queues(struct ocfs2_super *osb,
643 int node_num)
644{
645 struct list_head *p;
646 struct ocfs2_net_wait_ctxt *w = NULL;
647
648 spin_lock(&osb->net_response_lock);
649
650 list_for_each(p, &osb->net_response_list) {
651 w = list_entry(p, struct ocfs2_net_wait_ctxt, n_list);
652
653 __ocfs2_mark_node_responded(osb, w, node_num);
654 }
655
656 spin_unlock(&osb->net_response_lock);
657}
658
659static int ocfs2_broadcast_vote(struct ocfs2_super *osb,
660 struct ocfs2_vote_msg *request,
661 unsigned int response_id,
662 int *response,
663 struct ocfs2_net_response_cb *callback)
664{
665 int status, i, remote_err;
666 struct ocfs2_net_wait_ctxt *w = NULL;
667 int dequeued = 0;
668
669 mlog_entry_void();
670
671 w = ocfs2_new_net_wait_ctxt(response_id);
672 if (!w) {
673 status = -ENOMEM;
674 mlog_errno(status);
675 goto bail;
676 }
677 w->n_callback = callback;
678
679 /* we're pretty much ready to go at this point, and this fills
680 * in n_response which we need anyway... */
681 ocfs2_queue_net_wait_ctxt(osb, w);
682
683 i = ocfs2_node_map_iterate(osb, &osb->mounted_map, 0);
684
685 while (i != O2NM_INVALID_NODE_NUM) {
686 if (i != osb->node_num) {
687 mlog(0, "trying to send request to node %i\n", i);
688 ocfs2_node_map_set_bit(osb, &w->n_node_map, i);
689
690 remote_err = 0;
691 status = o2net_send_message(OCFS2_MESSAGE_TYPE_VOTE,
692 osb->net_key,
693 request,
694 sizeof(*request),
695 i,
696 &remote_err);
697 if (status == -ETIMEDOUT) {
698 mlog(0, "remote node %d timed out!\n", i);
699 status = -EAGAIN;
700 goto bail;
701 }
702 if (remote_err < 0) {
703 status = remote_err;
704 mlog(0, "remote error %d on node %d!\n",
705 remote_err, i);
706 mlog_errno(status);
707 goto bail;
708 }
709 if (status < 0) {
710 mlog_errno(status);
711 goto bail;
712 }
713 }
714 i++;
715 i = ocfs2_node_map_iterate(osb, &osb->mounted_map, i);
716 mlog(0, "next is %d, i am %d\n", i, osb->node_num);
717 }
718 mlog(0, "done sending, now waiting on responses...\n");
719
720 wait_event(w->n_event, ocfs2_node_map_is_empty(osb, &w->n_node_map));
721
722 ocfs2_dequeue_net_wait_ctxt(osb, w);
723 dequeued = 1;
724
725 *response = w->n_response;
726 status = 0;
727bail:
728 if (w) {
729 if (!dequeued)
730 ocfs2_dequeue_net_wait_ctxt(osb, w);
731 kfree(w);
732 }
733
734 mlog_exit(status);
735 return status;
736}
737
738static struct ocfs2_vote_msg * ocfs2_new_vote_request(struct ocfs2_super *osb,
739 u64 blkno,
740 unsigned int generation,
741 enum ocfs2_vote_request type,
742 u32 priv)
743{
744 struct ocfs2_vote_msg *request;
745 struct ocfs2_msg_hdr *hdr;
746
747 BUG_ON(!ocfs2_is_valid_vote_request(type));
748
749 request = kcalloc(1, sizeof(*request), GFP_KERNEL);
750 if (!request) {
751 mlog_errno(-ENOMEM);
752 } else {
753 hdr = &request->v_hdr;
754 hdr->h_node_num = cpu_to_be32(osb->node_num);
755 hdr->h_request = cpu_to_be32(type);
756 hdr->h_blkno = cpu_to_be64(blkno);
757 hdr->h_generation = cpu_to_be32(generation);
758
759 request->md1.v_generic1 = cpu_to_be32(priv);
760 }
761
762 return request;
763}
764
765/* Complete the buildup of a new vote request and process the
766 * broadcast return value. */
767static int ocfs2_do_request_vote(struct ocfs2_super *osb,
768 struct ocfs2_vote_msg *request,
769 struct ocfs2_net_response_cb *callback)
770{
771 int status, response;
772 unsigned int response_id;
773 struct ocfs2_msg_hdr *hdr;
774
775 response_id = ocfs2_new_response_id(osb);
776
777 hdr = &request->v_hdr;
778 hdr->h_response_id = cpu_to_be32(response_id);
779
780 status = ocfs2_broadcast_vote(osb, request, response_id, &response,
781 callback);
782 if (status < 0) {
783 mlog_errno(status);
784 goto bail;
785 }
786
787 status = response;
788bail:
789
790 return status;
791}
792
793static int ocfs2_request_vote(struct inode *inode,
794 struct ocfs2_vote_msg *request,
795 struct ocfs2_net_response_cb *callback)
796{
797 int status;
798 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
799
800 if (ocfs2_inode_is_new(inode))
801 return 0;
802
803 status = -EAGAIN;
804 while (status == -EAGAIN) {
805 if (!(osb->s_mount_opt & OCFS2_MOUNT_NOINTR) &&
806 signal_pending(current))
807 return -ERESTARTSYS;
808
809 status = ocfs2_super_lock(osb, 0);
810 if (status < 0) {
811 mlog_errno(status);
812 break;
813 }
814
815 status = 0;
816 if (!ocfs2_node_map_is_only(osb, &osb->mounted_map,
817 osb->node_num))
818 status = ocfs2_do_request_vote(osb, request, callback);
819
820 ocfs2_super_unlock(osb, 0);
821 }
822 return status;
823}
824
825static void ocfs2_delete_response_cb(void *priv,
826 struct ocfs2_response_msg *resp)
827{
828 int orphaned_slot, node;
829 struct inode *inode = priv;
830
831 orphaned_slot = be32_to_cpu(resp->r_orphaned_slot);
832 node = be32_to_cpu(resp->r_hdr.h_node_num);
833 mlog(0, "node %d tells us that inode %"MLFu64" is orphaned in slot "
834 "%d\n", node, OCFS2_I(inode)->ip_blkno, orphaned_slot);
835
836 /* The other node may not actually know which slot the inode
837 * is orphaned in. */
838 if (orphaned_slot == OCFS2_INVALID_SLOT)
839 return;
840
841 /* Ok, the responding node knows which slot this inode is
842 * orphaned in. We verify that the information is correct and
843 * then record this in the inode. ocfs2_delete_inode will use
844 * this information to determine which lock to take. */
845 spin_lock(&OCFS2_I(inode)->ip_lock);
846 mlog_bug_on_msg(OCFS2_I(inode)->ip_orphaned_slot != orphaned_slot &&
847 OCFS2_I(inode)->ip_orphaned_slot
848 != OCFS2_INVALID_SLOT, "Inode %"MLFu64": Node %d "
849 "says it's orphaned in slot %d, we think it's in %d\n",
850 OCFS2_I(inode)->ip_blkno,
851 be32_to_cpu(resp->r_hdr.h_node_num),
852 orphaned_slot, OCFS2_I(inode)->ip_orphaned_slot);
853
854 OCFS2_I(inode)->ip_orphaned_slot = orphaned_slot;
855 spin_unlock(&OCFS2_I(inode)->ip_lock);
856}
857
858int ocfs2_request_delete_vote(struct inode *inode)
859{
860 int orphaned_slot, status;
861 struct ocfs2_net_response_cb delete_cb;
862 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
863 struct ocfs2_vote_msg *request;
864
865 spin_lock(&OCFS2_I(inode)->ip_lock);
866 orphaned_slot = OCFS2_I(inode)->ip_orphaned_slot;
867 spin_unlock(&OCFS2_I(inode)->ip_lock);
868
869 delete_cb.rc_cb = ocfs2_delete_response_cb;
870 delete_cb.rc_priv = inode;
871
872 mlog(0, "Inode %"MLFu64", we start thinking orphaned slot is %d\n",
873 OCFS2_I(inode)->ip_blkno, orphaned_slot);
874
875 status = -ENOMEM;
876 request = ocfs2_new_vote_request(osb, OCFS2_I(inode)->ip_blkno,
877 inode->i_generation,
878 OCFS2_VOTE_REQ_DELETE, orphaned_slot);
879 if (request) {
880 status = ocfs2_request_vote(inode, request, &delete_cb);
881
882 kfree(request);
883 }
884
885 return status;
886}
887
888static void ocfs2_setup_unlink_vote(struct ocfs2_vote_msg *request,
889 struct dentry *dentry)
890{
891 struct inode *parent = dentry->d_parent->d_inode;
892
893 /* We need some values which will uniquely identify a dentry
894 * on the other nodes so that they can find it and run
895 * d_delete against it. Parent directory block and full name
896 * should suffice. */
897
898 mlog(0, "unlink/rename request: parent: %"MLFu64" name: %.*s\n",
899 OCFS2_I(parent)->ip_blkno, dentry->d_name.len,
900 dentry->d_name.name);
901
902 request->v_unlink_parent = cpu_to_be64(OCFS2_I(parent)->ip_blkno);
903 request->v_unlink_namelen = cpu_to_be32(dentry->d_name.len);
904 memcpy(request->v_unlink_dirent, dentry->d_name.name,
905 dentry->d_name.len);
906}
907
908int ocfs2_request_unlink_vote(struct inode *inode,
909 struct dentry *dentry,
910 unsigned int nlink)
911{
912 int status;
913 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
914 struct ocfs2_vote_msg *request;
915
916 if (dentry->d_name.len > OCFS2_VOTE_FILENAME_LEN)
917 return -ENAMETOOLONG;
918
919 status = -ENOMEM;
920 request = ocfs2_new_vote_request(osb, OCFS2_I(inode)->ip_blkno,
921 inode->i_generation,
922 OCFS2_VOTE_REQ_UNLINK, nlink);
923 if (request) {
924 ocfs2_setup_unlink_vote(request, dentry);
925
926 status = ocfs2_request_vote(inode, request, NULL);
927
928 kfree(request);
929 }
930 return status;
931}
932
933int ocfs2_request_rename_vote(struct inode *inode,
934 struct dentry *dentry)
935{
936 int status;
937 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
938 struct ocfs2_vote_msg *request;
939
940 if (dentry->d_name.len > OCFS2_VOTE_FILENAME_LEN)
941 return -ENAMETOOLONG;
942
943 status = -ENOMEM;
944 request = ocfs2_new_vote_request(osb, OCFS2_I(inode)->ip_blkno,
945 inode->i_generation,
946 OCFS2_VOTE_REQ_RENAME, 0);
947 if (request) {
948 ocfs2_setup_unlink_vote(request, dentry);
949
950 status = ocfs2_request_vote(inode, request, NULL);
951
952 kfree(request);
953 }
954 return status;
955}
956
957int ocfs2_request_mount_vote(struct ocfs2_super *osb)
958{
959 int status;
960 struct ocfs2_vote_msg *request = NULL;
961
962 request = ocfs2_new_vote_request(osb, 0ULL, 0,
963 OCFS2_VOTE_REQ_MOUNT, 0);
964 if (!request) {
965 status = -ENOMEM;
966 goto bail;
967 }
968
969 status = -EAGAIN;
970 while (status == -EAGAIN) {
971 if (!(osb->s_mount_opt & OCFS2_MOUNT_NOINTR) &&
972 signal_pending(current)) {
973 status = -ERESTARTSYS;
974 goto bail;
975 }
976
977 if (ocfs2_node_map_is_only(osb, &osb->mounted_map,
978 osb->node_num)) {
979 status = 0;
980 goto bail;
981 }
982
983 status = ocfs2_do_request_vote(osb, request, NULL);
984 }
985
986bail:
987 if (request)
988 kfree(request);
989
990 return status;
991}
992
993int ocfs2_request_umount_vote(struct ocfs2_super *osb)
994{
995 int status;
996 struct ocfs2_vote_msg *request = NULL;
997
998 request = ocfs2_new_vote_request(osb, 0ULL, 0,
999 OCFS2_VOTE_REQ_UMOUNT, 0);
1000 if (!request) {
1001 status = -ENOMEM;
1002 goto bail;
1003 }
1004
1005 status = -EAGAIN;
1006 while (status == -EAGAIN) {
1007 /* Do not check signals on this vote... We really want
1008 * this one to go all the way through. */
1009
1010 if (ocfs2_node_map_is_only(osb, &osb->mounted_map,
1011 osb->node_num)) {
1012 status = 0;
1013 goto bail;
1014 }
1015
1016 status = ocfs2_do_request_vote(osb, request, NULL);
1017 }
1018
1019bail:
1020 if (request)
1021 kfree(request);
1022
1023 return status;
1024}
1025
1026/* TODO: This should eventually be a hash table! */
1027static struct ocfs2_net_wait_ctxt * __ocfs2_find_net_wait_ctxt(struct ocfs2_super *osb,
1028 u32 response_id)
1029{
1030 struct list_head *p;
1031 struct ocfs2_net_wait_ctxt *w = NULL;
1032
1033 list_for_each(p, &osb->net_response_list) {
1034 w = list_entry(p, struct ocfs2_net_wait_ctxt, n_list);
1035 if (response_id == w->n_response_id)
1036 break;
1037 w = NULL;
1038 }
1039
1040 return w;
1041}
1042
1043/* Translate response codes into local node errno values */
1044static inline int ocfs2_translate_response(int response)
1045{
1046 int ret;
1047
1048 switch (response) {
1049 case OCFS2_RESPONSE_OK:
1050 ret = 0;
1051 break;
1052
1053 case OCFS2_RESPONSE_BUSY:
1054 ret = -EBUSY;
1055 break;
1056
1057 default:
1058 ret = -EINVAL;
1059 }
1060
1061 return ret;
1062}
1063
1064static int ocfs2_handle_response_message(struct o2net_msg *msg,
1065 u32 len,
1066 void *data)
1067{
1068 unsigned int response_id, node_num;
1069 int response_status;
1070 struct ocfs2_super *osb = data;
1071 struct ocfs2_response_msg *resp;
1072 struct ocfs2_net_wait_ctxt * w;
1073 struct ocfs2_net_response_cb *resp_cb;
1074
1075 resp = (struct ocfs2_response_msg *) msg->buf;
1076
1077 response_id = be32_to_cpu(resp->r_hdr.h_response_id);
1078 node_num = be32_to_cpu(resp->r_hdr.h_node_num);
1079 response_status =
1080 ocfs2_translate_response(be32_to_cpu(resp->r_response));
1081
1082 mlog(0, "received response message:\n");
1083 mlog(0, "h_response_id = %u\n", response_id);
1084 mlog(0, "h_request = %u\n", be32_to_cpu(resp->r_hdr.h_request));
1085 mlog(0, "h_blkno = %"MLFu64"\n", be64_to_cpu(resp->r_hdr.h_blkno));
1086 mlog(0, "h_generation = %u\n", be32_to_cpu(resp->r_hdr.h_generation));
1087 mlog(0, "h_node_num = %u\n", node_num);
1088 mlog(0, "r_response = %d\n", response_status);
1089
1090 spin_lock(&osb->net_response_lock);
1091 w = __ocfs2_find_net_wait_ctxt(osb, response_id);
1092 if (!w) {
1093 mlog(0, "request not found!\n");
1094 goto bail;
1095 }
1096 resp_cb = w->n_callback;
1097
1098 if (response_status && (!w->n_response)) {
1099 /* we only really need one negative response so don't
1100 * set it twice. */
1101 w->n_response = response_status;
1102 }
1103
1104 if (resp_cb) {
1105 spin_unlock(&osb->net_response_lock);
1106
1107 resp_cb->rc_cb(resp_cb->rc_priv, resp);
1108
1109 spin_lock(&osb->net_response_lock);
1110 }
1111
1112 __ocfs2_mark_node_responded(osb, w, node_num);
1113bail:
1114 spin_unlock(&osb->net_response_lock);
1115
1116 return 0;
1117}
1118
1119static int ocfs2_handle_vote_message(struct o2net_msg *msg,
1120 u32 len,
1121 void *data)
1122{
1123 int status;
1124 struct ocfs2_super *osb = data;
1125 struct ocfs2_vote_work *work;
1126
1127 work = kmalloc(sizeof(struct ocfs2_vote_work), GFP_KERNEL);
1128 if (!work) {
1129 status = -ENOMEM;
1130 mlog_errno(status);
1131 goto bail;
1132 }
1133
1134 INIT_LIST_HEAD(&work->w_list);
1135 memcpy(&work->w_msg, msg->buf, sizeof(struct ocfs2_vote_msg));
1136
1137 mlog(0, "scheduling vote request:\n");
1138 mlog(0, "h_response_id = %u\n",
1139 be32_to_cpu(work->w_msg.v_hdr.h_response_id));
1140 mlog(0, "h_request = %u\n", be32_to_cpu(work->w_msg.v_hdr.h_request));
1141 mlog(0, "h_blkno = %"MLFu64"\n",
1142 be64_to_cpu(work->w_msg.v_hdr.h_blkno));
1143 mlog(0, "h_generation = %u\n",
1144 be32_to_cpu(work->w_msg.v_hdr.h_generation));
1145 mlog(0, "h_node_num = %u\n",
1146 be32_to_cpu(work->w_msg.v_hdr.h_node_num));
1147 mlog(0, "v_generic1 = %u\n", be32_to_cpu(work->w_msg.md1.v_generic1));
1148
1149 spin_lock(&osb->vote_task_lock);
1150 list_add_tail(&work->w_list, &osb->vote_list);
1151 osb->vote_count++;
1152 spin_unlock(&osb->vote_task_lock);
1153
1154 ocfs2_kick_vote_thread(osb);
1155
1156 status = 0;
1157bail:
1158 return status;
1159}
1160
1161void ocfs2_unregister_net_handlers(struct ocfs2_super *osb)
1162{
1163 if (!osb->net_key)
1164 return;
1165
1166 o2net_unregister_handler_list(&osb->osb_net_handlers);
1167
1168 if (!list_empty(&osb->net_response_list))
1169 mlog(ML_ERROR, "net response list not empty!\n");
1170
1171 osb->net_key = 0;
1172}
1173
1174int ocfs2_register_net_handlers(struct ocfs2_super *osb)
1175{
1176 int status = 0;
1177
1178 status = o2net_register_handler(OCFS2_MESSAGE_TYPE_RESPONSE,
1179 osb->net_key,
1180 sizeof(struct ocfs2_response_msg),
1181 ocfs2_handle_response_message,
1182 osb, &osb->osb_net_handlers);
1183 if (status) {
1184 mlog_errno(status);
1185 goto bail;
1186 }
1187
1188 status = o2net_register_handler(OCFS2_MESSAGE_TYPE_VOTE,
1189 osb->net_key,
1190 sizeof(struct ocfs2_vote_msg),
1191 ocfs2_handle_vote_message,
1192 osb, &osb->osb_net_handlers);
1193 if (status) {
1194 mlog_errno(status);
1195 goto bail;
1196 }
1197bail:
1198 if (status < 0)
1199 ocfs2_unregister_net_handlers(osb);
1200
1201 return status;
1202}
diff --git a/fs/ocfs2/vote.h b/fs/ocfs2/vote.h
new file mode 100644
index 000000000000..9cce60703466
--- /dev/null
+++ b/fs/ocfs2/vote.h
@@ -0,0 +1,56 @@
1/* -*- mode: c; c-basic-offset: 8; -*-
2 * vim: noexpandtab sw=8 ts=8 sts=0:
3 *
4 * vote.h
5 *
6 * Declarations for the OCFS2 vote thread, vote requests and net handler registration.
7 *
8 * Copyright (C) 2002, 2004 Oracle. All rights reserved.
9 *
10 * This program is free software; you can redistribute it and/or
11 * modify it under the terms of the GNU General Public
12 * License as published by the Free Software Foundation; either
13 * version 2 of the License, or (at your option) any later version.
14 *
15 * This program is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
18 * General Public License for more details.
19 *
20 * You should have received a copy of the GNU General Public
21 * License along with this program; if not, write to the
22 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
23 * Boston, MA 02111-1307, USA.
24 */
25
26
27#ifndef VOTE_H
28#define VOTE_H
29
30int ocfs2_vote_thread(void *arg);
31static inline void ocfs2_kick_vote_thread(struct ocfs2_super *osb)
32{
33 spin_lock(&osb->vote_task_lock);
34 /* make sure the voting thread gets a swipe at whatever changes
35 * the caller may have made to the voting state */
36 osb->vote_wake_sequence++;
37 spin_unlock(&osb->vote_task_lock);
38 wake_up(&osb->vote_event);
39}
40
/* vote requests */
int ocfs2_request_delete_vote(struct inode *inode);
int ocfs2_request_unlink_vote(struct inode *inode, struct dentry *dentry,
			      unsigned int nlink);
int ocfs2_request_rename_vote(struct inode *inode, struct dentry *dentry);
int ocfs2_request_mount_vote(struct ocfs2_super *osb);
int ocfs2_request_umount_vote(struct ocfs2_super *osb);

/* registration of the response and vote message handlers */
int ocfs2_register_net_handlers(struct ocfs2_super *osb);
void ocfs2_unregister_net_handlers(struct ocfs2_super *osb);

void ocfs2_mark_inode_remotely_deleted(struct inode *inode);

void ocfs2_remove_node_from_vote_queues(struct ocfs2_super *osb, int node_num);
56#endif