aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorMark Fasheh <mark.fasheh@oracle.com>2005-12-15 17:31:24 -0500
committerJoel Becker <joel.becker@oracle.com>2006-01-03 14:45:47 -0500
commitccd979bdbce9fba8412beb3f1de68a9d0171b12c (patch)
treec50ed941849ce06ccadd4ce27599b3ef9fdbe2ae
parent8df08c89c668e1bd922a053fdb5ba1fadbecbb38 (diff)
[PATCH] OCFS2: The Second Oracle Cluster Filesystem
The OCFS2 file system module. Signed-off-by: Mark Fasheh <mark.fasheh@oracle.com> Signed-off-by: Kurt Hackel <kurt.hackel@oracle.com>
-rw-r--r--Documentation/filesystems/00-INDEX2
-rw-r--r--Documentation/filesystems/ocfs2.txt55
-rw-r--r--MAINTAINERS9
-rw-r--r--fs/ocfs2/Makefile33
-rw-r--r--fs/ocfs2/alloc.c2040
-rw-r--r--fs/ocfs2/alloc.h82
-rw-r--r--fs/ocfs2/aops.c643
-rw-r--r--fs/ocfs2/aops.h41
-rw-r--r--fs/ocfs2/buffer_head_io.c232
-rw-r--r--fs/ocfs2/buffer_head_io.h73
-rw-r--r--fs/ocfs2/dcache.c91
-rw-r--r--fs/ocfs2/dcache.h31
-rw-r--r--fs/ocfs2/dir.c618
-rw-r--r--fs/ocfs2/dir.h54
-rw-r--r--fs/ocfs2/dlmglue.c2904
-rw-r--r--fs/ocfs2/dlmglue.h111
-rw-r--r--fs/ocfs2/endian.h45
-rw-r--r--fs/ocfs2/export.c248
-rw-r--r--fs/ocfs2/export.h31
-rw-r--r--fs/ocfs2/extent_map.c994
-rw-r--r--fs/ocfs2/extent_map.h46
-rw-r--r--fs/ocfs2/file.c1237
-rw-r--r--fs/ocfs2/file.h57
-rw-r--r--fs/ocfs2/heartbeat.c378
-rw-r--r--fs/ocfs2/heartbeat.h67
-rw-r--r--fs/ocfs2/inode.c1140
-rw-r--r--fs/ocfs2/inode.h145
-rw-r--r--fs/ocfs2/journal.c1652
-rw-r--r--fs/ocfs2/journal.h457
-rw-r--r--fs/ocfs2/localalloc.c983
-rw-r--r--fs/ocfs2/localalloc.h56
-rw-r--r--fs/ocfs2/mmap.c102
-rw-r--r--fs/ocfs2/mmap.h6
-rw-r--r--fs/ocfs2/namei.c2264
-rw-r--r--fs/ocfs2/namei.h58
-rw-r--r--fs/ocfs2/ocfs1_fs_compat.h109
-rw-r--r--fs/ocfs2/ocfs2.h464
-rw-r--r--fs/ocfs2/ocfs2_fs.h638
-rw-r--r--fs/ocfs2/ocfs2_lockid.h73
-rw-r--r--fs/ocfs2/slot_map.c303
-rw-r--r--fs/ocfs2/slot_map.h66
-rw-r--r--fs/ocfs2/suballoc.c1651
-rw-r--r--fs/ocfs2/suballoc.h132
-rw-r--r--fs/ocfs2/super.c1733
-rw-r--r--fs/ocfs2/super.h44
-rw-r--r--fs/ocfs2/symlink.c180
-rw-r--r--fs/ocfs2/symlink.h42
-rw-r--r--fs/ocfs2/sysfile.c131
-rw-r--r--fs/ocfs2/sysfile.h33
-rw-r--r--fs/ocfs2/uptodate.c544
-rw-r--r--fs/ocfs2/uptodate.h44
-rw-r--r--fs/ocfs2/ver.c43
-rw-r--r--fs/ocfs2/ver.h31
-rw-r--r--fs/ocfs2/vote.c1202
-rw-r--r--fs/ocfs2/vote.h56
55 files changed, 24504 insertions, 0 deletions
diff --git a/Documentation/filesystems/00-INDEX b/Documentation/filesystems/00-INDEX
index d9b0a0691866..2580ada100a0 100644
--- a/Documentation/filesystems/00-INDEX
+++ b/Documentation/filesystems/00-INDEX
@@ -36,6 +36,8 @@ ntfs.txt
36 - info and mount options for the NTFS filesystem (Windows NT). 36 - info and mount options for the NTFS filesystem (Windows NT).
37proc.txt 37proc.txt
38 - info on Linux's /proc filesystem. 38 - info on Linux's /proc filesystem.
39ocfs2.txt
40 - info and mount options for the OCFS2 clustered filesystem.
39romfs.txt 41romfs.txt
40 - Description of the ROMFS filesystem. 42 - Description of the ROMFS filesystem.
41smbfs.txt 43smbfs.txt
diff --git a/Documentation/filesystems/ocfs2.txt b/Documentation/filesystems/ocfs2.txt
new file mode 100644
index 000000000000..f2595caf052e
--- /dev/null
+++ b/Documentation/filesystems/ocfs2.txt
@@ -0,0 +1,55 @@
1OCFS2 filesystem
2==================
3OCFS2 is a general purpose extent based shared disk cluster file
4system with many similarities to ext3. It supports 64 bit inode
5numbers, and has automatically extending metadata groups which may
6also make it attractive for non-clustered use.
7
8You'll want to install the ocfs2-tools package in order to at least
9get "mount.ocfs2" and "ocfs2_hb_ctl".
10
11Project web page: http://oss.oracle.com/projects/ocfs2
12Tools web page: http://oss.oracle.com/projects/ocfs2-tools
13OCFS2 mailing lists: http://oss.oracle.com/projects/ocfs2/mailman/
14
15All code copyright 2005 Oracle except when otherwise noted.
16
17CREDITS:
18Lots of code taken from ext3 and other projects.
19
20Authors in alphabetical order:
21Joel Becker <joel.becker@oracle.com>
22Zach Brown <zach.brown@oracle.com>
23Mark Fasheh <mark.fasheh@oracle.com>
24Kurt Hackel <kurt.hackel@oracle.com>
25Sunil Mushran <sunil.mushran@oracle.com>
26Manish Singh <manish.singh@oracle.com>
27
28Caveats
29=======
30Features which OCFS2 does not support yet:
31 - sparse files
32 - extended attributes
33 - shared writeable mmap
34 - loopback is supported, but data written will not
35 be cluster coherent.
36 - quotas
37 - cluster aware flock
38 - Directory change notification (F_NOTIFY)
39 - Distributed Caching (F_SETLEASE/F_GETLEASE/break_lease)
40 - POSIX ACLs
41 - readpages / writepages (not user visible)
42
43Mount options
44=============
45
46OCFS2 supports the following mount options:
47(*) == default
48
49barrier=1 This enables/disables barriers. barrier=0 disables it,
50 barrier=1 enables it.
51errors=remount-ro(*) Remount the filesystem read-only on an error.
52errors=panic Panic and halt the machine if an error occurs.
53intr (*) Allow signals to interrupt cluster operations.
54nointr Do not allow signals to interrupt cluster
55 operations.
diff --git a/MAINTAINERS b/MAINTAINERS
index 86ee06f43794..15888302025f 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -1905,6 +1905,15 @@ M: ajoshi@shell.unixbox.com
1905L: linux-nvidia@lists.surfsouth.com 1905L: linux-nvidia@lists.surfsouth.com
1906S: Maintained 1906S: Maintained
1907 1907
1908ORACLE CLUSTER FILESYSTEM 2 (OCFS2)
1909P: Mark Fasheh
1910M: mark.fasheh@oracle.com
1911P: Kurt Hackel
1912M: kurt.hackel@oracle.com
1913L: ocfs2-devel@oss.oracle.com
1914W: http://oss.oracle.com/projects/ocfs2/
1915S: Supported
1916
1908OLYMPIC NETWORK DRIVER 1917OLYMPIC NETWORK DRIVER
1909P: Peter De Shrijver 1918P: Peter De Shrijver
1910M: p2@ace.ulyssis.student.kuleuven.ac.be 1919M: p2@ace.ulyssis.student.kuleuven.ac.be
diff --git a/fs/ocfs2/Makefile b/fs/ocfs2/Makefile
new file mode 100644
index 000000000000..7d3be845a614
--- /dev/null
+++ b/fs/ocfs2/Makefile
@@ -0,0 +1,33 @@
1EXTRA_CFLAGS += -Ifs/ocfs2
2
3EXTRA_CFLAGS += -DCATCH_BH_JBD_RACES
4
5obj-$(CONFIG_OCFS2_FS) += ocfs2.o
6
7ocfs2-objs := \
8 alloc.o \
9 aops.o \
10 buffer_head_io.o \
11 dcache.o \
12 dir.o \
13 dlmglue.o \
14 export.o \
15 extent_map.o \
16 file.o \
17 heartbeat.o \
18 inode.o \
19 journal.o \
20 localalloc.o \
21 mmap.o \
22 namei.o \
23 slot_map.o \
24 suballoc.o \
25 super.o \
26 symlink.o \
27 sysfile.o \
28 uptodate.o \
29 ver.o \
30 vote.o
31
32obj-$(CONFIG_OCFS2_FS) += cluster/
33obj-$(CONFIG_OCFS2_FS) += dlm/
diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c
new file mode 100644
index 000000000000..465f797451ee
--- /dev/null
+++ b/fs/ocfs2/alloc.c
@@ -0,0 +1,2040 @@
1/* -*- mode: c; c-basic-offset: 8; -*-
2 * vim: noexpandtab sw=8 ts=8 sts=0:
3 *
4 * alloc.c
5 *
6 * Extent allocs and frees
7 *
8 * Copyright (C) 2002, 2004 Oracle. All rights reserved.
9 *
10 * This program is free software; you can redistribute it and/or
11 * modify it under the terms of the GNU General Public
12 * License as published by the Free Software Foundation; either
13 * version 2 of the License, or (at your option) any later version.
14 *
15 * This program is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
18 * General Public License for more details.
19 *
20 * You should have received a copy of the GNU General Public
21 * License along with this program; if not, write to the
22 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
23 * Boston, MA 02111-1307, USA.
24 */
25
26#include <linux/fs.h>
27#include <linux/types.h>
28#include <linux/slab.h>
29#include <linux/highmem.h>
30
31#define MLOG_MASK_PREFIX ML_DISK_ALLOC
32#include <cluster/masklog.h>
33
34#include "ocfs2.h"
35
36#include "alloc.h"
37#include "dlmglue.h"
38#include "extent_map.h"
39#include "inode.h"
40#include "journal.h"
41#include "localalloc.h"
42#include "suballoc.h"
43#include "sysfile.h"
44#include "file.h"
45#include "super.h"
46#include "uptodate.h"
47
48#include "buffer_head_io.h"
49
50static int ocfs2_extent_contig(struct inode *inode,
51 struct ocfs2_extent_rec *ext,
52 u64 blkno);
53
54static int ocfs2_create_new_meta_bhs(struct ocfs2_super *osb,
55 struct ocfs2_journal_handle *handle,
56 struct inode *inode,
57 int wanted,
58 struct ocfs2_alloc_context *meta_ac,
59 struct buffer_head *bhs[]);
60
61static int ocfs2_add_branch(struct ocfs2_super *osb,
62 struct ocfs2_journal_handle *handle,
63 struct inode *inode,
64 struct buffer_head *fe_bh,
65 struct buffer_head *eb_bh,
66 struct buffer_head *last_eb_bh,
67 struct ocfs2_alloc_context *meta_ac);
68
69static int ocfs2_shift_tree_depth(struct ocfs2_super *osb,
70 struct ocfs2_journal_handle *handle,
71 struct inode *inode,
72 struct buffer_head *fe_bh,
73 struct ocfs2_alloc_context *meta_ac,
74 struct buffer_head **ret_new_eb_bh);
75
76static int ocfs2_do_insert_extent(struct ocfs2_super *osb,
77 struct ocfs2_journal_handle *handle,
78 struct inode *inode,
79 struct buffer_head *fe_bh,
80 u64 blkno,
81 u32 new_clusters);
82
83static int ocfs2_find_branch_target(struct ocfs2_super *osb,
84 struct inode *inode,
85 struct buffer_head *fe_bh,
86 struct buffer_head **target_bh);
87
88static int ocfs2_find_new_last_ext_blk(struct ocfs2_super *osb,
89 struct inode *inode,
90 struct ocfs2_dinode *fe,
91 unsigned int new_i_clusters,
92 struct buffer_head *old_last_eb,
93 struct buffer_head **new_last_eb);
94
95static void ocfs2_free_truncate_context(struct ocfs2_truncate_context *tc);
96
97static int ocfs2_extent_contig(struct inode *inode,
98 struct ocfs2_extent_rec *ext,
99 u64 blkno)
100{
101 return blkno == (le64_to_cpu(ext->e_blkno) +
102 ocfs2_clusters_to_blocks(inode->i_sb,
103 le32_to_cpu(ext->e_clusters)));
104}
105
106/*
107 * How many free extents have we got before we need more meta data?
108 */
109int ocfs2_num_free_extents(struct ocfs2_super *osb,
110 struct inode *inode,
111 struct ocfs2_dinode *fe)
112{
113 int retval;
114 struct ocfs2_extent_list *el;
115 struct ocfs2_extent_block *eb;
116 struct buffer_head *eb_bh = NULL;
117
118 mlog_entry_void();
119
120 if (!OCFS2_IS_VALID_DINODE(fe)) {
121 OCFS2_RO_ON_INVALID_DINODE(inode->i_sb, fe);
122 retval = -EIO;
123 goto bail;
124 }
125
126 if (fe->i_last_eb_blk) {
127 retval = ocfs2_read_block(osb, le64_to_cpu(fe->i_last_eb_blk),
128 &eb_bh, OCFS2_BH_CACHED, inode);
129 if (retval < 0) {
130 mlog_errno(retval);
131 goto bail;
132 }
133 eb = (struct ocfs2_extent_block *) eb_bh->b_data;
134 el = &eb->h_list;
135 } else
136 el = &fe->id2.i_list;
137
138 BUG_ON(el->l_tree_depth != 0);
139
140 retval = le16_to_cpu(el->l_count) - le16_to_cpu(el->l_next_free_rec);
141bail:
142 if (eb_bh)
143 brelse(eb_bh);
144
145 mlog_exit(retval);
146 return retval;
147}
148
149/* expects array to already be allocated
150 *
151 * sets h_signature, h_blkno, h_suballoc_bit, h_suballoc_slot, and
152 * l_count for you
153 */
/*
 * Claim 'wanted' metadata blocks and hand them back in bhs[0..wanted)
 * as zeroed, journaled extent blocks with only the minimal header
 * fields filled in.
 *
 * Returns 0 on success or a negative errno. On failure every non-NULL
 * entry of bhs[0..wanted) is released and set to NULL, so the caller
 * must pass in an array whose entries start out NULL.
 */
154static int ocfs2_create_new_meta_bhs(struct ocfs2_super *osb,
155 struct ocfs2_journal_handle *handle,
156 struct inode *inode,
157 int wanted,
158 struct ocfs2_alloc_context *meta_ac,
159 struct buffer_head *bhs[])
160{
161 int count, status, i;
162 u16 suballoc_bit_start;
163 u32 num_got;
164 u64 first_blkno;
165 struct ocfs2_extent_block *eb;
166
167 mlog_entry_void();
168
/* A single claim may return fewer blocks than requested (num_got),
 * so keep claiming until the array is full. */
169 count = 0;
170 while (count < wanted) {
171 status = ocfs2_claim_metadata(osb,
172 handle,
173 meta_ac,
174 wanted - count,
175 &suballoc_bit_start,
176 &num_got,
177 &first_blkno);
178 if (status < 0) {
179 mlog_errno(status);
180 goto bail;
181 }
182
/* The claimed blocks are consecutive: first_blkno and
 * suballoc_bit_start are advanced once per block below. */
183 for(i = count; i < (num_got + count); i++) {
184 bhs[i] = sb_getblk(osb->sb, first_blkno);
185 if (bhs[i] == NULL) {
186 status = -EIO;
187 mlog_errno(status);
188 goto bail;
189 }
190 ocfs2_set_new_buffer_uptodate(inode, bhs[i]);
191
192 status = ocfs2_journal_access(handle, inode, bhs[i],
193 OCFS2_JOURNAL_ACCESS_CREATE);
194 if (status < 0) {
195 mlog_errno(status);
196 goto bail;
197 }
198
199 memset(bhs[i]->b_data, 0, osb->sb->s_blocksize);
200 eb = (struct ocfs2_extent_block *) bhs[i]->b_data;
201 /* Ok, setup the minimal stuff here. */
202 strcpy(eb->h_signature, OCFS2_EXTENT_BLOCK_SIGNATURE);
203 eb->h_blkno = cpu_to_le64(first_blkno);
204 eb->h_fs_generation = cpu_to_le32(osb->fs_generation);
205
206#ifndef OCFS2_USE_ALL_METADATA_SUBALLOCATORS
207 /* we always use slot zero's suballocator */
208 eb->h_suballoc_slot = 0;
209#else
210 eb->h_suballoc_slot = cpu_to_le16(osb->slot_num);
211#endif
212 eb->h_suballoc_bit = cpu_to_le16(suballoc_bit_start);
213 eb->h_list.l_count =
214 cpu_to_le16(ocfs2_extent_recs_per_eb(osb->sb));
215
216 suballoc_bit_start++;
217 first_blkno++;
218
219 /* We'll also be dirtied by the caller, so
220 * this isn't absolutely necessary. */
221 status = ocfs2_journal_dirty(handle, bhs[i]);
222 if (status < 0) {
223 mlog_errno(status);
224 goto bail;
225 }
226 }
227
228 count += num_got;
229 }
230
231 status = 0;
232bail:
/* On error drop every buffer obtained so far; this walks the whole
 * array, including slots that were never filled in. */
233 if (status < 0) {
234 for(i = 0; i < wanted; i++) {
235 if (bhs[i])
236 brelse(bhs[i]);
237 bhs[i] = NULL;
238 }
239 }
240 mlog_exit(status);
241 return status;
242}
243
244/*
245 * Add an entire tree branch to our inode. eb_bh is the extent block
246 * to start at, if we don't want to start the branch at the dinode
247 * structure.
248 *
249 * last_eb_bh is required as we have to update its next_leaf pointer
250 * for the new last extent block.
251 *
252 * the new branch will be 'empty' in the sense that every block will
253 * contain a single record with e_clusters == 0.
254 */
/*
 * Build a chain of l_tree_depth new extent blocks and hook it in as a
 * new record of the list in eb_bh (or of the dinode list when eb_bh
 * is NULL).
 *
 * Returns 0 on success or a negative errno (-ENOMEM, -EIO, or an
 * error from ocfs2_create_new_meta_bhs()/ocfs2_journal_access()).
 * The references on the newly created buffers are always dropped
 * before returning.
 */
255static int ocfs2_add_branch(struct ocfs2_super *osb,
256 struct ocfs2_journal_handle *handle,
257 struct inode *inode,
258 struct buffer_head *fe_bh,
259 struct buffer_head *eb_bh,
260 struct buffer_head *last_eb_bh,
261 struct ocfs2_alloc_context *meta_ac)
262{
263 int status, new_blocks, i;
264 u64 next_blkno, new_last_eb_blk;
265 struct buffer_head *bh;
266 struct buffer_head **new_eb_bhs = NULL;
267 struct ocfs2_dinode *fe;
268 struct ocfs2_extent_block *eb;
269 struct ocfs2_extent_list *eb_el;
270 struct ocfs2_extent_list *el;
271
272 mlog_entry_void();
273
274 BUG_ON(!last_eb_bh);
275
276 fe = (struct ocfs2_dinode *) fe_bh->b_data;
277
/* el is the list the new branch gets attached to. */
278 if (eb_bh) {
279 eb = (struct ocfs2_extent_block *) eb_bh->b_data;
280 el = &eb->h_list;
281 } else
282 el = &fe->id2.i_list;
283
284 /* we never add a branch to a leaf. */
285 BUG_ON(!el->l_tree_depth);
286
/* One new block for every level below el. */
287 new_blocks = le16_to_cpu(el->l_tree_depth);
288
289 /* allocate the number of new eb blocks we need */
290 new_eb_bhs = kcalloc(new_blocks, sizeof(struct buffer_head *),
291 GFP_KERNEL);
292 if (!new_eb_bhs) {
293 status = -ENOMEM;
294 mlog_errno(status);
295 goto bail;
296 }
297
298 status = ocfs2_create_new_meta_bhs(osb, handle, inode, new_blocks,
299 meta_ac, new_eb_bhs);
300 if (status < 0) {
301 mlog_errno(status);
302 goto bail;
303 }
304
305 /* Note: new_eb_bhs[new_blocks - 1] is the guy which will be
306 * linked with the rest of the tree.
307 * conversely, new_eb_bhs[0] is the new bottommost leaf.
308 *
309 * when we leave the loop, new_last_eb_blk will point to the
310 * newest leaf, and next_blkno will point to the topmost extent
311 * block. */
312 next_blkno = new_last_eb_blk = 0;
313 for(i = 0; i < new_blocks; i++) {
314 bh = new_eb_bhs[i];
315 eb = (struct ocfs2_extent_block *) bh->b_data;
316 if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) {
317 OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb, eb);
318 status = -EIO;
319 goto bail;
320 }
321 eb_el = &eb->h_list;
322
323 status = ocfs2_journal_access(handle, inode, bh,
324 OCFS2_JOURNAL_ACCESS_CREATE);
325 if (status < 0) {
326 mlog_errno(status);
327 goto bail;
328 }
329
/* Block i sits at depth i and gets one zero-length record
 * at cpos i_clusters pointing at the block set up in the
 * previous iteration (0 for the leaf itself). */
330 eb->h_next_leaf_blk = 0;
331 eb_el->l_tree_depth = cpu_to_le16(i);
332 eb_el->l_next_free_rec = cpu_to_le16(1);
333 eb_el->l_recs[0].e_cpos = fe->i_clusters;
334 eb_el->l_recs[0].e_blkno = cpu_to_le64(next_blkno);
335 eb_el->l_recs[0].e_clusters = cpu_to_le32(0);
336 if (!eb_el->l_tree_depth)
337 new_last_eb_blk = le64_to_cpu(eb->h_blkno);
338
339 status = ocfs2_journal_dirty(handle, bh);
340 if (status < 0) {
341 mlog_errno(status);
342 goto bail;
343 }
344
345 next_blkno = le64_to_cpu(eb->h_blkno);
346 }
347
348 /* This is a bit hairy. We want to update up to three blocks
349 * here without leaving any of them in an inconsistent state
350 * in case of error. We don't have to worry about
351 * journal_dirty erroring as it won't unless we've aborted the
352 * handle (in which case we would never be here) so reserving
353 * the write with journal_access is all we need to do. */
354 status = ocfs2_journal_access(handle, inode, last_eb_bh,
355 OCFS2_JOURNAL_ACCESS_WRITE);
356 if (status < 0) {
357 mlog_errno(status);
358 goto bail;
359 }
360 status = ocfs2_journal_access(handle, inode, fe_bh,
361 OCFS2_JOURNAL_ACCESS_WRITE);
362 if (status < 0) {
363 mlog_errno(status);
364 goto bail;
365 }
366 if (eb_bh) {
367 status = ocfs2_journal_access(handle, inode, eb_bh,
368 OCFS2_JOURNAL_ACCESS_WRITE);
369 if (status < 0) {
370 mlog_errno(status);
371 goto bail;
372 }
373 }
374
375 /* Link the new branch into the rest of the tree (el will
376 * either be on the fe, or the extent block passed in. */
/* NOTE(review): nothing here checks l_next_free_rec against
 * l_count; presumably the caller guarantees el has a free slot. */
377 i = le16_to_cpu(el->l_next_free_rec);
378 el->l_recs[i].e_blkno = cpu_to_le64(next_blkno);
379 el->l_recs[i].e_cpos = fe->i_clusters;
380 el->l_recs[i].e_clusters = 0;
381 le16_add_cpu(&el->l_next_free_rec, 1);
382
383 /* fe needs a new last extent block pointer, as does the
384 * next_leaf on the previously last-extent-block. */
385 fe->i_last_eb_blk = cpu_to_le64(new_last_eb_blk);
386
387 eb = (struct ocfs2_extent_block *) last_eb_bh->b_data;
388 eb->h_next_leaf_blk = cpu_to_le64(new_last_eb_blk);
389
/* Failures of journal_dirty below are only logged; status is
 * reset to 0 before returning. */
390 status = ocfs2_journal_dirty(handle, last_eb_bh);
391 if (status < 0)
392 mlog_errno(status);
393 status = ocfs2_journal_dirty(handle, fe_bh);
394 if (status < 0)
395 mlog_errno(status);
396 if (eb_bh) {
397 status = ocfs2_journal_dirty(handle, eb_bh);
398 if (status < 0)
399 mlog_errno(status);
400 }
401
402 status = 0;
403bail:
404 if (new_eb_bhs) {
405 for (i = 0; i < new_blocks; i++)
406 if (new_eb_bhs[i])
407 brelse(new_eb_bhs[i]);
408 kfree(new_eb_bhs);
409 }
410
411 mlog_exit(status);
412 return status;
413}
414
415/*
416 * adds another level to the allocation tree.
417 * returns back the new extent block so you can add a branch to it
418 * after this call.
419 */
/*
 * Move the dinode's extent list into a newly allocated extent block
 * and turn the dinode list into a single record, one level deeper,
 * that points at that block.
 *
 * On success returns 0 and stores the new block's buffer in
 * *ret_new_eb_bh with its reference handed to the caller. On failure
 * returns a negative errno, releases the new buffer and leaves
 * *ret_new_eb_bh untouched.
 */
420static int ocfs2_shift_tree_depth(struct ocfs2_super *osb,
421 struct ocfs2_journal_handle *handle,
422 struct inode *inode,
423 struct buffer_head *fe_bh,
424 struct ocfs2_alloc_context *meta_ac,
425 struct buffer_head **ret_new_eb_bh)
426{
427 int status, i;
428 struct buffer_head *new_eb_bh = NULL;
429 struct ocfs2_dinode *fe;
430 struct ocfs2_extent_block *eb;
431 struct ocfs2_extent_list *fe_el;
432 struct ocfs2_extent_list *eb_el;
433
434 mlog_entry_void();
435
436 status = ocfs2_create_new_meta_bhs(osb, handle, inode, 1, meta_ac,
437 &new_eb_bh);
438 if (status < 0) {
439 mlog_errno(status);
440 goto bail;
441 }
442
443 eb = (struct ocfs2_extent_block *) new_eb_bh->b_data;
444 if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) {
445 OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb, eb);
446 status = -EIO;
447 goto bail;
448 }
449
450 eb_el = &eb->h_list;
451 fe = (struct ocfs2_dinode *) fe_bh->b_data;
452 fe_el = &fe->id2.i_list;
453
454 status = ocfs2_journal_access(handle, inode, new_eb_bh,
455 OCFS2_JOURNAL_ACCESS_CREATE);
456 if (status < 0) {
457 mlog_errno(status);
458 goto bail;
459 }
460
461 /* copy the fe data into the new extent block */
462 eb_el->l_tree_depth = fe_el->l_tree_depth;
463 eb_el->l_next_free_rec = fe_el->l_next_free_rec;
464 for(i = 0; i < le16_to_cpu(fe_el->l_next_free_rec); i++) {
465 eb_el->l_recs[i].e_cpos = fe_el->l_recs[i].e_cpos;
466 eb_el->l_recs[i].e_clusters = fe_el->l_recs[i].e_clusters;
467 eb_el->l_recs[i].e_blkno = fe_el->l_recs[i].e_blkno;
468 }
469
470 status = ocfs2_journal_dirty(handle, new_eb_bh);
471 if (status < 0) {
472 mlog_errno(status);
473 goto bail;
474 }
475
/* The dinode is only touched once the new block is complete. */
476 status = ocfs2_journal_access(handle, inode, fe_bh,
477 OCFS2_JOURNAL_ACCESS_WRITE);
478 if (status < 0) {
479 mlog_errno(status);
480 goto bail;
481 }
482
483 /* update fe now */
/* Record 0 covers all of i_clusters and points at the new block;
 * the other previously used records are cleared. */
484 le16_add_cpu(&fe_el->l_tree_depth, 1);
485 fe_el->l_recs[0].e_cpos = 0;
486 fe_el->l_recs[0].e_blkno = eb->h_blkno;
487 fe_el->l_recs[0].e_clusters = fe->i_clusters;
488 for(i = 1; i < le16_to_cpu(fe_el->l_next_free_rec); i++) {
489 fe_el->l_recs[i].e_cpos = 0;
490 fe_el->l_recs[i].e_clusters = 0;
491 fe_el->l_recs[i].e_blkno = 0;
492 }
493 fe_el->l_next_free_rec = cpu_to_le16(1);
494
495 /* If this is our 1st tree depth shift, then last_eb_blk
496 * becomes the allocated extent block */
497 if (fe_el->l_tree_depth == cpu_to_le16(1))
498 fe->i_last_eb_blk = eb->h_blkno;
499
500 status = ocfs2_journal_dirty(handle, fe_bh);
501 if (status < 0) {
502 mlog_errno(status);
503 goto bail;
504 }
505
/* Hand the reference to the caller; clearing the local keeps
 * the bail path from releasing it. */
506 *ret_new_eb_bh = new_eb_bh;
507 new_eb_bh = NULL;
508 status = 0;
509bail:
510 if (new_eb_bh)
511 brelse(new_eb_bh);
512
513 mlog_exit(status);
514 return status;
515}
516
517/*
518 * Expects the tree to already have room in the rightmost leaf for the
519 * extent. Updates all the extent blocks (and the dinode) on the way
520 * down.
521 */
522static int ocfs2_do_insert_extent(struct ocfs2_super *osb,
523 struct ocfs2_journal_handle *handle,
524 struct inode *inode,
525 struct buffer_head *fe_bh,
526 u64 start_blk,
527 u32 new_clusters)
528{
529 int status, i, num_bhs = 0;
530 u64 next_blkno;
531 u16 next_free;
532 struct buffer_head **eb_bhs = NULL;
533 struct ocfs2_dinode *fe;
534 struct ocfs2_extent_block *eb;
535 struct ocfs2_extent_list *el;
536
537 mlog_entry_void();
538
539 status = ocfs2_journal_access(handle, inode, fe_bh,
540 OCFS2_JOURNAL_ACCESS_WRITE);
541 if (status < 0) {
542 mlog_errno(status);
543 goto bail;
544 }
545
546 fe = (struct ocfs2_dinode *) fe_bh->b_data;
547 el = &fe->id2.i_list;
548 if (el->l_tree_depth) {
549 /* This is another operation where we want to be
550 * careful about our tree updates. An error here means
551 * none of the previous changes we made should roll
552 * forward. As a result, we have to record the buffers
553 * for this part of the tree in an array and reserve a
554 * journal write to them before making any changes. */
555 num_bhs = le16_to_cpu(fe->id2.i_list.l_tree_depth);
556 eb_bhs = kcalloc(num_bhs, sizeof(struct buffer_head *),
557 GFP_KERNEL);
558 if (!eb_bhs) {
559 status = -ENOMEM;
560 mlog_errno(status);
561 goto bail;
562 }
563
564 i = 0;
565 while(el->l_tree_depth) {
566 next_free = le16_to_cpu(el->l_next_free_rec);
567 if (next_free == 0) {
568 ocfs2_error(inode->i_sb,
569 "Dinode %"MLFu64" has a bad "
570 "extent list",
571 OCFS2_I(inode)->ip_blkno);
572 status = -EIO;
573 goto bail;
574 }
575 next_blkno = le64_to_cpu(el->l_recs[next_free - 1].e_blkno);
576
577 BUG_ON(i >= num_bhs);
578 status = ocfs2_read_block(osb, next_blkno, &eb_bhs[i],
579 OCFS2_BH_CACHED, inode);
580 if (status < 0) {
581 mlog_errno(status);
582 goto bail;
583 }
584 eb = (struct ocfs2_extent_block *) eb_bhs[i]->b_data;
585 if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) {
586 OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb,
587 eb);
588 status = -EIO;
589 goto bail;
590 }
591
592 status = ocfs2_journal_access(handle, inode, eb_bhs[i],
593 OCFS2_JOURNAL_ACCESS_WRITE);
594 if (status < 0) {
595 mlog_errno(status);
596 goto bail;
597 }
598
599 el = &eb->h_list;
600 i++;
601 /* When we leave this loop, eb_bhs[num_bhs - 1] will
602 * hold the bottom-most leaf extent block. */
603 }
604 BUG_ON(el->l_tree_depth);
605
606 el = &fe->id2.i_list;
607 /* If we have tree depth, then the fe update is
608 * trivial, and we want to switch el out for the
609 * bottom-most leaf in order to update it with the
610 * actual extent data below. */
611 next_free = le16_to_cpu(el->l_next_free_rec);
612 if (next_free == 0) {
613 ocfs2_error(inode->i_sb,
614 "Dinode %"MLFu64" has a bad "
615 "extent list",
616 OCFS2_I(inode)->ip_blkno);
617 status = -EIO;
618 goto bail;
619 }
620 le32_add_cpu(&el->l_recs[next_free - 1].e_clusters,
621 new_clusters);
622 /* (num_bhs - 1) to avoid the leaf */
623 for(i = 0; i < (num_bhs - 1); i++) {
624 eb = (struct ocfs2_extent_block *) eb_bhs[i]->b_data;
625 el = &eb->h_list;
626
627 /* finally, make our actual change to the
628 * intermediate extent blocks. */
629 next_free = le16_to_cpu(el->l_next_free_rec);
630 le32_add_cpu(&el->l_recs[next_free - 1].e_clusters,
631 new_clusters);
632
633 status = ocfs2_journal_dirty(handle, eb_bhs[i]);
634 if (status < 0)
635 mlog_errno(status);
636 }
637 BUG_ON(i != (num_bhs - 1));
638 /* note that the leaf block wasn't touched in
639 * the loop above */
640 eb = (struct ocfs2_extent_block *) eb_bhs[num_bhs - 1]->b_data;
641 el = &eb->h_list;
642 BUG_ON(el->l_tree_depth);
643 }
644
645 /* yay, we can finally add the actual extent now! */
646 i = le16_to_cpu(el->l_next_free_rec) - 1;
647 if (le16_to_cpu(el->l_next_free_rec) &&
648 ocfs2_extent_contig(inode, &el->l_recs[i], start_blk)) {
649 le32_add_cpu(&el->l_recs[i].e_clusters, new_clusters);
650 } else if (le16_to_cpu(el->l_next_free_rec) &&
651 (le32_to_cpu(el->l_recs[i].e_clusters) == 0)) {
652 /* having an empty extent at eof is legal. */
653 if (el->l_recs[i].e_cpos != fe->i_clusters) {
654 ocfs2_error(inode->i_sb,
655 "Dinode %"MLFu64" trailing extent is bad: "
656 "cpos (%u) != number of clusters (%u)",
657 le32_to_cpu(el->l_recs[i].e_cpos),
658 le32_to_cpu(fe->i_clusters));
659 status = -EIO;
660 goto bail;
661 }
662 el->l_recs[i].e_blkno = cpu_to_le64(start_blk);
663 el->l_recs[i].e_clusters = cpu_to_le32(new_clusters);
664 } else {
665 /* No contiguous record, or no empty record at eof, so
666 * we add a new one. */
667
668 BUG_ON(le16_to_cpu(el->l_next_free_rec) >=
669 le16_to_cpu(el->l_count));
670 i = le16_to_cpu(el->l_next_free_rec);
671
672 el->l_recs[i].e_blkno = cpu_to_le64(start_blk);
673 el->l_recs[i].e_clusters = cpu_to_le32(new_clusters);
674 el->l_recs[i].e_cpos = fe->i_clusters;
675 le16_add_cpu(&el->l_next_free_rec, 1);
676 }
677
678 /*
679 * extent_map errors are not fatal, so they are ignored outside
680 * of flushing the thing.
681 */
682 status = ocfs2_extent_map_append(inode, &el->l_recs[i],
683 new_clusters);
684 if (status) {
685 mlog_errno(status);
686 ocfs2_extent_map_drop(inode, le32_to_cpu(fe->i_clusters));
687 }
688
689 status = ocfs2_journal_dirty(handle, fe_bh);
690 if (status < 0)
691 mlog_errno(status);
692 if (fe->id2.i_list.l_tree_depth) {
693 status = ocfs2_journal_dirty(handle, eb_bhs[num_bhs - 1]);
694 if (status < 0)
695 mlog_errno(status);
696 }
697
698 status = 0;
699bail:
700 if (eb_bhs) {
701 for (i = 0; i < num_bhs; i++)
702 if (eb_bhs[i])
703 brelse(eb_bhs[i]);
704 kfree(eb_bhs);
705 }
706
707 mlog_exit(status);
708 return status;
709}
710
711/*
712 * Should only be called when there is no space left in any of the
713 * leaf nodes. What we want to do is find the lowest tree depth
714 * non-leaf extent block with room for new records. There are three
715 * valid results of this search:
716 *
717 * 1) a lowest extent block is found, then we pass it back in
718 * *lowest_eb_bh and return '0'
719 *
720 * 2) the search fails to find anything, but the dinode has room. We
721 * pass NULL back in *lowest_eb_bh, but still return '0'
722 *
723 * 3) the search fails to find anything AND the dinode is full, in
724 * which case we return > 0
725 *
726 * return status < 0 indicates an error.
727 */
728static int ocfs2_find_branch_target(struct ocfs2_super *osb,
729 struct inode *inode,
730 struct buffer_head *fe_bh,
731 struct buffer_head **target_bh)
732{
733 int status = 0, i;
734 u64 blkno;
735 struct ocfs2_dinode *fe;
736 struct ocfs2_extent_block *eb;
737 struct ocfs2_extent_list *el;
738 struct buffer_head *bh = NULL;
739 struct buffer_head *lowest_bh = NULL;
740
741 mlog_entry_void();
742
743 *target_bh = NULL;
744
745 fe = (struct ocfs2_dinode *) fe_bh->b_data;
746 el = &fe->id2.i_list;
747
748 while(le16_to_cpu(el->l_tree_depth) > 1) {
749 if (le16_to_cpu(el->l_next_free_rec) == 0) {
750 ocfs2_error(inode->i_sb, "Dinode %"MLFu64" has empty "
751 "extent list (next_free_rec == 0)",
752 OCFS2_I(inode)->ip_blkno);
753 status = -EIO;
754 goto bail;
755 }
756 i = le16_to_cpu(el->l_next_free_rec) - 1;
757 blkno = le64_to_cpu(el->l_recs[i].e_blkno);
758 if (!blkno) {
759 ocfs2_error(inode->i_sb, "Dinode %"MLFu64" has extent "
760 "list where extent # %d has no physical "
761 "block start",
762 OCFS2_I(inode)->ip_blkno, i);
763 status = -EIO;
764 goto bail;
765 }
766
767 if (bh) {
768 brelse(bh);
769 bh = NULL;
770 }
771
772 status = ocfs2_read_block(osb, blkno, &bh, OCFS2_BH_CACHED,
773 inode);
774 if (status < 0) {
775 mlog_errno(status);
776 goto bail;
777 }
778
779 eb = (struct ocfs2_extent_block *) bh->b_data;
780 if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) {
781 OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb, eb);
782 status = -EIO;
783 goto bail;
784 }
785 el = &eb->h_list;
786
787 if (le16_to_cpu(el->l_next_free_rec) <
788 le16_to_cpu(el->l_count)) {
789 if (lowest_bh)
790 brelse(lowest_bh);
791 lowest_bh = bh;
792 get_bh(lowest_bh);
793 }
794 }
795
796 /* If we didn't find one and the fe doesn't have any room,
797 * then return '1' */
798 if (!lowest_bh
799 && (fe->id2.i_list.l_next_free_rec == fe->id2.i_list.l_count))
800 status = 1;
801
802 *target_bh = lowest_bh;
803bail:
804 if (bh)
805 brelse(bh);
806
807 mlog_exit(status);
808 return status;
809}
810
811/* the caller needs to update fe->i_clusters */
812int ocfs2_insert_extent(struct ocfs2_super *osb,
813 struct ocfs2_journal_handle *handle,
814 struct inode *inode,
815 struct buffer_head *fe_bh,
816 u64 start_blk,
817 u32 new_clusters,
818 struct ocfs2_alloc_context *meta_ac)
819{
820 int status, i, shift;
821 struct buffer_head *last_eb_bh = NULL;
822 struct buffer_head *bh = NULL;
823 struct ocfs2_dinode *fe;
824 struct ocfs2_extent_block *eb;
825 struct ocfs2_extent_list *el;
826
827 mlog_entry_void();
828
829 mlog(0, "add %u clusters starting at block %"MLFu64" to "
830 "inode %"MLFu64"\n",
831 new_clusters, start_blk, OCFS2_I(inode)->ip_blkno);
832
833 fe = (struct ocfs2_dinode *) fe_bh->b_data;
834 el = &fe->id2.i_list;
835
836 if (el->l_tree_depth) {
837 /* jump to end of tree */
838 status = ocfs2_read_block(osb, le64_to_cpu(fe->i_last_eb_blk),
839 &last_eb_bh, OCFS2_BH_CACHED, inode);
840 if (status < 0) {
841 mlog_exit(status);
842 goto bail;
843 }
844 eb = (struct ocfs2_extent_block *) last_eb_bh->b_data;
845 el = &eb->h_list;
846 }
847
848 /* Can we allocate without adding/shifting tree bits? */
849 i = le16_to_cpu(el->l_next_free_rec) - 1;
850 if (le16_to_cpu(el->l_next_free_rec) == 0
851 || (le16_to_cpu(el->l_next_free_rec) < le16_to_cpu(el->l_count))
852 || le32_to_cpu(el->l_recs[i].e_clusters) == 0
853 || ocfs2_extent_contig(inode, &el->l_recs[i], start_blk))
854 goto out_add;
855
856 mlog(0, "ocfs2_allocate_extent: couldn't do a simple add, traversing "
857 "tree now.\n");
858
859 shift = ocfs2_find_branch_target(osb, inode, fe_bh, &bh);
860 if (shift < 0) {
861 status = shift;
862 mlog_errno(status);
863 goto bail;
864 }
865
866 /* We traveled all the way to the bottom of the allocation tree
867 * and didn't find room for any more extents - we need to add
868 * another tree level */
869 if (shift) {
870 /* if we hit a leaf, we'd better be empty :) */
871 BUG_ON(le16_to_cpu(el->l_next_free_rec) !=
872 le16_to_cpu(el->l_count));
873 BUG_ON(bh);
874 mlog(0, "ocfs2_allocate_extent: need to shift tree depth "
875 "(current = %u)\n",
876 le16_to_cpu(fe->id2.i_list.l_tree_depth));
877
878 /* ocfs2_shift_tree_depth will return us a buffer with
879 * the new extent block (so we can pass that to
880 * ocfs2_add_branch). */
881 status = ocfs2_shift_tree_depth(osb, handle, inode, fe_bh,
882 meta_ac, &bh);
883 if (status < 0) {
884 mlog_errno(status);
885 goto bail;
886 }
887 /* Special case: we have room now if we shifted from
888 * tree_depth 0 */
889 if (fe->id2.i_list.l_tree_depth == cpu_to_le16(1))
890 goto out_add;
891 }
892
893 /* call ocfs2_add_branch to add the final part of the tree with
894 * the new data. */
895 mlog(0, "ocfs2_allocate_extent: add branch. bh = %p\n", bh);
896 status = ocfs2_add_branch(osb, handle, inode, fe_bh, bh, last_eb_bh,
897 meta_ac);
898 if (status < 0) {
899 mlog_errno(status);
900 goto bail;
901 }
902
903out_add:
904 /* Finally, we can add clusters. */
905 status = ocfs2_do_insert_extent(osb, handle, inode, fe_bh,
906 start_blk, new_clusters);
907 if (status < 0)
908 mlog_errno(status);
909
910bail:
911 if (bh)
912 brelse(bh);
913
914 if (last_eb_bh)
915 brelse(last_eb_bh);
916
917 mlog_exit(status);
918 return status;
919}
920
921static inline int ocfs2_truncate_log_needs_flush(struct ocfs2_super *osb)
922{
923 struct buffer_head *tl_bh = osb->osb_tl_bh;
924 struct ocfs2_dinode *di;
925 struct ocfs2_truncate_log *tl;
926
927 di = (struct ocfs2_dinode *) tl_bh->b_data;
928 tl = &di->id2.i_dealloc;
929
930 mlog_bug_on_msg(le16_to_cpu(tl->tl_used) > le16_to_cpu(tl->tl_count),
931 "slot %d, invalid truncate log parameters: used = "
932 "%u, count = %u\n", osb->slot_num,
933 le16_to_cpu(tl->tl_used), le16_to_cpu(tl->tl_count));
934 return le16_to_cpu(tl->tl_used) == le16_to_cpu(tl->tl_count);
935}
936
937static int ocfs2_truncate_log_can_coalesce(struct ocfs2_truncate_log *tl,
938 unsigned int new_start)
939{
940 unsigned int tail_index;
941 unsigned int current_tail;
942
943 /* No records, nothing to coalesce */
944 if (!le16_to_cpu(tl->tl_used))
945 return 0;
946
947 tail_index = le16_to_cpu(tl->tl_used) - 1;
948 current_tail = le32_to_cpu(tl->tl_recs[tail_index].t_start);
949 current_tail += le32_to_cpu(tl->tl_recs[tail_index].t_clusters);
950
951 return current_tail == new_start;
952}
953
954static int ocfs2_truncate_log_append(struct ocfs2_super *osb,
955 struct ocfs2_journal_handle *handle,
956 u64 start_blk,
957 unsigned int num_clusters)
958{
959 int status, index;
960 unsigned int start_cluster, tl_count;
961 struct inode *tl_inode = osb->osb_tl_inode;
962 struct buffer_head *tl_bh = osb->osb_tl_bh;
963 struct ocfs2_dinode *di;
964 struct ocfs2_truncate_log *tl;
965
966 mlog_entry("start_blk = %"MLFu64", num_clusters = %u\n", start_blk,
967 num_clusters);
968
969 BUG_ON(!down_trylock(&tl_inode->i_sem));
970
971 start_cluster = ocfs2_blocks_to_clusters(osb->sb, start_blk);
972
973 di = (struct ocfs2_dinode *) tl_bh->b_data;
974 tl = &di->id2.i_dealloc;
975 if (!OCFS2_IS_VALID_DINODE(di)) {
976 OCFS2_RO_ON_INVALID_DINODE(osb->sb, di);
977 status = -EIO;
978 goto bail;
979 }
980
981 tl_count = le16_to_cpu(tl->tl_count);
982 mlog_bug_on_msg(tl_count > ocfs2_truncate_recs_per_inode(osb->sb) ||
983 tl_count == 0,
984 "Truncate record count on #%"MLFu64" invalid ("
985 "wanted %u, actual %u\n", OCFS2_I(tl_inode)->ip_blkno,
986 ocfs2_truncate_recs_per_inode(osb->sb),
987 le16_to_cpu(tl->tl_count));
988
989 /* Caller should have known to flush before calling us. */
990 index = le16_to_cpu(tl->tl_used);
991 if (index >= tl_count) {
992 status = -ENOSPC;
993 mlog_errno(status);
994 goto bail;
995 }
996
997 status = ocfs2_journal_access(handle, tl_inode, tl_bh,
998 OCFS2_JOURNAL_ACCESS_WRITE);
999 if (status < 0) {
1000 mlog_errno(status);
1001 goto bail;
1002 }
1003
1004 mlog(0, "Log truncate of %u clusters starting at cluster %u to "
1005 "%"MLFu64" (index = %d)\n", num_clusters, start_cluster,
1006 OCFS2_I(tl_inode)->ip_blkno, index);
1007
1008 if (ocfs2_truncate_log_can_coalesce(tl, start_cluster)) {
1009 /*
1010 * Move index back to the record we are coalescing with.
1011 * ocfs2_truncate_log_can_coalesce() guarantees nonzero
1012 */
1013 index--;
1014
1015 num_clusters += le32_to_cpu(tl->tl_recs[index].t_clusters);
1016 mlog(0, "Coalesce with index %u (start = %u, clusters = %u)\n",
1017 index, le32_to_cpu(tl->tl_recs[index].t_start),
1018 num_clusters);
1019 } else {
1020 tl->tl_recs[index].t_start = cpu_to_le32(start_cluster);
1021 tl->tl_used = cpu_to_le16(index + 1);
1022 }
1023 tl->tl_recs[index].t_clusters = cpu_to_le32(num_clusters);
1024
1025 status = ocfs2_journal_dirty(handle, tl_bh);
1026 if (status < 0) {
1027 mlog_errno(status);
1028 goto bail;
1029 }
1030
1031bail:
1032 mlog_exit(status);
1033 return status;
1034}
1035
1036static int ocfs2_replay_truncate_records(struct ocfs2_super *osb,
1037 struct ocfs2_journal_handle *handle,
1038 struct inode *data_alloc_inode,
1039 struct buffer_head *data_alloc_bh)
1040{
1041 int status = 0;
1042 int i;
1043 unsigned int num_clusters;
1044 u64 start_blk;
1045 struct ocfs2_truncate_rec rec;
1046 struct ocfs2_dinode *di;
1047 struct ocfs2_truncate_log *tl;
1048 struct inode *tl_inode = osb->osb_tl_inode;
1049 struct buffer_head *tl_bh = osb->osb_tl_bh;
1050
1051 mlog_entry_void();
1052
1053 di = (struct ocfs2_dinode *) tl_bh->b_data;
1054 tl = &di->id2.i_dealloc;
1055 i = le16_to_cpu(tl->tl_used) - 1;
1056 while (i >= 0) {
1057 /* Caller has given us at least enough credits to
1058 * update the truncate log dinode */
1059 status = ocfs2_journal_access(handle, tl_inode, tl_bh,
1060 OCFS2_JOURNAL_ACCESS_WRITE);
1061 if (status < 0) {
1062 mlog_errno(status);
1063 goto bail;
1064 }
1065
1066 tl->tl_used = cpu_to_le16(i);
1067
1068 status = ocfs2_journal_dirty(handle, tl_bh);
1069 if (status < 0) {
1070 mlog_errno(status);
1071 goto bail;
1072 }
1073
1074 /* TODO: Perhaps we can calculate the bulk of the
1075 * credits up front rather than extending like
1076 * this. */
1077 status = ocfs2_extend_trans(handle,
1078 OCFS2_TRUNCATE_LOG_FLUSH_ONE_REC);
1079 if (status < 0) {
1080 mlog_errno(status);
1081 goto bail;
1082 }
1083
1084 rec = tl->tl_recs[i];
1085 start_blk = ocfs2_clusters_to_blocks(data_alloc_inode->i_sb,
1086 le32_to_cpu(rec.t_start));
1087 num_clusters = le32_to_cpu(rec.t_clusters);
1088
1089 /* if start_blk is not set, we ignore the record as
1090 * invalid. */
1091 if (start_blk) {
1092 mlog(0, "free record %d, start = %u, clusters = %u\n",
1093 i, le32_to_cpu(rec.t_start), num_clusters);
1094
1095 status = ocfs2_free_clusters(handle, data_alloc_inode,
1096 data_alloc_bh, start_blk,
1097 num_clusters);
1098 if (status < 0) {
1099 mlog_errno(status);
1100 goto bail;
1101 }
1102 }
1103 i--;
1104 }
1105
1106bail:
1107 mlog_exit(status);
1108 return status;
1109}
1110
1111/* Expects you to already be holding tl_inode->i_sem */
1112static int __ocfs2_flush_truncate_log(struct ocfs2_super *osb)
1113{
1114 int status;
1115 unsigned int num_to_flush;
1116 struct ocfs2_journal_handle *handle = NULL;
1117 struct inode *tl_inode = osb->osb_tl_inode;
1118 struct inode *data_alloc_inode = NULL;
1119 struct buffer_head *tl_bh = osb->osb_tl_bh;
1120 struct buffer_head *data_alloc_bh = NULL;
1121 struct ocfs2_dinode *di;
1122 struct ocfs2_truncate_log *tl;
1123
1124 mlog_entry_void();
1125
1126 BUG_ON(!down_trylock(&tl_inode->i_sem));
1127
1128 di = (struct ocfs2_dinode *) tl_bh->b_data;
1129 tl = &di->id2.i_dealloc;
1130 if (!OCFS2_IS_VALID_DINODE(di)) {
1131 OCFS2_RO_ON_INVALID_DINODE(osb->sb, di);
1132 status = -EIO;
1133 goto bail;
1134 }
1135
1136 num_to_flush = le16_to_cpu(tl->tl_used);
1137 mlog(0, "Flush %u records from truncate log #%"MLFu64"\n",
1138 num_to_flush, OCFS2_I(tl_inode)->ip_blkno);
1139 if (!num_to_flush) {
1140 status = 0;
1141 goto bail;
1142 }
1143
1144 handle = ocfs2_alloc_handle(osb);
1145 if (!handle) {
1146 status = -ENOMEM;
1147 mlog_errno(status);
1148 goto bail;
1149 }
1150
1151 data_alloc_inode = ocfs2_get_system_file_inode(osb,
1152 GLOBAL_BITMAP_SYSTEM_INODE,
1153 OCFS2_INVALID_SLOT);
1154 if (!data_alloc_inode) {
1155 status = -EINVAL;
1156 mlog(ML_ERROR, "Could not get bitmap inode!\n");
1157 goto bail;
1158 }
1159
1160 ocfs2_handle_add_inode(handle, data_alloc_inode);
1161 status = ocfs2_meta_lock(data_alloc_inode, handle, &data_alloc_bh, 1);
1162 if (status < 0) {
1163 mlog_errno(status);
1164 goto bail;
1165 }
1166
1167 handle = ocfs2_start_trans(osb, handle, OCFS2_TRUNCATE_LOG_UPDATE);
1168 if (IS_ERR(handle)) {
1169 status = PTR_ERR(handle);
1170 handle = NULL;
1171 mlog_errno(status);
1172 goto bail;
1173 }
1174
1175 status = ocfs2_replay_truncate_records(osb, handle, data_alloc_inode,
1176 data_alloc_bh);
1177 if (status < 0) {
1178 mlog_errno(status);
1179 goto bail;
1180 }
1181
1182bail:
1183 if (handle)
1184 ocfs2_commit_trans(handle);
1185
1186 if (data_alloc_inode)
1187 iput(data_alloc_inode);
1188
1189 if (data_alloc_bh)
1190 brelse(data_alloc_bh);
1191
1192 mlog_exit(status);
1193 return status;
1194}
1195
1196int ocfs2_flush_truncate_log(struct ocfs2_super *osb)
1197{
1198 int status;
1199 struct inode *tl_inode = osb->osb_tl_inode;
1200
1201 down(&tl_inode->i_sem);
1202 status = __ocfs2_flush_truncate_log(osb);
1203 up(&tl_inode->i_sem);
1204
1205 return status;
1206}
1207
/*
 * Work handler registered for osb_truncate_log_wq: flushes the
 * truncate log of the ocfs2_super passed in 'data'. A failure is only
 * logged, there is nobody to return it to.
 */
static void ocfs2_truncate_log_worker(void *data)
{
	int ret;

	mlog_entry_void();

	ret = ocfs2_flush_truncate_log((struct ocfs2_super *) data);
	if (ret < 0)
		mlog_errno(ret);

	mlog_exit(ret);
}
1221
1222#define OCFS2_TRUNCATE_LOG_FLUSH_INTERVAL (2 * HZ)
1223void ocfs2_schedule_truncate_log_flush(struct ocfs2_super *osb,
1224 int cancel)
1225{
1226 if (osb->osb_tl_inode) {
1227 /* We want to push off log flushes while truncates are
1228 * still running. */
1229 if (cancel)
1230 cancel_delayed_work(&osb->osb_truncate_log_wq);
1231
1232 queue_delayed_work(ocfs2_wq, &osb->osb_truncate_log_wq,
1233 OCFS2_TRUNCATE_LOG_FLUSH_INTERVAL);
1234 }
1235}
1236
1237static int ocfs2_get_truncate_log_info(struct ocfs2_super *osb,
1238 int slot_num,
1239 struct inode **tl_inode,
1240 struct buffer_head **tl_bh)
1241{
1242 int status;
1243 struct inode *inode = NULL;
1244 struct buffer_head *bh = NULL;
1245
1246 inode = ocfs2_get_system_file_inode(osb,
1247 TRUNCATE_LOG_SYSTEM_INODE,
1248 slot_num);
1249 if (!inode) {
1250 status = -EINVAL;
1251 mlog(ML_ERROR, "Could not get load truncate log inode!\n");
1252 goto bail;
1253 }
1254
1255 status = ocfs2_read_block(osb, OCFS2_I(inode)->ip_blkno, &bh,
1256 OCFS2_BH_CACHED, inode);
1257 if (status < 0) {
1258 iput(inode);
1259 mlog_errno(status);
1260 goto bail;
1261 }
1262
1263 *tl_inode = inode;
1264 *tl_bh = bh;
1265bail:
1266 mlog_exit(status);
1267 return status;
1268}
1269
1270/* called during the 1st stage of node recovery. we stamp a clean
1271 * truncate log and pass back a copy for processing later. if the
1272 * truncate log does not require processing, a *tl_copy is set to
1273 * NULL. */
1274int ocfs2_begin_truncate_log_recovery(struct ocfs2_super *osb,
1275 int slot_num,
1276 struct ocfs2_dinode **tl_copy)
1277{
1278 int status;
1279 struct inode *tl_inode = NULL;
1280 struct buffer_head *tl_bh = NULL;
1281 struct ocfs2_dinode *di;
1282 struct ocfs2_truncate_log *tl;
1283
1284 *tl_copy = NULL;
1285
1286 mlog(0, "recover truncate log from slot %d\n", slot_num);
1287
1288 status = ocfs2_get_truncate_log_info(osb, slot_num, &tl_inode, &tl_bh);
1289 if (status < 0) {
1290 mlog_errno(status);
1291 goto bail;
1292 }
1293
1294 di = (struct ocfs2_dinode *) tl_bh->b_data;
1295 tl = &di->id2.i_dealloc;
1296 if (!OCFS2_IS_VALID_DINODE(di)) {
1297 OCFS2_RO_ON_INVALID_DINODE(tl_inode->i_sb, di);
1298 status = -EIO;
1299 goto bail;
1300 }
1301
1302 if (le16_to_cpu(tl->tl_used)) {
1303 mlog(0, "We'll have %u logs to recover\n",
1304 le16_to_cpu(tl->tl_used));
1305
1306 *tl_copy = kmalloc(tl_bh->b_size, GFP_KERNEL);
1307 if (!(*tl_copy)) {
1308 status = -ENOMEM;
1309 mlog_errno(status);
1310 goto bail;
1311 }
1312
1313 /* Assuming the write-out below goes well, this copy
1314 * will be passed back to recovery for processing. */
1315 memcpy(*tl_copy, tl_bh->b_data, tl_bh->b_size);
1316
1317 /* All we need to do to clear the truncate log is set
1318 * tl_used. */
1319 tl->tl_used = 0;
1320
1321 status = ocfs2_write_block(osb, tl_bh, tl_inode);
1322 if (status < 0) {
1323 mlog_errno(status);
1324 goto bail;
1325 }
1326 }
1327
1328bail:
1329 if (tl_inode)
1330 iput(tl_inode);
1331 if (tl_bh)
1332 brelse(tl_bh);
1333
1334 if (status < 0 && (*tl_copy)) {
1335 kfree(*tl_copy);
1336 *tl_copy = NULL;
1337 }
1338
1339 mlog_exit(status);
1340 return status;
1341}
1342
1343int ocfs2_complete_truncate_log_recovery(struct ocfs2_super *osb,
1344 struct ocfs2_dinode *tl_copy)
1345{
1346 int status = 0;
1347 int i;
1348 unsigned int clusters, num_recs, start_cluster;
1349 u64 start_blk;
1350 struct ocfs2_journal_handle *handle;
1351 struct inode *tl_inode = osb->osb_tl_inode;
1352 struct ocfs2_truncate_log *tl;
1353
1354 mlog_entry_void();
1355
1356 if (OCFS2_I(tl_inode)->ip_blkno == le64_to_cpu(tl_copy->i_blkno)) {
1357 mlog(ML_ERROR, "Asked to recover my own truncate log!\n");
1358 return -EINVAL;
1359 }
1360
1361 tl = &tl_copy->id2.i_dealloc;
1362 num_recs = le16_to_cpu(tl->tl_used);
1363 mlog(0, "cleanup %u records from %"MLFu64"\n", num_recs,
1364 tl_copy->i_blkno);
1365
1366 down(&tl_inode->i_sem);
1367 for(i = 0; i < num_recs; i++) {
1368 if (ocfs2_truncate_log_needs_flush(osb)) {
1369 status = __ocfs2_flush_truncate_log(osb);
1370 if (status < 0) {
1371 mlog_errno(status);
1372 goto bail_up;
1373 }
1374 }
1375
1376 handle = ocfs2_start_trans(osb, NULL,
1377 OCFS2_TRUNCATE_LOG_UPDATE);
1378 if (IS_ERR(handle)) {
1379 status = PTR_ERR(handle);
1380 mlog_errno(status);
1381 goto bail_up;
1382 }
1383
1384 clusters = le32_to_cpu(tl->tl_recs[i].t_clusters);
1385 start_cluster = le32_to_cpu(tl->tl_recs[i].t_start);
1386 start_blk = ocfs2_clusters_to_blocks(osb->sb, start_cluster);
1387
1388 status = ocfs2_truncate_log_append(osb, handle,
1389 start_blk, clusters);
1390 ocfs2_commit_trans(handle);
1391 if (status < 0) {
1392 mlog_errno(status);
1393 goto bail_up;
1394 }
1395 }
1396
1397bail_up:
1398 up(&tl_inode->i_sem);
1399
1400 mlog_exit(status);
1401 return status;
1402}
1403
1404void ocfs2_truncate_log_shutdown(struct ocfs2_super *osb)
1405{
1406 int status;
1407 struct inode *tl_inode = osb->osb_tl_inode;
1408
1409 mlog_entry_void();
1410
1411 if (tl_inode) {
1412 cancel_delayed_work(&osb->osb_truncate_log_wq);
1413 flush_workqueue(ocfs2_wq);
1414
1415 status = ocfs2_flush_truncate_log(osb);
1416 if (status < 0)
1417 mlog_errno(status);
1418
1419 brelse(osb->osb_tl_bh);
1420 iput(osb->osb_tl_inode);
1421 }
1422
1423 mlog_exit_void();
1424}
1425
1426int ocfs2_truncate_log_init(struct ocfs2_super *osb)
1427{
1428 int status;
1429 struct inode *tl_inode = NULL;
1430 struct buffer_head *tl_bh = NULL;
1431
1432 mlog_entry_void();
1433
1434 status = ocfs2_get_truncate_log_info(osb,
1435 osb->slot_num,
1436 &tl_inode,
1437 &tl_bh);
1438 if (status < 0)
1439 mlog_errno(status);
1440
1441 /* ocfs2_truncate_log_shutdown keys on the existence of
1442 * osb->osb_tl_inode so we don't set any of the osb variables
1443 * until we're sure all is well. */
1444 INIT_WORK(&osb->osb_truncate_log_wq, ocfs2_truncate_log_worker, osb);
1445 osb->osb_tl_bh = tl_bh;
1446 osb->osb_tl_inode = tl_inode;
1447
1448 mlog_exit(status);
1449 return status;
1450}
1451
1452/* This function will figure out whether the currently last extent
1453 * block will be deleted, and if it will, what the new last extent
1454 * block will be so we can update his h_next_leaf_blk field, as well
1455 * as the dinodes i_last_eb_blk */
1456static int ocfs2_find_new_last_ext_blk(struct ocfs2_super *osb,
1457 struct inode *inode,
1458 struct ocfs2_dinode *fe,
1459 u32 new_i_clusters,
1460 struct buffer_head *old_last_eb,
1461 struct buffer_head **new_last_eb)
1462{
1463 int i, status = 0;
1464 u64 block = 0;
1465 struct ocfs2_extent_block *eb;
1466 struct ocfs2_extent_list *el;
1467 struct buffer_head *bh = NULL;
1468
1469 *new_last_eb = NULL;
1470
1471 if (!OCFS2_IS_VALID_DINODE(fe)) {
1472 OCFS2_RO_ON_INVALID_DINODE(inode->i_sb, fe);
1473 status = -EIO;
1474 goto bail;
1475 }
1476
1477 /* we have no tree, so of course, no last_eb. */
1478 if (!fe->id2.i_list.l_tree_depth)
1479 goto bail;
1480
1481 /* trunc to zero special case - this makes tree_depth = 0
1482 * regardless of what it is. */
1483 if (!new_i_clusters)
1484 goto bail;
1485
1486 eb = (struct ocfs2_extent_block *) old_last_eb->b_data;
1487 el = &(eb->h_list);
1488 BUG_ON(!el->l_next_free_rec);
1489
1490 /* Make sure that this guy will actually be empty after we
1491 * clear away the data. */
1492 if (le32_to_cpu(el->l_recs[0].e_cpos) < new_i_clusters)
1493 goto bail;
1494
1495 /* Ok, at this point, we know that last_eb will definitely
1496 * change, so lets traverse the tree and find the second to
1497 * last extent block. */
1498 el = &(fe->id2.i_list);
1499 /* go down the tree, */
1500 do {
1501 for(i = (le16_to_cpu(el->l_next_free_rec) - 1); i >= 0; i--) {
1502 if (le32_to_cpu(el->l_recs[i].e_cpos) <
1503 new_i_clusters) {
1504 block = le64_to_cpu(el->l_recs[i].e_blkno);
1505 break;
1506 }
1507 }
1508 BUG_ON(i < 0);
1509
1510 if (bh) {
1511 brelse(bh);
1512 bh = NULL;
1513 }
1514
1515 status = ocfs2_read_block(osb, block, &bh, OCFS2_BH_CACHED,
1516 inode);
1517 if (status < 0) {
1518 mlog_errno(status);
1519 goto bail;
1520 }
1521 eb = (struct ocfs2_extent_block *) bh->b_data;
1522 el = &eb->h_list;
1523 if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) {
1524 OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb, eb);
1525 status = -EIO;
1526 goto bail;
1527 }
1528 } while (el->l_tree_depth);
1529
1530 *new_last_eb = bh;
1531 get_bh(*new_last_eb);
1532 mlog(0, "returning block %"MLFu64"\n", le64_to_cpu(eb->h_blkno));
1533bail:
1534 if (bh)
1535 brelse(bh);
1536
1537 return status;
1538}
1539
1540static int ocfs2_do_truncate(struct ocfs2_super *osb,
1541 unsigned int clusters_to_del,
1542 struct inode *inode,
1543 struct buffer_head *fe_bh,
1544 struct buffer_head *old_last_eb_bh,
1545 struct ocfs2_journal_handle *handle,
1546 struct ocfs2_truncate_context *tc)
1547{
1548 int status, i, depth;
1549 struct ocfs2_dinode *fe;
1550 struct ocfs2_extent_block *eb;
1551 struct ocfs2_extent_block *last_eb = NULL;
1552 struct ocfs2_extent_list *el;
1553 struct buffer_head *eb_bh = NULL;
1554 struct buffer_head *last_eb_bh = NULL;
1555 u64 next_eb = 0;
1556 u64 delete_blk = 0;
1557
1558 fe = (struct ocfs2_dinode *) fe_bh->b_data;
1559
1560 status = ocfs2_find_new_last_ext_blk(osb,
1561 inode,
1562 fe,
1563 le32_to_cpu(fe->i_clusters) -
1564 clusters_to_del,
1565 old_last_eb_bh,
1566 &last_eb_bh);
1567 if (status < 0) {
1568 mlog_errno(status);
1569 goto bail;
1570 }
1571 if (last_eb_bh)
1572 last_eb = (struct ocfs2_extent_block *) last_eb_bh->b_data;
1573
1574 status = ocfs2_journal_access(handle, inode, fe_bh,
1575 OCFS2_JOURNAL_ACCESS_WRITE);
1576 if (status < 0) {
1577 mlog_errno(status);
1578 goto bail;
1579 }
1580 el = &(fe->id2.i_list);
1581
1582 spin_lock(&OCFS2_I(inode)->ip_lock);
1583 OCFS2_I(inode)->ip_clusters = le32_to_cpu(fe->i_clusters) -
1584 clusters_to_del;
1585 spin_unlock(&OCFS2_I(inode)->ip_lock);
1586 le32_add_cpu(&fe->i_clusters, -clusters_to_del);
1587 fe->i_mtime = cpu_to_le64(CURRENT_TIME.tv_sec);
1588 fe->i_mtime_nsec = cpu_to_le32(CURRENT_TIME.tv_nsec);
1589
1590 i = le16_to_cpu(el->l_next_free_rec) - 1;
1591
1592 BUG_ON(le32_to_cpu(el->l_recs[i].e_clusters) < clusters_to_del);
1593 le32_add_cpu(&el->l_recs[i].e_clusters, -clusters_to_del);
1594 /* tree depth zero, we can just delete the clusters, otherwise
1595 * we need to record the offset of the next level extent block
1596 * as we may overwrite it. */
1597 if (!el->l_tree_depth)
1598 delete_blk = le64_to_cpu(el->l_recs[i].e_blkno)
1599 + ocfs2_clusters_to_blocks(osb->sb,
1600 le32_to_cpu(el->l_recs[i].e_clusters));
1601 else
1602 next_eb = le64_to_cpu(el->l_recs[i].e_blkno);
1603
1604 if (!el->l_recs[i].e_clusters) {
1605 /* if we deleted the whole extent record, then clear
1606 * out the other fields and update the extent
1607 * list. For depth > 0 trees, we've already recorded
1608 * the extent block in 'next_eb' */
1609 el->l_recs[i].e_cpos = 0;
1610 el->l_recs[i].e_blkno = 0;
1611 BUG_ON(!el->l_next_free_rec);
1612 le16_add_cpu(&el->l_next_free_rec, -1);
1613 }
1614
1615 depth = le16_to_cpu(el->l_tree_depth);
1616 if (!fe->i_clusters) {
1617 /* trunc to zero is a special case. */
1618 el->l_tree_depth = 0;
1619 fe->i_last_eb_blk = 0;
1620 } else if (last_eb)
1621 fe->i_last_eb_blk = last_eb->h_blkno;
1622
1623 status = ocfs2_journal_dirty(handle, fe_bh);
1624 if (status < 0) {
1625 mlog_errno(status);
1626 goto bail;
1627 }
1628
1629 if (last_eb) {
1630 /* If there will be a new last extent block, then by
1631 * definition, there cannot be any leaves to the right of
1632 * him. */
1633 status = ocfs2_journal_access(handle, inode, last_eb_bh,
1634 OCFS2_JOURNAL_ACCESS_WRITE);
1635 if (status < 0) {
1636 mlog_errno(status);
1637 goto bail;
1638 }
1639 last_eb->h_next_leaf_blk = 0;
1640 status = ocfs2_journal_dirty(handle, last_eb_bh);
1641 if (status < 0) {
1642 mlog_errno(status);
1643 goto bail;
1644 }
1645 }
1646
1647 /* if our tree depth > 0, update all the tree blocks below us. */
1648 while (depth) {
1649 mlog(0, "traveling tree (depth = %d, next_eb = %"MLFu64")\n",
1650 depth, next_eb);
1651 status = ocfs2_read_block(osb, next_eb, &eb_bh,
1652 OCFS2_BH_CACHED, inode);
1653 if (status < 0) {
1654 mlog_errno(status);
1655 goto bail;
1656 }
1657 eb = (struct ocfs2_extent_block *)eb_bh->b_data;
1658 if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) {
1659 OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb, eb);
1660 status = -EIO;
1661 goto bail;
1662 }
1663 el = &(eb->h_list);
1664
1665 status = ocfs2_journal_access(handle, inode, eb_bh,
1666 OCFS2_JOURNAL_ACCESS_WRITE);
1667 if (status < 0) {
1668 mlog_errno(status);
1669 goto bail;
1670 }
1671
1672 BUG_ON(le16_to_cpu(el->l_next_free_rec) == 0);
1673 BUG_ON(depth != (le16_to_cpu(el->l_tree_depth) + 1));
1674
1675 i = le16_to_cpu(el->l_next_free_rec) - 1;
1676
1677 mlog(0, "extent block %"MLFu64", before: record %d: "
1678 "(%u, %u, %"MLFu64"), next = %u\n",
1679 le64_to_cpu(eb->h_blkno), i,
1680 le32_to_cpu(el->l_recs[i].e_cpos),
1681 le32_to_cpu(el->l_recs[i].e_clusters),
1682 le64_to_cpu(el->l_recs[i].e_blkno),
1683 le16_to_cpu(el->l_next_free_rec));
1684
1685 BUG_ON(le32_to_cpu(el->l_recs[i].e_clusters) < clusters_to_del);
1686 le32_add_cpu(&el->l_recs[i].e_clusters, -clusters_to_del);
1687
1688 next_eb = le64_to_cpu(el->l_recs[i].e_blkno);
1689 /* bottom-most block requires us to delete data.*/
1690 if (!el->l_tree_depth)
1691 delete_blk = le64_to_cpu(el->l_recs[i].e_blkno)
1692 + ocfs2_clusters_to_blocks(osb->sb,
1693 le32_to_cpu(el->l_recs[i].e_clusters));
1694 if (!el->l_recs[i].e_clusters) {
1695 el->l_recs[i].e_cpos = 0;
1696 el->l_recs[i].e_blkno = 0;
1697 BUG_ON(!el->l_next_free_rec);
1698 le16_add_cpu(&el->l_next_free_rec, -1);
1699 }
1700 mlog(0, "extent block %"MLFu64", after: record %d: "
1701 "(%u, %u, %"MLFu64"), next = %u\n",
1702 le64_to_cpu(eb->h_blkno), i,
1703 le32_to_cpu(el->l_recs[i].e_cpos),
1704 le32_to_cpu(el->l_recs[i].e_clusters),
1705 le64_to_cpu(el->l_recs[i].e_blkno),
1706 le16_to_cpu(el->l_next_free_rec));
1707
1708 status = ocfs2_journal_dirty(handle, eb_bh);
1709 if (status < 0) {
1710 mlog_errno(status);
1711 goto bail;
1712 }
1713
1714 if (!el->l_next_free_rec) {
1715 mlog(0, "deleting this extent block.\n");
1716
1717 ocfs2_remove_from_cache(inode, eb_bh);
1718
1719 BUG_ON(eb->h_suballoc_slot);
1720 BUG_ON(el->l_recs[0].e_clusters);
1721 BUG_ON(el->l_recs[0].e_cpos);
1722 BUG_ON(el->l_recs[0].e_blkno);
1723 status = ocfs2_free_extent_block(handle,
1724 tc->tc_ext_alloc_inode,
1725 tc->tc_ext_alloc_bh,
1726 eb);
1727 if (status < 0) {
1728 mlog_errno(status);
1729 goto bail;
1730 }
1731 }
1732 brelse(eb_bh);
1733 eb_bh = NULL;
1734 depth--;
1735 }
1736
1737 BUG_ON(!delete_blk);
1738 status = ocfs2_truncate_log_append(osb, handle, delete_blk,
1739 clusters_to_del);
1740 if (status < 0) {
1741 mlog_errno(status);
1742 goto bail;
1743 }
1744 status = 0;
1745bail:
1746 if (!status)
1747 ocfs2_extent_map_trunc(inode, le32_to_cpu(fe->i_clusters));
1748 else
1749 ocfs2_extent_map_drop(inode, 0);
1750 mlog_exit(status);
1751 return status;
1752}
1753
1754/*
1755 * It is expected, that by the time you call this function,
1756 * inode->i_size and fe->i_size have been adjusted.
1757 *
1758 * WARNING: This will kfree the truncate context
1759 */
1760int ocfs2_commit_truncate(struct ocfs2_super *osb,
1761 struct inode *inode,
1762 struct buffer_head *fe_bh,
1763 struct ocfs2_truncate_context *tc)
1764{
1765 int status, i, credits, tl_sem = 0;
1766 u32 clusters_to_del, target_i_clusters;
1767 u64 last_eb = 0;
1768 struct ocfs2_dinode *fe;
1769 struct ocfs2_extent_block *eb;
1770 struct ocfs2_extent_list *el;
1771 struct buffer_head *last_eb_bh;
1772 struct ocfs2_journal_handle *handle = NULL;
1773 struct inode *tl_inode = osb->osb_tl_inode;
1774
1775 mlog_entry_void();
1776
1777 down_write(&OCFS2_I(inode)->ip_alloc_sem);
1778
1779 target_i_clusters = ocfs2_clusters_for_bytes(osb->sb,
1780 i_size_read(inode));
1781
1782 last_eb_bh = tc->tc_last_eb_bh;
1783 tc->tc_last_eb_bh = NULL;
1784
1785 fe = (struct ocfs2_dinode *) fe_bh->b_data;
1786
1787 if (fe->id2.i_list.l_tree_depth) {
1788 eb = (struct ocfs2_extent_block *) last_eb_bh->b_data;
1789 el = &eb->h_list;
1790 } else
1791 el = &fe->id2.i_list;
1792 last_eb = le64_to_cpu(fe->i_last_eb_blk);
1793start:
1794 mlog(0, "ocfs2_commit_truncate: fe->i_clusters = %u, "
1795 "last_eb = %"MLFu64", fe->i_last_eb_blk = %"MLFu64", "
1796 "fe->id2.i_list.l_tree_depth = %u last_eb_bh = %p\n",
1797 le32_to_cpu(fe->i_clusters), last_eb,
1798 le64_to_cpu(fe->i_last_eb_blk),
1799 le16_to_cpu(fe->id2.i_list.l_tree_depth), last_eb_bh);
1800
1801 if (last_eb != le64_to_cpu(fe->i_last_eb_blk)) {
1802 mlog(0, "last_eb changed!\n");
1803 BUG_ON(!fe->id2.i_list.l_tree_depth);
1804 last_eb = le64_to_cpu(fe->i_last_eb_blk);
1805 /* i_last_eb_blk may have changed, read it if
1806 * necessary. We don't have to worry about the
1807 * truncate to zero case here (where there becomes no
1808 * last_eb) because we never loop back after our work
1809 * is done. */
1810 if (last_eb_bh) {
1811 brelse(last_eb_bh);
1812 last_eb_bh = NULL;
1813 }
1814
1815 status = ocfs2_read_block(osb, last_eb,
1816 &last_eb_bh, OCFS2_BH_CACHED,
1817 inode);
1818 if (status < 0) {
1819 mlog_errno(status);
1820 goto bail;
1821 }
1822 eb = (struct ocfs2_extent_block *) last_eb_bh->b_data;
1823 if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) {
1824 OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb, eb);
1825 status = -EIO;
1826 goto bail;
1827 }
1828 el = &(eb->h_list);
1829 }
1830
1831 /* by now, el will point to the extent list on the bottom most
1832 * portion of this tree. */
1833 i = le16_to_cpu(el->l_next_free_rec) - 1;
1834 if (le32_to_cpu(el->l_recs[i].e_cpos) >= target_i_clusters)
1835 clusters_to_del = le32_to_cpu(el->l_recs[i].e_clusters);
1836 else
1837 clusters_to_del = (le32_to_cpu(el->l_recs[i].e_clusters) +
1838 le32_to_cpu(el->l_recs[i].e_cpos)) -
1839 target_i_clusters;
1840
1841 mlog(0, "clusters_to_del = %u in this pass\n", clusters_to_del);
1842
1843 down(&tl_inode->i_sem);
1844 tl_sem = 1;
1845 /* ocfs2_truncate_log_needs_flush guarantees us at least one
1846 * record is free for use. If there isn't any, we flush to get
1847 * an empty truncate log. */
1848 if (ocfs2_truncate_log_needs_flush(osb)) {
1849 status = __ocfs2_flush_truncate_log(osb);
1850 if (status < 0) {
1851 mlog_errno(status);
1852 goto bail;
1853 }
1854 }
1855
1856 credits = ocfs2_calc_tree_trunc_credits(osb->sb, clusters_to_del,
1857 fe, el);
1858 handle = ocfs2_start_trans(osb, NULL, credits);
1859 if (IS_ERR(handle)) {
1860 status = PTR_ERR(handle);
1861 handle = NULL;
1862 mlog_errno(status);
1863 goto bail;
1864 }
1865
1866 inode->i_ctime = inode->i_mtime = CURRENT_TIME;
1867 status = ocfs2_mark_inode_dirty(handle, inode, fe_bh);
1868 if (status < 0)
1869 mlog_errno(status);
1870
1871 status = ocfs2_do_truncate(osb, clusters_to_del, inode, fe_bh,
1872 last_eb_bh, handle, tc);
1873 if (status < 0) {
1874 mlog_errno(status);
1875 goto bail;
1876 }
1877
1878 up(&tl_inode->i_sem);
1879 tl_sem = 0;
1880
1881 ocfs2_commit_trans(handle);
1882 handle = NULL;
1883
1884 BUG_ON(le32_to_cpu(fe->i_clusters) < target_i_clusters);
1885 if (le32_to_cpu(fe->i_clusters) > target_i_clusters)
1886 goto start;
1887bail:
1888 up_write(&OCFS2_I(inode)->ip_alloc_sem);
1889
1890 ocfs2_schedule_truncate_log_flush(osb, 1);
1891
1892 if (tl_sem)
1893 up(&tl_inode->i_sem);
1894
1895 if (handle)
1896 ocfs2_commit_trans(handle);
1897
1898 if (last_eb_bh)
1899 brelse(last_eb_bh);
1900
1901 /* This will drop the ext_alloc cluster lock for us */
1902 ocfs2_free_truncate_context(tc);
1903
1904 mlog_exit(status);
1905 return status;
1906}
1907
1908
1909/*
1910 * Expects the inode to already be locked. This will figure out which
1911 * inodes need to be locked and will put them on the returned truncate
1912 * context.
1913 */
1914int ocfs2_prepare_truncate(struct ocfs2_super *osb,
1915 struct inode *inode,
1916 struct buffer_head *fe_bh,
1917 struct ocfs2_truncate_context **tc)
1918{
1919 int status, metadata_delete;
1920 unsigned int new_i_clusters;
1921 struct ocfs2_dinode *fe;
1922 struct ocfs2_extent_block *eb;
1923 struct ocfs2_extent_list *el;
1924 struct buffer_head *last_eb_bh = NULL;
1925 struct inode *ext_alloc_inode = NULL;
1926 struct buffer_head *ext_alloc_bh = NULL;
1927
1928 mlog_entry_void();
1929
1930 *tc = NULL;
1931
1932 new_i_clusters = ocfs2_clusters_for_bytes(osb->sb,
1933 i_size_read(inode));
1934 fe = (struct ocfs2_dinode *) fe_bh->b_data;
1935
1936 mlog(0, "fe->i_clusters = %u, new_i_clusters = %u, fe->i_size ="
1937 "%"MLFu64"\n", fe->i_clusters, new_i_clusters, fe->i_size);
1938
1939 if (le32_to_cpu(fe->i_clusters) <= new_i_clusters) {
1940 ocfs2_error(inode->i_sb, "Dinode %"MLFu64" has cluster count "
1941 "%u and size %"MLFu64" whereas struct inode has "
1942 "cluster count %u and size %llu which caused an "
1943 "invalid truncate to %u clusters.",
1944 le64_to_cpu(fe->i_blkno),
1945 le32_to_cpu(fe->i_clusters),
1946 le64_to_cpu(fe->i_size),
1947 OCFS2_I(inode)->ip_clusters, i_size_read(inode),
1948 new_i_clusters);
1949 mlog_meta_lvb(ML_ERROR, &OCFS2_I(inode)->ip_meta_lockres);
1950 status = -EIO;
1951 goto bail;
1952 }
1953
1954 *tc = kcalloc(1, sizeof(struct ocfs2_truncate_context), GFP_KERNEL);
1955 if (!(*tc)) {
1956 status = -ENOMEM;
1957 mlog_errno(status);
1958 goto bail;
1959 }
1960
1961 metadata_delete = 0;
1962 if (fe->id2.i_list.l_tree_depth) {
1963 /* If we have a tree, then the truncate may result in
1964 * metadata deletes. Figure this out from the
1965 * rightmost leaf block.*/
1966 status = ocfs2_read_block(osb, le64_to_cpu(fe->i_last_eb_blk),
1967 &last_eb_bh, OCFS2_BH_CACHED, inode);
1968 if (status < 0) {
1969 mlog_errno(status);
1970 goto bail;
1971 }
1972 eb = (struct ocfs2_extent_block *) last_eb_bh->b_data;
1973 if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) {
1974 OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb, eb);
1975
1976 brelse(last_eb_bh);
1977 status = -EIO;
1978 goto bail;
1979 }
1980 el = &(eb->h_list);
1981 if (le32_to_cpu(el->l_recs[0].e_cpos) >= new_i_clusters)
1982 metadata_delete = 1;
1983 }
1984
1985 (*tc)->tc_last_eb_bh = last_eb_bh;
1986
1987 if (metadata_delete) {
1988 mlog(0, "Will have to delete metadata for this trunc. "
1989 "locking allocator.\n");
1990 ext_alloc_inode = ocfs2_get_system_file_inode(osb, EXTENT_ALLOC_SYSTEM_INODE, 0);
1991 if (!ext_alloc_inode) {
1992 status = -ENOMEM;
1993 mlog_errno(status);
1994 goto bail;
1995 }
1996
1997 down(&ext_alloc_inode->i_sem);
1998 (*tc)->tc_ext_alloc_inode = ext_alloc_inode;
1999
2000 status = ocfs2_meta_lock(ext_alloc_inode,
2001 NULL,
2002 &ext_alloc_bh,
2003 1);
2004 if (status < 0) {
2005 mlog_errno(status);
2006 goto bail;
2007 }
2008 (*tc)->tc_ext_alloc_bh = ext_alloc_bh;
2009 (*tc)->tc_ext_alloc_locked = 1;
2010 }
2011
2012 status = 0;
2013bail:
2014 if (status < 0) {
2015 if (*tc)
2016 ocfs2_free_truncate_context(*tc);
2017 *tc = NULL;
2018 }
2019 mlog_exit_void();
2020 return status;
2021}
2022
2023static void ocfs2_free_truncate_context(struct ocfs2_truncate_context *tc)
2024{
2025 if (tc->tc_ext_alloc_inode) {
2026 if (tc->tc_ext_alloc_locked)
2027 ocfs2_meta_unlock(tc->tc_ext_alloc_inode, 1);
2028
2029 up(&tc->tc_ext_alloc_inode->i_sem);
2030 iput(tc->tc_ext_alloc_inode);
2031 }
2032
2033 if (tc->tc_ext_alloc_bh)
2034 brelse(tc->tc_ext_alloc_bh);
2035
2036 if (tc->tc_last_eb_bh)
2037 brelse(tc->tc_last_eb_bh);
2038
2039 kfree(tc);
2040}
diff --git a/fs/ocfs2/alloc.h b/fs/ocfs2/alloc.h
new file mode 100644
index 000000000000..12ba897743f4
--- /dev/null
+++ b/fs/ocfs2/alloc.h
@@ -0,0 +1,82 @@
1/* -*- mode: c; c-basic-offset: 8; -*-
2 * vim: noexpandtab sw=8 ts=8 sts=0:
3 *
4 * alloc.h
5 *
6 * Function prototypes
7 *
8 * Copyright (C) 2002, 2004 Oracle. All rights reserved.
9 *
10 * This program is free software; you can redistribute it and/or
11 * modify it under the terms of the GNU General Public
12 * License as published by the Free Software Foundation; either
13 * version 2 of the License, or (at your option) any later version.
14 *
15 * This program is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
18 * General Public License for more details.
19 *
20 * You should have received a copy of the GNU General Public
21 * License along with this program; if not, write to the
22 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
23 * Boston, MA 02111-1307, USA.
24 */
25
26#ifndef OCFS2_ALLOC_H
27#define OCFS2_ALLOC_H
28
29struct ocfs2_alloc_context;
30int ocfs2_insert_extent(struct ocfs2_super *osb,
31 struct ocfs2_journal_handle *handle,
32 struct inode *inode,
33 struct buffer_head *fe_bh,
34 u64 blkno,
35 u32 new_clusters,
36 struct ocfs2_alloc_context *meta_ac);
37int ocfs2_num_free_extents(struct ocfs2_super *osb,
38 struct inode *inode,
39 struct ocfs2_dinode *fe);
40/* how many new metadata chunks would an allocation need at maximum? */
41static inline int ocfs2_extend_meta_needed(struct ocfs2_dinode *fe)
42{
43 /*
44 * Rather than do all the work of determining how much we need
45 * (involves a ton of reads and locks), just ask for the
46 * maximal limit. That's a tree depth shift. So, one block for
47 * level of the tree (current l_tree_depth), one block for the
48 * new tree_depth==0 extent_block, and one block at the new
49 * top-of-the tree.
50 */
51 return le16_to_cpu(fe->id2.i_list.l_tree_depth) + 2;
52}
53
54int ocfs2_truncate_log_init(struct ocfs2_super *osb);
55void ocfs2_truncate_log_shutdown(struct ocfs2_super *osb);
56void ocfs2_schedule_truncate_log_flush(struct ocfs2_super *osb,
57 int cancel);
58int ocfs2_flush_truncate_log(struct ocfs2_super *osb);
59int ocfs2_begin_truncate_log_recovery(struct ocfs2_super *osb,
60 int slot_num,
61 struct ocfs2_dinode **tl_copy);
62int ocfs2_complete_truncate_log_recovery(struct ocfs2_super *osb,
63 struct ocfs2_dinode *tl_copy);
64
/*
 * State built by ocfs2_prepare_truncate() and handed to
 * ocfs2_commit_truncate().
 */
struct ocfs2_truncate_context {
	/* extent allocator system inode; set only when metadata gets deleted */
	struct inode *tc_ext_alloc_inode;
	/* buffer head returned by the meta lock on the extent allocator */
	struct buffer_head *tc_ext_alloc_bh;
	int tc_ext_alloc_locked; /* is it cluster locked? */
	/* these get destroyed once it's passed to ocfs2_commit_truncate. */
	struct buffer_head *tc_last_eb_bh;
};
72
73int ocfs2_prepare_truncate(struct ocfs2_super *osb,
74 struct inode *inode,
75 struct buffer_head *fe_bh,
76 struct ocfs2_truncate_context **tc);
77int ocfs2_commit_truncate(struct ocfs2_super *osb,
78 struct inode *inode,
79 struct buffer_head *fe_bh,
80 struct ocfs2_truncate_context *tc);
81
82#endif /* OCFS2_ALLOC_H */
diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c
new file mode 100644
index 000000000000..8f4467a930a5
--- /dev/null
+++ b/fs/ocfs2/aops.c
@@ -0,0 +1,643 @@
1/* -*- mode: c; c-basic-offset: 8; -*-
2 * vim: noexpandtab sw=8 ts=8 sts=0:
3 *
4 * Copyright (C) 2002, 2004 Oracle. All rights reserved.
5 *
6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public
8 * License as published by the Free Software Foundation; either
9 * version 2 of the License, or (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 * General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public
17 * License along with this program; if not, write to the
18 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
19 * Boston, MA 02111-1307, USA.
20 */
21
22#include <linux/fs.h>
23#include <linux/slab.h>
24#include <linux/highmem.h>
25#include <linux/pagemap.h>
26#include <asm/byteorder.h>
27
28#define MLOG_MASK_PREFIX ML_FILE_IO
29#include <cluster/masklog.h>
30
31#include "ocfs2.h"
32
33#include "alloc.h"
34#include "aops.h"
35#include "dlmglue.h"
36#include "extent_map.h"
37#include "file.h"
38#include "inode.h"
39#include "journal.h"
40#include "super.h"
41#include "symlink.h"
42
43#include "buffer_head_io.h"
44
45static int ocfs2_symlink_get_block(struct inode *inode, sector_t iblock,
46 struct buffer_head *bh_result, int create)
47{
48 int err = -EIO;
49 int status;
50 struct ocfs2_dinode *fe = NULL;
51 struct buffer_head *bh = NULL;
52 struct buffer_head *buffer_cache_bh = NULL;
53 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
54 void *kaddr;
55
56 mlog_entry("(0x%p, %llu, 0x%p, %d)\n", inode,
57 (unsigned long long)iblock, bh_result, create);
58
59 BUG_ON(ocfs2_inode_is_fast_symlink(inode));
60
61 if ((iblock << inode->i_sb->s_blocksize_bits) > PATH_MAX + 1) {
62 mlog(ML_ERROR, "block offset > PATH_MAX: %llu",
63 (unsigned long long)iblock);
64 goto bail;
65 }
66
67 status = ocfs2_read_block(OCFS2_SB(inode->i_sb),
68 OCFS2_I(inode)->ip_blkno,
69 &bh, OCFS2_BH_CACHED, inode);
70 if (status < 0) {
71 mlog_errno(status);
72 goto bail;
73 }
74 fe = (struct ocfs2_dinode *) bh->b_data;
75
76 if (!OCFS2_IS_VALID_DINODE(fe)) {
77 mlog(ML_ERROR, "Invalid dinode #%"MLFu64": signature = %.*s\n",
78 fe->i_blkno, 7, fe->i_signature);
79 goto bail;
80 }
81
82 if ((u64)iblock >= ocfs2_clusters_to_blocks(inode->i_sb,
83 le32_to_cpu(fe->i_clusters))) {
84 mlog(ML_ERROR, "block offset is outside the allocated size: "
85 "%llu\n", (unsigned long long)iblock);
86 goto bail;
87 }
88
89 /* We don't use the page cache to create symlink data, so if
90 * need be, copy it over from the buffer cache. */
91 if (!buffer_uptodate(bh_result) && ocfs2_inode_is_new(inode)) {
92 u64 blkno = le64_to_cpu(fe->id2.i_list.l_recs[0].e_blkno) +
93 iblock;
94 buffer_cache_bh = sb_getblk(osb->sb, blkno);
95 if (!buffer_cache_bh) {
96 mlog(ML_ERROR, "couldn't getblock for symlink!\n");
97 goto bail;
98 }
99
100 /* we haven't locked out transactions, so a commit
101 * could've happened. Since we've got a reference on
102 * the bh, even if it commits while we're doing the
103 * copy, the data is still good. */
104 if (buffer_jbd(buffer_cache_bh)
105 && ocfs2_inode_is_new(inode)) {
106 kaddr = kmap_atomic(bh_result->b_page, KM_USER0);
107 if (!kaddr) {
108 mlog(ML_ERROR, "couldn't kmap!\n");
109 goto bail;
110 }
111 memcpy(kaddr + (bh_result->b_size * iblock),
112 buffer_cache_bh->b_data,
113 bh_result->b_size);
114 kunmap_atomic(kaddr, KM_USER0);
115 set_buffer_uptodate(bh_result);
116 }
117 brelse(buffer_cache_bh);
118 }
119
120 map_bh(bh_result, inode->i_sb,
121 le64_to_cpu(fe->id2.i_list.l_recs[0].e_blkno) + iblock);
122
123 err = 0;
124
125bail:
126 if (bh)
127 brelse(bh);
128
129 mlog_exit(err);
130 return err;
131}
132
133static int ocfs2_get_block(struct inode *inode, sector_t iblock,
134 struct buffer_head *bh_result, int create)
135{
136 int err = 0;
137 u64 p_blkno, past_eof;
138
139 mlog_entry("(0x%p, %llu, 0x%p, %d)\n", inode,
140 (unsigned long long)iblock, bh_result, create);
141
142 if (OCFS2_I(inode)->ip_flags & OCFS2_INODE_SYSTEM_FILE)
143 mlog(ML_NOTICE, "get_block on system inode 0x%p (%lu)\n",
144 inode, inode->i_ino);
145
146 if (S_ISLNK(inode->i_mode)) {
147 /* this always does I/O for some reason. */
148 err = ocfs2_symlink_get_block(inode, iblock, bh_result, create);
149 goto bail;
150 }
151
152 /* this can happen if another node truncs after our extend! */
153 spin_lock(&OCFS2_I(inode)->ip_lock);
154 if (iblock >= ocfs2_clusters_to_blocks(inode->i_sb,
155 OCFS2_I(inode)->ip_clusters))
156 err = -EIO;
157 spin_unlock(&OCFS2_I(inode)->ip_lock);
158 if (err)
159 goto bail;
160
161 err = ocfs2_extent_map_get_blocks(inode, iblock, 1, &p_blkno,
162 NULL);
163 if (err) {
164 mlog(ML_ERROR, "Error %d from get_blocks(0x%p, %llu, 1, "
165 "%"MLFu64", NULL)\n", err, inode,
166 (unsigned long long)iblock, p_blkno);
167 goto bail;
168 }
169
170 map_bh(bh_result, inode->i_sb, p_blkno);
171
172 if (bh_result->b_blocknr == 0) {
173 err = -EIO;
174 mlog(ML_ERROR, "iblock = %llu p_blkno = %"MLFu64" "
175 "blkno=(%"MLFu64")\n", (unsigned long long)iblock,
176 p_blkno, OCFS2_I(inode)->ip_blkno);
177 }
178
179 past_eof = ocfs2_blocks_for_bytes(inode->i_sb, i_size_read(inode));
180 mlog(0, "Inode %lu, past_eof = %"MLFu64"\n", inode->i_ino, past_eof);
181
182 if (create && (iblock >= past_eof))
183 set_buffer_new(bh_result);
184
185bail:
186 if (err < 0)
187 err = -EIO;
188
189 mlog_exit(err);
190 return err;
191}
192
193static int ocfs2_readpage(struct file *file, struct page *page)
194{
195 struct inode *inode = page->mapping->host;
196 loff_t start = (loff_t)page->index << PAGE_CACHE_SHIFT;
197 int ret, unlock = 1;
198
199 mlog_entry("(0x%p, %lu)\n", file, (page ? page->index : 0));
200
201 ret = ocfs2_meta_lock_with_page(inode, NULL, NULL, 0, page);
202 if (ret != 0) {
203 if (ret == AOP_TRUNCATED_PAGE)
204 unlock = 0;
205 mlog_errno(ret);
206 goto out;
207 }
208
209 down_read(&OCFS2_I(inode)->ip_alloc_sem);
210
211 /*
212 * i_size might have just been updated as we grabed the meta lock. We
213 * might now be discovering a truncate that hit on another node.
214 * block_read_full_page->get_block freaks out if it is asked to read
215 * beyond the end of a file, so we check here. Callers
216 * (generic_file_read, fault->nopage) are clever enough to check i_size
217 * and notice that the page they just read isn't needed.
218 *
219 * XXX sys_readahead() seems to get that wrong?
220 */
221 if (start >= i_size_read(inode)) {
222 char *addr = kmap(page);
223 memset(addr, 0, PAGE_SIZE);
224 flush_dcache_page(page);
225 kunmap(page);
226 SetPageUptodate(page);
227 ret = 0;
228 goto out_alloc;
229 }
230
231 ret = ocfs2_data_lock_with_page(inode, 0, page);
232 if (ret != 0) {
233 if (ret == AOP_TRUNCATED_PAGE)
234 unlock = 0;
235 mlog_errno(ret);
236 goto out_alloc;
237 }
238
239 ret = block_read_full_page(page, ocfs2_get_block);
240 unlock = 0;
241
242 ocfs2_data_unlock(inode, 0);
243out_alloc:
244 up_read(&OCFS2_I(inode)->ip_alloc_sem);
245 ocfs2_meta_unlock(inode, 0);
246out:
247 if (unlock)
248 unlock_page(page);
249 mlog_exit(ret);
250 return ret;
251}
252
253/* Note: Because we don't support holes, our allocation has
254 * already happened (allocation writes zeros to the file data)
255 * so we don't have to worry about ordered writes in
256 * ocfs2_writepage.
257 *
258 * ->writepage is called during the process of invalidating the page cache
259 * during blocked lock processing. It can't block on any cluster locks
260 * to during block mapping. It's relying on the fact that the block
261 * mapping can't have disappeared under the dirty pages that it is
262 * being asked to write back.
263 */
264static int ocfs2_writepage(struct page *page, struct writeback_control *wbc)
265{
266 int ret;
267
268 mlog_entry("(0x%p)\n", page);
269
270 ret = block_write_full_page(page, ocfs2_get_block, wbc);
271
272 mlog_exit(ret);
273
274 return ret;
275}
276
277/*
278 * ocfs2_prepare_write() can be an outer-most ocfs2 call when it is called
279 * from loopback. It must be able to perform its own locking around
280 * ocfs2_get_block().
281 */
282int ocfs2_prepare_write(struct file *file, struct page *page,
283 unsigned from, unsigned to)
284{
285 struct inode *inode = page->mapping->host;
286 int ret;
287
288 mlog_entry("(0x%p, 0x%p, %u, %u)\n", file, page, from, to);
289
290 ret = ocfs2_meta_lock_with_page(inode, NULL, NULL, 0, page);
291 if (ret != 0) {
292 mlog_errno(ret);
293 goto out;
294 }
295
296 down_read(&OCFS2_I(inode)->ip_alloc_sem);
297
298 ret = block_prepare_write(page, from, to, ocfs2_get_block);
299
300 up_read(&OCFS2_I(inode)->ip_alloc_sem);
301
302 ocfs2_meta_unlock(inode, 0);
303out:
304 mlog_exit(ret);
305 return ret;
306}
307
308/* Taken from ext3. We don't necessarily need the full blown
309 * functionality yet, but IMHO it's better to cut and paste the whole
310 * thing so we can avoid introducing our own bugs (and easily pick up
311 * their fixes when they happen) --Mark */
312static int walk_page_buffers( handle_t *handle,
313 struct buffer_head *head,
314 unsigned from,
315 unsigned to,
316 int *partial,
317 int (*fn)( handle_t *handle,
318 struct buffer_head *bh))
319{
320 struct buffer_head *bh;
321 unsigned block_start, block_end;
322 unsigned blocksize = head->b_size;
323 int err, ret = 0;
324 struct buffer_head *next;
325
326 for ( bh = head, block_start = 0;
327 ret == 0 && (bh != head || !block_start);
328 block_start = block_end, bh = next)
329 {
330 next = bh->b_this_page;
331 block_end = block_start + blocksize;
332 if (block_end <= from || block_start >= to) {
333 if (partial && !buffer_uptodate(bh))
334 *partial = 1;
335 continue;
336 }
337 err = (*fn)(handle, bh);
338 if (!ret)
339 ret = err;
340 }
341 return ret;
342}
343
344struct ocfs2_journal_handle *ocfs2_start_walk_page_trans(struct inode *inode,
345 struct page *page,
346 unsigned from,
347 unsigned to)
348{
349 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
350 struct ocfs2_journal_handle *handle = NULL;
351 int ret = 0;
352
353 handle = ocfs2_start_trans(osb, NULL, OCFS2_INODE_UPDATE_CREDITS);
354 if (!handle) {
355 ret = -ENOMEM;
356 mlog_errno(ret);
357 goto out;
358 }
359
360 if (ocfs2_should_order_data(inode)) {
361 ret = walk_page_buffers(handle->k_handle,
362 page_buffers(page),
363 from, to, NULL,
364 ocfs2_journal_dirty_data);
365 if (ret < 0)
366 mlog_errno(ret);
367 }
368out:
369 if (ret) {
370 if (handle)
371 ocfs2_commit_trans(handle);
372 handle = ERR_PTR(ret);
373 }
374 return handle;
375}
376
377static int ocfs2_commit_write(struct file *file, struct page *page,
378 unsigned from, unsigned to)
379{
380 int ret, extending = 0, locklevel = 0;
381 loff_t new_i_size;
382 struct buffer_head *di_bh = NULL;
383 struct inode *inode = page->mapping->host;
384 struct ocfs2_journal_handle *handle = NULL;
385
386 mlog_entry("(0x%p, 0x%p, %u, %u)\n", file, page, from, to);
387
388 /* NOTE: ocfs2_file_aio_write has ensured that it's safe for
389 * us to sample inode->i_size here without the metadata lock:
390 *
391 * 1) We're currently holding the inode alloc lock, so no
392 * nodes can change it underneath us.
393 *
394 * 2) We've had to take the metadata lock at least once
395 * already to check for extending writes, hence insuring
396 * that our current copy is also up to date.
397 */
398 new_i_size = ((loff_t)page->index << PAGE_CACHE_SHIFT) + to;
399 if (new_i_size > i_size_read(inode)) {
400 extending = 1;
401 locklevel = 1;
402 }
403
404 ret = ocfs2_meta_lock_with_page(inode, NULL, &di_bh, locklevel, page);
405 if (ret != 0) {
406 mlog_errno(ret);
407 goto out;
408 }
409
410 ret = ocfs2_data_lock_with_page(inode, 1, page);
411 if (ret != 0) {
412 mlog_errno(ret);
413 goto out_unlock_meta;
414 }
415
416 if (extending) {
417 handle = ocfs2_start_walk_page_trans(inode, page, from, to);
418 if (IS_ERR(handle)) {
419 ret = PTR_ERR(handle);
420 handle = NULL;
421 goto out_unlock_data;
422 }
423
424 /* Mark our buffer early. We'd rather catch this error up here
425 * as opposed to after a successful commit_write which would
426 * require us to set back inode->i_size. */
427 ret = ocfs2_journal_access(handle, inode, di_bh,
428 OCFS2_JOURNAL_ACCESS_WRITE);
429 if (ret < 0) {
430 mlog_errno(ret);
431 goto out_commit;
432 }
433 }
434
435 /* might update i_size */
436 ret = generic_commit_write(file, page, from, to);
437 if (ret < 0) {
438 mlog_errno(ret);
439 goto out_commit;
440 }
441
442 if (extending) {
443 loff_t size = (u64) i_size_read(inode);
444 struct ocfs2_dinode *di =
445 (struct ocfs2_dinode *)di_bh->b_data;
446
447 /* ocfs2_mark_inode_dirty is too heavy to use here. */
448 inode->i_blocks = ocfs2_align_bytes_to_sectors(size);
449 inode->i_ctime = inode->i_mtime = CURRENT_TIME;
450
451 di->i_size = cpu_to_le64(size);
452 di->i_ctime = di->i_mtime =
453 cpu_to_le64(inode->i_mtime.tv_sec);
454 di->i_ctime_nsec = di->i_mtime_nsec =
455 cpu_to_le32(inode->i_mtime.tv_nsec);
456
457 ret = ocfs2_journal_dirty(handle, di_bh);
458 if (ret < 0) {
459 mlog_errno(ret);
460 goto out_commit;
461 }
462 }
463
464 BUG_ON(extending && (i_size_read(inode) != new_i_size));
465
466out_commit:
467 if (handle)
468 ocfs2_commit_trans(handle);
469out_unlock_data:
470 ocfs2_data_unlock(inode, 1);
471out_unlock_meta:
472 ocfs2_meta_unlock(inode, locklevel);
473out:
474 if (di_bh)
475 brelse(di_bh);
476
477 mlog_exit(ret);
478 return ret;
479}
480
481static sector_t ocfs2_bmap(struct address_space *mapping, sector_t block)
482{
483 sector_t status;
484 u64 p_blkno = 0;
485 int err = 0;
486 struct inode *inode = mapping->host;
487
488 mlog_entry("(block = %llu)\n", (unsigned long long)block);
489
490 /* We don't need to lock journal system files, since they aren't
491 * accessed concurrently from multiple nodes.
492 */
493 if (!INODE_JOURNAL(inode)) {
494 err = ocfs2_meta_lock(inode, NULL, NULL, 0);
495 if (err) {
496 if (err != -ENOENT)
497 mlog_errno(err);
498 goto bail;
499 }
500 down_read(&OCFS2_I(inode)->ip_alloc_sem);
501 }
502
503 err = ocfs2_extent_map_get_blocks(inode, block, 1, &p_blkno,
504 NULL);
505
506 if (!INODE_JOURNAL(inode)) {
507 up_read(&OCFS2_I(inode)->ip_alloc_sem);
508 ocfs2_meta_unlock(inode, 0);
509 }
510
511 if (err) {
512 mlog(ML_ERROR, "get_blocks() failed, block = %llu\n",
513 (unsigned long long)block);
514 mlog_errno(err);
515 goto bail;
516 }
517
518
519bail:
520 status = err ? 0 : p_blkno;
521
522 mlog_exit((int)status);
523
524 return status;
525}
526
527/*
528 * TODO: Make this into a generic get_blocks function.
529 *
530 * From do_direct_io in direct-io.c:
531 * "So what we do is to permit the ->get_blocks function to populate
532 * bh.b_size with the size of IO which is permitted at this offset and
533 * this i_blkbits."
534 *
535 * This function is called directly from get_more_blocks in direct-io.c.
536 *
537 * called like this: dio->get_blocks(dio->inode, fs_startblk,
538 * fs_count, map_bh, dio->rw == WRITE);
539 */
540static int ocfs2_direct_IO_get_blocks(struct inode *inode, sector_t iblock,
541 unsigned long max_blocks,
542 struct buffer_head *bh_result, int create)
543{
544 int ret;
545 u64 vbo_max; /* file offset, max_blocks from iblock */
546 u64 p_blkno;
547 int contig_blocks;
548 unsigned char blocksize_bits;
549
550 if (!inode || !bh_result) {
551 mlog(ML_ERROR, "inode or bh_result is null\n");
552 return -EIO;
553 }
554
555 blocksize_bits = inode->i_sb->s_blocksize_bits;
556
557 /* This function won't even be called if the request isn't all
558 * nicely aligned and of the right size, so there's no need
559 * for us to check any of that. */
560
561 vbo_max = ((u64)iblock + max_blocks) << blocksize_bits;
562
563 spin_lock(&OCFS2_I(inode)->ip_lock);
564 if ((iblock + max_blocks) >
565 ocfs2_clusters_to_blocks(inode->i_sb,
566 OCFS2_I(inode)->ip_clusters)) {
567 spin_unlock(&OCFS2_I(inode)->ip_lock);
568 ret = -EIO;
569 goto bail;
570 }
571 spin_unlock(&OCFS2_I(inode)->ip_lock);
572
573 /* This figures out the size of the next contiguous block, and
574 * our logical offset */
575 ret = ocfs2_extent_map_get_blocks(inode, iblock, 1, &p_blkno,
576 &contig_blocks);
577 if (ret) {
578 mlog(ML_ERROR, "get_blocks() failed iblock=%llu\n",
579 (unsigned long long)iblock);
580 ret = -EIO;
581 goto bail;
582 }
583
584 map_bh(bh_result, inode->i_sb, p_blkno);
585
586 /* make sure we don't map more than max_blocks blocks here as
587 that's all the kernel will handle at this point. */
588 if (max_blocks < contig_blocks)
589 contig_blocks = max_blocks;
590 bh_result->b_size = contig_blocks << blocksize_bits;
591bail:
592 return ret;
593}
594
595/*
596 * ocfs2_dio_end_io is called by the dio core when a dio is finished. We're
597 * particularly interested in the aio/dio case. Like the core uses
598 * i_alloc_sem, we use the rw_lock DLM lock to protect io on one node from
599 * truncation on another.
600 */
601static void ocfs2_dio_end_io(struct kiocb *iocb,
602 loff_t offset,
603 ssize_t bytes,
604 void *private)
605{
606 struct inode *inode = iocb->ki_filp->f_dentry->d_inode;
607
608 /* this io's submitter should not have unlocked this before we could */
609 BUG_ON(!ocfs2_iocb_is_rw_locked(iocb));
610 ocfs2_iocb_clear_rw_locked(iocb);
611 up_read(&inode->i_alloc_sem);
612 ocfs2_rw_unlock(inode, 0);
613}
614
615static ssize_t ocfs2_direct_IO(int rw,
616 struct kiocb *iocb,
617 const struct iovec *iov,
618 loff_t offset,
619 unsigned long nr_segs)
620{
621 struct file *file = iocb->ki_filp;
622 struct inode *inode = file->f_dentry->d_inode->i_mapping->host;
623 int ret;
624
625 mlog_entry_void();
626 ret = blockdev_direct_IO_no_locking(rw, iocb, inode,
627 inode->i_sb->s_bdev, iov, offset,
628 nr_segs,
629 ocfs2_direct_IO_get_blocks,
630 ocfs2_dio_end_io);
631 mlog_exit(ret);
632 return ret;
633}
634
635struct address_space_operations ocfs2_aops = {
636 .readpage = ocfs2_readpage,
637 .writepage = ocfs2_writepage,
638 .prepare_write = ocfs2_prepare_write,
639 .commit_write = ocfs2_commit_write,
640 .bmap = ocfs2_bmap,
641 .sync_page = block_sync_page,
642 .direct_IO = ocfs2_direct_IO
643};
diff --git a/fs/ocfs2/aops.h b/fs/ocfs2/aops.h
new file mode 100644
index 000000000000..d40456d509a0
--- /dev/null
+++ b/fs/ocfs2/aops.h
@@ -0,0 +1,41 @@
1/* -*- mode: c; c-basic-offset: 8; -*-
2 * vim: noexpandtab sw=8 ts=8 sts=0:
3 *
4 * Copyright (C) 2002, 2004, 2005 Oracle. All rights reserved.
5 *
6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public
8 * License as published by the Free Software Foundation; either
9 * version 2 of the License, or (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 * General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public
17 * License along with this program; if not, write to the
18 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
19 * Boston, MA 02111-1307, USA.
20 */
21
22#ifndef OCFS2_AOPS_H
23#define OCFS2_AOPS_H
24
25int ocfs2_prepare_write(struct file *file, struct page *page,
26 unsigned from, unsigned to);
27
28struct ocfs2_journal_handle *ocfs2_start_walk_page_trans(struct inode *inode,
29 struct page *page,
30 unsigned from,
31 unsigned to);
32
/*
 * all ocfs2_dio_end_io()'s fault
 *
 * Bit 0 of the iocb's ->private word records whether the rw lock is held
 * for this iocb.  The argument is parenthesised so that any pointer
 * expression (not just a plain identifier) expands correctly.
 */
#define ocfs2_iocb_is_rw_locked(iocb) \
	test_bit(0, (unsigned long *)&(iocb)->private)
#define ocfs2_iocb_set_rw_locked(iocb) \
	set_bit(0, (unsigned long *)&(iocb)->private)
#define ocfs2_iocb_clear_rw_locked(iocb) \
	clear_bit(0, (unsigned long *)&(iocb)->private)
41#endif /* OCFS2_AOPS_H */
diff --git a/fs/ocfs2/buffer_head_io.c b/fs/ocfs2/buffer_head_io.c
new file mode 100644
index 000000000000..d424041b38e9
--- /dev/null
+++ b/fs/ocfs2/buffer_head_io.c
@@ -0,0 +1,232 @@
1/* -*- mode: c; c-basic-offset: 8; -*-
2 * vim: noexpandtab sw=8 ts=8 sts=0:
3 *
4 * buffer_head_io.c
5 *
6 * Buffer cache handling
7 *
8 * Copyright (C) 2002, 2004 Oracle. All rights reserved.
9 *
10 * This program is free software; you can redistribute it and/or
11 * modify it under the terms of the GNU General Public
12 * License as published by the Free Software Foundation; either
13 * version 2 of the License, or (at your option) any later version.
14 *
15 * This program is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
18 * General Public License for more details.
19 *
20 * You should have received a copy of the GNU General Public
21 * License along with this program; if not, write to the
22 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
23 * Boston, MA 02111-1307, USA.
24 */
25
26#include <linux/fs.h>
27#include <linux/types.h>
28#include <linux/slab.h>
29#include <linux/highmem.h>
30
31#include <cluster/masklog.h>
32
33#include "ocfs2.h"
34
35#include "alloc.h"
36#include "inode.h"
37#include "journal.h"
38#include "uptodate.h"
39
40#include "buffer_head_io.h"
41
42int ocfs2_write_block(struct ocfs2_super *osb, struct buffer_head *bh,
43 struct inode *inode)
44{
45 int ret = 0;
46
47 mlog_entry("(bh->b_blocknr = %llu, inode=%p)\n",
48 (unsigned long long)bh->b_blocknr, inode);
49
50 BUG_ON(bh->b_blocknr < OCFS2_SUPER_BLOCK_BLKNO);
51 BUG_ON(buffer_jbd(bh));
52
53 /* No need to check for a soft readonly file system here. non
54 * journalled writes are only ever done on system files which
55 * can get modified during recovery even if read-only. */
56 if (ocfs2_is_hard_readonly(osb)) {
57 ret = -EROFS;
58 goto out;
59 }
60
61 down(&OCFS2_I(inode)->ip_io_sem);
62
63 lock_buffer(bh);
64 set_buffer_uptodate(bh);
65
66 /* remove from dirty list before I/O. */
67 clear_buffer_dirty(bh);
68
69 get_bh(bh); /* for end_buffer_write_sync() */
70 bh->b_end_io = end_buffer_write_sync;
71 submit_bh(WRITE, bh);
72
73 wait_on_buffer(bh);
74
75 if (buffer_uptodate(bh)) {
76 ocfs2_set_buffer_uptodate(inode, bh);
77 } else {
78 /* We don't need to remove the clustered uptodate
79 * information for this bh as it's not marked locally
80 * uptodate. */
81 ret = -EIO;
82 brelse(bh);
83 }
84
85 up(&OCFS2_I(inode)->ip_io_sem);
86out:
87 mlog_exit(ret);
88 return ret;
89}
90
91int ocfs2_read_blocks(struct ocfs2_super *osb, u64 block, int nr,
92 struct buffer_head *bhs[], int flags,
93 struct inode *inode)
94{
95 int status = 0;
96 struct super_block *sb;
97 int i, ignore_cache = 0;
98 struct buffer_head *bh;
99
100 mlog_entry("(block=(%"MLFu64"), nr=(%d), flags=%d, inode=%p)\n",
101 block, nr, flags, inode);
102
103 if (osb == NULL || osb->sb == NULL || bhs == NULL) {
104 status = -EINVAL;
105 mlog_errno(status);
106 goto bail;
107 }
108
109 if (nr < 0) {
110 mlog(ML_ERROR, "asked to read %d blocks!\n", nr);
111 status = -EINVAL;
112 mlog_errno(status);
113 goto bail;
114 }
115
116 if (nr == 0) {
117 mlog(ML_BH_IO, "No buffers will be read!\n");
118 status = 0;
119 goto bail;
120 }
121
122 sb = osb->sb;
123
124 if (flags & OCFS2_BH_CACHED && !inode)
125 flags &= ~OCFS2_BH_CACHED;
126
127 if (inode)
128 down(&OCFS2_I(inode)->ip_io_sem);
129 for (i = 0 ; i < nr ; i++) {
130 if (bhs[i] == NULL) {
131 bhs[i] = sb_getblk(sb, block++);
132 if (bhs[i] == NULL) {
133 if (inode)
134 up(&OCFS2_I(inode)->ip_io_sem);
135 status = -EIO;
136 mlog_errno(status);
137 goto bail;
138 }
139 }
140 bh = bhs[i];
141 ignore_cache = 0;
142
143 if (flags & OCFS2_BH_CACHED &&
144 !ocfs2_buffer_uptodate(inode, bh)) {
145 mlog(ML_UPTODATE,
146 "bh (%llu), inode %"MLFu64" not uptodate\n",
147 (unsigned long long)bh->b_blocknr,
148 OCFS2_I(inode)->ip_blkno);
149 ignore_cache = 1;
150 }
151
152 /* XXX: Can we ever get this and *not* have the cached
153 * flag set? */
154 if (buffer_jbd(bh)) {
155 if (!(flags & OCFS2_BH_CACHED) || ignore_cache)
156 mlog(ML_BH_IO, "trying to sync read a jbd "
157 "managed bh (blocknr = %llu)\n",
158 (unsigned long long)bh->b_blocknr);
159 continue;
160 }
161
162 if (!(flags & OCFS2_BH_CACHED) || ignore_cache) {
163 if (buffer_dirty(bh)) {
164 /* This should probably be a BUG, or
165 * at least return an error. */
166 mlog(ML_BH_IO, "asking me to sync read a dirty "
167 "buffer! (blocknr = %llu)\n",
168 (unsigned long long)bh->b_blocknr);
169 continue;
170 }
171
172 lock_buffer(bh);
173 if (buffer_jbd(bh)) {
174#ifdef CATCH_BH_JBD_RACES
175 mlog(ML_ERROR, "block %llu had the JBD bit set "
176 "while I was in lock_buffer!",
177 (unsigned long long)bh->b_blocknr);
178 BUG();
179#else
180 unlock_buffer(bh);
181 continue;
182#endif
183 }
184 clear_buffer_uptodate(bh);
185 get_bh(bh); /* for end_buffer_read_sync() */
186 bh->b_end_io = end_buffer_read_sync;
187 if (flags & OCFS2_BH_READAHEAD)
188 submit_bh(READA, bh);
189 else
190 submit_bh(READ, bh);
191 continue;
192 }
193 }
194
195 status = 0;
196
197 for (i = (nr - 1); i >= 0; i--) {
198 bh = bhs[i];
199
200 /* We know this can't have changed as we hold the
201 * inode sem. Avoid doing any work on the bh if the
202 * journal has it. */
203 if (!buffer_jbd(bh))
204 wait_on_buffer(bh);
205
206 if (!buffer_uptodate(bh)) {
207 /* Status won't be cleared from here on out,
208 * so we can safely record this and loop back
209 * to cleanup the other buffers. Don't need to
210 * remove the clustered uptodate information
211 * for this bh as it's not marked locally
212 * uptodate. */
213 status = -EIO;
214 brelse(bh);
215 bhs[i] = NULL;
216 continue;
217 }
218
219 if (inode)
220 ocfs2_set_buffer_uptodate(inode, bh);
221 }
222 if (inode)
223 up(&OCFS2_I(inode)->ip_io_sem);
224
225 mlog(ML_BH_IO, "block=(%"MLFu64"), nr=(%d), cached=%s\n", block, nr,
226 (!(flags & OCFS2_BH_CACHED) || ignore_cache) ? "no" : "yes");
227
228bail:
229
230 mlog_exit(status);
231 return status;
232}
diff --git a/fs/ocfs2/buffer_head_io.h b/fs/ocfs2/buffer_head_io.h
new file mode 100644
index 000000000000..6ecb90937b68
--- /dev/null
+++ b/fs/ocfs2/buffer_head_io.h
@@ -0,0 +1,73 @@
1/* -*- mode: c; c-basic-offset: 8; -*-
2 * vim: noexpandtab sw=8 ts=8 sts=0:
3 *
4 * buffer_head_io.h
5 *
6 * Buffer cache handling functions defined
7 *
8 * Copyright (C) 2002, 2004 Oracle. All rights reserved.
9 *
10 * This program is free software; you can redistribute it and/or
11 * modify it under the terms of the GNU General Public
12 * License as published by the Free Software Foundation; either
13 * version 2 of the License, or (at your option) any later version.
14 *
15 * This program is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
18 * General Public License for more details.
19 *
20 * You should have received a copy of the GNU General Public
21 * License along with this program; if not, write to the
22 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
23 * Boston, MA 02111-1307, USA.
24 */
25
26#ifndef OCFS2_BUFFER_HEAD_IO_H
27#define OCFS2_BUFFER_HEAD_IO_H
28
29#include <linux/buffer_head.h>
30
31void ocfs2_end_buffer_io_sync(struct buffer_head *bh,
32 int uptodate);
33
34static inline int ocfs2_read_block(struct ocfs2_super *osb,
35 u64 off,
36 struct buffer_head **bh,
37 int flags,
38 struct inode *inode);
39
40int ocfs2_write_block(struct ocfs2_super *osb,
41 struct buffer_head *bh,
42 struct inode *inode);
43int ocfs2_read_blocks(struct ocfs2_super *osb,
44 u64 block,
45 int nr,
46 struct buffer_head *bhs[],
47 int flags,
48 struct inode *inode);
49
50
51#define OCFS2_BH_CACHED 1
52#define OCFS2_BH_READAHEAD 8 /* use this to pass READA down to submit_bh */
53
54static inline int ocfs2_read_block(struct ocfs2_super * osb, u64 off,
55 struct buffer_head **bh, int flags,
56 struct inode *inode)
57{
58 int status = 0;
59
60 if (bh == NULL) {
61 printk("ocfs2: bh == NULL\n");
62 status = -EINVAL;
63 goto bail;
64 }
65
66 status = ocfs2_read_blocks(osb, off, 1, bh,
67 flags, inode);
68
69bail:
70 return status;
71}
72
73#endif /* OCFS2_BUFFER_HEAD_IO_H */
diff --git a/fs/ocfs2/dcache.c b/fs/ocfs2/dcache.c
new file mode 100644
index 000000000000..bd85182e97bc
--- /dev/null
+++ b/fs/ocfs2/dcache.c
@@ -0,0 +1,91 @@
1/* -*- mode: c; c-basic-offset: 8; -*-
2 * vim: noexpandtab sw=8 ts=8 sts=0:
3 *
4 * dcache.c
5 *
6 * dentry cache handling code
7 *
8 * Copyright (C) 2002, 2004 Oracle. All rights reserved.
9 *
10 * This program is free software; you can redistribute it and/or
11 * modify it under the terms of the GNU General Public
12 * License as published by the Free Software Foundation; either
13 * version 2 of the License, or (at your option) any later version.
14 *
15 * This program is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
18 * General Public License for more details.
19 *
20 * You should have received a copy of the GNU General Public
21 * License along with this program; if not, write to the
22 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
23 * Boston, MA 02111-1307, USA.
24 */
25
26#include <linux/fs.h>
27#include <linux/types.h>
28#include <linux/slab.h>
29#include <linux/namei.h>
30
31#define MLOG_MASK_PREFIX ML_DCACHE
32#include <cluster/masklog.h>
33
34#include "ocfs2.h"
35
36#include "alloc.h"
37#include "dcache.h"
38#include "file.h"
39#include "inode.h"
40
41static int ocfs2_dentry_revalidate(struct dentry *dentry,
42 struct nameidata *nd)
43{
44 struct inode *inode = dentry->d_inode;
45 int ret = 0; /* if all else fails, just return false */
46 struct ocfs2_super *osb;
47
48 mlog_entry("(0x%p, '%.*s')\n", dentry,
49 dentry->d_name.len, dentry->d_name.name);
50
51 /* Never trust a negative dentry - force a new lookup. */
52 if (inode == NULL) {
53 mlog(0, "negative dentry: %.*s\n", dentry->d_name.len,
54 dentry->d_name.name);
55 goto bail;
56 }
57
58 osb = OCFS2_SB(inode->i_sb);
59
60 BUG_ON(!osb);
61
62 if (inode != osb->root_inode) {
63 spin_lock(&OCFS2_I(inode)->ip_lock);
64 /* did we or someone else delete this inode? */
65 if (OCFS2_I(inode)->ip_flags & OCFS2_INODE_DELETED) {
66 spin_unlock(&OCFS2_I(inode)->ip_lock);
67 mlog(0, "inode (%"MLFu64") deleted, returning false\n",
68 OCFS2_I(inode)->ip_blkno);
69 goto bail;
70 }
71 spin_unlock(&OCFS2_I(inode)->ip_lock);
72
73 if (!inode->i_nlink) {
74 mlog(0, "Inode %"MLFu64" orphaned, returning false "
75 "dir = %d\n", OCFS2_I(inode)->ip_blkno,
76 S_ISDIR(inode->i_mode));
77 goto bail;
78 }
79 }
80
81 ret = 1;
82
83bail:
84 mlog_exit(ret);
85
86 return ret;
87}
88
89struct dentry_operations ocfs2_dentry_ops = {
90 .d_revalidate = ocfs2_dentry_revalidate,
91};
diff --git a/fs/ocfs2/dcache.h b/fs/ocfs2/dcache.h
new file mode 100644
index 000000000000..90072771114b
--- /dev/null
+++ b/fs/ocfs2/dcache.h
@@ -0,0 +1,31 @@
1/* -*- mode: c; c-basic-offset: 8; -*-
2 * vim: noexpandtab sw=8 ts=8 sts=0:
3 *
4 * dcache.h
5 *
6 * Function prototypes
7 *
8 * Copyright (C) 2002, 2004 Oracle. All rights reserved.
9 *
10 * This program is free software; you can redistribute it and/or
11 * modify it under the terms of the GNU General Public
12 * License as published by the Free Software Foundation; either
13 * version 2 of the License, or (at your option) any later version.
14 *
15 * This program is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
18 * General Public License for more details.
19 *
20 * You should have received a copy of the GNU General Public
21 * License along with this program; if not, write to the
22 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
23 * Boston, MA 02111-1307, USA.
24 */
25
26#ifndef OCFS2_DCACHE_H
27#define OCFS2_DCACHE_H
28
29extern struct dentry_operations ocfs2_dentry_ops;
30
31#endif /* OCFS2_DCACHE_H */
diff --git a/fs/ocfs2/dir.c b/fs/ocfs2/dir.c
new file mode 100644
index 000000000000..856e20ae8263
--- /dev/null
+++ b/fs/ocfs2/dir.c
@@ -0,0 +1,618 @@
1/* -*- mode: c; c-basic-offset: 8; -*-
2 * vim: noexpandtab sw=8 ts=8 sts=0:
3 *
4 * dir.c
5 *
6 * Creates, reads, walks and deletes directory-nodes
7 *
8 * Copyright (C) 2002, 2004 Oracle. All rights reserved.
9 *
10 * Portions of this code from linux/fs/ext3/dir.c
11 *
12 * Copyright (C) 1992, 1993, 1994, 1995
13 * Remy Card (card@masi.ibp.fr)
14 * Laboratoire MASI - Institut Blaise Pascal
15 * Universite Pierre et Marie Curie (Paris VI)
16 *
17 * from
18 *
19 * linux/fs/minix/dir.c
20 *
21 * Copyright (C) 1991, 1992 Linus Torvalds
22 *
23 * This program is free software; you can redistribute it and/or
24 * modify it under the terms of the GNU General Public
25 * License as published by the Free Software Foundation; either
26 * version 2 of the License, or (at your option) any later version.
27 *
28 * This program is distributed in the hope that it will be useful,
29 * but WITHOUT ANY WARRANTY; without even the implied warranty of
30 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
31 * General Public License for more details.
32 *
33 * You should have received a copy of the GNU General Public
34 * License along with this program; if not, write to the
35 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
36 * Boston, MA 02111-1307, USA.
37 */
38
39#include <linux/fs.h>
40#include <linux/types.h>
41#include <linux/slab.h>
42#include <linux/highmem.h>
43
44#define MLOG_MASK_PREFIX ML_NAMEI
45#include <cluster/masklog.h>
46
47#include "ocfs2.h"
48
49#include "alloc.h"
50#include "dir.h"
51#include "dlmglue.h"
52#include "extent_map.h"
53#include "file.h"
54#include "inode.h"
55#include "journal.h"
56#include "namei.h"
57#include "suballoc.h"
58#include "uptodate.h"
59
60#include "buffer_head_io.h"
61
62static unsigned char ocfs2_filetype_table[] = {
63 DT_UNKNOWN, DT_REG, DT_DIR, DT_CHR, DT_BLK, DT_FIFO, DT_SOCK, DT_LNK
64};
65
66static int ocfs2_extend_dir(struct ocfs2_super *osb,
67 struct inode *dir,
68 struct buffer_head *parent_fe_bh,
69 struct buffer_head **new_de_bh);
70/*
71 * ocfs2_readdir()
72 *
73 */
74int ocfs2_readdir(struct file * filp, void * dirent, filldir_t filldir)
75{
76 int error = 0;
77 unsigned long offset, blk;
78 int i, num, stored;
79 struct buffer_head * bh, * tmp;
80 struct ocfs2_dir_entry * de;
81 int err;
82 struct inode *inode = filp->f_dentry->d_inode;
83 struct super_block * sb = inode->i_sb;
84 int have_disk_lock = 0;
85
86 mlog_entry("dirino=%"MLFu64"\n", OCFS2_I(inode)->ip_blkno);
87
88 stored = 0;
89 bh = NULL;
90
91 error = ocfs2_meta_lock(inode, NULL, NULL, 0);
92 if (error < 0) {
93 if (error != -ENOENT)
94 mlog_errno(error);
95 /* we haven't got any yet, so propagate the error. */
96 stored = error;
97 goto bail;
98 }
99 have_disk_lock = 1;
100
101 offset = filp->f_pos & (sb->s_blocksize - 1);
102
103 while (!error && !stored && filp->f_pos < i_size_read(inode)) {
104 blk = (filp->f_pos) >> sb->s_blocksize_bits;
105 bh = ocfs2_bread(inode, blk, &err, 0);
106 if (!bh) {
107 mlog(ML_ERROR, "directory #%"MLFu64" contains a hole "
108 "at offset %lld\n",
109 OCFS2_I(inode)->ip_blkno,
110 filp->f_pos);
111 filp->f_pos += sb->s_blocksize - offset;
112 continue;
113 }
114
115 /*
116 * Do the readahead (8k)
117 */
118 if (!offset) {
119 for (i = 16 >> (sb->s_blocksize_bits - 9), num = 0;
120 i > 0; i--) {
121 tmp = ocfs2_bread(inode, ++blk, &err, 1);
122 if (tmp)
123 brelse(tmp);
124 }
125 }
126
127revalidate:
128 /* If the dir block has changed since the last call to
129 * readdir(2), then we might be pointing to an invalid
130 * dirent right now. Scan from the start of the block
131 * to make sure. */
132 if (filp->f_version != inode->i_version) {
133 for (i = 0; i < sb->s_blocksize && i < offset; ) {
134 de = (struct ocfs2_dir_entry *) (bh->b_data + i);
135 /* It's too expensive to do a full
136 * dirent test each time round this
137 * loop, but we do have to test at
138 * least that it is non-zero. A
139 * failure will be detected in the
140 * dirent test below. */
141 if (le16_to_cpu(de->rec_len) <
142 OCFS2_DIR_REC_LEN(1))
143 break;
144 i += le16_to_cpu(de->rec_len);
145 }
146 offset = i;
147 filp->f_pos = (filp->f_pos & ~(sb->s_blocksize - 1))
148 | offset;
149 filp->f_version = inode->i_version;
150 }
151
152 while (!error && filp->f_pos < i_size_read(inode)
153 && offset < sb->s_blocksize) {
154 de = (struct ocfs2_dir_entry *) (bh->b_data + offset);
155 if (!ocfs2_check_dir_entry(inode, de, bh, offset)) {
156 /* On error, skip the f_pos to the
157 next block. */
158 filp->f_pos = (filp->f_pos |
159 (sb->s_blocksize - 1)) + 1;
160 brelse(bh);
161 goto bail;
162 }
163 offset += le16_to_cpu(de->rec_len);
164 if (le64_to_cpu(de->inode)) {
165 /* We might block in the next section
166 * if the data destination is
167 * currently swapped out. So, use a
168 * version stamp to detect whether or
169 * not the directory has been modified
170 * during the copy operation.
171 */
172 unsigned long version = filp->f_version;
173 unsigned char d_type = DT_UNKNOWN;
174
175 if (de->file_type < OCFS2_FT_MAX)
176 d_type = ocfs2_filetype_table[de->file_type];
177 error = filldir(dirent, de->name,
178 de->name_len,
179 filp->f_pos,
180 ino_from_blkno(sb, le64_to_cpu(de->inode)),
181 d_type);
182 if (error)
183 break;
184 if (version != filp->f_version)
185 goto revalidate;
186 stored ++;
187 }
188 filp->f_pos += le16_to_cpu(de->rec_len);
189 }
190 offset = 0;
191 brelse(bh);
192 }
193
194 stored = 0;
195bail:
196 if (have_disk_lock)
197 ocfs2_meta_unlock(inode, 0);
198
199 mlog_exit(stored);
200
201 return stored;
202}
203
204/*
205 * NOTE: this should always be called with parent dir i_sem taken.
206 */
207int ocfs2_find_files_on_disk(const char *name,
208 int namelen,
209 u64 *blkno,
210 struct inode *inode,
211 struct buffer_head **dirent_bh,
212 struct ocfs2_dir_entry **dirent)
213{
214 int status = -ENOENT;
215 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
216
217 mlog_entry("(osb=%p, parent=%"MLFu64", name='%.*s', blkno=%p, "
218 "inode=%p)\n",
219 osb, OCFS2_I(inode)->ip_blkno, namelen, name, blkno, inode);
220
221 *dirent_bh = ocfs2_find_entry(name, namelen, inode, dirent);
222 if (!*dirent_bh || !*dirent) {
223 status = -ENOENT;
224 goto leave;
225 }
226
227 *blkno = le64_to_cpu((*dirent)->inode);
228
229 status = 0;
230leave:
231 if (status < 0) {
232 *dirent = NULL;
233 if (*dirent_bh) {
234 brelse(*dirent_bh);
235 *dirent_bh = NULL;
236 }
237 }
238
239 mlog_exit(status);
240 return status;
241}
242
243/* Check for a name within a directory.
244 *
245 * Return 0 if the name does not exist
246 * Return -EEXIST if the directory contains the name
247 *
248 * Callers should have i_sem + a cluster lock on dir
249 */
250int ocfs2_check_dir_for_entry(struct inode *dir,
251 const char *name,
252 int namelen)
253{
254 int ret;
255 struct buffer_head *dirent_bh = NULL;
256 struct ocfs2_dir_entry *dirent = NULL;
257
258 mlog_entry("dir %"MLFu64", name '%.*s'\n", OCFS2_I(dir)->ip_blkno,
259 namelen, name);
260
261 ret = -EEXIST;
262 dirent_bh = ocfs2_find_entry(name, namelen, dir, &dirent);
263 if (dirent_bh)
264 goto bail;
265
266 ret = 0;
267bail:
268 if (dirent_bh)
269 brelse(dirent_bh);
270
271 mlog_exit(ret);
272 return ret;
273}
274
275/*
276 * routine to check that the specified directory is empty (for rmdir)
277 */
278int ocfs2_empty_dir(struct inode *inode)
279{
280 unsigned long offset;
281 struct buffer_head * bh;
282 struct ocfs2_dir_entry * de, * de1;
283 struct super_block * sb;
284 int err;
285
286 sb = inode->i_sb;
287 if ((i_size_read(inode) <
288 (OCFS2_DIR_REC_LEN(1) + OCFS2_DIR_REC_LEN(2))) ||
289 !(bh = ocfs2_bread(inode, 0, &err, 0))) {
290 mlog(ML_ERROR, "bad directory (dir #%"MLFu64") - "
291 "no data block\n",
292 OCFS2_I(inode)->ip_blkno);
293 return 1;
294 }
295
296 de = (struct ocfs2_dir_entry *) bh->b_data;
297 de1 = (struct ocfs2_dir_entry *)
298 ((char *)de + le16_to_cpu(de->rec_len));
299 if ((le64_to_cpu(de->inode) != OCFS2_I(inode)->ip_blkno) ||
300 !le64_to_cpu(de1->inode) ||
301 strcmp(".", de->name) ||
302 strcmp("..", de1->name)) {
303 mlog(ML_ERROR, "bad directory (dir #%"MLFu64") - "
304 "no `.' or `..'\n",
305 OCFS2_I(inode)->ip_blkno);
306 brelse(bh);
307 return 1;
308 }
309 offset = le16_to_cpu(de->rec_len) + le16_to_cpu(de1->rec_len);
310 de = (struct ocfs2_dir_entry *)((char *)de1 + le16_to_cpu(de1->rec_len));
311 while (offset < i_size_read(inode) ) {
312 if (!bh || (void *)de >= (void *)(bh->b_data + sb->s_blocksize)) {
313 brelse(bh);
314 bh = ocfs2_bread(inode,
315 offset >> sb->s_blocksize_bits, &err, 0);
316 if (!bh) {
317 mlog(ML_ERROR, "directory #%"MLFu64" contains "
318 "a hole at offset %lu\n",
319 OCFS2_I(inode)->ip_blkno, offset);
320 offset += sb->s_blocksize;
321 continue;
322 }
323 de = (struct ocfs2_dir_entry *) bh->b_data;
324 }
325 if (!ocfs2_check_dir_entry(inode, de, bh, offset)) {
326 brelse(bh);
327 return 1;
328 }
329 if (le64_to_cpu(de->inode)) {
330 brelse(bh);
331 return 0;
332 }
333 offset += le16_to_cpu(de->rec_len);
334 de = (struct ocfs2_dir_entry *)
335 ((char *)de + le16_to_cpu(de->rec_len));
336 }
337 brelse(bh);
338 return 1;
339}
340
341/* returns a bh of the 1st new block in the allocation. */
342int ocfs2_do_extend_dir(struct super_block *sb,
343 struct ocfs2_journal_handle *handle,
344 struct inode *dir,
345 struct buffer_head *parent_fe_bh,
346 struct ocfs2_alloc_context *data_ac,
347 struct ocfs2_alloc_context *meta_ac,
348 struct buffer_head **new_bh)
349{
350 int status;
351 int extend;
352 u64 p_blkno;
353
354 spin_lock(&OCFS2_I(dir)->ip_lock);
355 extend = (i_size_read(dir) == ocfs2_clusters_to_bytes(sb, OCFS2_I(dir)->ip_clusters));
356 spin_unlock(&OCFS2_I(dir)->ip_lock);
357
358 if (extend) {
359 status = ocfs2_do_extend_allocation(OCFS2_SB(sb), dir, 1,
360 parent_fe_bh, handle,
361 data_ac, meta_ac, NULL);
362 BUG_ON(status == -EAGAIN);
363 if (status < 0) {
364 mlog_errno(status);
365 goto bail;
366 }
367 }
368
369 status = ocfs2_extent_map_get_blocks(dir, (dir->i_blocks >>
370 (sb->s_blocksize_bits - 9)),
371 1, &p_blkno, NULL);
372 if (status < 0) {
373 mlog_errno(status);
374 goto bail;
375 }
376
377 *new_bh = sb_getblk(sb, p_blkno);
378 if (!*new_bh) {
379 status = -EIO;
380 mlog_errno(status);
381 goto bail;
382 }
383 status = 0;
384bail:
385 mlog_exit(status);
386 return status;
387}
388
389/* assumes you already have a cluster lock on the directory. */
390static int ocfs2_extend_dir(struct ocfs2_super *osb,
391 struct inode *dir,
392 struct buffer_head *parent_fe_bh,
393 struct buffer_head **new_de_bh)
394{
395 int status = 0;
396 int credits, num_free_extents;
397 loff_t dir_i_size;
398 struct ocfs2_dinode *fe = (struct ocfs2_dinode *) parent_fe_bh->b_data;
399 struct ocfs2_alloc_context *data_ac = NULL;
400 struct ocfs2_alloc_context *meta_ac = NULL;
401 struct ocfs2_journal_handle *handle = NULL;
402 struct buffer_head *new_bh = NULL;
403 struct ocfs2_dir_entry * de;
404 struct super_block *sb = osb->sb;
405
406 mlog_entry_void();
407
408 dir_i_size = i_size_read(dir);
409 mlog(0, "extending dir %"MLFu64" (i_size = %lld)\n",
410 OCFS2_I(dir)->ip_blkno, dir_i_size);
411
412 handle = ocfs2_alloc_handle(osb);
413 if (handle == NULL) {
414 status = -ENOMEM;
415 mlog_errno(status);
416 goto bail;
417 }
418
419 /* dir->i_size is always block aligned. */
420 spin_lock(&OCFS2_I(dir)->ip_lock);
421 if (dir_i_size == ocfs2_clusters_to_bytes(sb, OCFS2_I(dir)->ip_clusters)) {
422 spin_unlock(&OCFS2_I(dir)->ip_lock);
423 num_free_extents = ocfs2_num_free_extents(osb, dir, fe);
424 if (num_free_extents < 0) {
425 status = num_free_extents;
426 mlog_errno(status);
427 goto bail;
428 }
429
430 if (!num_free_extents) {
431 status = ocfs2_reserve_new_metadata(osb, handle,
432 fe, &meta_ac);
433 if (status < 0) {
434 if (status != -ENOSPC)
435 mlog_errno(status);
436 goto bail;
437 }
438 }
439
440 status = ocfs2_reserve_clusters(osb, handle, 1, &data_ac);
441 if (status < 0) {
442 if (status != -ENOSPC)
443 mlog_errno(status);
444 goto bail;
445 }
446
447 credits = ocfs2_calc_extend_credits(sb, fe, 1);
448 } else {
449 spin_unlock(&OCFS2_I(dir)->ip_lock);
450 credits = OCFS2_SIMPLE_DIR_EXTEND_CREDITS;
451 }
452
453 handle = ocfs2_start_trans(osb, handle, credits);
454 if (IS_ERR(handle)) {
455 status = PTR_ERR(handle);
456 handle = NULL;
457 mlog_errno(status);
458 goto bail;
459 }
460
461 status = ocfs2_do_extend_dir(osb->sb, handle, dir, parent_fe_bh,
462 data_ac, meta_ac, &new_bh);
463 if (status < 0) {
464 mlog_errno(status);
465 goto bail;
466 }
467
468 ocfs2_set_new_buffer_uptodate(dir, new_bh);
469
470 status = ocfs2_journal_access(handle, dir, new_bh,
471 OCFS2_JOURNAL_ACCESS_CREATE);
472 if (status < 0) {
473 mlog_errno(status);
474 goto bail;
475 }
476 memset(new_bh->b_data, 0, sb->s_blocksize);
477 de = (struct ocfs2_dir_entry *) new_bh->b_data;
478 de->inode = 0;
479 de->rec_len = cpu_to_le16(sb->s_blocksize);
480 status = ocfs2_journal_dirty(handle, new_bh);
481 if (status < 0) {
482 mlog_errno(status);
483 goto bail;
484 }
485
486 dir_i_size += dir->i_sb->s_blocksize;
487 i_size_write(dir, dir_i_size);
488 dir->i_blocks = ocfs2_align_bytes_to_sectors(dir_i_size);
489 status = ocfs2_mark_inode_dirty(handle, dir, parent_fe_bh);
490 if (status < 0) {
491 mlog_errno(status);
492 goto bail;
493 }
494
495 *new_de_bh = new_bh;
496 get_bh(*new_de_bh);
497bail:
498 if (handle)
499 ocfs2_commit_trans(handle);
500
501 if (data_ac)
502 ocfs2_free_alloc_context(data_ac);
503 if (meta_ac)
504 ocfs2_free_alloc_context(meta_ac);
505
506 if (new_bh)
507 brelse(new_bh);
508
509 mlog_exit(status);
510 return status;
511}
512
513/*
514 * Search the dir for a good spot, extending it if necessary. The
515 * block containing an appropriate record is returned in ret_de_bh.
516 */
517int ocfs2_prepare_dir_for_insert(struct ocfs2_super *osb,
518 struct inode *dir,
519 struct buffer_head *parent_fe_bh,
520 const char *name,
521 int namelen,
522 struct buffer_head **ret_de_bh)
523{
524 unsigned long offset;
525 struct buffer_head * bh = NULL;
526 unsigned short rec_len;
527 struct ocfs2_dinode *fe;
528 struct ocfs2_dir_entry *de;
529 struct super_block *sb;
530 int status;
531
532 mlog_entry_void();
533
534 mlog(0, "getting ready to insert namelen %d into dir %"MLFu64"\n",
535 namelen, OCFS2_I(dir)->ip_blkno);
536
537 BUG_ON(!S_ISDIR(dir->i_mode));
538 fe = (struct ocfs2_dinode *) parent_fe_bh->b_data;
539 BUG_ON(le64_to_cpu(fe->i_size) != i_size_read(dir));
540
541 sb = dir->i_sb;
542
543 if (!namelen) {
544 status = -EINVAL;
545 mlog_errno(status);
546 goto bail;
547 }
548
549 bh = ocfs2_bread(dir, 0, &status, 0);
550 if (!bh) {
551 mlog_errno(status);
552 goto bail;
553 }
554
555 rec_len = OCFS2_DIR_REC_LEN(namelen);
556 offset = 0;
557 de = (struct ocfs2_dir_entry *) bh->b_data;
558 while (1) {
559 if ((char *)de >= sb->s_blocksize + bh->b_data) {
560 brelse(bh);
561 bh = NULL;
562
563 if (i_size_read(dir) <= offset) {
564 status = ocfs2_extend_dir(osb,
565 dir,
566 parent_fe_bh,
567 &bh);
568 if (status < 0) {
569 mlog_errno(status);
570 goto bail;
571 }
572 BUG_ON(!bh);
573 *ret_de_bh = bh;
574 get_bh(*ret_de_bh);
575 goto bail;
576 }
577 bh = ocfs2_bread(dir,
578 offset >> sb->s_blocksize_bits,
579 &status,
580 0);
581 if (!bh) {
582 mlog_errno(status);
583 goto bail;
584 }
585 /* move to next block */
586 de = (struct ocfs2_dir_entry *) bh->b_data;
587 }
588 if (!ocfs2_check_dir_entry(dir, de, bh, offset)) {
589 status = -ENOENT;
590 goto bail;
591 }
592 if (ocfs2_match(namelen, name, de)) {
593 status = -EEXIST;
594 goto bail;
595 }
596 if (((le64_to_cpu(de->inode) == 0) &&
597 (le16_to_cpu(de->rec_len) >= rec_len)) ||
598 (le16_to_cpu(de->rec_len) >=
599 (OCFS2_DIR_REC_LEN(de->name_len) + rec_len))) {
600 /* Ok, we found a spot. Return this bh and let
601 * the caller actually fill it in. */
602 *ret_de_bh = bh;
603 get_bh(*ret_de_bh);
604 status = 0;
605 goto bail;
606 }
607 offset += le16_to_cpu(de->rec_len);
608 de = (struct ocfs2_dir_entry *)((char *) de + le16_to_cpu(de->rec_len));
609 }
610
611 status = 0;
612bail:
613 if (bh)
614 brelse(bh);
615
616 mlog_exit(status);
617 return status;
618}
diff --git a/fs/ocfs2/dir.h b/fs/ocfs2/dir.h
new file mode 100644
index 000000000000..5f614ec9649c
--- /dev/null
+++ b/fs/ocfs2/dir.h
@@ -0,0 +1,54 @@
1/* -*- mode: c; c-basic-offset: 8; -*-
2 * vim: noexpandtab sw=8 ts=8 sts=0:
3 *
4 * dir.h
5 *
6 * Function prototypes
7 *
8 * Copyright (C) 2002, 2004 Oracle. All rights reserved.
9 *
10 * This program is free software; you can redistribute it and/or
11 * modify it under the terms of the GNU General Public
12 * License as published by the Free Software Foundation; either
13 * version 2 of the License, or (at your option) any later version.
14 *
15 * This program is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
18 * General Public License for more details.
19 *
20 * You should have received a copy of the GNU General Public
21 * License along with this program; if not, write to the
22 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
23 * Boston, MA 02111-1307, USA.
24 */
25
26#ifndef OCFS2_DIR_H
27#define OCFS2_DIR_H
28
29int ocfs2_check_dir_for_entry(struct inode *dir,
30 const char *name,
31 int namelen);
32int ocfs2_empty_dir(struct inode *inode); /* FIXME: to namei.c */
33int ocfs2_find_files_on_disk(const char *name,
34 int namelen,
35 u64 *blkno,
36 struct inode *inode,
37 struct buffer_head **dirent_bh,
38 struct ocfs2_dir_entry **dirent);
39int ocfs2_readdir(struct file *filp, void *dirent, filldir_t filldir);
40int ocfs2_prepare_dir_for_insert(struct ocfs2_super *osb,
41 struct inode *dir,
42 struct buffer_head *parent_fe_bh,
43 const char *name,
44 int namelen,
45 struct buffer_head **ret_de_bh);
46struct ocfs2_alloc_context;
47int ocfs2_do_extend_dir(struct super_block *sb,
48 struct ocfs2_journal_handle *handle,
49 struct inode *dir,
50 struct buffer_head *parent_fe_bh,
51 struct ocfs2_alloc_context *data_ac,
52 struct ocfs2_alloc_context *meta_ac,
53 struct buffer_head **new_bh);
54#endif /* OCFS2_DIR_H */
diff --git a/fs/ocfs2/dlmglue.c b/fs/ocfs2/dlmglue.c
new file mode 100644
index 000000000000..e971ec2f8407
--- /dev/null
+++ b/fs/ocfs2/dlmglue.c
@@ -0,0 +1,2904 @@
1/* -*- mode: c; c-basic-offset: 8; -*-
2 * vim: noexpandtab sw=8 ts=8 sts=0:
3 *
4 * dlmglue.c
5 *
6 * Code which implements an OCFS2 specific interface to our DLM.
7 *
8 * Copyright (C) 2003, 2004 Oracle. All rights reserved.
9 *
10 * This program is free software; you can redistribute it and/or
11 * modify it under the terms of the GNU General Public
12 * License as published by the Free Software Foundation; either
13 * version 2 of the License, or (at your option) any later version.
14 *
15 * This program is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
18 * General Public License for more details.
19 *
20 * You should have received a copy of the GNU General Public
21 * License along with this program; if not, write to the
22 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
23 * Boston, MA 02111-1307, USA.
24 */
25
26#include <linux/types.h>
27#include <linux/slab.h>
28#include <linux/highmem.h>
29#include <linux/mm.h>
30#include <linux/smp_lock.h>
31#include <linux/crc32.h>
32#include <linux/kthread.h>
33#include <linux/pagemap.h>
34#include <linux/debugfs.h>
35#include <linux/seq_file.h>
36
37#include <cluster/heartbeat.h>
38#include <cluster/nodemanager.h>
39#include <cluster/tcp.h>
40
41#include <dlm/dlmapi.h>
42
43#define MLOG_MASK_PREFIX ML_DLM_GLUE
44#include <cluster/masklog.h>
45
46#include "ocfs2.h"
47
48#include "alloc.h"
49#include "dlmglue.h"
50#include "extent_map.h"
51#include "heartbeat.h"
52#include "inode.h"
53#include "journal.h"
54#include "slot_map.h"
55#include "super.h"
56#include "uptodate.h"
57#include "vote.h"
58
59#include "buffer_head_io.h"
60
/*
 * Bookkeeping for one waiter on a lock resource's flag mask.
 * NOTE(review): field roles below are inferred from names and from the
 * l_mask_waiters list initialised in ocfs2_lock_res_init_once(); the
 * code that uses this struct is outside this view - confirm there.
 */
61struct ocfs2_mask_waiter {
/* list linkage (presumably onto a lockres l_mask_waiters list) */
62 struct list_head mw_item;
/* status handed back to the waiter - TODO confirm who sets it */
63 int mw_status;
/* completion the waiter sleeps on */
64 struct completion mw_complete;
/* presumably: which flag bits to examine */
65 unsigned long mw_mask;
/* presumably: the value those bits must reach */
66 unsigned long mw_goal;
67};
68
69static void ocfs2_inode_ast_func(void *opaque);
70static void ocfs2_inode_bast_func(void *opaque,
71 int level);
72static void ocfs2_super_ast_func(void *opaque);
73static void ocfs2_super_bast_func(void *opaque,
74 int level);
75static void ocfs2_rename_ast_func(void *opaque);
76static void ocfs2_rename_bast_func(void *opaque,
77 int level);
78
79/* so far, all locks have gotten along with the same unlock ast */
80static void ocfs2_unlock_ast_func(void *opaque,
81 enum dlm_status status);
82static int ocfs2_do_unblock_meta(struct inode *inode,
83 int *requeue);
84static int ocfs2_unblock_meta(struct ocfs2_lock_res *lockres,
85 int *requeue);
86static int ocfs2_unblock_data(struct ocfs2_lock_res *lockres,
87 int *requeue);
88static int ocfs2_unblock_inode_lock(struct ocfs2_lock_res *lockres,
89 int *requeue);
90static int ocfs2_unblock_osb_lock(struct ocfs2_lock_res *lockres,
91 int *requeue);
92typedef void (ocfs2_convert_worker_t)(struct ocfs2_lock_res *, int);
93static int ocfs2_generic_unblock_lock(struct ocfs2_super *osb,
94 struct ocfs2_lock_res *lockres,
95 int *requeue,
96 ocfs2_convert_worker_t *worker);
97
/*
 * Per-lock-type callback table; a pointer to one of these is stored in
 * a lock resource's l_ops by ocfs2_lock_res_init_common().
 */
98struct ocfs2_lock_res_ops {
/* AST callback; receives an opaque pointer */
99 void (*ast)(void *);
/* blocking-AST callback; receives an opaque pointer and a level */
100 void (*bast)(void *, int);
/* unlock AST callback; receives an opaque pointer and a dlm status */
101 void (*unlock_ast)(void *, enum dlm_status);
/* unblock handler; second argument is an out-parameter (named
 * "requeue" in the handler prototypes in this file) */
102 int (*unblock)(struct ocfs2_lock_res *, int *);
103};
104
105static struct ocfs2_lock_res_ops ocfs2_inode_rw_lops = {
106 .ast = ocfs2_inode_ast_func,
107 .bast = ocfs2_inode_bast_func,
108 .unlock_ast = ocfs2_unlock_ast_func,
109 .unblock = ocfs2_unblock_inode_lock,
110};
111
112static struct ocfs2_lock_res_ops ocfs2_inode_meta_lops = {
113 .ast = ocfs2_inode_ast_func,
114 .bast = ocfs2_inode_bast_func,
115 .unlock_ast = ocfs2_unlock_ast_func,
116 .unblock = ocfs2_unblock_meta,
117};
118
119static void ocfs2_data_convert_worker(struct ocfs2_lock_res *lockres,
120 int blocking);
121
122static struct ocfs2_lock_res_ops ocfs2_inode_data_lops = {
123 .ast = ocfs2_inode_ast_func,
124 .bast = ocfs2_inode_bast_func,
125 .unlock_ast = ocfs2_unlock_ast_func,
126 .unblock = ocfs2_unblock_data,
127};
128
129static struct ocfs2_lock_res_ops ocfs2_super_lops = {
130 .ast = ocfs2_super_ast_func,
131 .bast = ocfs2_super_bast_func,
132 .unlock_ast = ocfs2_unlock_ast_func,
133 .unblock = ocfs2_unblock_osb_lock,
134};
135
136static struct ocfs2_lock_res_ops ocfs2_rename_lops = {
137 .ast = ocfs2_rename_ast_func,
138 .bast = ocfs2_rename_bast_func,
139 .unlock_ast = ocfs2_unlock_ast_func,
140 .unblock = ocfs2_unblock_osb_lock,
141};
142
143static inline int ocfs2_is_inode_lock(struct ocfs2_lock_res *lockres)
144{
145 return lockres->l_type == OCFS2_LOCK_TYPE_META ||
146 lockres->l_type == OCFS2_LOCK_TYPE_DATA ||
147 lockres->l_type == OCFS2_LOCK_TYPE_RW;
148}
149
150static inline int ocfs2_is_super_lock(struct ocfs2_lock_res *lockres)
151{
152 return lockres->l_type == OCFS2_LOCK_TYPE_SUPER;
153}
154
155static inline int ocfs2_is_rename_lock(struct ocfs2_lock_res *lockres)
156{
157 return lockres->l_type == OCFS2_LOCK_TYPE_RENAME;
158}
159
160static inline struct ocfs2_super *ocfs2_lock_res_super(struct ocfs2_lock_res *lockres)
161{
162 BUG_ON(!ocfs2_is_super_lock(lockres)
163 && !ocfs2_is_rename_lock(lockres));
164
165 return (struct ocfs2_super *) lockres->l_priv;
166}
167
168static inline struct inode *ocfs2_lock_res_inode(struct ocfs2_lock_res *lockres)
169{
170 BUG_ON(!ocfs2_is_inode_lock(lockres));
171
172 return (struct inode *) lockres->l_priv;
173}
174
175static int ocfs2_lock_create(struct ocfs2_super *osb,
176 struct ocfs2_lock_res *lockres,
177 int level,
178 int dlm_flags);
179static inline int ocfs2_may_continue_on_blocked_lock(struct ocfs2_lock_res *lockres,
180 int wanted);
181static void ocfs2_cluster_unlock(struct ocfs2_super *osb,
182 struct ocfs2_lock_res *lockres,
183 int level);
184static inline void ocfs2_generic_handle_downconvert_action(struct ocfs2_lock_res *lockres);
185static inline void ocfs2_generic_handle_convert_action(struct ocfs2_lock_res *lockres);
186static inline void ocfs2_generic_handle_attach_action(struct ocfs2_lock_res *lockres);
187static int ocfs2_generic_handle_bast(struct ocfs2_lock_res *lockres, int level);
188static void ocfs2_schedule_blocked_lock(struct ocfs2_super *osb,
189 struct ocfs2_lock_res *lockres);
190static inline void ocfs2_recover_from_dlm_error(struct ocfs2_lock_res *lockres,
191 int convert);
/*
 * Log a DLM failure at ML_ERROR: the symbolic error name, the DLM call
 * that failed (_func, a string), the lock resource name and the DLM's
 * message for the status.  _lockres is parenthesised so that pointer
 * expressions (e.g. "base + 1") expand correctly.
 */
#define ocfs2_log_dlm_error(_func, _stat, _lockres) do {		\
	mlog(ML_ERROR, "Dlm error \"%s\" while calling %s on "		\
	     "resource %s: %s\n", dlm_errname(_stat), _func,		\
	     (_lockres)->l_name, dlm_errmsg(_stat));			\
} while (0)
197static void ocfs2_vote_on_unlock(struct ocfs2_super *osb,
198 struct ocfs2_lock_res *lockres);
199static int ocfs2_meta_lock_update(struct inode *inode,
200 struct buffer_head **bh);
201static void ocfs2_drop_osb_locks(struct ocfs2_super *osb);
202static inline int ocfs2_highest_compat_lock_level(int level);
203static inline int ocfs2_can_downconvert_meta_lock(struct inode *inode,
204 struct ocfs2_lock_res *lockres,
205 int new_level);
206
/* Human-readable lock type names, indexed by enum ocfs2_lock_type. */
207static char *ocfs2_lock_type_strings[] = {
208 [OCFS2_LOCK_TYPE_META] = "Meta",
209 [OCFS2_LOCK_TYPE_DATA] = "Data",
210 [OCFS2_LOCK_TYPE_SUPER] = "Super",
211 [OCFS2_LOCK_TYPE_RENAME] = "Rename",
212 /* Need to differentiate from [R]ename.. serializing writes is the
213 * important job it does, anyway. */
214 [OCFS2_LOCK_TYPE_RW] = "Write/Read",
215};
216
217static char *ocfs2_lock_type_string(enum ocfs2_lock_type type)
218{
219 mlog_bug_on_msg(type >= OCFS2_NUM_LOCK_TYPES, "%d\n", type);
220 return ocfs2_lock_type_strings[type];
221}
222
223static void ocfs2_build_lock_name(enum ocfs2_lock_type type,
224 u64 blkno,
225 u32 generation,
226 char *name)
227{
228 int len;
229
230 mlog_entry_void();
231
232 BUG_ON(type >= OCFS2_NUM_LOCK_TYPES);
233
234 len = snprintf(name, OCFS2_LOCK_ID_MAX_LEN, "%c%s%016"MLFx64"%08x",
235 ocfs2_lock_type_char(type), OCFS2_LOCK_ID_PAD, blkno,
236 generation);
237
238 BUG_ON(len != (OCFS2_LOCK_ID_MAX_LEN - 1));
239
240 mlog(0, "built lock resource with name: %s\n", name);
241
242 mlog_exit_void();
243}
244
/* Protects every lockres l_debug_list linkage on the tracking lists. */
static DEFINE_SPINLOCK(ocfs2_dlm_tracking_lock);
246
247static void ocfs2_add_lockres_tracking(struct ocfs2_lock_res *res,
248 struct ocfs2_dlm_debug *dlm_debug)
249{
250 mlog(0, "Add tracking for lockres %s\n", res->l_name);
251
252 spin_lock(&ocfs2_dlm_tracking_lock);
253 list_add(&res->l_debug_list, &dlm_debug->d_lockres_tracking);
254 spin_unlock(&ocfs2_dlm_tracking_lock);
255}
256
257static void ocfs2_remove_lockres_tracking(struct ocfs2_lock_res *res)
258{
259 spin_lock(&ocfs2_dlm_tracking_lock);
260 if (!list_empty(&res->l_debug_list))
261 list_del_init(&res->l_debug_list);
262 spin_unlock(&ocfs2_dlm_tracking_lock);
263}
264
265static void ocfs2_lock_res_init_common(struct ocfs2_super *osb,
266 struct ocfs2_lock_res *res,
267 enum ocfs2_lock_type type,
268 u64 blkno,
269 u32 generation,
270 struct ocfs2_lock_res_ops *ops,
271 void *priv)
272{
273 ocfs2_build_lock_name(type, blkno, generation, res->l_name);
274
275 res->l_type = type;
276 res->l_ops = ops;
277 res->l_priv = priv;
278
279 res->l_level = LKM_IVMODE;
280 res->l_requested = LKM_IVMODE;
281 res->l_blocking = LKM_IVMODE;
282 res->l_action = OCFS2_AST_INVALID;
283 res->l_unlock_action = OCFS2_UNLOCK_INVALID;
284
285 res->l_flags = OCFS2_LOCK_INITIALIZED;
286
287 ocfs2_add_lockres_tracking(res, osb->osb_dlm_debug);
288}
289
290void ocfs2_lock_res_init_once(struct ocfs2_lock_res *res)
291{
292 /* This also clears out the lock status block */
293 memset(res, 0, sizeof(struct ocfs2_lock_res));
294 spin_lock_init(&res->l_lock);
295 init_waitqueue_head(&res->l_event);
296 INIT_LIST_HEAD(&res->l_blocked_list);
297 INIT_LIST_HEAD(&res->l_mask_waiters);
298}
299
300void ocfs2_inode_lock_res_init(struct ocfs2_lock_res *res,
301 enum ocfs2_lock_type type,
302 struct inode *inode)
303{
304 struct ocfs2_lock_res_ops *ops;
305
306 switch(type) {
307 case OCFS2_LOCK_TYPE_RW:
308 ops = &ocfs2_inode_rw_lops;
309 break;
310 case OCFS2_LOCK_TYPE_META:
311 ops = &ocfs2_inode_meta_lops;
312 break;
313 case OCFS2_LOCK_TYPE_DATA:
314 ops = &ocfs2_inode_data_lops;
315 break;
316 default:
317 mlog_bug_on_msg(1, "type: %d\n", type);
318 ops = NULL; /* thanks, gcc */
319 break;
320 };
321
322 ocfs2_lock_res_init_common(OCFS2_SB(inode->i_sb), res, type,
323 OCFS2_I(inode)->ip_blkno,
324 inode->i_generation, ops, inode);
325}
326
327static void ocfs2_super_lock_res_init(struct ocfs2_lock_res *res,
328 struct ocfs2_super *osb)
329{
330 /* Superblock lockres doesn't come from a slab so we call init
331 * once on it manually. */
332 ocfs2_lock_res_init_once(res);
333 ocfs2_lock_res_init_common(osb, res, OCFS2_LOCK_TYPE_SUPER,
334 OCFS2_SUPER_BLOCK_BLKNO, 0,
335 &ocfs2_super_lops, osb);
336}
337
338static void ocfs2_rename_lock_res_init(struct ocfs2_lock_res *res,
339 struct ocfs2_super *osb)
340{
341 /* Rename lockres doesn't come from a slab so we call init
342 * once on it manually. */
343 ocfs2_lock_res_init_once(res);
344 ocfs2_lock_res_init_common(osb, res, OCFS2_LOCK_TYPE_RENAME, 0, 0,
345 &ocfs2_rename_lops, osb);
346}
347
348void ocfs2_lock_res_free(struct ocfs2_lock_res *res)
349{
350 mlog_entry_void();
351
352 if (!(res->l_flags & OCFS2_LOCK_INITIALIZED))
353 return;
354
355 ocfs2_remove_lockres_tracking(res);
356
357 mlog_bug_on_msg(!list_empty(&res->l_blocked_list),
358 "Lockres %s is on the blocked list\n",
359 res->l_name);
360 mlog_bug_on_msg(!list_empty(&res->l_mask_waiters),
361 "Lockres %s has mask waiters pending\n",
362 res->l_name);
363 mlog_bug_on_msg(spin_is_locked(&res->l_lock),
364 "Lockres %s is locked\n",
365 res->l_name);
366 mlog_bug_on_msg(res->l_ro_holders,
367 "Lockres %s has %u ro holders\n",
368 res->l_name, res->l_ro_holders);
369 mlog_bug_on_msg(res->l_ex_holders,
370 "Lockres %s has %u ex holders\n",
371 res->l_name, res->l_ex_holders);
372
373 /* Need to clear out the lock status block for the dlm */
374 memset(&res->l_lksb, 0, sizeof(res->l_lksb));
375
376 res->l_flags = 0UL;
377 mlog_exit_void();
378}
379
380static inline void ocfs2_inc_holders(struct ocfs2_lock_res *lockres,
381 int level)
382{
383 mlog_entry_void();
384
385 BUG_ON(!lockres);
386
387 switch(level) {
388 case LKM_EXMODE:
389 lockres->l_ex_holders++;
390 break;
391 case LKM_PRMODE:
392 lockres->l_ro_holders++;
393 break;
394 default:
395 BUG();
396 }
397
398 mlog_exit_void();
399}
400
401static inline void ocfs2_dec_holders(struct ocfs2_lock_res *lockres,
402 int level)
403{
404 mlog_entry_void();
405
406 BUG_ON(!lockres);
407
408 switch(level) {
409 case LKM_EXMODE:
410 BUG_ON(!lockres->l_ex_holders);
411 lockres->l_ex_holders--;
412 break;
413 case LKM_PRMODE:
414 BUG_ON(!lockres->l_ro_holders);
415 lockres->l_ro_holders--;
416 break;
417 default:
418 BUG();
419 }
420 mlog_exit_void();
421}
422
423/* WARNING: This function lives in a world where the only three lock
424 * levels are EX, PR, and NL. It *will* have to be adjusted when more
425 * lock types are added. */
426static inline int ocfs2_highest_compat_lock_level(int level)
427{
428 int new_level = LKM_EXMODE;
429
430 if (level == LKM_EXMODE)
431 new_level = LKM_NLMODE;
432 else if (level == LKM_PRMODE)
433 new_level = LKM_PRMODE;
434 return new_level;
435}
436
437static void lockres_set_flags(struct ocfs2_lock_res *lockres,
438 unsigned long newflags)
439{
440 struct list_head *pos, *tmp;
441 struct ocfs2_mask_waiter *mw;
442
443 assert_spin_locked(&lockres->l_lock);
444
445 lockres->l_flags = newflags;
446
447 list_for_each_safe(pos, tmp, &lockres->l_mask_waiters) {
448 mw = list_entry(pos, struct ocfs2_mask_waiter, mw_item);
449 if ((lockres->l_flags & mw->mw_mask) != mw->mw_goal)
450 continue;
451
452 list_del_init(&mw->mw_item);
453 mw->mw_status = 0;
454 complete(&mw->mw_complete);
455 }
456}
457static void lockres_or_flags(struct ocfs2_lock_res *lockres, unsigned long or)
458{
459 lockres_set_flags(lockres, lockres->l_flags | or);
460}
461static void lockres_clear_flags(struct ocfs2_lock_res *lockres,
462 unsigned long clear)
463{
464 lockres_set_flags(lockres, lockres->l_flags & ~clear);
465}
466
467static inline void ocfs2_generic_handle_downconvert_action(struct ocfs2_lock_res *lockres)
468{
469 mlog_entry_void();
470
471 BUG_ON(!(lockres->l_flags & OCFS2_LOCK_BUSY));
472 BUG_ON(!(lockres->l_flags & OCFS2_LOCK_ATTACHED));
473 BUG_ON(!(lockres->l_flags & OCFS2_LOCK_BLOCKED));
474 BUG_ON(lockres->l_blocking <= LKM_NLMODE);
475
476 lockres->l_level = lockres->l_requested;
477 if (lockres->l_level <=
478 ocfs2_highest_compat_lock_level(lockres->l_blocking)) {
479 lockres->l_blocking = LKM_NLMODE;
480 lockres_clear_flags(lockres, OCFS2_LOCK_BLOCKED);
481 }
482 lockres_clear_flags(lockres, OCFS2_LOCK_BUSY);
483
484 mlog_exit_void();
485}
486
487static inline void ocfs2_generic_handle_convert_action(struct ocfs2_lock_res *lockres)
488{
489 mlog_entry_void();
490
491 BUG_ON(!(lockres->l_flags & OCFS2_LOCK_BUSY));
492 BUG_ON(!(lockres->l_flags & OCFS2_LOCK_ATTACHED));
493
494 /* Convert from RO to EX doesn't really need anything as our
495 * information is already up to data. Convert from NL to
496 * *anything* however should mark ourselves as needing an
497 * update */
498 if (lockres->l_level == LKM_NLMODE)
499 lockres_or_flags(lockres, OCFS2_LOCK_NEEDS_REFRESH);
500
501 lockres->l_level = lockres->l_requested;
502 lockres_clear_flags(lockres, OCFS2_LOCK_BUSY);
503
504 mlog_exit_void();
505}
506
507static inline void ocfs2_generic_handle_attach_action(struct ocfs2_lock_res *lockres)
508{
509 mlog_entry_void();
510
511 BUG_ON((!lockres->l_flags & OCFS2_LOCK_BUSY));
512 BUG_ON(lockres->l_flags & OCFS2_LOCK_ATTACHED);
513
514 if (lockres->l_requested > LKM_NLMODE &&
515 !(lockres->l_flags & OCFS2_LOCK_LOCAL))
516 lockres_or_flags(lockres, OCFS2_LOCK_NEEDS_REFRESH);
517
518 lockres->l_level = lockres->l_requested;
519 lockres_or_flags(lockres, OCFS2_LOCK_ATTACHED);
520 lockres_clear_flags(lockres, OCFS2_LOCK_BUSY);
521
522 mlog_exit_void();
523}
524
525static void ocfs2_inode_ast_func(void *opaque)
526{
527 struct ocfs2_lock_res *lockres = opaque;
528 struct inode *inode;
529 struct dlm_lockstatus *lksb;
530 unsigned long flags;
531
532 mlog_entry_void();
533
534 inode = ocfs2_lock_res_inode(lockres);
535
536 mlog(0, "AST fired for inode %"MLFu64", l_action = %u, type = %s\n",
537 OCFS2_I(inode)->ip_blkno, lockres->l_action,
538 ocfs2_lock_type_string(lockres->l_type));
539
540 BUG_ON(!ocfs2_is_inode_lock(lockres));
541
542 spin_lock_irqsave(&lockres->l_lock, flags);
543
544 lksb = &(lockres->l_lksb);
545 if (lksb->status != DLM_NORMAL) {
546 mlog(ML_ERROR, "ocfs2_inode_ast_func: lksb status value of %u "
547 "on inode %"MLFu64"\n", lksb->status,
548 OCFS2_I(inode)->ip_blkno);
549 spin_unlock_irqrestore(&lockres->l_lock, flags);
550 mlog_exit_void();
551 return;
552 }
553
554 switch(lockres->l_action) {
555 case OCFS2_AST_ATTACH:
556 ocfs2_generic_handle_attach_action(lockres);
557 lockres_clear_flags(lockres, OCFS2_LOCK_LOCAL);
558 break;
559 case OCFS2_AST_CONVERT:
560 ocfs2_generic_handle_convert_action(lockres);
561 break;
562 case OCFS2_AST_DOWNCONVERT:
563 ocfs2_generic_handle_downconvert_action(lockres);
564 break;
565 default:
566 mlog(ML_ERROR, "lockres %s: ast fired with invalid action: %u "
567 "lockres flags = 0x%lx, unlock action: %u\n",
568 lockres->l_name, lockres->l_action, lockres->l_flags,
569 lockres->l_unlock_action);
570
571 BUG();
572 }
573
574 /* data and rw locking ignores refresh flag for now. */
575 if (lockres->l_type != OCFS2_LOCK_TYPE_META)
576 lockres_clear_flags(lockres, OCFS2_LOCK_NEEDS_REFRESH);
577
578 /* set it to something invalid so if we get called again we
579 * can catch it. */
580 lockres->l_action = OCFS2_AST_INVALID;
581 spin_unlock_irqrestore(&lockres->l_lock, flags);
582 wake_up(&lockres->l_event);
583
584 mlog_exit_void();
585}
586
587static int ocfs2_generic_handle_bast(struct ocfs2_lock_res *lockres,
588 int level)
589{
590 int needs_downconvert = 0;
591 mlog_entry_void();
592
593 assert_spin_locked(&lockres->l_lock);
594
595 lockres_or_flags(lockres, OCFS2_LOCK_BLOCKED);
596
597 if (level > lockres->l_blocking) {
598 /* only schedule a downconvert if we haven't already scheduled
599 * one that goes low enough to satisfy the level we're
600 * blocking. this also catches the case where we get
601 * duplicate BASTs */
602 if (ocfs2_highest_compat_lock_level(level) <
603 ocfs2_highest_compat_lock_level(lockres->l_blocking))
604 needs_downconvert = 1;
605
606 lockres->l_blocking = level;
607 }
608
609 mlog_exit(needs_downconvert);
610 return needs_downconvert;
611}
612
613static void ocfs2_generic_bast_func(struct ocfs2_super *osb,
614 struct ocfs2_lock_res *lockres,
615 int level)
616{
617 int needs_downconvert;
618 unsigned long flags;
619
620 mlog_entry_void();
621
622 BUG_ON(level <= LKM_NLMODE);
623
624 spin_lock_irqsave(&lockres->l_lock, flags);
625 needs_downconvert = ocfs2_generic_handle_bast(lockres, level);
626 if (needs_downconvert)
627 ocfs2_schedule_blocked_lock(osb, lockres);
628 spin_unlock_irqrestore(&lockres->l_lock, flags);
629
630 ocfs2_kick_vote_thread(osb);
631
632 wake_up(&lockres->l_event);
633 mlog_exit_void();
634}
635
636static void ocfs2_inode_bast_func(void *opaque, int level)
637{
638 struct ocfs2_lock_res *lockres = opaque;
639 struct inode *inode;
640 struct ocfs2_super *osb;
641
642 mlog_entry_void();
643
644 BUG_ON(!ocfs2_is_inode_lock(lockres));
645
646 inode = ocfs2_lock_res_inode(lockres);
647 osb = OCFS2_SB(inode->i_sb);
648
649 mlog(0, "BAST fired for inode %"MLFu64", blocking = %d, level = %d "
650 "type = %s\n", OCFS2_I(inode)->ip_blkno, level,
651 lockres->l_level,
652 ocfs2_lock_type_string(lockres->l_type));
653
654 ocfs2_generic_bast_func(osb, lockres, level);
655
656 mlog_exit_void();
657}
658
659static void ocfs2_generic_ast_func(struct ocfs2_lock_res *lockres,
660 int ignore_refresh)
661{
662 struct dlm_lockstatus *lksb = &lockres->l_lksb;
663 unsigned long flags;
664
665 spin_lock_irqsave(&lockres->l_lock, flags);
666
667 if (lksb->status != DLM_NORMAL) {
668 mlog(ML_ERROR, "lockres %s: lksb status value of %u!\n",
669 lockres->l_name, lksb->status);
670 spin_unlock_irqrestore(&lockres->l_lock, flags);
671 return;
672 }
673
674 switch(lockres->l_action) {
675 case OCFS2_AST_ATTACH:
676 ocfs2_generic_handle_attach_action(lockres);
677 break;
678 case OCFS2_AST_CONVERT:
679 ocfs2_generic_handle_convert_action(lockres);
680 break;
681 case OCFS2_AST_DOWNCONVERT:
682 ocfs2_generic_handle_downconvert_action(lockres);
683 break;
684 default:
685 BUG();
686 }
687
688 if (ignore_refresh)
689 lockres_clear_flags(lockres, OCFS2_LOCK_NEEDS_REFRESH);
690
691 /* set it to something invalid so if we get called again we
692 * can catch it. */
693 lockres->l_action = OCFS2_AST_INVALID;
694 spin_unlock_irqrestore(&lockres->l_lock, flags);
695
696 wake_up(&lockres->l_event);
697}
698
/* AST handler for the superblock lock. The refresh flag is left alone
 * (ignore_refresh = 0). */
static void ocfs2_super_ast_func(void *opaque)
{
	struct ocfs2_lock_res *res = opaque;

	mlog_entry_void();
	mlog(0, "Superblock AST fired\n");

	BUG_ON(!ocfs2_is_super_lock(res));

	ocfs2_generic_ast_func(res, 0);

	mlog_exit_void();
}
711
/* BAST handler for the superblock lock. @opaque is the lock resource,
 * @level the level being blocked. */
static void ocfs2_super_bast_func(void *opaque,
				  int level)
{
	struct ocfs2_lock_res *res = opaque;
	struct ocfs2_super *super;

	mlog_entry_void();
	mlog(0, "Superblock BAST fired\n");

	BUG_ON(!ocfs2_is_super_lock(res));

	super = ocfs2_lock_res_super(res);
	ocfs2_generic_bast_func(super, res, level);

	mlog_exit_void();
}
727
/* AST handler for the rename lock. The refresh flag is always dropped
 * (ignore_refresh = 1). */
static void ocfs2_rename_ast_func(void *opaque)
{
	struct ocfs2_lock_res *res = opaque;

	mlog_entry_void();

	mlog(0, "Rename AST fired\n");

	BUG_ON(!ocfs2_is_rename_lock(res));

	ocfs2_generic_ast_func(res, 1);

	mlog_exit_void();
}
742
/* BAST handler for the rename lock. @opaque is the lock resource,
 * @level the level being blocked. */
static void ocfs2_rename_bast_func(void *opaque,
				   int level)
{
	struct ocfs2_lock_res *res = opaque;
	struct ocfs2_super *super;

	mlog_entry_void();

	mlog(0, "Rename BAST fired\n");

	BUG_ON(!ocfs2_is_rename_lock(res));

	super = ocfs2_lock_res_super(res);
	ocfs2_generic_bast_func(super, res, level);

	mlog_exit_void();
}
760
761static inline void ocfs2_recover_from_dlm_error(struct ocfs2_lock_res *lockres,
762 int convert)
763{
764 unsigned long flags;
765
766 mlog_entry_void();
767 spin_lock_irqsave(&lockres->l_lock, flags);
768 lockres_clear_flags(lockres, OCFS2_LOCK_BUSY);
769 if (convert)
770 lockres->l_action = OCFS2_AST_INVALID;
771 else
772 lockres->l_unlock_action = OCFS2_UNLOCK_INVALID;
773 spin_unlock_irqrestore(&lockres->l_lock, flags);
774
775 wake_up(&lockres->l_event);
776 mlog_exit_void();
777}
778
779/* Note: If we detect another process working on the lock (i.e.,
780 * OCFS2_LOCK_BUSY), we'll bail out returning 0. It's up to the caller
781 * to do the right thing in that case.
782 */
783static int ocfs2_lock_create(struct ocfs2_super *osb,
784 struct ocfs2_lock_res *lockres,
785 int level,
786 int dlm_flags)
787{
788 int ret = 0;
789 enum dlm_status status;
790 unsigned long flags;
791
792 mlog_entry_void();
793
794 mlog(0, "lock %s, level = %d, flags = %d\n", lockres->l_name, level,
795 dlm_flags);
796
797 spin_lock_irqsave(&lockres->l_lock, flags);
798 if ((lockres->l_flags & OCFS2_LOCK_ATTACHED) ||
799 (lockres->l_flags & OCFS2_LOCK_BUSY)) {
800 spin_unlock_irqrestore(&lockres->l_lock, flags);
801 goto bail;
802 }
803
804 lockres->l_action = OCFS2_AST_ATTACH;
805 lockres->l_requested = level;
806 lockres_or_flags(lockres, OCFS2_LOCK_BUSY);
807 spin_unlock_irqrestore(&lockres->l_lock, flags);
808
809 status = dlmlock(osb->dlm,
810 level,
811 &lockres->l_lksb,
812 dlm_flags,
813 lockres->l_name,
814 lockres->l_ops->ast,
815 lockres,
816 lockres->l_ops->bast);
817 if (status != DLM_NORMAL) {
818 ocfs2_log_dlm_error("dlmlock", status, lockres);
819 ret = -EINVAL;
820 ocfs2_recover_from_dlm_error(lockres, 1);
821 }
822
823 mlog(0, "lock %s, successfull return from dlmlock\n", lockres->l_name);
824
825bail:
826 mlog_exit(ret);
827 return ret;
828}
829
830static inline int ocfs2_check_wait_flag(struct ocfs2_lock_res *lockres,
831 int flag)
832{
833 unsigned long flags;
834 int ret;
835
836 spin_lock_irqsave(&lockres->l_lock, flags);
837 ret = lockres->l_flags & flag;
838 spin_unlock_irqrestore(&lockres->l_lock, flags);
839
840 return ret;
841}
842
843static inline void ocfs2_wait_on_busy_lock(struct ocfs2_lock_res *lockres)
844
845{
846 wait_event(lockres->l_event,
847 !ocfs2_check_wait_flag(lockres, OCFS2_LOCK_BUSY));
848}
849
850static inline void ocfs2_wait_on_refreshing_lock(struct ocfs2_lock_res *lockres)
851
852{
853 wait_event(lockres->l_event,
854 !ocfs2_check_wait_flag(lockres, OCFS2_LOCK_REFRESHING));
855}
856
857/* predict what lock level we'll be dropping down to on behalf
858 * of another node, and return true if the currently wanted
859 * level will be compatible with it. */
860static inline int ocfs2_may_continue_on_blocked_lock(struct ocfs2_lock_res *lockres,
861 int wanted)
862{
863 BUG_ON(!(lockres->l_flags & OCFS2_LOCK_BLOCKED));
864
865 return wanted <= ocfs2_highest_compat_lock_level(lockres->l_blocking);
866}
867
868static void ocfs2_init_mask_waiter(struct ocfs2_mask_waiter *mw)
869{
870 INIT_LIST_HEAD(&mw->mw_item);
871 init_completion(&mw->mw_complete);
872}
873
874static int ocfs2_wait_for_mask(struct ocfs2_mask_waiter *mw)
875{
876 wait_for_completion(&mw->mw_complete);
877 /* Re-arm the completion in case we want to wait on it again */
878 INIT_COMPLETION(mw->mw_complete);
879 return mw->mw_status;
880}
881
882static void lockres_add_mask_waiter(struct ocfs2_lock_res *lockres,
883 struct ocfs2_mask_waiter *mw,
884 unsigned long mask,
885 unsigned long goal)
886{
887 BUG_ON(!list_empty(&mw->mw_item));
888
889 assert_spin_locked(&lockres->l_lock);
890
891 list_add_tail(&mw->mw_item, &lockres->l_mask_waiters);
892 mw->mw_mask = mask;
893 mw->mw_goal = goal;
894}
895
896/* returns 0 if the mw that was removed was already satisfied, -EBUSY
897 * if the mask still hadn't reached its goal */
898static int lockres_remove_mask_waiter(struct ocfs2_lock_res *lockres,
899 struct ocfs2_mask_waiter *mw)
900{
901 unsigned long flags;
902 int ret = 0;
903
904 spin_lock_irqsave(&lockres->l_lock, flags);
905 if (!list_empty(&mw->mw_item)) {
906 if ((lockres->l_flags & mw->mw_mask) != mw->mw_goal)
907 ret = -EBUSY;
908
909 list_del_init(&mw->mw_item);
910 init_completion(&mw->mw_complete);
911 }
912 spin_unlock_irqrestore(&lockres->l_lock, flags);
913
914 return ret;
915
916}
917
918static int ocfs2_cluster_lock(struct ocfs2_super *osb,
919 struct ocfs2_lock_res *lockres,
920 int level,
921 int lkm_flags,
922 int arg_flags)
923{
924 struct ocfs2_mask_waiter mw;
925 enum dlm_status status;
926 int wait, catch_signals = !(osb->s_mount_opt & OCFS2_MOUNT_NOINTR);
927 int ret = 0; /* gcc doesn't realize wait = 1 guarantees ret is set */
928 unsigned long flags;
929
930 mlog_entry_void();
931
932 ocfs2_init_mask_waiter(&mw);
933
934again:
935 wait = 0;
936
937 if (catch_signals && signal_pending(current)) {
938 ret = -ERESTARTSYS;
939 goto out;
940 }
941
942 spin_lock_irqsave(&lockres->l_lock, flags);
943
944 mlog_bug_on_msg(lockres->l_flags & OCFS2_LOCK_FREEING,
945 "Cluster lock called on freeing lockres %s! flags "
946 "0x%lx\n", lockres->l_name, lockres->l_flags);
947
948 /* We only compare against the currently granted level
949 * here. If the lock is blocked waiting on a downconvert,
950 * we'll get caught below. */
951 if (lockres->l_flags & OCFS2_LOCK_BUSY &&
952 level > lockres->l_level) {
953 /* is someone sitting in dlm_lock? If so, wait on
954 * them. */
955 lockres_add_mask_waiter(lockres, &mw, OCFS2_LOCK_BUSY, 0);
956 wait = 1;
957 goto unlock;
958 }
959
960 if (!(lockres->l_flags & OCFS2_LOCK_ATTACHED)) {
961 /* lock has not been created yet. */
962 spin_unlock_irqrestore(&lockres->l_lock, flags);
963
964 ret = ocfs2_lock_create(osb, lockres, LKM_NLMODE, 0);
965 if (ret < 0) {
966 mlog_errno(ret);
967 goto out;
968 }
969 goto again;
970 }
971
972 if (lockres->l_flags & OCFS2_LOCK_BLOCKED &&
973 !ocfs2_may_continue_on_blocked_lock(lockres, level)) {
974 /* is the lock is currently blocked on behalf of
975 * another node */
976 lockres_add_mask_waiter(lockres, &mw, OCFS2_LOCK_BLOCKED, 0);
977 wait = 1;
978 goto unlock;
979 }
980
981 if (level > lockres->l_level) {
982 if (lockres->l_action != OCFS2_AST_INVALID)
983 mlog(ML_ERROR, "lockres %s has action %u pending\n",
984 lockres->l_name, lockres->l_action);
985
986 lockres->l_action = OCFS2_AST_CONVERT;
987 lockres->l_requested = level;
988 lockres_or_flags(lockres, OCFS2_LOCK_BUSY);
989 spin_unlock_irqrestore(&lockres->l_lock, flags);
990
991 BUG_ON(level == LKM_IVMODE);
992 BUG_ON(level == LKM_NLMODE);
993
994 mlog(0, "lock %s, convert from %d to level = %d\n",
995 lockres->l_name, lockres->l_level, level);
996
997 /* call dlm_lock to upgrade lock now */
998 status = dlmlock(osb->dlm,
999 level,
1000 &lockres->l_lksb,
1001 lkm_flags|LKM_CONVERT|LKM_VALBLK,
1002 lockres->l_name,
1003 lockres->l_ops->ast,
1004 lockres,
1005 lockres->l_ops->bast);
1006 if (status != DLM_NORMAL) {
1007 if ((lkm_flags & LKM_NOQUEUE) &&
1008 (status == DLM_NOTQUEUED))
1009 ret = -EAGAIN;
1010 else {
1011 ocfs2_log_dlm_error("dlmlock", status,
1012 lockres);
1013 ret = -EINVAL;
1014 }
1015 ocfs2_recover_from_dlm_error(lockres, 1);
1016 goto out;
1017 }
1018
1019 mlog(0, "lock %s, successfull return from dlmlock\n",
1020 lockres->l_name);
1021
1022 /* At this point we've gone inside the dlm and need to
1023 * complete our work regardless. */
1024 catch_signals = 0;
1025
1026 /* wait for busy to clear and carry on */
1027 goto again;
1028 }
1029
1030 /* Ok, if we get here then we're good to go. */
1031 ocfs2_inc_holders(lockres, level);
1032
1033 ret = 0;
1034unlock:
1035 spin_unlock_irqrestore(&lockres->l_lock, flags);
1036out:
1037 /*
1038 * This is helping work around a lock inversion between the page lock
1039 * and dlm locks. One path holds the page lock while calling aops
1040 * which block acquiring dlm locks. The voting thread holds dlm
1041 * locks while acquiring page locks while down converting data locks.
1042 * This block is helping an aop path notice the inversion and back
1043 * off to unlock its page lock before trying the dlm lock again.
1044 */
1045 if (wait && arg_flags & OCFS2_LOCK_NONBLOCK &&
1046 mw.mw_mask & (OCFS2_LOCK_BUSY|OCFS2_LOCK_BLOCKED)) {
1047 wait = 0;
1048 if (lockres_remove_mask_waiter(lockres, &mw))
1049 ret = -EAGAIN;
1050 else
1051 goto again;
1052 }
1053 if (wait) {
1054 ret = ocfs2_wait_for_mask(&mw);
1055 if (ret == 0)
1056 goto again;
1057 mlog_errno(ret);
1058 }
1059
1060 mlog_exit(ret);
1061 return ret;
1062}
1063
1064static void ocfs2_cluster_unlock(struct ocfs2_super *osb,
1065 struct ocfs2_lock_res *lockres,
1066 int level)
1067{
1068 unsigned long flags;
1069
1070 mlog_entry_void();
1071 spin_lock_irqsave(&lockres->l_lock, flags);
1072 ocfs2_dec_holders(lockres, level);
1073 ocfs2_vote_on_unlock(osb, lockres);
1074 spin_unlock_irqrestore(&lockres->l_lock, flags);
1075 mlog_exit_void();
1076}
1077
1078static int ocfs2_create_new_inode_lock(struct inode *inode,
1079 struct ocfs2_lock_res *lockres)
1080{
1081 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
1082 unsigned long flags;
1083
1084 spin_lock_irqsave(&lockres->l_lock, flags);
1085 BUG_ON(lockres->l_flags & OCFS2_LOCK_ATTACHED);
1086 lockres_or_flags(lockres, OCFS2_LOCK_LOCAL);
1087 spin_unlock_irqrestore(&lockres->l_lock, flags);
1088
1089 return ocfs2_lock_create(osb, lockres, LKM_EXMODE, LKM_LOCAL);
1090}
1091
1092/* Grants us an EX lock on the data and metadata resources, skipping
1093 * the normal cluster directory lookup. Use this ONLY on newly created
1094 * inodes which other nodes can't possibly see, and which haven't been
1095 * hashed in the inode hash yet. This can give us a good performance
1096 * increase as it'll skip the network broadcast normally associated
1097 * with creating a new lock resource. */
1098int ocfs2_create_new_inode_locks(struct inode *inode)
1099{
1100 int ret;
1101
1102 BUG_ON(!inode);
1103 BUG_ON(!ocfs2_inode_is_new(inode));
1104
1105 mlog_entry_void();
1106
1107 mlog(0, "Inode %"MLFu64"\n", OCFS2_I(inode)->ip_blkno);
1108
1109 /* NOTE: That we don't increment any of the holder counts, nor
1110 * do we add anything to a journal handle. Since this is
1111 * supposed to be a new inode which the cluster doesn't know
1112 * about yet, there is no need to. As far as the LVB handling
1113 * is concerned, this is basically like acquiring an EX lock
1114 * on a resource which has an invalid one -- we'll set it
1115 * valid when we release the EX. */
1116
1117 ret = ocfs2_create_new_inode_lock(inode,
1118 &OCFS2_I(inode)->ip_rw_lockres);
1119 if (ret) {
1120 mlog_errno(ret);
1121 goto bail;
1122 }
1123
1124 ret = ocfs2_create_new_inode_lock(inode,
1125 &OCFS2_I(inode)->ip_meta_lockres);
1126 if (ret) {
1127 mlog_errno(ret);
1128 goto bail;
1129 }
1130
1131 ret = ocfs2_create_new_inode_lock(inode,
1132 &OCFS2_I(inode)->ip_data_lockres);
1133 if (ret) {
1134 mlog_errno(ret);
1135 goto bail;
1136 }
1137
1138bail:
1139 mlog_exit(ret);
1140 return ret;
1141}
1142
1143int ocfs2_rw_lock(struct inode *inode, int write)
1144{
1145 int status, level;
1146 struct ocfs2_lock_res *lockres;
1147
1148 BUG_ON(!inode);
1149
1150 mlog_entry_void();
1151
1152 mlog(0, "inode %"MLFu64" take %s RW lock\n",
1153 OCFS2_I(inode)->ip_blkno,
1154 write ? "EXMODE" : "PRMODE");
1155
1156 lockres = &OCFS2_I(inode)->ip_rw_lockres;
1157
1158 level = write ? LKM_EXMODE : LKM_PRMODE;
1159
1160 status = ocfs2_cluster_lock(OCFS2_SB(inode->i_sb), lockres, level, 0,
1161 0);
1162 if (status < 0)
1163 mlog_errno(status);
1164
1165 mlog_exit(status);
1166 return status;
1167}
1168
1169void ocfs2_rw_unlock(struct inode *inode, int write)
1170{
1171 int level = write ? LKM_EXMODE : LKM_PRMODE;
1172 struct ocfs2_lock_res *lockres = &OCFS2_I(inode)->ip_rw_lockres;
1173
1174 mlog_entry_void();
1175
1176 mlog(0, "inode %"MLFu64" drop %s RW lock\n",
1177 OCFS2_I(inode)->ip_blkno,
1178 write ? "EXMODE" : "PRMODE");
1179
1180 ocfs2_cluster_unlock(OCFS2_SB(inode->i_sb), lockres, level);
1181
1182 mlog_exit_void();
1183}
1184
1185int ocfs2_data_lock_full(struct inode *inode,
1186 int write,
1187 int arg_flags)
1188{
1189 int status = 0, level;
1190 struct ocfs2_lock_res *lockres;
1191
1192 BUG_ON(!inode);
1193
1194 mlog_entry_void();
1195
1196 mlog(0, "inode %"MLFu64" take %s DATA lock\n",
1197 OCFS2_I(inode)->ip_blkno,
1198 write ? "EXMODE" : "PRMODE");
1199
1200 /* We'll allow faking a readonly data lock for
1201 * rodevices. */
1202 if (ocfs2_is_hard_readonly(OCFS2_SB(inode->i_sb))) {
1203 if (write) {
1204 status = -EROFS;
1205 mlog_errno(status);
1206 }
1207 goto out;
1208 }
1209
1210 lockres = &OCFS2_I(inode)->ip_data_lockres;
1211
1212 level = write ? LKM_EXMODE : LKM_PRMODE;
1213
1214 status = ocfs2_cluster_lock(OCFS2_SB(inode->i_sb), lockres, level,
1215 0, arg_flags);
1216 if (status < 0 && status != -EAGAIN)
1217 mlog_errno(status);
1218
1219out:
1220 mlog_exit(status);
1221 return status;
1222}
1223
1224/* see ocfs2_meta_lock_with_page() */
1225int ocfs2_data_lock_with_page(struct inode *inode,
1226 int write,
1227 struct page *page)
1228{
1229 int ret;
1230
1231 ret = ocfs2_data_lock_full(inode, write, OCFS2_LOCK_NONBLOCK);
1232 if (ret == -EAGAIN) {
1233 unlock_page(page);
1234 if (ocfs2_data_lock(inode, write) == 0)
1235 ocfs2_data_unlock(inode, write);
1236 ret = AOP_TRUNCATED_PAGE;
1237 }
1238
1239 return ret;
1240}
1241
1242static void ocfs2_vote_on_unlock(struct ocfs2_super *osb,
1243 struct ocfs2_lock_res *lockres)
1244{
1245 int kick = 0;
1246
1247 mlog_entry_void();
1248
1249 /* If we know that another node is waiting on our lock, kick
1250 * the vote thread * pre-emptively when we reach a release
1251 * condition. */
1252 if (lockres->l_flags & OCFS2_LOCK_BLOCKED) {
1253 switch(lockres->l_blocking) {
1254 case LKM_EXMODE:
1255 if (!lockres->l_ex_holders && !lockres->l_ro_holders)
1256 kick = 1;
1257 break;
1258 case LKM_PRMODE:
1259 if (!lockres->l_ex_holders)
1260 kick = 1;
1261 break;
1262 default:
1263 BUG();
1264 }
1265 }
1266
1267 if (kick)
1268 ocfs2_kick_vote_thread(osb);
1269
1270 mlog_exit_void();
1271}
1272
1273void ocfs2_data_unlock(struct inode *inode,
1274 int write)
1275{
1276 int level = write ? LKM_EXMODE : LKM_PRMODE;
1277 struct ocfs2_lock_res *lockres = &OCFS2_I(inode)->ip_data_lockres;
1278
1279 mlog_entry_void();
1280
1281 mlog(0, "inode %"MLFu64" drop %s DATA lock\n",
1282 OCFS2_I(inode)->ip_blkno,
1283 write ? "EXMODE" : "PRMODE");
1284
1285 if (!ocfs2_is_hard_readonly(OCFS2_SB(inode->i_sb)))
1286 ocfs2_cluster_unlock(OCFS2_SB(inode->i_sb), lockres, level);
1287
1288 mlog_exit_void();
1289}
1290
/* Layout of a timespec packed into 64 bits: seconds are shifted up by
 * OCFS2_SEC_SHIFT, leaving the low OCFS2_SEC_SHIFT bits (selected by
 * OCFS2_NSEC_MASK) for the nanoseconds. The shift is derived from
 * OCFS2_SEC_BITS so the two cannot drift apart. */
#define OCFS2_SEC_BITS   34
#define OCFS2_SEC_SHIFT  (64 - OCFS2_SEC_BITS)
#define OCFS2_NSEC_MASK  ((1ULL << OCFS2_SEC_SHIFT) - 1)
1294
1295/* LVB only has room for 64 bits of time here so we pack it for
1296 * now. */
1297static u64 ocfs2_pack_timespec(struct timespec *spec)
1298{
1299 u64 res;
1300 u64 sec = spec->tv_sec;
1301 u32 nsec = spec->tv_nsec;
1302
1303 res = (sec << OCFS2_SEC_SHIFT) | (nsec & OCFS2_NSEC_MASK);
1304
1305 return res;
1306}
1307
1308/* Call this with the lockres locked. I am reasonably sure we don't
1309 * need ip_lock in this function as anyone who would be changing those
1310 * values is supposed to be blocked in ocfs2_meta_lock right now. */
1311static void __ocfs2_stuff_meta_lvb(struct inode *inode)
1312{
1313 struct ocfs2_inode_info *oi = OCFS2_I(inode);
1314 struct ocfs2_lock_res *lockres = &oi->ip_meta_lockres;
1315 struct ocfs2_meta_lvb *lvb;
1316
1317 mlog_entry_void();
1318
1319 lvb = (struct ocfs2_meta_lvb *) lockres->l_lksb.lvb;
1320
1321 lvb->lvb_version = cpu_to_be32(OCFS2_LVB_VERSION);
1322 lvb->lvb_isize = cpu_to_be64(i_size_read(inode));
1323 lvb->lvb_iclusters = cpu_to_be32(oi->ip_clusters);
1324 lvb->lvb_iuid = cpu_to_be32(inode->i_uid);
1325 lvb->lvb_igid = cpu_to_be32(inode->i_gid);
1326 lvb->lvb_imode = cpu_to_be16(inode->i_mode);
1327 lvb->lvb_inlink = cpu_to_be16(inode->i_nlink);
1328 lvb->lvb_iatime_packed =
1329 cpu_to_be64(ocfs2_pack_timespec(&inode->i_atime));
1330 lvb->lvb_ictime_packed =
1331 cpu_to_be64(ocfs2_pack_timespec(&inode->i_ctime));
1332 lvb->lvb_imtime_packed =
1333 cpu_to_be64(ocfs2_pack_timespec(&inode->i_mtime));
1334
1335 mlog_meta_lvb(0, lockres);
1336
1337 mlog_exit_void();
1338}
1339
1340static void ocfs2_unpack_timespec(struct timespec *spec,
1341 u64 packed_time)
1342{
1343 spec->tv_sec = packed_time >> OCFS2_SEC_SHIFT;
1344 spec->tv_nsec = packed_time & OCFS2_NSEC_MASK;
1345}
1346
1347static void ocfs2_refresh_inode_from_lvb(struct inode *inode)
1348{
1349 struct ocfs2_inode_info *oi = OCFS2_I(inode);
1350 struct ocfs2_lock_res *lockres = &oi->ip_meta_lockres;
1351 struct ocfs2_meta_lvb *lvb;
1352
1353 mlog_entry_void();
1354
1355 mlog_meta_lvb(0, lockres);
1356
1357 lvb = (struct ocfs2_meta_lvb *) lockres->l_lksb.lvb;
1358
1359 /* We're safe here without the lockres lock... */
1360 spin_lock(&oi->ip_lock);
1361 oi->ip_clusters = be32_to_cpu(lvb->lvb_iclusters);
1362 i_size_write(inode, be64_to_cpu(lvb->lvb_isize));
1363
1364 /* fast-symlinks are a special case */
1365 if (S_ISLNK(inode->i_mode) && !oi->ip_clusters)
1366 inode->i_blocks = 0;
1367 else
1368 inode->i_blocks =
1369 ocfs2_align_bytes_to_sectors(i_size_read(inode));
1370
1371 inode->i_uid = be32_to_cpu(lvb->lvb_iuid);
1372 inode->i_gid = be32_to_cpu(lvb->lvb_igid);
1373 inode->i_mode = be16_to_cpu(lvb->lvb_imode);
1374 inode->i_nlink = be16_to_cpu(lvb->lvb_inlink);
1375 ocfs2_unpack_timespec(&inode->i_atime,
1376 be64_to_cpu(lvb->lvb_iatime_packed));
1377 ocfs2_unpack_timespec(&inode->i_mtime,
1378 be64_to_cpu(lvb->lvb_imtime_packed));
1379 ocfs2_unpack_timespec(&inode->i_ctime,
1380 be64_to_cpu(lvb->lvb_ictime_packed));
1381 spin_unlock(&oi->ip_lock);
1382
1383 mlog_exit_void();
1384}
1385
1386static inline int ocfs2_meta_lvb_is_trustable(struct ocfs2_lock_res *lockres)
1387{
1388 struct ocfs2_meta_lvb *lvb = (struct ocfs2_meta_lvb *) lockres->l_lksb.lvb;
1389
1390 if (be32_to_cpu(lvb->lvb_version) == OCFS2_LVB_VERSION)
1391 return 1;
1392 return 0;
1393}
1394
1395/* Determine whether a lock resource needs to be refreshed, and
1396 * arbitrate who gets to refresh it.
1397 *
1398 * 0 means no refresh needed.
1399 *
1400 * > 0 means you need to refresh this and you MUST call
1401 * ocfs2_complete_lock_res_refresh afterwards. */
1402static int ocfs2_should_refresh_lock_res(struct ocfs2_lock_res *lockres)
1403{
1404 unsigned long flags;
1405 int status = 0;
1406
1407 mlog_entry_void();
1408
1409refresh_check:
1410 spin_lock_irqsave(&lockres->l_lock, flags);
1411 if (!(lockres->l_flags & OCFS2_LOCK_NEEDS_REFRESH)) {
1412 spin_unlock_irqrestore(&lockres->l_lock, flags);
1413 goto bail;
1414 }
1415
1416 if (lockres->l_flags & OCFS2_LOCK_REFRESHING) {
1417 spin_unlock_irqrestore(&lockres->l_lock, flags);
1418
1419 ocfs2_wait_on_refreshing_lock(lockres);
1420 goto refresh_check;
1421 }
1422
1423 /* Ok, I'll be the one to refresh this lock. */
1424 lockres_or_flags(lockres, OCFS2_LOCK_REFRESHING);
1425 spin_unlock_irqrestore(&lockres->l_lock, flags);
1426
1427 status = 1;
1428bail:
1429 mlog_exit(status);
1430 return status;
1431}
1432
1433/* If status is non zero, I'll mark it as not being in refresh
1434 * anymroe, but i won't clear the needs refresh flag. */
1435static inline void ocfs2_complete_lock_res_refresh(struct ocfs2_lock_res *lockres,
1436 int status)
1437{
1438 unsigned long flags;
1439 mlog_entry_void();
1440
1441 spin_lock_irqsave(&lockres->l_lock, flags);
1442 lockres_clear_flags(lockres, OCFS2_LOCK_REFRESHING);
1443 if (!status)
1444 lockres_clear_flags(lockres, OCFS2_LOCK_NEEDS_REFRESH);
1445 spin_unlock_irqrestore(&lockres->l_lock, flags);
1446
1447 wake_up(&lockres->l_event);
1448
1449 mlog_exit_void();
1450}
1451
1452/* may or may not return a bh if it went to disk. */
1453static int ocfs2_meta_lock_update(struct inode *inode,
1454 struct buffer_head **bh)
1455{
1456 int status = 0;
1457 struct ocfs2_inode_info *oi = OCFS2_I(inode);
1458 struct ocfs2_lock_res *lockres;
1459 struct ocfs2_dinode *fe;
1460
1461 mlog_entry_void();
1462
1463 spin_lock(&oi->ip_lock);
1464 if (oi->ip_flags & OCFS2_INODE_DELETED) {
1465 mlog(0, "Orphaned inode %"MLFu64" was deleted while we "
1466 "were waiting on a lock. ip_flags = 0x%x\n",
1467 oi->ip_blkno, oi->ip_flags);
1468 spin_unlock(&oi->ip_lock);
1469 status = -ENOENT;
1470 goto bail;
1471 }
1472 spin_unlock(&oi->ip_lock);
1473
1474 lockres = &oi->ip_meta_lockres;
1475
1476 if (!ocfs2_should_refresh_lock_res(lockres))
1477 goto bail;
1478
1479 /* This will discard any caching information we might have had
1480 * for the inode metadata. */
1481 ocfs2_metadata_cache_purge(inode);
1482
1483 /* will do nothing for inode types that don't use the extent
1484 * map (directories, bitmap files, etc) */
1485 ocfs2_extent_map_trunc(inode, 0);
1486
1487 if (ocfs2_meta_lvb_is_trustable(lockres)) {
1488 mlog(0, "Trusting LVB on inode %"MLFu64"\n",
1489 oi->ip_blkno);
1490 ocfs2_refresh_inode_from_lvb(inode);
1491 } else {
1492 /* Boo, we have to go to disk. */
1493 /* read bh, cast, ocfs2_refresh_inode */
1494 status = ocfs2_read_block(OCFS2_SB(inode->i_sb), oi->ip_blkno,
1495 bh, OCFS2_BH_CACHED, inode);
1496 if (status < 0) {
1497 mlog_errno(status);
1498 goto bail_refresh;
1499 }
1500 fe = (struct ocfs2_dinode *) (*bh)->b_data;
1501
1502 /* This is a good chance to make sure we're not
1503 * locking an invalid object.
1504 *
1505 * We bug on a stale inode here because we checked
1506 * above whether it was wiped from disk. The wiping
1507 * node provides a guarantee that we receive that
1508 * message and can mark the inode before dropping any
1509 * locks associated with it. */
1510 if (!OCFS2_IS_VALID_DINODE(fe)) {
1511 OCFS2_RO_ON_INVALID_DINODE(inode->i_sb, fe);
1512 status = -EIO;
1513 goto bail_refresh;
1514 }
1515 mlog_bug_on_msg(inode->i_generation !=
1516 le32_to_cpu(fe->i_generation),
1517 "Invalid dinode %"MLFu64" disk generation: %u "
1518 "inode->i_generation: %u\n",
1519 oi->ip_blkno, le32_to_cpu(fe->i_generation),
1520 inode->i_generation);
1521 mlog_bug_on_msg(le64_to_cpu(fe->i_dtime) ||
1522 !(fe->i_flags & cpu_to_le32(OCFS2_VALID_FL)),
1523 "Stale dinode %"MLFu64" dtime: %"MLFu64" "
1524 "flags: 0x%x\n", oi->ip_blkno,
1525 le64_to_cpu(fe->i_dtime),
1526 le32_to_cpu(fe->i_flags));
1527
1528 ocfs2_refresh_inode(inode, fe);
1529 }
1530
1531 status = 0;
1532bail_refresh:
1533 ocfs2_complete_lock_res_refresh(lockres, status);
1534bail:
1535 mlog_exit(status);
1536 return status;
1537}
1538
1539static int ocfs2_assign_bh(struct inode *inode,
1540 struct buffer_head **ret_bh,
1541 struct buffer_head *passed_bh)
1542{
1543 int status;
1544
1545 if (passed_bh) {
1546 /* Ok, the update went to disk for us, use the
1547 * returned bh. */
1548 *ret_bh = passed_bh;
1549 get_bh(*ret_bh);
1550
1551 return 0;
1552 }
1553
1554 status = ocfs2_read_block(OCFS2_SB(inode->i_sb),
1555 OCFS2_I(inode)->ip_blkno,
1556 ret_bh,
1557 OCFS2_BH_CACHED,
1558 inode);
1559 if (status < 0)
1560 mlog_errno(status);
1561
1562 return status;
1563}
1564
1565/*
1566 * returns < 0 error if the callback will never be called, otherwise
1567 * the result of the lock will be communicated via the callback.
1568 */
1569int ocfs2_meta_lock_full(struct inode *inode,
1570 struct ocfs2_journal_handle *handle,
1571 struct buffer_head **ret_bh,
1572 int ex,
1573 int arg_flags)
1574{
1575 int status, level, dlm_flags, acquired;
1576 struct ocfs2_lock_res *lockres;
1577 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
1578 struct buffer_head *local_bh = NULL;
1579
1580 BUG_ON(!inode);
1581
1582 mlog_entry_void();
1583
1584 mlog(0, "inode %"MLFu64", take %s META lock\n",
1585 OCFS2_I(inode)->ip_blkno,
1586 ex ? "EXMODE" : "PRMODE");
1587
1588 status = 0;
1589 acquired = 0;
1590 /* We'll allow faking a readonly metadata lock for
1591 * rodevices. */
1592 if (ocfs2_is_hard_readonly(osb)) {
1593 if (ex)
1594 status = -EROFS;
1595 goto bail;
1596 }
1597
1598 if (!(arg_flags & OCFS2_META_LOCK_RECOVERY))
1599 wait_event(osb->recovery_event,
1600 ocfs2_node_map_is_empty(osb, &osb->recovery_map));
1601
1602 acquired = 0;
1603 lockres = &OCFS2_I(inode)->ip_meta_lockres;
1604 level = ex ? LKM_EXMODE : LKM_PRMODE;
1605 dlm_flags = 0;
1606 if (arg_flags & OCFS2_META_LOCK_NOQUEUE)
1607 dlm_flags |= LKM_NOQUEUE;
1608
1609 status = ocfs2_cluster_lock(osb, lockres, level, dlm_flags, arg_flags);
1610 if (status < 0) {
1611 if (status != -EAGAIN && status != -EIOCBRETRY)
1612 mlog_errno(status);
1613 goto bail;
1614 }
1615
1616 /* Notify the error cleanup path to drop the cluster lock. */
1617 acquired = 1;
1618
1619 /* We wait twice because a node may have died while we were in
1620 * the lower dlm layers. The second time though, we've
1621 * committed to owning this lock so we don't allow signals to
1622 * abort the operation. */
1623 if (!(arg_flags & OCFS2_META_LOCK_RECOVERY))
1624 wait_event(osb->recovery_event,
1625 ocfs2_node_map_is_empty(osb, &osb->recovery_map));
1626
1627 /* This is fun. The caller may want a bh back, or it may
1628 * not. ocfs2_meta_lock_update definitely wants one in, but
1629 * may or may not read one, depending on what's in the
1630 * LVB. The result of all of this is that we've *only* gone to
1631 * disk if we have to, so the complexity is worthwhile. */
1632 status = ocfs2_meta_lock_update(inode, &local_bh);
1633 if (status < 0) {
1634 if (status != -ENOENT)
1635 mlog_errno(status);
1636 goto bail;
1637 }
1638
1639 if (ret_bh) {
1640 status = ocfs2_assign_bh(inode, ret_bh, local_bh);
1641 if (status < 0) {
1642 mlog_errno(status);
1643 goto bail;
1644 }
1645 }
1646
1647 if (handle) {
1648 status = ocfs2_handle_add_lock(handle, inode);
1649 if (status < 0)
1650 mlog_errno(status);
1651 }
1652
1653bail:
1654 if (status < 0) {
1655 if (ret_bh && (*ret_bh)) {
1656 brelse(*ret_bh);
1657 *ret_bh = NULL;
1658 }
1659 if (acquired)
1660 ocfs2_meta_unlock(inode, ex);
1661 }
1662
1663 if (local_bh)
1664 brelse(local_bh);
1665
1666 mlog_exit(status);
1667 return status;
1668}
1669
1670/*
1671 * This is working around a lock inversion between tasks acquiring DLM locks
1672 * while holding a page lock and the vote thread which blocks dlm lock acquiry
1673 * while acquiring page locks.
1674 *
1675 * ** These _with_page variantes are only intended to be called from aop
1676 * methods that hold page locks and return a very specific *positive* error
1677 * code that aop methods pass up to the VFS -- test for errors with != 0. **
1678 *
1679 * The DLM is called such that it returns -EAGAIN if it would have blocked
1680 * waiting for the vote thread. In that case we unlock our page so the vote
1681 * thread can make progress. Once we've done this we have to return
1682 * AOP_TRUNCATED_PAGE so the aop method that called us can bubble that back up
1683 * into the VFS who will then immediately retry the aop call.
1684 *
1685 * We do a blocking lock and immediate unlock before returning, though, so that
1686 * the lock has a great chance of being cached on this node by the time the VFS
1687 * calls back to retry the aop. This has a potential to livelock as nodes
1688 * ping locks back and forth, but that's a risk we're willing to take to avoid
1689 * the lock inversion simply.
1690 */
1691int ocfs2_meta_lock_with_page(struct inode *inode,
1692 struct ocfs2_journal_handle *handle,
1693 struct buffer_head **ret_bh,
1694 int ex,
1695 struct page *page)
1696{
1697 int ret;
1698
1699 ret = ocfs2_meta_lock_full(inode, handle, ret_bh, ex,
1700 OCFS2_LOCK_NONBLOCK);
1701 if (ret == -EAGAIN) {
1702 unlock_page(page);
1703 if (ocfs2_meta_lock(inode, handle, ret_bh, ex) == 0)
1704 ocfs2_meta_unlock(inode, ex);
1705 ret = AOP_TRUNCATED_PAGE;
1706 }
1707
1708 return ret;
1709}
1710
1711void ocfs2_meta_unlock(struct inode *inode,
1712 int ex)
1713{
1714 int level = ex ? LKM_EXMODE : LKM_PRMODE;
1715 struct ocfs2_lock_res *lockres = &OCFS2_I(inode)->ip_meta_lockres;
1716
1717 mlog_entry_void();
1718
1719 mlog(0, "inode %"MLFu64" drop %s META lock\n",
1720 OCFS2_I(inode)->ip_blkno,
1721 ex ? "EXMODE" : "PRMODE");
1722
1723 if (!ocfs2_is_hard_readonly(OCFS2_SB(inode->i_sb)))
1724 ocfs2_cluster_unlock(OCFS2_SB(inode->i_sb), lockres, level);
1725
1726 mlog_exit_void();
1727}
1728
1729int ocfs2_super_lock(struct ocfs2_super *osb,
1730 int ex)
1731{
1732 int status;
1733 int level = ex ? LKM_EXMODE : LKM_PRMODE;
1734 struct ocfs2_lock_res *lockres = &osb->osb_super_lockres;
1735 struct buffer_head *bh;
1736 struct ocfs2_slot_info *si = osb->slot_info;
1737
1738 mlog_entry_void();
1739
1740 if (ocfs2_is_hard_readonly(osb))
1741 return -EROFS;
1742
1743 status = ocfs2_cluster_lock(osb, lockres, level, 0, 0);
1744 if (status < 0) {
1745 mlog_errno(status);
1746 goto bail;
1747 }
1748
1749 /* The super block lock path is really in the best position to
1750 * know when resources covered by the lock need to be
1751 * refreshed, so we do it here. Of course, making sense of
1752 * everything is up to the caller :) */
1753 status = ocfs2_should_refresh_lock_res(lockres);
1754 if (status < 0) {
1755 mlog_errno(status);
1756 goto bail;
1757 }
1758 if (status) {
1759 bh = si->si_bh;
1760 status = ocfs2_read_block(osb, bh->b_blocknr, &bh, 0,
1761 si->si_inode);
1762 if (status == 0)
1763 ocfs2_update_slot_info(si);
1764
1765 ocfs2_complete_lock_res_refresh(lockres, status);
1766
1767 if (status < 0)
1768 mlog_errno(status);
1769 }
1770bail:
1771 mlog_exit(status);
1772 return status;
1773}
1774
1775void ocfs2_super_unlock(struct ocfs2_super *osb,
1776 int ex)
1777{
1778 int level = ex ? LKM_EXMODE : LKM_PRMODE;
1779 struct ocfs2_lock_res *lockres = &osb->osb_super_lockres;
1780
1781 ocfs2_cluster_unlock(osb, lockres, level);
1782}
1783
1784int ocfs2_rename_lock(struct ocfs2_super *osb)
1785{
1786 int status;
1787 struct ocfs2_lock_res *lockres = &osb->osb_rename_lockres;
1788
1789 if (ocfs2_is_hard_readonly(osb))
1790 return -EROFS;
1791
1792 status = ocfs2_cluster_lock(osb, lockres, LKM_EXMODE, 0, 0);
1793 if (status < 0)
1794 mlog_errno(status);
1795
1796 return status;
1797}
1798
1799void ocfs2_rename_unlock(struct ocfs2_super *osb)
1800{
1801 struct ocfs2_lock_res *lockres = &osb->osb_rename_lockres;
1802
1803 ocfs2_cluster_unlock(osb, lockres, LKM_EXMODE);
1804}
1805
1806/* Reference counting of the dlm debug structure. We want this because
1807 * open references on the debug inodes can live on after a mount, so
1808 * we can't rely on the ocfs2_super to always exist. */
1809static void ocfs2_dlm_debug_free(struct kref *kref)
1810{
1811 struct ocfs2_dlm_debug *dlm_debug;
1812
1813 dlm_debug = container_of(kref, struct ocfs2_dlm_debug, d_refcnt);
1814
1815 kfree(dlm_debug);
1816}
1817
1818void ocfs2_put_dlm_debug(struct ocfs2_dlm_debug *dlm_debug)
1819{
1820 if (dlm_debug)
1821 kref_put(&dlm_debug->d_refcnt, ocfs2_dlm_debug_free);
1822}
1823
1824static void ocfs2_get_dlm_debug(struct ocfs2_dlm_debug *debug)
1825{
1826 kref_get(&debug->d_refcnt);
1827}
1828
1829struct ocfs2_dlm_debug *ocfs2_new_dlm_debug(void)
1830{
1831 struct ocfs2_dlm_debug *dlm_debug;
1832
1833 dlm_debug = kmalloc(sizeof(struct ocfs2_dlm_debug), GFP_KERNEL);
1834 if (!dlm_debug) {
1835 mlog_errno(-ENOMEM);
1836 goto out;
1837 }
1838
1839 kref_init(&dlm_debug->d_refcnt);
1840 INIT_LIST_HEAD(&dlm_debug->d_lockres_tracking);
1841 dlm_debug->d_locking_state = NULL;
1842out:
1843 return dlm_debug;
1844}
1845
1846/* Access to this is arbitrated for us via seq_file->sem. */
1847struct ocfs2_dlm_seq_priv {
1848 struct ocfs2_dlm_debug *p_dlm_debug;
1849 struct ocfs2_lock_res p_iter_res;
1850 struct ocfs2_lock_res p_tmp_res;
1851};
1852
1853static struct ocfs2_lock_res *ocfs2_dlm_next_res(struct ocfs2_lock_res *start,
1854 struct ocfs2_dlm_seq_priv *priv)
1855{
1856 struct ocfs2_lock_res *iter, *ret = NULL;
1857 struct ocfs2_dlm_debug *dlm_debug = priv->p_dlm_debug;
1858
1859 assert_spin_locked(&ocfs2_dlm_tracking_lock);
1860
1861 list_for_each_entry(iter, &start->l_debug_list, l_debug_list) {
1862 /* discover the head of the list */
1863 if (&iter->l_debug_list == &dlm_debug->d_lockres_tracking) {
1864 mlog(0, "End of list found, %p\n", ret);
1865 break;
1866 }
1867
1868 /* We track our "dummy" iteration lockres' by a NULL
1869 * l_ops field. */
1870 if (iter->l_ops != NULL) {
1871 ret = iter;
1872 break;
1873 }
1874 }
1875
1876 return ret;
1877}
1878
1879static void *ocfs2_dlm_seq_start(struct seq_file *m, loff_t *pos)
1880{
1881 struct ocfs2_dlm_seq_priv *priv = m->private;
1882 struct ocfs2_lock_res *iter;
1883
1884 spin_lock(&ocfs2_dlm_tracking_lock);
1885 iter = ocfs2_dlm_next_res(&priv->p_iter_res, priv);
1886 if (iter) {
1887 /* Since lockres' have the lifetime of their container
1888 * (which can be inodes, ocfs2_supers, etc) we want to
1889 * copy this out to a temporary lockres while still
1890 * under the spinlock. Obviously after this we can't
1891 * trust any pointers on the copy returned, but that's
1892 * ok as the information we want isn't typically held
1893 * in them. */
1894 priv->p_tmp_res = *iter;
1895 iter = &priv->p_tmp_res;
1896 }
1897 spin_unlock(&ocfs2_dlm_tracking_lock);
1898
1899 return iter;
1900}
1901
/* seq_file ->stop: intentionally empty, ->start and ->next leave no
 * lock held and nothing to release. */
static void ocfs2_dlm_seq_stop(struct seq_file *m, void *v)
{
	return;
}
1905
1906static void *ocfs2_dlm_seq_next(struct seq_file *m, void *v, loff_t *pos)
1907{
1908 struct ocfs2_dlm_seq_priv *priv = m->private;
1909 struct ocfs2_lock_res *iter = v;
1910 struct ocfs2_lock_res *dummy = &priv->p_iter_res;
1911
1912 spin_lock(&ocfs2_dlm_tracking_lock);
1913 iter = ocfs2_dlm_next_res(iter, priv);
1914 list_del_init(&dummy->l_debug_list);
1915 if (iter) {
1916 list_add(&dummy->l_debug_list, &iter->l_debug_list);
1917 priv->p_tmp_res = *iter;
1918 iter = &priv->p_tmp_res;
1919 }
1920 spin_unlock(&ocfs2_dlm_tracking_lock);
1921
1922 return iter;
1923}
1924
1925/* So that debugfs.ocfs2 can determine which format is being used */
1926#define OCFS2_DLM_DEBUG_STR_VERSION 1
1927static int ocfs2_dlm_seq_show(struct seq_file *m, void *v)
1928{
1929 int i;
1930 char *lvb;
1931 struct ocfs2_lock_res *lockres = v;
1932
1933 if (!lockres)
1934 return -EINVAL;
1935
1936 seq_printf(m, "0x%x\t"
1937 "%.*s\t"
1938 "%d\t"
1939 "0x%lx\t"
1940 "0x%x\t"
1941 "0x%x\t"
1942 "%u\t"
1943 "%u\t"
1944 "%d\t"
1945 "%d\t",
1946 OCFS2_DLM_DEBUG_STR_VERSION,
1947 OCFS2_LOCK_ID_MAX_LEN, lockres->l_name,
1948 lockres->l_level,
1949 lockres->l_flags,
1950 lockres->l_action,
1951 lockres->l_unlock_action,
1952 lockres->l_ro_holders,
1953 lockres->l_ex_holders,
1954 lockres->l_requested,
1955 lockres->l_blocking);
1956
1957 /* Dump the raw LVB */
1958 lvb = lockres->l_lksb.lvb;
1959 for(i = 0; i < DLM_LVB_LEN; i++)
1960 seq_printf(m, "0x%x\t", lvb[i]);
1961
1962 /* End the line */
1963 seq_printf(m, "\n");
1964 return 0;
1965}
1966
1967static struct seq_operations ocfs2_dlm_seq_ops = {
1968 .start = ocfs2_dlm_seq_start,
1969 .stop = ocfs2_dlm_seq_stop,
1970 .next = ocfs2_dlm_seq_next,
1971 .show = ocfs2_dlm_seq_show,
1972};
1973
1974static int ocfs2_dlm_debug_release(struct inode *inode, struct file *file)
1975{
1976 struct seq_file *seq = (struct seq_file *) file->private_data;
1977 struct ocfs2_dlm_seq_priv *priv = seq->private;
1978 struct ocfs2_lock_res *res = &priv->p_iter_res;
1979
1980 ocfs2_remove_lockres_tracking(res);
1981 ocfs2_put_dlm_debug(priv->p_dlm_debug);
1982 return seq_release_private(inode, file);
1983}
1984
1985static int ocfs2_dlm_debug_open(struct inode *inode, struct file *file)
1986{
1987 int ret;
1988 struct ocfs2_dlm_seq_priv *priv;
1989 struct seq_file *seq;
1990 struct ocfs2_super *osb;
1991
1992 priv = kzalloc(sizeof(struct ocfs2_dlm_seq_priv), GFP_KERNEL);
1993 if (!priv) {
1994 ret = -ENOMEM;
1995 mlog_errno(ret);
1996 goto out;
1997 }
1998 osb = (struct ocfs2_super *) inode->u.generic_ip;
1999 ocfs2_get_dlm_debug(osb->osb_dlm_debug);
2000 priv->p_dlm_debug = osb->osb_dlm_debug;
2001 INIT_LIST_HEAD(&priv->p_iter_res.l_debug_list);
2002
2003 ret = seq_open(file, &ocfs2_dlm_seq_ops);
2004 if (ret) {
2005 kfree(priv);
2006 mlog_errno(ret);
2007 goto out;
2008 }
2009
2010 seq = (struct seq_file *) file->private_data;
2011 seq->private = priv;
2012
2013 ocfs2_add_lockres_tracking(&priv->p_iter_res,
2014 priv->p_dlm_debug);
2015
2016out:
2017 return ret;
2018}
2019
2020static struct file_operations ocfs2_dlm_debug_fops = {
2021 .open = ocfs2_dlm_debug_open,
2022 .release = ocfs2_dlm_debug_release,
2023 .read = seq_read,
2024 .llseek = seq_lseek,
2025};
2026
2027static int ocfs2_dlm_init_debug(struct ocfs2_super *osb)
2028{
2029 int ret = 0;
2030 struct ocfs2_dlm_debug *dlm_debug = osb->osb_dlm_debug;
2031
2032 dlm_debug->d_locking_state = debugfs_create_file("locking_state",
2033 S_IFREG|S_IRUSR,
2034 osb->osb_debug_root,
2035 osb,
2036 &ocfs2_dlm_debug_fops);
2037 if (!dlm_debug->d_locking_state) {
2038 ret = -EINVAL;
2039 mlog(ML_ERROR,
2040 "Unable to create locking state debugfs file.\n");
2041 goto out;
2042 }
2043
2044 ocfs2_get_dlm_debug(dlm_debug);
2045out:
2046 return ret;
2047}
2048
2049static void ocfs2_dlm_shutdown_debug(struct ocfs2_super *osb)
2050{
2051 struct ocfs2_dlm_debug *dlm_debug = osb->osb_dlm_debug;
2052
2053 if (dlm_debug) {
2054 debugfs_remove(dlm_debug->d_locking_state);
2055 ocfs2_put_dlm_debug(dlm_debug);
2056 }
2057}
2058
2059int ocfs2_dlm_init(struct ocfs2_super *osb)
2060{
2061 int status;
2062 u32 dlm_key;
2063 struct dlm_ctxt *dlm;
2064
2065 mlog_entry_void();
2066
2067 status = ocfs2_dlm_init_debug(osb);
2068 if (status < 0) {
2069 mlog_errno(status);
2070 goto bail;
2071 }
2072
2073 /* launch vote thread */
2074 osb->vote_task = kthread_run(ocfs2_vote_thread, osb, "ocfs2vote-%d",
2075 osb->osb_id);
2076 if (IS_ERR(osb->vote_task)) {
2077 status = PTR_ERR(osb->vote_task);
2078 osb->vote_task = NULL;
2079 mlog_errno(status);
2080 goto bail;
2081 }
2082
2083 /* used by the dlm code to make message headers unique, each
2084 * node in this domain must agree on this. */
2085 dlm_key = crc32_le(0, osb->uuid_str, strlen(osb->uuid_str));
2086
2087 /* for now, uuid == domain */
2088 dlm = dlm_register_domain(osb->uuid_str, dlm_key);
2089 if (IS_ERR(dlm)) {
2090 status = PTR_ERR(dlm);
2091 mlog_errno(status);
2092 goto bail;
2093 }
2094
2095 ocfs2_super_lock_res_init(&osb->osb_super_lockres, osb);
2096 ocfs2_rename_lock_res_init(&osb->osb_rename_lockres, osb);
2097
2098 dlm_register_eviction_cb(dlm, &osb->osb_eviction_cb);
2099
2100 osb->dlm = dlm;
2101
2102 status = 0;
2103bail:
2104 if (status < 0) {
2105 ocfs2_dlm_shutdown_debug(osb);
2106 if (osb->vote_task)
2107 kthread_stop(osb->vote_task);
2108 }
2109
2110 mlog_exit(status);
2111 return status;
2112}
2113
2114void ocfs2_dlm_shutdown(struct ocfs2_super *osb)
2115{
2116 mlog_entry_void();
2117
2118 dlm_unregister_eviction_cb(&osb->osb_eviction_cb);
2119
2120 ocfs2_drop_osb_locks(osb);
2121
2122 if (osb->vote_task) {
2123 kthread_stop(osb->vote_task);
2124 osb->vote_task = NULL;
2125 }
2126
2127 ocfs2_lock_res_free(&osb->osb_super_lockres);
2128 ocfs2_lock_res_free(&osb->osb_rename_lockres);
2129
2130 dlm_unregister_domain(osb->dlm);
2131 osb->dlm = NULL;
2132
2133 ocfs2_dlm_shutdown_debug(osb);
2134
2135 mlog_exit_void();
2136}
2137
2138static void ocfs2_unlock_ast_func(void *opaque, enum dlm_status status)
2139{
2140 struct ocfs2_lock_res *lockres = opaque;
2141 unsigned long flags;
2142
2143 mlog_entry_void();
2144
2145 mlog(0, "UNLOCK AST called on lock %s, action = %d\n", lockres->l_name,
2146 lockres->l_unlock_action);
2147
2148 spin_lock_irqsave(&lockres->l_lock, flags);
2149 /* We tried to cancel a convert request, but it was already
2150 * granted. All we want to do here is clear our unlock
2151 * state. The wake_up call done at the bottom is redundant
2152 * (ocfs2_prepare_cancel_convert doesn't sleep on this) but doesn't
2153 * hurt anything anyway */
2154 if (status == DLM_CANCELGRANT &&
2155 lockres->l_unlock_action == OCFS2_UNLOCK_CANCEL_CONVERT) {
2156 mlog(0, "Got cancelgrant for %s\n", lockres->l_name);
2157
2158 /* We don't clear the busy flag in this case as it
2159 * should have been cleared by the ast which the dlm
2160 * has called. */
2161 goto complete_unlock;
2162 }
2163
2164 if (status != DLM_NORMAL) {
2165 mlog(ML_ERROR, "Dlm passes status %d for lock %s, "
2166 "unlock_action %d\n", status, lockres->l_name,
2167 lockres->l_unlock_action);
2168 spin_unlock_irqrestore(&lockres->l_lock, flags);
2169 return;
2170 }
2171
2172 switch(lockres->l_unlock_action) {
2173 case OCFS2_UNLOCK_CANCEL_CONVERT:
2174 mlog(0, "Cancel convert success for %s\n", lockres->l_name);
2175 lockres->l_action = OCFS2_AST_INVALID;
2176 break;
2177 case OCFS2_UNLOCK_DROP_LOCK:
2178 lockres->l_level = LKM_IVMODE;
2179 break;
2180 default:
2181 BUG();
2182 }
2183
2184 lockres_clear_flags(lockres, OCFS2_LOCK_BUSY);
2185complete_unlock:
2186 lockres->l_unlock_action = OCFS2_UNLOCK_INVALID;
2187 spin_unlock_irqrestore(&lockres->l_lock, flags);
2188
2189 wake_up(&lockres->l_event);
2190
2191 mlog_exit_void();
2192}
2193
/* Hook run by ocfs2_drop_lock, under the lockres spinlock, just before
 * a lock is dropped.  It receives the lockres and drop_data. */
typedef void (ocfs2_pre_drop_cb_t)(struct ocfs2_lock_res *, void *);

struct drop_lock_cb {
	ocfs2_pre_drop_cb_t	*drop_func;	/* the hook itself */
	void			*drop_data;	/* its second argument */
};
2200
2201static int ocfs2_drop_lock(struct ocfs2_super *osb,
2202 struct ocfs2_lock_res *lockres,
2203 struct drop_lock_cb *dcb)
2204{
2205 enum dlm_status status;
2206 unsigned long flags;
2207
2208 /* We didn't get anywhere near actually using this lockres. */
2209 if (!(lockres->l_flags & OCFS2_LOCK_INITIALIZED))
2210 goto out;
2211
2212 spin_lock_irqsave(&lockres->l_lock, flags);
2213
2214 mlog_bug_on_msg(!(lockres->l_flags & OCFS2_LOCK_FREEING),
2215 "lockres %s, flags 0x%lx\n",
2216 lockres->l_name, lockres->l_flags);
2217
2218 while (lockres->l_flags & OCFS2_LOCK_BUSY) {
2219 mlog(0, "waiting on busy lock \"%s\": flags = %lx, action = "
2220 "%u, unlock_action = %u\n",
2221 lockres->l_name, lockres->l_flags, lockres->l_action,
2222 lockres->l_unlock_action);
2223
2224 spin_unlock_irqrestore(&lockres->l_lock, flags);
2225
2226 /* XXX: Today we just wait on any busy
2227 * locks... Perhaps we need to cancel converts in the
2228 * future? */
2229 ocfs2_wait_on_busy_lock(lockres);
2230
2231 spin_lock_irqsave(&lockres->l_lock, flags);
2232 }
2233
2234 if (dcb)
2235 dcb->drop_func(lockres, dcb->drop_data);
2236
2237 if (lockres->l_flags & OCFS2_LOCK_BUSY)
2238 mlog(ML_ERROR, "destroying busy lock: \"%s\"\n",
2239 lockres->l_name);
2240 if (lockres->l_flags & OCFS2_LOCK_BLOCKED)
2241 mlog(0, "destroying blocked lock: \"%s\"\n", lockres->l_name);
2242
2243 if (!(lockres->l_flags & OCFS2_LOCK_ATTACHED)) {
2244 spin_unlock_irqrestore(&lockres->l_lock, flags);
2245 goto out;
2246 }
2247
2248 lockres_clear_flags(lockres, OCFS2_LOCK_ATTACHED);
2249
2250 /* make sure we never get here while waiting for an ast to
2251 * fire. */
2252 BUG_ON(lockres->l_action != OCFS2_AST_INVALID);
2253
2254 /* is this necessary? */
2255 lockres_or_flags(lockres, OCFS2_LOCK_BUSY);
2256 lockres->l_unlock_action = OCFS2_UNLOCK_DROP_LOCK;
2257 spin_unlock_irqrestore(&lockres->l_lock, flags);
2258
2259 mlog(0, "lock %s\n", lockres->l_name);
2260
2261 status = dlmunlock(osb->dlm, &lockres->l_lksb, LKM_VALBLK,
2262 lockres->l_ops->unlock_ast, lockres);
2263 if (status != DLM_NORMAL) {
2264 ocfs2_log_dlm_error("dlmunlock", status, lockres);
2265 mlog(ML_ERROR, "lockres flags: %lu\n", lockres->l_flags);
2266 dlm_print_one_lock(lockres->l_lksb.lockid);
2267 BUG();
2268 }
2269 mlog(0, "lock %s, successfull return from dlmunlock\n",
2270 lockres->l_name);
2271
2272 ocfs2_wait_on_busy_lock(lockres);
2273out:
2274 mlog_exit(0);
2275 return 0;
2276}
2277
2278/* Mark the lockres as being dropped. It will no longer be
2279 * queued if blocking, but we still may have to wait on it
2280 * being dequeued from the vote thread before we can consider
2281 * it safe to drop.
2282 *
2283 * You can *not* attempt to call cluster_lock on this lockres anymore. */
2284void ocfs2_mark_lockres_freeing(struct ocfs2_lock_res *lockres)
2285{
2286 int status;
2287 struct ocfs2_mask_waiter mw;
2288 unsigned long flags;
2289
2290 ocfs2_init_mask_waiter(&mw);
2291
2292 spin_lock_irqsave(&lockres->l_lock, flags);
2293 lockres->l_flags |= OCFS2_LOCK_FREEING;
2294 while (lockres->l_flags & OCFS2_LOCK_QUEUED) {
2295 lockres_add_mask_waiter(lockres, &mw, OCFS2_LOCK_QUEUED, 0);
2296 spin_unlock_irqrestore(&lockres->l_lock, flags);
2297
2298 mlog(0, "Waiting on lockres %s\n", lockres->l_name);
2299
2300 status = ocfs2_wait_for_mask(&mw);
2301 if (status)
2302 mlog_errno(status);
2303
2304 spin_lock_irqsave(&lockres->l_lock, flags);
2305 }
2306 spin_unlock_irqrestore(&lockres->l_lock, flags);
2307}
2308
2309static void ocfs2_drop_osb_locks(struct ocfs2_super *osb)
2310{
2311 int status;
2312
2313 mlog_entry_void();
2314
2315 ocfs2_mark_lockres_freeing(&osb->osb_super_lockres);
2316
2317 status = ocfs2_drop_lock(osb, &osb->osb_super_lockres, NULL);
2318 if (status < 0)
2319 mlog_errno(status);
2320
2321 ocfs2_mark_lockres_freeing(&osb->osb_rename_lockres);
2322
2323 status = ocfs2_drop_lock(osb, &osb->osb_rename_lockres, NULL);
2324 if (status < 0)
2325 mlog_errno(status);
2326
2327 mlog_exit(status);
2328}
2329
2330static void ocfs2_meta_pre_drop(struct ocfs2_lock_res *lockres, void *data)
2331{
2332 struct inode *inode = data;
2333
2334 /* the metadata lock requires a bit more work as we have an
2335 * LVB to worry about. */
2336 if (lockres->l_flags & OCFS2_LOCK_ATTACHED &&
2337 lockres->l_level == LKM_EXMODE &&
2338 !(lockres->l_flags & OCFS2_LOCK_NEEDS_REFRESH))
2339 __ocfs2_stuff_meta_lvb(inode);
2340}
2341
2342int ocfs2_drop_inode_locks(struct inode *inode)
2343{
2344 int status, err;
2345 struct drop_lock_cb meta_dcb = { ocfs2_meta_pre_drop, inode, };
2346
2347 mlog_entry_void();
2348
2349 /* No need to call ocfs2_mark_lockres_freeing here -
2350 * ocfs2_clear_inode has done it for us. */
2351
2352 err = ocfs2_drop_lock(OCFS2_SB(inode->i_sb),
2353 &OCFS2_I(inode)->ip_data_lockres,
2354 NULL);
2355 if (err < 0)
2356 mlog_errno(err);
2357
2358 status = err;
2359
2360 err = ocfs2_drop_lock(OCFS2_SB(inode->i_sb),
2361 &OCFS2_I(inode)->ip_meta_lockres,
2362 &meta_dcb);
2363 if (err < 0)
2364 mlog_errno(err);
2365 if (err < 0 && !status)
2366 status = err;
2367
2368 err = ocfs2_drop_lock(OCFS2_SB(inode->i_sb),
2369 &OCFS2_I(inode)->ip_rw_lockres,
2370 NULL);
2371 if (err < 0)
2372 mlog_errno(err);
2373 if (err < 0 && !status)
2374 status = err;
2375
2376 mlog_exit(status);
2377 return status;
2378}
2379
2380static void ocfs2_prepare_downconvert(struct ocfs2_lock_res *lockres,
2381 int new_level)
2382{
2383 assert_spin_locked(&lockres->l_lock);
2384
2385 BUG_ON(lockres->l_blocking <= LKM_NLMODE);
2386
2387 if (lockres->l_level <= new_level) {
2388 mlog(ML_ERROR, "lockres->l_level (%u) <= new_level (%u)\n",
2389 lockres->l_level, new_level);
2390 BUG();
2391 }
2392
2393 mlog(0, "lock %s, new_level = %d, l_blocking = %d\n",
2394 lockres->l_name, new_level, lockres->l_blocking);
2395
2396 lockres->l_action = OCFS2_AST_DOWNCONVERT;
2397 lockres->l_requested = new_level;
2398 lockres_or_flags(lockres, OCFS2_LOCK_BUSY);
2399}
2400
2401static int ocfs2_downconvert_lock(struct ocfs2_super *osb,
2402 struct ocfs2_lock_res *lockres,
2403 int new_level,
2404 int lvb)
2405{
2406 int ret, dlm_flags = LKM_CONVERT;
2407 enum dlm_status status;
2408
2409 mlog_entry_void();
2410
2411 if (lvb)
2412 dlm_flags |= LKM_VALBLK;
2413
2414 status = dlmlock(osb->dlm,
2415 new_level,
2416 &lockres->l_lksb,
2417 dlm_flags,
2418 lockres->l_name,
2419 lockres->l_ops->ast,
2420 lockres,
2421 lockres->l_ops->bast);
2422 if (status != DLM_NORMAL) {
2423 ocfs2_log_dlm_error("dlmlock", status, lockres);
2424 ret = -EINVAL;
2425 ocfs2_recover_from_dlm_error(lockres, 1);
2426 goto bail;
2427 }
2428
2429 ret = 0;
2430bail:
2431 mlog_exit(ret);
2432 return ret;
2433}
2434
2435/* returns 1 when the caller should unlock and call dlmunlock */
2436static int ocfs2_prepare_cancel_convert(struct ocfs2_super *osb,
2437 struct ocfs2_lock_res *lockres)
2438{
2439 assert_spin_locked(&lockres->l_lock);
2440
2441 mlog_entry_void();
2442 mlog(0, "lock %s\n", lockres->l_name);
2443
2444 if (lockres->l_unlock_action == OCFS2_UNLOCK_CANCEL_CONVERT) {
2445 /* If we're already trying to cancel a lock conversion
2446 * then just drop the spinlock and allow the caller to
2447 * requeue this lock. */
2448
2449 mlog(0, "Lockres %s, skip convert\n", lockres->l_name);
2450 return 0;
2451 }
2452
2453 /* were we in a convert when we got the bast fire? */
2454 BUG_ON(lockres->l_action != OCFS2_AST_CONVERT &&
2455 lockres->l_action != OCFS2_AST_DOWNCONVERT);
2456 /* set things up for the unlockast to know to just
2457 * clear out the ast_action and unset busy, etc. */
2458 lockres->l_unlock_action = OCFS2_UNLOCK_CANCEL_CONVERT;
2459
2460 mlog_bug_on_msg(!(lockres->l_flags & OCFS2_LOCK_BUSY),
2461 "lock %s, invalid flags: 0x%lx\n",
2462 lockres->l_name, lockres->l_flags);
2463
2464 return 1;
2465}
2466
2467static int ocfs2_cancel_convert(struct ocfs2_super *osb,
2468 struct ocfs2_lock_res *lockres)
2469{
2470 int ret;
2471 enum dlm_status status;
2472
2473 mlog_entry_void();
2474 mlog(0, "lock %s\n", lockres->l_name);
2475
2476 ret = 0;
2477 status = dlmunlock(osb->dlm,
2478 &lockres->l_lksb,
2479 LKM_CANCEL,
2480 lockres->l_ops->unlock_ast,
2481 lockres);
2482 if (status != DLM_NORMAL) {
2483 ocfs2_log_dlm_error("dlmunlock", status, lockres);
2484 ret = -EINVAL;
2485 ocfs2_recover_from_dlm_error(lockres, 0);
2486 }
2487
2488 mlog(0, "lock %s return from dlmunlock\n", lockres->l_name);
2489
2490 mlog_exit(ret);
2491 return ret;
2492}
2493
2494static inline int ocfs2_can_downconvert_meta_lock(struct inode *inode,
2495 struct ocfs2_lock_res *lockres,
2496 int new_level)
2497{
2498 int ret;
2499
2500 mlog_entry_void();
2501
2502 BUG_ON(new_level != LKM_NLMODE && new_level != LKM_PRMODE);
2503
2504 if (lockres->l_flags & OCFS2_LOCK_REFRESHING) {
2505 ret = 0;
2506 mlog(0, "lockres %s currently being refreshed -- backing "
2507 "off!\n", lockres->l_name);
2508 } else if (new_level == LKM_PRMODE)
2509 ret = !lockres->l_ex_holders &&
2510 ocfs2_inode_fully_checkpointed(inode);
2511 else /* Must be NLMODE we're converting to. */
2512 ret = !lockres->l_ro_holders && !lockres->l_ex_holders &&
2513 ocfs2_inode_fully_checkpointed(inode);
2514
2515 mlog_exit(ret);
2516 return ret;
2517}
2518
/*
 * Attempt to downconvert the meta lock of @inode because it is blocking
 * another request (the lockres must have OCFS2_LOCK_BLOCKED set).
 *
 * @inode:   inode whose ip_meta_lockres is processed
 * @requeue: set to 1 when the lock could not be dealt with now and the
 *           caller should queue it again; it is NOT written on the path
 *           that actually issues the downconvert.
 *
 * Returns 0, or the value returned by ocfs2_cancel_convert() /
 * ocfs2_downconvert_lock() on the paths that call them.
 *
 * l_lock is taken with interrupts saved on entry and released on every
 * path before a cancel or downconvert call is made.
 */
2519static int ocfs2_do_unblock_meta(struct inode *inode,
2520 int *requeue)
2521{
2522 int new_level;
2523 int set_lvb = 0;
2524 int ret = 0;
2525 struct ocfs2_lock_res *lockres = &OCFS2_I(inode)->ip_meta_lockres;
2526 unsigned long flags;
2527
2528 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
2529
2530 mlog_entry_void();
2531
2532 spin_lock_irqsave(&lockres->l_lock, flags);
2533
2534 BUG_ON(!(lockres->l_flags & OCFS2_LOCK_BLOCKED));
2535
2536 mlog(0, "l_level=%d, l_blocking=%d\n", lockres->l_level,
2537 lockres->l_blocking);
2538
2539 BUG_ON(lockres->l_level != LKM_EXMODE &&
2540 lockres->l_level != LKM_PRMODE);
2541
/* A request is already in flight on this lockres: ask for a requeue
 * and, if the prepare step returns non-zero, cancel the pending
 * convert once the spinlock has been dropped. */
2542 if (lockres->l_flags & OCFS2_LOCK_BUSY) {
2543 *requeue = 1;
2544 ret = ocfs2_prepare_cancel_convert(osb, lockres);
2545 spin_unlock_irqrestore(&lockres->l_lock, flags);
2546 if (ret) {
2547 ret = ocfs2_cancel_convert(osb, lockres);
2548 if (ret < 0)
2549 mlog_errno(ret);
2550 }
2551 goto leave;
2552 }
2553
/* Target level is derived from the level we are blocking. */
2554 new_level = ocfs2_highest_compat_lock_level(lockres->l_blocking);
2555
2556 mlog(0, "l_level=%d, l_blocking=%d, new_level=%d\n",
2557 lockres->l_level, lockres->l_blocking, new_level);
2558
2559 if (ocfs2_can_downconvert_meta_lock(inode, lockres, new_level)) {
/* The LVB is only written when dropping from EX. */
2560 if (lockres->l_level == LKM_EXMODE)
2561 set_lvb = 1;
2562
2563 /* If the lock hasn't been refreshed yet (rare), then
2564 * our memory inode values are old and we skip
2565 * stuffing the lvb. There's no need to actually clear
2566 * out the lvb here as its value is still valid. */
2567 if (!(lockres->l_flags & OCFS2_LOCK_NEEDS_REFRESH)) {
2568 if (set_lvb)
2569 __ocfs2_stuff_meta_lvb(inode);
2570 } else
2571 mlog(0, "lockres %s: downconverting stale lock!\n",
2572 lockres->l_name);
2573
2574 mlog(0, "calling ocfs2_downconvert_lock with l_level=%d, "
2575 "l_blocking=%d, new_level=%d\n",
2576 lockres->l_level, lockres->l_blocking, new_level);
2577
/* Prepare under the spinlock, issue the downconvert after
 * releasing it. */
2578 ocfs2_prepare_downconvert(lockres, new_level);
2579 spin_unlock_irqrestore(&lockres->l_lock, flags);
2580 ret = ocfs2_downconvert_lock(osb, lockres, new_level, set_lvb);
2581 goto leave;
2582 }
/* Cannot downconvert yet: start a checkpoint if that is what is
 * missing, then ask to be requeued. */
2583 if (!ocfs2_inode_fully_checkpointed(inode))
2584 ocfs2_start_checkpoint(osb);
2585
2586 *requeue = 1;
2587 spin_unlock_irqrestore(&lockres->l_lock, flags);
2588 ret = 0;
2589leave:
2590 mlog_exit(ret);
2591 return ret;
2592}
2593
/*
 * Generic handler for a lockres that is blocking another request (the
 * lockres must have OCFS2_LOCK_BLOCKED set).
 *
 * @osb:     super block info passed through to the cancel/downconvert calls
 * @lockres: the blocked lock resource
 * @requeue: set to 1 when the lock cannot be downconverted yet and must
 *           be queued again, 0 when a downconvert is issued
 * @worker:  optional callback run (without l_lock held) before the
 *           downconvert; may be NULL
 *
 * Returns 0, or the value returned by ocfs2_cancel_convert() /
 * ocfs2_downconvert_lock() on the paths that call them.
 */
2594static int ocfs2_generic_unblock_lock(struct ocfs2_super *osb,
2595 struct ocfs2_lock_res *lockres,
2596 int *requeue,
2597 ocfs2_convert_worker_t *worker)
2598{
2599 unsigned long flags;
2600 int blocking;
2601 int new_level;
2602 int ret = 0;
2603
2604 mlog_entry_void();
2605
2606 spin_lock_irqsave(&lockres->l_lock, flags);
2607
2608 BUG_ON(!(lockres->l_flags & OCFS2_LOCK_BLOCKED));
2609
/* Re-entered with l_lock held when l_blocking changed while the
 * worker was running. */
2610recheck:
/* A request is already in flight: requeue and, if the prepare step
 * returns non-zero, cancel the pending convert after unlocking. */
2611 if (lockres->l_flags & OCFS2_LOCK_BUSY) {
2612 *requeue = 1;
2613 ret = ocfs2_prepare_cancel_convert(osb, lockres);
2614 spin_unlock_irqrestore(&lockres->l_lock, flags);
2615 if (ret) {
2616 ret = ocfs2_cancel_convert(osb, lockres);
2617 if (ret < 0)
2618 mlog_errno(ret);
2619 }
2620 goto leave;
2621 }
2622
2623 /* if we're blocking an exclusive and we have *any* holders,
2624 * then requeue. */
2625 if ((lockres->l_blocking == LKM_EXMODE)
2626 && (lockres->l_ex_holders || lockres->l_ro_holders)) {
2627 spin_unlock_irqrestore(&lockres->l_lock, flags);
2628 *requeue = 1;
2629 ret = 0;
2630 goto leave;
2631 }
2632
2633 /* If it's a PR we're blocking, then only
2634 * requeue if we've got any EX holders */
2635 if (lockres->l_blocking == LKM_PRMODE &&
2636 lockres->l_ex_holders) {
2637 spin_unlock_irqrestore(&lockres->l_lock, flags);
2638 *requeue = 1;
2639 ret = 0;
2640 goto leave;
2641 }
2642
2643 /* If we get here, then we know that there are no more
2644 * incompatible holders (and anyone asking for an incompatible
2645 * lock is blocked). We can now downconvert the lock */
2646 if (!worker)
2647 goto downconvert;
2648
2649 /* Some lockres types want to do a bit of work before
2650 * downconverting a lock. Allow that here. The worker function
2651 * may sleep, so we save off a copy of what we're blocking as
2652 * it may change while we're not holding the spin lock. */
2653 blocking = lockres->l_blocking;
2654 spin_unlock_irqrestore(&lockres->l_lock, flags);
2655
2656 worker(lockres, blocking);
2657
2658 spin_lock_irqsave(&lockres->l_lock, flags);
2659 if (blocking != lockres->l_blocking) {
2660 /* If this changed underneath us, then we can't drop
2661 * it just yet. */
2662 goto recheck;
2663 }
2664
/* Reached with l_lock held on both the worker and no-worker paths. */
2665downconvert:
2666 *requeue = 0;
2667 new_level = ocfs2_highest_compat_lock_level(lockres->l_blocking);
2668
2669 ocfs2_prepare_downconvert(lockres, new_level);
2670 spin_unlock_irqrestore(&lockres->l_lock, flags);
/* Last argument 0: no LVB update is requested on this generic path. */
2671 ret = ocfs2_downconvert_lock(osb, lockres, new_level, 0);
2672leave:
2673 mlog_exit(ret);
2674 return ret;
2675}
2676
2677static void ocfs2_data_convert_worker(struct ocfs2_lock_res *lockres,
2678 int blocking)
2679{
2680 struct inode *inode;
2681 struct address_space *mapping;
2682
2683 mlog_entry_void();
2684
2685 inode = ocfs2_lock_res_inode(lockres);
2686 mapping = inode->i_mapping;
2687
2688 if (filemap_fdatawrite(mapping)) {
2689 mlog(ML_ERROR, "Could not sync inode %"MLFu64" for downconvert!",
2690 OCFS2_I(inode)->ip_blkno);
2691 }
2692 sync_mapping_buffers(mapping);
2693 if (blocking == LKM_EXMODE) {
2694 truncate_inode_pages(mapping, 0);
2695 unmap_mapping_range(mapping, 0, 0, 0);
2696 } else {
2697 /* We only need to wait on the I/O if we're not also
2698 * truncating pages because truncate_inode_pages waits
2699 * for us above. We don't truncate pages if we're
2700 * blocking anything < EXMODE because we want to keep
2701 * them around in that case. */
2702 filemap_fdatawait(mapping);
2703 }
2704
2705 mlog_exit_void();
2706}
2707
2708int ocfs2_unblock_data(struct ocfs2_lock_res *lockres,
2709 int *requeue)
2710{
2711 int status;
2712 struct inode *inode;
2713 struct ocfs2_super *osb;
2714
2715 mlog_entry_void();
2716
2717 inode = ocfs2_lock_res_inode(lockres);
2718 osb = OCFS2_SB(inode->i_sb);
2719
2720 mlog(0, "unblock inode %"MLFu64"\n", OCFS2_I(inode)->ip_blkno);
2721
2722 status = ocfs2_generic_unblock_lock(osb,
2723 lockres,
2724 requeue,
2725 ocfs2_data_convert_worker);
2726 if (status < 0)
2727 mlog_errno(status);
2728
2729 mlog(0, "inode %"MLFu64", requeue = %d\n",
2730 OCFS2_I(inode)->ip_blkno, *requeue);
2731
2732 mlog_exit(status);
2733 return status;
2734}
2735
2736static int ocfs2_unblock_inode_lock(struct ocfs2_lock_res *lockres,
2737 int *requeue)
2738{
2739 int status;
2740 struct inode *inode;
2741
2742 mlog_entry_void();
2743
2744 mlog(0, "Unblock lockres %s\n", lockres->l_name);
2745
2746 inode = ocfs2_lock_res_inode(lockres);
2747
2748 status = ocfs2_generic_unblock_lock(OCFS2_SB(inode->i_sb),
2749 lockres,
2750 requeue,
2751 NULL);
2752 if (status < 0)
2753 mlog_errno(status);
2754
2755 mlog_exit(status);
2756 return status;
2757}
2758
2759
2760int ocfs2_unblock_meta(struct ocfs2_lock_res *lockres,
2761 int *requeue)
2762{
2763 int status;
2764 struct inode *inode;
2765
2766 mlog_entry_void();
2767
2768 inode = ocfs2_lock_res_inode(lockres);
2769
2770 mlog(0, "unblock inode %"MLFu64"\n", OCFS2_I(inode)->ip_blkno);
2771
2772 status = ocfs2_do_unblock_meta(inode, requeue);
2773 if (status < 0)
2774 mlog_errno(status);
2775
2776 mlog(0, "inode %"MLFu64", requeue = %d\n",
2777 OCFS2_I(inode)->ip_blkno, *requeue);
2778
2779 mlog_exit(status);
2780 return status;
2781}
2782
2783/* Generic unblock function for any lockres whose private data is an
2784 * ocfs2_super pointer. */
2785static int ocfs2_unblock_osb_lock(struct ocfs2_lock_res *lockres,
2786 int *requeue)
2787{
2788 int status;
2789 struct ocfs2_super *osb;
2790
2791 mlog_entry_void();
2792
2793 mlog(0, "Unblock lockres %s\n", lockres->l_name);
2794
2795 osb = ocfs2_lock_res_super(lockres);
2796
2797 status = ocfs2_generic_unblock_lock(osb,
2798 lockres,
2799 requeue,
2800 NULL);
2801 if (status < 0)
2802 mlog_errno(status);
2803
2804 mlog_exit(status);
2805 return status;
2806}
2807
2808void ocfs2_process_blocked_lock(struct ocfs2_super *osb,
2809 struct ocfs2_lock_res *lockres)
2810{
2811 int status;
2812 int requeue = 0;
2813 unsigned long flags;
2814
2815 /* Our reference to the lockres in this function can be
2816 * considered valid until we remove the OCFS2_LOCK_QUEUED
2817 * flag. */
2818
2819 mlog_entry_void();
2820
2821 BUG_ON(!lockres);
2822 BUG_ON(!lockres->l_ops);
2823 BUG_ON(!lockres->l_ops->unblock);
2824
2825 mlog(0, "lockres %s blocked.\n", lockres->l_name);
2826
2827 /* Detect whether a lock has been marked as going away while
2828 * the vote thread was processing other things. A lock can
2829 * still be marked with OCFS2_LOCK_FREEING after this check,
2830 * but short circuiting here will still save us some
2831 * performance. */
2832 spin_lock_irqsave(&lockres->l_lock, flags);
2833 if (lockres->l_flags & OCFS2_LOCK_FREEING)
2834 goto unqueue;
2835 spin_unlock_irqrestore(&lockres->l_lock, flags);
2836
2837 status = lockres->l_ops->unblock(lockres, &requeue);
2838 if (status < 0)
2839 mlog_errno(status);
2840
2841 spin_lock_irqsave(&lockres->l_lock, flags);
2842unqueue:
2843 if (lockres->l_flags & OCFS2_LOCK_FREEING || !requeue) {
2844 lockres_clear_flags(lockres, OCFS2_LOCK_QUEUED);
2845 } else
2846 ocfs2_schedule_blocked_lock(osb, lockres);
2847
2848 mlog(0, "lockres %s, requeue = %s.\n", lockres->l_name,
2849 requeue ? "yes" : "no");
2850 spin_unlock_irqrestore(&lockres->l_lock, flags);
2851
2852 mlog_exit_void();
2853}
2854
2855static void ocfs2_schedule_blocked_lock(struct ocfs2_super *osb,
2856 struct ocfs2_lock_res *lockres)
2857{
2858 mlog_entry_void();
2859
2860 assert_spin_locked(&lockres->l_lock);
2861
2862 if (lockres->l_flags & OCFS2_LOCK_FREEING) {
2863 /* Do not schedule a lock for downconvert when it's on
2864 * the way to destruction - any nodes wanting access
2865 * to the resource will get it soon. */
2866 mlog(0, "Lockres %s won't be scheduled: flags 0x%lx\n",
2867 lockres->l_name, lockres->l_flags);
2868 return;
2869 }
2870
2871 lockres_or_flags(lockres, OCFS2_LOCK_QUEUED);
2872
2873 spin_lock(&osb->vote_task_lock);
2874 if (list_empty(&lockres->l_blocked_list)) {
2875 list_add_tail(&lockres->l_blocked_list,
2876 &osb->blocked_lock_list);
2877 osb->blocked_lock_count++;
2878 }
2879 spin_unlock(&osb->vote_task_lock);
2880
2881 mlog_exit_void();
2882}
2883
2884/* This aids in debugging situations where a bad LVB might be involved. */
2885void ocfs2_dump_meta_lvb_info(u64 level,
2886 const char *function,
2887 unsigned int line,
2888 struct ocfs2_lock_res *lockres)
2889{
2890 struct ocfs2_meta_lvb *lvb = (struct ocfs2_meta_lvb *) lockres->l_lksb.lvb;
2891
2892 mlog(level, "LVB information for %s (called from %s:%u):\n",
2893 lockres->l_name, function, line);
2894 mlog(level, "version: %u, clusters: %u\n",
2895 be32_to_cpu(lvb->lvb_version), be32_to_cpu(lvb->lvb_iclusters));
2896 mlog(level, "size: %"MLFu64", uid %u, gid %u, mode 0x%x\n",
2897 be64_to_cpu(lvb->lvb_isize), be32_to_cpu(lvb->lvb_iuid),
2898 be32_to_cpu(lvb->lvb_igid), be16_to_cpu(lvb->lvb_imode));
2899 mlog(level, "nlink %u, atime_packed 0x%"MLFx64", "
2900 "ctime_packed 0x%"MLFx64", mtime_packed 0x%"MLFx64"\n",
2901 be16_to_cpu(lvb->lvb_inlink), be64_to_cpu(lvb->lvb_iatime_packed),
2902 be64_to_cpu(lvb->lvb_ictime_packed),
2903 be64_to_cpu(lvb->lvb_imtime_packed));
2904}
diff --git a/fs/ocfs2/dlmglue.h b/fs/ocfs2/dlmglue.h
new file mode 100644
index 000000000000..8f2d1db2d9ea
--- /dev/null
+++ b/fs/ocfs2/dlmglue.h
@@ -0,0 +1,111 @@
1/* -*- mode: c; c-basic-offset: 8; -*-
2 * vim: noexpandtab sw=8 ts=8 sts=0:
3 *
4 * dlmglue.h
5 *
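6 * Interface to the cluster locking glue: lock resource setup and the
 * meta, data, rw, super and rename lock/unlock wrappers.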
7 *
8 * Copyright (C) 2002, 2004 Oracle. All rights reserved.
9 *
10 * This program is free software; you can redistribute it and/or
11 * modify it under the terms of the GNU General Public
12 * License as published by the Free Software Foundation; either
13 * version 2 of the License, or (at your option) any later version.
14 *
15 * This program is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
18 * General Public License for more details.
19 *
20 * You should have received a copy of the GNU General Public
21 * License along with this program; if not, write to the
22 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
23 * Boston, MA 02111-1307, USA.
24 */
25
26
27#ifndef DLMGLUE_H
28#define DLMGLUE_H
29
30#define OCFS2_LVB_VERSION 2
31
/*
 * Layout of the lock value block attached to a meta lock. Every field
 * is stored big-endian. With the field sizes below the structure
 * occupies 64 bytes.
 */
32struct ocfs2_meta_lvb {
33 __be32 lvb_version; /* LVB format version */
34 __be32 lvb_iclusters; /* cluster count */
35 __be32 lvb_iuid; /* owner uid */
36 __be32 lvb_igid; /* owner gid */
37 __be64 lvb_iatime_packed; /* atime, packed into 64 bits */
38 __be64 lvb_ictime_packed; /* ctime, packed into 64 bits */
39 __be64 lvb_imtime_packed; /* mtime, packed into 64 bits */
40 __be64 lvb_isize; /* size */
41 __be16 lvb_imode; /* mode */
42 __be16 lvb_inlink; /* link count */
43 __be32 lvb_reserved[3]; /* unused; presumably room for future fields */
44};
45
46/* Bit flags for the 'arg_flags' argument of ocfs2_meta_lock_full() and
 * ocfs2_data_lock_full(). Each flag is a distinct bit. */
47/* Don't wait on recovery. */
48#define OCFS2_META_LOCK_RECOVERY (0x01)
49/* Instruct the dlm not to queue ourselves on the other node. */
50#define OCFS2_META_LOCK_NOQUEUE (0x02)
51/* Don't block waiting for the vote thread; return -EAGAIN instead. */
52#define OCFS2_LOCK_NONBLOCK (0x04)
53
54int ocfs2_dlm_init(struct ocfs2_super *osb);
55void ocfs2_dlm_shutdown(struct ocfs2_super *osb);
56void ocfs2_lock_res_init_once(struct ocfs2_lock_res *res);
57void ocfs2_inode_lock_res_init(struct ocfs2_lock_res *res,
58 enum ocfs2_lock_type type,
59 struct inode *inode);
60void ocfs2_lock_res_free(struct ocfs2_lock_res *res);
61int ocfs2_create_new_inode_locks(struct inode *inode);
62int ocfs2_drop_inode_locks(struct inode *inode);
63int ocfs2_data_lock_full(struct inode *inode,
64 int write,
65 int arg_flags);
66#define ocfs2_data_lock(inode, write) ocfs2_data_lock_full(inode, write, 0)
67int ocfs2_data_lock_with_page(struct inode *inode,
68 int write,
69 struct page *page);
70void ocfs2_data_unlock(struct inode *inode,
71 int write);
72int ocfs2_rw_lock(struct inode *inode, int write);
73void ocfs2_rw_unlock(struct inode *inode, int write);
74int ocfs2_meta_lock_full(struct inode *inode,
75 struct ocfs2_journal_handle *handle,
76 struct buffer_head **ret_bh,
77 int ex,
78 int arg_flags);
79int ocfs2_meta_lock_with_page(struct inode *inode,
80 struct ocfs2_journal_handle *handle,
81 struct buffer_head **ret_bh,
82 int ex,
83 struct page *page);
84/* 99% of the time we don't want to supply any additional flags --
85 * those are for very specific cases only. */
86#define ocfs2_meta_lock(i, h, b, e) ocfs2_meta_lock_full(i, h, b, e, 0)
87void ocfs2_meta_unlock(struct inode *inode,
88 int ex);
89int ocfs2_super_lock(struct ocfs2_super *osb,
90 int ex);
91void ocfs2_super_unlock(struct ocfs2_super *osb,
92 int ex);
93int ocfs2_rename_lock(struct ocfs2_super *osb);
94void ocfs2_rename_unlock(struct ocfs2_super *osb);
95void ocfs2_mark_lockres_freeing(struct ocfs2_lock_res *lockres);
96
97/* for the vote thread */
98void ocfs2_process_blocked_lock(struct ocfs2_super *osb,
99 struct ocfs2_lock_res *lockres);
100
101struct ocfs2_dlm_debug *ocfs2_new_dlm_debug(void);
102void ocfs2_put_dlm_debug(struct ocfs2_dlm_debug *dlm_debug);
103
104/* aids in debugging and tracking lvbs */
105void ocfs2_dump_meta_lvb_info(u64 level,
106 const char *function,
107 unsigned int line,
108 struct ocfs2_lock_res *lockres);
109#define mlog_meta_lvb(__level, __lockres) ocfs2_dump_meta_lvb_info(__level, __PRETTY_FUNCTION__, __LINE__, __lockres)
110
111#endif /* DLMGLUE_H */
diff --git a/fs/ocfs2/endian.h b/fs/ocfs2/endian.h
new file mode 100644
index 000000000000..f226b2207628
--- /dev/null
+++ b/fs/ocfs2/endian.h
@@ -0,0 +1,45 @@
1/* -*- mode: c; c-basic-offset: 8; -*-
2 * vim: noexpandtab sw=8 ts=8 sts=0:
3 *
4 * Copyright (C) 2005 Oracle. All rights reserved.
5 *
6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public
8 * License as published by the Free Software Foundation; either
9 * version 2 of the License, or (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 * General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public
17 * License along with this program; if not, write to the
18 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
19 * Boston, MA 02111-1307, USA.
20 */
21
22#ifndef OCFS2_ENDIAN_H
23#define OCFS2_ENDIAN_H
24
25static inline void le16_add_cpu(__le16 *var, u16 val)
26{
27 *var = cpu_to_le16(le16_to_cpu(*var) + val);
28}
29
30static inline void le32_add_cpu(__le32 *var, u32 val)
31{
32 *var = cpu_to_le32(le32_to_cpu(*var) + val);
33}
34
35static inline void le32_and_cpu(__le32 *var, u32 val)
36{
37 *var = cpu_to_le32(le32_to_cpu(*var) & val);
38}
39
40static inline void be32_add_cpu(__be32 *var, u32 val)
41{
42 *var = cpu_to_be32(be32_to_cpu(*var) + val);
43}
44
45#endif /* OCFS2_ENDIAN_H */
diff --git a/fs/ocfs2/export.c b/fs/ocfs2/export.c
new file mode 100644
index 000000000000..5810160d92a8
--- /dev/null
+++ b/fs/ocfs2/export.c
@@ -0,0 +1,248 @@
1/* -*- mode: c; c-basic-offset: 8; -*-
2 * vim: noexpandtab sw=8 ts=8 sts=0:
3 *
4 * export.c
5 *
6 * Functions to facilitate NFS exporting
7 *
8 * Copyright (C) 2002, 2005 Oracle. All rights reserved.
9 *
10 * This program is free software; you can redistribute it and/or
11 * modify it under the terms of the GNU General Public
12 * License as published by the Free Software Foundation; either
13 * version 2 of the License, or (at your option) any later version.
14 *
15 * This program is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
18 * General Public License for more details.
19 *
20 * You should have received a copy of the GNU General Public
21 * License along with this program; if not, write to the
22 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
23 * Boston, MA 02111-1307, USA.
24 */
25
26#include <linux/fs.h>
27#include <linux/types.h>
28
29#define MLOG_MASK_PREFIX ML_EXPORT
30#include <cluster/masklog.h>
31
32#include "ocfs2.h"
33
34#include "dir.h"
35#include "dlmglue.h"
36#include "export.h"
37#include "inode.h"
38
39#include "buffer_head_io.h"
40
/*
 * Decoded form of the inode identity carried in a file handle: the
 * inode's block number plus its generation. ocfs2_get_dentry() treats
 * a block number of 0 as a stale handle.
 */
41struct ocfs2_inode_handle
42{
43 u64 ih_blkno; /* inode block number; 0 is rejected as stale */
44 u32 ih_generation; /* compared against inode->i_generation */
45};
46
47static struct dentry *ocfs2_get_dentry(struct super_block *sb, void *vobjp)
48{
49 struct ocfs2_inode_handle *handle = vobjp;
50 struct inode *inode;
51 struct dentry *result;
52
53 mlog_entry("(0x%p, 0x%p)\n", sb, handle);
54
55 if (handle->ih_blkno == 0) {
56 mlog_errno(-ESTALE);
57 return ERR_PTR(-ESTALE);
58 }
59
60 inode = ocfs2_iget(OCFS2_SB(sb), handle->ih_blkno);
61
62 if (IS_ERR(inode)) {
63 mlog_errno(PTR_ERR(inode));
64 return (void *)inode;
65 }
66
67 if (handle->ih_generation != inode->i_generation) {
68 iput(inode);
69 mlog_errno(-ESTALE);
70 return ERR_PTR(-ESTALE);
71 }
72
73 result = d_alloc_anon(inode);
74
75 if (!result) {
76 iput(inode);
77 mlog_errno(-ENOMEM);
78 return ERR_PTR(-ENOMEM);
79 }
80
81 mlog_exit_ptr(result);
82 return result;
83}
84
85static struct dentry *ocfs2_get_parent(struct dentry *child)
86{
87 int status;
88 u64 blkno;
89 struct dentry *parent;
90 struct inode *inode;
91 struct inode *dir = child->d_inode;
92 struct buffer_head *dirent_bh = NULL;
93 struct ocfs2_dir_entry *dirent;
94
95 mlog_entry("(0x%p, '%.*s')\n", child,
96 child->d_name.len, child->d_name.name);
97
98 mlog(0, "find parent of directory %"MLFu64"\n",
99 OCFS2_I(dir)->ip_blkno);
100
101 status = ocfs2_meta_lock(dir, NULL, NULL, 0);
102 if (status < 0) {
103 if (status != -ENOENT)
104 mlog_errno(status);
105 parent = ERR_PTR(status);
106 goto bail;
107 }
108
109 status = ocfs2_find_files_on_disk("..", 2, &blkno, dir, &dirent_bh,
110 &dirent);
111 if (status < 0) {
112 parent = ERR_PTR(-ENOENT);
113 goto bail_unlock;
114 }
115
116 inode = ocfs2_iget(OCFS2_SB(dir->i_sb), blkno);
117 if (IS_ERR(inode)) {
118 mlog(ML_ERROR, "Unable to create inode %"MLFu64"\n", blkno);
119 parent = ERR_PTR(-EACCES);
120 goto bail_unlock;
121 }
122
123 parent = d_alloc_anon(inode);
124 if (!parent) {
125 iput(inode);
126 parent = ERR_PTR(-ENOMEM);
127 }
128
129bail_unlock:
130 ocfs2_meta_unlock(dir, 0);
131
132 if (dirent_bh)
133 brelse(dirent_bh);
134
135bail:
136 mlog_exit_ptr(parent);
137
138 return parent;
139}
140
141static int ocfs2_encode_fh(struct dentry *dentry, __be32 *fh, int *max_len,
142 int connectable)
143{
144 struct inode *inode = dentry->d_inode;
145 int len = *max_len;
146 int type = 1;
147 u64 blkno;
148 u32 generation;
149
150 mlog_entry("(0x%p, '%.*s', 0x%p, %d, %d)\n", dentry,
151 dentry->d_name.len, dentry->d_name.name,
152 fh, len, connectable);
153
154 if (len < 3 || (connectable && len < 6)) {
155 mlog(ML_ERROR, "fh buffer is too small for encoding\n");
156 type = 255;
157 goto bail;
158 }
159
160 blkno = OCFS2_I(inode)->ip_blkno;
161 generation = inode->i_generation;
162
163 mlog(0, "Encoding fh: blkno: %"MLFu64", generation: %u\n",
164 blkno, generation);
165
166 len = 3;
167 fh[0] = cpu_to_le32((u32)(blkno >> 32));
168 fh[1] = cpu_to_le32((u32)(blkno & 0xffffffff));
169 fh[2] = cpu_to_le32(generation);
170
171 if (connectable && !S_ISDIR(inode->i_mode)) {
172 struct inode *parent;
173
174 spin_lock(&dentry->d_lock);
175
176 parent = dentry->d_parent->d_inode;
177 blkno = OCFS2_I(parent)->ip_blkno;
178 generation = parent->i_generation;
179
180 fh[3] = cpu_to_le32((u32)(blkno >> 32));
181 fh[4] = cpu_to_le32((u32)(blkno & 0xffffffff));
182 fh[5] = cpu_to_le32(generation);
183
184 spin_unlock(&dentry->d_lock);
185
186 len = 6;
187 type = 2;
188
189 mlog(0, "Encoding parent: blkno: %"MLFu64", generation: %u\n",
190 blkno, generation);
191 }
192
193 *max_len = len;
194
195bail:
196 mlog_exit(type);
197 return type;
198}
199
200static struct dentry *ocfs2_decode_fh(struct super_block *sb, __be32 *fh,
201 int fh_len, int fileid_type,
202 int (*acceptable)(void *context,
203 struct dentry *de),
204 void *context)
205{
206 struct ocfs2_inode_handle handle, parent;
207 struct dentry *ret = NULL;
208
209 mlog_entry("(0x%p, 0x%p, %d, %d, 0x%p, 0x%p)\n",
210 sb, fh, fh_len, fileid_type, acceptable, context);
211
212 if (fh_len < 3 || fileid_type > 2)
213 goto bail;
214
215 if (fileid_type == 2) {
216 if (fh_len < 6)
217 goto bail;
218
219 parent.ih_blkno = (u64)le32_to_cpu(fh[3]) << 32;
220 parent.ih_blkno |= (u64)le32_to_cpu(fh[4]);
221 parent.ih_generation = le32_to_cpu(fh[5]);
222
223 mlog(0, "Decoding parent: blkno: %"MLFu64", generation: %u\n",
224 parent.ih_blkno, parent.ih_generation);
225 }
226
227 handle.ih_blkno = (u64)le32_to_cpu(fh[0]) << 32;
228 handle.ih_blkno |= (u64)le32_to_cpu(fh[1]);
229 handle.ih_generation = le32_to_cpu(fh[2]);
230
231 mlog(0, "Encoding fh: blkno: %"MLFu64", generation: %u\n",
232 handle.ih_blkno, handle.ih_generation);
233
234 ret = ocfs2_export_ops.find_exported_dentry(sb, &handle, &parent,
235 acceptable, context);
236
237bail:
238 mlog_exit_ptr(ret);
239 return ret;
240}
241
/*
 * Export operations table wiring up the file handle encode/decode
 * routines and the dentry/parent lookups defined in this file.
 * ocfs2_decode_fh() also calls the table's find_exported_dentry
 * member, which is not set here.
 */
242struct export_operations ocfs2_export_ops = {
243 .decode_fh = ocfs2_decode_fh,
244 .encode_fh = ocfs2_encode_fh,
245
246 .get_parent = ocfs2_get_parent,
247 .get_dentry = ocfs2_get_dentry,
248};
diff --git a/fs/ocfs2/export.h b/fs/ocfs2/export.h
new file mode 100644
index 000000000000..5b77ee7866ef
--- /dev/null
+++ b/fs/ocfs2/export.h
@@ -0,0 +1,31 @@
1/* -*- mode: c; c-basic-offset: 8; -*-
2 * vim: noexpandtab sw=8 ts=8 sts=0:
3 *
4 * export.h
5 *
6 * Function prototypes
7 *
8 * Copyright (C) 2002, 2005 Oracle. All rights reserved.
9 *
10 * This program is free software; you can redistribute it and/or
11 * modify it under the terms of the GNU General Public
12 * License as published by the Free Software Foundation; either
13 * version 2 of the License, or (at your option) any later version.
14 *
15 * This program is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
18 * General Public License for more details.
19 *
20 * You should have received a copy of the GNU General Public
21 * License along with this program; if not, write to the
22 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
23 * Boston, MA 02111-1307, USA.
24 */
25
26#ifndef OCFS2_EXPORT_H
27#define OCFS2_EXPORT_H
28
29extern struct export_operations ocfs2_export_ops;
30
31#endif /* OCFS2_EXPORT_H */
diff --git a/fs/ocfs2/extent_map.c b/fs/ocfs2/extent_map.c
new file mode 100644
index 000000000000..f2fb40cd296a
--- /dev/null
+++ b/fs/ocfs2/extent_map.c
@@ -0,0 +1,994 @@
1/* -*- mode: c; c-basic-offset: 8; -*-
2 * vim: noexpandtab sw=8 ts=8 sts=0:
3 *
4 * extent_map.c
5 *
6 * In-memory extent map for OCFS2. Man, this code was prettier in
7 * the library.
8 *
9 * Copyright (C) 2004 Oracle. All rights reserved.
10 *
11 * This program is free software; you can redistribute it and/or
12 * modify it under the terms of the GNU General Public
13 * License, version 2, as published by the Free Software Foundation.
14 *
15 * This program is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
18 * General Public License for more details.
19 *
20 * You should have received a copy of the GNU General Public
21 * License along with this program; if not, write to the
22 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
23 * Boston, MA 02111-1307, USA.
24 */
25
26#include <linux/fs.h>
27#include <linux/init.h>
28#include <linux/types.h>
29#include <linux/slab.h>
30#include <linux/rbtree.h>
31
32#define MLOG_MASK_PREFIX ML_EXTENT_MAP
33#include <cluster/masklog.h>
34
35#include "ocfs2.h"
36
37#include "extent_map.h"
38#include "inode.h"
39#include "super.h"
40
41#include "buffer_head_io.h"
42
43
44/*
45 * SUCK SUCK SUCK
46 * Our headers are so bad that struct ocfs2_extent_map is in ocfs.h
47 */
48
/*
 * One node of the in-memory extent map: an extent record linked into
 * the map's red-black tree, together with the tree depth of the extent
 * list it was read from.
 */
49struct ocfs2_extent_map_entry {
50 struct rb_node e_node; /* link in the map's rb-tree */
51 int e_tree_depth; /* depth of the source extent list; 0 for a leaf record */
52 struct ocfs2_extent_rec e_rec; /* the record; fields are little-endian */
53};
54
/*
 * State carried across an insert attempt (see
 * ocfs2_extent_map_try_insert()).
 * NOTE(review): the users of this structure are outside this view, so
 * the field descriptions below are inferred from the names - confirm
 * against ocfs2_extent_map_try_insert().
 */
55struct ocfs2_em_insert_context {
56 int need_left; /* presumably: a left-hand entry must be supplied */
57 int need_right; /* presumably: a right-hand entry must be supplied */
58 struct ocfs2_extent_map_entry *new_ent; /* presumably the entry being inserted */
59 struct ocfs2_extent_map_entry *old_ent; /* presumably an existing entry it overlaps */
60 struct ocfs2_extent_map_entry *left_ent; /* presumably the left remainder entry */
61 struct ocfs2_extent_map_entry *right_ent; /* presumably the right remainder entry */
62};
63
64static kmem_cache_t *ocfs2_em_ent_cachep = NULL;
65
66
67static struct ocfs2_extent_map_entry *
68ocfs2_extent_map_lookup(struct ocfs2_extent_map *em,
69 u32 cpos, u32 clusters,
70 struct rb_node ***ret_p,
71 struct rb_node **ret_parent);
72static int ocfs2_extent_map_insert(struct inode *inode,
73 struct ocfs2_extent_rec *rec,
74 int tree_depth);
75static int ocfs2_extent_map_insert_entry(struct ocfs2_extent_map *em,
76 struct ocfs2_extent_map_entry *ent);
77static int ocfs2_extent_map_find_leaf(struct inode *inode,
78 u32 cpos, u32 clusters,
79 struct ocfs2_extent_list *el);
80static int ocfs2_extent_map_lookup_read(struct inode *inode,
81 u32 cpos, u32 clusters,
82 struct ocfs2_extent_map_entry **ret_ent);
83static int ocfs2_extent_map_try_insert(struct inode *inode,
84 struct ocfs2_extent_rec *rec,
85 int tree_depth,
86 struct ocfs2_em_insert_context *ctxt);
87
88/* returns 1 only if the rec contains all the given clusters -- that is that
89 * rec's cpos is <= the cluster cpos and that the rec endpoint (cpos +
90 * clusters) is >= the argument's endpoint */
91static int ocfs2_extent_rec_contains_clusters(struct ocfs2_extent_rec *rec,
92 u32 cpos, u32 clusters)
93{
94 if (le32_to_cpu(rec->e_cpos) > cpos)
95 return 0;
96 if (cpos + clusters > le32_to_cpu(rec->e_cpos) +
97 le32_to_cpu(rec->e_clusters))
98 return 0;
99 return 1;
100}
101
102
103/*
104 * Find an entry in the tree that intersects the region passed in.
105 * Note that this will find straddled intervals, it is up to the
106 * callers to enforce any boundary conditions.
107 *
108 * Callers must hold ip_lock. This lookup is not guaranteed to return
109 * a tree_depth 0 match, and as such can race inserts if the lock
110 * were not held.
111 *
112 * The rb_node garbage lets insertion share the search. Trivial
113 * callers pass NULL.
114 */
115static struct ocfs2_extent_map_entry *
116ocfs2_extent_map_lookup(struct ocfs2_extent_map *em,
117 u32 cpos, u32 clusters,
118 struct rb_node ***ret_p,
119 struct rb_node **ret_parent)
120{
121 struct rb_node **p = &em->em_extents.rb_node;
122 struct rb_node *parent = NULL;
123 struct ocfs2_extent_map_entry *ent = NULL;
124
125 while (*p)
126 {
127 parent = *p;
128 ent = rb_entry(parent, struct ocfs2_extent_map_entry,
129 e_node);
130 if ((cpos + clusters) <= le32_to_cpu(ent->e_rec.e_cpos)) {
131 p = &(*p)->rb_left;
132 ent = NULL;
133 } else if (cpos >= (le32_to_cpu(ent->e_rec.e_cpos) +
134 le32_to_cpu(ent->e_rec.e_clusters))) {
135 p = &(*p)->rb_right;
136 ent = NULL;
137 } else
138 break;
139 }
140
141 if (ret_p != NULL)
142 *ret_p = p;
143 if (ret_parent != NULL)
144 *ret_parent = parent;
145 return ent;
146}
147
148/*
149 * Find the leaf containing the interval we want. While we're on our
150 * way down the tree, fill in every record we see at any depth, because
151 * we might want it later.
152 *
153 * Note that this code is run without ip_lock. That's because it
154 * sleeps while reading. If someone is also filling the extent list at
155 * the same time we are, we might have to restart.
 *
 * @inode:    inode whose extent map is being filled
 * @cpos:     first cluster of the interval
 * @clusters: length of the interval in clusters
 * @el:       extent list to start the descent from
 *
 * Returns 0 on success or a negative error:
 *   -EBADR  a record ends beyond ip_clusters, two records cover the
 *           interval, or no record covers it at a branch level
 *   -ESRCH  the interval straddles a branch record
 *   -EIO    an extent block read from disk is not valid
 *   or an error from ocfs2_read_block() / ocfs2_extent_map_insert().
156 */
157static int ocfs2_extent_map_find_leaf(struct inode *inode,
158 u32 cpos, u32 clusters,
159 struct ocfs2_extent_list *el)
160{
161 int i, ret;
162 struct buffer_head *eb_bh = NULL;
163 u64 blkno;
164 u32 rec_end;
165 struct ocfs2_extent_block *eb;
166 struct ocfs2_extent_rec *rec;
167
168 /*
169 * The bh data containing the el cannot change here, because
170 * we hold alloc_sem. So we can do this without other
171 * locks.
172 */
/* Descend one branch level per iteration until a leaf list is reached. */
173 while (el->l_tree_depth)
174 {
/* blkno stays 0 until a record covering the interval is found. */
175 blkno = 0;
176 for (i = 0; i < le16_to_cpu(el->l_next_free_rec); i++) {
177 rec = &el->l_recs[i];
178 rec_end = (le32_to_cpu(rec->e_cpos) +
179 le32_to_cpu(rec->e_clusters));
180
181 ret = -EBADR;
182 if (rec_end > OCFS2_I(inode)->ip_clusters) {
183 mlog_errno(ret);
184 goto out_free;
185 }
186
/* Record lies entirely before the interval: cache it and move on.
 * -EEXIST from the insert is tolerated here. */
187 if (rec_end <= cpos) {
188 ret = ocfs2_extent_map_insert(inode, rec,
189 le16_to_cpu(el->l_tree_depth));
190 if (ret && (ret != -EEXIST)) {
191 mlog_errno(ret);
192 goto out_free;
193 }
194 continue;
195 }
/* Record lies entirely after the interval: same treatment. */
196 if ((cpos + clusters) <= le32_to_cpu(rec->e_cpos)) {
197 ret = ocfs2_extent_map_insert(inode, rec,
198 le16_to_cpu(el->l_tree_depth));
199 if (ret && (ret != -EEXIST)) {
200 mlog_errno(ret);
201 goto out_free;
202 }
203 continue;
204 }
205
206 /*
207 * We've found a record that matches our
208 * interval. We don't insert it because we're
209 * about to traverse it.
210 */
211
212 /* Check to see if we're straddling */
213 ret = -ESRCH;
214 if (!ocfs2_extent_rec_contains_clusters(rec,
215 cpos,
216 clusters)) {
217 mlog_errno(ret);
218 goto out_free;
219 }
220
221 /*
222 * If we've already found a record, the el has
223 * two records covering the same interval.
224 * EEEK!
225 */
226 ret = -EBADR;
227 if (blkno) {
228 mlog_errno(ret);
229 goto out_free;
230 }
231
232 blkno = le64_to_cpu(rec->e_blkno);
233 }
234
235 /*
236 * We don't support holes, and we're still up
237 * in the branches, so we'd better have found someone
238 */
239 ret = -EBADR;
240 if (!blkno) {
241 mlog_errno(ret);
242 goto out_free;
243 }
244
/* Release the previous level's block before reading the next one. */
245 if (eb_bh) {
246 brelse(eb_bh);
247 eb_bh = NULL;
248 }
249 ret = ocfs2_read_block(OCFS2_SB(inode->i_sb),
250 blkno, &eb_bh, OCFS2_BH_CACHED,
251 inode);
252 if (ret) {
253 mlog_errno(ret);
254 goto out_free;
255 }
256 eb = (struct ocfs2_extent_block *)eb_bh->b_data;
257 if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) {
258 OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb, eb);
259 ret = -EIO;
260 goto out_free;
261 }
/* Continue the descent in the list of the block just read. */
262 el = &eb->h_list;
263 }
264
265 if (el->l_tree_depth)
266 BUG();
267
/* Leaf level: insert every record. Unlike the branch levels above,
 * any non-zero return (including -EEXIST) aborts here. */
268 for (i = 0; i < le16_to_cpu(el->l_next_free_rec); i++) {
269 rec = &el->l_recs[i];
270 ret = ocfs2_extent_map_insert(inode, rec,
271 le16_to_cpu(el->l_tree_depth));
272 if (ret) {
273 mlog_errno(ret);
274 goto out_free;
275 }
276 }
277
278 ret = 0;
279
280out_free:
281 if (eb_bh)
282 brelse(eb_bh);
283
284 return ret;
285}
286
287/*
288 * This lookup actually will read from disk. It has one invariant:
289 * It will never re-traverse blocks. This means that all inserts should
290 * be new regions or more granular regions (both allowed by insert).
291 */
292static int ocfs2_extent_map_lookup_read(struct inode *inode,
293 u32 cpos,
294 u32 clusters,
295 struct ocfs2_extent_map_entry **ret_ent)
296{
297 int ret;
298 u64 blkno;
299 struct ocfs2_extent_map *em = &OCFS2_I(inode)->ip_map;
300 struct ocfs2_extent_map_entry *ent;
301 struct buffer_head *bh = NULL;
302 struct ocfs2_extent_block *eb;
303 struct ocfs2_dinode *di;
304 struct ocfs2_extent_list *el;
305
306 spin_lock(&OCFS2_I(inode)->ip_lock);
307 ent = ocfs2_extent_map_lookup(em, cpos, clusters, NULL, NULL);
308 if (ent) {
309 if (!ent->e_tree_depth) {
310 spin_unlock(&OCFS2_I(inode)->ip_lock);
311 *ret_ent = ent;
312 return 0;
313 }
314 blkno = le64_to_cpu(ent->e_rec.e_blkno);
315 spin_unlock(&OCFS2_I(inode)->ip_lock);
316
317 ret = ocfs2_read_block(OCFS2_SB(inode->i_sb), blkno, &bh,
318 OCFS2_BH_CACHED, inode);
319 if (ret) {
320 mlog_errno(ret);
321 if (bh)
322 brelse(bh);
323 return ret;
324 }
325 eb = (struct ocfs2_extent_block *)bh->b_data;
326 if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) {
327 OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb, eb);
328 brelse(bh);
329 return -EIO;
330 }
331 el = &eb->h_list;
332 } else {
333 spin_unlock(&OCFS2_I(inode)->ip_lock);
334
335 ret = ocfs2_read_block(OCFS2_SB(inode->i_sb),
336 OCFS2_I(inode)->ip_blkno, &bh,
337 OCFS2_BH_CACHED, inode);
338 if (ret) {
339 mlog_errno(ret);
340 if (bh)
341 brelse(bh);
342 return ret;
343 }
344 di = (struct ocfs2_dinode *)bh->b_data;
345 if (!OCFS2_IS_VALID_DINODE(di)) {
346 brelse(bh);
347 OCFS2_RO_ON_INVALID_DINODE(inode->i_sb, di);
348 return -EIO;
349 }
350 el = &di->id2.i_list;
351 }
352
353 ret = ocfs2_extent_map_find_leaf(inode, cpos, clusters, el);
354 brelse(bh);
355 if (ret) {
356 mlog_errno(ret);
357 return ret;
358 }
359
360 ent = ocfs2_extent_map_lookup(em, cpos, clusters, NULL, NULL);
361 if (!ent) {
362 ret = -ESRCH;
363 mlog_errno(ret);
364 return ret;
365 }
366
367 if (ent->e_tree_depth)
368 BUG(); /* FIXME: Make sure this isn't a corruption */
369
370 *ret_ent = ent;
371
372 return 0;
373}
374
375/*
376 * Callers must hold ip_lock. This can insert pieces of the tree,
377 * thus racing lookup if the lock weren't held.
378 */
379static int ocfs2_extent_map_insert_entry(struct ocfs2_extent_map *em,
380 struct ocfs2_extent_map_entry *ent)
381{
382 struct rb_node **p, *parent;
383 struct ocfs2_extent_map_entry *old_ent;
384
385 old_ent = ocfs2_extent_map_lookup(em, le32_to_cpu(ent->e_rec.e_cpos),
386 le32_to_cpu(ent->e_rec.e_clusters),
387 &p, &parent);
388 if (old_ent)
389 return -EEXIST;
390
391 rb_link_node(&ent->e_node, parent, p);
392 rb_insert_color(&ent->e_node, &em->em_extents);
393
394 return 0;
395}
396
397
398/*
399 * Simple rule: on any return code other than -EAGAIN, anything left
400 * in the insert_context will be freed.
401 */
402static int ocfs2_extent_map_try_insert(struct inode *inode,
403 struct ocfs2_extent_rec *rec,
404 int tree_depth,
405 struct ocfs2_em_insert_context *ctxt)
406{
407 int ret;
408 struct ocfs2_extent_map *em = &OCFS2_I(inode)->ip_map;
409 struct ocfs2_extent_map_entry *old_ent;
410
411 ctxt->need_left = 0;
412 ctxt->need_right = 0;
413 ctxt->old_ent = NULL;
414
415 spin_lock(&OCFS2_I(inode)->ip_lock);
416 ret = ocfs2_extent_map_insert_entry(em, ctxt->new_ent);
417 if (!ret) {
418 ctxt->new_ent = NULL;
419 goto out_unlock;
420 }
421
422 old_ent = ocfs2_extent_map_lookup(em, le32_to_cpu(rec->e_cpos),
423 le32_to_cpu(rec->e_clusters), NULL,
424 NULL);
425
426 if (!old_ent)
427 BUG();
428
429 ret = -EEXIST;
430 if (old_ent->e_tree_depth < tree_depth)
431 goto out_unlock;
432
433 if (old_ent->e_tree_depth == tree_depth) {
434 if (!memcmp(rec, &old_ent->e_rec,
435 sizeof(struct ocfs2_extent_rec)))
436 ret = 0;
437
438 /* FIXME: Should this be ESRCH/EBADR??? */
439 goto out_unlock;
440 }
441
442 /*
443 * We do it in this order specifically so that no actual tree
444 * changes occur until we have all the pieces we need. We
445 * don't want malloc failures to leave an inconsistent tree.
446 * Whenever we drop the lock, another process could be
447 * inserting. Also note that, if another process just beat us
448 * to an insert, we might not need the same pieces we needed
449 * the first go round. In the end, the pieces we need will
450 * be used, and the pieces we don't will be freed.
451 */
452 ctxt->need_left = !!(le32_to_cpu(rec->e_cpos) >
453 le32_to_cpu(old_ent->e_rec.e_cpos));
454 ctxt->need_right = !!((le32_to_cpu(old_ent->e_rec.e_cpos) +
455 le32_to_cpu(old_ent->e_rec.e_clusters)) >
456 (le32_to_cpu(rec->e_cpos) + le32_to_cpu(rec->e_clusters)));
457 ret = -EAGAIN;
458 if (ctxt->need_left) {
459 if (!ctxt->left_ent)
460 goto out_unlock;
461 *(ctxt->left_ent) = *old_ent;
462 ctxt->left_ent->e_rec.e_clusters =
463 cpu_to_le32(le32_to_cpu(rec->e_cpos) -
464 le32_to_cpu(ctxt->left_ent->e_rec.e_cpos));
465 }
466 if (ctxt->need_right) {
467 if (!ctxt->right_ent)
468 goto out_unlock;
469 *(ctxt->right_ent) = *old_ent;
470 ctxt->right_ent->e_rec.e_cpos =
471 cpu_to_le32(le32_to_cpu(rec->e_cpos) +
472 le32_to_cpu(rec->e_clusters));
473 ctxt->right_ent->e_rec.e_clusters =
474 cpu_to_le32((le32_to_cpu(old_ent->e_rec.e_cpos) +
475 le32_to_cpu(old_ent->e_rec.e_clusters)) -
476 le32_to_cpu(ctxt->right_ent->e_rec.e_cpos));
477 }
478
479 rb_erase(&old_ent->e_node, &em->em_extents);
480 /* Now that he's erased, set him up for deletion */
481 ctxt->old_ent = old_ent;
482
483 if (ctxt->need_left) {
484 ret = ocfs2_extent_map_insert_entry(em,
485 ctxt->left_ent);
486 if (ret)
487 goto out_unlock;
488 ctxt->left_ent = NULL;
489 }
490
491 if (ctxt->need_right) {
492 ret = ocfs2_extent_map_insert_entry(em,
493 ctxt->right_ent);
494 if (ret)
495 goto out_unlock;
496 ctxt->right_ent = NULL;
497 }
498
499 ret = ocfs2_extent_map_insert_entry(em, ctxt->new_ent);
500
501 if (!ret)
502 ctxt->new_ent = NULL;
503
504out_unlock:
505 spin_unlock(&OCFS2_I(inode)->ip_lock);
506
507 return ret;
508}
509
510
511static int ocfs2_extent_map_insert(struct inode *inode,
512 struct ocfs2_extent_rec *rec,
513 int tree_depth)
514{
515 int ret;
516 struct ocfs2_em_insert_context ctxt = {0, };
517
518 if ((le32_to_cpu(rec->e_cpos) + le32_to_cpu(rec->e_clusters)) >
519 OCFS2_I(inode)->ip_map.em_clusters) {
520 ret = -EBADR;
521 mlog_errno(ret);
522 return ret;
523 }
524
525 /* Zero e_clusters means a truncated tail record. It better be EOF */
526 if (!rec->e_clusters) {
527 if ((le32_to_cpu(rec->e_cpos) + le32_to_cpu(rec->e_clusters)) !=
528 OCFS2_I(inode)->ip_map.em_clusters) {
529 ret = -EBADR;
530 mlog_errno(ret);
531 return ret;
532 }
533
534 /* Ignore the truncated tail */
535 return 0;
536 }
537
538 ret = -ENOMEM;
539 ctxt.new_ent = kmem_cache_alloc(ocfs2_em_ent_cachep,
540 GFP_KERNEL);
541 if (!ctxt.new_ent) {
542 mlog_errno(ret);
543 return ret;
544 }
545
546 ctxt.new_ent->e_rec = *rec;
547 ctxt.new_ent->e_tree_depth = tree_depth;
548
549 do {
550 ret = -ENOMEM;
551 if (ctxt.need_left && !ctxt.left_ent) {
552 ctxt.left_ent =
553 kmem_cache_alloc(ocfs2_em_ent_cachep,
554 GFP_KERNEL);
555 if (!ctxt.left_ent)
556 break;
557 }
558 if (ctxt.need_right && !ctxt.right_ent) {
559 ctxt.right_ent =
560 kmem_cache_alloc(ocfs2_em_ent_cachep,
561 GFP_KERNEL);
562 if (!ctxt.right_ent)
563 break;
564 }
565
566 ret = ocfs2_extent_map_try_insert(inode, rec,
567 tree_depth, &ctxt);
568 } while (ret == -EAGAIN);
569
570 if (ret < 0)
571 mlog_errno(ret);
572
573 if (ctxt.left_ent)
574 kmem_cache_free(ocfs2_em_ent_cachep, ctxt.left_ent);
575 if (ctxt.right_ent)
576 kmem_cache_free(ocfs2_em_ent_cachep, ctxt.right_ent);
577 if (ctxt.old_ent)
578 kmem_cache_free(ocfs2_em_ent_cachep, ctxt.old_ent);
579 if (ctxt.new_ent)
580 kmem_cache_free(ocfs2_em_ent_cachep, ctxt.new_ent);
581
582 return ret;
583}
584
585/*
586 * Append this record to the tail of the extent map. It must be
587 * tree_depth 0. The record might be an extension of an existing
588 * record, and as such that needs to be handled. eg:
589 *
590 * Existing record in the extent map:
591 *
592 * cpos = 10, len = 10
593 * |---------|
594 *
595 * New Record:
596 *
597 * cpos = 10, len = 20
598 * |------------------|
599 *
600 * The passed record is the new on-disk record. The new_clusters value
601 * is how many clusters were added to the file. If the append is a
602 * contiguous append, the new_clusters has been added to
603 * rec->e_clusters. If the append is an entirely new extent, then
604 * rec->e_clusters is == new_clusters.
605 */
606int ocfs2_extent_map_append(struct inode *inode,
607 struct ocfs2_extent_rec *rec,
608 u32 new_clusters)
609{
610 int ret;
611 struct ocfs2_extent_map *em = &OCFS2_I(inode)->ip_map;
612 struct ocfs2_extent_map_entry *ent;
613 struct ocfs2_extent_rec *old;
614
615 BUG_ON(!new_clusters);
616 BUG_ON(le32_to_cpu(rec->e_clusters) < new_clusters);
617
618 if (em->em_clusters < OCFS2_I(inode)->ip_clusters) {
619 /*
620 * Size changed underneath us on disk. Drop any
621 * straddling records and update our idea of
622 * i_clusters
623 */
624 ocfs2_extent_map_drop(inode, em->em_clusters - 1);
625 em->em_clusters = OCFS2_I(inode)->ip_clusters;
626 }
627
628 mlog_bug_on_msg((le32_to_cpu(rec->e_cpos) +
629 le32_to_cpu(rec->e_clusters)) !=
630 (em->em_clusters + new_clusters),
631 "Inode %"MLFu64":\n"
632 "rec->e_cpos = %u + rec->e_clusters = %u = %u\n"
633 "em->em_clusters = %u + new_clusters = %u = %u\n",
634 OCFS2_I(inode)->ip_blkno,
635 le32_to_cpu(rec->e_cpos), le32_to_cpu(rec->e_clusters),
636 le32_to_cpu(rec->e_cpos) + le32_to_cpu(rec->e_clusters),
637 em->em_clusters, new_clusters,
638 em->em_clusters + new_clusters);
639
640 em->em_clusters += new_clusters;
641
642 ret = -ENOENT;
643 if (le32_to_cpu(rec->e_clusters) > new_clusters) {
644 /* This is a contiguous append */
645 ent = ocfs2_extent_map_lookup(em, le32_to_cpu(rec->e_cpos), 1,
646 NULL, NULL);
647 if (ent) {
648 old = &ent->e_rec;
649 BUG_ON((le32_to_cpu(rec->e_cpos) +
650 le32_to_cpu(rec->e_clusters)) !=
651 (le32_to_cpu(old->e_cpos) +
652 le32_to_cpu(old->e_clusters) +
653 new_clusters));
654 if (ent->e_tree_depth == 0) {
655 BUG_ON(le32_to_cpu(old->e_cpos) !=
656 le32_to_cpu(rec->e_cpos));
657 BUG_ON(le64_to_cpu(old->e_blkno) !=
658 le64_to_cpu(rec->e_blkno));
659 ret = 0;
660 }
661 /*
662 * Let non-leafs fall through as -ENOENT to
663 * force insertion of the new leaf.
664 */
665 le32_add_cpu(&old->e_clusters, new_clusters);
666 }
667 }
668
669 if (ret == -ENOENT)
670 ret = ocfs2_extent_map_insert(inode, rec, 0);
671 if (ret < 0)
672 mlog_errno(ret);
673 return ret;
674}
675
676#if 0
677/* Code here is included but defined out as it completes the extent
678 * map api and may be used in the future. */
679
680/*
681 * Look up the record containing this cluster offset. This record is
682 * part of the extent map. Do not free it. Any changes you make to
683 * it will reflect in the extent map. So, if your last extent
684 * is (cpos = 10, clusters = 10) and you truncate the file by 5
685 * clusters, you can do:
686 *
687 * ret = ocfs2_extent_map_get_rec(em, orig_size - 5, &rec);
688 * rec->e_clusters -= 5;
689 *
690 * The lookup does not read from disk. If the map isn't filled in for
691 * an entry, you won't find it.
692 *
693 * Also note that the returned record is valid until alloc_sem is
694 * dropped. After that, truncate and extend can happen. Caveat Emptor.
695 */
696int ocfs2_extent_map_get_rec(struct inode *inode, u32 cpos,
697 struct ocfs2_extent_rec **rec,
698 int *tree_depth)
699{
700 int ret = -ENOENT;
701 struct ocfs2_extent_map *em = &OCFS2_I(inode)->ip_map;
702 struct ocfs2_extent_map_entry *ent;
703
704 *rec = NULL;
705
706 if (cpos >= OCFS2_I(inode)->ip_clusters)
707 return -EINVAL;
708
709 if (cpos >= em->em_clusters) {
710 /*
711 * Size changed underneath us on disk. Drop any
712 * straddling records and update our idea of
713 * i_clusters
714 */
715 ocfs2_extent_map_drop(inode, em->em_clusters - 1);
716 em->em_clusters = OCFS2_I(inode)->ip_clusters ;
717 }
718
719 ent = ocfs2_extent_map_lookup(&OCFS2_I(inode)->ip_map, cpos, 1,
720 NULL, NULL);
721
722 if (ent) {
723 *rec = &ent->e_rec;
724 if (tree_depth)
725 *tree_depth = ent->e_tree_depth;
726 ret = 0;
727 }
728
729 return ret;
730}
731
732int ocfs2_extent_map_get_clusters(struct inode *inode,
733 u32 v_cpos, int count,
734 u32 *p_cpos, int *ret_count)
735{
736 int ret;
737 u32 coff, ccount;
738 struct ocfs2_extent_map *em = &OCFS2_I(inode)->ip_map;
739 struct ocfs2_extent_map_entry *ent = NULL;
740
741 *p_cpos = ccount = 0;
742
743 if ((v_cpos + count) > OCFS2_I(inode)->ip_clusters)
744 return -EINVAL;
745
746 if ((v_cpos + count) > em->em_clusters) {
747 /*
748 * Size changed underneath us on disk. Drop any
749 * straddling records and update our idea of
750 * i_clusters
751 */
752 ocfs2_extent_map_drop(inode, em->em_clusters - 1);
753 em->em_clusters = OCFS2_I(inode)->ip_clusters;
754 }
755
756
757 ret = ocfs2_extent_map_lookup_read(inode, v_cpos, count, &ent);
758 if (ret)
759 return ret;
760
761 if (ent) {
762 /* We should never find ourselves straddling an interval */
763 if (!ocfs2_extent_rec_contains_clusters(&ent->e_rec,
764 v_cpos,
765 count))
766 return -ESRCH;
767
768 coff = v_cpos - le32_to_cpu(ent->e_rec.e_cpos);
769 *p_cpos = ocfs2_blocks_to_clusters(inode->i_sb,
770 le64_to_cpu(ent->e_rec.e_blkno)) +
771 coff;
772
773 if (ret_count)
774 *ret_count = le32_to_cpu(ent->e_rec.e_clusters) - coff;
775
776 return 0;
777 }
778
779
780 return -ENOENT;
781}
782
783#endif /* 0 */
784
785int ocfs2_extent_map_get_blocks(struct inode *inode,
786 u64 v_blkno, int count,
787 u64 *p_blkno, int *ret_count)
788{
789 int ret;
790 u64 boff;
791 u32 cpos, clusters;
792 int bpc = ocfs2_clusters_to_blocks(inode->i_sb, 1);
793 struct ocfs2_extent_map_entry *ent = NULL;
794 struct ocfs2_extent_map *em = &OCFS2_I(inode)->ip_map;
795 struct ocfs2_extent_rec *rec;
796
797 *p_blkno = 0;
798
799 cpos = ocfs2_blocks_to_clusters(inode->i_sb, v_blkno);
800 clusters = ocfs2_blocks_to_clusters(inode->i_sb,
801 (u64)count + bpc - 1);
802 if ((cpos + clusters) > OCFS2_I(inode)->ip_clusters) {
803 ret = -EINVAL;
804 mlog_errno(ret);
805 return ret;
806 }
807
808 if ((cpos + clusters) > em->em_clusters) {
809 /*
810 * Size changed underneath us on disk. Drop any
811 * straddling records and update our idea of
812 * i_clusters
813 */
814 ocfs2_extent_map_drop(inode, em->em_clusters - 1);
815 em->em_clusters = OCFS2_I(inode)->ip_clusters;
816 }
817
818 ret = ocfs2_extent_map_lookup_read(inode, cpos, clusters, &ent);
819 if (ret) {
820 mlog_errno(ret);
821 return ret;
822 }
823
824 if (ent)
825 {
826 rec = &ent->e_rec;
827
828 /* We should never find ourselves straddling an interval */
829 if (!ocfs2_extent_rec_contains_clusters(rec, cpos, clusters)) {
830 ret = -ESRCH;
831 mlog_errno(ret);
832 return ret;
833 }
834
835 boff = ocfs2_clusters_to_blocks(inode->i_sb, cpos -
836 le32_to_cpu(rec->e_cpos));
837 boff += (v_blkno & (u64)(bpc - 1));
838 *p_blkno = le64_to_cpu(rec->e_blkno) + boff;
839
840 if (ret_count) {
841 *ret_count = ocfs2_clusters_to_blocks(inode->i_sb,
842 le32_to_cpu(rec->e_clusters)) - boff;
843 }
844
845 return 0;
846 }
847
848 return -ENOENT;
849}
850
851int ocfs2_extent_map_init(struct inode *inode)
852{
853 struct ocfs2_extent_map *em = &OCFS2_I(inode)->ip_map;
854
855 em->em_extents = RB_ROOT;
856 em->em_clusters = 0;
857
858 return 0;
859}
860
861/* Needs the lock */
862static void __ocfs2_extent_map_drop(struct inode *inode,
863 u32 new_clusters,
864 struct rb_node **free_head,
865 struct ocfs2_extent_map_entry **tail_ent)
866{
867 struct rb_node *node, *next;
868 struct ocfs2_extent_map *em = &OCFS2_I(inode)->ip_map;
869 struct ocfs2_extent_map_entry *ent;
870
871 *free_head = NULL;
872
873 ent = NULL;
874 node = rb_last(&em->em_extents);
875 while (node)
876 {
877 next = rb_prev(node);
878
879 ent = rb_entry(node, struct ocfs2_extent_map_entry,
880 e_node);
881 if (le32_to_cpu(ent->e_rec.e_cpos) < new_clusters)
882 break;
883
884 rb_erase(&ent->e_node, &em->em_extents);
885
886 node->rb_right = *free_head;
887 *free_head = node;
888
889 ent = NULL;
890 node = next;
891 }
892
893 /* Do we have an entry straddling new_clusters? */
894 if (tail_ent) {
895 if (ent &&
896 ((le32_to_cpu(ent->e_rec.e_cpos) +
897 le32_to_cpu(ent->e_rec.e_clusters)) > new_clusters))
898 *tail_ent = ent;
899 else
900 *tail_ent = NULL;
901 }
902}
903
904static void __ocfs2_extent_map_drop_cleanup(struct rb_node *free_head)
905{
906 struct rb_node *node;
907 struct ocfs2_extent_map_entry *ent;
908
909 while (free_head) {
910 node = free_head;
911 free_head = node->rb_right;
912
913 ent = rb_entry(node, struct ocfs2_extent_map_entry,
914 e_node);
915 kmem_cache_free(ocfs2_em_ent_cachep, ent);
916 }
917}
918
919/*
920 * Remove all entries past new_clusters, inclusive of an entry that
921 * contains new_clusters. This is effectively a cache forget.
922 *
923 * If you want to also clip the last extent by some number of clusters,
924 * you need to call ocfs2_extent_map_trunc().
925 * This code does not check or modify ip_clusters.
926 */
927int ocfs2_extent_map_drop(struct inode *inode, u32 new_clusters)
928{
929 struct rb_node *free_head = NULL;
930 struct ocfs2_extent_map *em = &OCFS2_I(inode)->ip_map;
931 struct ocfs2_extent_map_entry *ent;
932
933 spin_lock(&OCFS2_I(inode)->ip_lock);
934
935 __ocfs2_extent_map_drop(inode, new_clusters, &free_head, &ent);
936
937 if (ent) {
938 rb_erase(&ent->e_node, &em->em_extents);
939 ent->e_node.rb_right = free_head;
940 free_head = &ent->e_node;
941 }
942
943 spin_unlock(&OCFS2_I(inode)->ip_lock);
944
945 if (free_head)
946 __ocfs2_extent_map_drop_cleanup(free_head);
947
948 return 0;
949}
950
951/*
952 * Remove all entries past new_clusters and also clip any extent
953 * straddling new_clusters, if there is one. This does not check
954 * or modify ip_clusters
955 */
956int ocfs2_extent_map_trunc(struct inode *inode, u32 new_clusters)
957{
958 struct rb_node *free_head = NULL;
959 struct ocfs2_extent_map_entry *ent = NULL;
960
961 spin_lock(&OCFS2_I(inode)->ip_lock);
962
963 __ocfs2_extent_map_drop(inode, new_clusters, &free_head, &ent);
964
965 if (ent)
966 ent->e_rec.e_clusters = cpu_to_le32(new_clusters -
967 le32_to_cpu(ent->e_rec.e_cpos));
968
969 OCFS2_I(inode)->ip_map.em_clusters = new_clusters;
970
971 spin_unlock(&OCFS2_I(inode)->ip_lock);
972
973 if (free_head)
974 __ocfs2_extent_map_drop_cleanup(free_head);
975
976 return 0;
977}
978
979int __init init_ocfs2_extent_maps(void)
980{
981 ocfs2_em_ent_cachep =
982 kmem_cache_create("ocfs2_em_ent",
983 sizeof(struct ocfs2_extent_map_entry),
984 0, SLAB_HWCACHE_ALIGN, NULL, NULL);
985 if (!ocfs2_em_ent_cachep)
986 return -ENOMEM;
987
988 return 0;
989}
990
991void __exit exit_ocfs2_extent_maps(void)
992{
993 kmem_cache_destroy(ocfs2_em_ent_cachep);
994}
diff --git a/fs/ocfs2/extent_map.h b/fs/ocfs2/extent_map.h
new file mode 100644
index 000000000000..fa3745efa886
--- /dev/null
+++ b/fs/ocfs2/extent_map.h
@@ -0,0 +1,46 @@
1/* -*- mode: c; c-basic-offset: 8; -*-
2 * vim: noexpandtab sw=8 ts=8 sts=0:
3 *
4 * extent_map.h
5 *
6 * In-memory file extent mappings for OCFS2.
7 *
8 * Copyright (C) 2004 Oracle. All rights reserved.
9 *
10 * This program is free software; you can redistribute it and/or
11 * modify it under the terms of the GNU General Public
12 * License, version 2, as published by the Free Software Foundation.
13 *
14 * This program is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 * General Public License for more details.
18 *
19 * You should have received a copy of the GNU General Public
20 * License along with this program; if not, write to the
21 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
22 * Boston, MA 021110-1307, USA.
23 */
24
25#ifndef _EXTENT_MAP_H
26#define _EXTENT_MAP_H
27
28int init_ocfs2_extent_maps(void);
29void exit_ocfs2_extent_maps(void);
30
31/*
32 * EVERY CALL here except _init, _trunc, and _drop expects alloc_sem
33 * to be held. The allocation cannot change at all while the map is
34 * in the process of being updated.
35 */
36int ocfs2_extent_map_init(struct inode *inode);
37int ocfs2_extent_map_append(struct inode *inode,
38 struct ocfs2_extent_rec *rec,
39 u32 new_clusters);
40int ocfs2_extent_map_get_blocks(struct inode *inode,
41 u64 v_blkno, int count,
42 u64 *p_blkno, int *ret_count);
43int ocfs2_extent_map_drop(struct inode *inode, u32 new_clusters);
44int ocfs2_extent_map_trunc(struct inode *inode, u32 new_clusters);
45
46#endif /* _EXTENT_MAP_H */
diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c
new file mode 100644
index 000000000000..72ae9e3306f4
--- /dev/null
+++ b/fs/ocfs2/file.c
@@ -0,0 +1,1237 @@
1/* -*- mode: c; c-basic-offset: 8; -*-
2 * vim: noexpandtab sw=8 ts=8 sts=0:
3 *
4 * file.c
5 *
6 * File open, close, extend, truncate
7 *
8 * Copyright (C) 2002, 2004 Oracle. All rights reserved.
9 *
10 * This program is free software; you can redistribute it and/or
11 * modify it under the terms of the GNU General Public
12 * License as published by the Free Software Foundation; either
13 * version 2 of the License, or (at your option) any later version.
14 *
15 * This program is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
18 * General Public License for more details.
19 *
20 * You should have received a copy of the GNU General Public
21 * License along with this program; if not, write to the
22 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
23 * Boston, MA 02111-1307, USA.
24 */
25
26#include <linux/fs.h>
27#include <linux/types.h>
28#include <linux/slab.h>
29#include <linux/highmem.h>
30#include <linux/pagemap.h>
31#include <linux/uio.h>
32
33#define MLOG_MASK_PREFIX ML_INODE
34#include <cluster/masklog.h>
35
36#include "ocfs2.h"
37
38#include "alloc.h"
39#include "aops.h"
40#include "dir.h"
41#include "dlmglue.h"
42#include "extent_map.h"
43#include "file.h"
44#include "sysfile.h"
45#include "inode.h"
46#include "journal.h"
47#include "mmap.h"
48#include "suballoc.h"
49#include "super.h"
50
51#include "buffer_head_io.h"
52
53static int ocfs2_sync_inode(struct inode *inode)
54{
55 filemap_fdatawrite(inode->i_mapping);
56 return sync_mapping_buffers(inode->i_mapping);
57}
58
59static int ocfs2_file_open(struct inode *inode, struct file *file)
60{
61 int status;
62 int mode = file->f_flags;
63 struct ocfs2_inode_info *oi = OCFS2_I(inode);
64
65 mlog_entry("(0x%p, 0x%p, '%.*s')\n", inode, file,
66 file->f_dentry->d_name.len, file->f_dentry->d_name.name);
67
68 spin_lock(&oi->ip_lock);
69
70 /* Check that the inode hasn't been wiped from disk by another
71 * node. If it hasn't then we're safe as long as we hold the
72 * spin lock until our increment of open count. */
73 if (OCFS2_I(inode)->ip_flags & OCFS2_INODE_DELETED) {
74 spin_unlock(&oi->ip_lock);
75
76 status = -ENOENT;
77 goto leave;
78 }
79
80 if (mode & O_DIRECT)
81 oi->ip_flags |= OCFS2_INODE_OPEN_DIRECT;
82
83 oi->ip_open_count++;
84 spin_unlock(&oi->ip_lock);
85 status = 0;
86leave:
87 mlog_exit(status);
88 return status;
89}
90
91static int ocfs2_file_release(struct inode *inode, struct file *file)
92{
93 struct ocfs2_inode_info *oi = OCFS2_I(inode);
94
95 mlog_entry("(0x%p, 0x%p, '%.*s')\n", inode, file,
96 file->f_dentry->d_name.len,
97 file->f_dentry->d_name.name);
98
99 spin_lock(&oi->ip_lock);
100 if (!--oi->ip_open_count)
101 oi->ip_flags &= ~OCFS2_INODE_OPEN_DIRECT;
102 spin_unlock(&oi->ip_lock);
103
104 mlog_exit(0);
105
106 return 0;
107}
108
109static int ocfs2_sync_file(struct file *file,
110 struct dentry *dentry,
111 int datasync)
112{
113 int err = 0;
114 journal_t *journal;
115 struct inode *inode = dentry->d_inode;
116 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
117
118 mlog_entry("(0x%p, 0x%p, %d, '%.*s')\n", file, dentry, datasync,
119 dentry->d_name.len, dentry->d_name.name);
120
121 err = ocfs2_sync_inode(dentry->d_inode);
122 if (err)
123 goto bail;
124
125 journal = osb->journal->j_journal;
126 err = journal_force_commit(journal);
127
128bail:
129 mlog_exit(err);
130
131 return (err < 0) ? -EIO : 0;
132}
133
134int ocfs2_set_inode_size(struct ocfs2_journal_handle *handle,
135 struct inode *inode,
136 struct buffer_head *fe_bh,
137 u64 new_i_size)
138{
139 int status;
140
141 mlog_entry_void();
142 i_size_write(inode, new_i_size);
143 inode->i_blocks = ocfs2_align_bytes_to_sectors(new_i_size);
144 inode->i_ctime = inode->i_mtime = CURRENT_TIME;
145
146 status = ocfs2_mark_inode_dirty(handle, inode, fe_bh);
147 if (status < 0) {
148 mlog_errno(status);
149 goto bail;
150 }
151
152bail:
153 mlog_exit(status);
154 return status;
155}
156
157static int ocfs2_simple_size_update(struct inode *inode,
158 struct buffer_head *di_bh,
159 u64 new_i_size)
160{
161 int ret;
162 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
163 struct ocfs2_journal_handle *handle = NULL;
164
165 handle = ocfs2_start_trans(osb, NULL,
166 OCFS2_INODE_UPDATE_CREDITS);
167 if (handle == NULL) {
168 ret = -ENOMEM;
169 mlog_errno(ret);
170 goto out;
171 }
172
173 ret = ocfs2_set_inode_size(handle, inode, di_bh,
174 new_i_size);
175 if (ret < 0)
176 mlog_errno(ret);
177
178 ocfs2_commit_trans(handle);
179out:
180 return ret;
181}
182
183static int ocfs2_orphan_for_truncate(struct ocfs2_super *osb,
184 struct inode *inode,
185 struct buffer_head *fe_bh,
186 u64 new_i_size)
187{
188 int status;
189 struct ocfs2_journal_handle *handle;
190
191 mlog_entry_void();
192
193 /* TODO: This needs to actually orphan the inode in this
194 * transaction. */
195
196 handle = ocfs2_start_trans(osb, NULL, OCFS2_INODE_UPDATE_CREDITS);
197 if (IS_ERR(handle)) {
198 status = PTR_ERR(handle);
199 mlog_errno(status);
200 goto out;
201 }
202
203 status = ocfs2_set_inode_size(handle, inode, fe_bh, new_i_size);
204 if (status < 0)
205 mlog_errno(status);
206
207 ocfs2_commit_trans(handle);
208out:
209 mlog_exit(status);
210 return status;
211}
212
213static int ocfs2_truncate_file(struct inode *inode,
214 struct buffer_head *di_bh,
215 u64 new_i_size)
216{
217 int status = 0;
218 struct ocfs2_dinode *fe = NULL;
219 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
220 struct ocfs2_truncate_context *tc = NULL;
221
222 mlog_entry("(inode = %"MLFu64", new_i_size = %"MLFu64"\n",
223 OCFS2_I(inode)->ip_blkno, new_i_size);
224
225 truncate_inode_pages(inode->i_mapping, new_i_size);
226
227 fe = (struct ocfs2_dinode *) di_bh->b_data;
228 if (!OCFS2_IS_VALID_DINODE(fe)) {
229 OCFS2_RO_ON_INVALID_DINODE(inode->i_sb, fe);
230 status = -EIO;
231 goto bail;
232 }
233
234 mlog_bug_on_msg(le64_to_cpu(fe->i_size) != i_size_read(inode),
235 "Inode %"MLFu64", inode i_size = %lld != di "
236 "i_size = %"MLFu64", i_flags = 0x%x\n",
237 OCFS2_I(inode)->ip_blkno,
238 i_size_read(inode),
239 le64_to_cpu(fe->i_size), le32_to_cpu(fe->i_flags));
240
241 if (new_i_size > le64_to_cpu(fe->i_size)) {
242 mlog(0, "asked to truncate file with size (%"MLFu64") "
243 "to size (%"MLFu64")!\n",
244 le64_to_cpu(fe->i_size), new_i_size);
245 status = -EINVAL;
246 mlog_errno(status);
247 goto bail;
248 }
249
250 mlog(0, "inode %"MLFu64", i_size = %"MLFu64", new_i_size = %"MLFu64"\n",
251 le64_to_cpu(fe->i_blkno), le64_to_cpu(fe->i_size), new_i_size);
252
253 /* lets handle the simple truncate cases before doing any more
254 * cluster locking. */
255 if (new_i_size == le64_to_cpu(fe->i_size))
256 goto bail;
257
258 if (le32_to_cpu(fe->i_clusters) ==
259 ocfs2_clusters_for_bytes(osb->sb, new_i_size)) {
260 mlog(0, "fe->i_clusters = %u, so we do a simple truncate\n",
261 fe->i_clusters);
262 /* No allocation change is required, so lets fast path
263 * this truncate. */
264 status = ocfs2_simple_size_update(inode, di_bh, new_i_size);
265 if (status < 0)
266 mlog_errno(status);
267 goto bail;
268 }
269
270 /* This forces other nodes to sync and drop their pages */
271 status = ocfs2_data_lock(inode, 1);
272 if (status < 0) {
273 mlog_errno(status);
274 goto bail;
275 }
276 ocfs2_data_unlock(inode, 1);
277
278 /* alright, we're going to need to do a full blown alloc size
279 * change. Orphan the inode so that recovery can complete the
280 * truncate if necessary. This does the task of marking
281 * i_size. */
282 status = ocfs2_orphan_for_truncate(osb, inode, di_bh, new_i_size);
283 if (status < 0) {
284 mlog_errno(status);
285 goto bail;
286 }
287
288 status = ocfs2_prepare_truncate(osb, inode, di_bh, &tc);
289 if (status < 0) {
290 mlog_errno(status);
291 goto bail;
292 }
293
294 status = ocfs2_commit_truncate(osb, inode, di_bh, tc);
295 if (status < 0) {
296 mlog_errno(status);
297 goto bail;
298 }
299
300 /* TODO: orphan dir cleanup here. */
301bail:
302
303 mlog_exit(status);
304 return status;
305}
306
307/*
308 * extend allocation only here.
309 * we'll update all the disk stuff, and oip->alloc_size
310 *
311 * expect stuff to be locked, a transaction started and enough data /
312 * metadata reservations in the contexts.
313 *
314 * Will return -EAGAIN, and a reason if a restart is needed.
315 * If passed in, *reason will always be set, even in error.
316 */
317int ocfs2_do_extend_allocation(struct ocfs2_super *osb,
318 struct inode *inode,
319 u32 clusters_to_add,
320 struct buffer_head *fe_bh,
321 struct ocfs2_journal_handle *handle,
322 struct ocfs2_alloc_context *data_ac,
323 struct ocfs2_alloc_context *meta_ac,
324 enum ocfs2_alloc_restarted *reason_ret)
325{
326 int status = 0;
327 int free_extents;
328 struct ocfs2_dinode *fe = (struct ocfs2_dinode *) fe_bh->b_data;
329 enum ocfs2_alloc_restarted reason = RESTART_NONE;
330 u32 bit_off, num_bits;
331 u64 block;
332
333 BUG_ON(!clusters_to_add);
334
335 free_extents = ocfs2_num_free_extents(osb, inode, fe);
336 if (free_extents < 0) {
337 status = free_extents;
338 mlog_errno(status);
339 goto leave;
340 }
341
342 /* there are two cases which could cause us to EAGAIN in the
343 * we-need-more-metadata case:
344 * 1) we haven't reserved *any*
345 * 2) we are so fragmented, we've needed to add metadata too
346 * many times. */
347 if (!free_extents && !meta_ac) {
348 mlog(0, "we haven't reserved any metadata!\n");
349 status = -EAGAIN;
350 reason = RESTART_META;
351 goto leave;
352 } else if ((!free_extents)
353 && (ocfs2_alloc_context_bits_left(meta_ac)
354 < ocfs2_extend_meta_needed(fe))) {
355 mlog(0, "filesystem is really fragmented...\n");
356 status = -EAGAIN;
357 reason = RESTART_META;
358 goto leave;
359 }
360
361 status = ocfs2_claim_clusters(osb, handle, data_ac, 1,
362 &bit_off, &num_bits);
363 if (status < 0) {
364 if (status != -ENOSPC)
365 mlog_errno(status);
366 goto leave;
367 }
368
369 BUG_ON(num_bits > clusters_to_add);
370
371 /* reserve our write early -- insert_extent may update the inode */
372 status = ocfs2_journal_access(handle, inode, fe_bh,
373 OCFS2_JOURNAL_ACCESS_WRITE);
374 if (status < 0) {
375 mlog_errno(status);
376 goto leave;
377 }
378
379 block = ocfs2_clusters_to_blocks(osb->sb, bit_off);
380 mlog(0, "Allocating %u clusters at block %u for inode %"MLFu64"\n",
381 num_bits, bit_off, OCFS2_I(inode)->ip_blkno);
382 status = ocfs2_insert_extent(osb, handle, inode, fe_bh, block,
383 num_bits, meta_ac);
384 if (status < 0) {
385 mlog_errno(status);
386 goto leave;
387 }
388
389 le32_add_cpu(&fe->i_clusters, num_bits);
390 spin_lock(&OCFS2_I(inode)->ip_lock);
391 OCFS2_I(inode)->ip_clusters = le32_to_cpu(fe->i_clusters);
392 spin_unlock(&OCFS2_I(inode)->ip_lock);
393
394 status = ocfs2_journal_dirty(handle, fe_bh);
395 if (status < 0) {
396 mlog_errno(status);
397 goto leave;
398 }
399
400 clusters_to_add -= num_bits;
401
402 if (clusters_to_add) {
403 mlog(0, "need to alloc once more, clusters = %u, wanted = "
404 "%u\n", fe->i_clusters, clusters_to_add);
405 status = -EAGAIN;
406 reason = RESTART_TRANS;
407 }
408
409leave:
410 mlog_exit(status);
411 if (reason_ret)
412 *reason_ret = reason;
413 return status;
414}
415
416static int ocfs2_extend_allocation(struct inode *inode,
417 u32 clusters_to_add)
418{
419 int status = 0;
420 int restart_func = 0;
421 int drop_alloc_sem = 0;
422 int credits, num_free_extents;
423 u32 prev_clusters;
424 struct buffer_head *bh = NULL;
425 struct ocfs2_dinode *fe = NULL;
426 struct ocfs2_journal_handle *handle = NULL;
427 struct ocfs2_alloc_context *data_ac = NULL;
428 struct ocfs2_alloc_context *meta_ac = NULL;
429 enum ocfs2_alloc_restarted why;
430 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
431
432 mlog_entry("(clusters_to_add = %u)\n", clusters_to_add);
433
434 status = ocfs2_read_block(osb, OCFS2_I(inode)->ip_blkno, &bh,
435 OCFS2_BH_CACHED, inode);
436 if (status < 0) {
437 mlog_errno(status);
438 goto leave;
439 }
440
441 fe = (struct ocfs2_dinode *) bh->b_data;
442 if (!OCFS2_IS_VALID_DINODE(fe)) {
443 OCFS2_RO_ON_INVALID_DINODE(inode->i_sb, fe);
444 status = -EIO;
445 goto leave;
446 }
447
448restart_all:
449 BUG_ON(le32_to_cpu(fe->i_clusters) != OCFS2_I(inode)->ip_clusters);
450
451 mlog(0, "extend inode %"MLFu64", i_size = %lld, fe->i_clusters = %u, "
452 "clusters_to_add = %u\n",
453 OCFS2_I(inode)->ip_blkno, i_size_read(inode),
454 fe->i_clusters, clusters_to_add);
455
456 handle = ocfs2_alloc_handle(osb);
457 if (handle == NULL) {
458 status = -ENOMEM;
459 mlog_errno(status);
460 goto leave;
461 }
462
463 num_free_extents = ocfs2_num_free_extents(osb,
464 inode,
465 fe);
466 if (num_free_extents < 0) {
467 status = num_free_extents;
468 mlog_errno(status);
469 goto leave;
470 }
471
472 if (!num_free_extents) {
473 status = ocfs2_reserve_new_metadata(osb,
474 handle,
475 fe,
476 &meta_ac);
477 if (status < 0) {
478 if (status != -ENOSPC)
479 mlog_errno(status);
480 goto leave;
481 }
482 }
483
484 status = ocfs2_reserve_clusters(osb,
485 handle,
486 clusters_to_add,
487 &data_ac);
488 if (status < 0) {
489 if (status != -ENOSPC)
490 mlog_errno(status);
491 goto leave;
492 }
493
494 /* blocks peope in read/write from reading our allocation
495 * until we're done changing it. We depend on i_sem to block
496 * other extend/truncate calls while we're here. Ordering wrt
497 * start_trans is important here -- always do it before! */
498 down_write(&OCFS2_I(inode)->ip_alloc_sem);
499 drop_alloc_sem = 1;
500
501 credits = ocfs2_calc_extend_credits(osb->sb, fe, clusters_to_add);
502 handle = ocfs2_start_trans(osb, handle, credits);
503 if (IS_ERR(handle)) {
504 status = PTR_ERR(handle);
505 handle = NULL;
506 mlog_errno(status);
507 goto leave;
508 }
509
510restarted_transaction:
511 /* reserve a write to the file entry early on - that we if we
512 * run out of credits in the allocation path, we can still
513 * update i_size. */
514 status = ocfs2_journal_access(handle, inode, bh,
515 OCFS2_JOURNAL_ACCESS_WRITE);
516 if (status < 0) {
517 mlog_errno(status);
518 goto leave;
519 }
520
521 prev_clusters = OCFS2_I(inode)->ip_clusters;
522
523 status = ocfs2_do_extend_allocation(osb,
524 inode,
525 clusters_to_add,
526 bh,
527 handle,
528 data_ac,
529 meta_ac,
530 &why);
531 if ((status < 0) && (status != -EAGAIN)) {
532 if (status != -ENOSPC)
533 mlog_errno(status);
534 goto leave;
535 }
536
537 status = ocfs2_journal_dirty(handle, bh);
538 if (status < 0) {
539 mlog_errno(status);
540 goto leave;
541 }
542
543 spin_lock(&OCFS2_I(inode)->ip_lock);
544 clusters_to_add -= (OCFS2_I(inode)->ip_clusters - prev_clusters);
545 spin_unlock(&OCFS2_I(inode)->ip_lock);
546
547 if (why != RESTART_NONE && clusters_to_add) {
548 if (why == RESTART_META) {
549 mlog(0, "restarting function.\n");
550 restart_func = 1;
551 } else {
552 BUG_ON(why != RESTART_TRANS);
553
554 mlog(0, "restarting transaction.\n");
555 /* TODO: This can be more intelligent. */
556 credits = ocfs2_calc_extend_credits(osb->sb,
557 fe,
558 clusters_to_add);
559 status = ocfs2_extend_trans(handle, credits);
560 if (status < 0) {
561 /* handle still has to be committed at
562 * this point. */
563 status = -ENOMEM;
564 mlog_errno(status);
565 goto leave;
566 }
567 goto restarted_transaction;
568 }
569 }
570
571 mlog(0, "fe: i_clusters = %u, i_size=%"MLFu64"\n",
572 fe->i_clusters, fe->i_size);
573 mlog(0, "inode: ip_clusters=%u, i_size=%lld\n",
574 OCFS2_I(inode)->ip_clusters, i_size_read(inode));
575
576leave:
577 if (drop_alloc_sem) {
578 up_write(&OCFS2_I(inode)->ip_alloc_sem);
579 drop_alloc_sem = 0;
580 }
581 if (handle) {
582 ocfs2_commit_trans(handle);
583 handle = NULL;
584 }
585 if (data_ac) {
586 ocfs2_free_alloc_context(data_ac);
587 data_ac = NULL;
588 }
589 if (meta_ac) {
590 ocfs2_free_alloc_context(meta_ac);
591 meta_ac = NULL;
592 }
593 if ((!status) && restart_func) {
594 restart_func = 0;
595 goto restart_all;
596 }
597 if (bh) {
598 brelse(bh);
599 bh = NULL;
600 }
601
602 mlog_exit(status);
603 return status;
604}
605
606/* Some parts of this taken from generic_cont_expand, which turned out
607 * to be too fragile to do exactly what we need without us having to
608 * worry about recursive locking in ->commit_write(). */
609static int ocfs2_write_zero_page(struct inode *inode,
610 u64 size)
611{
612 struct address_space *mapping = inode->i_mapping;
613 struct page *page;
614 unsigned long index;
615 unsigned int offset;
616 struct ocfs2_journal_handle *handle = NULL;
617 int ret;
618
619 offset = (size & (PAGE_CACHE_SIZE-1)); /* Within page */
620 /* ugh. in prepare/commit_write, if from==to==start of block, we
621 ** skip the prepare. make sure we never send an offset for the start
622 ** of a block
623 */
624 if ((offset & (inode->i_sb->s_blocksize - 1)) == 0) {
625 offset++;
626 }
627 index = size >> PAGE_CACHE_SHIFT;
628
629 page = grab_cache_page(mapping, index);
630 if (!page) {
631 ret = -ENOMEM;
632 mlog_errno(ret);
633 goto out;
634 }
635
636 ret = ocfs2_prepare_write(NULL, page, offset, offset);
637 if (ret < 0) {
638 mlog_errno(ret);
639 goto out_unlock;
640 }
641
642 if (ocfs2_should_order_data(inode)) {
643 handle = ocfs2_start_walk_page_trans(inode, page, offset,
644 offset);
645 if (IS_ERR(handle)) {
646 ret = PTR_ERR(handle);
647 handle = NULL;
648 goto out_unlock;
649 }
650 }
651
652 /* must not update i_size! */
653 ret = block_commit_write(page, offset, offset);
654 if (ret < 0)
655 mlog_errno(ret);
656 else
657 ret = 0;
658
659 if (handle)
660 ocfs2_commit_trans(handle);
661out_unlock:
662 unlock_page(page);
663 page_cache_release(page);
664out:
665 return ret;
666}
667
668static int ocfs2_zero_extend(struct inode *inode,
669 u64 zero_to_size)
670{
671 int ret = 0;
672 u64 start_off;
673 struct super_block *sb = inode->i_sb;
674
675 start_off = ocfs2_align_bytes_to_blocks(sb, i_size_read(inode));
676 while (start_off < zero_to_size) {
677 ret = ocfs2_write_zero_page(inode, start_off);
678 if (ret < 0) {
679 mlog_errno(ret);
680 goto out;
681 }
682
683 start_off += sb->s_blocksize;
684 }
685
686out:
687 return ret;
688}
689
690static int ocfs2_extend_file(struct inode *inode,
691 struct buffer_head *di_bh,
692 u64 new_i_size)
693{
694 int ret = 0;
695 u32 clusters_to_add;
696
697 /* setattr sometimes calls us like this. */
698 if (new_i_size == 0)
699 goto out;
700
701 if (i_size_read(inode) == new_i_size)
702 goto out;
703 BUG_ON(new_i_size < i_size_read(inode));
704
705 clusters_to_add = ocfs2_clusters_for_bytes(inode->i_sb, new_i_size) -
706 OCFS2_I(inode)->ip_clusters;
707
708 if (clusters_to_add) {
709 ret = ocfs2_extend_allocation(inode, clusters_to_add);
710 if (ret < 0) {
711 mlog_errno(ret);
712 goto out;
713 }
714
715 ret = ocfs2_zero_extend(inode, new_i_size);
716 if (ret < 0) {
717 mlog_errno(ret);
718 goto out;
719 }
720 }
721
722 /* No allocation required, we just use this helper to
723 * do a trivial update of i_size. */
724 ret = ocfs2_simple_size_update(inode, di_bh, new_i_size);
725 if (ret < 0) {
726 mlog_errno(ret);
727 goto out;
728 }
729
730out:
731 return ret;
732}
733
734int ocfs2_setattr(struct dentry *dentry, struct iattr *attr)
735{
736 int status = 0, size_change;
737 struct inode *inode = dentry->d_inode;
738 struct super_block *sb = inode->i_sb;
739 struct ocfs2_super *osb = OCFS2_SB(sb);
740 struct buffer_head *bh = NULL;
741 struct ocfs2_journal_handle *handle = NULL;
742
743 mlog_entry("(0x%p, '%.*s')\n", dentry,
744 dentry->d_name.len, dentry->d_name.name);
745
746 if (attr->ia_valid & ATTR_MODE)
747 mlog(0, "mode change: %d\n", attr->ia_mode);
748 if (attr->ia_valid & ATTR_UID)
749 mlog(0, "uid change: %d\n", attr->ia_uid);
750 if (attr->ia_valid & ATTR_GID)
751 mlog(0, "gid change: %d\n", attr->ia_gid);
752 if (attr->ia_valid & ATTR_SIZE)
753 mlog(0, "size change...\n");
754 if (attr->ia_valid & (ATTR_ATIME | ATTR_MTIME | ATTR_CTIME))
755 mlog(0, "time change...\n");
756
757#define OCFS2_VALID_ATTRS (ATTR_ATIME | ATTR_MTIME | ATTR_CTIME | ATTR_SIZE \
758 | ATTR_GID | ATTR_UID | ATTR_MODE)
759 if (!(attr->ia_valid & OCFS2_VALID_ATTRS)) {
760 mlog(0, "can't handle attrs: 0x%x\n", attr->ia_valid);
761 return 0;
762 }
763
764 status = inode_change_ok(inode, attr);
765 if (status)
766 return status;
767
768 size_change = S_ISREG(inode->i_mode) && attr->ia_valid & ATTR_SIZE;
769 if (size_change) {
770 status = ocfs2_rw_lock(inode, 1);
771 if (status < 0) {
772 mlog_errno(status);
773 goto bail;
774 }
775 }
776
777 status = ocfs2_meta_lock(inode, NULL, &bh, 1);
778 if (status < 0) {
779 if (status != -ENOENT)
780 mlog_errno(status);
781 goto bail_unlock_rw;
782 }
783
784 if (size_change && attr->ia_size != i_size_read(inode)) {
785 if (i_size_read(inode) > attr->ia_size)
786 status = ocfs2_truncate_file(inode, bh, attr->ia_size);
787 else
788 status = ocfs2_extend_file(inode, bh, attr->ia_size);
789 if (status < 0) {
790 if (status != -ENOSPC)
791 mlog_errno(status);
792 status = -ENOSPC;
793 goto bail_unlock;
794 }
795 }
796
797 handle = ocfs2_start_trans(osb, NULL, OCFS2_INODE_UPDATE_CREDITS);
798 if (IS_ERR(handle)) {
799 status = PTR_ERR(handle);
800 mlog_errno(status);
801 goto bail_unlock;
802 }
803
804 status = inode_setattr(inode, attr);
805 if (status < 0) {
806 mlog_errno(status);
807 goto bail_commit;
808 }
809
810 status = ocfs2_mark_inode_dirty(handle, inode, bh);
811 if (status < 0)
812 mlog_errno(status);
813
814bail_commit:
815 ocfs2_commit_trans(handle);
816bail_unlock:
817 ocfs2_meta_unlock(inode, 1);
818bail_unlock_rw:
819 if (size_change)
820 ocfs2_rw_unlock(inode, 1);
821bail:
822 if (bh)
823 brelse(bh);
824
825 mlog_exit(status);
826 return status;
827}
828
829int ocfs2_getattr(struct vfsmount *mnt,
830 struct dentry *dentry,
831 struct kstat *stat)
832{
833 struct inode *inode = dentry->d_inode;
834 struct super_block *sb = dentry->d_inode->i_sb;
835 struct ocfs2_super *osb = sb->s_fs_info;
836 int err;
837
838 mlog_entry_void();
839
840 err = ocfs2_inode_revalidate(dentry);
841 if (err) {
842 if (err != -ENOENT)
843 mlog_errno(err);
844 goto bail;
845 }
846
847 generic_fillattr(inode, stat);
848
849 /* We set the blksize from the cluster size for performance */
850 stat->blksize = osb->s_clustersize;
851
852bail:
853 mlog_exit(err);
854
855 return err;
856}
857
858static int ocfs2_write_remove_suid(struct inode *inode)
859{
860 int ret;
861 struct buffer_head *bh = NULL;
862 struct ocfs2_inode_info *oi = OCFS2_I(inode);
863 struct ocfs2_journal_handle *handle;
864 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
865 struct ocfs2_dinode *di;
866
867 mlog_entry("(Inode %"MLFu64", mode 0%o)\n", oi->ip_blkno,
868 inode->i_mode);
869
870 handle = ocfs2_start_trans(osb, NULL, OCFS2_INODE_UPDATE_CREDITS);
871 if (handle == NULL) {
872 ret = -ENOMEM;
873 mlog_errno(ret);
874 goto out;
875 }
876
877 ret = ocfs2_read_block(osb, oi->ip_blkno, &bh, OCFS2_BH_CACHED, inode);
878 if (ret < 0) {
879 mlog_errno(ret);
880 goto out_trans;
881 }
882
883 ret = ocfs2_journal_access(handle, inode, bh,
884 OCFS2_JOURNAL_ACCESS_WRITE);
885 if (ret < 0) {
886 mlog_errno(ret);
887 goto out_bh;
888 }
889
890 inode->i_mode &= ~S_ISUID;
891 if ((inode->i_mode & S_ISGID) && (inode->i_mode & S_IXGRP))
892 inode->i_mode &= ~S_ISGID;
893
894 di = (struct ocfs2_dinode *) bh->b_data;
895 di->i_mode = cpu_to_le16(inode->i_mode);
896
897 ret = ocfs2_journal_dirty(handle, bh);
898 if (ret < 0)
899 mlog_errno(ret);
900out_bh:
901 brelse(bh);
902out_trans:
903 ocfs2_commit_trans(handle);
904out:
905 mlog_exit(ret);
906 return ret;
907}
908
909static inline int ocfs2_write_should_remove_suid(struct inode *inode)
910{
911 mode_t mode = inode->i_mode;
912
913 if (!capable(CAP_FSETID)) {
914 if (unlikely(mode & S_ISUID))
915 return 1;
916
917 if (unlikely((mode & S_ISGID) && (mode & S_IXGRP)))
918 return 1;
919 }
920 return 0;
921}
922
923static ssize_t ocfs2_file_aio_write(struct kiocb *iocb,
924 const char __user *buf,
925 size_t count,
926 loff_t pos)
927{
928 struct iovec local_iov = { .iov_base = (void __user *)buf,
929 .iov_len = count };
930 int ret, rw_level = -1, meta_level = -1, have_alloc_sem = 0;
931 u32 clusters;
932 struct file *filp = iocb->ki_filp;
933 struct inode *inode = filp->f_dentry->d_inode;
934 loff_t newsize, saved_pos;
935#ifdef OCFS2_ORACORE_WORKAROUNDS
936 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
937#endif
938
939 mlog_entry("(0x%p, 0x%p, %u, '%.*s')\n", filp, buf,
940 (unsigned int)count,
941 filp->f_dentry->d_name.len,
942 filp->f_dentry->d_name.name);
943
944 /* happy write of zero bytes */
945 if (count == 0)
946 return 0;
947
948 if (!inode) {
949 mlog(0, "bad inode\n");
950 return -EIO;
951 }
952
953#ifdef OCFS2_ORACORE_WORKAROUNDS
954 /* ugh, work around some applications which open everything O_DIRECT +
955 * O_APPEND and really don't mean to use O_DIRECT. */
956 if (osb->s_mount_opt & OCFS2_MOUNT_COMPAT_OCFS &&
957 (filp->f_flags & O_APPEND) && (filp->f_flags & O_DIRECT))
958 filp->f_flags &= ~O_DIRECT;
959#endif
960
961 down(&inode->i_sem);
962 /* to match setattr's i_sem -> i_alloc_sem -> rw_lock ordering */
963 if (filp->f_flags & O_DIRECT) {
964 have_alloc_sem = 1;
965 down_read(&inode->i_alloc_sem);
966 }
967
968 /* concurrent O_DIRECT writes are allowed */
969 rw_level = (filp->f_flags & O_DIRECT) ? 0 : 1;
970 ret = ocfs2_rw_lock(inode, rw_level);
971 if (ret < 0) {
972 rw_level = -1;
973 mlog_errno(ret);
974 goto out;
975 }
976
977 /*
978 * We sample i_size under a read level meta lock to see if our write
979 * is extending the file, if it is we back off and get a write level
980 * meta lock.
981 */
982 meta_level = (filp->f_flags & O_APPEND) ? 1 : 0;
983 for(;;) {
984 ret = ocfs2_meta_lock(inode, NULL, NULL, meta_level);
985 if (ret < 0) {
986 meta_level = -1;
987 mlog_errno(ret);
988 goto out;
989 }
990
991 /* Clear suid / sgid if necessary. We do this here
992 * instead of later in the write path because
993 * remove_suid() calls ->setattr without any hint that
994 * we may have already done our cluster locking. Since
995 * ocfs2_setattr() *must* take cluster locks to
996 * proceeed, this will lead us to recursively lock the
997 * inode. There's also the dinode i_size state which
998 * can be lost via setattr during extending writes (we
999 * set inode->i_size at the end of a write. */
1000 if (ocfs2_write_should_remove_suid(inode)) {
1001 if (meta_level == 0) {
1002 ocfs2_meta_unlock(inode, meta_level);
1003 meta_level = 1;
1004 continue;
1005 }
1006
1007 ret = ocfs2_write_remove_suid(inode);
1008 if (ret < 0) {
1009 mlog_errno(ret);
1010 goto out;
1011 }
1012 }
1013
1014 /* work on a copy of ppos until we're sure that we won't have
1015 * to recalculate it due to relocking. */
1016 if (filp->f_flags & O_APPEND) {
1017 saved_pos = i_size_read(inode);
1018 mlog(0, "O_APPEND: inode->i_size=%llu\n", saved_pos);
1019 } else {
1020 saved_pos = iocb->ki_pos;
1021 }
1022 newsize = count + saved_pos;
1023
1024 mlog(0, "pos=%lld newsize=%"MLFu64" cursize=%lld\n",
1025 saved_pos, newsize, i_size_read(inode));
1026
1027 /* No need for a higher level metadata lock if we're
1028 * never going past i_size. */
1029 if (newsize <= i_size_read(inode))
1030 break;
1031
1032 if (meta_level == 0) {
1033 ocfs2_meta_unlock(inode, meta_level);
1034 meta_level = 1;
1035 continue;
1036 }
1037
1038 spin_lock(&OCFS2_I(inode)->ip_lock);
1039 clusters = ocfs2_clusters_for_bytes(inode->i_sb, newsize) -
1040 OCFS2_I(inode)->ip_clusters;
1041 spin_unlock(&OCFS2_I(inode)->ip_lock);
1042
1043 mlog(0, "Writing at EOF, may need more allocation: "
1044 "i_size = %lld, newsize = %"MLFu64", need %u clusters\n",
1045 i_size_read(inode), newsize, clusters);
1046
1047 /* We only want to continue the rest of this loop if
1048 * our extend will actually require more
1049 * allocation. */
1050 if (!clusters)
1051 break;
1052
1053 ret = ocfs2_extend_allocation(inode, clusters);
1054 if (ret < 0) {
1055 if (ret != -ENOSPC)
1056 mlog_errno(ret);
1057 goto out;
1058 }
1059
1060 /* Fill any holes which would've been created by this
1061 * write. If we're O_APPEND, this will wind up
1062 * (correctly) being a noop. */
1063 ret = ocfs2_zero_extend(inode, (u64) newsize - count);
1064 if (ret < 0) {
1065 mlog_errno(ret);
1066 goto out;
1067 }
1068 break;
1069 }
1070
1071 /* ok, we're done with i_size and alloc work */
1072 iocb->ki_pos = saved_pos;
1073 ocfs2_meta_unlock(inode, meta_level);
1074 meta_level = -1;
1075
1076 /* communicate with ocfs2_dio_end_io */
1077 ocfs2_iocb_set_rw_locked(iocb);
1078
1079#ifdef OCFS2_ORACORE_WORKAROUNDS
1080 if (osb->s_mount_opt & OCFS2_MOUNT_COMPAT_OCFS &&
1081 filp->f_flags & O_DIRECT) {
1082 unsigned int saved_flags = filp->f_flags;
1083 int sector_size = 1 << osb->s_sectsize_bits;
1084
1085 if ((saved_pos & (sector_size - 1)) ||
1086 (count & (sector_size - 1)) ||
1087 ((unsigned long)buf & (sector_size - 1))) {
1088 filp->f_flags |= O_SYNC;
1089 filp->f_flags &= ~O_DIRECT;
1090 }
1091
1092 ret = generic_file_aio_write_nolock(iocb, &local_iov, 1,
1093 &iocb->ki_pos);
1094
1095 filp->f_flags = saved_flags;
1096 } else
1097#endif
1098 ret = generic_file_aio_write_nolock(iocb, &local_iov, 1,
1099 &iocb->ki_pos);
1100
1101 /* buffered aio wouldn't have proper lock coverage today */
1102 BUG_ON(ret == -EIOCBQUEUED && !(filp->f_flags & O_DIRECT));
1103
1104 /*
1105 * deep in g_f_a_w_n()->ocfs2_direct_IO we pass in a ocfs2_dio_end_io
1106 * function pointer which is called when o_direct io completes so that
1107 * it can unlock our rw lock. (it's the clustered equivalent of
1108 * i_alloc_sem; protects truncate from racing with pending ios).
1109 * Unfortunately there are error cases which call end_io and others
1110 * that don't. so we don't have to unlock the rw_lock if either an
1111 * async dio is going to do it in the future or an end_io after an
1112 * error has already done it.
1113 */
1114 if (ret == -EIOCBQUEUED || !ocfs2_iocb_is_rw_locked(iocb)) {
1115 rw_level = -1;
1116 have_alloc_sem = 0;
1117 }
1118
1119out:
1120 if (meta_level != -1)
1121 ocfs2_meta_unlock(inode, meta_level);
1122 if (have_alloc_sem)
1123 up_read(&inode->i_alloc_sem);
1124 if (rw_level != -1)
1125 ocfs2_rw_unlock(inode, rw_level);
1126 up(&inode->i_sem);
1127
1128 mlog_exit(ret);
1129 return ret;
1130}
1131
1132static ssize_t ocfs2_file_aio_read(struct kiocb *iocb,
1133 char __user *buf,
1134 size_t count,
1135 loff_t pos)
1136{
1137 int ret = 0, rw_level = -1, have_alloc_sem = 0;
1138 struct file *filp = iocb->ki_filp;
1139 struct inode *inode = filp->f_dentry->d_inode;
1140#ifdef OCFS2_ORACORE_WORKAROUNDS
1141 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
1142#endif
1143
1144 mlog_entry("(0x%p, 0x%p, %u, '%.*s')\n", filp, buf,
1145 (unsigned int)count,
1146 filp->f_dentry->d_name.len,
1147 filp->f_dentry->d_name.name);
1148
1149 if (!inode) {
1150 ret = -EINVAL;
1151 mlog_errno(ret);
1152 goto bail;
1153 }
1154
1155#ifdef OCFS2_ORACORE_WORKAROUNDS
1156 if (osb->s_mount_opt & OCFS2_MOUNT_COMPAT_OCFS) {
1157 if (filp->f_flags & O_DIRECT) {
1158 int sector_size = 1 << osb->s_sectsize_bits;
1159
1160 if ((pos & (sector_size - 1)) ||
1161 (count & (sector_size - 1)) ||
1162 ((unsigned long)buf & (sector_size - 1)) ||
1163 (i_size_read(inode) & (sector_size -1))) {
1164 filp->f_flags &= ~O_DIRECT;
1165 }
1166 }
1167 }
1168#endif
1169
1170 /*
1171 * buffered reads protect themselves in ->readpage(). O_DIRECT reads
1172 * need locks to protect pending reads from racing with truncate.
1173 */
1174 if (filp->f_flags & O_DIRECT) {
1175 down_read(&inode->i_alloc_sem);
1176 have_alloc_sem = 1;
1177
1178 ret = ocfs2_rw_lock(inode, 0);
1179 if (ret < 0) {
1180 mlog_errno(ret);
1181 goto bail;
1182 }
1183 rw_level = 0;
1184 /* communicate with ocfs2_dio_end_io */
1185 ocfs2_iocb_set_rw_locked(iocb);
1186 }
1187
1188 ret = generic_file_aio_read(iocb, buf, count, iocb->ki_pos);
1189 if (ret == -EINVAL)
1190 mlog(ML_ERROR, "generic_file_aio_read returned -EINVAL\n");
1191
1192 /* buffered aio wouldn't have proper lock coverage today */
1193 BUG_ON(ret == -EIOCBQUEUED && !(filp->f_flags & O_DIRECT));
1194
1195 /* see ocfs2_file_aio_write */
1196 if (ret == -EIOCBQUEUED || !ocfs2_iocb_is_rw_locked(iocb)) {
1197 rw_level = -1;
1198 have_alloc_sem = 0;
1199 }
1200
1201bail:
1202 if (have_alloc_sem)
1203 up_read(&inode->i_alloc_sem);
1204 if (rw_level != -1)
1205 ocfs2_rw_unlock(inode, rw_level);
1206 mlog_exit(ret);
1207
1208 return ret;
1209}
1210
1211struct inode_operations ocfs2_file_iops = {
1212 .setattr = ocfs2_setattr,
1213 .getattr = ocfs2_getattr,
1214};
1215
1216struct inode_operations ocfs2_special_file_iops = {
1217 .setattr = ocfs2_setattr,
1218 .getattr = ocfs2_getattr,
1219};
1220
1221struct file_operations ocfs2_fops = {
1222 .read = do_sync_read,
1223 .write = do_sync_write,
1224 .sendfile = generic_file_sendfile,
1225 .mmap = ocfs2_mmap,
1226 .fsync = ocfs2_sync_file,
1227 .release = ocfs2_file_release,
1228 .open = ocfs2_file_open,
1229 .aio_read = ocfs2_file_aio_read,
1230 .aio_write = ocfs2_file_aio_write,
1231};
1232
1233struct file_operations ocfs2_dops = {
1234 .read = generic_read_dir,
1235 .readdir = ocfs2_readdir,
1236 .fsync = ocfs2_sync_file,
1237};
diff --git a/fs/ocfs2/file.h b/fs/ocfs2/file.h
new file mode 100644
index 000000000000..a5ea33b24060
--- /dev/null
+++ b/fs/ocfs2/file.h
@@ -0,0 +1,57 @@
1/* -*- mode: c; c-basic-offset: 8; -*-
2 * vim: noexpandtab sw=8 ts=8 sts=0:
3 *
4 * file.h
5 *
6 * Function prototypes
7 *
8 * Copyright (C) 2002, 2004 Oracle. All rights reserved.
9 *
10 * This program is free software; you can redistribute it and/or
11 * modify it under the terms of the GNU General Public
12 * License as published by the Free Software Foundation; either
13 * version 2 of the License, or (at your option) any later version.
14 *
15 * This program is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
18 * General Public License for more details.
19 *
20 * You should have received a copy of the GNU General Public
21 * License along with this program; if not, write to the
22 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
23 * Boston, MA 02111-1307, USA.
24 */
25
26#ifndef OCFS2_FILE_H
27#define OCFS2_FILE_H
28
29extern struct file_operations ocfs2_fops;
30extern struct file_operations ocfs2_dops;
31extern struct inode_operations ocfs2_file_iops;
32extern struct inode_operations ocfs2_special_file_iops;
33struct ocfs2_alloc_context;
34
/*
 * Restart hint handed back through the 'reason' argument of
 * ocfs2_do_extend_allocation().
 */
enum ocfs2_alloc_restarted {
	RESTART_NONE	= 0,	/* no restart requested */
	RESTART_TRANS	= 1,	/* clusters remain: extend the
				 * transaction and call again */
	RESTART_META	= 2	/* caller starts the whole extend over
				 * (NOTE(review): presumably to pick up
				 * a metadata reservation -- confirm) */
};
40int ocfs2_do_extend_allocation(struct ocfs2_super *osb,
41 struct inode *inode,
42 u32 clusters_to_add,
43 struct buffer_head *fe_bh,
44 struct ocfs2_journal_handle *handle,
45 struct ocfs2_alloc_context *data_ac,
46 struct ocfs2_alloc_context *meta_ac,
47 enum ocfs2_alloc_restarted *reason);
48int ocfs2_setattr(struct dentry *dentry, struct iattr *attr);
49int ocfs2_getattr(struct vfsmount *mnt, struct dentry *dentry,
50 struct kstat *stat);
51
52int ocfs2_set_inode_size(struct ocfs2_journal_handle *handle,
53 struct inode *inode,
54 struct buffer_head *fe_bh,
55 u64 new_i_size);
56
57#endif /* OCFS2_FILE_H */
diff --git a/fs/ocfs2/heartbeat.c b/fs/ocfs2/heartbeat.c
new file mode 100644
index 000000000000..0bbd22f46c80
--- /dev/null
+++ b/fs/ocfs2/heartbeat.c
@@ -0,0 +1,378 @@
1/* -*- mode: c; c-basic-offset: 8; -*-
2 * vim: noexpandtab sw=8 ts=8 sts=0:
3 *
4 * heartbeat.c
5 *
6 * Register ourselves with the heartbeat service, keep our node maps
7 * up to date, and fire off recovery when needed.
8 *
9 * Copyright (C) 2002, 2004 Oracle. All rights reserved.
10 *
11 * This program is free software; you can redistribute it and/or
12 * modify it under the terms of the GNU General Public
13 * License as published by the Free Software Foundation; either
14 * version 2 of the License, or (at your option) any later version.
15 *
16 * This program is distributed in the hope that it will be useful,
17 * but WITHOUT ANY WARRANTY; without even the implied warranty of
18 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
19 * General Public License for more details.
20 *
21 * You should have received a copy of the GNU General Public
22 * License along with this program; if not, write to the
23 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
23 * Boston, MA 02111-1307, USA.
25 */
26
27#include <linux/fs.h>
28#include <linux/types.h>
29#include <linux/slab.h>
30#include <linux/highmem.h>
31#include <linux/kmod.h>
32
33#include <cluster/heartbeat.h>
34#include <cluster/nodemanager.h>
35
36#include <dlm/dlmapi.h>
37
38#define MLOG_MASK_PREFIX ML_SUPER
39#include <cluster/masklog.h>
40
41#include "ocfs2.h"
42
43#include "alloc.h"
44#include "heartbeat.h"
45#include "inode.h"
46#include "journal.h"
47#include "vote.h"
48
49#include "buffer_head_io.h"
50
51#define OCFS2_HB_NODE_DOWN_PRI (0x0000002)
52#define OCFS2_HB_NODE_UP_PRI OCFS2_HB_NODE_DOWN_PRI
53
54static inline void __ocfs2_node_map_set_bit(struct ocfs2_node_map *map,
55 int bit);
56static inline void __ocfs2_node_map_clear_bit(struct ocfs2_node_map *map,
57 int bit);
58static inline int __ocfs2_node_map_is_empty(struct ocfs2_node_map *map);
59static void __ocfs2_node_map_dup(struct ocfs2_node_map *target,
60 struct ocfs2_node_map *from);
61static void __ocfs2_node_map_set(struct ocfs2_node_map *target,
62 struct ocfs2_node_map *from);
63
64void ocfs2_init_node_maps(struct ocfs2_super *osb)
65{
66 spin_lock_init(&osb->node_map_lock);
67 ocfs2_node_map_init(&osb->mounted_map);
68 ocfs2_node_map_init(&osb->recovery_map);
69 ocfs2_node_map_init(&osb->umount_map);
70}
71
72static void ocfs2_do_node_down(int node_num,
73 struct ocfs2_super *osb)
74{
75 BUG_ON(osb->node_num == node_num);
76
77 mlog(0, "ocfs2: node down event for %d\n", node_num);
78
79 if (!osb->dlm) {
80 /*
81 * No DLM means we're not even ready to participate yet.
82 * We check the slots after the DLM comes up, so we will
83 * notice the node death then. We can safely ignore it
84 * here.
85 */
86 return;
87 }
88
89 if (ocfs2_node_map_test_bit(osb, &osb->umount_map, node_num)) {
90 /* If a node is in the umount map, then we've been
91 * expecting him to go down and we know ahead of time
92 * that recovery is not necessary. */
93 ocfs2_node_map_clear_bit(osb, &osb->umount_map, node_num);
94 return;
95 }
96
97 ocfs2_recovery_thread(osb, node_num);
98
99 ocfs2_remove_node_from_vote_queues(osb, node_num);
100}
101
/*
 * Heartbeat "node down" callback.  @data is the ocfs2_super that was
 * handed to o2hb_setup_callback(); @node itself is not used.
 */
static void ocfs2_hb_node_down_cb(struct o2nm_node *node,
				  int node_num,
				  void *data)
{
	struct ocfs2_super *osb = (struct ocfs2_super *) data;

	ocfs2_do_node_down(node_num, osb);
}
108
109/* Called from the dlm when it's about to evict a node. We may also
110 * get a heartbeat callback later. */
111static void ocfs2_dlm_eviction_cb(int node_num,
112 void *data)
113{
114 struct ocfs2_super *osb = (struct ocfs2_super *) data;
115 struct super_block *sb = osb->sb;
116
117 mlog(ML_NOTICE, "device (%u,%u): dlm has evicted node %d\n",
118 MAJOR(sb->s_dev), MINOR(sb->s_dev), node_num);
119
120 ocfs2_do_node_down(node_num, osb);
121}
122
123static void ocfs2_hb_node_up_cb(struct o2nm_node *node,
124 int node_num,
125 void *data)
126{
127 struct ocfs2_super *osb = data;
128
129 BUG_ON(osb->node_num == node_num);
130
131 mlog(0, "node up event for %d\n", node_num);
132 ocfs2_node_map_clear_bit(osb, &osb->umount_map, node_num);
133}
134
135void ocfs2_setup_hb_callbacks(struct ocfs2_super *osb)
136{
137 o2hb_setup_callback(&osb->osb_hb_down, O2HB_NODE_DOWN_CB,
138 ocfs2_hb_node_down_cb, osb,
139 OCFS2_HB_NODE_DOWN_PRI);
140
141 o2hb_setup_callback(&osb->osb_hb_up, O2HB_NODE_UP_CB,
142 ocfs2_hb_node_up_cb, osb, OCFS2_HB_NODE_UP_PRI);
143
144 /* Not exactly a heartbeat callback, but leads to essentially
145 * the same path so we set it up here. */
146 dlm_setup_eviction_cb(&osb->osb_eviction_cb,
147 ocfs2_dlm_eviction_cb,
148 osb);
149}
150
151/* Most functions here are just stubs for now... */
152int ocfs2_register_hb_callbacks(struct ocfs2_super *osb)
153{
154 int status;
155
156 status = o2hb_register_callback(&osb->osb_hb_down);
157 if (status < 0) {
158 mlog_errno(status);
159 goto bail;
160 }
161
162 status = o2hb_register_callback(&osb->osb_hb_up);
163 if (status < 0)
164 mlog_errno(status);
165
166bail:
167 return status;
168}
169
170void ocfs2_clear_hb_callbacks(struct ocfs2_super *osb)
171{
172 int status;
173
174 status = o2hb_unregister_callback(&osb->osb_hb_down);
175 if (status < 0)
176 mlog_errno(status);
177
178 status = o2hb_unregister_callback(&osb->osb_hb_up);
179 if (status < 0)
180 mlog_errno(status);
181}
182
183void ocfs2_stop_heartbeat(struct ocfs2_super *osb)
184{
185 int ret;
186 char *argv[5], *envp[3];
187
188 if (!osb->uuid_str) {
189 /* This can happen if we don't get far enough in mount... */
190 mlog(0, "No UUID with which to stop heartbeat!\n\n");
191 return;
192 }
193
194 argv[0] = (char *)o2nm_get_hb_ctl_path();
195 argv[1] = "-K";
196 argv[2] = "-u";
197 argv[3] = osb->uuid_str;
198 argv[4] = NULL;
199
200 mlog(0, "Run: %s %s %s %s\n", argv[0], argv[1], argv[2], argv[3]);
201
202 /* minimal command environment taken from cpu_run_sbin_hotplug */
203 envp[0] = "HOME=/";
204 envp[1] = "PATH=/sbin:/bin:/usr/sbin:/usr/bin";
205 envp[2] = NULL;
206
207 ret = call_usermodehelper(argv[0], argv, envp, 1);
208 if (ret < 0)
209 mlog_errno(ret);
210}
211
212/* special case -1 for now
213 * TODO: should *really* make sure the calling func never passes -1!! */
214void ocfs2_node_map_init(struct ocfs2_node_map *map)
215{
216 map->num_nodes = OCFS2_NODE_MAP_MAX_NODES;
217 memset(map->map, 0, BITS_TO_LONGS(OCFS2_NODE_MAP_MAX_NODES) *
218 sizeof(unsigned long));
219}
220
221static inline void __ocfs2_node_map_set_bit(struct ocfs2_node_map *map,
222 int bit)
223{
224 set_bit(bit, map->map);
225}
226
227void ocfs2_node_map_set_bit(struct ocfs2_super *osb,
228 struct ocfs2_node_map *map,
229 int bit)
230{
231 if (bit==-1)
232 return;
233 BUG_ON(bit >= map->num_nodes);
234 spin_lock(&osb->node_map_lock);
235 __ocfs2_node_map_set_bit(map, bit);
236 spin_unlock(&osb->node_map_lock);
237}
238
239static inline void __ocfs2_node_map_clear_bit(struct ocfs2_node_map *map,
240 int bit)
241{
242 clear_bit(bit, map->map);
243}
244
245void ocfs2_node_map_clear_bit(struct ocfs2_super *osb,
246 struct ocfs2_node_map *map,
247 int bit)
248{
249 if (bit==-1)
250 return;
251 BUG_ON(bit >= map->num_nodes);
252 spin_lock(&osb->node_map_lock);
253 __ocfs2_node_map_clear_bit(map, bit);
254 spin_unlock(&osb->node_map_lock);
255}
256
257int ocfs2_node_map_test_bit(struct ocfs2_super *osb,
258 struct ocfs2_node_map *map,
259 int bit)
260{
261 int ret;
262 if (bit >= map->num_nodes) {
263 mlog(ML_ERROR, "bit=%d map->num_nodes=%d\n", bit, map->num_nodes);
264 BUG();
265 }
266 spin_lock(&osb->node_map_lock);
267 ret = test_bit(bit, map->map);
268 spin_unlock(&osb->node_map_lock);
269 return ret;
270}
271
272static inline int __ocfs2_node_map_is_empty(struct ocfs2_node_map *map)
273{
274 int bit;
275 bit = find_next_bit(map->map, map->num_nodes, 0);
276 if (bit < map->num_nodes)
277 return 0;
278 return 1;
279}
280
281int ocfs2_node_map_is_empty(struct ocfs2_super *osb,
282 struct ocfs2_node_map *map)
283{
284 int ret;
285 BUG_ON(map->num_nodes == 0);
286 spin_lock(&osb->node_map_lock);
287 ret = __ocfs2_node_map_is_empty(map);
288 spin_unlock(&osb->node_map_lock);
289 return ret;
290}
291
292static void __ocfs2_node_map_dup(struct ocfs2_node_map *target,
293 struct ocfs2_node_map *from)
294{
295 BUG_ON(from->num_nodes == 0);
296 ocfs2_node_map_init(target);
297 __ocfs2_node_map_set(target, from);
298}
299
300/* returns 1 if bit is the only bit set in target, 0 otherwise */
301int ocfs2_node_map_is_only(struct ocfs2_super *osb,
302 struct ocfs2_node_map *target,
303 int bit)
304{
305 struct ocfs2_node_map temp;
306 int ret;
307
308 spin_lock(&osb->node_map_lock);
309 __ocfs2_node_map_dup(&temp, target);
310 __ocfs2_node_map_clear_bit(&temp, bit);
311 ret = __ocfs2_node_map_is_empty(&temp);
312 spin_unlock(&osb->node_map_lock);
313
314 return ret;
315}
316
317static void __ocfs2_node_map_set(struct ocfs2_node_map *target,
318 struct ocfs2_node_map *from)
319{
320 int num_longs, i;
321
322 BUG_ON(target->num_nodes != from->num_nodes);
323 BUG_ON(target->num_nodes == 0);
324
325 num_longs = BITS_TO_LONGS(target->num_nodes);
326 for (i = 0; i < num_longs; i++)
327 target->map[i] = from->map[i];
328}
329
330/* Returns whether the recovery bit was actually set - it may not be
331 * if a node is still marked as needing recovery */
332int ocfs2_recovery_map_set(struct ocfs2_super *osb,
333 int num)
334{
335 int set = 0;
336
337 spin_lock(&osb->node_map_lock);
338
339 __ocfs2_node_map_clear_bit(&osb->mounted_map, num);
340
341 if (!test_bit(num, osb->recovery_map.map)) {
342 __ocfs2_node_map_set_bit(&osb->recovery_map, num);
343 set = 1;
344 }
345
346 spin_unlock(&osb->node_map_lock);
347
348 return set;
349}
350
351void ocfs2_recovery_map_clear(struct ocfs2_super *osb,
352 int num)
353{
354 ocfs2_node_map_clear_bit(osb, &osb->recovery_map, num);
355}
356
357int ocfs2_node_map_iterate(struct ocfs2_super *osb,
358 struct ocfs2_node_map *map,
359 int idx)
360{
361 int i = idx;
362
363 idx = O2NM_INVALID_NODE_NUM;
364 spin_lock(&osb->node_map_lock);
365 if ((i != O2NM_INVALID_NODE_NUM) &&
366 (i >= 0) &&
367 (i < map->num_nodes)) {
368 while(i < map->num_nodes) {
369 if (test_bit(i, map->map)) {
370 idx = i;
371 break;
372 }
373 i++;
374 }
375 }
376 spin_unlock(&osb->node_map_lock);
377 return idx;
378}
diff --git a/fs/ocfs2/heartbeat.h b/fs/ocfs2/heartbeat.h
new file mode 100644
index 000000000000..e8fb079122e4
--- /dev/null
+++ b/fs/ocfs2/heartbeat.h
@@ -0,0 +1,67 @@
1/* -*- mode: c; c-basic-offset: 8; -*-
2 * vim: noexpandtab sw=8 ts=8 sts=0:
3 *
4 * heartbeat.h
5 *
6 * Function prototypes
7 *
8 * Copyright (C) 2002, 2004 Oracle. All rights reserved.
9 *
10 * This program is free software; you can redistribute it and/or
11 * modify it under the terms of the GNU General Public
12 * License as published by the Free Software Foundation; either
13 * version 2 of the License, or (at your option) any later version.
14 *
15 * This program is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
18 * General Public License for more details.
19 *
20 * You should have received a copy of the GNU General Public
21 * License along with this program; if not, write to the
22 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
23 * Boston, MA 02111-1307, USA.
24 */
25
26#ifndef OCFS2_HEARTBEAT_H
27#define OCFS2_HEARTBEAT_H
28
29void ocfs2_init_node_maps(struct ocfs2_super *osb);
30
31void ocfs2_setup_hb_callbacks(struct ocfs2_super *osb);
32int ocfs2_register_hb_callbacks(struct ocfs2_super *osb);
33void ocfs2_clear_hb_callbacks(struct ocfs2_super *osb);
34void ocfs2_stop_heartbeat(struct ocfs2_super *osb);
35
36/* node map functions - used to keep track of mounted and in-recovery
37 * nodes. */
38void ocfs2_node_map_init(struct ocfs2_node_map *map);
39int ocfs2_node_map_is_empty(struct ocfs2_super *osb,
40 struct ocfs2_node_map *map);
41void ocfs2_node_map_set_bit(struct ocfs2_super *osb,
42 struct ocfs2_node_map *map,
43 int bit);
44void ocfs2_node_map_clear_bit(struct ocfs2_super *osb,
45 struct ocfs2_node_map *map,
46 int bit);
47int ocfs2_node_map_test_bit(struct ocfs2_super *osb,
48 struct ocfs2_node_map *map,
49 int bit);
50int ocfs2_node_map_iterate(struct ocfs2_super *osb,
51 struct ocfs2_node_map *map,
52 int idx);
/* Convenience wrapper: start ocfs2_node_map_iterate() at bit 0. */
static inline int ocfs2_node_map_first_set_bit(struct ocfs2_super *osb,
					       struct ocfs2_node_map *map)
{
	const int first = 0;

	return ocfs2_node_map_iterate(osb, map, first);
}
58int ocfs2_recovery_map_set(struct ocfs2_super *osb,
59 int num);
60void ocfs2_recovery_map_clear(struct ocfs2_super *osb,
61 int num);
62/* returns 1 if bit is the only bit set in target, 0 otherwise */
63int ocfs2_node_map_is_only(struct ocfs2_super *osb,
64 struct ocfs2_node_map *target,
65 int bit);
66
67#endif /* OCFS2_HEARTBEAT_H */
diff --git a/fs/ocfs2/inode.c b/fs/ocfs2/inode.c
new file mode 100644
index 000000000000..a91ba4dec936
--- /dev/null
+++ b/fs/ocfs2/inode.c
@@ -0,0 +1,1140 @@
1/* -*- mode: c; c-basic-offset: 8; -*-
2 * vim: noexpandtab sw=8 ts=8 sts=0:
3 *
4 * inode.c
5 *
6 * vfs' aops, fops, dops and iops
7 *
8 * Copyright (C) 2002, 2004 Oracle. All rights reserved.
9 *
10 * This program is free software; you can redistribute it and/or
11 * modify it under the terms of the GNU General Public
12 * License as published by the Free Software Foundation; either
13 * version 2 of the License, or (at your option) any later version.
14 *
15 * This program is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
18 * General Public License for more details.
19 *
20 * You should have received a copy of the GNU General Public
21 * License along with this program; if not, write to the
22 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
23 * Boston, MA 02111-1307, USA.
24 */
25
26#include <linux/fs.h>
27#include <linux/types.h>
28#include <linux/slab.h>
29#include <linux/highmem.h>
30#include <linux/pagemap.h>
31#include <linux/smp_lock.h>
32
33#include <asm/byteorder.h>
34
35#define MLOG_MASK_PREFIX ML_INODE
36#include <cluster/masklog.h>
37
38#include "ocfs2.h"
39
40#include "alloc.h"
41#include "dlmglue.h"
42#include "extent_map.h"
43#include "file.h"
44#include "inode.h"
45#include "journal.h"
46#include "namei.h"
47#include "suballoc.h"
48#include "super.h"
49#include "symlink.h"
50#include "sysfile.h"
51#include "uptodate.h"
52#include "vote.h"
53
54#include "buffer_head_io.h"
55
56#define OCFS2_FI_FLAG_NOWAIT 0x1
57#define OCFS2_FI_FLAG_DELETE 0x2
58struct ocfs2_find_inode_args
59{
60 u64 fi_blkno;
61 unsigned long fi_ino;
62 unsigned int fi_flags;
63};
64
65static int ocfs2_read_locked_inode(struct inode *inode,
66 struct ocfs2_find_inode_args *args);
67static int ocfs2_init_locked_inode(struct inode *inode, void *opaque);
68static int ocfs2_find_actor(struct inode *inode, void *opaque);
69static int ocfs2_truncate_for_delete(struct ocfs2_super *osb,
70 struct inode *inode,
71 struct buffer_head *fe_bh);
72
73struct inode *ocfs2_ilookup_for_vote(struct ocfs2_super *osb,
74 u64 blkno,
75 int delete_vote)
76{
77 struct ocfs2_find_inode_args args;
78
79 /* ocfs2_ilookup_for_vote should *only* be called from the
80 * vote thread */
81 BUG_ON(current != osb->vote_task);
82
83 args.fi_blkno = blkno;
84 args.fi_flags = OCFS2_FI_FLAG_NOWAIT;
85 if (delete_vote)
86 args.fi_flags |= OCFS2_FI_FLAG_DELETE;
87 args.fi_ino = ino_from_blkno(osb->sb, blkno);
88 return ilookup5(osb->sb, args.fi_ino, ocfs2_find_actor, &args);
89}
90
91struct inode *ocfs2_iget(struct ocfs2_super *osb, u64 blkno)
92{
93 struct inode *inode = NULL;
94 struct super_block *sb = osb->sb;
95 struct ocfs2_find_inode_args args;
96
97 mlog_entry("(blkno = %"MLFu64")\n", blkno);
98
99 /* Ok. By now we've either got the offsets passed to us by the
100 * caller, or we just pulled them off the bh. Lets do some
101 * sanity checks to make sure they're OK. */
102 if (blkno == 0) {
103 inode = ERR_PTR(-EINVAL);
104 mlog_errno(PTR_ERR(inode));
105 goto bail;
106 }
107
108 args.fi_blkno = blkno;
109 args.fi_flags = 0;
110 args.fi_ino = ino_from_blkno(sb, blkno);
111
112 inode = iget5_locked(sb, args.fi_ino, ocfs2_find_actor,
113 ocfs2_init_locked_inode, &args);
114 /* inode was *not* in the inode cache. 2.6.x requires
115 * us to do our own read_inode call and unlock it
116 * afterwards. */
117 if (inode && inode->i_state & I_NEW) {
118 mlog(0, "Inode was not in inode cache, reading it.\n");
119 ocfs2_read_locked_inode(inode, &args);
120 unlock_new_inode(inode);
121 }
122 if (inode == NULL) {
123 inode = ERR_PTR(-ENOMEM);
124 mlog_errno(PTR_ERR(inode));
125 goto bail;
126 }
127 if (is_bad_inode(inode)) {
128 iput(inode);
129 inode = ERR_PTR(-ESTALE);
130 mlog_errno(PTR_ERR(inode));
131 goto bail;
132 }
133
134bail:
135 if (!IS_ERR(inode)) {
136 mlog(0, "returning inode with number %"MLFu64"\n",
137 OCFS2_I(inode)->ip_blkno);
138 mlog_exit_ptr(inode);
139 } else
140 mlog_errno(PTR_ERR(inode));
141
142 return inode;
143}
144
145
146/*
147 * here's how inodes get read from disk:
148 * iget5_locked -> find_actor -> OCFS2_FIND_ACTOR
149 * found? : return the in-memory inode
150 * not found? : get_new_inode -> OCFS2_INIT_LOCKED_INODE
151 */
152
153static int ocfs2_find_actor(struct inode *inode, void *opaque)
154{
155 struct ocfs2_find_inode_args *args = NULL;
156 struct ocfs2_inode_info *oi = OCFS2_I(inode);
157 int ret = 0;
158
159 mlog_entry("(0x%p, %lu, 0x%p)\n", inode, inode->i_ino, opaque);
160
161 args = opaque;
162
163 mlog_bug_on_msg(!inode, "No inode in find actor!\n");
164
165 if (oi->ip_blkno != args->fi_blkno)
166 goto bail;
167
168 /* OCFS2_FI_FLAG_NOWAIT is *only* set from
169 * ocfs2_ilookup_for_vote which won't create an inode for one
170 * that isn't found. The vote thread which doesn't want to get
171 * an inode which is in the process of going away - otherwise
172 * the call to __wait_on_freeing_inode in find_inode_fast will
173 * cause it to deadlock on an inode which may be waiting on a
174 * vote (or lock release) in delete_inode */
175 if ((args->fi_flags & OCFS2_FI_FLAG_NOWAIT) &&
176 (inode->i_state & (I_FREEING|I_CLEAR))) {
177 /* As stated above, we're not going to return an
178 * inode. In the case of a delete vote, the voting
179 * code is going to signal the other node to go
180 * ahead. Mark that state here, so this freeing inode
181 * has the state when it gets to delete_inode. */
182 if (args->fi_flags & OCFS2_FI_FLAG_DELETE) {
183 spin_lock(&oi->ip_lock);
184 ocfs2_mark_inode_remotely_deleted(inode);
185 spin_unlock(&oi->ip_lock);
186 }
187 goto bail;
188 }
189
190 ret = 1;
191bail:
192 mlog_exit(ret);
193 return ret;
194}
195
196/*
197 * initialize the new inode, but don't do anything that would cause
198 * us to sleep.
199 * return 0 on success, 1 on failure
200 */
201static int ocfs2_init_locked_inode(struct inode *inode, void *opaque)
202{
203 struct ocfs2_find_inode_args *args = opaque;
204
205 mlog_entry("inode = %p, opaque = %p\n", inode, opaque);
206
207 inode->i_ino = args->fi_ino;
208 OCFS2_I(inode)->ip_blkno = args->fi_blkno;
209
210 mlog_exit(0);
211 return 0;
212}
213
214int ocfs2_populate_inode(struct inode *inode, struct ocfs2_dinode *fe,
215 int create_ino)
216{
217 struct super_block *sb;
218 struct ocfs2_super *osb;
219 int status = -EINVAL;
220
221 mlog_entry("(0x%p, size:%"MLFu64")\n", inode, fe->i_size);
222
223 sb = inode->i_sb;
224 osb = OCFS2_SB(sb);
225
226 /* this means that read_inode cannot create a superblock inode
227 * today. change if needed. */
228 if (!OCFS2_IS_VALID_DINODE(fe) ||
229 !(fe->i_flags & cpu_to_le32(OCFS2_VALID_FL))) {
230 mlog(ML_ERROR, "Invalid dinode: i_ino=%lu, i_blkno=%"MLFu64", "
231 "signature = %.*s, flags = 0x%x\n",
232 inode->i_ino, le64_to_cpu(fe->i_blkno), 7,
233 fe->i_signature, le32_to_cpu(fe->i_flags));
234 goto bail;
235 }
236
237 if (le32_to_cpu(fe->i_fs_generation) != osb->fs_generation) {
238 mlog(ML_ERROR, "file entry generation does not match "
239 "superblock! osb->fs_generation=%x, "
240 "fe->i_fs_generation=%x\n",
241 osb->fs_generation, le32_to_cpu(fe->i_fs_generation));
242 goto bail;
243 }
244
245 inode->i_version = 1;
246 inode->i_generation = le32_to_cpu(fe->i_generation);
247 inode->i_rdev = huge_decode_dev(le64_to_cpu(fe->id1.dev1.i_rdev));
248 inode->i_mode = le16_to_cpu(fe->i_mode);
249 inode->i_uid = le32_to_cpu(fe->i_uid);
250 inode->i_gid = le32_to_cpu(fe->i_gid);
251 inode->i_blksize = (u32)osb->s_clustersize;
252
253 /* Fast symlinks will have i_size but no allocated clusters. */
254 if (S_ISLNK(inode->i_mode) && !fe->i_clusters)
255 inode->i_blocks = 0;
256 else
257 inode->i_blocks =
258 ocfs2_align_bytes_to_sectors(le64_to_cpu(fe->i_size));
259 inode->i_mapping->a_ops = &ocfs2_aops;
260 inode->i_flags |= S_NOATIME;
261 inode->i_atime.tv_sec = le64_to_cpu(fe->i_atime);
262 inode->i_atime.tv_nsec = le32_to_cpu(fe->i_atime_nsec);
263 inode->i_mtime.tv_sec = le64_to_cpu(fe->i_mtime);
264 inode->i_mtime.tv_nsec = le32_to_cpu(fe->i_mtime_nsec);
265 inode->i_ctime.tv_sec = le64_to_cpu(fe->i_ctime);
266 inode->i_ctime.tv_nsec = le32_to_cpu(fe->i_ctime_nsec);
267
268 if (OCFS2_I(inode)->ip_blkno != le64_to_cpu(fe->i_blkno))
269 mlog(ML_ERROR,
270 "ip_blkno %"MLFu64" != i_blkno %"MLFu64"!\n",
271 OCFS2_I(inode)->ip_blkno, fe->i_blkno);
272
273 OCFS2_I(inode)->ip_clusters = le32_to_cpu(fe->i_clusters);
274 OCFS2_I(inode)->ip_orphaned_slot = OCFS2_INVALID_SLOT;
275
276 if (create_ino)
277 inode->i_ino = ino_from_blkno(inode->i_sb,
278 le64_to_cpu(fe->i_blkno));
279
280 mlog(0, "blkno = %"MLFu64", ino = %lu, create_ino = %s\n",
281 fe->i_blkno, inode->i_ino, create_ino ? "true" : "false");
282
283 inode->i_nlink = le16_to_cpu(fe->i_links_count);
284
285 if (fe->i_flags & cpu_to_le32(OCFS2_LOCAL_ALLOC_FL)) {
286 OCFS2_I(inode)->ip_flags |= OCFS2_INODE_BITMAP;
287 mlog(0, "local alloc inode: i_ino=%lu\n", inode->i_ino);
288 } else if (fe->i_flags & cpu_to_le32(OCFS2_BITMAP_FL)) {
289 OCFS2_I(inode)->ip_flags |= OCFS2_INODE_BITMAP;
290 } else if (fe->i_flags & cpu_to_le32(OCFS2_SUPER_BLOCK_FL)) {
291 mlog(0, "superblock inode: i_ino=%lu\n", inode->i_ino);
292 /* we can't actually hit this as read_inode can't
293 * handle superblocks today ;-) */
294 BUG();
295 }
296
297 switch (inode->i_mode & S_IFMT) {
298 case S_IFREG:
299 inode->i_fop = &ocfs2_fops;
300 inode->i_op = &ocfs2_file_iops;
301 i_size_write(inode, le64_to_cpu(fe->i_size));
302 break;
303 case S_IFDIR:
304 inode->i_op = &ocfs2_dir_iops;
305 inode->i_fop = &ocfs2_dops;
306 i_size_write(inode, le64_to_cpu(fe->i_size));
307 break;
308 case S_IFLNK:
309 if (ocfs2_inode_is_fast_symlink(inode))
310 inode->i_op = &ocfs2_fast_symlink_inode_operations;
311 else
312 inode->i_op = &ocfs2_symlink_inode_operations;
313 i_size_write(inode, le64_to_cpu(fe->i_size));
314 break;
315 default:
316 inode->i_op = &ocfs2_special_file_iops;
317 init_special_inode(inode, inode->i_mode,
318 inode->i_rdev);
319 break;
320 }
321
322 ocfs2_inode_lock_res_init(&OCFS2_I(inode)->ip_rw_lockres,
323 OCFS2_LOCK_TYPE_RW, inode);
324 ocfs2_inode_lock_res_init(&OCFS2_I(inode)->ip_meta_lockres,
325 OCFS2_LOCK_TYPE_META, inode);
326 ocfs2_inode_lock_res_init(&OCFS2_I(inode)->ip_data_lockres,
327 OCFS2_LOCK_TYPE_DATA, inode);
328
329 status = 0;
330bail:
331 mlog_exit(status);
332 return status;
333}
334
335static int ocfs2_read_locked_inode(struct inode *inode,
336 struct ocfs2_find_inode_args *args)
337{
338 struct super_block *sb;
339 struct ocfs2_super *osb;
340 struct ocfs2_dinode *fe;
341 struct buffer_head *bh = NULL;
342 int status;
343 int sysfile = 0;
344
345 mlog_entry("(0x%p, 0x%p)\n", inode, args);
346
347 status = -EINVAL;
348 if (inode == NULL || inode->i_sb == NULL) {
349 mlog(ML_ERROR, "bad inode\n");
350 goto bail;
351 }
352 sb = inode->i_sb;
353 osb = OCFS2_SB(sb);
354
355 if (!args) {
356 mlog(ML_ERROR, "bad inode args\n");
357 make_bad_inode(inode);
358 goto bail;
359 }
360
361 /* Read the FE off disk. This is safe because the kernel only
362 * does one read_inode2 for a new inode, and if it doesn't
363 * exist yet then nobody can be working on it! */
364 status = ocfs2_read_block(osb, args->fi_blkno, &bh, 0, NULL);
365 if (status < 0) {
366 mlog_errno(status);
367 make_bad_inode(inode);
368 goto bail;
369 }
370
371 fe = (struct ocfs2_dinode *) bh->b_data;
372 if (!OCFS2_IS_VALID_DINODE(fe)) {
373 mlog(ML_ERROR, "Invalid dinode #%"MLFu64": signature = %.*s\n",
374 fe->i_blkno, 7, fe->i_signature);
375 make_bad_inode(inode);
376 goto bail;
377 }
378
379 if (fe->i_flags & cpu_to_le32(OCFS2_SYSTEM_FL))
380 sysfile = 1;
381
382 if (S_ISCHR(le16_to_cpu(fe->i_mode)) ||
383 S_ISBLK(le16_to_cpu(fe->i_mode)))
384 inode->i_rdev = huge_decode_dev(le64_to_cpu(fe->id1.dev1.i_rdev));
385
386 status = -EINVAL;
387 if (ocfs2_populate_inode(inode, fe, 0) < 0) {
388 mlog(ML_ERROR, "populate inode failed! i_blkno=%"MLFu64", "
389 "i_ino=%lu\n", fe->i_blkno, inode->i_ino);
390 make_bad_inode(inode);
391 goto bail;
392 }
393
394 BUG_ON(args->fi_blkno != le64_to_cpu(fe->i_blkno));
395
396 if (sysfile)
397 OCFS2_I(inode)->ip_flags |= OCFS2_INODE_SYSTEM_FILE;
398
399 status = 0;
400
401bail:
402 if (args && bh)
403 brelse(bh);
404
405 mlog_exit(status);
406 return status;
407}
408
409void ocfs2_sync_blockdev(struct super_block *sb)
410{
411 sync_blockdev(sb->s_bdev);
412}
413
414static int ocfs2_truncate_for_delete(struct ocfs2_super *osb,
415 struct inode *inode,
416 struct buffer_head *fe_bh)
417{
418 int status = 0;
419 struct ocfs2_journal_handle *handle = NULL;
420 struct ocfs2_truncate_context *tc = NULL;
421 struct ocfs2_dinode *fe;
422
423 mlog_entry_void();
424
425 fe = (struct ocfs2_dinode *) fe_bh->b_data;
426
427 /* zero allocation, zero truncate :) */
428 if (!fe->i_clusters)
429 goto bail;
430
431 handle = ocfs2_start_trans(osb, handle, OCFS2_INODE_UPDATE_CREDITS);
432 if (IS_ERR(handle)) {
433 status = PTR_ERR(handle);
434 handle = NULL;
435 mlog_errno(status);
436 goto bail;
437 }
438
439 status = ocfs2_set_inode_size(handle, inode, fe_bh, 0ULL);
440 if (status < 0) {
441 mlog_errno(status);
442 goto bail;
443 }
444
445 ocfs2_commit_trans(handle);
446 handle = NULL;
447
448 status = ocfs2_prepare_truncate(osb, inode, fe_bh, &tc);
449 if (status < 0) {
450 mlog_errno(status);
451 goto bail;
452 }
453
454 status = ocfs2_commit_truncate(osb, inode, fe_bh, tc);
455 if (status < 0) {
456 mlog_errno(status);
457 goto bail;
458 }
459bail:
460 if (handle)
461 ocfs2_commit_trans(handle);
462
463 mlog_exit(status);
464 return status;
465}
466
467static int ocfs2_remove_inode(struct inode *inode,
468 struct buffer_head *di_bh,
469 struct inode *orphan_dir_inode,
470 struct buffer_head *orphan_dir_bh)
471{
472 int status;
473 struct inode *inode_alloc_inode = NULL;
474 struct buffer_head *inode_alloc_bh = NULL;
475 struct ocfs2_journal_handle *handle;
476 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
477 struct ocfs2_dinode *di = (struct ocfs2_dinode *) di_bh->b_data;
478
479 inode_alloc_inode =
480 ocfs2_get_system_file_inode(osb, INODE_ALLOC_SYSTEM_INODE,
481 le16_to_cpu(di->i_suballoc_slot));
482 if (!inode_alloc_inode) {
483 status = -EEXIST;
484 mlog_errno(status);
485 goto bail;
486 }
487
488 down(&inode_alloc_inode->i_sem);
489 status = ocfs2_meta_lock(inode_alloc_inode, NULL, &inode_alloc_bh, 1);
490 if (status < 0) {
491 up(&inode_alloc_inode->i_sem);
492
493 mlog_errno(status);
494 goto bail;
495 }
496
497 handle = ocfs2_start_trans(osb, NULL, OCFS2_DELETE_INODE_CREDITS);
498 if (IS_ERR(handle)) {
499 status = PTR_ERR(handle);
500 mlog_errno(status);
501 goto bail_unlock;
502 }
503
504 status = ocfs2_orphan_del(osb, handle, orphan_dir_inode, inode,
505 orphan_dir_bh);
506 if (status < 0) {
507 mlog_errno(status);
508 goto bail_commit;
509 }
510
511 /* set the inodes dtime */
512 status = ocfs2_journal_access(handle, inode, di_bh,
513 OCFS2_JOURNAL_ACCESS_WRITE);
514 if (status < 0) {
515 mlog_errno(status);
516 goto bail_commit;
517 }
518
519 di->i_dtime = cpu_to_le64(CURRENT_TIME.tv_sec);
520 le32_and_cpu(&di->i_flags, ~(OCFS2_VALID_FL | OCFS2_ORPHANED_FL));
521
522 status = ocfs2_journal_dirty(handle, di_bh);
523 if (status < 0) {
524 mlog_errno(status);
525 goto bail_commit;
526 }
527
528 ocfs2_remove_from_cache(inode, di_bh);
529
530 status = ocfs2_free_dinode(handle, inode_alloc_inode,
531 inode_alloc_bh, di);
532 if (status < 0)
533 mlog_errno(status);
534
535bail_commit:
536 ocfs2_commit_trans(handle);
537bail_unlock:
538 ocfs2_meta_unlock(inode_alloc_inode, 1);
539 up(&inode_alloc_inode->i_sem);
540 brelse(inode_alloc_bh);
541bail:
542 iput(inode_alloc_inode);
543
544 return status;
545}
546
547static int ocfs2_wipe_inode(struct inode *inode,
548 struct buffer_head *di_bh)
549{
550 int status, orphaned_slot;
551 struct inode *orphan_dir_inode = NULL;
552 struct buffer_head *orphan_dir_bh = NULL;
553 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
554
555 /* We've already voted on this so it should be readonly - no
556 * spinlock needed. */
557 orphaned_slot = OCFS2_I(inode)->ip_orphaned_slot;
558 orphan_dir_inode = ocfs2_get_system_file_inode(osb,
559 ORPHAN_DIR_SYSTEM_INODE,
560 orphaned_slot);
561 if (!orphan_dir_inode) {
562 status = -EEXIST;
563 mlog_errno(status);
564 goto bail;
565 }
566
567 /* Lock the orphan dir. The lock will be held for the entire
568 * delete_inode operation. We do this now to avoid races with
569 * recovery completion on other nodes. */
570 down(&orphan_dir_inode->i_sem);
571 status = ocfs2_meta_lock(orphan_dir_inode, NULL, &orphan_dir_bh, 1);
572 if (status < 0) {
573 up(&orphan_dir_inode->i_sem);
574
575 mlog_errno(status);
576 goto bail;
577 }
578
579 /* we do this while holding the orphan dir lock because we
580 * don't want recovery being run from another node to vote for
581 * an inode delete on us -- this will result in two nodes
582 * truncating the same file! */
583 status = ocfs2_truncate_for_delete(osb, inode, di_bh);
584 if (status < 0) {
585 mlog_errno(status);
586 goto bail_unlock_dir;
587 }
588
589 status = ocfs2_remove_inode(inode, di_bh, orphan_dir_inode,
590 orphan_dir_bh);
591 if (status < 0)
592 mlog_errno(status);
593
594bail_unlock_dir:
595 ocfs2_meta_unlock(orphan_dir_inode, 1);
596 up(&orphan_dir_inode->i_sem);
597 brelse(orphan_dir_bh);
598bail:
599 iput(orphan_dir_inode);
600
601 return status;
602}
603
604/* There is a series of simple checks that should be done before a
605 * vote is even considered. Encapsulate those in this function. */
606static int ocfs2_inode_is_valid_to_delete(struct inode *inode)
607{
608 int ret = 0;
609 struct ocfs2_inode_info *oi = OCFS2_I(inode);
610 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
611
612 /* We shouldn't be getting here for the root directory
613 * inode.. */
614 if (inode == osb->root_inode) {
615 mlog(ML_ERROR, "Skipping delete of root inode.\n");
616 goto bail;
617 }
618
619 /* If we're coming from process_vote we can't go into our own
620 * voting [hello, deadlock city!], so unforuntately we just
621 * have to skip deleting this guy. That's OK though because
622 * the node who's doing the actual deleting should handle it
623 * anyway. */
624 if (current == osb->vote_task) {
625 mlog(0, "Skipping delete of %lu because we're currently "
626 "in process_vote\n", inode->i_ino);
627 goto bail;
628 }
629
630 spin_lock(&oi->ip_lock);
631 /* OCFS2 *never* deletes system files. This should technically
632 * never get here as system file inodes should always have a
633 * positive link count. */
634 if (oi->ip_flags & OCFS2_INODE_SYSTEM_FILE) {
635 mlog(ML_ERROR, "Skipping delete of system file %"MLFu64".\n",
636 oi->ip_blkno);
637 goto bail_unlock;
638 }
639
640 /* If we have voted "yes" on the wipe of this inode for
641 * another node, it will be marked here so we can safely skip
642 * it. Recovery will cleanup any inodes we might inadvertantly
643 * skip here. */
644 if (oi->ip_flags & OCFS2_INODE_SKIP_DELETE) {
645 mlog(0, "Skipping delete of %lu because another node "
646 "has done this for us.\n", inode->i_ino);
647 goto bail_unlock;
648 }
649
650 ret = 1;
651bail_unlock:
652 spin_unlock(&oi->ip_lock);
653bail:
654 return ret;
655}
656
657/* Query the cluster to determine whether we should wipe an inode from
658 * disk or not.
659 *
660 * Requires the inode to have the cluster lock. */
661static int ocfs2_query_inode_wipe(struct inode *inode,
662 struct buffer_head *di_bh,
663 int *wipe)
664{
665 int status = 0;
666 struct ocfs2_inode_info *oi = OCFS2_I(inode);
667 struct ocfs2_dinode *di;
668
669 *wipe = 0;
670
671 /* While we were waiting for the cluster lock in
672 * ocfs2_delete_inode, another node might have asked to delete
673 * the inode. Recheck our flags to catch this. */
674 if (!ocfs2_inode_is_valid_to_delete(inode)) {
675 mlog(0, "Skipping delete of %"MLFu64" because flags changed\n",
676 oi->ip_blkno);
677 goto bail;
678 }
679
680 /* Now that we have an up to date inode, we can double check
681 * the link count. */
682 if (inode->i_nlink) {
683 mlog(0, "Skipping delete of %"MLFu64" because nlink = %u\n",
684 oi->ip_blkno, inode->i_nlink);
685 goto bail;
686 }
687
688 /* Do some basic inode verification... */
689 di = (struct ocfs2_dinode *) di_bh->b_data;
690 if (!(di->i_flags & cpu_to_le32(OCFS2_ORPHANED_FL))) {
691 /* for lack of a better error? */
692 status = -EEXIST;
693 mlog(ML_ERROR,
694 "Inode %"MLFu64" (on-disk %"MLFu64") not orphaned! "
695 "Disk flags 0x%x, inode flags 0x%x\n",
696 oi->ip_blkno, di->i_blkno, di->i_flags, oi->ip_flags);
697 goto bail;
698 }
699
700 /* has someone already deleted us?! baaad... */
701 if (di->i_dtime) {
702 status = -EEXIST;
703 mlog_errno(status);
704 goto bail;
705 }
706
707 status = ocfs2_request_delete_vote(inode);
708 /* -EBUSY means that other nodes are still using the
709 * inode. We're done here though, so avoid doing anything on
710 * disk and let them worry about deleting it. */
711 if (status == -EBUSY) {
712 status = 0;
713 mlog(0, "Skipping delete of %"MLFu64" because it is in use on"
714 "other nodes\n", oi->ip_blkno);
715 goto bail;
716 }
717 if (status < 0) {
718 mlog_errno(status);
719 goto bail;
720 }
721
722 spin_lock(&oi->ip_lock);
723 if (oi->ip_orphaned_slot == OCFS2_INVALID_SLOT) {
724 /* Nobody knew which slot this inode was orphaned
725 * into. This may happen during node death and
726 * recovery knows how to clean it up so we can safely
727 * ignore this inode for now on. */
728 mlog(0, "Nobody knew where inode %"MLFu64" was orphaned!\n",
729 oi->ip_blkno);
730 } else {
731 *wipe = 1;
732
733 mlog(0, "Inode %"MLFu64" is ok to wipe from orphan dir %d\n",
734 oi->ip_blkno, oi->ip_orphaned_slot);
735 }
736 spin_unlock(&oi->ip_lock);
737
738bail:
739 return status;
740}
741
742/* Support function for ocfs2_delete_inode. Will help us keep the
743 * inode data in a consistent state for clear_inode. Always truncates
744 * pages, optionally sync's them first. */
745static void ocfs2_cleanup_delete_inode(struct inode *inode,
746 int sync_data)
747{
748 mlog(0, "Cleanup inode %"MLFu64", sync = %d\n",
749 OCFS2_I(inode)->ip_blkno, sync_data);
750 if (sync_data)
751 write_inode_now(inode, 1);
752 truncate_inode_pages(&inode->i_data, 0);
753}
754
755void ocfs2_delete_inode(struct inode *inode)
756{
757 int wipe, status;
758 sigset_t blocked, oldset;
759 struct buffer_head *di_bh = NULL;
760
761 mlog_entry("(inode->i_ino = %lu)\n", inode->i_ino);
762
763 if (is_bad_inode(inode)) {
764 mlog(0, "Skipping delete of bad inode\n");
765 goto bail;
766 }
767
768 if (!ocfs2_inode_is_valid_to_delete(inode)) {
769 /* It's probably not necessary to truncate_inode_pages
770 * here but we do it for safety anyway (it will most
771 * likely be a no-op anyway) */
772 ocfs2_cleanup_delete_inode(inode, 0);
773 goto bail;
774 }
775
776 /* We want to block signals in delete_inode as the lock and
777 * messaging paths may return us -ERESTARTSYS. Which would
778 * cause us to exit early, resulting in inodes being orphaned
779 * forever. */
780 sigfillset(&blocked);
781 status = sigprocmask(SIG_BLOCK, &blocked, &oldset);
782 if (status < 0) {
783 mlog_errno(status);
784 ocfs2_cleanup_delete_inode(inode, 1);
785 goto bail;
786 }
787
788 /* Lock down the inode. This gives us an up to date view of
789 * it's metadata (for verification), and allows us to
790 * serialize delete_inode votes.
791 *
792 * Even though we might be doing a truncate, we don't take the
793 * allocation lock here as it won't be needed - nobody will
794 * have the file open.
795 */
796 status = ocfs2_meta_lock(inode, NULL, &di_bh, 1);
797 if (status < 0) {
798 if (status != -ENOENT)
799 mlog_errno(status);
800 ocfs2_cleanup_delete_inode(inode, 0);
801 goto bail_unblock;
802 }
803
804 /* Query the cluster. This will be the final decision made
805 * before we go ahead and wipe the inode. */
806 status = ocfs2_query_inode_wipe(inode, di_bh, &wipe);
807 if (!wipe || status < 0) {
808 /* Error and inode busy vote both mean we won't be
809 * removing the inode, so they take almost the same
810 * path. */
811 if (status < 0)
812 mlog_errno(status);
813
814 /* Someone in the cluster has voted to not wipe this
815 * inode, or it was never completely orphaned. Write
816 * out the pages and exit now. */
817 ocfs2_cleanup_delete_inode(inode, 1);
818 goto bail_unlock_inode;
819 }
820
821 ocfs2_cleanup_delete_inode(inode, 0);
822
823 status = ocfs2_wipe_inode(inode, di_bh);
824 if (status < 0) {
825 mlog_errno(status);
826 goto bail_unlock_inode;
827 }
828
829 /* Mark the inode as successfully deleted. This is important
830 * for ocfs2_clear_inode as it will check this flag and skip
831 * any checkpointing work */
832 OCFS2_I(inode)->ip_flags |= OCFS2_INODE_DELETED;
833
834bail_unlock_inode:
835 ocfs2_meta_unlock(inode, 1);
836 brelse(di_bh);
837bail_unblock:
838 status = sigprocmask(SIG_SETMASK, &oldset, NULL);
839 if (status < 0)
840 mlog_errno(status);
841bail:
842 clear_inode(inode);
843 mlog_exit_void();
844}
845
846void ocfs2_clear_inode(struct inode *inode)
847{
848 int status;
849 struct ocfs2_inode_info *oi = OCFS2_I(inode);
850
851 mlog_entry_void();
852
853 if (!inode)
854 goto bail;
855
856 mlog(0, "Clearing inode: %"MLFu64", nlink = %u\n",
857 OCFS2_I(inode)->ip_blkno, inode->i_nlink);
858
859 mlog_bug_on_msg(OCFS2_SB(inode->i_sb) == NULL,
860 "Inode=%lu\n", inode->i_ino);
861
862 /* Do these before all the other work so that we don't bounce
863 * the vote thread while waiting to destroy the locks. */
864 ocfs2_mark_lockres_freeing(&oi->ip_rw_lockres);
865 ocfs2_mark_lockres_freeing(&oi->ip_meta_lockres);
866 ocfs2_mark_lockres_freeing(&oi->ip_data_lockres);
867
868 /* We very well may get a clear_inode before all an inodes
869 * metadata has hit disk. Of course, we can't drop any cluster
870 * locks until the journal has finished with it. The only
871 * exception here are successfully wiped inodes - their
872 * metadata can now be considered to be part of the system
873 * inodes from which it came. */
874 if (!(OCFS2_I(inode)->ip_flags & OCFS2_INODE_DELETED))
875 ocfs2_checkpoint_inode(inode);
876
877 mlog_bug_on_msg(!list_empty(&oi->ip_io_markers),
878 "Clear inode of %"MLFu64", inode has io markers\n",
879 oi->ip_blkno);
880
881 ocfs2_extent_map_drop(inode, 0);
882 ocfs2_extent_map_init(inode);
883
884 status = ocfs2_drop_inode_locks(inode);
885 if (status < 0)
886 mlog_errno(status);
887
888 ocfs2_lock_res_free(&oi->ip_rw_lockres);
889 ocfs2_lock_res_free(&oi->ip_meta_lockres);
890 ocfs2_lock_res_free(&oi->ip_data_lockres);
891
892 ocfs2_metadata_cache_purge(inode);
893
894 mlog_bug_on_msg(oi->ip_metadata_cache.ci_num_cached,
895 "Clear inode of %"MLFu64", inode has %u cache items\n",
896 oi->ip_blkno, oi->ip_metadata_cache.ci_num_cached);
897
898 mlog_bug_on_msg(!(oi->ip_flags & OCFS2_INODE_CACHE_INLINE),
899 "Clear inode of %"MLFu64", inode has a bad flag\n",
900 oi->ip_blkno);
901
902 mlog_bug_on_msg(spin_is_locked(&oi->ip_lock),
903 "Clear inode of %"MLFu64", inode is locked\n",
904 oi->ip_blkno);
905
906 mlog_bug_on_msg(down_trylock(&oi->ip_io_sem),
907 "Clear inode of %"MLFu64", io_sem is locked\n",
908 oi->ip_blkno);
909 up(&oi->ip_io_sem);
910
911 /*
912 * down_trylock() returns 0, down_write_trylock() returns 1
913 * kernel 1, world 0
914 */
915 mlog_bug_on_msg(!down_write_trylock(&oi->ip_alloc_sem),
916 "Clear inode of %"MLFu64", alloc_sem is locked\n",
917 oi->ip_blkno);
918 up_write(&oi->ip_alloc_sem);
919
920 mlog_bug_on_msg(oi->ip_open_count,
921 "Clear inode of %"MLFu64" has open count %d\n",
922 oi->ip_blkno, oi->ip_open_count);
923 mlog_bug_on_msg(!list_empty(&oi->ip_handle_list),
924 "Clear inode of %"MLFu64" has non empty handle list\n",
925 oi->ip_blkno);
926 mlog_bug_on_msg(oi->ip_handle,
927 "Clear inode of %"MLFu64" has non empty handle pointer\n",
928 oi->ip_blkno);
929
930 /* Clear all other flags. */
931 oi->ip_flags = OCFS2_INODE_CACHE_INLINE;
932 oi->ip_created_trans = 0;
933 oi->ip_last_trans = 0;
934 oi->ip_dir_start_lookup = 0;
935 oi->ip_blkno = 0ULL;
936
937bail:
938 mlog_exit_void();
939}
940
941/* Called under inode_lock, with no more references on the
942 * struct inode, so it's safe here to check the flags field
943 * and to manipulate i_nlink without any other locks. */
944void ocfs2_drop_inode(struct inode *inode)
945{
946 struct ocfs2_inode_info *oi = OCFS2_I(inode);
947
948 mlog_entry_void();
949
950 mlog(0, "Drop inode %"MLFu64", nlink = %u, ip_flags = 0x%x\n",
951 oi->ip_blkno, inode->i_nlink, oi->ip_flags);
952
953 /* Testing ip_orphaned_slot here wouldn't work because we may
954 * not have gotten a delete_inode vote from any other nodes
955 * yet. */
956 if (oi->ip_flags & OCFS2_INODE_MAYBE_ORPHANED) {
957 mlog(0, "Inode was orphaned on another node, clearing nlink.\n");
958 inode->i_nlink = 0;
959 }
960
961 generic_drop_inode(inode);
962
963 mlog_exit_void();
964}
965
966/*
967 * TODO: this should probably be merged into ocfs2_get_block
968 *
969 * However, you now need to pay attention to the cont_prepare_write()
970 * stuff in ocfs2_get_block (that is, ocfs2_get_block pretty much
971 * expects never to extend).
972 */
973struct buffer_head *ocfs2_bread(struct inode *inode,
974 int block, int *err, int reada)
975{
976 struct buffer_head *bh = NULL;
977 int tmperr;
978 u64 p_blkno;
979 int readflags = OCFS2_BH_CACHED;
980
981#if 0
982 /* only turn this on if we know we can deal with read_block
983 * returning nothing */
984 if (reada)
985 readflags |= OCFS2_BH_READAHEAD;
986#endif
987
988 if (((u64)block << inode->i_sb->s_blocksize_bits) >=
989 i_size_read(inode)) {
990 BUG_ON(!reada);
991 return NULL;
992 }
993
994 tmperr = ocfs2_extent_map_get_blocks(inode, block, 1,
995 &p_blkno, NULL);
996 if (tmperr < 0) {
997 mlog_errno(tmperr);
998 goto fail;
999 }
1000
1001 tmperr = ocfs2_read_block(OCFS2_SB(inode->i_sb), p_blkno, &bh,
1002 readflags, inode);
1003 if (tmperr < 0)
1004 goto fail;
1005
1006 tmperr = 0;
1007
1008 *err = 0;
1009 return bh;
1010
1011fail:
1012 if (bh) {
1013 brelse(bh);
1014 bh = NULL;
1015 }
1016 *err = -EIO;
1017 return NULL;
1018}
1019
1020/*
1021 * This is called from our getattr.
1022 */
1023int ocfs2_inode_revalidate(struct dentry *dentry)
1024{
1025 struct inode *inode = dentry->d_inode;
1026 int status = 0;
1027
1028 mlog_entry("(inode = 0x%p, ino = %"MLFu64")\n", inode,
1029 inode ? OCFS2_I(inode)->ip_blkno : 0ULL);
1030
1031 if (!inode) {
1032 mlog(0, "eep, no inode!\n");
1033 status = -ENOENT;
1034 goto bail;
1035 }
1036
1037 spin_lock(&OCFS2_I(inode)->ip_lock);
1038 if (OCFS2_I(inode)->ip_flags & OCFS2_INODE_DELETED) {
1039 spin_unlock(&OCFS2_I(inode)->ip_lock);
1040 mlog(0, "inode deleted!\n");
1041 status = -ENOENT;
1042 goto bail;
1043 }
1044 spin_unlock(&OCFS2_I(inode)->ip_lock);
1045
1046 /* Let ocfs2_meta_lock do the work of updating our struct
1047 * inode for us. */
1048 status = ocfs2_meta_lock(inode, NULL, NULL, 0);
1049 if (status < 0) {
1050 if (status != -ENOENT)
1051 mlog_errno(status);
1052 goto bail;
1053 }
1054 ocfs2_meta_unlock(inode, 0);
1055bail:
1056 mlog_exit(status);
1057
1058 return status;
1059}
1060
1061/*
1062 * Updates a disk inode from a
1063 * struct inode.
1064 * Only takes ip_lock.
1065 */
1066int ocfs2_mark_inode_dirty(struct ocfs2_journal_handle *handle,
1067 struct inode *inode,
1068 struct buffer_head *bh)
1069{
1070 int status;
1071 struct ocfs2_dinode *fe = (struct ocfs2_dinode *) bh->b_data;
1072
1073 mlog_entry("(inode %"MLFu64")\n", OCFS2_I(inode)->ip_blkno);
1074
1075 status = ocfs2_journal_access(handle, inode, bh,
1076 OCFS2_JOURNAL_ACCESS_WRITE);
1077 if (status < 0) {
1078 mlog_errno(status);
1079 goto leave;
1080 }
1081
1082 spin_lock(&OCFS2_I(inode)->ip_lock);
1083 fe->i_clusters = cpu_to_le32(OCFS2_I(inode)->ip_clusters);
1084 spin_unlock(&OCFS2_I(inode)->ip_lock);
1085
1086 fe->i_size = cpu_to_le64(i_size_read(inode));
1087 fe->i_links_count = cpu_to_le16(inode->i_nlink);
1088 fe->i_uid = cpu_to_le32(inode->i_uid);
1089 fe->i_gid = cpu_to_le32(inode->i_gid);
1090 fe->i_mode = cpu_to_le16(inode->i_mode);
1091 fe->i_atime = cpu_to_le64(inode->i_atime.tv_sec);
1092 fe->i_atime_nsec = cpu_to_le32(inode->i_atime.tv_nsec);
1093 fe->i_ctime = cpu_to_le64(inode->i_ctime.tv_sec);
1094 fe->i_ctime_nsec = cpu_to_le32(inode->i_ctime.tv_nsec);
1095 fe->i_mtime = cpu_to_le64(inode->i_mtime.tv_sec);
1096 fe->i_mtime_nsec = cpu_to_le32(inode->i_mtime.tv_nsec);
1097
1098 status = ocfs2_journal_dirty(handle, bh);
1099 if (status < 0)
1100 mlog_errno(status);
1101
1102 status = 0;
1103leave:
1104
1105 mlog_exit(status);
1106 return status;
1107}
1108
1109/*
1110 *
1111 * Updates a struct inode from a disk inode.
1112 * does no i/o, only takes ip_lock.
1113 */
1114void ocfs2_refresh_inode(struct inode *inode,
1115 struct ocfs2_dinode *fe)
1116{
1117 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
1118
1119 spin_lock(&OCFS2_I(inode)->ip_lock);
1120
1121 OCFS2_I(inode)->ip_clusters = le32_to_cpu(fe->i_clusters);
1122 i_size_write(inode, le64_to_cpu(fe->i_size));
1123 inode->i_nlink = le16_to_cpu(fe->i_links_count);
1124 inode->i_uid = le32_to_cpu(fe->i_uid);
1125 inode->i_gid = le32_to_cpu(fe->i_gid);
1126 inode->i_mode = le16_to_cpu(fe->i_mode);
1127 inode->i_blksize = (u32) osb->s_clustersize;
1128 if (S_ISLNK(inode->i_mode) && le32_to_cpu(fe->i_clusters) == 0)
1129 inode->i_blocks = 0;
1130 else
1131 inode->i_blocks = ocfs2_align_bytes_to_sectors(i_size_read(inode));
1132 inode->i_atime.tv_sec = le64_to_cpu(fe->i_atime);
1133 inode->i_atime.tv_nsec = le32_to_cpu(fe->i_atime_nsec);
1134 inode->i_mtime.tv_sec = le64_to_cpu(fe->i_mtime);
1135 inode->i_mtime.tv_nsec = le32_to_cpu(fe->i_mtime_nsec);
1136 inode->i_ctime.tv_sec = le64_to_cpu(fe->i_ctime);
1137 inode->i_ctime.tv_nsec = le32_to_cpu(fe->i_ctime_nsec);
1138
1139 spin_unlock(&OCFS2_I(inode)->ip_lock);
1140}
diff --git a/fs/ocfs2/inode.h b/fs/ocfs2/inode.h
new file mode 100644
index 000000000000..9b0177433653
--- /dev/null
+++ b/fs/ocfs2/inode.h
@@ -0,0 +1,145 @@
1/* -*- mode: c; c-basic-offset: 8; -*-
2 * vim: noexpandtab sw=8 ts=8 sts=0:
3 *
4 * inode.h
5 *
6 * Function prototypes
7 *
8 * Copyright (C) 2002, 2004 Oracle. All rights reserved.
9 *
10 * This program is free software; you can redistribute it and/or
11 * modify it under the terms of the GNU General Public
12 * License as published by the Free Software Foundation; either
13 * version 2 of the License, or (at your option) any later version.
14 *
15 * This program is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
18 * General Public License for more details.
19 *
20 * You should have received a copy of the GNU General Public
21 * License along with this program; if not, write to the
22 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
23 * Boston, MA 02111-1307, USA.
24 */
25
26#ifndef OCFS2_INODE_H
27#define OCFS2_INODE_H
28
/*
 * OCFS2 Inode Private Data
 *
 * Per-inode state that OCFS2 keeps next to the VFS inode.  The VFS
 * inode is embedded as the last member (vfs_inode); OCFS2_I() applies
 * container_of() to it to get back to this structure.
 */
struct ocfs2_inode_info
{
	/* Printed as the inode's identifier in log messages.
	 * Presumably the disk block number of the inode - TODO
	 * confirm against ocfs2_populate_inode(). */
	u64 ip_blkno;

	struct ocfs2_lock_res ip_rw_lockres;
	struct ocfs2_lock_res ip_meta_lockres;
	struct ocfs2_lock_res ip_data_lockres;

	/* protects allocation changes on this inode. */
	struct rw_semaphore ip_alloc_sem;

	/* These fields are protected by ip_lock */
	spinlock_t ip_lock;
	u32 ip_open_count;
	/* In-memory copy of the dinode's i_clusters; see
	 * ocfs2_refresh_inode() and ocfs2_mark_inode_dirty(). */
	u32 ip_clusters;
	struct ocfs2_extent_map ip_map;
	struct list_head ip_io_markers;
	int ip_orphaned_slot;

	/* Held by ocfs2_journal_access() around the JBD
	 * journal_get_*_access() call. */
	struct semaphore ip_io_sem;

	/* Used by the journalling code to attach an inode to a
	 * handle. These are protected by ip_io_sem in order to lock
	 * out other I/O to the inode until we either commit or
	 * abort.
	 *
	 * NOTE(review): ocfs2_handle_add_inode() and
	 * ocfs2_handle_unlock_inodes() modify these while holding
	 * inode->i_sem, not ip_io_sem - confirm which lock is
	 * actually intended. */
	struct list_head ip_handle_list;
	struct ocfs2_journal_handle *ip_handle;

	/* OCFS2_INODE_* bits, see below.  ocfs2_inode_revalidate()
	 * reads this under ip_lock. */
	u32 ip_flags; /* see below */

	/* protected by recovery_lock. */
	struct inode *ip_next_orphan;

	u32 ip_dir_start_lookup;

	/* next two are protected by trans_inc_lock */
	/* which transaction were we created on? Zero if none. */
	unsigned long ip_created_trans;
	/* last transaction we were a part of. */
	unsigned long ip_last_trans;

	struct ocfs2_caching_info ip_metadata_cache;

	/* The embedded VFS inode; OCFS2_I() depends on this member. */
	struct inode vfs_inode;
};
75
/*
 * Flags for the ip_flags field
 *
 * Each flag is a distinct bit so that they can be OR-ed together.
 */
/* System file inodes */
#define OCFS2_INODE_SYSTEM_FILE 0x00000001
/* Set through SET_INODE_JOURNAL(), tested through INODE_JOURNAL() */
#define OCFS2_INODE_JOURNAL 0x00000002
#define OCFS2_INODE_BITMAP 0x00000004
/* This inode has been wiped from disk.  ocfs2_inode_revalidate()
 * returns -ENOENT for such an inode. */
#define OCFS2_INODE_DELETED 0x00000008
/* Another node is deleting, so our delete is a nop */
#define OCFS2_INODE_SKIP_DELETE 0x00000010
/* Has the inode been orphaned on another node?
 *
 * This hints to ocfs2_drop_inode that it should clear i_nlink before
 * continuing.
 *
 * We *only* set this on unlink vote from another node. If the inode
 * was locally orphaned, then we're sure of the state and don't need
 * to twiddle i_nlink later - it's either zero or not depending on
 * whether our unlink succeeded. Otherwise we got this from a node
 * whose intention was to orphan the inode; however, that node may
 * have crashed, failed etc, so we let ocfs2_drop_inode zero the value
 * and rely on ocfs2_delete_inode to sort things out under the proper
 * cluster locks.
 */
#define OCFS2_INODE_MAYBE_ORPHANED 0x00000020
/* Does someone have the file open O_DIRECT */
#define OCFS2_INODE_OPEN_DIRECT 0x00000040
/* Indicates that the metadata cache should be used as an array. */
#define OCFS2_INODE_CACHE_INLINE 0x00000080
106
107static inline struct ocfs2_inode_info *OCFS2_I(struct inode *inode)
108{
109 return container_of(inode, struct ocfs2_inode_info, vfs_inode);
110}
111
/* Test / set the OCFS2_INODE_JOURNAL bit in an inode's ip_flags.
 * Neither macro takes ip_lock itself. */
#define INODE_JOURNAL(i) (OCFS2_I(i)->ip_flags & OCFS2_INODE_JOURNAL)
#define SET_INODE_JOURNAL(i) (OCFS2_I(i)->ip_flags |= OCFS2_INODE_JOURNAL)

extern kmem_cache_t *ocfs2_inode_cache;

extern struct address_space_operations ocfs2_aops;

/* Reads one logical block of an inode; *err is 0 on success and -EIO
 * on failure. */
struct buffer_head *ocfs2_bread(struct inode *inode, int block,
				int *err, int reada);
void ocfs2_clear_inode(struct inode *inode);
void ocfs2_delete_inode(struct inode *inode);
/* Zeroes i_nlink for OCFS2_INODE_MAYBE_ORPHANED inodes, then calls
 * generic_drop_inode(). */
void ocfs2_drop_inode(struct inode *inode);
struct inode *ocfs2_iget(struct ocfs2_super *osb, u64 feoff);
struct inode *ocfs2_ilookup_for_vote(struct ocfs2_super *osb,
				     u64 blkno,
				     int delete_vote);
int ocfs2_inode_init_private(struct inode *inode);
/* Called from getattr; refreshes the inode by taking and dropping
 * the meta lock. */
int ocfs2_inode_revalidate(struct dentry *dentry);
int ocfs2_populate_inode(struct inode *inode, struct ocfs2_dinode *fe,
			 int create_ino);
void ocfs2_read_inode(struct inode *inode);
void ocfs2_read_inode2(struct inode *inode, void *opaque);
ssize_t ocfs2_rw_direct(int rw, struct file *filp, char *buf,
			size_t size, loff_t *offp);
void ocfs2_sync_blockdev(struct super_block *sb);
/* struct inode <- disk inode.  Does no i/o. */
void ocfs2_refresh_inode(struct inode *inode,
			 struct ocfs2_dinode *fe);
/* disk inode <- struct inode, journalled through @handle. */
int ocfs2_mark_inode_dirty(struct ocfs2_journal_handle *handle,
			   struct inode *inode,
			   struct buffer_head *bh);
int ocfs2_aio_read(struct file *file, struct kiocb *req, struct iocb *iocb);
int ocfs2_aio_write(struct file *file, struct kiocb *req, struct iocb *iocb);
144
145#endif /* OCFS2_INODE_H */
diff --git a/fs/ocfs2/journal.c b/fs/ocfs2/journal.c
new file mode 100644
index 000000000000..04428042e5e5
--- /dev/null
+++ b/fs/ocfs2/journal.c
@@ -0,0 +1,1652 @@
1/* -*- mode: c; c-basic-offset: 8; -*-
2 * vim: noexpandtab sw=8 ts=8 sts=0:
3 *
4 * journal.c
5 *
6 * Defines functions of journalling api
7 *
8 * Copyright (C) 2003, 2004 Oracle. All rights reserved.
9 *
10 * This program is free software; you can redistribute it and/or
11 * modify it under the terms of the GNU General Public
12 * License as published by the Free Software Foundation; either
13 * version 2 of the License, or (at your option) any later version.
14 *
15 * This program is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
18 * General Public License for more details.
19 *
20 * You should have received a copy of the GNU General Public
21 * License along with this program; if not, write to the
22 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
23 * Boston, MA 02111-1307, USA.
24 */
25
26#include <linux/fs.h>
27#include <linux/types.h>
28#include <linux/slab.h>
29#include <linux/highmem.h>
30#include <linux/kthread.h>
31
32#define MLOG_MASK_PREFIX ML_JOURNAL
33#include <cluster/masklog.h>
34
35#include "ocfs2.h"
36
37#include "alloc.h"
38#include "dlmglue.h"
39#include "extent_map.h"
40#include "heartbeat.h"
41#include "inode.h"
42#include "journal.h"
43#include "localalloc.h"
44#include "namei.h"
45#include "slot_map.h"
46#include "super.h"
47#include "vote.h"
48#include "sysfile.h"
49
50#include "buffer_head_io.h"
51
/* Global lock; inode.h documents it as protecting the
 * ip_created_trans and ip_last_trans fields of ocfs2_inode_info. */
spinlock_t trans_inc_lock = SPIN_LOCK_UNLOCKED;

/* Forward declarations of the file-local helpers. */
static int ocfs2_force_read_journal(struct inode *inode);
static int ocfs2_recover_node(struct ocfs2_super *osb,
			      int node_num);
static int __ocfs2_recovery_thread(void *arg);
static int ocfs2_commit_cache(struct ocfs2_super *osb);
static int ocfs2_wait_on_mount(struct ocfs2_super *osb);
static void ocfs2_handle_cleanup_locks(struct ocfs2_journal *journal,
				       struct ocfs2_journal_handle *handle);
static void ocfs2_commit_unstarted_handle(struct ocfs2_journal_handle *handle);
static int ocfs2_journal_toggle_dirty(struct ocfs2_super *osb,
				      int dirty);
static int ocfs2_trylock_journal(struct ocfs2_super *osb,
				 int slot_num);
static int ocfs2_recover_orphans(struct ocfs2_super *osb,
				 int slot);
static int ocfs2_commit_thread(void *arg);
70
71static int ocfs2_commit_cache(struct ocfs2_super *osb)
72{
73 int status = 0;
74 unsigned int flushed;
75 unsigned long old_id;
76 struct ocfs2_journal *journal = NULL;
77
78 mlog_entry_void();
79
80 journal = osb->journal;
81
82 /* Flush all pending commits and checkpoint the journal. */
83 down_write(&journal->j_trans_barrier);
84
85 if (atomic_read(&journal->j_num_trans) == 0) {
86 up_write(&journal->j_trans_barrier);
87 mlog(0, "No transactions for me to flush!\n");
88 goto finally;
89 }
90
91 journal_lock_updates(journal->j_journal);
92 status = journal_flush(journal->j_journal);
93 journal_unlock_updates(journal->j_journal);
94 if (status < 0) {
95 up_write(&journal->j_trans_barrier);
96 mlog_errno(status);
97 goto finally;
98 }
99
100 old_id = ocfs2_inc_trans_id(journal);
101
102 flushed = atomic_read(&journal->j_num_trans);
103 atomic_set(&journal->j_num_trans, 0);
104 up_write(&journal->j_trans_barrier);
105
106 mlog(0, "commit_thread: flushed transaction %lu (%u handles)\n",
107 journal->j_trans_id, flushed);
108
109 ocfs2_kick_vote_thread(osb);
110 wake_up(&journal->j_checkpointed);
111finally:
112 mlog_exit(status);
113 return status;
114}
115
116struct ocfs2_journal_handle *ocfs2_alloc_handle(struct ocfs2_super *osb)
117{
118 struct ocfs2_journal_handle *retval = NULL;
119
120 retval = kcalloc(1, sizeof(*retval), GFP_KERNEL);
121 if (!retval) {
122 mlog(ML_ERROR, "Failed to allocate memory for journal "
123 "handle!\n");
124 return NULL;
125 }
126
127 retval->max_buffs = 0;
128 retval->num_locks = 0;
129 retval->k_handle = NULL;
130
131 INIT_LIST_HEAD(&retval->locks);
132 INIT_LIST_HEAD(&retval->inode_list);
133 retval->journal = osb->journal;
134
135 return retval;
136}
137
138/* pass it NULL and it will allocate a new handle object for you. If
139 * you pass it a handle however, it may still return error, in which
140 * case it has free'd the passed handle for you. */
141struct ocfs2_journal_handle *ocfs2_start_trans(struct ocfs2_super *osb,
142 struct ocfs2_journal_handle *handle,
143 int max_buffs)
144{
145 int ret;
146 journal_t *journal = osb->journal->j_journal;
147
148 mlog_entry("(max_buffs = %d)\n", max_buffs);
149
150 if (!osb || !osb->journal->j_journal)
151 BUG();
152
153 if (ocfs2_is_hard_readonly(osb)) {
154 ret = -EROFS;
155 goto done_free;
156 }
157
158 BUG_ON(osb->journal->j_state == OCFS2_JOURNAL_FREE);
159 BUG_ON(max_buffs <= 0);
160
161 /* JBD might support this, but our journalling code doesn't yet. */
162 if (journal_current_handle()) {
163 mlog(ML_ERROR, "Recursive transaction attempted!\n");
164 BUG();
165 }
166
167 if (!handle)
168 handle = ocfs2_alloc_handle(osb);
169 if (!handle) {
170 ret = -ENOMEM;
171 mlog(ML_ERROR, "Failed to allocate memory for journal "
172 "handle!\n");
173 goto done_free;
174 }
175
176 handle->max_buffs = max_buffs;
177
178 down_read(&osb->journal->j_trans_barrier);
179
180 /* actually start the transaction now */
181 handle->k_handle = journal_start(journal, max_buffs);
182 if (IS_ERR(handle->k_handle)) {
183 up_read(&osb->journal->j_trans_barrier);
184
185 ret = PTR_ERR(handle->k_handle);
186 handle->k_handle = NULL;
187 mlog_errno(ret);
188
189 if (is_journal_aborted(journal)) {
190 ocfs2_abort(osb->sb, "Detected aborted journal");
191 ret = -EROFS;
192 }
193 goto done_free;
194 }
195
196 atomic_inc(&(osb->journal->j_num_trans));
197 handle->flags |= OCFS2_HANDLE_STARTED;
198
199 mlog_exit_ptr(handle);
200 return handle;
201
202done_free:
203 if (handle)
204 ocfs2_commit_unstarted_handle(handle); /* will kfree handle */
205
206 mlog_exit(ret);
207 return ERR_PTR(ret);
208}
209
210void ocfs2_handle_add_inode(struct ocfs2_journal_handle *handle,
211 struct inode *inode)
212{
213 BUG_ON(!handle);
214 BUG_ON(!inode);
215
216 atomic_inc(&inode->i_count);
217
218 /* we're obviously changing it... */
219 down(&inode->i_sem);
220
221 /* sanity check */
222 BUG_ON(OCFS2_I(inode)->ip_handle);
223 BUG_ON(!list_empty(&OCFS2_I(inode)->ip_handle_list));
224
225 OCFS2_I(inode)->ip_handle = handle;
226 list_del(&(OCFS2_I(inode)->ip_handle_list));
227 list_add_tail(&(OCFS2_I(inode)->ip_handle_list), &(handle->inode_list));
228}
229
230static void ocfs2_handle_unlock_inodes(struct ocfs2_journal_handle *handle)
231{
232 struct list_head *p, *n;
233 struct inode *inode;
234 struct ocfs2_inode_info *oi;
235
236 list_for_each_safe(p, n, &handle->inode_list) {
237 oi = list_entry(p, struct ocfs2_inode_info,
238 ip_handle_list);
239 inode = &oi->vfs_inode;
240
241 OCFS2_I(inode)->ip_handle = NULL;
242 list_del_init(&OCFS2_I(inode)->ip_handle_list);
243
244 up(&inode->i_sem);
245 iput(inode);
246 }
247}
248
249/* This is trivial so we do it out of the main commit
250 * paths. Beware, it can be called from start_trans too! */
251static void ocfs2_commit_unstarted_handle(struct ocfs2_journal_handle *handle)
252{
253 mlog_entry_void();
254
255 BUG_ON(handle->flags & OCFS2_HANDLE_STARTED);
256
257 ocfs2_handle_unlock_inodes(handle);
258 /* You are allowed to add journal locks before the transaction
259 * has started. */
260 ocfs2_handle_cleanup_locks(handle->journal, handle);
261
262 kfree(handle);
263
264 mlog_exit_void();
265}
266
267void ocfs2_commit_trans(struct ocfs2_journal_handle *handle)
268{
269 handle_t *jbd_handle;
270 int retval;
271 struct ocfs2_journal *journal = handle->journal;
272
273 mlog_entry_void();
274
275 BUG_ON(!handle);
276
277 if (!(handle->flags & OCFS2_HANDLE_STARTED)) {
278 ocfs2_commit_unstarted_handle(handle);
279 mlog_exit_void();
280 return;
281 }
282
283 /* release inode semaphores we took during this transaction */
284 ocfs2_handle_unlock_inodes(handle);
285
286 /* ocfs2_extend_trans may have had to call journal_restart
287 * which will always commit the transaction, but may return
288 * error for any number of reasons. If this is the case, we
289 * clear k_handle as it's not valid any more. */
290 if (handle->k_handle) {
291 jbd_handle = handle->k_handle;
292
293 if (handle->flags & OCFS2_HANDLE_SYNC)
294 jbd_handle->h_sync = 1;
295 else
296 jbd_handle->h_sync = 0;
297
298 /* actually stop the transaction. if we've set h_sync,
299 * it'll have been committed when we return */
300 retval = journal_stop(jbd_handle);
301 if (retval < 0) {
302 mlog_errno(retval);
303 mlog(ML_ERROR, "Could not commit transaction\n");
304 BUG();
305 }
306
307 handle->k_handle = NULL; /* it's been free'd in journal_stop */
308 }
309
310 ocfs2_handle_cleanup_locks(journal, handle);
311
312 up_read(&journal->j_trans_barrier);
313
314 kfree(handle);
315 mlog_exit_void();
316}
317
318/*
319 * 'nblocks' is what you want to add to the current
320 * transaction. extend_trans will either extend the current handle by
321 * nblocks, or commit it and start a new one with nblocks credits.
322 *
323 * WARNING: This will not release any semaphores or disk locks taken
324 * during the transaction, so make sure they were taken *before*
325 * start_trans or we'll have ordering deadlocks.
326 *
327 * WARNING2: Note that we do *not* drop j_trans_barrier here. This is
328 * good because transaction ids haven't yet been recorded on the
329 * cluster locks associated with this handle.
330 */
331int ocfs2_extend_trans(struct ocfs2_journal_handle *handle,
332 int nblocks)
333{
334 int status;
335
336 BUG_ON(!handle);
337 BUG_ON(!(handle->flags & OCFS2_HANDLE_STARTED));
338 BUG_ON(!nblocks);
339
340 mlog_entry_void();
341
342 mlog(0, "Trying to extend transaction by %d blocks\n", nblocks);
343
344 status = journal_extend(handle->k_handle, nblocks);
345 if (status < 0) {
346 mlog_errno(status);
347 goto bail;
348 }
349
350 if (status > 0) {
351 mlog(0, "journal_extend failed, trying journal_restart\n");
352 status = journal_restart(handle->k_handle, nblocks);
353 if (status < 0) {
354 handle->k_handle = NULL;
355 mlog_errno(status);
356 goto bail;
357 }
358 handle->max_buffs = nblocks;
359 } else
360 handle->max_buffs += nblocks;
361
362 status = 0;
363bail:
364
365 mlog_exit(status);
366 return status;
367}
368
369int ocfs2_journal_access(struct ocfs2_journal_handle *handle,
370 struct inode *inode,
371 struct buffer_head *bh,
372 int type)
373{
374 int status;
375
376 BUG_ON(!inode);
377 BUG_ON(!handle);
378 BUG_ON(!bh);
379 BUG_ON(!(handle->flags & OCFS2_HANDLE_STARTED));
380
381 mlog_entry("bh->b_blocknr=%llu, type=%d (\"%s\"), bh->b_size = %hu\n",
382 (unsigned long long)bh->b_blocknr, type,
383 (type == OCFS2_JOURNAL_ACCESS_CREATE) ?
384 "OCFS2_JOURNAL_ACCESS_CREATE" :
385 "OCFS2_JOURNAL_ACCESS_WRITE",
386 bh->b_size);
387
388 /* we can safely remove this assertion after testing. */
389 if (!buffer_uptodate(bh)) {
390 mlog(ML_ERROR, "giving me a buffer that's not uptodate!\n");
391 mlog(ML_ERROR, "b_blocknr=%llu\n",
392 (unsigned long long)bh->b_blocknr);
393 BUG();
394 }
395
396 /* Set the current transaction information on the inode so
397 * that the locking code knows whether it can drop it's locks
398 * on this inode or not. We're protected from the commit
399 * thread updating the current transaction id until
400 * ocfs2_commit_trans() because ocfs2_start_trans() took
401 * j_trans_barrier for us. */
402 ocfs2_set_inode_lock_trans(OCFS2_SB(inode->i_sb)->journal, inode);
403
404 down(&OCFS2_I(inode)->ip_io_sem);
405 switch (type) {
406 case OCFS2_JOURNAL_ACCESS_CREATE:
407 case OCFS2_JOURNAL_ACCESS_WRITE:
408 status = journal_get_write_access(handle->k_handle, bh);
409 break;
410
411 case OCFS2_JOURNAL_ACCESS_UNDO:
412 status = journal_get_undo_access(handle->k_handle, bh);
413 break;
414
415 default:
416 status = -EINVAL;
417 mlog(ML_ERROR, "Uknown access type!\n");
418 }
419 up(&OCFS2_I(inode)->ip_io_sem);
420
421 if (status < 0)
422 mlog(ML_ERROR, "Error %d getting %d access to buffer!\n",
423 status, type);
424
425 mlog_exit(status);
426 return status;
427}
428
429int ocfs2_journal_dirty(struct ocfs2_journal_handle *handle,
430 struct buffer_head *bh)
431{
432 int status;
433
434 BUG_ON(!(handle->flags & OCFS2_HANDLE_STARTED));
435
436 mlog_entry("(bh->b_blocknr=%llu)\n",
437 (unsigned long long)bh->b_blocknr);
438
439 status = journal_dirty_metadata(handle->k_handle, bh);
440 if (status < 0)
441 mlog(ML_ERROR, "Could not dirty metadata buffer. "
442 "(bh->b_blocknr=%llu)\n",
443 (unsigned long long)bh->b_blocknr);
444
445 mlog_exit(status);
446 return status;
447}
448
449int ocfs2_journal_dirty_data(handle_t *handle,
450 struct buffer_head *bh)
451{
452 int err = journal_dirty_data(handle, bh);
453 if (err)
454 mlog_errno(err);
455 /* TODO: When we can handle it, abort the handle and go RO on
456 * error here. */
457
458 return err;
459}
460
461/* We always assume you're adding a metadata lock at level 'ex' */
462int ocfs2_handle_add_lock(struct ocfs2_journal_handle *handle,
463 struct inode *inode)
464{
465 int status;
466 struct ocfs2_journal_lock *lock;
467
468 BUG_ON(!inode);
469
470 lock = kmem_cache_alloc(ocfs2_lock_cache, GFP_NOFS);
471 if (!lock) {
472 status = -ENOMEM;
473 mlog_errno(-ENOMEM);
474 goto bail;
475 }
476
477 if (!igrab(inode))
478 BUG();
479 lock->jl_inode = inode;
480
481 list_add_tail(&(lock->jl_lock_list), &(handle->locks));
482 handle->num_locks++;
483
484 status = 0;
485bail:
486 mlog_exit(status);
487 return status;
488}
489
490static void ocfs2_handle_cleanup_locks(struct ocfs2_journal *journal,
491 struct ocfs2_journal_handle *handle)
492{
493 struct list_head *p, *n;
494 struct ocfs2_journal_lock *lock;
495 struct inode *inode;
496
497 list_for_each_safe(p, n, &(handle->locks)) {
498 lock = list_entry(p, struct ocfs2_journal_lock,
499 jl_lock_list);
500 list_del(&lock->jl_lock_list);
501 handle->num_locks--;
502
503 inode = lock->jl_inode;
504 ocfs2_meta_unlock(inode, 1);
505 if (atomic_read(&inode->i_count) == 1)
506 mlog(ML_ERROR,
507 "Inode %"MLFu64", I'm doing a last iput for!",
508 OCFS2_I(inode)->ip_blkno);
509 iput(inode);
510 kmem_cache_free(ocfs2_lock_cache, lock);
511 }
512}
513
514#define OCFS2_DEFAULT_COMMIT_INTERVAL (HZ * 5)
515
516void ocfs2_set_journal_params(struct ocfs2_super *osb)
517{
518 journal_t *journal = osb->journal->j_journal;
519
520 spin_lock(&journal->j_state_lock);
521 journal->j_commit_interval = OCFS2_DEFAULT_COMMIT_INTERVAL;
522 if (osb->s_mount_opt & OCFS2_MOUNT_BARRIER)
523 journal->j_flags |= JFS_BARRIER;
524 else
525 journal->j_flags &= ~JFS_BARRIER;
526 spin_unlock(&journal->j_state_lock);
527}
528
529int ocfs2_journal_init(struct ocfs2_journal *journal, int *dirty)
530{
531 int status = -1;
532 struct inode *inode = NULL; /* the journal inode */
533 journal_t *j_journal = NULL;
534 struct ocfs2_dinode *di = NULL;
535 struct buffer_head *bh = NULL;
536 struct ocfs2_super *osb;
537 int meta_lock = 0;
538
539 mlog_entry_void();
540
541 BUG_ON(!journal);
542
543 osb = journal->j_osb;
544
545 /* already have the inode for our journal */
546 inode = ocfs2_get_system_file_inode(osb, JOURNAL_SYSTEM_INODE,
547 osb->slot_num);
548 if (inode == NULL) {
549 status = -EACCES;
550 mlog_errno(status);
551 goto done;
552 }
553 if (is_bad_inode(inode)) {
554 mlog(ML_ERROR, "access error (bad inode)\n");
555 iput(inode);
556 inode = NULL;
557 status = -EACCES;
558 goto done;
559 }
560
561 SET_INODE_JOURNAL(inode);
562 OCFS2_I(inode)->ip_open_count++;
563
564 status = ocfs2_meta_lock(inode, NULL, &bh, 1);
565 if (status < 0) {
566 if (status != -ERESTARTSYS)
567 mlog(ML_ERROR, "Could not get lock on journal!\n");
568 goto done;
569 }
570
571 meta_lock = 1;
572 di = (struct ocfs2_dinode *)bh->b_data;
573
574 if (inode->i_size < OCFS2_MIN_JOURNAL_SIZE) {
575 mlog(ML_ERROR, "Journal file size (%lld) is too small!\n",
576 inode->i_size);
577 status = -EINVAL;
578 goto done;
579 }
580
581 mlog(0, "inode->i_size = %lld\n", inode->i_size);
582 mlog(0, "inode->i_blocks = %lu\n", inode->i_blocks);
583 mlog(0, "inode->ip_clusters = %u\n", OCFS2_I(inode)->ip_clusters);
584
585 /* call the kernels journal init function now */
586 j_journal = journal_init_inode(inode);
587 if (j_journal == NULL) {
588 mlog(ML_ERROR, "Linux journal layer error\n");
589 status = -EINVAL;
590 goto done;
591 }
592
593 mlog(0, "Returned from journal_init_inode\n");
594 mlog(0, "j_journal->j_maxlen = %u\n", j_journal->j_maxlen);
595
596 *dirty = (le32_to_cpu(di->id1.journal1.ij_flags) &
597 OCFS2_JOURNAL_DIRTY_FL);
598
599 journal->j_journal = j_journal;
600 journal->j_inode = inode;
601 journal->j_bh = bh;
602
603 ocfs2_set_journal_params(osb);
604
605 journal->j_state = OCFS2_JOURNAL_LOADED;
606
607 status = 0;
608done:
609 if (status < 0) {
610 if (meta_lock)
611 ocfs2_meta_unlock(inode, 1);
612 if (bh != NULL)
613 brelse(bh);
614 if (inode) {
615 OCFS2_I(inode)->ip_open_count--;
616 iput(inode);
617 }
618 }
619
620 mlog_exit(status);
621 return status;
622}
623
624static int ocfs2_journal_toggle_dirty(struct ocfs2_super *osb,
625 int dirty)
626{
627 int status;
628 unsigned int flags;
629 struct ocfs2_journal *journal = osb->journal;
630 struct buffer_head *bh = journal->j_bh;
631 struct ocfs2_dinode *fe;
632
633 mlog_entry_void();
634
635 fe = (struct ocfs2_dinode *)bh->b_data;
636 if (!OCFS2_IS_VALID_DINODE(fe)) {
637 /* This is called from startup/shutdown which will
638 * handle the errors in a specific manner, so no need
639 * to call ocfs2_error() here. */
640 mlog(ML_ERROR, "Journal dinode %"MLFu64" has invalid "
641 "signature: %.*s", fe->i_blkno, 7, fe->i_signature);
642 status = -EIO;
643 goto out;
644 }
645
646 flags = le32_to_cpu(fe->id1.journal1.ij_flags);
647 if (dirty)
648 flags |= OCFS2_JOURNAL_DIRTY_FL;
649 else
650 flags &= ~OCFS2_JOURNAL_DIRTY_FL;
651 fe->id1.journal1.ij_flags = cpu_to_le32(flags);
652
653 status = ocfs2_write_block(osb, bh, journal->j_inode);
654 if (status < 0)
655 mlog_errno(status);
656
657out:
658 mlog_exit(status);
659 return status;
660}
661
/*
 * Shut the journal down: stop the commit thread, mark the on-disk
 * journal clean, destroy the JBD journal and release the journal
 * inode's meta lock and dinode buffer.  Does nothing beyond the final
 * iput unless the journal is in state OCFS2_JOURNAL_LOADED.
 *
 * If the journal has been kmalloc'd it needs to be freed after this
 * call.
 */
void ocfs2_journal_shutdown(struct ocfs2_super *osb)
{
	struct ocfs2_journal *journal = NULL;
	int status = 0;
	struct inode *inode = NULL;
	int num_running_trans = 0;

	mlog_entry_void();

	if (!osb)
		BUG();

	journal = osb->journal;
	if (!journal)
		goto done;

	inode = journal->j_inode;

	if (journal->j_state != OCFS2_JOURNAL_LOADED)
		goto done;

	/* need to inc inode use count as journal_destroy will iput. */
	if (!igrab(inode))
		BUG();

	num_running_trans = atomic_read(&(osb->journal->j_num_trans));
	if (num_running_trans > 0)
		mlog(0, "Shutting down journal: must wait on %d "
		     "running transactions!\n",
		     num_running_trans);

	/* Do a commit_cache here. It will flush our journal, *and*
	 * release any locks that are still held.
	 * set the SHUTDOWN flag and release the trans lock.
	 * the commit thread will take the trans lock for us below. */
	journal->j_state = OCFS2_JOURNAL_IN_SHUTDOWN;

	/* The OCFS2_JOURNAL_IN_SHUTDOWN will signal to commit_cache to not
	 * drop the trans_lock (which we want to hold until we
	 * completely destroy the journal. */
	/* NOTE(review): ocfs2_commit_cache() as written in this file
	 * never looks at j_state and always drops j_trans_barrier, so
	 * the two comments above look stale - confirm against
	 * ocfs2_commit_thread(). */
	if (osb->commit_task) {
		/* Wait for the commit thread */
		mlog(0, "Waiting for ocfs2commit to exit....\n");
		kthread_stop(osb->commit_task);
		osb->commit_task = NULL;
	}

	/* Nothing may still be outstanding once the thread is gone. */
	BUG_ON(atomic_read(&(osb->journal->j_num_trans)) != 0);

	/* Mark the journal clean on disk; a failure is only logged. */
	status = ocfs2_journal_toggle_dirty(osb, 0);
	if (status < 0)
		mlog_errno(status);

	/* Shutdown the kernel journal system */
	journal_destroy(journal->j_journal);

	/* Balances the increment done in ocfs2_journal_init(). */
	OCFS2_I(inode)->ip_open_count--;

	/* unlock our journal */
	ocfs2_meta_unlock(inode, 1);

	brelse(journal->j_bh);
	journal->j_bh = NULL;

	journal->j_state = OCFS2_JOURNAL_FREE;

//	up_write(&journal->j_trans_barrier);
done:
	/* Drops the reference held in journal->j_inode (the igrab
	 * above was consumed by journal_destroy, per the comment
	 * there). */
	if (inode)
		iput(inode);
	mlog_exit_void();
}
738
739static void ocfs2_clear_journal_error(struct super_block *sb,
740 journal_t *journal,
741 int slot)
742{
743 int olderr;
744
745 olderr = journal_errno(journal);
746 if (olderr) {
747 mlog(ML_ERROR, "File system error %d recorded in "
748 "journal %u.\n", olderr, slot);
749 mlog(ML_ERROR, "File system on device %s needs checking.\n",
750 sb->s_id);
751
752 journal_ack_err(journal);
753 journal_clear_err(journal);
754 }
755}
756
757int ocfs2_journal_load(struct ocfs2_journal *journal)
758{
759 int status = 0;
760 struct ocfs2_super *osb;
761
762 mlog_entry_void();
763
764 if (!journal)
765 BUG();
766
767 osb = journal->j_osb;
768
769 status = journal_load(journal->j_journal);
770 if (status < 0) {
771 mlog(ML_ERROR, "Failed to load journal!\n");
772 goto done;
773 }
774
775 ocfs2_clear_journal_error(osb->sb, journal->j_journal, osb->slot_num);
776
777 status = ocfs2_journal_toggle_dirty(osb, 1);
778 if (status < 0) {
779 mlog_errno(status);
780 goto done;
781 }
782
783 /* Launch the commit thread */
784 osb->commit_task = kthread_run(ocfs2_commit_thread, osb, "ocfs2cmt-%d",
785 osb->osb_id);
786 if (IS_ERR(osb->commit_task)) {
787 status = PTR_ERR(osb->commit_task);
788 osb->commit_task = NULL;
789 mlog(ML_ERROR, "unable to launch ocfs2commit thread, error=%d",
790 status);
791 goto done;
792 }
793
794done:
795 mlog_exit(status);
796 return status;
797}
798
799
800/* 'full' flag tells us whether we clear out all blocks or if we just
801 * mark the journal clean */
802int ocfs2_journal_wipe(struct ocfs2_journal *journal, int full)
803{
804 int status;
805
806 mlog_entry_void();
807
808 if (!journal)
809 BUG();
810
811 status = journal_wipe(journal->j_journal, full);
812 if (status < 0) {
813 mlog_errno(status);
814 goto bail;
815 }
816
817 status = ocfs2_journal_toggle_dirty(journal->j_osb, 0);
818 if (status < 0)
819 mlog_errno(status);
820
821bail:
822 mlog_exit(status);
823 return status;
824}
825
826/*
827 * JBD Might read a cached version of another nodes journal file. We
828 * don't want this as this file changes often and we get no
829 * notification on those changes. The only way to be sure that we've
830 * got the most up to date version of those blocks then is to force
831 * read them off disk. Just searching through the buffer cache won't
832 * work as there may be pages backing this file which are still marked
833 * up to date. We know things can't change on this file underneath us
834 * as we have the lock by now :)
835 */
836static int ocfs2_force_read_journal(struct inode *inode)
837{
838 int status = 0;
839 int i, p_blocks;
840 u64 v_blkno, p_blkno;
841#define CONCURRENT_JOURNAL_FILL 32
842 struct buffer_head *bhs[CONCURRENT_JOURNAL_FILL];
843
844 mlog_entry_void();
845
846 BUG_ON(inode->i_blocks !=
847 ocfs2_align_bytes_to_sectors(i_size_read(inode)));
848
849 memset(bhs, 0, sizeof(struct buffer_head *) * CONCURRENT_JOURNAL_FILL);
850
851 mlog(0, "Force reading %lu blocks\n",
852 (inode->i_blocks >> (inode->i_sb->s_blocksize_bits - 9)));
853
854 v_blkno = 0;
855 while (v_blkno <
856 (inode->i_blocks >> (inode->i_sb->s_blocksize_bits - 9))) {
857
858 status = ocfs2_extent_map_get_blocks(inode, v_blkno,
859 1, &p_blkno,
860 &p_blocks);
861 if (status < 0) {
862 mlog_errno(status);
863 goto bail;
864 }
865
866 if (p_blocks > CONCURRENT_JOURNAL_FILL)
867 p_blocks = CONCURRENT_JOURNAL_FILL;
868
869 status = ocfs2_read_blocks(OCFS2_SB(inode->i_sb),
870 p_blkno, p_blocks, bhs, 0,
871 inode);
872 if (status < 0) {
873 mlog_errno(status);
874 goto bail;
875 }
876
877 for(i = 0; i < p_blocks; i++) {
878 brelse(bhs[i]);
879 bhs[i] = NULL;
880 }
881
882 v_blkno += p_blocks;
883 }
884
885bail:
886 for(i = 0; i < CONCURRENT_JOURNAL_FILL; i++)
887 if (bhs[i])
888 brelse(bhs[i]);
889 mlog_exit(status);
890 return status;
891}
892
893struct ocfs2_la_recovery_item {
894 struct list_head lri_list;
895 int lri_slot;
896 struct ocfs2_dinode *lri_la_dinode;
897 struct ocfs2_dinode *lri_tl_dinode;
898};
899
900/* Does the second half of the recovery process. By this point, the
901 * node is marked clean and can actually be considered recovered,
902 * hence it's no longer in the recovery map, but there's still some
903 * cleanup we can do which shouldn't happen within the recovery thread
904 * as locking in that context becomes very difficult if we are to take
905 * recovering nodes into account.
906 *
907 * NOTE: This function can and will sleep on recovery of other nodes
908 * during cluster locking, just like any other ocfs2 process.
909 */
910void ocfs2_complete_recovery(void *data)
911{
912 int ret;
913 struct ocfs2_super *osb = data;
914 struct ocfs2_journal *journal = osb->journal;
915 struct ocfs2_dinode *la_dinode, *tl_dinode;
916 struct ocfs2_la_recovery_item *item;
917 struct list_head *p, *n;
918 LIST_HEAD(tmp_la_list);
919
920 mlog_entry_void();
921
922 mlog(0, "completing recovery from keventd\n");
923
924 spin_lock(&journal->j_lock);
925 list_splice_init(&journal->j_la_cleanups, &tmp_la_list);
926 spin_unlock(&journal->j_lock);
927
928 list_for_each_safe(p, n, &tmp_la_list) {
929 item = list_entry(p, struct ocfs2_la_recovery_item, lri_list);
930 list_del_init(&item->lri_list);
931
932 mlog(0, "Complete recovery for slot %d\n", item->lri_slot);
933
934 la_dinode = item->lri_la_dinode;
935 if (la_dinode) {
936 mlog(0, "Clean up local alloc %"MLFu64"\n",
937 la_dinode->i_blkno);
938
939 ret = ocfs2_complete_local_alloc_recovery(osb,
940 la_dinode);
941 if (ret < 0)
942 mlog_errno(ret);
943
944 kfree(la_dinode);
945 }
946
947 tl_dinode = item->lri_tl_dinode;
948 if (tl_dinode) {
949 mlog(0, "Clean up truncate log %"MLFu64"\n",
950 tl_dinode->i_blkno);
951
952 ret = ocfs2_complete_truncate_log_recovery(osb,
953 tl_dinode);
954 if (ret < 0)
955 mlog_errno(ret);
956
957 kfree(tl_dinode);
958 }
959
960 ret = ocfs2_recover_orphans(osb, item->lri_slot);
961 if (ret < 0)
962 mlog_errno(ret);
963
964 kfree(item);
965 }
966
967 mlog(0, "Recovery completion\n");
968 mlog_exit_void();
969}
970
971/* NOTE: This function always eats your references to la_dinode and
972 * tl_dinode, either manually on error, or by passing them to
973 * ocfs2_complete_recovery */
974static void ocfs2_queue_recovery_completion(struct ocfs2_journal *journal,
975 int slot_num,
976 struct ocfs2_dinode *la_dinode,
977 struct ocfs2_dinode *tl_dinode)
978{
979 struct ocfs2_la_recovery_item *item;
980
981 item = kmalloc(sizeof(struct ocfs2_la_recovery_item), GFP_KERNEL);
982 if (!item) {
983 /* Though we wish to avoid it, we are in fact safe in
984 * skipping local alloc cleanup as fsck.ocfs2 is more
985 * than capable of reclaiming unused space. */
986 if (la_dinode)
987 kfree(la_dinode);
988
989 if (tl_dinode)
990 kfree(tl_dinode);
991
992 mlog_errno(-ENOMEM);
993 return;
994 }
995
996 INIT_LIST_HEAD(&item->lri_list);
997 item->lri_la_dinode = la_dinode;
998 item->lri_slot = slot_num;
999 item->lri_tl_dinode = tl_dinode;
1000
1001 spin_lock(&journal->j_lock);
1002 list_add_tail(&item->lri_list, &journal->j_la_cleanups);
1003 queue_work(ocfs2_wq, &journal->j_recovery_work);
1004 spin_unlock(&journal->j_lock);
1005}
1006
1007/* Called by the mount code to queue recovery the last part of
1008 * recovery for it's own slot. */
1009void ocfs2_complete_mount_recovery(struct ocfs2_super *osb)
1010{
1011 struct ocfs2_journal *journal = osb->journal;
1012
1013 if (osb->dirty) {
1014 /* No need to queue up our truncate_log as regular
1015 * cleanup will catch that. */
1016 ocfs2_queue_recovery_completion(journal,
1017 osb->slot_num,
1018 osb->local_alloc_copy,
1019 NULL);
1020 ocfs2_schedule_truncate_log_flush(osb, 0);
1021
1022 osb->local_alloc_copy = NULL;
1023 osb->dirty = 0;
1024 }
1025}
1026
1027static int __ocfs2_recovery_thread(void *arg)
1028{
1029 int status, node_num;
1030 struct ocfs2_super *osb = arg;
1031
1032 mlog_entry_void();
1033
1034 status = ocfs2_wait_on_mount(osb);
1035 if (status < 0) {
1036 goto bail;
1037 }
1038
1039restart:
1040 status = ocfs2_super_lock(osb, 1);
1041 if (status < 0) {
1042 mlog_errno(status);
1043 goto bail;
1044 }
1045
1046 while(!ocfs2_node_map_is_empty(osb, &osb->recovery_map)) {
1047 node_num = ocfs2_node_map_first_set_bit(osb,
1048 &osb->recovery_map);
1049 if (node_num == O2NM_INVALID_NODE_NUM) {
1050 mlog(0, "Out of nodes to recover.\n");
1051 break;
1052 }
1053
1054 status = ocfs2_recover_node(osb, node_num);
1055 if (status < 0) {
1056 mlog(ML_ERROR,
1057 "Error %d recovering node %d on device (%u,%u)!\n",
1058 status, node_num,
1059 MAJOR(osb->sb->s_dev), MINOR(osb->sb->s_dev));
1060 mlog(ML_ERROR, "Volume requires unmount.\n");
1061 continue;
1062 }
1063
1064 ocfs2_recovery_map_clear(osb, node_num);
1065 }
1066 ocfs2_super_unlock(osb, 1);
1067
1068 /* We always run recovery on our own orphan dir - the dead
1069 * node(s) may have voted "no" on an inode delete earlier. A
1070 * revote is therefore required. */
1071 ocfs2_queue_recovery_completion(osb->journal, osb->slot_num, NULL,
1072 NULL);
1073
1074bail:
1075 down(&osb->recovery_lock);
1076 if (!status &&
1077 !ocfs2_node_map_is_empty(osb, &osb->recovery_map)) {
1078 up(&osb->recovery_lock);
1079 goto restart;
1080 }
1081
1082 osb->recovery_thread_task = NULL;
1083 mb(); /* sync with ocfs2_recovery_thread_running */
1084 wake_up(&osb->recovery_event);
1085
1086 up(&osb->recovery_lock);
1087
1088 mlog_exit(status);
1089 /* no one is callint kthread_stop() for us so the kthread() api
1090 * requires that we call do_exit(). And it isn't exported, but
1091 * complete_and_exit() seems to be a minimal wrapper around it. */
1092 complete_and_exit(NULL, status);
1093 return status;
1094}
1095
1096void ocfs2_recovery_thread(struct ocfs2_super *osb, int node_num)
1097{
1098 mlog_entry("(node_num=%d, osb->node_num = %d)\n",
1099 node_num, osb->node_num);
1100
1101 down(&osb->recovery_lock);
1102 if (osb->disable_recovery)
1103 goto out;
1104
1105 /* People waiting on recovery will wait on
1106 * the recovery map to empty. */
1107 if (!ocfs2_recovery_map_set(osb, node_num))
1108 mlog(0, "node %d already be in recovery.\n", node_num);
1109
1110 mlog(0, "starting recovery thread...\n");
1111
1112 if (osb->recovery_thread_task)
1113 goto out;
1114
1115 osb->recovery_thread_task = kthread_run(__ocfs2_recovery_thread, osb,
1116 "ocfs2rec-%d", osb->osb_id);
1117 if (IS_ERR(osb->recovery_thread_task)) {
1118 mlog_errno((int)PTR_ERR(osb->recovery_thread_task));
1119 osb->recovery_thread_task = NULL;
1120 }
1121
1122out:
1123 up(&osb->recovery_lock);
1124 wake_up(&osb->recovery_event);
1125
1126 mlog_exit_void();
1127}
1128
1129/* Does the actual journal replay and marks the journal inode as
1130 * clean. Will only replay if the journal inode is marked dirty. */
1131static int ocfs2_replay_journal(struct ocfs2_super *osb,
1132 int node_num,
1133 int slot_num)
1134{
1135 int status;
1136 int got_lock = 0;
1137 unsigned int flags;
1138 struct inode *inode = NULL;
1139 struct ocfs2_dinode *fe;
1140 journal_t *journal = NULL;
1141 struct buffer_head *bh = NULL;
1142
1143 inode = ocfs2_get_system_file_inode(osb, JOURNAL_SYSTEM_INODE,
1144 slot_num);
1145 if (inode == NULL) {
1146 status = -EACCES;
1147 mlog_errno(status);
1148 goto done;
1149 }
1150 if (is_bad_inode(inode)) {
1151 status = -EACCES;
1152 iput(inode);
1153 inode = NULL;
1154 mlog_errno(status);
1155 goto done;
1156 }
1157 SET_INODE_JOURNAL(inode);
1158
1159 status = ocfs2_meta_lock_full(inode, NULL, &bh, 1,
1160 OCFS2_META_LOCK_RECOVERY);
1161 if (status < 0) {
1162 mlog(0, "status returned from ocfs2_meta_lock=%d\n", status);
1163 if (status != -ERESTARTSYS)
1164 mlog(ML_ERROR, "Could not lock journal!\n");
1165 goto done;
1166 }
1167 got_lock = 1;
1168
1169 fe = (struct ocfs2_dinode *) bh->b_data;
1170
1171 flags = le32_to_cpu(fe->id1.journal1.ij_flags);
1172
1173 if (!(flags & OCFS2_JOURNAL_DIRTY_FL)) {
1174 mlog(0, "No recovery required for node %d\n", node_num);
1175 goto done;
1176 }
1177
1178 mlog(ML_NOTICE, "Recovering node %d from slot %d on device (%u,%u)\n",
1179 node_num, slot_num,
1180 MAJOR(osb->sb->s_dev), MINOR(osb->sb->s_dev));
1181
1182 OCFS2_I(inode)->ip_clusters = le32_to_cpu(fe->i_clusters);
1183
1184 status = ocfs2_force_read_journal(inode);
1185 if (status < 0) {
1186 mlog_errno(status);
1187 goto done;
1188 }
1189
1190 mlog(0, "calling journal_init_inode\n");
1191 journal = journal_init_inode(inode);
1192 if (journal == NULL) {
1193 mlog(ML_ERROR, "Linux journal layer error\n");
1194 status = -EIO;
1195 goto done;
1196 }
1197
1198 status = journal_load(journal);
1199 if (status < 0) {
1200 mlog_errno(status);
1201 if (!igrab(inode))
1202 BUG();
1203 journal_destroy(journal);
1204 goto done;
1205 }
1206
1207 ocfs2_clear_journal_error(osb->sb, journal, slot_num);
1208
1209 /* wipe the journal */
1210 mlog(0, "flushing the journal.\n");
1211 journal_lock_updates(journal);
1212 status = journal_flush(journal);
1213 journal_unlock_updates(journal);
1214 if (status < 0)
1215 mlog_errno(status);
1216
1217 /* This will mark the node clean */
1218 flags = le32_to_cpu(fe->id1.journal1.ij_flags);
1219 flags &= ~OCFS2_JOURNAL_DIRTY_FL;
1220 fe->id1.journal1.ij_flags = cpu_to_le32(flags);
1221
1222 status = ocfs2_write_block(osb, bh, inode);
1223 if (status < 0)
1224 mlog_errno(status);
1225
1226 if (!igrab(inode))
1227 BUG();
1228
1229 journal_destroy(journal);
1230
1231done:
1232 /* drop the lock on this nodes journal */
1233 if (got_lock)
1234 ocfs2_meta_unlock(inode, 1);
1235
1236 if (inode)
1237 iput(inode);
1238
1239 if (bh)
1240 brelse(bh);
1241
1242 mlog_exit(status);
1243 return status;
1244}
1245
1246/*
1247 * Do the most important parts of node recovery:
1248 * - Replay it's journal
1249 * - Stamp a clean local allocator file
1250 * - Stamp a clean truncate log
1251 * - Mark the node clean
1252 *
1253 * If this function completes without error, a node in OCFS2 can be
1254 * said to have been safely recovered. As a result, failure during the
1255 * second part of a nodes recovery process (local alloc recovery) is
1256 * far less concerning.
1257 */
1258static int ocfs2_recover_node(struct ocfs2_super *osb,
1259 int node_num)
1260{
1261 int status = 0;
1262 int slot_num;
1263 struct ocfs2_slot_info *si = osb->slot_info;
1264 struct ocfs2_dinode *la_copy = NULL;
1265 struct ocfs2_dinode *tl_copy = NULL;
1266
1267 mlog_entry("(node_num=%d, osb->node_num = %d)\n",
1268 node_num, osb->node_num);
1269
1270 mlog(0, "checking node %d\n", node_num);
1271
1272 /* Should not ever be called to recover ourselves -- in that
1273 * case we should've called ocfs2_journal_load instead. */
1274 if (osb->node_num == node_num)
1275 BUG();
1276
1277 slot_num = ocfs2_node_num_to_slot(si, node_num);
1278 if (slot_num == OCFS2_INVALID_SLOT) {
1279 status = 0;
1280 mlog(0, "no slot for this node, so no recovery required.\n");
1281 goto done;
1282 }
1283
1284 mlog(0, "node %d was using slot %d\n", node_num, slot_num);
1285
1286 status = ocfs2_replay_journal(osb, node_num, slot_num);
1287 if (status < 0) {
1288 mlog_errno(status);
1289 goto done;
1290 }
1291
1292 /* Stamp a clean local alloc file AFTER recovering the journal... */
1293 status = ocfs2_begin_local_alloc_recovery(osb, slot_num, &la_copy);
1294 if (status < 0) {
1295 mlog_errno(status);
1296 goto done;
1297 }
1298
1299 /* An error from begin_truncate_log_recovery is not
1300 * serious enough to warrant halting the rest of
1301 * recovery. */
1302 status = ocfs2_begin_truncate_log_recovery(osb, slot_num, &tl_copy);
1303 if (status < 0)
1304 mlog_errno(status);
1305
1306 /* Likewise, this would be a strange but ultimately not so
1307 * harmful place to get an error... */
1308 ocfs2_clear_slot(si, slot_num);
1309 status = ocfs2_update_disk_slots(osb, si);
1310 if (status < 0)
1311 mlog_errno(status);
1312
1313 /* This will kfree the memory pointed to by la_copy and tl_copy */
1314 ocfs2_queue_recovery_completion(osb->journal, slot_num, la_copy,
1315 tl_copy);
1316
1317 status = 0;
1318done:
1319
1320 mlog_exit(status);
1321 return status;
1322}
1323
1324/* Test node liveness by trylocking his journal. If we get the lock,
1325 * we drop it here. Return 0 if we got the lock, -EAGAIN if node is
1326 * still alive (we couldn't get the lock) and < 0 on error. */
1327static int ocfs2_trylock_journal(struct ocfs2_super *osb,
1328 int slot_num)
1329{
1330 int status, flags;
1331 struct inode *inode = NULL;
1332
1333 inode = ocfs2_get_system_file_inode(osb, JOURNAL_SYSTEM_INODE,
1334 slot_num);
1335 if (inode == NULL) {
1336 mlog(ML_ERROR, "access error\n");
1337 status = -EACCES;
1338 goto bail;
1339 }
1340 if (is_bad_inode(inode)) {
1341 mlog(ML_ERROR, "access error (bad inode)\n");
1342 iput(inode);
1343 inode = NULL;
1344 status = -EACCES;
1345 goto bail;
1346 }
1347 SET_INODE_JOURNAL(inode);
1348
1349 flags = OCFS2_META_LOCK_RECOVERY | OCFS2_META_LOCK_NOQUEUE;
1350 status = ocfs2_meta_lock_full(inode, NULL, NULL, 1, flags);
1351 if (status < 0) {
1352 if (status != -EAGAIN)
1353 mlog_errno(status);
1354 goto bail;
1355 }
1356
1357 ocfs2_meta_unlock(inode, 1);
1358bail:
1359 if (inode)
1360 iput(inode);
1361
1362 return status;
1363}
1364
1365/* Call this underneath ocfs2_super_lock. It also assumes that the
1366 * slot info struct has been updated from disk. */
1367int ocfs2_mark_dead_nodes(struct ocfs2_super *osb)
1368{
1369 int status, i, node_num;
1370 struct ocfs2_slot_info *si = osb->slot_info;
1371
1372 /* This is called with the super block cluster lock, so we
1373 * know that the slot map can't change underneath us. */
1374
1375 spin_lock(&si->si_lock);
1376 for(i = 0; i < si->si_num_slots; i++) {
1377 if (i == osb->slot_num)
1378 continue;
1379 if (ocfs2_is_empty_slot(si, i))
1380 continue;
1381
1382 node_num = si->si_global_node_nums[i];
1383 if (ocfs2_node_map_test_bit(osb, &osb->recovery_map, node_num))
1384 continue;
1385 spin_unlock(&si->si_lock);
1386
1387 /* Ok, we have a slot occupied by another node which
1388 * is not in the recovery map. We trylock his journal
1389 * file here to test if he's alive. */
1390 status = ocfs2_trylock_journal(osb, i);
1391 if (!status) {
1392 /* Since we're called from mount, we know that
1393 * the recovery thread can't race us on
1394 * setting / checking the recovery bits. */
1395 ocfs2_recovery_thread(osb, node_num);
1396 } else if ((status < 0) && (status != -EAGAIN)) {
1397 mlog_errno(status);
1398 goto bail;
1399 }
1400
1401 spin_lock(&si->si_lock);
1402 }
1403 spin_unlock(&si->si_lock);
1404
1405 status = 0;
1406bail:
1407 mlog_exit(status);
1408 return status;
1409}
1410
1411static int ocfs2_recover_orphans(struct ocfs2_super *osb,
1412 int slot)
1413{
1414 int status = 0;
1415 int have_disk_lock = 0;
1416 struct inode *inode = NULL;
1417 struct inode *iter;
1418 struct inode *orphan_dir_inode = NULL;
1419 unsigned long offset, blk, local;
1420 struct buffer_head *bh = NULL;
1421 struct ocfs2_dir_entry *de;
1422 struct super_block *sb = osb->sb;
1423 struct ocfs2_inode_info *oi;
1424
1425 mlog(0, "Recover inodes from orphan dir in slot %d\n", slot);
1426
1427 orphan_dir_inode = ocfs2_get_system_file_inode(osb,
1428 ORPHAN_DIR_SYSTEM_INODE,
1429 slot);
1430 if (!orphan_dir_inode) {
1431 status = -ENOENT;
1432 mlog_errno(status);
1433 goto out;
1434 }
1435
1436 down(&orphan_dir_inode->i_sem);
1437 status = ocfs2_meta_lock(orphan_dir_inode, NULL, NULL, 0);
1438 if (status < 0) {
1439 up(&orphan_dir_inode->i_sem);
1440 mlog_errno(status);
1441 goto out;
1442 }
1443 have_disk_lock = 1;
1444
1445 offset = 0;
1446 iter = NULL;
1447 while(offset < i_size_read(orphan_dir_inode)) {
1448 blk = offset >> sb->s_blocksize_bits;
1449
1450 bh = ocfs2_bread(orphan_dir_inode, blk, &status, 0);
1451 if (!bh)
1452 status = -EINVAL;
1453 if (status < 0) {
1454 up(&orphan_dir_inode->i_sem);
1455 if (bh)
1456 brelse(bh);
1457 mlog_errno(status);
1458 goto out;
1459 }
1460
1461 local = 0;
1462 while(offset < i_size_read(orphan_dir_inode)
1463 && local < sb->s_blocksize) {
1464 de = (struct ocfs2_dir_entry *) (bh->b_data + local);
1465
1466 if (!ocfs2_check_dir_entry(orphan_dir_inode,
1467 de, bh, local)) {
1468 up(&orphan_dir_inode->i_sem);
1469 status = -EINVAL;
1470 mlog_errno(status);
1471 brelse(bh);
1472 goto out;
1473 }
1474
1475 local += le16_to_cpu(de->rec_len);
1476 offset += le16_to_cpu(de->rec_len);
1477
1478 /* I guess we silently fail on no inode? */
1479 if (!le64_to_cpu(de->inode))
1480 continue;
1481 if (de->file_type > OCFS2_FT_MAX) {
1482 mlog(ML_ERROR,
1483 "block %llu contains invalid de: "
1484 "inode = %"MLFu64", rec_len = %u, "
1485 "name_len = %u, file_type = %u, "
1486 "name='%.*s'\n",
1487 (unsigned long long)bh->b_blocknr,
1488 le64_to_cpu(de->inode),
1489 le16_to_cpu(de->rec_len),
1490 de->name_len,
1491 de->file_type,
1492 de->name_len,
1493 de->name);
1494 continue;
1495 }
1496 if (de->name_len == 1 && !strncmp(".", de->name, 1))
1497 continue;
1498 if (de->name_len == 2 && !strncmp("..", de->name, 2))
1499 continue;
1500
1501 iter = ocfs2_iget(osb, le64_to_cpu(de->inode));
1502 if (IS_ERR(iter))
1503 continue;
1504
1505 mlog(0, "queue orphan %"MLFu64"\n",
1506 OCFS2_I(iter)->ip_blkno);
1507 OCFS2_I(iter)->ip_next_orphan = inode;
1508 inode = iter;
1509 }
1510 brelse(bh);
1511 }
1512 up(&orphan_dir_inode->i_sem);
1513
1514 ocfs2_meta_unlock(orphan_dir_inode, 0);
1515 have_disk_lock = 0;
1516
1517 iput(orphan_dir_inode);
1518 orphan_dir_inode = NULL;
1519
1520 while (inode) {
1521 oi = OCFS2_I(inode);
1522 mlog(0, "iput orphan %"MLFu64"\n", oi->ip_blkno);
1523
1524 iter = oi->ip_next_orphan;
1525
1526 spin_lock(&oi->ip_lock);
1527 /* Delete voting may have set these on the assumption
1528 * that the other node would wipe them successfully.
1529 * If they are still in the node's orphan dir, we need
1530 * to reset that state. */
1531 oi->ip_flags &= ~(OCFS2_INODE_DELETED|OCFS2_INODE_SKIP_DELETE);
1532
1533 /* Set the proper information to get us going into
1534 * ocfs2_delete_inode. */
1535 oi->ip_flags |= OCFS2_INODE_MAYBE_ORPHANED;
1536 oi->ip_orphaned_slot = slot;
1537 spin_unlock(&oi->ip_lock);
1538
1539 iput(inode);
1540
1541 inode = iter;
1542 }
1543
1544out:
1545 if (have_disk_lock)
1546 ocfs2_meta_unlock(orphan_dir_inode, 0);
1547
1548 if (orphan_dir_inode)
1549 iput(orphan_dir_inode);
1550
1551 return status;
1552}
1553
1554static int ocfs2_wait_on_mount(struct ocfs2_super *osb)
1555{
1556 /* This check is good because ocfs2 will wait on our recovery
1557 * thread before changing it to something other than MOUNTED
1558 * or DISABLED. */
1559 wait_event(osb->osb_mount_event,
1560 atomic_read(&osb->vol_state) == VOLUME_MOUNTED ||
1561 atomic_read(&osb->vol_state) == VOLUME_DISABLED);
1562
1563 /* If there's an error on mount, then we may never get to the
1564 * MOUNTED flag, but this is set right before
1565 * dismount_volume() so we can trust it. */
1566 if (atomic_read(&osb->vol_state) == VOLUME_DISABLED) {
1567 mlog(0, "mount error, exiting!\n");
1568 return -EBUSY;
1569 }
1570
1571 return 0;
1572}
1573
1574static int ocfs2_commit_thread(void *arg)
1575{
1576 int status;
1577 struct ocfs2_super *osb = arg;
1578 struct ocfs2_journal *journal = osb->journal;
1579
1580 /* we can trust j_num_trans here because _should_stop() is only set in
1581 * shutdown and nobody other than ourselves should be able to start
1582 * transactions. committing on shutdown might take a few iterations
1583 * as final transactions put deleted inodes on the list */
1584 while (!(kthread_should_stop() &&
1585 atomic_read(&journal->j_num_trans) == 0)) {
1586
1587 wait_event_interruptible_timeout(osb->checkpoint_event,
1588 atomic_read(&journal->j_num_trans)
1589 || kthread_should_stop(),
1590 OCFS2_CHECKPOINT_INTERVAL);
1591
1592 status = ocfs2_commit_cache(osb);
1593 if (status < 0)
1594 mlog_errno(status);
1595
1596 if (kthread_should_stop() && atomic_read(&journal->j_num_trans)){
1597 mlog(ML_KTHREAD,
1598 "commit_thread: %u transactions pending on "
1599 "shutdown\n",
1600 atomic_read(&journal->j_num_trans));
1601 }
1602 }
1603
1604 return 0;
1605}
1606
1607/* Look for a dirty journal without taking any cluster locks. Used for
1608 * hard readonly access to determine whether the file system journals
1609 * require recovery. */
1610int ocfs2_check_journals_nolocks(struct ocfs2_super *osb)
1611{
1612 int ret = 0;
1613 unsigned int slot;
1614 struct buffer_head *di_bh;
1615 struct ocfs2_dinode *di;
1616 struct inode *journal = NULL;
1617
1618 for(slot = 0; slot < osb->max_slots; slot++) {
1619 journal = ocfs2_get_system_file_inode(osb,
1620 JOURNAL_SYSTEM_INODE,
1621 slot);
1622 if (!journal || is_bad_inode(journal)) {
1623 ret = -EACCES;
1624 mlog_errno(ret);
1625 goto out;
1626 }
1627
1628 di_bh = NULL;
1629 ret = ocfs2_read_block(osb, OCFS2_I(journal)->ip_blkno, &di_bh,
1630 0, journal);
1631 if (ret < 0) {
1632 mlog_errno(ret);
1633 goto out;
1634 }
1635
1636 di = (struct ocfs2_dinode *) di_bh->b_data;
1637
1638 if (le32_to_cpu(di->id1.journal1.ij_flags) &
1639 OCFS2_JOURNAL_DIRTY_FL)
1640 ret = -EROFS;
1641
1642 brelse(di_bh);
1643 if (ret)
1644 break;
1645 }
1646
1647out:
1648 if (journal)
1649 iput(journal);
1650
1651 return ret;
1652}
diff --git a/fs/ocfs2/journal.h b/fs/ocfs2/journal.h
new file mode 100644
index 000000000000..7d0a816184fa
--- /dev/null
+++ b/fs/ocfs2/journal.h
@@ -0,0 +1,457 @@
1/* -*- mode: c; c-basic-offset: 8; -*-
2 * vim: noexpandtab sw=8 ts=8 sts=0:
3 *
4 * journal.h
5 *
6 * Defines journalling api and structures.
7 *
8 * Copyright (C) 2003, 2005 Oracle. All rights reserved.
9 *
10 * This program is free software; you can redistribute it and/or
11 * modify it under the terms of the GNU General Public
12 * License as published by the Free Software Foundation; either
13 * version 2 of the License, or (at your option) any later version.
14 *
15 * This program is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
18 * General Public License for more details.
19 *
20 * You should have received a copy of the GNU General Public
21 * License along with this program; if not, write to the
22 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
23 * Boston, MA 02111-1307, USA.
24 */
25
26#ifndef OCFS2_JOURNAL_H
27#define OCFS2_JOURNAL_H
28
29#include <linux/fs.h>
30#include <linux/jbd.h>
31
/* Timeout, in jiffies, the commit thread passes when it waits on
 * checkpoint_event. */
#define OCFS2_CHECKPOINT_INTERVAL	(8 * HZ)

/* Values stored in ocfs2_journal.j_state. */
enum ocfs2_journal_state {
	OCFS2_JOURNAL_FREE = 0,
	OCFS2_JOURNAL_LOADED,
	OCFS2_JOURNAL_IN_SHUTDOWN,
};
39
40struct ocfs2_super;
41struct ocfs2_dinode;
42struct ocfs2_journal_handle;
43
44struct ocfs2_journal {
45 enum ocfs2_journal_state j_state; /* Journals current state */
46
47 journal_t *j_journal; /* The kernels journal type */
48 struct inode *j_inode; /* Kernel inode pointing to
49 * this journal */
50 struct ocfs2_super *j_osb; /* pointer to the super
51 * block for the node
52 * we're currently
53 * running on -- not
54 * necessarily the super
55 * block from the node
56 * which we usually run
57 * from (recovery,
58 * etc) */
59 struct buffer_head *j_bh; /* Journal disk inode block */
60 atomic_t j_num_trans; /* Number of transactions
61 * currently in the system. */
62 unsigned long j_trans_id;
63 struct rw_semaphore j_trans_barrier;
64 wait_queue_head_t j_checkpointed;
65
66 spinlock_t j_lock;
67 struct list_head j_la_cleanups;
68 struct work_struct j_recovery_work;
69};
70
71extern spinlock_t trans_inc_lock;
72
73/* wrap j_trans_id so we never have it equal to zero. */
74static inline unsigned long ocfs2_inc_trans_id(struct ocfs2_journal *j)
75{
76 unsigned long old_id;
77 spin_lock(&trans_inc_lock);
78 old_id = j->j_trans_id++;
79 if (unlikely(!j->j_trans_id))
80 j->j_trans_id = 1;
81 spin_unlock(&trans_inc_lock);
82 return old_id;
83}
84
85static inline void ocfs2_set_inode_lock_trans(struct ocfs2_journal *journal,
86 struct inode *inode)
87{
88 spin_lock(&trans_inc_lock);
89 OCFS2_I(inode)->ip_last_trans = journal->j_trans_id;
90 spin_unlock(&trans_inc_lock);
91}
92
93/* Used to figure out whether it's safe to drop a metadata lock on an
94 * inode. Returns true if all the inodes changes have been
95 * checkpointed to disk. You should be holding the spinlock on the
96 * metadata lock while calling this to be sure that nobody can take
97 * the lock and put it on another transaction. */
98static inline int ocfs2_inode_fully_checkpointed(struct inode *inode)
99{
100 int ret;
101 struct ocfs2_journal *journal = OCFS2_SB(inode->i_sb)->journal;
102
103 spin_lock(&trans_inc_lock);
104 ret = time_after(journal->j_trans_id, OCFS2_I(inode)->ip_last_trans);
105 spin_unlock(&trans_inc_lock);
106 return ret;
107}
108
109/* convenience function to check if an inode is still new (has never
110 * hit disk) Will do you a favor and set created_trans = 0 when you've
111 * been checkpointed. returns '1' if the inode is still new. */
112static inline int ocfs2_inode_is_new(struct inode *inode)
113{
114 int ret;
115
116 /* System files are never "new" as they're written out by
117 * mkfs. This helps us early during mount, before we have the
118 * journal open and j_trans_id could be junk. */
119 if (OCFS2_I(inode)->ip_flags & OCFS2_INODE_SYSTEM_FILE)
120 return 0;
121 spin_lock(&trans_inc_lock);
122 ret = !(time_after(OCFS2_SB(inode->i_sb)->journal->j_trans_id,
123 OCFS2_I(inode)->ip_created_trans));
124 if (!ret)
125 OCFS2_I(inode)->ip_created_trans = 0;
126 spin_unlock(&trans_inc_lock);
127 return ret;
128}
129
130static inline void ocfs2_inode_set_new(struct ocfs2_super *osb,
131 struct inode *inode)
132{
133 spin_lock(&trans_inc_lock);
134 OCFS2_I(inode)->ip_created_trans = osb->journal->j_trans_id;
135 spin_unlock(&trans_inc_lock);
136}
137
138extern kmem_cache_t *ocfs2_lock_cache;
139
140struct ocfs2_journal_lock {
141 struct inode *jl_inode;
142 struct list_head jl_lock_list;
143};
144
145struct ocfs2_journal_handle {
146 handle_t *k_handle; /* kernel handle. */
147 struct ocfs2_journal *journal;
148 u32 flags; /* see flags below. */
149 int max_buffs; /* Buffs reserved by this handle */
150
151 /* The following two fields are for ocfs2_handle_add_lock */
152 int num_locks;
153 struct list_head locks; /* A bunch of locks to
154 * release on commit. This
155 * should be a list_head */
156
157 struct list_head inode_list;
158};
159
160#define OCFS2_HANDLE_STARTED 1
161/* should we sync-commit this handle? */
162#define OCFS2_HANDLE_SYNC 2
163static inline int ocfs2_handle_started(struct ocfs2_journal_handle *handle)
164{
165 return handle->flags & OCFS2_HANDLE_STARTED;
166}
167
168static inline void ocfs2_handle_set_sync(struct ocfs2_journal_handle *handle, int sync)
169{
170 if (sync)
171 handle->flags |= OCFS2_HANDLE_SYNC;
172 else
173 handle->flags &= ~OCFS2_HANDLE_SYNC;
174}
175
/* Exported only for the journal struct init code in super.c. Do not call. */
void ocfs2_complete_recovery(void *data);

/*
 *  Journal Control:
 *  Initialize, Load, Shutdown, Wipe a journal.
 *
 *  ocfs2_journal_init     - Initialize journal structures in the OSB.
 *  ocfs2_journal_load     - Load the given journal off disk. Replay it if
 *                           there are transactions still in there.
 *  ocfs2_journal_shutdown - Shutdown a journal, this will flush all
 *                           uncommitted, uncheckpointed transactions.
 *  ocfs2_journal_wipe     - Wipe transactions from a journal. Optionally
 *                           zero out each block.
 *  ocfs2_recovery_thread  - Perform recovery on a node. osb is our own osb.
 *  ocfs2_mark_dead_nodes  - Start recovery on nodes we won't get a heartbeat
 *                           event on.
 *  ocfs2_start_checkpoint - Kick the commit thread to do a checkpoint.
 */
void ocfs2_set_journal_params(struct ocfs2_super *osb);
int  ocfs2_journal_init(struct ocfs2_journal *journal, int *dirty);
void ocfs2_journal_shutdown(struct ocfs2_super *osb);
int  ocfs2_journal_wipe(struct ocfs2_journal *journal, int full);
int  ocfs2_journal_load(struct ocfs2_journal *journal);
int  ocfs2_check_journals_nolocks(struct ocfs2_super *osb);
void ocfs2_recovery_thread(struct ocfs2_super *osb, int node_num);
int  ocfs2_mark_dead_nodes(struct ocfs2_super *osb);
void ocfs2_complete_mount_recovery(struct ocfs2_super *osb);
207
208static inline void ocfs2_start_checkpoint(struct ocfs2_super *osb)
209{
210 atomic_set(&osb->needs_checkpoint, 1);
211 wake_up(&osb->checkpoint_event);
212}
213
214static inline void ocfs2_checkpoint_inode(struct inode *inode)
215{
216 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
217
218 if (!ocfs2_inode_fully_checkpointed(inode)) {
219 /* WARNING: This only kicks off a single
220 * checkpoint. If someone races you and adds more
221 * metadata to the journal, you won't know, and will
222 * wind up waiting *alot* longer than necessary. Right
223 * now we only use this in clear_inode so that's
224 * OK. */
225 ocfs2_start_checkpoint(osb);
226
227 wait_event(osb->journal->j_checkpointed,
228 ocfs2_inode_fully_checkpointed(inode));
229 }
230}
231
232/*
233 * Transaction Handling:
234 * Manage the lifetime of a transaction handle.
235 *
236 * ocfs2_alloc_handle - Only allocate a handle so we can start putting
237 * cluster locks on it. To actually change blocks,
238 * call ocfs2_start_trans with the handle returned
239 * from this function. You may call ocfs2_commit_trans
240 * at any time in the lifetime of a handle.
241 * ocfs2_start_trans - Begin a transaction. Give it an upper estimate of
242 * the number of blocks that will be changed during
243 * this handle.
244 * ocfs2_commit_trans - Complete a handle.
245 * ocfs2_extend_trans - Extend a handle by nblocks credits. This may
246 * commit the handle to disk in the process, but will
247 * not release any locks taken during the transaction.
248 * ocfs2_journal_access - Notify the handle that we want to journal this
249 * buffer. Will have to call ocfs2_journal_dirty once
250 * we've actually dirtied it. Type is one of the OCFS2_JOURNAL_ACCESS_* constants.
251 * ocfs2_journal_dirty - Mark a journalled buffer as having dirty data.
252 * ocfs2_journal_dirty_data - Indicate that a data buffer should go out before
253 * the current handle commits.
254 * ocfs2_handle_add_lock - Sometimes we need to delay lock release
255 * until after a transaction has been completed. Use
256 * ocfs2_handle_add_lock to indicate that a lock needs
257 * to be released at the end of that handle. Locks
258 * will be released in the order that they are added.
259 * ocfs2_handle_add_inode - Add a locked inode to a transaction.
260 */
261
262/* You must always start_trans with a number of buffs > 0, but it's
263 * perfectly legal to go through an entire transaction without having
264 * dirtied any buffers. */
265struct ocfs2_journal_handle *ocfs2_alloc_handle(struct ocfs2_super *osb);
266struct ocfs2_journal_handle *ocfs2_start_trans(struct ocfs2_super *osb,
267 struct ocfs2_journal_handle *handle,
268 int max_buffs);
269void ocfs2_commit_trans(struct ocfs2_journal_handle *handle);
270int ocfs2_extend_trans(struct ocfs2_journal_handle *handle,
271 int nblocks);
272
273/*
274 * Create access is for when we get a newly created buffer and we're
275 * not gonna read it off disk, but rather fill it ourselves. Right
276 * now, we don't do anything special with this (it turns into a write
277 * request), but this is a good placeholder in case we do...
278 *
279 * Write access is for when we read a block off disk and are going to
280 * modify it. This way the journalling layer knows it may need to make
281 * a copy of that block (if it's part of another, uncommitted
282 * transaction) before we do so.
283 */
284#define OCFS2_JOURNAL_ACCESS_CREATE 0
285#define OCFS2_JOURNAL_ACCESS_WRITE 1
286#define OCFS2_JOURNAL_ACCESS_UNDO 2
287
288int ocfs2_journal_access(struct ocfs2_journal_handle *handle,
289 struct inode *inode,
290 struct buffer_head *bh,
291 int type);
292/*
293 * A word about the journal_access/journal_dirty "dance". It is
294 * entirely legal to journal_access a buffer more than once (as long
295 * as the access type is the same -- I'm not sure what will happen if
296 * access type is different but this should never happen anyway) It is
297 * also legal to journal_dirty a buffer more than once. In fact, you
298 * can even journal_access a buffer after you've done a
299 * journal_access/journal_dirty pair. The only thing you cannot do
300 * however, is journal_dirty a buffer which you haven't yet passed to
301 * journal_access at least once.
302 *
303 * That said, 99% of the time this doesn't matter and this is what the
304 * path looks like:
305 *
306 * <read a bh>
307 * ocfs2_journal_access(handle, inode, bh, OCFS2_JOURNAL_ACCESS_WRITE);
308 * <modify the bh>
309 * ocfs2_journal_dirty(handle, bh);
310 */
311int ocfs2_journal_dirty(struct ocfs2_journal_handle *handle,
312 struct buffer_head *bh);
313int ocfs2_journal_dirty_data(handle_t *handle,
314 struct buffer_head *bh);
315int ocfs2_handle_add_lock(struct ocfs2_journal_handle *handle,
316 struct inode *inode);
317/*
318 * Use this to protect from other processes reading buffer state while
319 * it's in flight.
320 */
321void ocfs2_handle_add_inode(struct ocfs2_journal_handle *handle,
322 struct inode *inode);
323
324/*
325 * Credit Macros:
326 * Convenience macros to calculate number of credits needed.
327 *
328 * For convenience sake, I have a set of macros here which calculate
329 * the *maximum* number of sectors which will be changed for various
330 * metadata updates.
331 */
332
333/* simple file updates like chmod, etc. */
334#define OCFS2_INODE_UPDATE_CREDITS 1
335
336/* get one bit out of a suballocator: dinode + group descriptor +
337 * prev. group desc. if we relink. */
338#define OCFS2_SUBALLOC_ALLOC (3)
339
340/* dinode + group descriptor update. We don't relink on free yet. */
341#define OCFS2_SUBALLOC_FREE (2)
342
343#define OCFS2_TRUNCATE_LOG_UPDATE OCFS2_INODE_UPDATE_CREDITS
344#define OCFS2_TRUNCATE_LOG_FLUSH_ONE_REC (OCFS2_SUBALLOC_FREE \
345 + OCFS2_TRUNCATE_LOG_UPDATE)
346
347/* data block for new dir/symlink, 2 for bitmap updates (bitmap fe +
348 * bitmap block for the new bit) */
349#define OCFS2_DIR_LINK_ADDITIONAL_CREDITS (1 + 2)
350
351/* parent fe, parent block, new file entry, inode alloc fe, inode alloc
352 * group descriptor + mkdir/symlink blocks */
353#define OCFS2_MKNOD_CREDITS (3 + OCFS2_SUBALLOC_ALLOC \
354 + OCFS2_DIR_LINK_ADDITIONAL_CREDITS)
355
356/* local alloc metadata change + main bitmap updates */
357#define OCFS2_WINDOW_MOVE_CREDITS (OCFS2_INODE_UPDATE_CREDITS \
358 + OCFS2_SUBALLOC_ALLOC + OCFS2_SUBALLOC_FREE)
359
360/* used when we don't need an allocation change for a dir extend. One
361 * for the dinode, one for the new block. */
362#define OCFS2_SIMPLE_DIR_EXTEND_CREDITS (2)
363
364/* file update (nlink, etc) + dir entry block */
365#define OCFS2_LINK_CREDITS (OCFS2_INODE_UPDATE_CREDITS + 1)
366
367/* inode + dir inode (if we unlink a dir), + dir entry block + orphan
368 * dir inode link */
369#define OCFS2_UNLINK_CREDITS (2 * OCFS2_INODE_UPDATE_CREDITS + 1 \
370 + OCFS2_LINK_CREDITS)
371
372/* dinode + orphan dir dinode + inode alloc dinode + orphan dir entry +
373 * inode alloc group descriptor */
374#define OCFS2_DELETE_INODE_CREDITS (3 * OCFS2_INODE_UPDATE_CREDITS + 1 + 1)
375
376/* dinode update, old dir dinode update, new dir dinode update, old
377 * dir dir entry, new dir dir entry, dir entry update for renaming
378 * directory + target unlink */
379#define OCFS2_RENAME_CREDITS (3 * OCFS2_INODE_UPDATE_CREDITS + 3 \
380 + OCFS2_UNLINK_CREDITS)
381
382static inline int ocfs2_calc_extend_credits(struct super_block *sb,
383 struct ocfs2_dinode *fe,
384 u32 bits_wanted)
385{
386 int bitmap_blocks, sysfile_bitmap_blocks, dinode_blocks;
387
388 /* bitmap dinode, group desc. + relinked group. */
389 bitmap_blocks = OCFS2_SUBALLOC_ALLOC;
390
391 /* we might need to shift tree depth so lets assume an
392 * absolute worst case of complete fragmentation. Even with
393 * that, we only need one update for the dinode, and then
394 * however many metadata chunks needed * a remaining suballoc
395 * alloc. */
396 sysfile_bitmap_blocks = 1 +
397 (OCFS2_SUBALLOC_ALLOC - 1) * ocfs2_extend_meta_needed(fe);
398
399 /* this does not include *new* metadata blocks, which are
400 * accounted for in sysfile_bitmap_blocks. fe +
401 * prev. last_eb_blk + blocks along edge of tree.
402 * calc_symlink_credits passes because we just need 1
403 * credit for the dinode there. */
404 dinode_blocks = 1 + 1 + le16_to_cpu(fe->id2.i_list.l_tree_depth);
405
406 return bitmap_blocks + sysfile_bitmap_blocks + dinode_blocks;
407}
408
409static inline int ocfs2_calc_symlink_credits(struct super_block *sb)
410{
411 int blocks = OCFS2_MKNOD_CREDITS;
412
413 /* links can be longer than one block so we may update many
414 * within our single allocated extent. */
415 blocks += ocfs2_clusters_to_blocks(sb, 1);
416
417 return blocks;
418}
419
420static inline int ocfs2_calc_group_alloc_credits(struct super_block *sb,
421 unsigned int cpg)
422{
423 int blocks;
424 int bitmap_blocks = OCFS2_SUBALLOC_ALLOC + 1;
425 /* parent inode update + new block group header + bitmap inode update
426 + bitmap blocks affected */
427 blocks = 1 + 1 + 1 + bitmap_blocks;
428 return blocks;
429}
430
431static inline int ocfs2_calc_tree_trunc_credits(struct super_block *sb,
432 unsigned int clusters_to_del,
433 struct ocfs2_dinode *fe,
434 struct ocfs2_extent_list *last_el)
435{
436 /* for dinode + all headers in this pass + update to next leaf */
437 u16 next_free = le16_to_cpu(last_el->l_next_free_rec);
438 u16 tree_depth = le16_to_cpu(fe->id2.i_list.l_tree_depth);
439 int credits = 1 + tree_depth + 1;
440 int i;
441
442 i = next_free - 1;
443 BUG_ON(i < 0);
444
445 /* We may be deleting metadata blocks, so metadata alloc dinode +
446 one desc. block for each possible delete. */
447 if (tree_depth && next_free == 1 &&
448 le32_to_cpu(last_el->l_recs[i].e_clusters) == clusters_to_del)
449 credits += 1 + tree_depth;
450
451 /* update to the truncate log. */
452 credits += OCFS2_TRUNCATE_LOG_UPDATE;
453
454 return credits;
455}
456
457#endif /* OCFS2_JOURNAL_H */
diff --git a/fs/ocfs2/localalloc.c b/fs/ocfs2/localalloc.c
new file mode 100644
index 000000000000..fe373a2101d9
--- /dev/null
+++ b/fs/ocfs2/localalloc.c
@@ -0,0 +1,983 @@
1/* -*- mode: c; c-basic-offset: 8; -*-
2 * vim: noexpandtab sw=8 ts=8 sts=0:
3 *
4 * localalloc.c
5 *
6 * Node local data allocation
7 *
8 * Copyright (C) 2002, 2004 Oracle. All rights reserved.
9 *
10 * This program is free software; you can redistribute it and/or
11 * modify it under the terms of the GNU General Public
12 * License as published by the Free Software Foundation; either
13 * version 2 of the License, or (at your option) any later version.
14 *
15 * This program is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
18 * General Public License for more details.
19 *
20 * You should have received a copy of the GNU General Public
21 * License along with this program; if not, write to the
22 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
23 * Boston, MA 021110-1307, USA.
24 */
25
26#include <linux/fs.h>
27#include <linux/types.h>
28#include <linux/slab.h>
29#include <linux/highmem.h>
30#include <linux/bitops.h>
31
32#define MLOG_MASK_PREFIX ML_DISK_ALLOC
33#include <cluster/masklog.h>
34
35#include "ocfs2.h"
36
37#include "alloc.h"
38#include "dlmglue.h"
39#include "inode.h"
40#include "journal.h"
41#include "localalloc.h"
42#include "suballoc.h"
43#include "super.h"
44#include "sysfile.h"
45
46#include "buffer_head_io.h"
47
48#define OCFS2_LOCAL_ALLOC(dinode) (&((dinode)->id2.i_lab))
49
50static inline int ocfs2_local_alloc_window_bits(struct ocfs2_super *osb);
51
52static u32 ocfs2_local_alloc_count_bits(struct ocfs2_dinode *alloc);
53
54static int ocfs2_local_alloc_find_clear_bits(struct ocfs2_super *osb,
55 struct ocfs2_dinode *alloc,
56 u32 numbits);
57
58static void ocfs2_clear_local_alloc(struct ocfs2_dinode *alloc);
59
60static int ocfs2_sync_local_to_main(struct ocfs2_super *osb,
61 struct ocfs2_journal_handle *handle,
62 struct ocfs2_dinode *alloc,
63 struct inode *main_bm_inode,
64 struct buffer_head *main_bm_bh);
65
66static int ocfs2_local_alloc_reserve_for_window(struct ocfs2_super *osb,
67 struct ocfs2_journal_handle *handle,
68 struct ocfs2_alloc_context **ac,
69 struct inode **bitmap_inode,
70 struct buffer_head **bitmap_bh);
71
72static int ocfs2_local_alloc_new_window(struct ocfs2_super *osb,
73 struct ocfs2_journal_handle *handle,
74 struct ocfs2_alloc_context *ac);
75
76static int ocfs2_local_alloc_slide_window(struct ocfs2_super *osb,
77 struct inode *local_alloc_inode);
78
79/*
80 * Determine how large our local alloc window should be, in bits.
81 *
82 * These values (and the behavior in ocfs2_alloc_should_use_local) have
83 * been chosen so that most allocations, including new block groups go
84 * through local alloc.
85 */
86static inline int ocfs2_local_alloc_window_bits(struct ocfs2_super *osb)
87{
88 BUG_ON(osb->s_clustersize_bits < 12);
89
90 return 2048 >> (osb->s_clustersize_bits - 12);
91}
92
93/*
94 * Tell us whether a given allocation should use the local alloc
95 * file. Otherwise, it has to go to the main bitmap.
96 */
97int ocfs2_alloc_should_use_local(struct ocfs2_super *osb, u64 bits)
98{
99 int la_bits = ocfs2_local_alloc_window_bits(osb);
100
101 if (osb->local_alloc_state != OCFS2_LA_ENABLED)
102 return 0;
103
104 /* la_bits should be at least twice the size (in clusters) of
105 * a new block group. We want to be sure block group
106 * allocations go through the local alloc, so allow an
107 * allocation to take up to half the bitmap. */
108 if (bits > (la_bits / 2))
109 return 0;
110
111 return 1;
112}
113
114int ocfs2_load_local_alloc(struct ocfs2_super *osb)
115{
116 int status = 0;
117 struct ocfs2_dinode *alloc = NULL;
118 struct buffer_head *alloc_bh = NULL;
119 u32 num_used;
120 struct inode *inode = NULL;
121 struct ocfs2_local_alloc *la;
122
123 mlog_entry_void();
124
125 /* read the alloc off disk */
126 inode = ocfs2_get_system_file_inode(osb, LOCAL_ALLOC_SYSTEM_INODE,
127 osb->slot_num);
128 if (!inode) {
129 status = -EINVAL;
130 mlog_errno(status);
131 goto bail;
132 }
133
134 status = ocfs2_read_block(osb, OCFS2_I(inode)->ip_blkno,
135 &alloc_bh, 0, inode);
136 if (status < 0) {
137 mlog_errno(status);
138 goto bail;
139 }
140
141 alloc = (struct ocfs2_dinode *) alloc_bh->b_data;
142 la = OCFS2_LOCAL_ALLOC(alloc);
143
144 if (!(le32_to_cpu(alloc->i_flags) &
145 (OCFS2_LOCAL_ALLOC_FL|OCFS2_BITMAP_FL))) {
146 mlog(ML_ERROR, "Invalid local alloc inode, %"MLFu64"\n",
147 OCFS2_I(inode)->ip_blkno);
148 status = -EINVAL;
149 goto bail;
150 }
151
152 if ((la->la_size == 0) ||
153 (le16_to_cpu(la->la_size) > ocfs2_local_alloc_size(inode->i_sb))) {
154 mlog(ML_ERROR, "Local alloc size is invalid (la_size = %u)\n",
155 le16_to_cpu(la->la_size));
156 status = -EINVAL;
157 goto bail;
158 }
159
160 /* do a little verification. */
161 num_used = ocfs2_local_alloc_count_bits(alloc);
162
163 /* hopefully the local alloc has always been recovered before
164 * we load it. */
165 if (num_used
166 || alloc->id1.bitmap1.i_used
167 || alloc->id1.bitmap1.i_total
168 || la->la_bm_off)
169 mlog(ML_ERROR, "Local alloc hasn't been recovered!\n"
170 "found = %u, set = %u, taken = %u, off = %u\n",
171 num_used, le32_to_cpu(alloc->id1.bitmap1.i_used),
172 le32_to_cpu(alloc->id1.bitmap1.i_total),
173 OCFS2_LOCAL_ALLOC(alloc)->la_bm_off);
174
175 osb->local_alloc_bh = alloc_bh;
176 osb->local_alloc_state = OCFS2_LA_ENABLED;
177
178bail:
179 if (status < 0)
180 if (alloc_bh)
181 brelse(alloc_bh);
182 if (inode)
183 iput(inode);
184
185 mlog_exit(status);
186 return status;
187}
188
189/*
190 * return any unused bits to the bitmap and write out a clean
191 * local_alloc.
192 *
193 * local_alloc_bh is optional. If not passed, we will simply use the
194 * one off osb. If you do pass it however, be warned that it *will* be
195 * returned brelse'd and NULL'd out.*/
196void ocfs2_shutdown_local_alloc(struct ocfs2_super *osb)
197{
198 int status;
199 struct ocfs2_journal_handle *handle = NULL;
200 struct inode *local_alloc_inode = NULL;
201 struct buffer_head *bh = NULL;
202 struct buffer_head *main_bm_bh = NULL;
203 struct inode *main_bm_inode = NULL;
204 struct ocfs2_dinode *alloc_copy = NULL;
205 struct ocfs2_dinode *alloc = NULL;
206
207 mlog_entry_void();
208
209 if (osb->local_alloc_state == OCFS2_LA_UNUSED)
210 goto bail;
211
212 local_alloc_inode =
213 ocfs2_get_system_file_inode(osb,
214 LOCAL_ALLOC_SYSTEM_INODE,
215 osb->slot_num);
216 if (!local_alloc_inode) {
217 status = -ENOENT;
218 mlog_errno(status);
219 goto bail;
220 }
221
222 osb->local_alloc_state = OCFS2_LA_DISABLED;
223
224 handle = ocfs2_alloc_handle(osb);
225 if (!handle) {
226 status = -ENOMEM;
227 mlog_errno(status);
228 goto bail;
229 }
230
231 main_bm_inode = ocfs2_get_system_file_inode(osb,
232 GLOBAL_BITMAP_SYSTEM_INODE,
233 OCFS2_INVALID_SLOT);
234 if (!main_bm_inode) {
235 status = -EINVAL;
236 mlog_errno(status);
237 goto bail;
238 }
239
240 ocfs2_handle_add_inode(handle, main_bm_inode);
241 status = ocfs2_meta_lock(main_bm_inode, handle, &main_bm_bh, 1);
242 if (status < 0) {
243 mlog_errno(status);
244 goto bail;
245 }
246
247 /* WINDOW_MOVE_CREDITS is a bit heavy... */
248 handle = ocfs2_start_trans(osb, handle, OCFS2_WINDOW_MOVE_CREDITS);
249 if (IS_ERR(handle)) {
250 mlog_errno(PTR_ERR(handle));
251 handle = NULL;
252 goto bail;
253 }
254
255 bh = osb->local_alloc_bh;
256 alloc = (struct ocfs2_dinode *) bh->b_data;
257
258 alloc_copy = kmalloc(bh->b_size, GFP_KERNEL);
259 if (!alloc_copy) {
260 status = -ENOMEM;
261 goto bail;
262 }
263 memcpy(alloc_copy, alloc, bh->b_size);
264
265 status = ocfs2_journal_access(handle, local_alloc_inode, bh,
266 OCFS2_JOURNAL_ACCESS_WRITE);
267 if (status < 0) {
268 mlog_errno(status);
269 goto bail;
270 }
271
272 ocfs2_clear_local_alloc(alloc);
273
274 status = ocfs2_journal_dirty(handle, bh);
275 if (status < 0) {
276 mlog_errno(status);
277 goto bail;
278 }
279
280 brelse(bh);
281 osb->local_alloc_bh = NULL;
282 osb->local_alloc_state = OCFS2_LA_UNUSED;
283
284 status = ocfs2_sync_local_to_main(osb, handle, alloc_copy,
285 main_bm_inode, main_bm_bh);
286 if (status < 0)
287 mlog_errno(status);
288
289bail:
290 if (handle)
291 ocfs2_commit_trans(handle);
292
293 if (main_bm_bh)
294 brelse(main_bm_bh);
295
296 if (main_bm_inode)
297 iput(main_bm_inode);
298
299 if (local_alloc_inode)
300 iput(local_alloc_inode);
301
302 if (alloc_copy)
303 kfree(alloc_copy);
304
305 mlog_exit_void();
306}
307
308/*
309 * We want to free the bitmap bits outside of any recovery context as
310 * we'll need a cluster lock to do so, but we must clear the local
311 * alloc before giving up the recovered nodes journal. To solve this,
312 * we kmalloc a copy of the local alloc before it's change for the
313 * caller to process with ocfs2_complete_local_alloc_recovery
314 */
315int ocfs2_begin_local_alloc_recovery(struct ocfs2_super *osb,
316 int slot_num,
317 struct ocfs2_dinode **alloc_copy)
318{
319 int status = 0;
320 struct buffer_head *alloc_bh = NULL;
321 struct inode *inode = NULL;
322 struct ocfs2_dinode *alloc;
323
324 mlog_entry("(slot_num = %d)\n", slot_num);
325
326 *alloc_copy = NULL;
327
328 inode = ocfs2_get_system_file_inode(osb,
329 LOCAL_ALLOC_SYSTEM_INODE,
330 slot_num);
331 if (!inode) {
332 status = -EINVAL;
333 mlog_errno(status);
334 goto bail;
335 }
336
337 down(&inode->i_sem);
338
339 status = ocfs2_read_block(osb, OCFS2_I(inode)->ip_blkno,
340 &alloc_bh, 0, inode);
341 if (status < 0) {
342 mlog_errno(status);
343 goto bail;
344 }
345
346 *alloc_copy = kmalloc(alloc_bh->b_size, GFP_KERNEL);
347 if (!(*alloc_copy)) {
348 status = -ENOMEM;
349 goto bail;
350 }
351 memcpy((*alloc_copy), alloc_bh->b_data, alloc_bh->b_size);
352
353 alloc = (struct ocfs2_dinode *) alloc_bh->b_data;
354 ocfs2_clear_local_alloc(alloc);
355
356 status = ocfs2_write_block(osb, alloc_bh, inode);
357 if (status < 0)
358 mlog_errno(status);
359
360bail:
361 if ((status < 0) && (*alloc_copy)) {
362 kfree(*alloc_copy);
363 *alloc_copy = NULL;
364 }
365
366 if (alloc_bh)
367 brelse(alloc_bh);
368
369 if (inode) {
370 up(&inode->i_sem);
371 iput(inode);
372 }
373
374 mlog_exit(status);
375 return status;
376}
377
378/*
379 * Step 2: By now, we've completed the journal recovery, we've stamped
380 * a clean local alloc on disk and dropped the node out of the
381 * recovery map. Dlm locks will no longer stall, so lets clear out the
382 * main bitmap.
383 */
384int ocfs2_complete_local_alloc_recovery(struct ocfs2_super *osb,
385 struct ocfs2_dinode *alloc)
386{
387 int status;
388 struct ocfs2_journal_handle *handle = NULL;
389 struct buffer_head *main_bm_bh = NULL;
390 struct inode *main_bm_inode = NULL;
391
392 mlog_entry_void();
393
394 handle = ocfs2_alloc_handle(osb);
395 if (!handle) {
396 status = -ENOMEM;
397 mlog_errno(status);
398 goto bail;
399 }
400
401 main_bm_inode = ocfs2_get_system_file_inode(osb,
402 GLOBAL_BITMAP_SYSTEM_INODE,
403 OCFS2_INVALID_SLOT);
404 if (!main_bm_inode) {
405 status = -EINVAL;
406 mlog_errno(status);
407 goto bail;
408 }
409
410 ocfs2_handle_add_inode(handle, main_bm_inode);
411 status = ocfs2_meta_lock(main_bm_inode, handle, &main_bm_bh, 1);
412 if (status < 0) {
413 mlog_errno(status);
414 goto bail;
415 }
416
417 handle = ocfs2_start_trans(osb, handle, OCFS2_WINDOW_MOVE_CREDITS);
418 if (IS_ERR(handle)) {
419 status = PTR_ERR(handle);
420 handle = NULL;
421 mlog_errno(status);
422 goto bail;
423 }
424
425 /* we want the bitmap change to be recorded on disk asap */
426 ocfs2_handle_set_sync(handle, 1);
427
428 status = ocfs2_sync_local_to_main(osb, handle, alloc,
429 main_bm_inode, main_bm_bh);
430 if (status < 0)
431 mlog_errno(status);
432
433bail:
434 if (handle)
435 ocfs2_commit_trans(handle);
436
437 if (main_bm_bh)
438 brelse(main_bm_bh);
439
440 if (main_bm_inode)
441 iput(main_bm_inode);
442
443 mlog_exit(status);
444 return status;
445}
446
447/*
448 * make sure we've got at least bitswanted contiguous bits in the
449 * local alloc. You lose them when you drop i_sem.
450 *
451 * We will add ourselves to the transaction passed in, but may start
452 * our own in order to shift windows.
453 */
454int ocfs2_reserve_local_alloc_bits(struct ocfs2_super *osb,
455 struct ocfs2_journal_handle *passed_handle,
456 u32 bits_wanted,
457 struct ocfs2_alloc_context *ac)
458{
459 int status;
460 struct ocfs2_dinode *alloc;
461 struct inode *local_alloc_inode;
462 unsigned int free_bits;
463
464 mlog_entry_void();
465
466 BUG_ON(!passed_handle);
467 BUG_ON(!ac);
468 BUG_ON(passed_handle->flags & OCFS2_HANDLE_STARTED);
469
470 local_alloc_inode =
471 ocfs2_get_system_file_inode(osb,
472 LOCAL_ALLOC_SYSTEM_INODE,
473 osb->slot_num);
474 if (!local_alloc_inode) {
475 status = -ENOENT;
476 mlog_errno(status);
477 goto bail;
478 }
479 ocfs2_handle_add_inode(passed_handle, local_alloc_inode);
480
481 if (osb->local_alloc_state != OCFS2_LA_ENABLED) {
482 status = -ENOSPC;
483 goto bail;
484 }
485
486 if (bits_wanted > ocfs2_local_alloc_window_bits(osb)) {
487 mlog(0, "Asking for more than my max window size!\n");
488 status = -ENOSPC;
489 goto bail;
490 }
491
492 alloc = (struct ocfs2_dinode *) osb->local_alloc_bh->b_data;
493
494 if (le32_to_cpu(alloc->id1.bitmap1.i_used) !=
495 ocfs2_local_alloc_count_bits(alloc)) {
496 ocfs2_error(osb->sb, "local alloc inode %"MLFu64" says it has "
497 "%u free bits, but a count shows %u",
498 le64_to_cpu(alloc->i_blkno),
499 le32_to_cpu(alloc->id1.bitmap1.i_used),
500 ocfs2_local_alloc_count_bits(alloc));
501 status = -EIO;
502 goto bail;
503 }
504
505 free_bits = le32_to_cpu(alloc->id1.bitmap1.i_total) -
506 le32_to_cpu(alloc->id1.bitmap1.i_used);
507 if (bits_wanted > free_bits) {
508 /* uhoh, window change time. */
509 status =
510 ocfs2_local_alloc_slide_window(osb, local_alloc_inode);
511 if (status < 0) {
512 if (status != -ENOSPC)
513 mlog_errno(status);
514 goto bail;
515 }
516 }
517
518 ac->ac_inode = igrab(local_alloc_inode);
519 get_bh(osb->local_alloc_bh);
520 ac->ac_bh = osb->local_alloc_bh;
521 ac->ac_which = OCFS2_AC_USE_LOCAL;
522 status = 0;
523bail:
524 if (local_alloc_inode)
525 iput(local_alloc_inode);
526
527 mlog_exit(status);
528 return status;
529}
530
531int ocfs2_claim_local_alloc_bits(struct ocfs2_super *osb,
532 struct ocfs2_journal_handle *handle,
533 struct ocfs2_alloc_context *ac,
534 u32 min_bits,
535 u32 *bit_off,
536 u32 *num_bits)
537{
538 int status, start;
539 struct inode *local_alloc_inode;
540 u32 bits_wanted;
541 void *bitmap;
542 struct ocfs2_dinode *alloc;
543 struct ocfs2_local_alloc *la;
544
545 mlog_entry_void();
546 BUG_ON(ac->ac_which != OCFS2_AC_USE_LOCAL);
547
548 bits_wanted = ac->ac_bits_wanted - ac->ac_bits_given;
549 local_alloc_inode = ac->ac_inode;
550 alloc = (struct ocfs2_dinode *) osb->local_alloc_bh->b_data;
551 la = OCFS2_LOCAL_ALLOC(alloc);
552
553 start = ocfs2_local_alloc_find_clear_bits(osb, alloc, bits_wanted);
554 if (start == -1) {
555 /* TODO: Shouldn't we just BUG here? */
556 status = -ENOSPC;
557 mlog_errno(status);
558 goto bail;
559 }
560
561 bitmap = la->la_bitmap;
562 *bit_off = le32_to_cpu(la->la_bm_off) + start;
563 /* local alloc is always contiguous by nature -- we never
564 * delete bits from it! */
565 *num_bits = bits_wanted;
566
567 status = ocfs2_journal_access(handle, local_alloc_inode,
568 osb->local_alloc_bh,
569 OCFS2_JOURNAL_ACCESS_WRITE);
570 if (status < 0) {
571 mlog_errno(status);
572 goto bail;
573 }
574
575 while(bits_wanted--)
576 ocfs2_set_bit(start++, bitmap);
577
578 alloc->id1.bitmap1.i_used = cpu_to_le32(*num_bits +
579 le32_to_cpu(alloc->id1.bitmap1.i_used));
580
581 status = ocfs2_journal_dirty(handle, osb->local_alloc_bh);
582 if (status < 0) {
583 mlog_errno(status);
584 goto bail;
585 }
586
587 status = 0;
588bail:
589 mlog_exit(status);
590 return status;
591}
592
593static u32 ocfs2_local_alloc_count_bits(struct ocfs2_dinode *alloc)
594{
595 int i;
596 u8 *buffer;
597 u32 count = 0;
598 struct ocfs2_local_alloc *la = OCFS2_LOCAL_ALLOC(alloc);
599
600 mlog_entry_void();
601
602 buffer = la->la_bitmap;
603 for (i = 0; i < le16_to_cpu(la->la_size); i++)
604 count += hweight8(buffer[i]);
605
606 mlog_exit(count);
607 return count;
608}
609
610static int ocfs2_local_alloc_find_clear_bits(struct ocfs2_super *osb,
611 struct ocfs2_dinode *alloc,
612 u32 numbits)
613{
614 int numfound, bitoff, left, startoff, lastzero;
615 void *bitmap = NULL;
616
617 mlog_entry("(numbits wanted = %u)\n", numbits);
618
619 if (!alloc->id1.bitmap1.i_total) {
620 mlog(0, "No bits in my window!\n");
621 bitoff = -1;
622 goto bail;
623 }
624
625 bitmap = OCFS2_LOCAL_ALLOC(alloc)->la_bitmap;
626
627 numfound = bitoff = startoff = 0;
628 lastzero = -1;
629 left = le32_to_cpu(alloc->id1.bitmap1.i_total);
630 while ((bitoff = ocfs2_find_next_zero_bit(bitmap, left, startoff)) != -1) {
631 if (bitoff == left) {
632 /* mlog(0, "bitoff (%d) == left", bitoff); */
633 break;
634 }
635 /* mlog(0, "Found a zero: bitoff = %d, startoff = %d, "
636 "numfound = %d\n", bitoff, startoff, numfound);*/
637
638 /* Ok, we found a zero bit... is it contig. or do we
639 * start over?*/
640 if (bitoff == startoff) {
641 /* we found a zero */
642 numfound++;
643 startoff++;
644 } else {
645 /* got a zero after some ones */
646 numfound = 1;
647 startoff = bitoff+1;
648 }
649 /* we got everything we needed */
650 if (numfound == numbits) {
651 /* mlog(0, "Found it all!\n"); */
652 break;
653 }
654 }
655
656 mlog(0, "Exiting loop, bitoff = %d, numfound = %d\n", bitoff,
657 numfound);
658
659 if (numfound == numbits)
660 bitoff = startoff - numfound;
661 else
662 bitoff = -1;
663
664bail:
665 mlog_exit(bitoff);
666 return bitoff;
667}
668
669static void ocfs2_clear_local_alloc(struct ocfs2_dinode *alloc)
670{
671 struct ocfs2_local_alloc *la = OCFS2_LOCAL_ALLOC(alloc);
672 int i;
673 mlog_entry_void();
674
675 alloc->id1.bitmap1.i_total = 0;
676 alloc->id1.bitmap1.i_used = 0;
677 la->la_bm_off = 0;
678 for(i = 0; i < le16_to_cpu(la->la_size); i++)
679 la->la_bitmap[i] = 0;
680
681 mlog_exit_void();
682}
683
#if 0
/*
 * Debugging aid for window shifts (enable this and uncomment its
 * callers). Walks 'count' bits starting at 'start' from the highest
 * one down and BUGs on the first bit found set.
 */
static void ocfs2_verify_zero_bits(unsigned long *bitmap,
				   unsigned int start,
				   unsigned int count)
{
	unsigned int idx;

	for (idx = count; idx-- > 0; ) {
		if (!ocfs2_test_bit(start + idx, bitmap))
			continue;

		printk("ocfs2_verify_zero_bits: start = %u, count = "
		       "%u\n", start, count);
		printk("ocfs2_verify_zero_bits: bit %u is set!",
		       start + idx);
		BUG();
	}
}
#endif
702
703/*
704 * sync the local alloc to main bitmap.
705 *
706 * assumes you've already locked the main bitmap -- the bitmap inode
707 * passed is used for caching.
708 */
709static int ocfs2_sync_local_to_main(struct ocfs2_super *osb,
710 struct ocfs2_journal_handle *handle,
711 struct ocfs2_dinode *alloc,
712 struct inode *main_bm_inode,
713 struct buffer_head *main_bm_bh)
714{
715 int status = 0;
716 int bit_off, left, count, start;
717 u64 la_start_blk;
718 u64 blkno;
719 void *bitmap;
720 struct ocfs2_local_alloc *la = OCFS2_LOCAL_ALLOC(alloc);
721
722 mlog_entry("total = %u, COUNT = %u, used = %u\n",
723 le32_to_cpu(alloc->id1.bitmap1.i_total),
724 ocfs2_local_alloc_count_bits(alloc),
725 le32_to_cpu(alloc->id1.bitmap1.i_used));
726
727 if (!alloc->id1.bitmap1.i_total) {
728 mlog(0, "nothing to sync!\n");
729 goto bail;
730 }
731
732 if (le32_to_cpu(alloc->id1.bitmap1.i_used) ==
733 le32_to_cpu(alloc->id1.bitmap1.i_total)) {
734 mlog(0, "all bits were taken!\n");
735 goto bail;
736 }
737
738 la_start_blk = ocfs2_clusters_to_blocks(osb->sb,
739 le32_to_cpu(la->la_bm_off));
740 bitmap = la->la_bitmap;
741 start = count = bit_off = 0;
742 left = le32_to_cpu(alloc->id1.bitmap1.i_total);
743
744 while ((bit_off = ocfs2_find_next_zero_bit(bitmap, left, start))
745 != -1) {
746 if ((bit_off < left) && (bit_off == start)) {
747 count++;
748 start++;
749 continue;
750 }
751 if (count) {
752 blkno = la_start_blk +
753 ocfs2_clusters_to_blocks(osb->sb,
754 start - count);
755
756 mlog(0, "freeing %u bits starting at local "
757 "alloc bit %u (la_start_blk = %"MLFu64", "
758 "blkno = %"MLFu64")\n", count, start - count,
759 la_start_blk, blkno);
760
761 status = ocfs2_free_clusters(handle, main_bm_inode,
762 main_bm_bh, blkno, count);
763 if (status < 0) {
764 mlog_errno(status);
765 goto bail;
766 }
767 }
768 if (bit_off >= left)
769 break;
770 count = 1;
771 start = bit_off + 1;
772 }
773
774bail:
775 mlog_exit(status);
776 return status;
777}
778
779static int ocfs2_local_alloc_reserve_for_window(struct ocfs2_super *osb,
780 struct ocfs2_journal_handle *handle,
781 struct ocfs2_alloc_context **ac,
782 struct inode **bitmap_inode,
783 struct buffer_head **bitmap_bh)
784{
785 int status;
786
787 *ac = kcalloc(1, sizeof(struct ocfs2_alloc_context), GFP_KERNEL);
788 if (!(*ac)) {
789 status = -ENOMEM;
790 mlog_errno(status);
791 goto bail;
792 }
793
794 (*ac)->ac_handle = handle;
795 (*ac)->ac_bits_wanted = ocfs2_local_alloc_window_bits(osb);
796
797 status = ocfs2_reserve_cluster_bitmap_bits(osb, *ac);
798 if (status < 0) {
799 if (status != -ENOSPC)
800 mlog_errno(status);
801 goto bail;
802 }
803
804 *bitmap_inode = (*ac)->ac_inode;
805 igrab(*bitmap_inode);
806 *bitmap_bh = (*ac)->ac_bh;
807 get_bh(*bitmap_bh);
808 status = 0;
809bail:
810 if ((status < 0) && *ac) {
811 ocfs2_free_alloc_context(*ac);
812 *ac = NULL;
813 }
814
815 mlog_exit(status);
816 return status;
817}
818
819/*
820 * pass it the bitmap lock in lock_bh if you have it.
821 */
822static int ocfs2_local_alloc_new_window(struct ocfs2_super *osb,
823 struct ocfs2_journal_handle *handle,
824 struct ocfs2_alloc_context *ac)
825{
826 int status = 0;
827 u32 cluster_off, cluster_count;
828 struct ocfs2_dinode *alloc = NULL;
829 struct ocfs2_local_alloc *la;
830
831 mlog_entry_void();
832
833 alloc = (struct ocfs2_dinode *) osb->local_alloc_bh->b_data;
834 la = OCFS2_LOCAL_ALLOC(alloc);
835
836 if (alloc->id1.bitmap1.i_total)
837 mlog(0, "asking me to alloc a new window over a non-empty "
838 "one\n");
839
840 mlog(0, "Allocating %u clusters for a new window.\n",
841 ocfs2_local_alloc_window_bits(osb));
842 /* we used the generic suballoc reserve function, but we set
843 * everything up nicely, so there's no reason why we can't use
844 * the more specific cluster api to claim bits. */
845 status = ocfs2_claim_clusters(osb, handle, ac,
846 ocfs2_local_alloc_window_bits(osb),
847 &cluster_off, &cluster_count);
848 if (status < 0) {
849 if (status != -ENOSPC)
850 mlog_errno(status);
851 goto bail;
852 }
853
854 la->la_bm_off = cpu_to_le32(cluster_off);
855 alloc->id1.bitmap1.i_total = cpu_to_le32(cluster_count);
856 /* just in case... In the future when we find space ourselves,
857 * we don't have to get all contiguous -- but we'll have to
858 * set all previously used bits in bitmap and update
859 * la_bits_set before setting the bits in the main bitmap. */
860 alloc->id1.bitmap1.i_used = 0;
861 memset(OCFS2_LOCAL_ALLOC(alloc)->la_bitmap, 0,
862 le16_to_cpu(la->la_size));
863
864 mlog(0, "New window allocated:\n");
865 mlog(0, "window la_bm_off = %u\n",
866 OCFS2_LOCAL_ALLOC(alloc)->la_bm_off);
867 mlog(0, "window bits = %u\n", le32_to_cpu(alloc->id1.bitmap1.i_total));
868
869bail:
870 mlog_exit(status);
871 return status;
872}
873
874/* Note that we do *NOT* lock the local alloc inode here as
875 * it's been locked already for us. */
876static int ocfs2_local_alloc_slide_window(struct ocfs2_super *osb,
877 struct inode *local_alloc_inode)
878{
879 int status = 0;
880 struct buffer_head *main_bm_bh = NULL;
881 struct inode *main_bm_inode = NULL;
882 struct ocfs2_journal_handle *handle = NULL;
883 struct ocfs2_dinode *alloc;
884 struct ocfs2_dinode *alloc_copy = NULL;
885 struct ocfs2_alloc_context *ac = NULL;
886
887 mlog_entry_void();
888
889 handle = ocfs2_alloc_handle(osb);
890 if (!handle) {
891 status = -ENOMEM;
892 mlog_errno(status);
893 goto bail;
894 }
895
896 /* This will lock the main bitmap for us. */
897 status = ocfs2_local_alloc_reserve_for_window(osb,
898 handle,
899 &ac,
900 &main_bm_inode,
901 &main_bm_bh);
902 if (status < 0) {
903 if (status != -ENOSPC)
904 mlog_errno(status);
905 goto bail;
906 }
907
908 handle = ocfs2_start_trans(osb, handle, OCFS2_WINDOW_MOVE_CREDITS);
909 if (IS_ERR(handle)) {
910 status = PTR_ERR(handle);
911 handle = NULL;
912 mlog_errno(status);
913 goto bail;
914 }
915
916 alloc = (struct ocfs2_dinode *) osb->local_alloc_bh->b_data;
917
918 /* We want to clear the local alloc before doing anything
919 * else, so that if we error later during this operation,
920 * local alloc shutdown won't try to double free main bitmap
921 * bits. Make a copy so the sync function knows which bits to
922 * free. */
923 alloc_copy = kmalloc(osb->local_alloc_bh->b_size, GFP_KERNEL);
924 if (!alloc_copy) {
925 status = -ENOMEM;
926 mlog_errno(status);
927 goto bail;
928 }
929 memcpy(alloc_copy, alloc, osb->local_alloc_bh->b_size);
930
931 status = ocfs2_journal_access(handle, local_alloc_inode,
932 osb->local_alloc_bh,
933 OCFS2_JOURNAL_ACCESS_WRITE);
934 if (status < 0) {
935 mlog_errno(status);
936 goto bail;
937 }
938
939 ocfs2_clear_local_alloc(alloc);
940
941 status = ocfs2_journal_dirty(handle, osb->local_alloc_bh);
942 if (status < 0) {
943 mlog_errno(status);
944 goto bail;
945 }
946
947 status = ocfs2_sync_local_to_main(osb, handle, alloc_copy,
948 main_bm_inode, main_bm_bh);
949 if (status < 0) {
950 mlog_errno(status);
951 goto bail;
952 }
953
954 status = ocfs2_local_alloc_new_window(osb, handle, ac);
955 if (status < 0) {
956 if (status != -ENOSPC)
957 mlog_errno(status);
958 goto bail;
959 }
960
961 atomic_inc(&osb->alloc_stats.moves);
962
963 status = 0;
964bail:
965 if (handle)
966 ocfs2_commit_trans(handle);
967
968 if (main_bm_bh)
969 brelse(main_bm_bh);
970
971 if (main_bm_inode)
972 iput(main_bm_inode);
973
974 if (alloc_copy)
975 kfree(alloc_copy);
976
977 if (ac)
978 ocfs2_free_alloc_context(ac);
979
980 mlog_exit(status);
981 return status;
982}
983
diff --git a/fs/ocfs2/localalloc.h b/fs/ocfs2/localalloc.h
new file mode 100644
index 000000000000..30f88ce14e46
--- /dev/null
+++ b/fs/ocfs2/localalloc.h
@@ -0,0 +1,56 @@
1/* -*- mode: c; c-basic-offset: 8; -*-
2 * vim: noexpandtab sw=8 ts=8 sts=0:
3 *
4 * localalloc.h
5 *
6 * Function prototypes
7 *
8 * Copyright (C) 2002, 2004 Oracle. All rights reserved.
9 *
10 * This program is free software; you can redistribute it and/or
11 * modify it under the terms of the GNU General Public
12 * License as published by the Free Software Foundation; either
13 * version 2 of the License, or (at your option) any later version.
14 *
15 * This program is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
18 * General Public License for more details.
19 *
20 * You should have received a copy of the GNU General Public
21 * License along with this program; if not, write to the
22 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
23 * Boston, MA 02111-1307, USA.
24 */
25
26#ifndef OCFS2_LOCALALLOC_H
27#define OCFS2_LOCALALLOC_H
28
29int ocfs2_load_local_alloc(struct ocfs2_super *osb);
30
31void ocfs2_shutdown_local_alloc(struct ocfs2_super *osb);
32
33int ocfs2_begin_local_alloc_recovery(struct ocfs2_super *osb,
34 int node_num,
35 struct ocfs2_dinode **alloc_copy);
36
37int ocfs2_complete_local_alloc_recovery(struct ocfs2_super *osb,
38 struct ocfs2_dinode *alloc);
39
40int ocfs2_alloc_should_use_local(struct ocfs2_super *osb,
41 u64 bits);
42
43struct ocfs2_alloc_context;
44int ocfs2_reserve_local_alloc_bits(struct ocfs2_super *osb,
45 struct ocfs2_journal_handle *passed_handle,
46 u32 bits_wanted,
47 struct ocfs2_alloc_context *ac);
48
49int ocfs2_claim_local_alloc_bits(struct ocfs2_super *osb,
50 struct ocfs2_journal_handle *handle,
51 struct ocfs2_alloc_context *ac,
52 u32 min_bits,
53 u32 *bit_off,
54 u32 *num_bits);
55
56#endif /* OCFS2_LOCALALLOC_H */
diff --git a/fs/ocfs2/mmap.c b/fs/ocfs2/mmap.c
new file mode 100644
index 000000000000..afdeec4b0eef
--- /dev/null
+++ b/fs/ocfs2/mmap.c
@@ -0,0 +1,102 @@
1/* -*- mode: c; c-basic-offset: 8; -*-
2 * vim: noexpandtab sw=8 ts=8 sts=0:
3 *
4 * mmap.c
5 *
6 * Code to deal with the mess that is clustered mmap.
7 *
8 * Copyright (C) 2002, 2004 Oracle. All rights reserved.
9 *
10 * This program is free software; you can redistribute it and/or
11 * modify it under the terms of the GNU General Public
12 * License as published by the Free Software Foundation; either
13 * version 2 of the License, or (at your option) any later version.
14 *
15 * This program is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
18 * General Public License for more details.
19 *
20 * You should have received a copy of the GNU General Public
21 * License along with this program; if not, write to the
22 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
23 * Boston, MA 02111-1307, USA.
24 */
25
26#include <linux/fs.h>
27#include <linux/types.h>
28#include <linux/slab.h>
29#include <linux/highmem.h>
30#include <linux/pagemap.h>
31#include <linux/uio.h>
32#include <linux/signal.h>
33#include <linux/rbtree.h>
34
35#define MLOG_MASK_PREFIX ML_FILE_IO
36#include <cluster/masklog.h>
37
38#include "ocfs2.h"
39
40#include "dlmglue.h"
41#include "file.h"
42#include "inode.h"
43#include "mmap.h"
44
45static struct page *ocfs2_nopage(struct vm_area_struct * area,
46 unsigned long address,
47 int *type)
48{
49 struct inode *inode = area->vm_file->f_dentry->d_inode;
50 struct page *page = NOPAGE_SIGBUS;
51 sigset_t blocked, oldset;
52 int ret;
53
54 mlog_entry("(inode %lu, address %lu)\n", inode->i_ino, address);
55
56 /* The best way to deal with signals in this path is
57 * to block them upfront, rather than allowing the
58 * locking paths to return -ERESTARTSYS. */
59 sigfillset(&blocked);
60
61 /* We should technically never get a bad ret return
62 * from sigprocmask */
63 ret = sigprocmask(SIG_BLOCK, &blocked, &oldset);
64 if (ret < 0) {
65 mlog_errno(ret);
66 goto out;
67 }
68
69 page = filemap_nopage(area, address, type);
70
71 ret = sigprocmask(SIG_SETMASK, &oldset, NULL);
72 if (ret < 0)
73 mlog_errno(ret);
74out:
75 mlog_exit_ptr(page);
76 return page;
77}
78
79static struct vm_operations_struct ocfs2_file_vm_ops = {
80 .nopage = ocfs2_nopage,
81};
82
83int ocfs2_mmap(struct file *file,
84 struct vm_area_struct *vma)
85{
86 struct address_space *mapping = file->f_dentry->d_inode->i_mapping;
87 struct inode *inode = mapping->host;
88
89 /* We don't want to support shared writable mappings yet. */
90 if (((vma->vm_flags & VM_SHARED) || (vma->vm_flags & VM_MAYSHARE))
91 && ((vma->vm_flags & VM_WRITE) || (vma->vm_flags & VM_MAYWRITE))) {
92 mlog(0, "disallow shared writable mmaps %lx\n", vma->vm_flags);
93 /* This is -EINVAL because generic_file_readonly_mmap
94 * returns it in a similar situation. */
95 return -EINVAL;
96 }
97
98 update_atime(inode);
99 vma->vm_ops = &ocfs2_file_vm_ops;
100 return 0;
101}
102
diff --git a/fs/ocfs2/mmap.h b/fs/ocfs2/mmap.h
new file mode 100644
index 000000000000..1274ee0f1fe2
--- /dev/null
+++ b/fs/ocfs2/mmap.h
@@ -0,0 +1,6 @@
#ifndef OCFS2_MMAP_H
#define OCFS2_MMAP_H

/* mmap handler for OCFS2 files; implemented in mmap.c. */
int ocfs2_mmap(struct file *file,
	       struct vm_area_struct *vma);

#endif /* OCFS2_MMAP_H */
diff --git a/fs/ocfs2/namei.c b/fs/ocfs2/namei.c
new file mode 100644
index 000000000000..f6b77ff1d2bf
--- /dev/null
+++ b/fs/ocfs2/namei.c
@@ -0,0 +1,2264 @@
1/* -*- mode: c; c-basic-offset: 8; -*-
2 * vim: noexpandtab sw=8 ts=8 sts=0:
3 *
4 * namei.c
5 *
6 * Create and rename file, directory, symlinks
7 *
8 * Copyright (C) 2002, 2004 Oracle. All rights reserved.
9 *
10 * Portions of this code from linux/fs/ext3/dir.c
11 *
12 * Copyright (C) 1992, 1993, 1994, 1995
13 * Remy Card (card@masi.ibp.fr)
14 * Laboratoire MASI - Institut Blaise pascal
15 * Universite Pierre et Marie Curie (Paris VI)
16 *
17 * from
18 *
19 * linux/fs/minix/dir.c
20 *
21 * Copyright (C) 1991, 1992 Linus Torvalds
22 *
23 * This program is free software; you can redistribute it and/or
24 * modify it under the terms of the GNU General Public
25 * License as published by the Free Software Foundation; either
26 * version 2 of the License, or (at your option) any later version.
27 *
28 * This program is distributed in the hope that it will be useful,
29 * but WITHOUT ANY WARRANTY; without even the implied warranty of
30 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
31 * General Public License for more details.
32 *
33 * You should have received a copy of the GNU General Public
34 * License along with this program; if not, write to the
35 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
36 * Boston, MA 02111-1307, USA.
37 */
38
39#include <linux/fs.h>
40#include <linux/types.h>
41#include <linux/slab.h>
42#include <linux/highmem.h>
43
44#define MLOG_MASK_PREFIX ML_NAMEI
45#include <cluster/masklog.h>
46
47#include "ocfs2.h"
48
49#include "alloc.h"
50#include "dcache.h"
51#include "dir.h"
52#include "dlmglue.h"
53#include "extent_map.h"
54#include "file.h"
55#include "inode.h"
56#include "journal.h"
57#include "namei.h"
58#include "suballoc.h"
59#include "symlink.h"
60#include "sysfile.h"
61#include "uptodate.h"
62#include "vote.h"
63
64#include "buffer_head_io.h"
65
66#define NAMEI_RA_CHUNKS 2
67#define NAMEI_RA_BLOCKS 4
68#define NAMEI_RA_SIZE (NAMEI_RA_CHUNKS * NAMEI_RA_BLOCKS)
69#define NAMEI_RA_INDEX(c,b) (((c) * NAMEI_RA_BLOCKS) + (b))
70
71static int inline ocfs2_search_dirblock(struct buffer_head *bh,
72 struct inode *dir,
73 const char *name, int namelen,
74 unsigned long offset,
75 struct ocfs2_dir_entry **res_dir);
76
77static int ocfs2_delete_entry(struct ocfs2_journal_handle *handle,
78 struct inode *dir,
79 struct ocfs2_dir_entry *de_del,
80 struct buffer_head *bh);
81
82static int __ocfs2_add_entry(struct ocfs2_journal_handle *handle,
83 struct inode *dir,
84 const char *name, int namelen,
85 struct inode *inode, u64 blkno,
86 struct buffer_head *parent_fe_bh,
87 struct buffer_head *insert_bh);
88
89static int ocfs2_mknod_locked(struct ocfs2_super *osb,
90 struct inode *dir,
91 struct dentry *dentry, int mode,
92 dev_t dev,
93 struct buffer_head **new_fe_bh,
94 struct buffer_head *parent_fe_bh,
95 struct ocfs2_journal_handle *handle,
96 struct inode **ret_inode,
97 struct ocfs2_alloc_context *inode_ac);
98
99static int ocfs2_fill_new_dir(struct ocfs2_super *osb,
100 struct ocfs2_journal_handle *handle,
101 struct inode *parent,
102 struct inode *inode,
103 struct buffer_head *fe_bh,
104 struct ocfs2_alloc_context *data_ac);
105
106static int ocfs2_double_lock(struct ocfs2_super *osb,
107 struct ocfs2_journal_handle *handle,
108 struct buffer_head **bh1,
109 struct inode *inode1,
110 struct buffer_head **bh2,
111 struct inode *inode2);
112
113static int ocfs2_prepare_orphan_dir(struct ocfs2_super *osb,
114 struct ocfs2_journal_handle *handle,
115 struct inode *inode,
116 char *name,
117 struct buffer_head **de_bh);
118
119static int ocfs2_orphan_add(struct ocfs2_super *osb,
120 struct ocfs2_journal_handle *handle,
121 struct inode *inode,
122 struct ocfs2_dinode *fe,
123 char *name,
124 struct buffer_head *de_bh);
125
126static int ocfs2_create_symlink_data(struct ocfs2_super *osb,
127 struct ocfs2_journal_handle *handle,
128 struct inode *inode,
129 const char *symname);
130
131static inline int ocfs2_add_entry(struct ocfs2_journal_handle *handle,
132 struct dentry *dentry,
133 struct inode *inode, u64 blkno,
134 struct buffer_head *parent_fe_bh,
135 struct buffer_head *insert_bh)
136{
137 return __ocfs2_add_entry(handle, dentry->d_parent->d_inode,
138 dentry->d_name.name, dentry->d_name.len,
139 inode, blkno, parent_fe_bh, insert_bh);
140}
141
/* Orphan dir names are a u64 rendered as hex text: two characters per
 * byte, not counting any terminating NUL. */
#define OCFS2_ORPHAN_NAMELEN ((int)(sizeof(u64) * 2))
144
145static struct dentry *ocfs2_lookup(struct inode *dir, struct dentry *dentry,
146 struct nameidata *nd)
147{
148 int status;
149 u64 blkno;
150 struct buffer_head *dirent_bh = NULL;
151 struct inode *inode = NULL;
152 struct dentry *ret;
153 struct ocfs2_dir_entry *dirent;
154 struct ocfs2_inode_info *oi;
155
156 mlog_entry("(0x%p, 0x%p, '%.*s')\n", dir, dentry,
157 dentry->d_name.len, dentry->d_name.name);
158
159 if (dentry->d_name.len > OCFS2_MAX_FILENAME_LEN) {
160 ret = ERR_PTR(-ENAMETOOLONG);
161 goto bail;
162 }
163
164 mlog(0, "find name %.*s in directory %"MLFu64"\n", dentry->d_name.len,
165 dentry->d_name.name, OCFS2_I(dir)->ip_blkno);
166
167 status = ocfs2_meta_lock(dir, NULL, NULL, 0);
168 if (status < 0) {
169 if (status != -ENOENT)
170 mlog_errno(status);
171 ret = ERR_PTR(status);
172 goto bail;
173 }
174
175 status = ocfs2_find_files_on_disk(dentry->d_name.name,
176 dentry->d_name.len, &blkno,
177 dir, &dirent_bh, &dirent);
178 if (status < 0)
179 goto bail_add;
180
181 inode = ocfs2_iget(OCFS2_SB(dir->i_sb), blkno);
182 if (IS_ERR(inode)) {
183 mlog(ML_ERROR, "Unable to create inode %"MLFu64"\n", blkno);
184 ret = ERR_PTR(-EACCES);
185 goto bail_unlock;
186 }
187
188 oi = OCFS2_I(inode);
189 /* Clear any orphaned state... If we were able to look up the
190 * inode from a directory, it certainly can't be orphaned. We
191 * might have the bad state from a node which intended to
192 * orphan this inode but crashed before it could commit the
193 * unlink. */
194 spin_lock(&oi->ip_lock);
195 oi->ip_flags &= ~OCFS2_INODE_MAYBE_ORPHANED;
196 oi->ip_orphaned_slot = OCFS2_INVALID_SLOT;
197 spin_unlock(&oi->ip_lock);
198
199bail_add:
200
201 dentry->d_op = &ocfs2_dentry_ops;
202 ret = d_splice_alias(inode, dentry);
203
204bail_unlock:
205 /* Don't drop the cluster lock until *after* the d_add --
206 * unlink on another node will message us to remove that
207 * dentry under this lock so otherwise we can race this with
208 * the vote thread and have a stale dentry. */
209 ocfs2_meta_unlock(dir, 0);
210
211bail:
212 if (dirent_bh)
213 brelse(dirent_bh);
214
215 mlog_exit_ptr(ret);
216
217 return ret;
218}
219
220static int ocfs2_fill_new_dir(struct ocfs2_super *osb,
221 struct ocfs2_journal_handle *handle,
222 struct inode *parent,
223 struct inode *inode,
224 struct buffer_head *fe_bh,
225 struct ocfs2_alloc_context *data_ac)
226{
227 int status;
228 struct buffer_head *new_bh = NULL;
229 struct ocfs2_dir_entry *de = NULL;
230
231 mlog_entry_void();
232
233 status = ocfs2_do_extend_dir(osb->sb, handle, inode, fe_bh,
234 data_ac, NULL, &new_bh);
235 if (status < 0) {
236 mlog_errno(status);
237 goto bail;
238 }
239
240 ocfs2_set_new_buffer_uptodate(inode, new_bh);
241
242 status = ocfs2_journal_access(handle, inode, new_bh,
243 OCFS2_JOURNAL_ACCESS_CREATE);
244 if (status < 0) {
245 mlog_errno(status);
246 goto bail;
247 }
248 memset(new_bh->b_data, 0, osb->sb->s_blocksize);
249
250 de = (struct ocfs2_dir_entry *) new_bh->b_data;
251 de->inode = cpu_to_le64(OCFS2_I(inode)->ip_blkno);
252 de->name_len = 1;
253 de->rec_len =
254 cpu_to_le16(OCFS2_DIR_REC_LEN(de->name_len));
255 strcpy(de->name, ".");
256 ocfs2_set_de_type(de, S_IFDIR);
257 de = (struct ocfs2_dir_entry *) ((char *)de + le16_to_cpu(de->rec_len));
258 de->inode = cpu_to_le64(OCFS2_I(parent)->ip_blkno);
259 de->rec_len = cpu_to_le16(inode->i_sb->s_blocksize -
260 OCFS2_DIR_REC_LEN(1));
261 de->name_len = 2;
262 strcpy(de->name, "..");
263 ocfs2_set_de_type(de, S_IFDIR);
264
265 status = ocfs2_journal_dirty(handle, new_bh);
266 if (status < 0) {
267 mlog_errno(status);
268 goto bail;
269 }
270
271 i_size_write(inode, inode->i_sb->s_blocksize);
272 inode->i_nlink = 2;
273 inode->i_blocks = ocfs2_align_bytes_to_sectors(inode->i_sb->s_blocksize);
274 status = ocfs2_mark_inode_dirty(handle, inode, fe_bh);
275 if (status < 0) {
276 mlog_errno(status);
277 goto bail;
278 }
279
280 status = 0;
281bail:
282 if (new_bh)
283 brelse(new_bh);
284
285 mlog_exit(status);
286 return status;
287}
288
289static int ocfs2_mknod(struct inode *dir,
290 struct dentry *dentry,
291 int mode,
292 dev_t dev)
293{
294 int status = 0;
295 struct buffer_head *parent_fe_bh = NULL;
296 struct ocfs2_journal_handle *handle = NULL;
297 struct ocfs2_super *osb;
298 struct ocfs2_dinode *dirfe;
299 struct buffer_head *new_fe_bh = NULL;
300 struct buffer_head *de_bh = NULL;
301 struct inode *inode = NULL;
302 struct ocfs2_alloc_context *inode_ac = NULL;
303 struct ocfs2_alloc_context *data_ac = NULL;
304
305 mlog_entry("(0x%p, 0x%p, %d, %lu, '%.*s')\n", dir, dentry, mode,
306 (unsigned long)dev, dentry->d_name.len,
307 dentry->d_name.name);
308
309 /* get our super block */
310 osb = OCFS2_SB(dir->i_sb);
311
312 if (S_ISDIR(mode) && (dir->i_nlink >= OCFS2_LINK_MAX)) {
313 mlog(ML_ERROR, "inode %"MLFu64" has i_nlink of %u\n",
314 OCFS2_I(dir)->ip_blkno, dir->i_nlink);
315 status = -EMLINK;
316 goto leave;
317 }
318
319 handle = ocfs2_alloc_handle(osb);
320 if (handle == NULL) {
321 status = -ENOMEM;
322 mlog_errno(status);
323 goto leave;
324 }
325
326 status = ocfs2_meta_lock(dir, handle, &parent_fe_bh, 1);
327 if (status < 0) {
328 if (status != -ENOENT)
329 mlog_errno(status);
330 goto leave;
331 }
332
333 dirfe = (struct ocfs2_dinode *) parent_fe_bh->b_data;
334 if (!dirfe->i_links_count) {
335 /* can't make a file in a deleted directory. */
336 status = -ENOENT;
337 goto leave;
338 }
339
340 status = ocfs2_check_dir_for_entry(dir, dentry->d_name.name,
341 dentry->d_name.len);
342 if (status)
343 goto leave;
344
345 /* get a spot inside the dir. */
346 status = ocfs2_prepare_dir_for_insert(osb, dir, parent_fe_bh,
347 dentry->d_name.name,
348 dentry->d_name.len, &de_bh);
349 if (status < 0) {
350 mlog_errno(status);
351 goto leave;
352 }
353
354 /* reserve an inode spot */
355 status = ocfs2_reserve_new_inode(osb, handle, &inode_ac);
356 if (status < 0) {
357 if (status != -ENOSPC)
358 mlog_errno(status);
359 goto leave;
360 }
361
362 /* are we making a directory? If so, reserve a cluster for his
363 * 1st extent. */
364 if (S_ISDIR(mode)) {
365 status = ocfs2_reserve_clusters(osb, handle, 1, &data_ac);
366 if (status < 0) {
367 if (status != -ENOSPC)
368 mlog_errno(status);
369 goto leave;
370 }
371 }
372
373 handle = ocfs2_start_trans(osb, handle, OCFS2_MKNOD_CREDITS);
374 if (IS_ERR(handle)) {
375 status = PTR_ERR(handle);
376 handle = NULL;
377 mlog_errno(status);
378 goto leave;
379 }
380
381 /* do the real work now. */
382 status = ocfs2_mknod_locked(osb, dir, dentry, mode, dev,
383 &new_fe_bh, parent_fe_bh, handle,
384 &inode, inode_ac);
385 if (status < 0) {
386 mlog_errno(status);
387 goto leave;
388 }
389
390 if (S_ISDIR(mode)) {
391 status = ocfs2_fill_new_dir(osb, handle, dir, inode,
392 new_fe_bh, data_ac);
393 if (status < 0) {
394 mlog_errno(status);
395 goto leave;
396 }
397
398 status = ocfs2_journal_access(handle, dir, parent_fe_bh,
399 OCFS2_JOURNAL_ACCESS_WRITE);
400 if (status < 0) {
401 mlog_errno(status);
402 goto leave;
403 }
404 le16_add_cpu(&dirfe->i_links_count, 1);
405 status = ocfs2_journal_dirty(handle, parent_fe_bh);
406 if (status < 0) {
407 mlog_errno(status);
408 goto leave;
409 }
410 dir->i_nlink++;
411 }
412
413 status = ocfs2_add_entry(handle, dentry, inode,
414 OCFS2_I(inode)->ip_blkno, parent_fe_bh,
415 de_bh);
416 if (status < 0) {
417 mlog_errno(status);
418 goto leave;
419 }
420
421 insert_inode_hash(inode);
422 dentry->d_op = &ocfs2_dentry_ops;
423 d_instantiate(dentry, inode);
424 status = 0;
425leave:
426 if (handle)
427 ocfs2_commit_trans(handle);
428
429 if (status == -ENOSPC)
430 mlog(0, "Disk is full\n");
431
432 if (new_fe_bh)
433 brelse(new_fe_bh);
434
435 if (de_bh)
436 brelse(de_bh);
437
438 if (parent_fe_bh)
439 brelse(parent_fe_bh);
440
441 if ((status < 0) && inode)
442 iput(inode);
443
444 if (inode_ac)
445 ocfs2_free_alloc_context(inode_ac);
446
447 if (data_ac)
448 ocfs2_free_alloc_context(data_ac);
449
450 mlog_exit(status);
451
452 return status;
453}
454
455static int ocfs2_mknod_locked(struct ocfs2_super *osb,
456 struct inode *dir,
457 struct dentry *dentry, int mode,
458 dev_t dev,
459 struct buffer_head **new_fe_bh,
460 struct buffer_head *parent_fe_bh,
461 struct ocfs2_journal_handle *handle,
462 struct inode **ret_inode,
463 struct ocfs2_alloc_context *inode_ac)
464{
465 int status = 0;
466 struct ocfs2_dinode *fe = NULL;
467 struct ocfs2_extent_list *fel;
468 u64 fe_blkno = 0;
469 u16 suballoc_bit;
470 struct inode *inode = NULL;
471
472 mlog_entry("(0x%p, 0x%p, %d, %lu, '%.*s')\n", dir, dentry, mode,
473 (unsigned long)dev, dentry->d_name.len,
474 dentry->d_name.name);
475
476 *new_fe_bh = NULL;
477 *ret_inode = NULL;
478
479 status = ocfs2_claim_new_inode(osb, handle, inode_ac, &suballoc_bit,
480 &fe_blkno);
481 if (status < 0) {
482 mlog_errno(status);
483 goto leave;
484 }
485
486 inode = new_inode(dir->i_sb);
487 if (IS_ERR(inode)) {
488 status = PTR_ERR(inode);
489 mlog(ML_ERROR, "new_inode failed!\n");
490 goto leave;
491 }
492
493 /* populate as many fields early on as possible - many of
494 * these are used by the support functions here and in
495 * callers. */
496 inode->i_ino = ino_from_blkno(osb->sb, fe_blkno);
497 OCFS2_I(inode)->ip_blkno = fe_blkno;
498 if (S_ISDIR(mode))
499 inode->i_nlink = 2;
500 else
501 inode->i_nlink = 1;
502 inode->i_mode = mode;
503 spin_lock(&osb->osb_lock);
504 inode->i_generation = osb->s_next_generation++;
505 spin_unlock(&osb->osb_lock);
506
507 *new_fe_bh = sb_getblk(osb->sb, fe_blkno);
508 if (!*new_fe_bh) {
509 status = -EIO;
510 mlog_errno(status);
511 goto leave;
512 }
513 ocfs2_set_new_buffer_uptodate(inode, *new_fe_bh);
514
515 status = ocfs2_journal_access(handle, inode, *new_fe_bh,
516 OCFS2_JOURNAL_ACCESS_CREATE);
517 if (status < 0) {
518 mlog_errno(status);
519 goto leave;
520 }
521
522 fe = (struct ocfs2_dinode *) (*new_fe_bh)->b_data;
523 memset(fe, 0, osb->sb->s_blocksize);
524
525 fe->i_generation = cpu_to_le32(inode->i_generation);
526 fe->i_fs_generation = cpu_to_le32(osb->fs_generation);
527 fe->i_blkno = cpu_to_le64(fe_blkno);
528 fe->i_suballoc_bit = cpu_to_le16(suballoc_bit);
529 fe->i_suballoc_slot = cpu_to_le16(osb->slot_num);
530 fe->i_uid = cpu_to_le32(current->fsuid);
531 if (dir->i_mode & S_ISGID) {
532 fe->i_gid = cpu_to_le32(dir->i_gid);
533 if (S_ISDIR(mode))
534 mode |= S_ISGID;
535 } else
536 fe->i_gid = cpu_to_le32(current->fsgid);
537 fe->i_mode = cpu_to_le16(mode);
538 if (S_ISCHR(mode) || S_ISBLK(mode))
539 fe->id1.dev1.i_rdev = cpu_to_le64(huge_encode_dev(dev));
540
541 fe->i_links_count = cpu_to_le16(inode->i_nlink);
542
543 fe->i_last_eb_blk = 0;
544 strcpy(fe->i_signature, OCFS2_INODE_SIGNATURE);
545 le32_add_cpu(&fe->i_flags, OCFS2_VALID_FL);
546 fe->i_atime = fe->i_ctime = fe->i_mtime =
547 cpu_to_le64(CURRENT_TIME.tv_sec);
548 fe->i_mtime_nsec = fe->i_ctime_nsec = fe->i_atime_nsec =
549 cpu_to_le32(CURRENT_TIME.tv_nsec);
550 fe->i_dtime = 0;
551
552 fel = &fe->id2.i_list;
553 fel->l_tree_depth = 0;
554 fel->l_next_free_rec = 0;
555 fel->l_count = cpu_to_le16(ocfs2_extent_recs_per_inode(osb->sb));
556
557 status = ocfs2_journal_dirty(handle, *new_fe_bh);
558 if (status < 0) {
559 mlog_errno(status);
560 goto leave;
561 }
562
563 if (ocfs2_populate_inode(inode, fe, 1) < 0) {
564 mlog(ML_ERROR, "populate inode failed! bh->b_blocknr=%llu, "
565 "i_blkno=%"MLFu64", i_ino=%lu\n",
566 (unsigned long long) (*new_fe_bh)->b_blocknr,
567 fe->i_blkno, inode->i_ino);
568 BUG();
569 }
570
571 ocfs2_inode_set_new(osb, inode);
572 status = ocfs2_create_new_inode_locks(inode);
573 if (status < 0)
574 mlog_errno(status);
575
576 status = 0; /* error in ocfs2_create_new_inode_locks is not
577 * critical */
578
579 *ret_inode = inode;
580leave:
581 if (status < 0) {
582 if (*new_fe_bh) {
583 brelse(*new_fe_bh);
584 *new_fe_bh = NULL;
585 }
586 if (inode)
587 iput(inode);
588 }
589
590 mlog_exit(status);
591 return status;
592}
593
594static int ocfs2_mkdir(struct inode *dir,
595 struct dentry *dentry,
596 int mode)
597{
598 int ret;
599
600 mlog_entry("(0x%p, 0x%p, %d, '%.*s')\n", dir, dentry, mode,
601 dentry->d_name.len, dentry->d_name.name);
602 ret = ocfs2_mknod(dir, dentry, mode | S_IFDIR, 0);
603 mlog_exit(ret);
604
605 return ret;
606}
607
608static int ocfs2_create(struct inode *dir,
609 struct dentry *dentry,
610 int mode,
611 struct nameidata *nd)
612{
613 int ret;
614
615 mlog_entry("(0x%p, 0x%p, %d, '%.*s')\n", dir, dentry, mode,
616 dentry->d_name.len, dentry->d_name.name);
617 ret = ocfs2_mknod(dir, dentry, mode | S_IFREG, 0);
618 mlog_exit(ret);
619
620 return ret;
621}
622
623static int ocfs2_link(struct dentry *old_dentry,
624 struct inode *dir,
625 struct dentry *dentry)
626{
627 struct ocfs2_journal_handle *handle = NULL;
628 struct inode *inode = old_dentry->d_inode;
629 int err;
630 struct buffer_head *fe_bh = NULL;
631 struct buffer_head *parent_fe_bh = NULL;
632 struct buffer_head *de_bh = NULL;
633 struct ocfs2_dinode *fe = NULL;
634 struct ocfs2_super *osb = OCFS2_SB(dir->i_sb);
635
636 mlog_entry("(inode=%lu, old='%.*s' new='%.*s')\n", inode->i_ino,
637 old_dentry->d_name.len, old_dentry->d_name.name,
638 dentry->d_name.len, dentry->d_name.name);
639
640 if (S_ISDIR(inode->i_mode)) {
641 err = -EPERM;
642 goto bail;
643 }
644
645 if (inode->i_nlink >= OCFS2_LINK_MAX) {
646 err = -EMLINK;
647 goto bail;
648 }
649
650 handle = ocfs2_alloc_handle(osb);
651 if (handle == NULL) {
652 err = -ENOMEM;
653 goto bail;
654 }
655
656 err = ocfs2_meta_lock(dir, handle, &parent_fe_bh, 1);
657 if (err < 0) {
658 if (err != -ENOENT)
659 mlog_errno(err);
660 goto bail;
661 }
662
663 err = ocfs2_check_dir_for_entry(dir, dentry->d_name.name,
664 dentry->d_name.len);
665 if (err)
666 goto bail;
667
668 err = ocfs2_prepare_dir_for_insert(osb, dir, parent_fe_bh,
669 dentry->d_name.name,
670 dentry->d_name.len, &de_bh);
671 if (err < 0) {
672 mlog_errno(err);
673 goto bail;
674 }
675
676 err = ocfs2_meta_lock(inode, handle, &fe_bh, 1);
677 if (err < 0) {
678 if (err != -ENOENT)
679 mlog_errno(err);
680 goto bail;
681 }
682
683 fe = (struct ocfs2_dinode *) fe_bh->b_data;
684 if (le16_to_cpu(fe->i_links_count) >= OCFS2_LINK_MAX) {
685 err = -EMLINK;
686 goto bail;
687 }
688
689 handle = ocfs2_start_trans(osb, handle, OCFS2_LINK_CREDITS);
690 if (IS_ERR(handle)) {
691 err = PTR_ERR(handle);
692 handle = NULL;
693 mlog_errno(err);
694 goto bail;
695 }
696
697 err = ocfs2_journal_access(handle, inode, fe_bh,
698 OCFS2_JOURNAL_ACCESS_WRITE);
699 if (err < 0) {
700 mlog_errno(err);
701 goto bail;
702 }
703
704 inode->i_nlink++;
705 inode->i_ctime = CURRENT_TIME;
706 fe->i_links_count = cpu_to_le16(inode->i_nlink);
707 fe->i_ctime = cpu_to_le64(inode->i_ctime.tv_sec);
708 fe->i_ctime_nsec = cpu_to_le32(inode->i_ctime.tv_nsec);
709
710 err = ocfs2_journal_dirty(handle, fe_bh);
711 if (err < 0) {
712 le16_add_cpu(&fe->i_links_count, -1);
713 inode->i_nlink--;
714 mlog_errno(err);
715 goto bail;
716 }
717
718 err = ocfs2_add_entry(handle, dentry, inode,
719 OCFS2_I(inode)->ip_blkno,
720 parent_fe_bh, de_bh);
721 if (err) {
722 le16_add_cpu(&fe->i_links_count, -1);
723 inode->i_nlink--;
724 mlog_errno(err);
725 goto bail;
726 }
727
728 atomic_inc(&inode->i_count);
729 dentry->d_op = &ocfs2_dentry_ops;
730 d_instantiate(dentry, inode);
731bail:
732 if (handle)
733 ocfs2_commit_trans(handle);
734 if (de_bh)
735 brelse(de_bh);
736 if (fe_bh)
737 brelse(fe_bh);
738 if (parent_fe_bh)
739 brelse(parent_fe_bh);
740
741 mlog_exit(err);
742
743 return err;
744}
745
746static int ocfs2_unlink(struct inode *dir,
747 struct dentry *dentry)
748{
749 int status;
750 unsigned int saved_nlink = 0;
751 struct inode *inode = dentry->d_inode;
752 struct ocfs2_super *osb = OCFS2_SB(dir->i_sb);
753 u64 blkno;
754 struct ocfs2_dinode *fe = NULL;
755 struct buffer_head *fe_bh = NULL;
756 struct buffer_head *parent_node_bh = NULL;
757 struct ocfs2_journal_handle *handle = NULL;
758 struct ocfs2_dir_entry *dirent = NULL;
759 struct buffer_head *dirent_bh = NULL;
760 char orphan_name[OCFS2_ORPHAN_NAMELEN + 1];
761 struct buffer_head *orphan_entry_bh = NULL;
762
763 mlog_entry("(0x%p, 0x%p, '%.*s')\n", dir, dentry,
764 dentry->d_name.len, dentry->d_name.name);
765
766 BUG_ON(dentry->d_parent->d_inode != dir);
767
768 mlog(0, "ino = %"MLFu64"\n", OCFS2_I(inode)->ip_blkno);
769
770 if (inode == osb->root_inode) {
771 mlog(0, "Cannot delete the root directory\n");
772 status = -EPERM;
773 goto leave;
774 }
775
776 handle = ocfs2_alloc_handle(osb);
777 if (handle == NULL) {
778 status = -ENOMEM;
779 mlog_errno(status);
780 goto leave;
781 }
782
783 status = ocfs2_meta_lock(dir, handle, &parent_node_bh, 1);
784 if (status < 0) {
785 if (status != -ENOENT)
786 mlog_errno(status);
787 goto leave;
788 }
789
790 status = ocfs2_find_files_on_disk(dentry->d_name.name,
791 dentry->d_name.len, &blkno,
792 dir, &dirent_bh, &dirent);
793 if (status < 0) {
794 if (status != -ENOENT)
795 mlog_errno(status);
796 goto leave;
797 }
798
799 if (OCFS2_I(inode)->ip_blkno != blkno) {
800 status = -ENOENT;
801
802 mlog(0, "ip_blkno (%"MLFu64") != dirent blkno (%"MLFu64") "
803 "ip_flags = %x\n", OCFS2_I(inode)->ip_blkno, blkno,
804 OCFS2_I(inode)->ip_flags);
805 goto leave;
806 }
807
808 status = ocfs2_meta_lock(inode, handle, &fe_bh, 1);
809 if (status < 0) {
810 if (status != -ENOENT)
811 mlog_errno(status);
812 goto leave;
813 }
814
815 if (S_ISDIR(inode->i_mode)) {
816 if (!ocfs2_empty_dir(inode)) {
817 status = -ENOTEMPTY;
818 goto leave;
819 } else if (inode->i_nlink != 2) {
820 status = -ENOTEMPTY;
821 goto leave;
822 }
823 }
824
825 /* There are still a few steps left until we can consider the
826 * unlink to have succeeded. Save off nlink here before
827 * modification so we can set it back in case we hit an issue
828 * before commit. */
829 saved_nlink = inode->i_nlink;
830 if (S_ISDIR(inode->i_mode))
831 inode->i_nlink = 0;
832 else
833 inode->i_nlink--;
834
835 status = ocfs2_request_unlink_vote(inode, dentry,
836 (unsigned int) inode->i_nlink);
837 if (status < 0) {
838 /* This vote should succeed under all normal
839 * circumstances. */
840 mlog_errno(status);
841 goto leave;
842 }
843
844 if (!inode->i_nlink) {
845 status = ocfs2_prepare_orphan_dir(osb, handle, inode,
846 orphan_name,
847 &orphan_entry_bh);
848 if (status < 0) {
849 mlog_errno(status);
850 goto leave;
851 }
852 }
853
854 handle = ocfs2_start_trans(osb, handle, OCFS2_UNLINK_CREDITS);
855 if (IS_ERR(handle)) {
856 status = PTR_ERR(handle);
857 handle = NULL;
858 mlog_errno(status);
859 goto leave;
860 }
861
862 status = ocfs2_journal_access(handle, inode, fe_bh,
863 OCFS2_JOURNAL_ACCESS_WRITE);
864 if (status < 0) {
865 mlog_errno(status);
866 goto leave;
867 }
868
869 fe = (struct ocfs2_dinode *) fe_bh->b_data;
870
871 if (!inode->i_nlink) {
872 status = ocfs2_orphan_add(osb, handle, inode, fe, orphan_name,
873 orphan_entry_bh);
874 if (status < 0) {
875 mlog_errno(status);
876 goto leave;
877 }
878 }
879
880 /* delete the name from the parent dir */
881 status = ocfs2_delete_entry(handle, dir, dirent, dirent_bh);
882 if (status < 0) {
883 mlog_errno(status);
884 goto leave;
885 }
886
887 /* We can set nlink on the dinode now. clear the saved version
888 * so that it doesn't get set later. */
889 fe->i_links_count = cpu_to_le16(inode->i_nlink);
890 saved_nlink = 0;
891
892 status = ocfs2_journal_dirty(handle, fe_bh);
893 if (status < 0) {
894 mlog_errno(status);
895 goto leave;
896 }
897
898 if (S_ISDIR(inode->i_mode)) {
899 dir->i_nlink--;
900 status = ocfs2_mark_inode_dirty(handle, dir,
901 parent_node_bh);
902 if (status < 0) {
903 mlog_errno(status);
904 dir->i_nlink++;
905 }
906 }
907
908leave:
909 if (status < 0 && saved_nlink)
910 inode->i_nlink = saved_nlink;
911
912 if (handle)
913 ocfs2_commit_trans(handle);
914
915 if (fe_bh)
916 brelse(fe_bh);
917
918 if (dirent_bh)
919 brelse(dirent_bh);
920
921 if (parent_node_bh)
922 brelse(parent_node_bh);
923
924 if (orphan_entry_bh)
925 brelse(orphan_entry_bh);
926
927 mlog_exit(status);
928
929 return status;
930}
931
932/*
933 * The only place this should be used is rename!
934 * if they have the same id, then the 1st one is the only one locked.
935 */
936static int ocfs2_double_lock(struct ocfs2_super *osb,
937 struct ocfs2_journal_handle *handle,
938 struct buffer_head **bh1,
939 struct inode *inode1,
940 struct buffer_head **bh2,
941 struct inode *inode2)
942{
943 int status;
944 struct ocfs2_inode_info *oi1 = OCFS2_I(inode1);
945 struct ocfs2_inode_info *oi2 = OCFS2_I(inode2);
946 struct buffer_head **tmpbh;
947 struct inode *tmpinode;
948
949 mlog_entry("(inode1 = %"MLFu64", inode2 = %"MLFu64")\n",
950 oi1->ip_blkno, oi2->ip_blkno);
951
952 BUG_ON(!handle);
953
954 if (*bh1)
955 *bh1 = NULL;
956 if (*bh2)
957 *bh2 = NULL;
958
959 /* we always want to lock the one with the lower lockid first. */
960 if (oi1->ip_blkno != oi2->ip_blkno) {
961 if (oi1->ip_blkno < oi2->ip_blkno) {
962 /* switch id1 and id2 around */
963 mlog(0, "switching them around...\n");
964 tmpbh = bh2;
965 bh2 = bh1;
966 bh1 = tmpbh;
967
968 tmpinode = inode2;
969 inode2 = inode1;
970 inode1 = tmpinode;
971 }
972 /* lock id2 */
973 status = ocfs2_meta_lock(inode2, handle, bh2, 1);
974 if (status < 0) {
975 if (status != -ENOENT)
976 mlog_errno(status);
977 goto bail;
978 }
979 }
980 /* lock id1 */
981 status = ocfs2_meta_lock(inode1, handle, bh1, 1);
982 if (status < 0) {
983 if (status != -ENOENT)
984 mlog_errno(status);
985 goto bail;
986 }
987bail:
988 mlog_exit(status);
989 return status;
990}
991
/*
 * PARENT_INO(buffer) - the inode field of the second directory entry in
 * a directory block.
 *
 * buffer points at the first entry of the block; the second entry is
 * found rec_len bytes further on. ocfs2_fill_new_dir() writes "." first
 * and ".." second, so for such a block this is the parent's inode field.
 *
 * The result is an lvalue holding the little-endian on-disk value.
 * buffer is evaluated twice, so it must not have side effects. It is
 * parenthesized so that any pointer expression may be passed.
 */
#define PARENT_INO(buffer) \
	(((struct ocfs2_dir_entry *) \
	  ((char *)(buffer) + \
	   le16_to_cpu(((struct ocfs2_dir_entry *)(buffer))->rec_len)))->inode)
996
997static int ocfs2_rename(struct inode *old_dir,
998 struct dentry *old_dentry,
999 struct inode *new_dir,
1000 struct dentry *new_dentry)
1001{
1002 int status = 0, rename_lock = 0;
1003 struct inode *old_inode = old_dentry->d_inode;
1004 struct inode *new_inode = new_dentry->d_inode;
1005 struct ocfs2_dinode *newfe = NULL;
1006 char orphan_name[OCFS2_ORPHAN_NAMELEN + 1];
1007 struct buffer_head *orphan_entry_bh = NULL;
1008 struct buffer_head *newfe_bh = NULL;
1009 struct buffer_head *insert_entry_bh = NULL;
1010 struct ocfs2_super *osb = NULL;
1011 u64 newfe_blkno;
1012 struct ocfs2_journal_handle *handle = NULL;
1013 struct buffer_head *old_dir_bh = NULL;
1014 struct buffer_head *new_dir_bh = NULL;
1015 struct ocfs2_dir_entry *old_de = NULL, *new_de = NULL; // dirent for old_dentry
1016 // and new_dentry
1017 struct buffer_head *new_de_bh = NULL, *old_de_bh = NULL; // bhs for above
1018 struct buffer_head *old_inode_de_bh = NULL; // if old_dentry is a dir,
1019 // this is the 1st dirent bh
1020 nlink_t old_dir_nlink = old_dir->i_nlink, new_dir_nlink = new_dir->i_nlink;
1021 unsigned int links_count;
1022
1023 /* At some point it might be nice to break this function up a
1024 * bit. */
1025
1026 mlog_entry("(0x%p, 0x%p, 0x%p, 0x%p, from='%.*s' to='%.*s')\n",
1027 old_dir, old_dentry, new_dir, new_dentry,
1028 old_dentry->d_name.len, old_dentry->d_name.name,
1029 new_dentry->d_name.len, new_dentry->d_name.name);
1030
1031 osb = OCFS2_SB(old_dir->i_sb);
1032
1033 if (new_inode) {
1034 if (!igrab(new_inode))
1035 BUG();
1036 }
1037
1038 if (atomic_read(&old_dentry->d_count) > 2) {
1039 shrink_dcache_parent(old_dentry);
1040 if (atomic_read(&old_dentry->d_count) > 2) {
1041 status = -EBUSY;
1042 goto bail;
1043 }
1044 }
1045
1046 /* Assume a directory heirarchy thusly:
1047 * a/b/c
1048 * a/d
1049 * a,b,c, and d are all directories.
1050 *
1051 * from cwd of 'a' on both nodes:
1052 * node1: mv b/c d
1053 * node2: mv d b/c
1054 *
1055 * And that's why, just like the VFS, we need a file system
1056 * rename lock. */
1057 if (old_dentry != new_dentry) {
1058 status = ocfs2_rename_lock(osb);
1059 if (status < 0) {
1060 mlog_errno(status);
1061 goto bail;
1062 }
1063 rename_lock = 1;
1064 }
1065
1066 handle = ocfs2_alloc_handle(osb);
1067 if (handle == NULL) {
1068 status = -ENOMEM;
1069 mlog_errno(status);
1070 goto bail;
1071 }
1072
1073 /* if old and new are the same, this'll just do one lock. */
1074 status = ocfs2_double_lock(osb, handle,
1075 &old_dir_bh, old_dir,
1076 &new_dir_bh, new_dir);
1077 if (status < 0) {
1078 mlog_errno(status);
1079 goto bail;
1080 }
1081
1082 /* make sure both dirs have bhs
1083 * get an extra ref on old_dir_bh if old==new */
1084 if (!new_dir_bh) {
1085 if (old_dir_bh) {
1086 new_dir_bh = old_dir_bh;
1087 get_bh(new_dir_bh);
1088 } else {
1089 mlog(ML_ERROR, "no old_dir_bh!\n");
1090 status = -EIO;
1091 goto bail;
1092 }
1093 }
1094
1095 if (S_ISDIR(old_inode->i_mode)) {
1096 /* Directories actually require metadata updates to
1097 * the directory info so we can't get away with not
1098 * doing node locking on it. */
1099 status = ocfs2_meta_lock(old_inode, handle, NULL, 1);
1100 if (status < 0) {
1101 if (status != -ENOENT)
1102 mlog_errno(status);
1103 goto bail;
1104 }
1105
1106 status = ocfs2_request_rename_vote(old_inode, old_dentry);
1107 if (status < 0) {
1108 mlog_errno(status);
1109 goto bail;
1110 }
1111
1112 status = -EIO;
1113 old_inode_de_bh = ocfs2_bread(old_inode, 0, &status, 0);
1114 if (!old_inode_de_bh)
1115 goto bail;
1116
1117 status = -EIO;
1118 if (le64_to_cpu(PARENT_INO(old_inode_de_bh->b_data)) !=
1119 OCFS2_I(old_dir)->ip_blkno)
1120 goto bail;
1121 status = -EMLINK;
1122 if (!new_inode && new_dir!=old_dir &&
1123 new_dir->i_nlink >= OCFS2_LINK_MAX)
1124 goto bail;
1125 } else {
1126 /* Ah, the simple case - we're a file so just send a
1127 * message. */
1128 status = ocfs2_request_rename_vote(old_inode, old_dentry);
1129 if (status < 0) {
1130 mlog_errno(status);
1131 goto bail;
1132 }
1133 }
1134
1135 status = -ENOENT;
1136 old_de_bh = ocfs2_find_entry(old_dentry->d_name.name,
1137 old_dentry->d_name.len,
1138 old_dir, &old_de);
1139 if (!old_de_bh)
1140 goto bail;
1141
1142 /*
1143 * Check for inode number is _not_ due to possible IO errors.
1144 * We might rmdir the source, keep it as pwd of some process
1145 * and merrily kill the link to whatever was created under the
1146 * same name. Goodbye sticky bit ;-<
1147 */
1148 if (le64_to_cpu(old_de->inode) != OCFS2_I(old_inode)->ip_blkno)
1149 goto bail;
1150
1151 /* check if the target already exists (in which case we need
1152 * to delete it */
1153 status = ocfs2_find_files_on_disk(new_dentry->d_name.name,
1154 new_dentry->d_name.len,
1155 &newfe_blkno, new_dir, &new_de_bh,
1156 &new_de);
1157 /* The only error we allow here is -ENOENT because the new
1158 * file not existing is perfectly valid. */
1159 if ((status < 0) && (status != -ENOENT)) {
1160 /* If we cannot find the file specified we should just */
1161 /* return the error... */
1162 mlog_errno(status);
1163 goto bail;
1164 }
1165
1166 if (!new_de && new_inode)
1167 mlog(ML_ERROR, "inode %lu does not exist in it's parent "
1168 "directory!", new_inode->i_ino);
1169
1170 /* In case we need to overwrite an existing file, we blow it
1171 * away first */
1172 if (new_de) {
1173 /* VFS didn't think there existed an inode here, but
1174 * someone else in the cluster must have raced our
1175 * rename to create one. Today we error cleanly, in
1176 * the future we should consider calling iget to build
1177 * a new struct inode for this entry. */
1178 if (!new_inode) {
1179 status = -EACCES;
1180
1181 mlog(0, "We found an inode for name %.*s but VFS "
1182 "didn't give us one.\n", new_dentry->d_name.len,
1183 new_dentry->d_name.name);
1184 goto bail;
1185 }
1186
1187 if (OCFS2_I(new_inode)->ip_blkno != newfe_blkno) {
1188 status = -EACCES;
1189
1190 mlog(0, "Inode blkno (%"MLFu64") and dir (%"MLFu64") "
1191 "disagree. ip_flags = %x\n",
1192 OCFS2_I(new_inode)->ip_blkno, newfe_blkno,
1193 OCFS2_I(new_inode)->ip_flags);
1194 goto bail;
1195 }
1196
1197 status = ocfs2_meta_lock(new_inode, handle, &newfe_bh, 1);
1198 if (status < 0) {
1199 if (status != -ENOENT)
1200 mlog_errno(status);
1201 goto bail;
1202 }
1203
1204 if (S_ISDIR(new_inode->i_mode))
1205 links_count = 0;
1206 else
1207 links_count = (unsigned int) (new_inode->i_nlink - 1);
1208
1209 status = ocfs2_request_unlink_vote(new_inode, new_dentry,
1210 links_count);
1211 if (status < 0) {
1212 mlog_errno(status);
1213 goto bail;
1214 }
1215
1216 newfe = (struct ocfs2_dinode *) newfe_bh->b_data;
1217
1218 mlog(0, "aha rename over existing... new_de=%p "
1219 "new_blkno=%"MLFu64" newfebh=%p bhblocknr=%llu\n",
1220 new_de, newfe_blkno, newfe_bh, newfe_bh ?
1221 (unsigned long long)newfe_bh->b_blocknr : 0ULL);
1222
1223 if (S_ISDIR(new_inode->i_mode) || (new_inode->i_nlink == 1)) {
1224 status = ocfs2_prepare_orphan_dir(osb, handle,
1225 new_inode,
1226 orphan_name,
1227 &orphan_entry_bh);
1228 if (status < 0) {
1229 mlog_errno(status);
1230 goto bail;
1231 }
1232 }
1233 } else {
1234 BUG_ON(new_dentry->d_parent->d_inode != new_dir);
1235
1236 status = ocfs2_check_dir_for_entry(new_dir,
1237 new_dentry->d_name.name,
1238 new_dentry->d_name.len);
1239 if (status)
1240 goto bail;
1241
1242 status = ocfs2_prepare_dir_for_insert(osb, new_dir, new_dir_bh,
1243 new_dentry->d_name.name,
1244 new_dentry->d_name.len,
1245 &insert_entry_bh);
1246 if (status < 0) {
1247 mlog_errno(status);
1248 goto bail;
1249 }
1250 }
1251
1252 handle = ocfs2_start_trans(osb, handle, OCFS2_RENAME_CREDITS);
1253 if (IS_ERR(handle)) {
1254 status = PTR_ERR(handle);
1255 handle = NULL;
1256 mlog_errno(status);
1257 goto bail;
1258 }
1259
1260 if (new_de) {
1261 if (S_ISDIR(new_inode->i_mode)) {
1262 if (!ocfs2_empty_dir(new_inode) ||
1263 new_inode->i_nlink != 2) {
1264 status = -ENOTEMPTY;
1265 goto bail;
1266 }
1267 }
1268 status = ocfs2_journal_access(handle, new_inode, newfe_bh,
1269 OCFS2_JOURNAL_ACCESS_WRITE);
1270 if (status < 0) {
1271 mlog_errno(status);
1272 goto bail;
1273 }
1274
1275 if (S_ISDIR(new_inode->i_mode) ||
1276 (newfe->i_links_count == cpu_to_le16(1))){
1277 status = ocfs2_orphan_add(osb, handle, new_inode,
1278 newfe, orphan_name,
1279 orphan_entry_bh);
1280 if (status < 0) {
1281 mlog_errno(status);
1282 goto bail;
1283 }
1284 }
1285
1286 /* change the dirent to point to the correct inode */
1287 status = ocfs2_journal_access(handle, new_dir, new_de_bh,
1288 OCFS2_JOURNAL_ACCESS_WRITE);
1289 if (status < 0) {
1290 mlog_errno(status);
1291 goto bail;
1292 }
1293 new_de->inode = cpu_to_le64(OCFS2_I(old_inode)->ip_blkno);
1294 new_de->file_type = old_de->file_type;
1295 new_dir->i_version++;
1296 status = ocfs2_journal_dirty(handle, new_de_bh);
1297 if (status < 0) {
1298 mlog_errno(status);
1299 goto bail;
1300 }
1301
1302 if (S_ISDIR(new_inode->i_mode))
1303 newfe->i_links_count = 0;
1304 else
1305 le16_add_cpu(&newfe->i_links_count, -1);
1306
1307 status = ocfs2_journal_dirty(handle, newfe_bh);
1308 if (status < 0) {
1309 mlog_errno(status);
1310 goto bail;
1311 }
1312 } else {
1313 /* if the name was not found in new_dir, add it now */
1314 status = ocfs2_add_entry(handle, new_dentry, old_inode,
1315 OCFS2_I(old_inode)->ip_blkno,
1316 new_dir_bh, insert_entry_bh);
1317 }
1318
1319 old_inode->i_ctime = CURRENT_TIME;
1320 mark_inode_dirty(old_inode);
1321
1322 /* now that the name has been added to new_dir, remove the old name */
1323 status = ocfs2_delete_entry(handle, old_dir, old_de, old_de_bh);
1324 if (status < 0) {
1325 mlog_errno(status);
1326 goto bail;
1327 }
1328
1329 if (new_inode) {
1330 new_inode->i_nlink--;
1331 new_inode->i_ctime = CURRENT_TIME;
1332 }
1333 old_dir->i_ctime = old_dir->i_mtime = CURRENT_TIME;
1334 if (old_inode_de_bh) {
1335 status = ocfs2_journal_access(handle, old_inode,
1336 old_inode_de_bh,
1337 OCFS2_JOURNAL_ACCESS_WRITE);
1338 PARENT_INO(old_inode_de_bh->b_data) =
1339 cpu_to_le64(OCFS2_I(new_dir)->ip_blkno);
1340 status = ocfs2_journal_dirty(handle, old_inode_de_bh);
1341 old_dir->i_nlink--;
1342 if (new_inode) {
1343 new_inode->i_nlink--;
1344 } else {
1345 new_dir->i_nlink++;
1346 mark_inode_dirty(new_dir);
1347 }
1348 }
1349 mark_inode_dirty(old_dir);
1350 if (new_inode)
1351 mark_inode_dirty(new_inode);
1352
1353 if (old_dir != new_dir)
1354 if (new_dir_nlink != new_dir->i_nlink) {
1355 if (!new_dir_bh) {
1356 mlog(ML_ERROR, "need to change nlink for new "
1357 "dir %"MLFu64" from %d to %d but bh is "
1358 "NULL\n", OCFS2_I(new_dir)->ip_blkno,
1359 (int)new_dir_nlink, new_dir->i_nlink);
1360 } else {
1361 struct ocfs2_dinode *fe;
1362 status = ocfs2_journal_access(handle,
1363 new_dir,
1364 new_dir_bh,
1365 OCFS2_JOURNAL_ACCESS_WRITE);
1366 fe = (struct ocfs2_dinode *) new_dir_bh->b_data;
1367 fe->i_links_count = cpu_to_le16(new_dir->i_nlink);
1368 status = ocfs2_journal_dirty(handle, new_dir_bh);
1369 }
1370 }
1371
1372 if (old_dir_nlink != old_dir->i_nlink) {
1373 if (!old_dir_bh) {
1374 mlog(ML_ERROR, "need to change nlink for old dir "
1375 "%"MLFu64" from %d to %d but bh is NULL!\n",
1376 OCFS2_I(old_dir)->ip_blkno,
1377 (int)old_dir_nlink,
1378 old_dir->i_nlink);
1379 } else {
1380 struct ocfs2_dinode *fe;
1381 status = ocfs2_journal_access(handle, old_dir,
1382 old_dir_bh,
1383 OCFS2_JOURNAL_ACCESS_WRITE);
1384 fe = (struct ocfs2_dinode *) old_dir_bh->b_data;
1385 fe->i_links_count = cpu_to_le16(old_dir->i_nlink);
1386 status = ocfs2_journal_dirty(handle, old_dir_bh);
1387 }
1388 }
1389
1390 status = 0;
1391bail:
1392 if (rename_lock)
1393 ocfs2_rename_unlock(osb);
1394
1395 if (handle)
1396 ocfs2_commit_trans(handle);
1397
1398 if (new_inode)
1399 sync_mapping_buffers(old_inode->i_mapping);
1400
1401 if (new_inode)
1402 iput(new_inode);
1403 if (newfe_bh)
1404 brelse(newfe_bh);
1405 if (old_dir_bh)
1406 brelse(old_dir_bh);
1407 if (new_dir_bh)
1408 brelse(new_dir_bh);
1409 if (new_de_bh)
1410 brelse(new_de_bh);
1411 if (old_de_bh)
1412 brelse(old_de_bh);
1413 if (old_inode_de_bh)
1414 brelse(old_inode_de_bh);
1415 if (orphan_entry_bh)
1416 brelse(orphan_entry_bh);
1417 if (insert_entry_bh)
1418 brelse(insert_entry_bh);
1419
1420 mlog_exit(status);
1421
1422 return status;
1423}
1424
1425/*
1426 * we expect i_size = strlen(symname). Copy symname into the file
1427 * data, including the null terminator.
1428 */
1429static int ocfs2_create_symlink_data(struct ocfs2_super *osb,
1430 struct ocfs2_journal_handle *handle,
1431 struct inode *inode,
1432 const char *symname)
1433{
1434 struct buffer_head **bhs = NULL;
1435 const char *c;
1436 struct super_block *sb = osb->sb;
1437 u64 p_blkno;
1438 int p_blocks;
1439 int virtual, blocks, status, i, bytes_left;
1440
1441 bytes_left = i_size_read(inode) + 1;
1442 /* we can't trust i_blocks because we're actually going to
1443 * write i_size + 1 bytes. */
1444 blocks = (bytes_left + sb->s_blocksize - 1) >> sb->s_blocksize_bits;
1445
1446 mlog_entry("i_blocks = %lu, i_size = %llu, blocks = %d\n",
1447 inode->i_blocks, i_size_read(inode), blocks);
1448
1449 /* Sanity check -- make sure we're going to fit. */
1450 if (bytes_left >
1451 ocfs2_clusters_to_bytes(sb, OCFS2_I(inode)->ip_clusters)) {
1452 status = -EIO;
1453 mlog_errno(status);
1454 goto bail;
1455 }
1456
1457 bhs = kcalloc(blocks, sizeof(struct buffer_head *), GFP_KERNEL);
1458 if (!bhs) {
1459 status = -ENOMEM;
1460 mlog_errno(status);
1461 goto bail;
1462 }
1463
1464 status = ocfs2_extent_map_get_blocks(inode, 0, 1, &p_blkno,
1465 &p_blocks);
1466 if (status < 0) {
1467 mlog_errno(status);
1468 goto bail;
1469 }
1470
1471 /* links can never be larger than one cluster so we know this
1472 * is all going to be contiguous, but do a sanity check
1473 * anyway. */
1474 if ((p_blocks << sb->s_blocksize_bits) < bytes_left) {
1475 status = -EIO;
1476 mlog_errno(status);
1477 goto bail;
1478 }
1479
1480 virtual = 0;
1481 while(bytes_left > 0) {
1482 c = &symname[virtual * sb->s_blocksize];
1483
1484 bhs[virtual] = sb_getblk(sb, p_blkno);
1485 if (!bhs[virtual]) {
1486 status = -ENOMEM;
1487 mlog_errno(status);
1488 goto bail;
1489 }
1490 ocfs2_set_new_buffer_uptodate(inode, bhs[virtual]);
1491
1492 status = ocfs2_journal_access(handle, inode, bhs[virtual],
1493 OCFS2_JOURNAL_ACCESS_CREATE);
1494 if (status < 0) {
1495 mlog_errno(status);
1496 goto bail;
1497 }
1498
1499 memset(bhs[virtual]->b_data, 0, sb->s_blocksize);
1500
1501 memcpy(bhs[virtual]->b_data, c,
1502 (bytes_left > sb->s_blocksize) ? sb->s_blocksize :
1503 bytes_left);
1504
1505 status = ocfs2_journal_dirty(handle, bhs[virtual]);
1506 if (status < 0) {
1507 mlog_errno(status);
1508 goto bail;
1509 }
1510
1511 virtual++;
1512 p_blkno++;
1513 bytes_left -= sb->s_blocksize;
1514 }
1515
1516 status = 0;
1517bail:
1518
1519 if (bhs) {
1520 for(i = 0; i < blocks; i++)
1521 if (bhs[i])
1522 brelse(bhs[i]);
1523 kfree(bhs);
1524 }
1525
1526 mlog_exit(status);
1527 return status;
1528}
1529
1530static int ocfs2_symlink(struct inode *dir,
1531 struct dentry *dentry,
1532 const char *symname)
1533{
1534 int status, l, credits;
1535 u64 newsize;
1536 struct ocfs2_super *osb = NULL;
1537 struct inode *inode = NULL;
1538 struct super_block *sb;
1539 struct buffer_head *new_fe_bh = NULL;
1540 struct buffer_head *de_bh = NULL;
1541 struct buffer_head *parent_fe_bh = NULL;
1542 struct ocfs2_dinode *fe = NULL;
1543 struct ocfs2_dinode *dirfe;
1544 struct ocfs2_journal_handle *handle = NULL;
1545 struct ocfs2_alloc_context *inode_ac = NULL;
1546 struct ocfs2_alloc_context *data_ac = NULL;
1547
1548 mlog_entry("(0x%p, 0x%p, symname='%s' actual='%.*s')\n", dir,
1549 dentry, symname, dentry->d_name.len, dentry->d_name.name);
1550
1551 sb = dir->i_sb;
1552 osb = OCFS2_SB(sb);
1553
1554 l = strlen(symname) + 1;
1555
1556 credits = ocfs2_calc_symlink_credits(sb);
1557
1558 handle = ocfs2_alloc_handle(osb);
1559 if (handle == NULL) {
1560 status = -ENOMEM;
1561 mlog_errno(status);
1562 goto bail;
1563 }
1564
1565 /* lock the parent directory */
1566 status = ocfs2_meta_lock(dir, handle, &parent_fe_bh, 1);
1567 if (status < 0) {
1568 if (status != -ENOENT)
1569 mlog_errno(status);
1570 goto bail;
1571 }
1572
1573 dirfe = (struct ocfs2_dinode *) parent_fe_bh->b_data;
1574 if (!dirfe->i_links_count) {
1575 /* can't make a file in a deleted directory. */
1576 status = -ENOENT;
1577 goto bail;
1578 }
1579
1580 status = ocfs2_check_dir_for_entry(dir, dentry->d_name.name,
1581 dentry->d_name.len);
1582 if (status)
1583 goto bail;
1584
1585 status = ocfs2_prepare_dir_for_insert(osb, dir, parent_fe_bh,
1586 dentry->d_name.name,
1587 dentry->d_name.len, &de_bh);
1588 if (status < 0) {
1589 mlog_errno(status);
1590 goto bail;
1591 }
1592
1593 status = ocfs2_reserve_new_inode(osb, handle, &inode_ac);
1594 if (status < 0) {
1595 if (status != -ENOSPC)
1596 mlog_errno(status);
1597 goto bail;
1598 }
1599
1600 /* don't reserve bitmap space for fast symlinks. */
1601 if (l > ocfs2_fast_symlink_chars(sb)) {
1602 status = ocfs2_reserve_clusters(osb, handle, 1, &data_ac);
1603 if (status < 0) {
1604 if (status != -ENOSPC)
1605 mlog_errno(status);
1606 goto bail;
1607 }
1608 }
1609
1610 handle = ocfs2_start_trans(osb, handle, credits);
1611 if (IS_ERR(handle)) {
1612 status = PTR_ERR(handle);
1613 handle = NULL;
1614 mlog_errno(status);
1615 goto bail;
1616 }
1617
1618 status = ocfs2_mknod_locked(osb, dir, dentry,
1619 S_IFLNK | S_IRWXUGO, 0,
1620 &new_fe_bh, parent_fe_bh, handle,
1621 &inode, inode_ac);
1622 if (status < 0) {
1623 mlog_errno(status);
1624 goto bail;
1625 }
1626
1627 fe = (struct ocfs2_dinode *) new_fe_bh->b_data;
1628 inode->i_rdev = 0;
1629 newsize = l - 1;
1630 if (l > ocfs2_fast_symlink_chars(sb)) {
1631 inode->i_op = &ocfs2_symlink_inode_operations;
1632 status = ocfs2_do_extend_allocation(osb, inode, 1, new_fe_bh,
1633 handle, data_ac, NULL,
1634 NULL);
1635 if (status < 0) {
1636 if (status != -ENOSPC && status != -EINTR) {
1637 mlog(ML_ERROR, "Failed to extend file to "
1638 "%"MLFu64"\n",
1639 newsize);
1640 mlog_errno(status);
1641 status = -ENOSPC;
1642 }
1643 goto bail;
1644 }
1645 i_size_write(inode, newsize);
1646 inode->i_blocks = ocfs2_align_bytes_to_sectors(newsize);
1647 } else {
1648 inode->i_op = &ocfs2_fast_symlink_inode_operations;
1649 memcpy((char *) fe->id2.i_symlink, symname, l);
1650 i_size_write(inode, newsize);
1651 inode->i_blocks = 0;
1652 }
1653
1654 status = ocfs2_mark_inode_dirty(handle, inode, new_fe_bh);
1655 if (status < 0) {
1656 mlog_errno(status);
1657 goto bail;
1658 }
1659
1660 if (!ocfs2_inode_is_fast_symlink(inode)) {
1661 status = ocfs2_create_symlink_data(osb, handle, inode,
1662 symname);
1663 if (status < 0) {
1664 mlog_errno(status);
1665 goto bail;
1666 }
1667 }
1668
1669 status = ocfs2_add_entry(handle, dentry, inode,
1670 le64_to_cpu(fe->i_blkno), parent_fe_bh,
1671 de_bh);
1672 if (status < 0) {
1673 mlog_errno(status);
1674 goto bail;
1675 }
1676
1677 insert_inode_hash(inode);
1678 dentry->d_op = &ocfs2_dentry_ops;
1679 d_instantiate(dentry, inode);
1680bail:
1681 if (handle)
1682 ocfs2_commit_trans(handle);
1683 if (new_fe_bh)
1684 brelse(new_fe_bh);
1685 if (parent_fe_bh)
1686 brelse(parent_fe_bh);
1687 if (de_bh)
1688 brelse(de_bh);
1689 if (inode_ac)
1690 ocfs2_free_alloc_context(inode_ac);
1691 if (data_ac)
1692 ocfs2_free_alloc_context(data_ac);
1693 if ((status < 0) && inode)
1694 iput(inode);
1695
1696 mlog_exit(status);
1697
1698 return status;
1699}
1700
1701int ocfs2_check_dir_entry(struct inode * dir,
1702 struct ocfs2_dir_entry * de,
1703 struct buffer_head * bh,
1704 unsigned long offset)
1705{
1706 const char *error_msg = NULL;
1707 const int rlen = le16_to_cpu(de->rec_len);
1708
1709 if (rlen < OCFS2_DIR_REC_LEN(1))
1710 error_msg = "rec_len is smaller than minimal";
1711 else if (rlen % 4 != 0)
1712 error_msg = "rec_len % 4 != 0";
1713 else if (rlen < OCFS2_DIR_REC_LEN(de->name_len))
1714 error_msg = "rec_len is too small for name_len";
1715 else if (((char *) de - bh->b_data) + rlen > dir->i_sb->s_blocksize)
1716 error_msg = "directory entry across blocks";
1717
1718 if (error_msg != NULL)
1719 mlog(ML_ERROR, "bad entry in directory #%"MLFu64": %s - "
1720 "offset=%lu, inode=%"MLFu64", rec_len=%d, name_len=%d\n",
1721 OCFS2_I(dir)->ip_blkno, error_msg, offset,
1722 le64_to_cpu(de->inode), rlen, de->name_len);
1723 return error_msg == NULL ? 1 : 0;
1724}
1725
1726/* we don't always have a dentry for what we want to add, so people
1727 * like orphan dir can call this instead.
1728 *
1729 * If you pass me insert_bh, I'll skip the search of the other dir
1730 * blocks and put the record in there.
1731 */
1732static int __ocfs2_add_entry(struct ocfs2_journal_handle *handle,
1733 struct inode *dir,
1734 const char *name, int namelen,
1735 struct inode *inode, u64 blkno,
1736 struct buffer_head *parent_fe_bh,
1737 struct buffer_head *insert_bh)
1738{
1739 unsigned long offset;
1740 unsigned short rec_len;
1741 struct ocfs2_dir_entry *de, *de1;
1742 struct super_block *sb;
1743 int retval, status;
1744
1745 mlog_entry_void();
1746
1747 sb = dir->i_sb;
1748
1749 if (!namelen)
1750 return -EINVAL;
1751
1752 rec_len = OCFS2_DIR_REC_LEN(namelen);
1753 offset = 0;
1754 de = (struct ocfs2_dir_entry *) insert_bh->b_data;
1755 while (1) {
1756 BUG_ON((char *)de >= sb->s_blocksize + insert_bh->b_data);
1757 /* These checks should've already been passed by the
1758 * prepare function, but I guess we can leave them
1759 * here anyway. */
1760 if (!ocfs2_check_dir_entry(dir, de, insert_bh, offset)) {
1761 retval = -ENOENT;
1762 goto bail;
1763 }
1764 if (ocfs2_match(namelen, name, de)) {
1765 retval = -EEXIST;
1766 goto bail;
1767 }
1768 if (((le64_to_cpu(de->inode) == 0) &&
1769 (le16_to_cpu(de->rec_len) >= rec_len)) ||
1770 (le16_to_cpu(de->rec_len) >=
1771 (OCFS2_DIR_REC_LEN(de->name_len) + rec_len))) {
1772 status = ocfs2_journal_access(handle, dir, insert_bh,
1773 OCFS2_JOURNAL_ACCESS_WRITE);
1774 /* By now the buffer is marked for journaling */
1775 offset += le16_to_cpu(de->rec_len);
1776 if (le64_to_cpu(de->inode)) {
1777 de1 = (struct ocfs2_dir_entry *)((char *) de +
1778 OCFS2_DIR_REC_LEN(de->name_len));
1779 de1->rec_len =
1780 cpu_to_le16(le16_to_cpu(de->rec_len) -
1781 OCFS2_DIR_REC_LEN(de->name_len));
1782 de->rec_len = cpu_to_le16(OCFS2_DIR_REC_LEN(de->name_len));
1783 de = de1;
1784 }
1785 de->file_type = OCFS2_FT_UNKNOWN;
1786 if (blkno) {
1787 de->inode = cpu_to_le64(blkno);
1788 ocfs2_set_de_type(de, inode->i_mode);
1789 } else
1790 de->inode = 0;
1791 de->name_len = namelen;
1792 memcpy(de->name, name, namelen);
1793
1794 dir->i_mtime = dir->i_ctime = CURRENT_TIME;
1795 dir->i_version++;
1796 status = ocfs2_journal_dirty(handle, insert_bh);
1797 retval = 0;
1798 goto bail;
1799 }
1800 offset += le16_to_cpu(de->rec_len);
1801 de = (struct ocfs2_dir_entry *) ((char *) de + le16_to_cpu(de->rec_len));
1802 }
1803
1804 /* when you think about it, the assert above should prevent us
1805 * from ever getting here. */
1806 retval = -ENOSPC;
1807bail:
1808
1809 mlog_exit(retval);
1810 return retval;
1811}
1812
1813
1814/*
1815 * ocfs2_delete_entry deletes a directory entry by merging it with the
1816 * previous entry
1817 */
1818static int ocfs2_delete_entry(struct ocfs2_journal_handle *handle,
1819 struct inode *dir,
1820 struct ocfs2_dir_entry *de_del,
1821 struct buffer_head *bh)
1822{
1823 struct ocfs2_dir_entry *de, *pde;
1824 int i, status = -ENOENT;
1825
1826 mlog_entry("(0x%p, 0x%p, 0x%p, 0x%p)\n", handle, dir, de_del, bh);
1827
1828 i = 0;
1829 pde = NULL;
1830 de = (struct ocfs2_dir_entry *) bh->b_data;
1831 while (i < bh->b_size) {
1832 if (!ocfs2_check_dir_entry(dir, de, bh, i)) {
1833 status = -EIO;
1834 mlog_errno(status);
1835 goto bail;
1836 }
1837 if (de == de_del) {
1838 status = ocfs2_journal_access(handle, dir, bh,
1839 OCFS2_JOURNAL_ACCESS_WRITE);
1840 if (status < 0) {
1841 status = -EIO;
1842 mlog_errno(status);
1843 goto bail;
1844 }
1845 if (pde)
1846 pde->rec_len =
1847 cpu_to_le16(le16_to_cpu(pde->rec_len) +
1848 le16_to_cpu(de->rec_len));
1849 else
1850 de->inode = 0;
1851 dir->i_version++;
1852 status = ocfs2_journal_dirty(handle, bh);
1853 goto bail;
1854 }
1855 i += le16_to_cpu(de->rec_len);
1856 pde = de;
1857 de = (struct ocfs2_dir_entry *)((char *)de + le16_to_cpu(de->rec_len));
1858 }
1859bail:
1860 mlog_exit(status);
1861 return status;
1862}
1863
1864/*
1865 * Returns 0 if not found, -1 on failure, and 1 on success
1866 */
1867static int inline ocfs2_search_dirblock(struct buffer_head *bh,
1868 struct inode *dir,
1869 const char *name, int namelen,
1870 unsigned long offset,
1871 struct ocfs2_dir_entry **res_dir)
1872{
1873 struct ocfs2_dir_entry *de;
1874 char *dlimit, *de_buf;
1875 int de_len;
1876 int ret = 0;
1877
1878 mlog_entry_void();
1879
1880 de_buf = bh->b_data;
1881 dlimit = de_buf + dir->i_sb->s_blocksize;
1882
1883 while (de_buf < dlimit) {
1884 /* this code is executed quadratically often */
1885 /* do minimal checking `by hand' */
1886
1887 de = (struct ocfs2_dir_entry *) de_buf;
1888
1889 if (de_buf + namelen <= dlimit &&
1890 ocfs2_match(namelen, name, de)) {
1891 /* found a match - just to be sure, do a full check */
1892 if (!ocfs2_check_dir_entry(dir, de, bh, offset)) {
1893 ret = -1;
1894 goto bail;
1895 }
1896 *res_dir = de;
1897 ret = 1;
1898 goto bail;
1899 }
1900
1901 /* prevent looping on a bad block */
1902 de_len = le16_to_cpu(de->rec_len);
1903 if (de_len <= 0) {
1904 ret = -1;
1905 goto bail;
1906 }
1907
1908 de_buf += de_len;
1909 offset += de_len;
1910 }
1911
1912bail:
1913 mlog_exit(ret);
1914 return ret;
1915}
1916
/*
 * Look up name/namelen in directory 'dir'.
 *
 * On success the buffer_head of the block containing the entry is
 * returned and *res_dir points at the entry inside that buffer; the
 * returned buffer is not released here, so the caller is expected to
 * brelse() it.  Returns NULL (with *res_dir == NULL) if the name was
 * not found or a bad block was hit.
 *
 * The scan starts at the block remembered in ip_dir_start_lookup (the
 * block of the last successful lookup), wraps around at the end of
 * the directory and stops when it is back at the starting block.
 * Blocks are fetched in batches of up to NAMEI_RA_SIZE into bh_use[].
 */
struct buffer_head *ocfs2_find_entry(const char *name, int namelen,
				     struct inode *dir,
				     struct ocfs2_dir_entry **res_dir)
{
	struct super_block *sb;
	struct buffer_head *bh_use[NAMEI_RA_SIZE];
	struct buffer_head *bh, *ret = NULL;
	unsigned long start, block, b;
	int ra_max = 0;		/* Number of bh's in the readahead
				   buffer, bh_use[] */
	int ra_ptr = 0;		/* Current index into readahead
				   buffer */
	int num = 0;		/* blocks requested so far; non-zero
				   once the scan is under way */
	int nblocks, i, err;

	mlog_entry_void();

	*res_dir = NULL;
	sb = dir->i_sb;

	nblocks = i_size_read(dir) >> sb->s_blocksize_bits;
	/* Resume where the last hit was; fall back to block 0 if the
	 * hint is beyond the current end of the directory. */
	start = OCFS2_I(dir)->ip_dir_start_lookup;
	if (start >= nblocks)
		start = 0;
	block = start;

restart:
	do {
		/*
		 * We deal with the read-ahead logic here.
		 */
		if (ra_ptr >= ra_max) {
			/* Refill the readahead buffer */
			ra_ptr = 0;
			b = block;
			for (ra_max = 0; ra_max < NAMEI_RA_SIZE; ra_max++) {
				/*
				 * Terminate if we reach the end of the
				 * directory and must wrap, or if our
				 * search has finished at this block.
				 */
				if (b >= nblocks || (num && block == start)) {
					bh_use[ra_max] = NULL;
					break;
				}
				num++;

				/* XXX: questionable readahead stuff here */
				/* NOTE(review): 'err' is filled in by
				 * ocfs2_bread() but never examined; a
				 * failed read shows up as a NULL bh. */
				bh = ocfs2_bread(dir, b++, &err, 1);
				bh_use[ra_max] = bh;
#if 0		// ???
				if (bh)
					ll_rw_block(READ, 1, &bh);
#endif
			}
		}
		/* Consume the next slot of the batch; a NULL slot is
		 * skipped like an unreadable block. */
		if ((bh = bh_use[ra_ptr++]) == NULL)
			goto next;
		wait_on_buffer(bh);
		if (!buffer_uptodate(bh)) {
			/* read error, skip block & hope for the best */
			brelse(bh);
			goto next;
		}
		i = ocfs2_search_dirblock(bh, dir, name, namelen,
					  block << sb->s_blocksize_bits,
					  res_dir);
		if (i == 1) {
			/* Found: remember the block for the next lookup
			 * and hand the still-referenced bh back. */
			OCFS2_I(dir)->ip_dir_start_lookup = block;
			ret = bh;
			goto cleanup_and_exit;
		} else {
			brelse(bh);
			/* negative means a bad block: give up */
			if (i < 0)
				goto cleanup_and_exit;
		}
	next:
		if (++block >= nblocks)
			block = 0;
	} while (block != start);

	/*
	 * If the directory has grown while we were searching, then
	 * search the last part of the directory before giving up.
	 */
	block = nblocks;
	nblocks = i_size_read(dir) >> sb->s_blocksize_bits;
	if (block < nblocks) {
		start = 0;
		goto restart;
	}

cleanup_and_exit:
	/* Clean up the read-ahead blocks */
	for (; ra_ptr < ra_max; ra_ptr++)
		brelse(bh_use[ra_ptr]);

	mlog_exit_ptr(ret);
	return ret;
}
2017
2018static int ocfs2_blkno_stringify(u64 blkno, char *name)
2019{
2020 int status, namelen;
2021
2022 mlog_entry_void();
2023
2024 namelen = snprintf(name, OCFS2_ORPHAN_NAMELEN + 1, "%016"MLFx64,
2025 blkno);
2026 if (namelen <= 0) {
2027 if (namelen)
2028 status = namelen;
2029 else
2030 status = -EINVAL;
2031 mlog_errno(status);
2032 goto bail;
2033 }
2034 if (namelen != OCFS2_ORPHAN_NAMELEN) {
2035 status = -EINVAL;
2036 mlog_errno(status);
2037 goto bail;
2038 }
2039
2040 mlog(0, "built filename '%s' for orphan dir (len=%d)\n", name,
2041 namelen);
2042
2043 status = 0;
2044bail:
2045 mlog_exit(status);
2046 return status;
2047}
2048
2049static int ocfs2_prepare_orphan_dir(struct ocfs2_super *osb,
2050 struct ocfs2_journal_handle *handle,
2051 struct inode *inode,
2052 char *name,
2053 struct buffer_head **de_bh)
2054{
2055 struct inode *orphan_dir_inode = NULL;
2056 struct buffer_head *orphan_dir_bh = NULL;
2057 int status = 0;
2058
2059 status = ocfs2_blkno_stringify(OCFS2_I(inode)->ip_blkno, name);
2060 if (status < 0) {
2061 mlog_errno(status);
2062 goto leave;
2063 }
2064
2065 orphan_dir_inode = ocfs2_get_system_file_inode(osb,
2066 ORPHAN_DIR_SYSTEM_INODE,
2067 osb->slot_num);
2068 if (!orphan_dir_inode) {
2069 status = -ENOENT;
2070 mlog_errno(status);
2071 goto leave;
2072 }
2073
2074 ocfs2_handle_add_inode(handle, orphan_dir_inode);
2075 status = ocfs2_meta_lock(orphan_dir_inode, handle, &orphan_dir_bh, 1);
2076 if (status < 0) {
2077 mlog_errno(status);
2078 goto leave;
2079 }
2080
2081 status = ocfs2_prepare_dir_for_insert(osb, orphan_dir_inode,
2082 orphan_dir_bh, name,
2083 OCFS2_ORPHAN_NAMELEN, de_bh);
2084 if (status < 0) {
2085 mlog_errno(status);
2086 goto leave;
2087 }
2088
2089leave:
2090 if (orphan_dir_inode)
2091 iput(orphan_dir_inode);
2092
2093 if (orphan_dir_bh)
2094 brelse(orphan_dir_bh);
2095
2096 mlog_exit(status);
2097 return status;
2098}
2099
2100static int ocfs2_orphan_add(struct ocfs2_super *osb,
2101 struct ocfs2_journal_handle *handle,
2102 struct inode *inode,
2103 struct ocfs2_dinode *fe,
2104 char *name,
2105 struct buffer_head *de_bh)
2106{
2107 struct inode *orphan_dir_inode = NULL;
2108 struct buffer_head *orphan_dir_bh = NULL;
2109 int status = 0;
2110 struct ocfs2_dinode *orphan_fe;
2111
2112 mlog_entry("(inode->i_ino = %lu)\n", inode->i_ino);
2113
2114 orphan_dir_inode = ocfs2_get_system_file_inode(osb,
2115 ORPHAN_DIR_SYSTEM_INODE,
2116 osb->slot_num);
2117 if (!orphan_dir_inode) {
2118 status = -ENOENT;
2119 mlog_errno(status);
2120 goto leave;
2121 }
2122
2123 status = ocfs2_read_block(osb,
2124 OCFS2_I(orphan_dir_inode)->ip_blkno,
2125 &orphan_dir_bh, OCFS2_BH_CACHED,
2126 orphan_dir_inode);
2127 if (status < 0) {
2128 mlog_errno(status);
2129 goto leave;
2130 }
2131
2132 status = ocfs2_journal_access(handle, orphan_dir_inode, orphan_dir_bh,
2133 OCFS2_JOURNAL_ACCESS_WRITE);
2134 if (status < 0) {
2135 mlog_errno(status);
2136 goto leave;
2137 }
2138
2139 /* we're a cluster, and nlink can change on disk from
2140 * underneath us... */
2141 orphan_fe = (struct ocfs2_dinode *) orphan_dir_bh->b_data;
2142 if (S_ISDIR(inode->i_mode))
2143 le16_add_cpu(&orphan_fe->i_links_count, 1);
2144 orphan_dir_inode->i_nlink = le16_to_cpu(orphan_fe->i_links_count);
2145
2146 status = ocfs2_journal_dirty(handle, orphan_dir_bh);
2147 if (status < 0) {
2148 mlog_errno(status);
2149 goto leave;
2150 }
2151
2152 status = __ocfs2_add_entry(handle, orphan_dir_inode, name,
2153 OCFS2_ORPHAN_NAMELEN, inode,
2154 OCFS2_I(inode)->ip_blkno,
2155 orphan_dir_bh, de_bh);
2156 if (status < 0) {
2157 mlog_errno(status);
2158 goto leave;
2159 }
2160
2161 le32_add_cpu(&fe->i_flags, OCFS2_ORPHANED_FL);
2162
2163 /* Record which orphan dir our inode now resides
2164 * in. delete_inode will use this to determine which orphan
2165 * dir to lock. */
2166 spin_lock(&OCFS2_I(inode)->ip_lock);
2167 OCFS2_I(inode)->ip_orphaned_slot = osb->slot_num;
2168 spin_unlock(&OCFS2_I(inode)->ip_lock);
2169
2170 mlog(0, "Inode %"MLFu64" orphaned in slot %d\n",
2171 OCFS2_I(inode)->ip_blkno, osb->slot_num);
2172
2173leave:
2174 if (orphan_dir_inode)
2175 iput(orphan_dir_inode);
2176
2177 if (orphan_dir_bh)
2178 brelse(orphan_dir_bh);
2179
2180 mlog_exit(status);
2181 return status;
2182}
2183
2184/* unlike orphan_add, we expect the orphan dir to already be locked here. */
2185int ocfs2_orphan_del(struct ocfs2_super *osb,
2186 struct ocfs2_journal_handle *handle,
2187 struct inode *orphan_dir_inode,
2188 struct inode *inode,
2189 struct buffer_head *orphan_dir_bh)
2190{
2191 char name[OCFS2_ORPHAN_NAMELEN + 1];
2192 struct ocfs2_dinode *orphan_fe;
2193 int status = 0;
2194 struct buffer_head *target_de_bh = NULL;
2195 struct ocfs2_dir_entry *target_de = NULL;
2196
2197 mlog_entry_void();
2198
2199 status = ocfs2_blkno_stringify(OCFS2_I(inode)->ip_blkno, name);
2200 if (status < 0) {
2201 mlog_errno(status);
2202 goto leave;
2203 }
2204
2205 mlog(0, "removing '%s' from orphan dir %"MLFu64" (namelen=%d)\n",
2206 name, OCFS2_I(orphan_dir_inode)->ip_blkno, OCFS2_ORPHAN_NAMELEN);
2207
2208 /* find it's spot in the orphan directory */
2209 target_de_bh = ocfs2_find_entry(name, OCFS2_ORPHAN_NAMELEN,
2210 orphan_dir_inode, &target_de);
2211 if (!target_de_bh) {
2212 status = -ENOENT;
2213 mlog_errno(status);
2214 goto leave;
2215 }
2216
2217 /* remove it from the orphan directory */
2218 status = ocfs2_delete_entry(handle, orphan_dir_inode, target_de,
2219 target_de_bh);
2220 if (status < 0) {
2221 mlog_errno(status);
2222 goto leave;
2223 }
2224
2225 status = ocfs2_journal_access(handle,orphan_dir_inode, orphan_dir_bh,
2226 OCFS2_JOURNAL_ACCESS_WRITE);
2227 if (status < 0) {
2228 mlog_errno(status);
2229 goto leave;
2230 }
2231
2232 /* do the i_nlink dance! :) */
2233 orphan_fe = (struct ocfs2_dinode *) orphan_dir_bh->b_data;
2234 if (S_ISDIR(inode->i_mode))
2235 le16_add_cpu(&orphan_fe->i_links_count, -1);
2236 orphan_dir_inode->i_nlink = le16_to_cpu(orphan_fe->i_links_count);
2237
2238 status = ocfs2_journal_dirty(handle, orphan_dir_bh);
2239 if (status < 0) {
2240 mlog_errno(status);
2241 goto leave;
2242 }
2243
2244leave:
2245 if (target_de_bh)
2246 brelse(target_de_bh);
2247
2248 mlog_exit(status);
2249 return status;
2250}
2251
2252struct inode_operations ocfs2_dir_iops = {
2253 .create = ocfs2_create,
2254 .lookup = ocfs2_lookup,
2255 .link = ocfs2_link,
2256 .unlink = ocfs2_unlink,
2257 .rmdir = ocfs2_unlink,
2258 .symlink = ocfs2_symlink,
2259 .mkdir = ocfs2_mkdir,
2260 .mknod = ocfs2_mknod,
2261 .rename = ocfs2_rename,
2262 .setattr = ocfs2_setattr,
2263 .getattr = ocfs2_getattr,
2264};
diff --git a/fs/ocfs2/namei.h b/fs/ocfs2/namei.h
new file mode 100644
index 000000000000..deaaa97dbf0b
--- /dev/null
+++ b/fs/ocfs2/namei.h
@@ -0,0 +1,58 @@
1/* -*- mode: c; c-basic-offset: 8; -*-
2 * vim: noexpandtab sw=8 ts=8 sts=0:
3 *
4 * namei.h
5 *
6 * Function prototypes
7 *
8 * Copyright (C) 2002, 2004 Oracle. All rights reserved.
9 *
10 * This program is free software; you can redistribute it and/or
11 * modify it under the terms of the GNU General Public
12 * License as published by the Free Software Foundation; either
13 * version 2 of the License, or (at your option) any later version.
14 *
15 * This program is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
18 * General Public License for more details.
19 *
20 * You should have received a copy of the GNU General Public
21 * License along with this program; if not, write to the
22 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
23 * Boston, MA 02111-1307, USA.
24 */
25
26#ifndef OCFS2_NAMEI_H
27#define OCFS2_NAMEI_H
28
29extern struct inode_operations ocfs2_dir_iops;
30
31struct dentry *ocfs2_get_parent(struct dentry *child);
32
33int ocfs2_check_dir_entry (struct inode *dir,
34 struct ocfs2_dir_entry *de,
35 struct buffer_head *bh,
36 unsigned long offset);
37struct buffer_head *ocfs2_find_entry(const char *name,
38 int namelen,
39 struct inode *dir,
40 struct ocfs2_dir_entry **res_dir);
41int ocfs2_orphan_del(struct ocfs2_super *osb,
42 struct ocfs2_journal_handle *handle,
43 struct inode *orphan_dir_inode,
44 struct inode *inode,
45 struct buffer_head *orphan_dir_bh);
46
47static inline int ocfs2_match(int len,
48 const char * const name,
49 struct ocfs2_dir_entry *de)
50{
51 if (len != de->name_len)
52 return 0;
53 if (!de->inode)
54 return 0;
55 return !memcmp(name, de->name, len);
56}
57
58#endif /* OCFS2_NAMEI_H */
diff --git a/fs/ocfs2/ocfs1_fs_compat.h b/fs/ocfs2/ocfs1_fs_compat.h
new file mode 100644
index 000000000000..0b499bccec5a
--- /dev/null
+++ b/fs/ocfs2/ocfs1_fs_compat.h
@@ -0,0 +1,109 @@
1/* -*- mode: c; c-basic-offset: 8; -*-
2 * vim: noexpandtab sw=8 ts=8 sts=0:
3 *
4 * ocfs1_fs_compat.h
5 *
6 * OCFS1 volume header definitions. OCFS2 creates valid but unmountable
7 * OCFS1 volume headers on the first two sectors of an OCFS2 volume.
8 * This allows an OCFS1 volume to see the partition and cleanly fail to
9 * mount it.
10 *
11 * Copyright (C) 2002, 2004 Oracle. All rights reserved.
12 *
13 * This program is free software; you can redistribute it and/or
14 * modify it under the terms of the GNU General Public
15 * License, version 2, as published by the Free Software Foundation.
16 *
17 * This program is distributed in the hope that it will be useful,
18 * but WITHOUT ANY WARRANTY; without even the implied warranty of
19 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
20 * General Public License for more details.
21 *
22 * You should have received a copy of the GNU General Public
23 * License along with this program; if not, write to the
24 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
25 * Boston, MA 02111-1307, USA.
26 */
27
28#ifndef _OCFS1_FS_COMPAT_H
29#define _OCFS1_FS_COMPAT_H
30
31#define OCFS1_MAX_VOL_SIGNATURE_LEN 128
32#define OCFS1_MAX_MOUNT_POINT_LEN 128
33#define OCFS1_MAX_VOL_ID_LENGTH 16
34#define OCFS1_MAX_VOL_LABEL_LEN 64
35#define OCFS1_MAX_CLUSTER_NAME_LEN 64
36
37#define OCFS1_MAJOR_VERSION (2)
38#define OCFS1_MINOR_VERSION (0)
39#define OCFS1_VOLUME_SIGNATURE "OracleCFS"
40
41/*
42 * OCFS1 superblock. Lives at sector 0.
43 */
44struct ocfs1_vol_disk_hdr
45{
46/*00*/ __u32 minor_version;
47 __u32 major_version;
48/*08*/ __u8 signature[OCFS1_MAX_VOL_SIGNATURE_LEN];
49/*88*/ __u8 mount_point[OCFS1_MAX_MOUNT_POINT_LEN];
50/*108*/ __u64 serial_num;
51/*110*/ __u64 device_size;
52 __u64 start_off;
53/*120*/ __u64 bitmap_off;
54 __u64 publ_off;
55/*130*/ __u64 vote_off;
56 __u64 root_bitmap_off;
57/*140*/ __u64 data_start_off;
58 __u64 root_bitmap_size;
59/*150*/ __u64 root_off;
60 __u64 root_size;
61/*160*/ __u64 cluster_size;
62 __u64 num_nodes;
63/*170*/ __u64 num_clusters;
64 __u64 dir_node_size;
65/*180*/ __u64 file_node_size;
66 __u64 internal_off;
67/*190*/ __u64 node_cfg_off;
68 __u64 node_cfg_size;
69/*1A0*/ __u64 new_cfg_off;
70 __u32 prot_bits;
71 __s32 excl_mount;
72/*1B0*/
73};
74
75
/*
 * OCFS1 on-disk lock record. It is embedded as the first member of
 * struct ocfs1_vol_label. The bracketed markers are hexadecimal byte
 * offsets from the start of the structure; the members add up to 0x30
 * bytes.
 */
76struct ocfs1_disk_lock
77{
78/*00*/ __u32 curr_master;
79 __u8 file_lock;
80 __u8 compat_pad[3]; /* Not in original definition. Used to
81 make the already existing alignment
82 explicit */
83 __u64 last_write_time;
84/*10*/ __u64 last_read_time;
85 __u32 writer_node_num;
86 __u32 reader_node_num;
87/*20*/ __u64 oin_node_map;
88 __u64 dlock_seq_num;
89/*30*/
90};
91
92/*
93 * OCFS1 volume label. Lives at sector 1.
94 */
/*
 * The bracketed markers are hexadecimal byte offsets from the start of
 * the structure.
 * NOTE(review): the last two markers previously read A4/A6, which does
 * not match OCFS1_MAX_CLUSTER_NAME_LEN == 64 as defined in this header
 * (0x84 + 0x40 = 0xC4). They are corrected here to follow the
 * declarations; confirm against the OCFS1 on-disk format.
 */
95struct ocfs1_vol_label
96{
97/*00*/ struct ocfs1_disk_lock disk_lock;
98/*30*/ __u8 label[OCFS1_MAX_VOL_LABEL_LEN];
99/*70*/ __u16 label_len;
100/*72*/ __u8 vol_id[OCFS1_MAX_VOL_ID_LENGTH];
101/*82*/ __u16 vol_id_len;
102/*84*/ __u8 cluster_name[OCFS1_MAX_CLUSTER_NAME_LEN];
103/*C4*/ __u16 cluster_name_len;
104/*C6*/
105};
106
107
108#endif /* _OCFS1_FS_COMPAT_H */
109
diff --git a/fs/ocfs2/ocfs2.h b/fs/ocfs2/ocfs2.h
new file mode 100644
index 000000000000..f468c600cf92
--- /dev/null
+++ b/fs/ocfs2/ocfs2.h
@@ -0,0 +1,464 @@
1/* -*- mode: c; c-basic-offset: 8; -*-
2 * vim: noexpandtab sw=8 ts=8 sts=0:
3 *
4 * ocfs2.h
5 *
6 * Defines macros and structures used in OCFS2
7 *
8 * Copyright (C) 2002, 2004 Oracle. All rights reserved.
9 *
10 * This program is free software; you can redistribute it and/or
11 * modify it under the terms of the GNU General Public
12 * License as published by the Free Software Foundation; either
13 * version 2 of the License, or (at your option) any later version.
14 *
15 * This program is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
18 * General Public License for more details.
19 *
20 * You should have received a copy of the GNU General Public
21 * License along with this program; if not, write to the
22 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
23 * Boston, MA 02111-1307, USA.
24 */
25
26#ifndef OCFS2_H
27#define OCFS2_H
28
29#include <linux/spinlock.h>
30#include <linux/sched.h>
31#include <linux/wait.h>
32#include <linux/list.h>
33#include <linux/rbtree.h>
34#include <linux/workqueue.h>
35#include <linux/kref.h>
36
37#include "cluster/nodemanager.h"
38#include "cluster/heartbeat.h"
39#include "cluster/tcp.h"
40
41#include "dlm/dlmapi.h"
42
43#include "ocfs2_fs.h"
44#include "endian.h"
45#include "ocfs2_lockid.h"
46
47struct ocfs2_extent_map {
48 u32 em_clusters;
49 struct rb_root em_extents;
50};
51
52/* Most user visible OCFS2 inodes will have very few pieces of
53 * metadata, but larger files (including bitmaps, etc) must be taken
54 * into account when designing an access scheme. We allow a small
55 * amount of inlined blocks to be stored on an array and grow the
56 * structure into a rb tree when necessary. */
57#define OCFS2_INODE_MAX_CACHE_ARRAY 2
58
59struct ocfs2_caching_info {
60 unsigned int ci_num_cached;
61 union {
62 sector_t ci_array[OCFS2_INODE_MAX_CACHE_ARRAY];
63 struct rb_root ci_tree;
64 } ci_cache;
65};
66
67/* this limits us to 256 nodes
68 * if we need more, we can do a kmalloc for the map */
69#define OCFS2_NODE_MAP_MAX_NODES 256
70struct ocfs2_node_map {
71 u16 num_nodes;
72 unsigned long map[BITS_TO_LONGS(OCFS2_NODE_MAP_MAX_NODES)];
73};
74
75enum ocfs2_ast_action {
76 OCFS2_AST_INVALID = 0,
77 OCFS2_AST_ATTACH,
78 OCFS2_AST_CONVERT,
79 OCFS2_AST_DOWNCONVERT,
80};
81
82/* actions for an unlockast function to take. */
83enum ocfs2_unlock_action {
84 OCFS2_UNLOCK_INVALID = 0,
85 OCFS2_UNLOCK_CANCEL_CONVERT,
86 OCFS2_UNLOCK_DROP_LOCK,
87};
88
89/* ocfs2_lock_res->l_flags flags. */
90#define OCFS2_LOCK_ATTACHED (0x00000001) /* have we initialized
91 * the lvb */
92#define OCFS2_LOCK_BUSY (0x00000002) /* we are currently in
93 * dlm_lock */
94#define OCFS2_LOCK_BLOCKED (0x00000004) /* blocked waiting to
95 * downconvert*/
96#define OCFS2_LOCK_LOCAL (0x00000008) /* newly created inode */
97#define OCFS2_LOCK_NEEDS_REFRESH (0x00000010)
98#define OCFS2_LOCK_REFRESHING (0x00000020)
99#define OCFS2_LOCK_INITIALIZED (0x00000040) /* track initialization
100 * for shutdown paths */
101#define OCFS2_LOCK_FREEING (0x00000080) /* help dlmglue track
102 * when to skip queueing
103 * a lock because it's
104 * about to be
105 * dropped. */
106#define OCFS2_LOCK_QUEUED (0x00000100) /* queued for downconvert */
107
108struct ocfs2_lock_res_ops;
109
110typedef void (*ocfs2_lock_callback)(int status, unsigned long data);
111
112struct ocfs2_lock_res {
113 void *l_priv;
114 struct ocfs2_lock_res_ops *l_ops;
115 spinlock_t l_lock;
116
117 struct list_head l_blocked_list;
118 struct list_head l_mask_waiters;
119
120 enum ocfs2_lock_type l_type;
121 unsigned long l_flags;
122 char l_name[OCFS2_LOCK_ID_MAX_LEN];
123 int l_level;
124 unsigned int l_ro_holders;
125 unsigned int l_ex_holders;
126 struct dlm_lockstatus l_lksb;
127
128 /* used from AST/BAST funcs. */
129 enum ocfs2_ast_action l_action;
130 enum ocfs2_unlock_action l_unlock_action;
131 int l_requested;
132 int l_blocking;
133
134 wait_queue_head_t l_event;
135
136 struct list_head l_debug_list;
137};
138
139struct ocfs2_dlm_debug {
140 struct kref d_refcnt;
141 struct dentry *d_locking_state;
142 struct list_head d_lockres_tracking;
143};
144
145enum ocfs2_vol_state
146{
147 VOLUME_INIT = 0,
148 VOLUME_MOUNTED,
149 VOLUME_DISMOUNTED,
150 VOLUME_DISABLED
151};
152
153struct ocfs2_alloc_stats
154{
155 atomic_t moves;
156 atomic_t local_data;
157 atomic_t bitmap_data;
158 atomic_t bg_allocs;
159 atomic_t bg_extends;
160};
161
162enum ocfs2_local_alloc_state
163{
164 OCFS2_LA_UNUSED = 0,
165 OCFS2_LA_ENABLED,
166 OCFS2_LA_DISABLED
167};
168
169enum ocfs2_mount_options
170{
171 OCFS2_MOUNT_HB_LOCAL = 1 << 0, /* Heartbeat started in local mode */
172 OCFS2_MOUNT_BARRIER = 1 << 1, /* Use block barriers */
173 OCFS2_MOUNT_NOINTR = 1 << 2, /* Don't catch signals */
174 OCFS2_MOUNT_ERRORS_PANIC = 1 << 3, /* Panic on errors */
175 OCFS2_MOUNT_DATA_WRITEBACK = 1 << 4, /* No data ordering */
176#ifdef OCFS2_ORACORE_WORKAROUNDS
177 OCFS2_MOUNT_COMPAT_OCFS = 1 << 30, /* ocfs1 compatibility mode */
178#endif
179};
180
181#define OCFS2_OSB_SOFT_RO 0x0001
182#define OCFS2_OSB_HARD_RO 0x0002
183#define OCFS2_OSB_ERROR_FS 0x0004
184
185struct ocfs2_journal;
186struct ocfs2_journal_handle;
187struct ocfs2_super
188{
189 u32 osb_id; /* id used by the proc interface */
190 struct task_struct *commit_task;
191 struct super_block *sb;
192 struct inode *root_inode;
193 struct inode *sys_root_inode;
194 struct inode *system_inodes[NUM_SYSTEM_INODES];
195
196 struct ocfs2_slot_info *slot_info;
197
198 spinlock_t node_map_lock;
199 struct ocfs2_node_map mounted_map;
200 struct ocfs2_node_map recovery_map;
201 struct ocfs2_node_map umount_map;
202
203 u32 num_clusters;
204 u64 root_blkno;
205 u64 system_dir_blkno;
206 u64 bitmap_blkno;
207 u32 bitmap_cpg;
208 u8 *uuid;
209 char *uuid_str;
210 u8 *vol_label;
211 u64 first_cluster_group_blkno;
212 u32 fs_generation;
213
214 u32 s_feature_compat;
215 u32 s_feature_incompat;
216 u32 s_feature_ro_compat;
217
218 /* Protects s_next_generation, osb_flags. Could protect more on
219 * osb as it's very short lived. */
220 spinlock_t osb_lock;
221 u32 s_next_generation;
222 unsigned long osb_flags;
223
224 unsigned long s_mount_opt;
225
226 u16 max_slots;
227 u16 num_nodes;
228 s16 node_num;
229 s16 slot_num;
230 int s_sectsize_bits;
231 int s_clustersize;
232 int s_clustersize_bits;
233 struct proc_dir_entry *proc_sub_dir; /* points to /proc/fs/ocfs2/<maj_min> */
234
235 atomic_t vol_state;
236 struct semaphore recovery_lock;
237 struct task_struct *recovery_thread_task;
238 int disable_recovery;
239 wait_queue_head_t checkpoint_event;
240 atomic_t needs_checkpoint;
241 struct ocfs2_journal *journal;
242
243 enum ocfs2_local_alloc_state local_alloc_state;
244 struct buffer_head *local_alloc_bh;
245
246 /* Next two fields are for local node slot recovery during
247 * mount. */
248 int dirty;
249 struct ocfs2_dinode *local_alloc_copy;
250
251 struct ocfs2_alloc_stats alloc_stats;
252 char dev_str[20]; /* "major,minor" of the device */
253
254 struct dlm_ctxt *dlm;
255 struct ocfs2_lock_res osb_super_lockres;
256 struct ocfs2_lock_res osb_rename_lockres;
257 struct dlm_eviction_cb osb_eviction_cb;
258 struct ocfs2_dlm_debug *osb_dlm_debug;
259
260 struct dentry *osb_debug_root;
261
262 wait_queue_head_t recovery_event;
263
264 spinlock_t vote_task_lock;
265 struct task_struct *vote_task;
266 wait_queue_head_t vote_event;
267 unsigned long vote_wake_sequence;
268 unsigned long vote_work_sequence;
269
270 struct list_head blocked_lock_list;
271 unsigned long blocked_lock_count;
272
273 struct list_head vote_list;
274 int vote_count;
275
276 u32 net_key;
277 spinlock_t net_response_lock;
278 unsigned int net_response_ids;
279 struct list_head net_response_list;
280
281 struct o2hb_callback_func osb_hb_up;
282 struct o2hb_callback_func osb_hb_down;
283
284 struct list_head osb_net_handlers;
285
286 wait_queue_head_t osb_mount_event;
287
288 /* Truncate log info */
289 struct inode *osb_tl_inode;
290 struct buffer_head *osb_tl_bh;
291 struct work_struct osb_truncate_log_wq;
292};
293
294#define OCFS2_SB(sb) ((struct ocfs2_super *)(sb)->s_fs_info)
295#define OCFS2_MAX_OSB_ID 65536
296
297static inline int ocfs2_should_order_data(struct inode *inode)
298{
299 if (!S_ISREG(inode->i_mode))
300 return 0;
301 if (OCFS2_SB(inode->i_sb)->s_mount_opt & OCFS2_MOUNT_DATA_WRITEBACK)
302 return 0;
303 return 1;
304}
305
306/* Set / clear functions exist because cluster events can make these
307 * happen in parallel, so we want the transitions to be atomic. This also
308 * means that any future flags in osb_flags must be protected by the
309 * spinlock too! */
310static inline void ocfs2_set_osb_flag(struct ocfs2_super *osb,
311 unsigned long flag)
312{
313 spin_lock(&osb->osb_lock);
314 osb->osb_flags |= flag;
315 spin_unlock(&osb->osb_lock);
316}
317
318static inline void ocfs2_set_ro_flag(struct ocfs2_super *osb,
319 int hard)
320{
321 spin_lock(&osb->osb_lock);
322 osb->osb_flags &= ~(OCFS2_OSB_SOFT_RO|OCFS2_OSB_HARD_RO);
323 if (hard)
324 osb->osb_flags |= OCFS2_OSB_HARD_RO;
325 else
326 osb->osb_flags |= OCFS2_OSB_SOFT_RO;
327 spin_unlock(&osb->osb_lock);
328}
329
330static inline int ocfs2_is_hard_readonly(struct ocfs2_super *osb)
331{
332 int ret;
333
334 spin_lock(&osb->osb_lock);
335 ret = osb->osb_flags & OCFS2_OSB_HARD_RO;
336 spin_unlock(&osb->osb_lock);
337
338 return ret;
339}
340
341static inline int ocfs2_is_soft_readonly(struct ocfs2_super *osb)
342{
343 int ret;
344
345 spin_lock(&osb->osb_lock);
346 ret = osb->osb_flags & OCFS2_OSB_SOFT_RO;
347 spin_unlock(&osb->osb_lock);
348
349 return ret;
350}
351
/*
 * Nonzero when the i_signature member of *ptr compares equal (strcmp)
 * to OCFS2_INODE_SIGNATURE.
 */
#define OCFS2_IS_VALID_DINODE(ptr)					\
	(!strcmp((ptr)->i_signature, OCFS2_INODE_SIGNATURE))

/*
 * Report a dinode with a bad signature through ocfs2_error(), printing
 * its i_blkno and at most 7 bytes of its signature. __di is evaluated
 * exactly once via the typeof() temporary.
 * (Presumably ocfs2_error() is what takes the volume read-only, as the
 * macro name suggests; that is not visible from this header.)
 *
 * The expansion intentionally does NOT end in a semicolon: the caller
 * supplies it, which keeps the macro a single statement and safe inside
 * an unbraced if/else.
 */
#define OCFS2_RO_ON_INVALID_DINODE(__sb, __di)	do {			\
	typeof(__di) ____di = (__di);					\
	ocfs2_error((__sb),						\
		"Dinode # %"MLFu64" has bad signature %.*s",		\
		(____di)->i_blkno, 7,					\
		(____di)->i_signature);					\
} while (0)
362
/*
 * Nonzero when the h_signature member of *ptr compares equal (strcmp)
 * to OCFS2_EXTENT_BLOCK_SIGNATURE.
 */
#define OCFS2_IS_VALID_EXTENT_BLOCK(ptr)				\
	(!strcmp((ptr)->h_signature, OCFS2_EXTENT_BLOCK_SIGNATURE))

/*
 * Report an extent block with a bad signature through ocfs2_error(),
 * printing its h_blkno and at most 7 bytes of its signature. __eb is
 * evaluated exactly once via the typeof() temporary.
 *
 * The expansion intentionally does NOT end in a semicolon: the caller
 * supplies it, which keeps the macro a single statement and safe inside
 * an unbraced if/else.
 */
#define OCFS2_RO_ON_INVALID_EXTENT_BLOCK(__sb, __eb)	do {		\
	typeof(__eb) ____eb = (__eb);					\
	ocfs2_error((__sb),						\
		"Extent Block # %"MLFu64" has bad signature %.*s",	\
		(____eb)->h_blkno, 7,					\
		(____eb)->h_signature);					\
} while (0)
373
/*
 * Nonzero when the bg_signature member of *ptr compares equal (strcmp)
 * to OCFS2_GROUP_DESC_SIGNATURE.
 */
#define OCFS2_IS_VALID_GROUP_DESC(ptr)					\
	(!strcmp((ptr)->bg_signature, OCFS2_GROUP_DESC_SIGNATURE))

/*
 * Report a group descriptor with a bad signature through ocfs2_error(),
 * printing its bg_blkno and at most 7 bytes of its signature. __gd is
 * evaluated exactly once via the typeof() temporary.
 *
 * The expansion intentionally does NOT end in a semicolon: the caller
 * supplies it, which keeps the macro a single statement and safe inside
 * an unbraced if/else.
 */
#define OCFS2_RO_ON_INVALID_GROUP_DESC(__sb, __gd)	do {		\
	typeof(__gd) ____gd = (__gd);					\
	ocfs2_error((__sb),						\
		"Group Descriptor # %"MLFu64" has bad signature %.*s",	\
		(____gd)->bg_blkno, 7,					\
		(____gd)->bg_signature);				\
} while (0)
384
385static inline unsigned long ino_from_blkno(struct super_block *sb,
386 u64 blkno)
387{
388 return (unsigned long)(blkno & (u64)ULONG_MAX);
389}
390
391static inline u64 ocfs2_clusters_to_blocks(struct super_block *sb,
392 u32 clusters)
393{
394 int c_to_b_bits = OCFS2_SB(sb)->s_clustersize_bits -
395 sb->s_blocksize_bits;
396
397 return (u64)clusters << c_to_b_bits;
398}
399
400static inline u32 ocfs2_blocks_to_clusters(struct super_block *sb,
401 u64 blocks)
402{
403 int b_to_c_bits = OCFS2_SB(sb)->s_clustersize_bits -
404 sb->s_blocksize_bits;
405
406 return (u32)(blocks >> b_to_c_bits);
407}
408
409static inline unsigned int ocfs2_clusters_for_bytes(struct super_block *sb,
410 u64 bytes)
411{
412 int cl_bits = OCFS2_SB(sb)->s_clustersize_bits;
413 unsigned int clusters;
414
415 bytes += OCFS2_SB(sb)->s_clustersize - 1;
416 /* OCFS2 just cannot have enough clusters to overflow this */
417 clusters = (unsigned int)(bytes >> cl_bits);
418
419 return clusters;
420}
421
422static inline u64 ocfs2_blocks_for_bytes(struct super_block *sb,
423 u64 bytes)
424{
425 bytes += sb->s_blocksize - 1;
426 return bytes >> sb->s_blocksize_bits;
427}
428
429static inline u64 ocfs2_clusters_to_bytes(struct super_block *sb,
430 u32 clusters)
431{
432 return (u64)clusters << OCFS2_SB(sb)->s_clustersize_bits;
433}
434
435static inline u64 ocfs2_align_bytes_to_clusters(struct super_block *sb,
436 u64 bytes)
437{
438 int cl_bits = OCFS2_SB(sb)->s_clustersize_bits;
439 unsigned int clusters;
440
441 clusters = ocfs2_clusters_for_bytes(sb, bytes);
442 return (u64)clusters << cl_bits;
443}
444
445static inline u64 ocfs2_align_bytes_to_blocks(struct super_block *sb,
446 u64 bytes)
447{
448 u64 blocks;
449
450 blocks = ocfs2_blocks_for_bytes(sb, bytes);
451 return blocks << sb->s_blocksize_bits;
452}
453
454static inline unsigned long ocfs2_align_bytes_to_sectors(u64 bytes)
455{
456 return (unsigned long)((bytes + 511) >> 9);
457}
458
459#define ocfs2_set_bit ext2_set_bit
460#define ocfs2_clear_bit ext2_clear_bit
461#define ocfs2_test_bit ext2_test_bit
462#define ocfs2_find_next_zero_bit ext2_find_next_zero_bit
463#endif /* OCFS2_H */
464
diff --git a/fs/ocfs2/ocfs2_fs.h b/fs/ocfs2/ocfs2_fs.h
new file mode 100644
index 000000000000..dfb8a5bedfc8
--- /dev/null
+++ b/fs/ocfs2/ocfs2_fs.h
@@ -0,0 +1,638 @@
1/* -*- mode: c; c-basic-offset: 8; -*-
2 * vim: noexpandtab sw=8 ts=8 sts=0:
3 *
4 * ocfs2_fs.h
5 *
6 * On-disk structures for OCFS2.
7 *
8 * Copyright (C) 2002, 2004 Oracle. All rights reserved.
9 *
10 * This program is free software; you can redistribute it and/or
11 * modify it under the terms of the GNU General Public
12 * License, version 2, as published by the Free Software Foundation.
13 *
14 * This program is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 * General Public License for more details.
18 *
19 * You should have received a copy of the GNU General Public
20 * License along with this program; if not, write to the
21 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
22 * Boston, MA 02111-1307, USA.
23 */
24
25#ifndef _OCFS2_FS_H
26#define _OCFS2_FS_H
27
28/* Version */
29#define OCFS2_MAJOR_REV_LEVEL 0
30#define OCFS2_MINOR_REV_LEVEL 90
31
32/*
33 * An OCFS2 volume starts this way:
34 * Sector 0: Valid ocfs1_vol_disk_hdr that cleanly fails to mount OCFS.
35 * Sector 1: Valid ocfs1_vol_label that cleanly fails to mount OCFS.
36 * Block OCFS2_SUPER_BLOCK_BLKNO: OCFS2 superblock.
37 *
38 * All other structures are found from the superblock information.
39 *
40 * OCFS2_SUPER_BLOCK_BLKNO is in blocks, not sectors. eg, for a
41 * blocksize of 2K, it is 4096 bytes into disk.
42 */
43#define OCFS2_SUPER_BLOCK_BLKNO 2
44
45/*
46 * Cluster size limits. The maximum is kept arbitrarily at 1 MB, and could
47 * grow if needed.
48 */
49#define OCFS2_MIN_CLUSTERSIZE 4096
50#define OCFS2_MAX_CLUSTERSIZE 1048576
51
52/*
53 * Blocks cannot be bigger than clusters, so the maximum blocksize is the
54 * minimum cluster size.
55 */
56#define OCFS2_MIN_BLOCKSIZE 512
57#define OCFS2_MAX_BLOCKSIZE OCFS2_MIN_CLUSTERSIZE
58
59/* Filesystem magic number */
60#define OCFS2_SUPER_MAGIC 0x7461636f
61
62/* Object signatures */
63#define OCFS2_SUPER_BLOCK_SIGNATURE "OCFSV2"
64#define OCFS2_INODE_SIGNATURE "INODE01"
65#define OCFS2_EXTENT_BLOCK_SIGNATURE "EXBLK01"
66#define OCFS2_GROUP_DESC_SIGNATURE "GROUP01"
67
68/* Compatibility flags */
69#define OCFS2_HAS_COMPAT_FEATURE(sb,mask) \
70 ( OCFS2_SB(sb)->s_feature_compat & (mask) )
71#define OCFS2_HAS_RO_COMPAT_FEATURE(sb,mask) \
72 ( OCFS2_SB(sb)->s_feature_ro_compat & (mask) )
73#define OCFS2_HAS_INCOMPAT_FEATURE(sb,mask) \
74 ( OCFS2_SB(sb)->s_feature_incompat & (mask) )
75#define OCFS2_SET_COMPAT_FEATURE(sb,mask) \
76 OCFS2_SB(sb)->s_feature_compat |= (mask)
77#define OCFS2_SET_RO_COMPAT_FEATURE(sb,mask) \
78 OCFS2_SB(sb)->s_feature_ro_compat |= (mask)
79#define OCFS2_SET_INCOMPAT_FEATURE(sb,mask) \
80 OCFS2_SB(sb)->s_feature_incompat |= (mask)
81#define OCFS2_CLEAR_COMPAT_FEATURE(sb,mask) \
82 OCFS2_SB(sb)->s_feature_compat &= ~(mask)
83#define OCFS2_CLEAR_RO_COMPAT_FEATURE(sb,mask) \
84 OCFS2_SB(sb)->s_feature_ro_compat &= ~(mask)
85#define OCFS2_CLEAR_INCOMPAT_FEATURE(sb,mask) \
86 OCFS2_SB(sb)->s_feature_incompat &= ~(mask)
87
88#define OCFS2_FEATURE_COMPAT_SUPP 0
89#define OCFS2_FEATURE_INCOMPAT_SUPP 0
90#define OCFS2_FEATURE_RO_COMPAT_SUPP 0
91
92/*
93 * Heartbeat-only devices are missing journals and other files. The
94 * filesystem driver can't load them, but the library can. Never put
95 * this in OCFS2_FEATURE_INCOMPAT_SUPP, *ever*.
96 */
97#define OCFS2_FEATURE_INCOMPAT_HEARTBEAT_DEV 0x0002
98
99
100/*
101 * Flags on ocfs2_dinode.i_flags
102 */
103#define OCFS2_VALID_FL (0x00000001) /* Inode is valid */
104#define OCFS2_UNUSED2_FL (0x00000002)
105#define OCFS2_ORPHANED_FL (0x00000004) /* On the orphan list */
106#define OCFS2_UNUSED3_FL (0x00000008)
107/* System inode flags */
108#define OCFS2_SYSTEM_FL (0x00000010) /* System inode */
109#define OCFS2_SUPER_BLOCK_FL (0x00000020) /* Super block */
110#define OCFS2_LOCAL_ALLOC_FL (0x00000040) /* Slot local alloc bitmap */
111#define OCFS2_BITMAP_FL (0x00000080) /* Allocation bitmap */
112#define OCFS2_JOURNAL_FL (0x00000100) /* Slot local journal */
113#define OCFS2_HEARTBEAT_FL (0x00000200) /* Heartbeat area */
114#define OCFS2_CHAIN_FL (0x00000400) /* Chain allocator */
115#define OCFS2_DEALLOC_FL (0x00000800) /* Truncate log */
116
117/*
118 * Journal Flags (ocfs2_dinode.id1.journal1.i_flags)
119 */
120#define OCFS2_JOURNAL_DIRTY_FL (0x00000001) /* Journal needs recovery */
121
122/*
123 * superblock s_state flags
124 */
125#define OCFS2_ERROR_FS (0x00000001) /* FS saw errors */
126
127/* Limit of space in ocfs2_dir_entry */
128#define OCFS2_MAX_FILENAME_LEN 255
129
130/* Maximum slots on an ocfs2 file system */
131#define OCFS2_MAX_SLOTS 255
132
133/* Slot map indicator for an empty slot */
134#define OCFS2_INVALID_SLOT -1
135
136#define OCFS2_VOL_UUID_LEN 16
137#define OCFS2_MAX_VOL_LABEL_LEN 64
138
139/* Journal limits (in bytes) */
140#define OCFS2_MIN_JOURNAL_SIZE (4 * 1024 * 1024)
141#define OCFS2_MAX_JOURNAL_SIZE (500 * 1024 * 1024)
142
143struct ocfs2_system_inode_info {
144 char *si_name;
145 int si_iflags;
146 int si_mode;
147};
148
149/* System file index */
150enum {
151 BAD_BLOCK_SYSTEM_INODE = 0,
152 GLOBAL_INODE_ALLOC_SYSTEM_INODE,
153 SLOT_MAP_SYSTEM_INODE,
154#define OCFS2_FIRST_ONLINE_SYSTEM_INODE SLOT_MAP_SYSTEM_INODE
155 HEARTBEAT_SYSTEM_INODE,
156 GLOBAL_BITMAP_SYSTEM_INODE,
157#define OCFS2_LAST_GLOBAL_SYSTEM_INODE GLOBAL_BITMAP_SYSTEM_INODE
158 ORPHAN_DIR_SYSTEM_INODE,
159 EXTENT_ALLOC_SYSTEM_INODE,
160 INODE_ALLOC_SYSTEM_INODE,
161 JOURNAL_SYSTEM_INODE,
162 LOCAL_ALLOC_SYSTEM_INODE,
163 TRUNCATE_LOG_SYSTEM_INODE,
164 NUM_SYSTEM_INODES
165};
166
167static struct ocfs2_system_inode_info ocfs2_system_inodes[NUM_SYSTEM_INODES] = {
168 /* Global system inodes (single copy) */
169 /* The first two are only used from userspace mkfs/tunefs */
170 [BAD_BLOCK_SYSTEM_INODE] = { "bad_blocks", 0, S_IFREG | 0644 },
171 [GLOBAL_INODE_ALLOC_SYSTEM_INODE] = { "global_inode_alloc", OCFS2_BITMAP_FL | OCFS2_CHAIN_FL, S_IFREG | 0644 },
172
173 /* These are used by the running filesystem */
174 [SLOT_MAP_SYSTEM_INODE] = { "slot_map", 0, S_IFREG | 0644 },
175 [HEARTBEAT_SYSTEM_INODE] = { "heartbeat", OCFS2_HEARTBEAT_FL, S_IFREG | 0644 },
176 [GLOBAL_BITMAP_SYSTEM_INODE] = { "global_bitmap", 0, S_IFREG | 0644 },
177
178 /* Slot-specific system inodes (one copy per slot) */
179 [ORPHAN_DIR_SYSTEM_INODE] = { "orphan_dir:%04d", 0, S_IFDIR | 0755 },
180 [EXTENT_ALLOC_SYSTEM_INODE] = { "extent_alloc:%04d", OCFS2_BITMAP_FL | OCFS2_CHAIN_FL, S_IFREG | 0644 },
181 [INODE_ALLOC_SYSTEM_INODE] = { "inode_alloc:%04d", OCFS2_BITMAP_FL | OCFS2_CHAIN_FL, S_IFREG | 0644 },
182 [JOURNAL_SYSTEM_INODE] = { "journal:%04d", OCFS2_JOURNAL_FL, S_IFREG | 0644 },
183 [LOCAL_ALLOC_SYSTEM_INODE] = { "local_alloc:%04d", OCFS2_BITMAP_FL | OCFS2_LOCAL_ALLOC_FL, S_IFREG | 0644 },
184 [TRUNCATE_LOG_SYSTEM_INODE] = { "truncate_log:%04d", OCFS2_DEALLOC_FL, S_IFREG | 0644 }
185};
186
187/* Parameter passed from mount.ocfs2 to module */
188#define OCFS2_HB_NONE "heartbeat=none"
189#define OCFS2_HB_LOCAL "heartbeat=local"
190
191/*
192 * OCFS2 directory file types. Only the low 3 bits are used. The
193 * other bits are reserved for now.
194 */
195#define OCFS2_FT_UNKNOWN 0
196#define OCFS2_FT_REG_FILE 1
197#define OCFS2_FT_DIR 2
198#define OCFS2_FT_CHRDEV 3
199#define OCFS2_FT_BLKDEV 4
200#define OCFS2_FT_FIFO 5
201#define OCFS2_FT_SOCK 6
202#define OCFS2_FT_SYMLINK 7
203
204#define OCFS2_FT_MAX 8
205
206/*
207 * OCFS2_DIR_PAD defines the directory entries boundaries
208 *
209 * NOTE: It must be a multiple of 4
210 */
211#define OCFS2_DIR_PAD 4
212#define OCFS2_DIR_ROUND (OCFS2_DIR_PAD - 1)
213#define OCFS2_DIR_MEMBER_LEN offsetof(struct ocfs2_dir_entry, name)
214#define OCFS2_DIR_REC_LEN(name_len) (((name_len) + OCFS2_DIR_MEMBER_LEN + \
215 OCFS2_DIR_ROUND) & \
216 ~OCFS2_DIR_ROUND)
217
218#define OCFS2_LINK_MAX 32000
219
220#define S_SHIFT 12
221static unsigned char ocfs2_type_by_mode[S_IFMT >> S_SHIFT] = {
222 [S_IFREG >> S_SHIFT] = OCFS2_FT_REG_FILE,
223 [S_IFDIR >> S_SHIFT] = OCFS2_FT_DIR,
224 [S_IFCHR >> S_SHIFT] = OCFS2_FT_CHRDEV,
225 [S_IFBLK >> S_SHIFT] = OCFS2_FT_BLKDEV,
226 [S_IFIFO >> S_SHIFT] = OCFS2_FT_FIFO,
227 [S_IFSOCK >> S_SHIFT] = OCFS2_FT_SOCK,
228 [S_IFLNK >> S_SHIFT] = OCFS2_FT_SYMLINK,
229};
230
231
232/*
233 * Convenience casts
234 */
235#define OCFS2_RAW_SB(dinode) (&((dinode)->id2.i_super))
236
237/*
238 * On disk extent record for OCFS2
239 * It describes a range of clusters on disk.
240 */
241struct ocfs2_extent_rec {
242/*00*/ __le32 e_cpos; /* Offset into the file, in clusters */
243 __le32 e_clusters; /* Clusters covered by this extent */
244 __le64 e_blkno; /* Physical disk offset, in blocks */
245/*10*/
246};
247
248struct ocfs2_chain_rec {
249 __le32 c_free; /* Number of free bits in this chain. */
250 __le32 c_total; /* Number of total bits in this chain */
251 __le64 c_blkno; /* Physical disk offset (blocks) of 1st group */
252};
253
254struct ocfs2_truncate_rec {
255 __le32 t_start; /* 1st cluster in this log */
256 __le32 t_clusters; /* Number of total clusters covered */
257};
258
259/*
260 * On disk extent list for OCFS2 (node in the tree). Note that this
261 * is contained inside ocfs2_dinode or ocfs2_extent_block, so the
262 * offsets are relative to ocfs2_dinode.id2.i_list or
263 * ocfs2_extent_block.h_list, respectively.
264 */
265struct ocfs2_extent_list {
266/*00*/ __le16 l_tree_depth; /* Extent tree depth from this
267 point. 0 means data extents
268 hang directly off this
269 header (a leaf) */
270 __le16 l_count; /* Number of extent records */
271 __le16 l_next_free_rec; /* Next unused extent slot */
272 __le16 l_reserved1;
273 __le64 l_reserved2; /* Pad to
274 sizeof(ocfs2_extent_rec) */
275/*10*/ struct ocfs2_extent_rec l_recs[0]; /* Extent records */
276};
277
278/*
279 * On disk allocation chain list for OCFS2. Note that this is
280 * contained inside ocfs2_dinode, so the offsets are relative to
281 * ocfs2_dinode.id2.i_chain.
282 */
283struct ocfs2_chain_list {
284/*00*/ __le16 cl_cpg; /* Clusters per Block Group */
285 __le16 cl_bpc; /* Bits per cluster */
286 __le16 cl_count; /* Total chains in this list */
287 __le16 cl_next_free_rec; /* Next unused chain slot */
288 __le64 cl_reserved1;
289/*10*/ struct ocfs2_chain_rec cl_recs[0]; /* Chain records */
290};
291
292/*
293 * On disk deallocation log for OCFS2. Note that this is
294 * contained inside ocfs2_dinode, so the offsets are relative to
295 * ocfs2_dinode.id2.i_dealloc.
296 */
297struct ocfs2_truncate_log {
298/*00*/ __le16 tl_count; /* Total records in this log */
299 __le16 tl_used; /* Number of records in use */
300 __le32 tl_reserved1;
301/*08*/ struct ocfs2_truncate_rec tl_recs[0]; /* Truncate records */
302};
303
304/*
305 * On disk extent block (indirect block) for OCFS2
306 */
307struct ocfs2_extent_block
308{
309/*00*/ __u8 h_signature[8]; /* Signature for verification */
310 __le64 h_reserved1;
311/*10*/ __le16 h_suballoc_slot; /* Slot suballocator this
312 extent_header belongs to */
313 __le16 h_suballoc_bit; /* Bit offset in suballocator
314 block group */
315 __le32 h_fs_generation; /* Must match super block */
316 __le64 h_blkno; /* Offset on disk, in blocks */
317/*20*/ __le64 h_reserved3;
318 __le64 h_next_leaf_blk; /* Offset on disk, in blocks,
319 of next leaf header pointing
320 to data */
321/*30*/ struct ocfs2_extent_list h_list; /* Extent record list */
322/* Actual on-disk size is one block */
323};
324
325/*
326 * On disk superblock for OCFS2
327 * Note that it is contained inside an ocfs2_dinode, so all offsets
328 * are relative to the start of ocfs2_dinode.id2.
329 */
330struct ocfs2_super_block {
331/*00*/ __le16 s_major_rev_level;
332 __le16 s_minor_rev_level;
333 __le16 s_mnt_count;
334 __le16 s_max_mnt_count;
335 __le16 s_state; /* File system state */
336 __le16 s_errors; /* Behaviour when detecting errors */
337 __le32 s_checkinterval; /* Max time between checks */
338/*10*/ __le64 s_lastcheck; /* Time of last check */
339 __le32 s_creator_os; /* OS */
340 __le32 s_feature_compat; /* Compatible feature set */
341/*20*/ __le32 s_feature_incompat; /* Incompatible feature set */
342 __le32 s_feature_ro_compat; /* Readonly-compatible feature set */
343 __le64 s_root_blkno; /* Offset, in blocks, of root directory
344 dinode */
345/*30*/ __le64 s_system_dir_blkno; /* Offset, in blocks, of system
346 directory dinode */
347 __le32 s_blocksize_bits; /* Blocksize for this fs */
348 __le32 s_clustersize_bits; /* Clustersize for this fs */
349/*40*/ __le16 s_max_slots; /* Max number of simultaneous mounts
350 before tunefs required */
351 __le16 s_reserved1;
352 __le32 s_reserved2;
353 __le64 s_first_cluster_group; /* Block offset of 1st cluster
354 * group header */
355/*50*/ __u8 s_label[OCFS2_MAX_VOL_LABEL_LEN]; /* Label for mounting, etc. */
356/*90*/ __u8 s_uuid[OCFS2_VOL_UUID_LEN]; /* 128-bit uuid */
357/*A0*/
358};
359
360/*
361 * Local allocation bitmap for OCFS2 slots
362 * Note that it exists inside an ocfs2_dinode, so all offsets are
363 * relative to the start of ocfs2_dinode.id2.
364 */
365struct ocfs2_local_alloc
366{
367/*00*/ __le32 la_bm_off; /* Starting bit offset in main bitmap */
368 __le16 la_size; /* Size of included bitmap, in bytes */
369 __le16 la_reserved1;
370 __le64 la_reserved2;
371/*10*/ __u8 la_bitmap[0];
372};
373
374/*
375 * On disk inode for OCFS2
376 */
377struct ocfs2_dinode {
/*
 * Fixed header (0x00-0xBF) followed by two unions.  The hex markers in the
 * left margin are byte offsets from the start of the dinode.
 */
378/*00*/ __u8 i_signature[8]; /* Signature for validation */
379 __le32 i_generation; /* Generation number */
380 __le16 i_suballoc_slot; /* Slot suballocator this inode
381 belongs to */
382 __le16 i_suballoc_bit; /* Bit offset in suballocator
383 block group */
384/*10*/ __le32 i_reserved0;
385 __le32 i_clusters; /* Cluster count */
386 __le32 i_uid; /* Owner UID */
387 __le32 i_gid; /* Owning GID */
388/*20*/ __le64 i_size; /* Size in bytes */
389 __le16 i_mode; /* File mode */
390 __le16 i_links_count; /* Links count */
391 __le32 i_flags; /* File flags */
392/*30*/ __le64 i_atime; /* Access time */
393 __le64 i_ctime; /* Creation time */
394/*40*/ __le64 i_mtime; /* Modification time */
395 __le64 i_dtime; /* Deletion time */
396/*50*/ __le64 i_blkno; /* Offset on disk, in blocks */
397 __le64 i_last_eb_blk; /* Pointer to last extent
398 block */
399/*60*/ __le32 i_fs_generation; /* Generation per fs-instance */
400 __le32 i_atime_nsec; /* presumably the nanosecond part of i_atime -- confirm */
401 __le32 i_ctime_nsec; /* presumably the nanosecond part of i_ctime -- confirm */
402 __le32 i_mtime_nsec; /* presumably the nanosecond part of i_mtime -- confirm */
403/*70*/ __le64 i_reserved1[9];
404/*B8*/ union {
405 __le64 i_pad1; /* Generic way to refer to this
406 64bit union */
407 struct {
408 __le64 i_rdev; /* Device number */
409 } dev1;
410 struct { /* Info for bitmap system
411 inodes */
412 __le32 i_used; /* Bits (ie, clusters) used */
413 __le32 i_total; /* Total bits (clusters)
414 available */
415 } bitmap1;
416 struct { /* Info for journal system
417 inodes */
418 __le32 ij_flags; /* Mounted, version, etc. */
419 __le32 ij_pad;
420 } journal1;
421 } id1; /* Inode type dependent 1 */
/*
 * Second type-dependent area.  It starts at 0xC0; the sizing helpers in this
 * header (ocfs2_fast_symlink_chars(), ocfs2_*_recs_per_inode(),
 * ocfs2_local_alloc_size()) treat everything from a member's trailing array
 * to the end of the block as usable space.
 */
422/*C0*/ union {
423 struct ocfs2_super_block i_super;
424 struct ocfs2_local_alloc i_lab;
425 struct ocfs2_chain_list i_chain;
426 struct ocfs2_extent_list i_list;
427 struct ocfs2_truncate_log i_dealloc;
428 __u8 i_symlink[0];
429 } id2;
430/* Actual on-disk size is one block */
431};
432
433/*
434 * On-disk directory entry structure for OCFS2
435 *
436 * Packed as this structure could be accessed unaligned on 64-bit platforms
437 */
438struct ocfs2_dir_entry {
439/*00*/ __le64 inode; /* Inode number */
440 __le16 rec_len; /* Directory entry length */
441 __u8 name_len; /* Name length */
/* file_type is filled in by ocfs2_set_de_type() from ocfs2_type_by_mode[]. */
442 __u8 file_type;
443/*0C*/ char name[OCFS2_MAX_FILENAME_LEN]; /* File name */
444/* Actual on-disk length specified by rec_len */
445} __attribute__ ((packed));
446
447/*
448 * On disk allocator group structure for OCFS2
449 */
450struct ocfs2_group_desc
451{
452/*00*/ __u8 bg_signature[8]; /* Signature for validation */
453 __le16 bg_size; /* Size of included bitmap in
454 bytes. */
455 __le16 bg_bits; /* Bits represented by this
456 group. */
457 __le16 bg_free_bits_count; /* Free bits count */
458 __le16 bg_chain; /* What chain I am in. */
459/*10*/ __le32 bg_generation;
460 __le32 bg_reserved1;
461 __le64 bg_next_group; /* Next group in my list, in
462 blocks */
463/*20*/ __le64 bg_parent_dinode; /* dinode which owns me, in
464 blocks */
465 __le64 bg_blkno; /* Offset on disk, in blocks */
466/*30*/ __le64 bg_reserved2[2];
/*
 * Zero-length trailing array: the bitmap occupies the rest of the block
 * (see ocfs2_group_bitmap_size()).
 */
467/*40*/ __u8 bg_bitmap[0];
468};
469
470#ifdef __KERNEL__
471static inline int ocfs2_fast_symlink_chars(struct super_block *sb)
472{
473 return sb->s_blocksize -
474 offsetof(struct ocfs2_dinode, id2.i_symlink);
475}
476
477static inline int ocfs2_extent_recs_per_inode(struct super_block *sb)
478{
479 int size;
480
481 size = sb->s_blocksize -
482 offsetof(struct ocfs2_dinode, id2.i_list.l_recs);
483
484 return size / sizeof(struct ocfs2_extent_rec);
485}
486
487static inline int ocfs2_chain_recs_per_inode(struct super_block *sb)
488{
489 int size;
490
491 size = sb->s_blocksize -
492 offsetof(struct ocfs2_dinode, id2.i_chain.cl_recs);
493
494 return size / sizeof(struct ocfs2_chain_rec);
495}
496
497static inline u16 ocfs2_extent_recs_per_eb(struct super_block *sb)
498{
499 int size;
500
501 size = sb->s_blocksize -
502 offsetof(struct ocfs2_extent_block, h_list.l_recs);
503
504 return size / sizeof(struct ocfs2_extent_rec);
505}
506
507static inline u16 ocfs2_local_alloc_size(struct super_block *sb)
508{
509 u16 size;
510
511 size = sb->s_blocksize -
512 offsetof(struct ocfs2_dinode, id2.i_lab.la_bitmap);
513
514 return size;
515}
516
517static inline int ocfs2_group_bitmap_size(struct super_block *sb)
518{
519 int size;
520
521 size = sb->s_blocksize -
522 offsetof(struct ocfs2_group_desc, bg_bitmap);
523
524 return size;
525}
526
527static inline int ocfs2_truncate_recs_per_inode(struct super_block *sb)
528{
529 int size;
530
531 size = sb->s_blocksize -
532 offsetof(struct ocfs2_dinode, id2.i_dealloc.tl_recs);
533
534 return size / sizeof(struct ocfs2_truncate_rec);
535}
536#else
537static inline int ocfs2_fast_symlink_chars(int blocksize)
538{
539 return blocksize - offsetof(struct ocfs2_dinode, id2.i_symlink);
540}
541
542static inline int ocfs2_extent_recs_per_inode(int blocksize)
543{
544 int size;
545
546 size = blocksize -
547 offsetof(struct ocfs2_dinode, id2.i_list.l_recs);
548
549 return size / sizeof(struct ocfs2_extent_rec);
550}
551
552static inline int ocfs2_chain_recs_per_inode(int blocksize)
553{
554 int size;
555
556 size = blocksize -
557 offsetof(struct ocfs2_dinode, id2.i_chain.cl_recs);
558
559 return size / sizeof(struct ocfs2_chain_rec);
560}
561
562static inline int ocfs2_extent_recs_per_eb(int blocksize)
563{
564 int size;
565
566 size = blocksize -
567 offsetof(struct ocfs2_extent_block, h_list.l_recs);
568
569 return size / sizeof(struct ocfs2_extent_rec);
570}
571
572static inline int ocfs2_local_alloc_size(int blocksize)
573{
574 int size;
575
576 size = blocksize -
577 offsetof(struct ocfs2_dinode, id2.i_lab.la_bitmap);
578
579 return size;
580}
581
582static inline int ocfs2_group_bitmap_size(int blocksize)
583{
584 int size;
585
586 size = blocksize -
587 offsetof(struct ocfs2_group_desc, bg_bitmap);
588
589 return size;
590}
591
592static inline int ocfs2_truncate_recs_per_inode(int blocksize)
593{
594 int size;
595
596 size = blocksize -
597 offsetof(struct ocfs2_dinode, id2.i_dealloc.tl_recs);
598
599 return size / sizeof(struct ocfs2_truncate_rec);
600}
601#endif /* __KERNEL__ */
602
603
604static inline int ocfs2_system_inode_is_global(int type)
605{
606 return ((type >= 0) &&
607 (type <= OCFS2_LAST_GLOBAL_SYSTEM_INODE));
608}
609
610static inline int ocfs2_sprintf_system_inode_name(char *buf, int len,
611 int type, int slot)
612{
613 int chars;
614
615 /*
616 * Global system inodes can only have one copy. Everything
617 * after OCFS2_LAST_GLOBAL_SYSTEM_INODE in the system inode
618 * list has a copy per slot.
619 */
620 if (type <= OCFS2_LAST_GLOBAL_SYSTEM_INODE)
621 chars = snprintf(buf, len,
622 ocfs2_system_inodes[type].si_name);
623 else
624 chars = snprintf(buf, len,
625 ocfs2_system_inodes[type].si_name,
626 slot);
627
628 return chars;
629}
630
631static inline void ocfs2_set_de_type(struct ocfs2_dir_entry *de,
632 umode_t mode)
633{
634 de->file_type = ocfs2_type_by_mode[(mode & S_IFMT)>>S_SHIFT];
635}
636
637#endif /* _OCFS2_FS_H */
638
diff --git a/fs/ocfs2/ocfs2_lockid.h b/fs/ocfs2/ocfs2_lockid.h
new file mode 100644
index 000000000000..7dd9e1e705b0
--- /dev/null
+++ b/fs/ocfs2/ocfs2_lockid.h
@@ -0,0 +1,73 @@
1/* -*- mode: c; c-basic-offset: 8; -*-
2 * vim: noexpandtab sw=8 ts=8 sts=0:
3 *
4 * ocfs2_lockid.h
5 *
6 * Defines OCFS2 lockid bits.
7 *
8 * Copyright (C) 2002, 2005 Oracle. All rights reserved.
9 *
10 * This program is free software; you can redistribute it and/or
11 * modify it under the terms of the GNU General Public
12 * License as published by the Free Software Foundation; either
13 * version 2 of the License, or (at your option) any later version.
14 *
15 * This program is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
18 * General Public License for more details.
19 *
20 * You should have received a copy of the GNU General Public
21 * License along with this program; if not, write to the
22 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
23 * Boston, MA 02111-1307, USA.
24 */
25
26#ifndef OCFS2_LOCKID_H
27#define OCFS2_LOCKID_H
28
29/* lock ids are made up in the following manner:
30 * name[0] --> type
31 * name[1-6] --> 6 pad characters, reserved for now
32 * name[7-22] --> block number, expressed in hex as 16 chars
33 * name[23-30] --> i_generation, expressed in hex 8 chars
34 * name[31] --> '\0' */
35#define OCFS2_LOCK_ID_MAX_LEN 32
36#define OCFS2_LOCK_ID_PAD "000000"
37
/* Lock types; the value doubles as an index, OCFS2_NUM_LOCK_TYPES is the count. */
enum ocfs2_lock_type {
	OCFS2_LOCK_TYPE_META = 0,
	OCFS2_LOCK_TYPE_DATA,
	OCFS2_LOCK_TYPE_SUPER,
	OCFS2_LOCK_TYPE_RENAME,
	OCFS2_LOCK_TYPE_RW,
	OCFS2_NUM_LOCK_TYPES
};

/*
 * Map a lock type to the single character used as name[0] of a lock id.
 * Returns '\0' for any value that is not one of the named lock types.
 */
static inline char ocfs2_lock_type_char(enum ocfs2_lock_type type)
{
	static const char type_chars[OCFS2_NUM_LOCK_TYPES] = {
		[OCFS2_LOCK_TYPE_META]   = 'M',
		[OCFS2_LOCK_TYPE_DATA]   = 'D',
		[OCFS2_LOCK_TYPE_SUPER]  = 'S',
		[OCFS2_LOCK_TYPE_RENAME] = 'R',
		[OCFS2_LOCK_TYPE_RW]     = 'W',
	};

	/* The unsigned cast also rejects negative values. */
	if ((unsigned int) type >= OCFS2_NUM_LOCK_TYPES)
		return '\0';

	return type_chars[type];
}
72
73#endif /* OCFS2_LOCKID_H */
diff --git a/fs/ocfs2/slot_map.c b/fs/ocfs2/slot_map.c
new file mode 100644
index 000000000000..871627961d6d
--- /dev/null
+++ b/fs/ocfs2/slot_map.c
@@ -0,0 +1,303 @@
1/* -*- mode: c; c-basic-offset: 8; -*-
2 * vim: noexpandtab sw=8 ts=8 sts=0:
3 *
4 * slot_map.c
5 *
6 *
7 *
8 * Copyright (C) 2002, 2004 Oracle. All rights reserved.
9 *
10 * This program is free software; you can redistribute it and/or
11 * modify it under the terms of the GNU General Public
12 * License as published by the Free Software Foundation; either
13 * version 2 of the License, or (at your option) any later version.
14 *
15 * This program is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
18 * General Public License for more details.
19 *
20 * You should have received a copy of the GNU General Public
21 * License along with this program; if not, write to the
22 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
23 * Boston, MA 02111-1307, USA.
24 */
25
26#include <linux/types.h>
27#include <linux/slab.h>
28#include <linux/highmem.h>
29#include <linux/smp_lock.h>
30
31#define MLOG_MASK_PREFIX ML_SUPER
32#include <cluster/masklog.h>
33
34#include "ocfs2.h"
35
36#include "dlmglue.h"
37#include "extent_map.h"
38#include "heartbeat.h"
39#include "inode.h"
40#include "slot_map.h"
41#include "super.h"
42#include "sysfile.h"
43
44#include "buffer_head_io.h"
45
46static s16 __ocfs2_node_num_to_slot(struct ocfs2_slot_info *si,
47 s16 global);
48static void __ocfs2_fill_slot(struct ocfs2_slot_info *si,
49 s16 slot_num,
50 s16 node_num);
51
52/* Use the slot information we've collected to create a map of mounted
53 * nodes. Should be holding an EX on super block. assumes slot info is
54 * up to date. Note that we call this *after* we find a slot, so our
55 * own node should be set in the map too... */
56void ocfs2_populate_mounted_map(struct ocfs2_super *osb)
57{
58 int i;
59 struct ocfs2_slot_info *si = osb->slot_info;
60
61 spin_lock(&si->si_lock);
62
63 for (i = 0; i < si->si_size; i++)
64 if (si->si_global_node_nums[i] != OCFS2_INVALID_SLOT)
65 ocfs2_node_map_set_bit(osb, &osb->mounted_map,
66 si->si_global_node_nums[i]);
67
68 spin_unlock(&si->si_lock);
69}
70
71/* post the slot information on disk into our slot_info struct. */
72void ocfs2_update_slot_info(struct ocfs2_slot_info *si)
73{
74 int i;
75 __le16 *disk_info;
76
77 /* we don't read the slot block here as ocfs2_super_lock
78 * should've made sure we have the most recent copy. */
79 spin_lock(&si->si_lock);
80 disk_info = (__le16 *) si->si_bh->b_data;
81
82 for (i = 0; i < si->si_size; i++)
83 si->si_global_node_nums[i] = le16_to_cpu(disk_info[i]);
84
85 spin_unlock(&si->si_lock);
86}
87
88/* post the our slot info stuff into it's destination bh and write it
89 * out. */
90int ocfs2_update_disk_slots(struct ocfs2_super *osb,
91 struct ocfs2_slot_info *si)
92{
93 int status, i;
94 __le16 *disk_info = (__le16 *) si->si_bh->b_data;
95
96 spin_lock(&si->si_lock);
97 for (i = 0; i < si->si_size; i++)
98 disk_info[i] = cpu_to_le16(si->si_global_node_nums[i]);
99 spin_unlock(&si->si_lock);
100
101 status = ocfs2_write_block(osb, si->si_bh, si->si_inode);
102 if (status < 0)
103 mlog_errno(status);
104
105 return status;
106}
107
108/* try to find global node in the slot info. Returns
109 * OCFS2_INVALID_SLOT if nothing is found. */
110static s16 __ocfs2_node_num_to_slot(struct ocfs2_slot_info *si,
111 s16 global)
112{
113 int i;
114 s16 ret = OCFS2_INVALID_SLOT;
115
116 for(i = 0; i < si->si_num_slots; i++) {
117 if (global == si->si_global_node_nums[i]) {
118 ret = (s16) i;
119 break;
120 }
121 }
122 return ret;
123}
124
125static s16 __ocfs2_find_empty_slot(struct ocfs2_slot_info *si)
126{
127 int i;
128 s16 ret = OCFS2_INVALID_SLOT;
129
130 for(i = 0; i < si->si_num_slots; i++) {
131 if (OCFS2_INVALID_SLOT == si->si_global_node_nums[i]) {
132 ret = (s16) i;
133 break;
134 }
135 }
136 return ret;
137}
138
139s16 ocfs2_node_num_to_slot(struct ocfs2_slot_info *si,
140 s16 global)
141{
142 s16 ret;
143
144 spin_lock(&si->si_lock);
145 ret = __ocfs2_node_num_to_slot(si, global);
146 spin_unlock(&si->si_lock);
147 return ret;
148}
149
150static void __ocfs2_fill_slot(struct ocfs2_slot_info *si,
151 s16 slot_num,
152 s16 node_num)
153{
154 BUG_ON(slot_num == OCFS2_INVALID_SLOT);
155 BUG_ON(slot_num >= si->si_num_slots);
156 BUG_ON((node_num != O2NM_INVALID_NODE_NUM) &&
157 (node_num >= O2NM_MAX_NODES));
158
159 si->si_global_node_nums[slot_num] = node_num;
160}
161
162void ocfs2_clear_slot(struct ocfs2_slot_info *si,
163 s16 slot_num)
164{
165 spin_lock(&si->si_lock);
166 __ocfs2_fill_slot(si, slot_num, OCFS2_INVALID_SLOT);
167 spin_unlock(&si->si_lock);
168}
169
170int ocfs2_init_slot_info(struct ocfs2_super *osb)
171{
172 int status, i;
173 u64 blkno;
174 struct inode *inode = NULL;
175 struct buffer_head *bh = NULL;
176 struct ocfs2_slot_info *si;
177
178 si = kcalloc(1, sizeof(struct ocfs2_slot_info), GFP_KERNEL);
179 if (!si) {
180 status = -ENOMEM;
181 mlog_errno(status);
182 goto bail;
183 }
184
185 spin_lock_init(&si->si_lock);
186 si->si_num_slots = osb->max_slots;
187 si->si_size = OCFS2_MAX_SLOTS;
188
189 for(i = 0; i < si->si_num_slots; i++)
190 si->si_global_node_nums[i] = OCFS2_INVALID_SLOT;
191
192 inode = ocfs2_get_system_file_inode(osb, SLOT_MAP_SYSTEM_INODE,
193 OCFS2_INVALID_SLOT);
194 if (!inode) {
195 status = -EINVAL;
196 mlog_errno(status);
197 goto bail;
198 }
199
200 status = ocfs2_extent_map_get_blocks(inode, 0ULL, 1, &blkno, NULL);
201 if (status < 0) {
202 mlog_errno(status);
203 goto bail;
204 }
205
206 status = ocfs2_read_block(osb, blkno, &bh, 0, inode);
207 if (status < 0) {
208 mlog_errno(status);
209 goto bail;
210 }
211
212 si->si_inode = inode;
213 si->si_bh = bh;
214 osb->slot_info = si;
215bail:
216 if (status < 0 && si)
217 ocfs2_free_slot_info(si);
218
219 return status;
220}
221
222void ocfs2_free_slot_info(struct ocfs2_slot_info *si)
223{
224 if (si->si_inode)
225 iput(si->si_inode);
226 if (si->si_bh)
227 brelse(si->si_bh);
228 kfree(si);
229}
230
231int ocfs2_find_slot(struct ocfs2_super *osb)
232{
233 int status;
234 s16 slot;
235 struct ocfs2_slot_info *si;
236
237 mlog_entry_void();
238
239 si = osb->slot_info;
240
241 ocfs2_update_slot_info(si);
242
243 spin_lock(&si->si_lock);
244 /* search for ourselves first and take the slot if it already
245 * exists. Perhaps we need to mark this in a variable for our
246 * own journal recovery? Possibly not, though we certainly
247 * need to warn to the user */
248 slot = __ocfs2_node_num_to_slot(si, osb->node_num);
249 if (slot == OCFS2_INVALID_SLOT) {
250 /* if no slot yet, then just take 1st available
251 * one. */
252 slot = __ocfs2_find_empty_slot(si);
253 if (slot == OCFS2_INVALID_SLOT) {
254 spin_unlock(&si->si_lock);
255 mlog(ML_ERROR, "no free slots available!\n");
256 status = -EINVAL;
257 goto bail;
258 }
259 } else
260 mlog(ML_NOTICE, "slot %d is already allocated to this node!\n",
261 slot);
262
263 __ocfs2_fill_slot(si, slot, osb->node_num);
264 osb->slot_num = slot;
265 spin_unlock(&si->si_lock);
266
267 mlog(ML_NOTICE, "taking node slot %d\n", osb->slot_num);
268
269 status = ocfs2_update_disk_slots(osb, si);
270 if (status < 0)
271 mlog_errno(status);
272
273bail:
274 mlog_exit(status);
275 return status;
276}
277
278void ocfs2_put_slot(struct ocfs2_super *osb)
279{
280 int status;
281 struct ocfs2_slot_info *si = osb->slot_info;
282
283 if (!si)
284 return;
285
286 ocfs2_update_slot_info(si);
287
288 spin_lock(&si->si_lock);
289 __ocfs2_fill_slot(si, osb->slot_num, OCFS2_INVALID_SLOT);
290 osb->slot_num = OCFS2_INVALID_SLOT;
291 spin_unlock(&si->si_lock);
292
293 status = ocfs2_update_disk_slots(osb, si);
294 if (status < 0) {
295 mlog_errno(status);
296 goto bail;
297 }
298
299bail:
300 osb->slot_info = NULL;
301 ocfs2_free_slot_info(si);
302}
303
diff --git a/fs/ocfs2/slot_map.h b/fs/ocfs2/slot_map.h
new file mode 100644
index 000000000000..d8c8ceed031b
--- /dev/null
+++ b/fs/ocfs2/slot_map.h
@@ -0,0 +1,66 @@
1/* -*- mode: c; c-basic-offset: 8; -*-
2 * vim: noexpandtab sw=8 ts=8 sts=0:
3 *
4 * slot_map.h
5 *
6 * In-memory slot map (which node number occupies which slot) and its API.
7 *
8 * Copyright (C) 2002, 2004 Oracle. All rights reserved.
9 *
10 * This program is free software; you can redistribute it and/or
11 * modify it under the terms of the GNU General Public
12 * License as published by the Free Software Foundation; either
13 * version 2 of the License, or (at your option) any later version.
14 *
15 * This program is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
18 * General Public License for more details.
19 *
20 * You should have received a copy of the GNU General Public
21 * License along with this program; if not, write to the
22 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
23 * Boston, MA 02111-1307, USA.
24 */
25
26
27#ifndef SLOTMAP_H
28#define SLOTMAP_H
29
30struct ocfs2_slot_info {
/* Taken by the slot_map.c routines around accesses to si_global_node_nums[]. */
31 spinlock_t si_lock;
32
/* Slot map system inode; its reference is dropped in ocfs2_free_slot_info(). */
33 struct inode *si_inode;
/* Buffer holding the on-disk slot block (an array of __le16 node numbers). */
34 struct buffer_head *si_bh;
/* Set from osb->max_slots; bounds slot searches and __ocfs2_fill_slot(). */
35 unsigned int si_num_slots;
/* Set to OCFS2_MAX_SLOTS; number of entries copied to and from disk. */
36 unsigned int si_size;
/* Node number per slot; OCFS2_INVALID_SLOT marks an empty slot. */
37 s16 si_global_node_nums[OCFS2_MAX_SLOTS];
38};
39
40int ocfs2_init_slot_info(struct ocfs2_super *osb);
41void ocfs2_free_slot_info(struct ocfs2_slot_info *si);
42
43int ocfs2_find_slot(struct ocfs2_super *osb);
44void ocfs2_put_slot(struct ocfs2_super *osb);
45
46void ocfs2_update_slot_info(struct ocfs2_slot_info *si);
47int ocfs2_update_disk_slots(struct ocfs2_super *osb,
48 struct ocfs2_slot_info *si);
49
50s16 ocfs2_node_num_to_slot(struct ocfs2_slot_info *si,
51 s16 global);
52void ocfs2_clear_slot(struct ocfs2_slot_info *si,
53 s16 slot_num);
54
55void ocfs2_populate_mounted_map(struct ocfs2_super *osb);
56
57static inline int ocfs2_is_empty_slot(struct ocfs2_slot_info *si,
58 int slot_num)
59{
60 BUG_ON(slot_num == OCFS2_INVALID_SLOT);
61 assert_spin_locked(&si->si_lock);
62
63 return si->si_global_node_nums[slot_num] == OCFS2_INVALID_SLOT;
64}
65
66#endif
diff --git a/fs/ocfs2/suballoc.c b/fs/ocfs2/suballoc.c
new file mode 100644
index 000000000000..c46c164aefbb
--- /dev/null
+++ b/fs/ocfs2/suballoc.c
@@ -0,0 +1,1651 @@
1/* -*- mode: c; c-basic-offset: 8; -*-
2 * vim: noexpandtab sw=8 ts=8 sts=0:
3 *
4 * suballoc.c
5 *
6 * metadata alloc and free
7 * Inspired by ext3 block groups.
8 *
9 * Copyright (C) 2002, 2004 Oracle. All rights reserved.
10 *
11 * This program is free software; you can redistribute it and/or
12 * modify it under the terms of the GNU General Public
13 * License as published by the Free Software Foundation; either
14 * version 2 of the License, or (at your option) any later version.
15 *
16 * This program is distributed in the hope that it will be useful,
17 * but WITHOUT ANY WARRANTY; without even the implied warranty of
18 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
19 * General Public License for more details.
20 *
21 * You should have received a copy of the GNU General Public
22 * License along with this program; if not, write to the
23 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
23 * Boston, MA 02111-1307, USA.
25 */
26
27#include <linux/fs.h>
28#include <linux/types.h>
29#include <linux/slab.h>
30#include <linux/highmem.h>
31
32#define MLOG_MASK_PREFIX ML_DISK_ALLOC
33#include <cluster/masklog.h>
34
35#include "ocfs2.h"
36
37#include "alloc.h"
38#include "dlmglue.h"
39#include "inode.h"
40#include "journal.h"
41#include "localalloc.h"
42#include "suballoc.h"
43#include "super.h"
44#include "sysfile.h"
45#include "uptodate.h"
46
47#include "buffer_head_io.h"
48
49static inline void ocfs2_debug_bg(struct ocfs2_group_desc *bg);
50static inline void ocfs2_debug_suballoc_inode(struct ocfs2_dinode *fe);
51static inline u16 ocfs2_find_victim_chain(struct ocfs2_chain_list *cl);
52static int ocfs2_block_group_fill(struct ocfs2_journal_handle *handle,
53 struct inode *alloc_inode,
54 struct buffer_head *bg_bh,
55 u64 group_blkno,
56 u16 my_chain,
57 struct ocfs2_chain_list *cl);
58static int ocfs2_block_group_alloc(struct ocfs2_super *osb,
59 struct inode *alloc_inode,
60 struct buffer_head *bh);
61
62static int ocfs2_reserve_suballoc_bits(struct ocfs2_super *osb,
63 struct ocfs2_alloc_context *ac);
64
65static int ocfs2_cluster_group_search(struct inode *inode,
66 struct buffer_head *group_bh,
67 u32 bits_wanted, u32 min_bits,
68 u16 *bit_off, u16 *bits_found);
69static int ocfs2_block_group_search(struct inode *inode,
70 struct buffer_head *group_bh,
71 u32 bits_wanted, u32 min_bits,
72 u16 *bit_off, u16 *bits_found);
73static int ocfs2_search_chain(struct ocfs2_alloc_context *ac,
74 u32 bits_wanted,
75 u32 min_bits,
76 u16 *bit_off,
77 unsigned int *num_bits,
78 u64 *bg_blkno);
79static int ocfs2_claim_suballoc_bits(struct ocfs2_super *osb,
80 struct ocfs2_alloc_context *ac,
81 u32 bits_wanted,
82 u32 min_bits,
83 u16 *bit_off,
84 unsigned int *num_bits,
85 u64 *bg_blkno);
86static int ocfs2_test_bg_bit_allocatable(struct buffer_head *bg_bh,
87 int nr);
88static int ocfs2_block_group_find_clear_bits(struct ocfs2_super *osb,
89 struct buffer_head *bg_bh,
90 unsigned int bits_wanted,
91 u16 *bit_off,
92 u16 *bits_found);
93static inline int ocfs2_block_group_set_bits(struct ocfs2_journal_handle *handle,
94 struct inode *alloc_inode,
95 struct ocfs2_group_desc *bg,
96 struct buffer_head *group_bh,
97 unsigned int bit_off,
98 unsigned int num_bits);
99static inline int ocfs2_block_group_clear_bits(struct ocfs2_journal_handle *handle,
100 struct inode *alloc_inode,
101 struct ocfs2_group_desc *bg,
102 struct buffer_head *group_bh,
103 unsigned int bit_off,
104 unsigned int num_bits);
105
106static int ocfs2_relink_block_group(struct ocfs2_journal_handle *handle,
107 struct inode *alloc_inode,
108 struct buffer_head *fe_bh,
109 struct buffer_head *bg_bh,
110 struct buffer_head *prev_bg_bh,
111 u16 chain);
112static inline int ocfs2_block_group_reasonably_empty(struct ocfs2_group_desc *bg,
113 u32 wanted);
114static int ocfs2_free_suballoc_bits(struct ocfs2_journal_handle *handle,
115 struct inode *alloc_inode,
116 struct buffer_head *alloc_bh,
117 unsigned int start_bit,
118 u64 bg_blkno,
119 unsigned int count);
120static inline u64 ocfs2_which_suballoc_group(u64 block,
121 unsigned int bit);
122static inline u32 ocfs2_desc_bitmap_to_cluster_off(struct inode *inode,
123 u64 bg_blkno,
124 u16 bg_bit_off);
125static inline u64 ocfs2_which_cluster_group(struct inode *inode,
126 u32 cluster);
127static inline void ocfs2_block_to_cluster_group(struct inode *inode,
128 u64 data_blkno,
129 u64 *bg_blkno,
130 u16 *bg_bit_off);
131
132void ocfs2_free_alloc_context(struct ocfs2_alloc_context *ac)
133{
134 if (ac->ac_inode)
135 iput(ac->ac_inode);
136 if (ac->ac_bh)
137 brelse(ac->ac_bh);
138 kfree(ac);
139}
140
141static u32 ocfs2_bits_per_group(struct ocfs2_chain_list *cl)
142{
143 return (u32)le16_to_cpu(cl->cl_cpg) * (u32)le16_to_cpu(cl->cl_bpc);
144}
145
146static int ocfs2_block_group_fill(struct ocfs2_journal_handle *handle,
147 struct inode *alloc_inode,
148 struct buffer_head *bg_bh,
149 u64 group_blkno,
150 u16 my_chain,
151 struct ocfs2_chain_list *cl)
152{
153 int status = 0;
154 struct ocfs2_group_desc *bg = (struct ocfs2_group_desc *) bg_bh->b_data;
155 struct super_block * sb = alloc_inode->i_sb;
156
157 mlog_entry_void();
158
159 if (((unsigned long long) bg_bh->b_blocknr) != group_blkno) {
160 ocfs2_error(alloc_inode->i_sb, "group block (%"MLFu64") "
161 "!= b_blocknr (%llu)", group_blkno,
162 (unsigned long long) bg_bh->b_blocknr);
163 status = -EIO;
164 goto bail;
165 }
166
167 status = ocfs2_journal_access(handle,
168 alloc_inode,
169 bg_bh,
170 OCFS2_JOURNAL_ACCESS_CREATE);
171 if (status < 0) {
172 mlog_errno(status);
173 goto bail;
174 }
175
176 memset(bg, 0, sb->s_blocksize);
177 strcpy(bg->bg_signature, OCFS2_GROUP_DESC_SIGNATURE);
178 bg->bg_generation = cpu_to_le32(OCFS2_SB(sb)->fs_generation);
179 bg->bg_size = cpu_to_le16(ocfs2_group_bitmap_size(sb));
180 bg->bg_bits = cpu_to_le16(ocfs2_bits_per_group(cl));
181 bg->bg_chain = cpu_to_le16(my_chain);
182 bg->bg_next_group = cl->cl_recs[my_chain].c_blkno;
183 bg->bg_parent_dinode = cpu_to_le64(OCFS2_I(alloc_inode)->ip_blkno);
184 bg->bg_blkno = cpu_to_le64(group_blkno);
185 /* set the 1st bit in the bitmap to account for the descriptor block */
186 ocfs2_set_bit(0, (unsigned long *)bg->bg_bitmap);
187 bg->bg_free_bits_count = cpu_to_le16(le16_to_cpu(bg->bg_bits) - 1);
188
189 status = ocfs2_journal_dirty(handle, bg_bh);
190 if (status < 0)
191 mlog_errno(status);
192
193 /* There is no need to zero out or otherwise initialize the
194 * other blocks in a group - All valid FS metadata in a block
195 * group stores the superblock fs_generation value at
196 * allocation time. */
197
198bail:
199 mlog_exit(status);
200 return status;
201}
202
203static inline u16 ocfs2_find_smallest_chain(struct ocfs2_chain_list *cl)
204{
205 u16 curr, best;
206
207 best = curr = 0;
208 while (curr < le16_to_cpu(cl->cl_count)) {
209 if (le32_to_cpu(cl->cl_recs[best].c_total) >
210 le32_to_cpu(cl->cl_recs[curr].c_total))
211 best = curr;
212 curr++;
213 }
214 return best;
215}
216
217/*
218 * We expect the block group allocator to already be locked.
219 */
/*
 * Grow the suballocator described by the dinode in bh by one block group.
 *
 * Reserves and claims cl_cpg clusters, initializes a group descriptor in
 * the first block of the claimed range, makes it the head of the chain with
 * the smallest c_total, and updates the allocator dinode's counters, size
 * and the in-memory inode to match. Must not be called for the cluster
 * bitmap (BUG_ON). Returns 0 or a negative errno; -ENOSPC from the cluster
 * reservation or claim is passed through without being logged.
 */
220static int ocfs2_block_group_alloc(struct ocfs2_super *osb,
221 struct inode *alloc_inode,
222 struct buffer_head *bh)
223{
224 int status, credits;
225 struct ocfs2_dinode *fe = (struct ocfs2_dinode *) bh->b_data;
226 struct ocfs2_chain_list *cl;
227 struct ocfs2_alloc_context *ac = NULL;
228 struct ocfs2_journal_handle *handle = NULL;
229 u32 bit_off, num_bits;
230 u16 alloc_rec;
231 u64 bg_blkno;
232 struct buffer_head *bg_bh = NULL;
233 struct ocfs2_group_desc *bg;
234
235 BUG_ON(ocfs2_is_cluster_bitmap(alloc_inode));
236
237 mlog_entry_void();
238
239 handle = ocfs2_alloc_handle(osb);
240 if (!handle) {
241 status = -ENOMEM;
242 mlog_errno(status);
243 goto bail;
244 }
245
/* The reservation is made on the handle before the transaction is started. */
246 cl = &fe->id2.i_chain;
247 status = ocfs2_reserve_clusters(osb,
248 handle,
249 le16_to_cpu(cl->cl_cpg),
250 &ac);
251 if (status < 0) {
252 if (status != -ENOSPC)
253 mlog_errno(status);
254 goto bail;
255 }
256
257 credits = ocfs2_calc_group_alloc_credits(osb->sb,
258 le16_to_cpu(cl->cl_cpg));
259 handle = ocfs2_start_trans(osb, handle, credits);
260 if (IS_ERR(handle)) {
261 status = PTR_ERR(handle);
/* Clear handle so the bail path does not try to commit an ERR_PTR. */
262 handle = NULL;
263 mlog_errno(status);
264 goto bail;
265 }
266
267 status = ocfs2_claim_clusters(osb,
268 handle,
269 ac,
270 le16_to_cpu(cl->cl_cpg),
271 &bit_off,
272 &num_bits);
273 if (status < 0) {
274 if (status != -ENOSPC)
275 mlog_errno(status);
276 goto bail;
277 }
278
279 alloc_rec = ocfs2_find_smallest_chain(cl);
280
281 /* setup the group */
282 bg_blkno = ocfs2_clusters_to_blocks(osb->sb, bit_off);
283 mlog(0, "new descriptor, record %u, at block %"MLFu64"\n",
284 alloc_rec, bg_blkno);
285
/* The descriptor block is obtained with sb_getblk() rather than read from
 * disk; ocfs2_block_group_fill() zeroes and rewrites it entirely. */
286 bg_bh = sb_getblk(osb->sb, bg_blkno);
287 if (!bg_bh) {
288 status = -EIO;
289 mlog_errno(status);
290 goto bail;
291 }
292 ocfs2_set_new_buffer_uptodate(alloc_inode, bg_bh);
293
294 status = ocfs2_block_group_fill(handle,
295 alloc_inode,
296 bg_bh,
297 bg_blkno,
298 alloc_rec,
299 cl);
300 if (status < 0) {
301 mlog_errno(status);
302 goto bail;
303 }
304
305 bg = (struct ocfs2_group_desc *) bg_bh->b_data;
306
/* Journal access to the allocator dinode must precede the updates below. */
307 status = ocfs2_journal_access(handle, alloc_inode,
308 bh, OCFS2_JOURNAL_ACCESS_WRITE);
309 if (status < 0) {
310 mlog_errno(status);
311 goto bail;
312 }
313
/* Account for the new group in its chain record and make it the chain head
 * (the group's bg_next_group already points at the previous head). */
314 le32_add_cpu(&cl->cl_recs[alloc_rec].c_free,
315 le16_to_cpu(bg->bg_free_bits_count));
316 le32_add_cpu(&cl->cl_recs[alloc_rec].c_total, le16_to_cpu(bg->bg_bits));
317 cl->cl_recs[alloc_rec].c_blkno = cpu_to_le64(bg_blkno);
318 if (le16_to_cpu(cl->cl_next_free_rec) < le16_to_cpu(cl->cl_count))
319 le16_add_cpu(&cl->cl_next_free_rec, 1);
320
321 le32_add_cpu(&fe->id1.bitmap1.i_used, le16_to_cpu(bg->bg_bits) -
322 le16_to_cpu(bg->bg_free_bits_count));
323 le32_add_cpu(&fe->id1.bitmap1.i_total, le16_to_cpu(bg->bg_bits));
324 le32_add_cpu(&fe->i_clusters, le16_to_cpu(cl->cl_cpg));
325
326 status = ocfs2_journal_dirty(handle, bh);
327 if (status < 0) {
328 mlog_errno(status);
329 goto bail;
330 }
331
/* Mirror the new cluster count and size into the in-memory inode. */
332 spin_lock(&OCFS2_I(alloc_inode)->ip_lock);
333 OCFS2_I(alloc_inode)->ip_clusters = le32_to_cpu(fe->i_clusters);
334 fe->i_size = cpu_to_le64(ocfs2_clusters_to_bytes(alloc_inode->i_sb,
335 le32_to_cpu(fe->i_clusters)));
336 spin_unlock(&OCFS2_I(alloc_inode)->ip_lock);
337 i_size_write(alloc_inode, le64_to_cpu(fe->i_size));
338 alloc_inode->i_blocks =
339 ocfs2_align_bytes_to_sectors(i_size_read(alloc_inode));
340
341 status = 0;
342bail:
/* Common exit: the handle is committed whenever it is non-NULL, including
 * on error paths taken before the transaction was started. */
343 if (handle)
344 ocfs2_commit_trans(handle);
345
346 if (ac)
347 ocfs2_free_alloc_context(ac);
348
349 if (bg_bh)
350 brelse(bg_bh);
351
352 mlog_exit(status);
353 return status;
354}
355
356static int ocfs2_reserve_suballoc_bits(struct ocfs2_super *osb,
357 struct ocfs2_alloc_context *ac)
358{
359 int status;
360 u32 bits_wanted = ac->ac_bits_wanted;
361 struct inode *alloc_inode = ac->ac_inode;
362 struct buffer_head *bh = NULL;
363 struct ocfs2_journal_handle *handle = ac->ac_handle;
364 struct ocfs2_dinode *fe;
365 u32 free_bits;
366
367 mlog_entry_void();
368
369 BUG_ON(handle->flags & OCFS2_HANDLE_STARTED);
370
371 ocfs2_handle_add_inode(handle, alloc_inode);
372 status = ocfs2_meta_lock(alloc_inode, handle, &bh, 1);
373 if (status < 0) {
374 mlog_errno(status);
375 goto bail;
376 }
377
378 fe = (struct ocfs2_dinode *) bh->b_data;
379 if (!OCFS2_IS_VALID_DINODE(fe)) {
380 OCFS2_RO_ON_INVALID_DINODE(alloc_inode->i_sb, fe);
381 status = -EIO;
382 goto bail;
383 }
384 if (!(fe->i_flags & cpu_to_le32(OCFS2_CHAIN_FL))) {
385 ocfs2_error(alloc_inode->i_sb, "Invalid chain allocator "
386 "# %"MLFu64, le64_to_cpu(fe->i_blkno));
387 status = -EIO;
388 goto bail;
389 }
390
391 free_bits = le32_to_cpu(fe->id1.bitmap1.i_total) -
392 le32_to_cpu(fe->id1.bitmap1.i_used);
393
394 if (bits_wanted > free_bits) {
395 /* cluster bitmap never grows */
396 if (ocfs2_is_cluster_bitmap(alloc_inode)) {
397 mlog(0, "Disk Full: wanted=%u, free_bits=%u\n",
398 bits_wanted, free_bits);
399 status = -ENOSPC;
400 goto bail;
401 }
402
403 status = ocfs2_block_group_alloc(osb, alloc_inode, bh);
404 if (status < 0) {
405 if (status != -ENOSPC)
406 mlog_errno(status);
407 goto bail;
408 }
409 atomic_inc(&osb->alloc_stats.bg_extends);
410
411 /* You should never ask for this much metadata */
412 BUG_ON(bits_wanted >
413 (le32_to_cpu(fe->id1.bitmap1.i_total)
414 - le32_to_cpu(fe->id1.bitmap1.i_used)));
415 }
416
417 get_bh(bh);
418 ac->ac_bh = bh;
419bail:
420 if (bh)
421 brelse(bh);
422
423 mlog_exit(status);
424 return status;
425}
426
427int ocfs2_reserve_new_metadata(struct ocfs2_super *osb,
428 struct ocfs2_journal_handle *handle,
429 struct ocfs2_dinode *fe,
430 struct ocfs2_alloc_context **ac)
431{
432 int status;
433 struct inode *alloc_inode = NULL;
434
435 *ac = kcalloc(1, sizeof(struct ocfs2_alloc_context), GFP_KERNEL);
436 if (!(*ac)) {
437 status = -ENOMEM;
438 mlog_errno(status);
439 goto bail;
440 }
441
442 (*ac)->ac_bits_wanted = ocfs2_extend_meta_needed(fe);
443 (*ac)->ac_handle = handle;
444 (*ac)->ac_which = OCFS2_AC_USE_META;
445
446#ifndef OCFS2_USE_ALL_METADATA_SUBALLOCATORS
447 alloc_inode = ocfs2_get_system_file_inode(osb,
448 EXTENT_ALLOC_SYSTEM_INODE,
449 0);
450#else
451 alloc_inode = ocfs2_get_system_file_inode(osb,
452 EXTENT_ALLOC_SYSTEM_INODE,
453 osb->slot_num);
454#endif
455 if (!alloc_inode) {
456 status = -ENOMEM;
457 mlog_errno(status);
458 goto bail;
459 }
460
461 (*ac)->ac_inode = igrab(alloc_inode);
462 (*ac)->ac_group_search = ocfs2_block_group_search;
463
464 status = ocfs2_reserve_suballoc_bits(osb, (*ac));
465 if (status < 0) {
466 if (status != -ENOSPC)
467 mlog_errno(status);
468 goto bail;
469 }
470
471 status = 0;
472bail:
473 if ((status < 0) && *ac) {
474 ocfs2_free_alloc_context(*ac);
475 *ac = NULL;
476 }
477
478 if (alloc_inode)
479 iput(alloc_inode);
480
481 mlog_exit(status);
482 return status;
483}
484
485int ocfs2_reserve_new_inode(struct ocfs2_super *osb,
486 struct ocfs2_journal_handle *handle,
487 struct ocfs2_alloc_context **ac)
488{
489 int status;
490 struct inode *alloc_inode = NULL;
491
492 *ac = kcalloc(1, sizeof(struct ocfs2_alloc_context), GFP_KERNEL);
493 if (!(*ac)) {
494 status = -ENOMEM;
495 mlog_errno(status);
496 goto bail;
497 }
498
499 (*ac)->ac_bits_wanted = 1;
500 (*ac)->ac_handle = handle;
501 (*ac)->ac_which = OCFS2_AC_USE_INODE;
502
503 alloc_inode = ocfs2_get_system_file_inode(osb,
504 INODE_ALLOC_SYSTEM_INODE,
505 osb->slot_num);
506 if (!alloc_inode) {
507 status = -ENOMEM;
508 mlog_errno(status);
509 goto bail;
510 }
511
512 (*ac)->ac_inode = igrab(alloc_inode);
513 (*ac)->ac_group_search = ocfs2_block_group_search;
514
515 status = ocfs2_reserve_suballoc_bits(osb, *ac);
516 if (status < 0) {
517 if (status != -ENOSPC)
518 mlog_errno(status);
519 goto bail;
520 }
521
522 status = 0;
523bail:
524 if ((status < 0) && *ac) {
525 ocfs2_free_alloc_context(*ac);
526 *ac = NULL;
527 }
528
529 if (alloc_inode)
530 iput(alloc_inode);
531
532 mlog_exit(status);
533 return status;
534}
535
536/* local alloc code has to do the same thing, so rather than do this
537 * twice.. */
538int ocfs2_reserve_cluster_bitmap_bits(struct ocfs2_super *osb,
539 struct ocfs2_alloc_context *ac)
540{
541 int status;
542
543 ac->ac_inode = ocfs2_get_system_file_inode(osb,
544 GLOBAL_BITMAP_SYSTEM_INODE,
545 OCFS2_INVALID_SLOT);
546 if (!ac->ac_inode) {
547 status = -EINVAL;
548 mlog(ML_ERROR, "Could not get bitmap inode!\n");
549 goto bail;
550 }
551 ac->ac_which = OCFS2_AC_USE_MAIN;
552 ac->ac_group_search = ocfs2_cluster_group_search;
553
554 status = ocfs2_reserve_suballoc_bits(osb, ac);
555 if (status < 0 && status != -ENOSPC)
556 mlog_errno(status);
557bail:
558 return status;
559}
560
561/* Callers don't need to care which bitmap (local alloc or main) to
562 * use so we figure it out for them, but unfortunately this clutters
563 * things a bit. */
564int ocfs2_reserve_clusters(struct ocfs2_super *osb,
565 struct ocfs2_journal_handle *handle,
566 u32 bits_wanted,
567 struct ocfs2_alloc_context **ac)
568{
569 int status;
570
571 mlog_entry_void();
572
573 BUG_ON(!handle);
574
575 *ac = kcalloc(1, sizeof(struct ocfs2_alloc_context), GFP_KERNEL);
576 if (!(*ac)) {
577 status = -ENOMEM;
578 mlog_errno(status);
579 goto bail;
580 }
581
582 (*ac)->ac_bits_wanted = bits_wanted;
583 (*ac)->ac_handle = handle;
584
585 status = -ENOSPC;
586 if (ocfs2_alloc_should_use_local(osb, bits_wanted)) {
587 status = ocfs2_reserve_local_alloc_bits(osb,
588 handle,
589 bits_wanted,
590 *ac);
591 if ((status < 0) && (status != -ENOSPC)) {
592 mlog_errno(status);
593 goto bail;
594 } else if (status == -ENOSPC) {
595 /* reserve_local_bits will return enospc with
596 * the local alloc inode still locked, so we
597 * can change this safely here. */
598 mlog(0, "Disabling local alloc\n");
599 /* We set to OCFS2_LA_DISABLED so that umount
600 * can clean up what's left of the local
601 * allocation */
602 osb->local_alloc_state = OCFS2_LA_DISABLED;
603 }
604 }
605
606 if (status == -ENOSPC) {
607 status = ocfs2_reserve_cluster_bitmap_bits(osb, *ac);
608 if (status < 0) {
609 if (status != -ENOSPC)
610 mlog_errno(status);
611 goto bail;
612 }
613 }
614
615 status = 0;
616bail:
617 if ((status < 0) && *ac) {
618 ocfs2_free_alloc_context(*ac);
619 *ac = NULL;
620 }
621
622 mlog_exit(status);
623 return status;
624}
625
626/*
627 * More or less lifted from ext3. I'll leave their description below:
628 *
629 * "For ext3 allocations, we must not reuse any blocks which are
630 * allocated in the bitmap buffer's "last committed data" copy. This
631 * prevents deletes from freeing up the page for reuse until we have
632 * committed the delete transaction.
633 *
634 * If we didn't do this, then deleting something and reallocating it as
635 * data would allow the old block to be overwritten before the
636 * transaction committed (because we force data to disk before commit).
637 * This would lead to corruption if we crashed between overwriting the
638 * data and committing the delete.
639 *
640 * @@@ We may want to make this allocation behaviour conditional on
641 * data-writes at some point, and disable it for metadata allocations or
642 * sync-data inodes."
643 *
644 * Note: OCFS2 already does this differently for metadata vs data
645 * allocations, as those bitmaps are seperate and undo access is never
646 * called on a metadata group descriptor.
647 */
648static int ocfs2_test_bg_bit_allocatable(struct buffer_head *bg_bh,
649 int nr)
650{
651 struct ocfs2_group_desc *bg = (struct ocfs2_group_desc *) bg_bh->b_data;
652
653 if (ocfs2_test_bit(nr, (unsigned long *)bg->bg_bitmap))
654 return 0;
655 if (!buffer_jbd(bg_bh) || !bh2jh(bg_bh)->b_committed_data)
656 return 1;
657
658 bg = (struct ocfs2_group_desc *) bh2jh(bg_bh)->b_committed_data;
659 return !ocfs2_test_bit(nr, (unsigned long *)bg->bg_bitmap);
660}
661
662static int ocfs2_block_group_find_clear_bits(struct ocfs2_super *osb,
663 struct buffer_head *bg_bh,
664 unsigned int bits_wanted,
665 u16 *bit_off,
666 u16 *bits_found)
667{
668 void *bitmap;
669 u16 best_offset, best_size;
670 int offset, start, found, status = 0;
671 struct ocfs2_group_desc *bg = (struct ocfs2_group_desc *) bg_bh->b_data;
672
673 if (!OCFS2_IS_VALID_GROUP_DESC(bg)) {
674 OCFS2_RO_ON_INVALID_GROUP_DESC(osb->sb, bg);
675 return -EIO;
676 }
677
678 found = start = best_offset = best_size = 0;
679 bitmap = bg->bg_bitmap;
680
681 while((offset = ocfs2_find_next_zero_bit(bitmap,
682 le16_to_cpu(bg->bg_bits),
683 start)) != -1) {
684 if (offset == le16_to_cpu(bg->bg_bits))
685 break;
686
687 if (!ocfs2_test_bg_bit_allocatable(bg_bh, offset)) {
688 /* We found a zero, but we can't use it as it
689 * hasn't been put to disk yet! */
690 found = 0;
691 start = offset + 1;
692 } else if (offset == start) {
693 /* we found a zero */
694 found++;
695 /* move start to the next bit to test */
696 start++;
697 } else {
698 /* got a zero after some ones */
699 found = 1;
700 start = offset + 1;
701 }
702 if (found > best_size) {
703 best_size = found;
704 best_offset = start - found;
705 }
706 /* we got everything we needed */
707 if (found == bits_wanted) {
708 /* mlog(0, "Found it all!\n"); */
709 break;
710 }
711 }
712
713 /* XXX: I think the first clause is equivalent to the second
714 * - jlbec */
715 if (found == bits_wanted) {
716 *bit_off = start - found;
717 *bits_found = found;
718 } else if (best_size) {
719 *bit_off = best_offset;
720 *bits_found = best_size;
721 } else {
722 status = -ENOSPC;
723 /* No error log here -- see the comment above
724 * ocfs2_test_bg_bit_allocatable */
725 }
726
727 return status;
728}
729
730static inline int ocfs2_block_group_set_bits(struct ocfs2_journal_handle *handle,
731 struct inode *alloc_inode,
732 struct ocfs2_group_desc *bg,
733 struct buffer_head *group_bh,
734 unsigned int bit_off,
735 unsigned int num_bits)
736{
737 int status;
738 void *bitmap = bg->bg_bitmap;
739 int journal_type = OCFS2_JOURNAL_ACCESS_WRITE;
740
741 mlog_entry_void();
742
743 if (!OCFS2_IS_VALID_GROUP_DESC(bg)) {
744 OCFS2_RO_ON_INVALID_GROUP_DESC(alloc_inode->i_sb, bg);
745 status = -EIO;
746 goto bail;
747 }
748 BUG_ON(le16_to_cpu(bg->bg_free_bits_count) < num_bits);
749
750 mlog(0, "block_group_set_bits: off = %u, num = %u\n", bit_off,
751 num_bits);
752
753 if (ocfs2_is_cluster_bitmap(alloc_inode))
754 journal_type = OCFS2_JOURNAL_ACCESS_UNDO;
755
756 status = ocfs2_journal_access(handle,
757 alloc_inode,
758 group_bh,
759 journal_type);
760 if (status < 0) {
761 mlog_errno(status);
762 goto bail;
763 }
764
765 le16_add_cpu(&bg->bg_free_bits_count, -num_bits);
766
767 while(num_bits--)
768 ocfs2_set_bit(bit_off++, bitmap);
769
770 status = ocfs2_journal_dirty(handle,
771 group_bh);
772 if (status < 0) {
773 mlog_errno(status);
774 goto bail;
775 }
776
777bail:
778 mlog_exit(status);
779 return status;
780}
781
782/* find the one with the most empty bits */
783static inline u16 ocfs2_find_victim_chain(struct ocfs2_chain_list *cl)
784{
785 u16 curr, best;
786
787 BUG_ON(!cl->cl_next_free_rec);
788
789 best = curr = 0;
790 while (curr < le16_to_cpu(cl->cl_next_free_rec)) {
791 if (le32_to_cpu(cl->cl_recs[curr].c_free) >
792 le32_to_cpu(cl->cl_recs[best].c_free))
793 best = curr;
794 curr++;
795 }
796
797 BUG_ON(best >= le16_to_cpu(cl->cl_next_free_rec));
798 return best;
799}
800
801static int ocfs2_relink_block_group(struct ocfs2_journal_handle *handle,
802 struct inode *alloc_inode,
803 struct buffer_head *fe_bh,
804 struct buffer_head *bg_bh,
805 struct buffer_head *prev_bg_bh,
806 u16 chain)
807{
808 int status;
809 /* there is a really tiny chance the journal calls could fail,
810 * but we wouldn't want inconsistent blocks in *any* case. */
811 u64 fe_ptr, bg_ptr, prev_bg_ptr;
812 struct ocfs2_dinode *fe = (struct ocfs2_dinode *) fe_bh->b_data;
813 struct ocfs2_group_desc *bg = (struct ocfs2_group_desc *) bg_bh->b_data;
814 struct ocfs2_group_desc *prev_bg = (struct ocfs2_group_desc *) prev_bg_bh->b_data;
815
816 if (!OCFS2_IS_VALID_DINODE(fe)) {
817 OCFS2_RO_ON_INVALID_DINODE(alloc_inode->i_sb, fe);
818 status = -EIO;
819 goto out;
820 }
821 if (!OCFS2_IS_VALID_GROUP_DESC(bg)) {
822 OCFS2_RO_ON_INVALID_GROUP_DESC(alloc_inode->i_sb, bg);
823 status = -EIO;
824 goto out;
825 }
826 if (!OCFS2_IS_VALID_GROUP_DESC(prev_bg)) {
827 OCFS2_RO_ON_INVALID_GROUP_DESC(alloc_inode->i_sb, prev_bg);
828 status = -EIO;
829 goto out;
830 }
831
832 mlog(0, "In suballoc %"MLFu64", chain %u, move group %"MLFu64" to "
833 "top, prev = %"MLFu64"\n",
834 fe->i_blkno, chain, bg->bg_blkno, prev_bg->bg_blkno);
835
836 fe_ptr = le64_to_cpu(fe->id2.i_chain.cl_recs[chain].c_blkno);
837 bg_ptr = le64_to_cpu(bg->bg_next_group);
838 prev_bg_ptr = le64_to_cpu(prev_bg->bg_next_group);
839
840 status = ocfs2_journal_access(handle, alloc_inode, prev_bg_bh,
841 OCFS2_JOURNAL_ACCESS_WRITE);
842 if (status < 0) {
843 mlog_errno(status);
844 goto out_rollback;
845 }
846
847 prev_bg->bg_next_group = bg->bg_next_group;
848
849 status = ocfs2_journal_dirty(handle, prev_bg_bh);
850 if (status < 0) {
851 mlog_errno(status);
852 goto out_rollback;
853 }
854
855 status = ocfs2_journal_access(handle, alloc_inode, bg_bh,
856 OCFS2_JOURNAL_ACCESS_WRITE);
857 if (status < 0) {
858 mlog_errno(status);
859 goto out_rollback;
860 }
861
862 bg->bg_next_group = fe->id2.i_chain.cl_recs[chain].c_blkno;
863
864 status = ocfs2_journal_dirty(handle, bg_bh);
865 if (status < 0) {
866 mlog_errno(status);
867 goto out_rollback;
868 }
869
870 status = ocfs2_journal_access(handle, alloc_inode, fe_bh,
871 OCFS2_JOURNAL_ACCESS_WRITE);
872 if (status < 0) {
873 mlog_errno(status);
874 goto out_rollback;
875 }
876
877 fe->id2.i_chain.cl_recs[chain].c_blkno = bg->bg_blkno;
878
879 status = ocfs2_journal_dirty(handle, fe_bh);
880 if (status < 0) {
881 mlog_errno(status);
882 goto out_rollback;
883 }
884
885 status = 0;
886out_rollback:
887 if (status < 0) {
888 fe->id2.i_chain.cl_recs[chain].c_blkno = cpu_to_le64(fe_ptr);
889 bg->bg_next_group = cpu_to_le64(bg_ptr);
890 prev_bg->bg_next_group = cpu_to_le64(prev_bg_ptr);
891 }
892out:
893 mlog_exit(status);
894 return status;
895}
896
897static inline int ocfs2_block_group_reasonably_empty(struct ocfs2_group_desc *bg,
898 u32 wanted)
899{
900 return le16_to_cpu(bg->bg_free_bits_count) > wanted;
901}
902
903/* return 0 on success, -ENOSPC to keep searching and any other < 0
904 * value on error. */
905static int ocfs2_cluster_group_search(struct inode *inode,
906 struct buffer_head *group_bh,
907 u32 bits_wanted, u32 min_bits,
908 u16 *bit_off, u16 *bits_found)
909{
910 int search = -ENOSPC;
911 int ret;
912 struct ocfs2_group_desc *bg = (struct ocfs2_group_desc *) group_bh->b_data;
913 u16 tmp_off, tmp_found;
914
915 BUG_ON(!ocfs2_is_cluster_bitmap(inode));
916
917 if (bg->bg_free_bits_count) {
918 ret = ocfs2_block_group_find_clear_bits(OCFS2_SB(inode->i_sb),
919 group_bh, bits_wanted,
920 &tmp_off, &tmp_found);
921 if (ret)
922 return ret;
923
924 /* ocfs2_block_group_find_clear_bits() might
925 * return success, but we still want to return
926 * -ENOSPC unless it found the minimum number
927 * of bits. */
928 if (min_bits <= tmp_found) {
929 *bit_off = tmp_off;
930 *bits_found = tmp_found;
931 search = 0; /* success */
932 }
933 }
934
935 return search;
936}
937
938static int ocfs2_block_group_search(struct inode *inode,
939 struct buffer_head *group_bh,
940 u32 bits_wanted, u32 min_bits,
941 u16 *bit_off, u16 *bits_found)
942{
943 int ret = -ENOSPC;
944 struct ocfs2_group_desc *bg = (struct ocfs2_group_desc *) group_bh->b_data;
945
946 BUG_ON(min_bits != 1);
947 BUG_ON(ocfs2_is_cluster_bitmap(inode));
948
949 if (bg->bg_free_bits_count)
950 ret = ocfs2_block_group_find_clear_bits(OCFS2_SB(inode->i_sb),
951 group_bh, bits_wanted,
952 bit_off, bits_found);
953
954 return ret;
955}
956
957static int ocfs2_search_chain(struct ocfs2_alloc_context *ac,
958 u32 bits_wanted,
959 u32 min_bits,
960 u16 *bit_off,
961 unsigned int *num_bits,
962 u64 *bg_blkno)
963{
964 int status;
965 u16 chain, tmp_bits;
966 u32 tmp_used;
967 u64 next_group;
968 struct ocfs2_journal_handle *handle = ac->ac_handle;
969 struct inode *alloc_inode = ac->ac_inode;
970 struct buffer_head *group_bh = NULL;
971 struct buffer_head *prev_group_bh = NULL;
972 struct ocfs2_dinode *fe = (struct ocfs2_dinode *) ac->ac_bh->b_data;
973 struct ocfs2_chain_list *cl = (struct ocfs2_chain_list *) &fe->id2.i_chain;
974 struct ocfs2_group_desc *bg;
975
976 chain = ac->ac_chain;
977 mlog(0, "trying to alloc %u bits from chain %u, inode %"MLFu64"\n",
978 bits_wanted, chain, OCFS2_I(alloc_inode)->ip_blkno);
979
980 status = ocfs2_read_block(OCFS2_SB(alloc_inode->i_sb),
981 le64_to_cpu(cl->cl_recs[chain].c_blkno),
982 &group_bh, OCFS2_BH_CACHED, alloc_inode);
983 if (status < 0) {
984 mlog_errno(status);
985 goto bail;
986 }
987 bg = (struct ocfs2_group_desc *) group_bh->b_data;
988 if (!OCFS2_IS_VALID_GROUP_DESC(bg)) {
989 OCFS2_RO_ON_INVALID_GROUP_DESC(alloc_inode->i_sb, bg);
990 status = -EIO;
991 goto bail;
992 }
993
994 status = -ENOSPC;
995 /* for now, the chain search is a bit simplistic. We just use
996 * the 1st group with any empty bits. */
997 while ((status = ac->ac_group_search(alloc_inode, group_bh,
998 bits_wanted, min_bits, bit_off,
999 &tmp_bits)) == -ENOSPC) {
1000 if (!bg->bg_next_group)
1001 break;
1002
1003 if (prev_group_bh) {
1004 brelse(prev_group_bh);
1005 prev_group_bh = NULL;
1006 }
1007 next_group = le64_to_cpu(bg->bg_next_group);
1008 prev_group_bh = group_bh;
1009 group_bh = NULL;
1010 status = ocfs2_read_block(OCFS2_SB(alloc_inode->i_sb),
1011 next_group, &group_bh,
1012 OCFS2_BH_CACHED, alloc_inode);
1013 if (status < 0) {
1014 mlog_errno(status);
1015 goto bail;
1016 }
1017 bg = (struct ocfs2_group_desc *) group_bh->b_data;
1018 if (!OCFS2_IS_VALID_GROUP_DESC(bg)) {
1019 OCFS2_RO_ON_INVALID_GROUP_DESC(alloc_inode->i_sb, bg);
1020 status = -EIO;
1021 goto bail;
1022 }
1023 }
1024 if (status < 0) {
1025 if (status != -ENOSPC)
1026 mlog_errno(status);
1027 goto bail;
1028 }
1029
1030 mlog(0, "alloc succeeds: we give %u bits from block group %"MLFu64"\n",
1031 tmp_bits, bg->bg_blkno);
1032
1033 *num_bits = tmp_bits;
1034
1035 BUG_ON(*num_bits == 0);
1036
1037 /*
1038 * Keep track of previous block descriptor read. When
1039 * we find a target, if we have read more than X
1040 * number of descriptors, and the target is reasonably
1041 * empty, relink him to top of his chain.
1042 *
1043 * We've read 0 extra blocks and only send one more to
1044 * the transaction, yet the next guy to search has a
1045 * much easier time.
1046 *
1047 * Do this *after* figuring out how many bits we're taking out
1048 * of our target group.
1049 */
1050 if (ac->ac_allow_chain_relink &&
1051 (prev_group_bh) &&
1052 (ocfs2_block_group_reasonably_empty(bg, *num_bits))) {
1053 status = ocfs2_relink_block_group(handle, alloc_inode,
1054 ac->ac_bh, group_bh,
1055 prev_group_bh, chain);
1056 if (status < 0) {
1057 mlog_errno(status);
1058 goto bail;
1059 }
1060 }
1061
1062 /* Ok, claim our bits now: set the info on dinode, chainlist
1063 * and then the group */
1064 status = ocfs2_journal_access(handle,
1065 alloc_inode,
1066 ac->ac_bh,
1067 OCFS2_JOURNAL_ACCESS_WRITE);
1068 if (status < 0) {
1069 mlog_errno(status);
1070 goto bail;
1071 }
1072
1073 tmp_used = le32_to_cpu(fe->id1.bitmap1.i_used);
1074 fe->id1.bitmap1.i_used = cpu_to_le32(*num_bits + tmp_used);
1075 le32_add_cpu(&cl->cl_recs[chain].c_free, -(*num_bits));
1076
1077 status = ocfs2_journal_dirty(handle,
1078 ac->ac_bh);
1079 if (status < 0) {
1080 mlog_errno(status);
1081 goto bail;
1082 }
1083
1084 status = ocfs2_block_group_set_bits(handle,
1085 alloc_inode,
1086 bg,
1087 group_bh,
1088 *bit_off,
1089 *num_bits);
1090 if (status < 0) {
1091 mlog_errno(status);
1092 goto bail;
1093 }
1094
1095 mlog(0, "Allocated %u bits from suballocator %"MLFu64"\n",
1096 *num_bits, fe->i_blkno);
1097
1098 *bg_blkno = le64_to_cpu(bg->bg_blkno);
1099bail:
1100 if (group_bh)
1101 brelse(group_bh);
1102 if (prev_group_bh)
1103 brelse(prev_group_bh);
1104
1105 mlog_exit(status);
1106 return status;
1107}
1108
1109/* will give out up to bits_wanted contiguous bits. */
1110static int ocfs2_claim_suballoc_bits(struct ocfs2_super *osb,
1111 struct ocfs2_alloc_context *ac,
1112 u32 bits_wanted,
1113 u32 min_bits,
1114 u16 *bit_off,
1115 unsigned int *num_bits,
1116 u64 *bg_blkno)
1117{
1118 int status;
1119 u16 victim, i;
1120 struct ocfs2_chain_list *cl;
1121 struct ocfs2_dinode *fe;
1122
1123 mlog_entry_void();
1124
1125 BUG_ON(ac->ac_bits_given >= ac->ac_bits_wanted);
1126 BUG_ON(bits_wanted > (ac->ac_bits_wanted - ac->ac_bits_given));
1127 BUG_ON(!ac->ac_bh);
1128
1129 fe = (struct ocfs2_dinode *) ac->ac_bh->b_data;
1130 if (!OCFS2_IS_VALID_DINODE(fe)) {
1131 OCFS2_RO_ON_INVALID_DINODE(osb->sb, fe);
1132 status = -EIO;
1133 goto bail;
1134 }
1135 if (le32_to_cpu(fe->id1.bitmap1.i_used) >=
1136 le32_to_cpu(fe->id1.bitmap1.i_total)) {
1137 ocfs2_error(osb->sb, "Chain allocator dinode %"MLFu64" has %u"
1138 "used bits but only %u total.",
1139 le64_to_cpu(fe->i_blkno),
1140 le32_to_cpu(fe->id1.bitmap1.i_used),
1141 le32_to_cpu(fe->id1.bitmap1.i_total));
1142 status = -EIO;
1143 goto bail;
1144 }
1145
1146 cl = (struct ocfs2_chain_list *) &fe->id2.i_chain;
1147
1148 victim = ocfs2_find_victim_chain(cl);
1149 ac->ac_chain = victim;
1150 ac->ac_allow_chain_relink = 1;
1151
1152 status = ocfs2_search_chain(ac, bits_wanted, min_bits, bit_off,
1153 num_bits, bg_blkno);
1154 if (!status)
1155 goto bail;
1156 if (status < 0 && status != -ENOSPC) {
1157 mlog_errno(status);
1158 goto bail;
1159 }
1160
1161 mlog(0, "Search of victim chain %u came up with nothing, "
1162 "trying all chains now.\n", victim);
1163
1164 /* If we didn't pick a good victim, then just default to
1165 * searching each chain in order. Don't allow chain relinking
1166 * because we only calculate enough journal credits for one
1167 * relink per alloc. */
1168 ac->ac_allow_chain_relink = 0;
1169 for (i = 0; i < le16_to_cpu(cl->cl_next_free_rec); i ++) {
1170 if (i == victim)
1171 continue;
1172 if (!cl->cl_recs[i].c_free)
1173 continue;
1174
1175 ac->ac_chain = i;
1176 status = ocfs2_search_chain(ac, bits_wanted, min_bits,
1177 bit_off, num_bits,
1178 bg_blkno);
1179 if (!status)
1180 break;
1181 if (status < 0 && status != -ENOSPC) {
1182 mlog_errno(status);
1183 goto bail;
1184 }
1185 }
1186bail:
1187
1188 mlog_exit(status);
1189 return status;
1190}
1191
1192int ocfs2_claim_metadata(struct ocfs2_super *osb,
1193 struct ocfs2_journal_handle *handle,
1194 struct ocfs2_alloc_context *ac,
1195 u32 bits_wanted,
1196 u16 *suballoc_bit_start,
1197 unsigned int *num_bits,
1198 u64 *blkno_start)
1199{
1200 int status;
1201 u64 bg_blkno;
1202
1203 BUG_ON(!ac);
1204 BUG_ON(ac->ac_bits_wanted < (ac->ac_bits_given + bits_wanted));
1205 BUG_ON(ac->ac_which != OCFS2_AC_USE_META);
1206 BUG_ON(ac->ac_handle != handle);
1207
1208 status = ocfs2_claim_suballoc_bits(osb,
1209 ac,
1210 bits_wanted,
1211 1,
1212 suballoc_bit_start,
1213 num_bits,
1214 &bg_blkno);
1215 if (status < 0) {
1216 mlog_errno(status);
1217 goto bail;
1218 }
1219 atomic_inc(&osb->alloc_stats.bg_allocs);
1220
1221 *blkno_start = bg_blkno + (u64) *suballoc_bit_start;
1222 ac->ac_bits_given += (*num_bits);
1223 status = 0;
1224bail:
1225 mlog_exit(status);
1226 return status;
1227}
1228
1229int ocfs2_claim_new_inode(struct ocfs2_super *osb,
1230 struct ocfs2_journal_handle *handle,
1231 struct ocfs2_alloc_context *ac,
1232 u16 *suballoc_bit,
1233 u64 *fe_blkno)
1234{
1235 int status;
1236 unsigned int num_bits;
1237 u64 bg_blkno;
1238
1239 mlog_entry_void();
1240
1241 BUG_ON(!ac);
1242 BUG_ON(ac->ac_bits_given != 0);
1243 BUG_ON(ac->ac_bits_wanted != 1);
1244 BUG_ON(ac->ac_which != OCFS2_AC_USE_INODE);
1245 BUG_ON(ac->ac_handle != handle);
1246
1247 status = ocfs2_claim_suballoc_bits(osb,
1248 ac,
1249 1,
1250 1,
1251 suballoc_bit,
1252 &num_bits,
1253 &bg_blkno);
1254 if (status < 0) {
1255 mlog_errno(status);
1256 goto bail;
1257 }
1258 atomic_inc(&osb->alloc_stats.bg_allocs);
1259
1260 BUG_ON(num_bits != 1);
1261
1262 *fe_blkno = bg_blkno + (u64) (*suballoc_bit);
1263 ac->ac_bits_given++;
1264 status = 0;
1265bail:
1266 mlog_exit(status);
1267 return status;
1268}
1269
1270/* translate a group desc. blkno and it's bitmap offset into
1271 * disk cluster offset. */
1272static inline u32 ocfs2_desc_bitmap_to_cluster_off(struct inode *inode,
1273 u64 bg_blkno,
1274 u16 bg_bit_off)
1275{
1276 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
1277 u32 cluster = 0;
1278
1279 BUG_ON(!ocfs2_is_cluster_bitmap(inode));
1280
1281 if (bg_blkno != osb->first_cluster_group_blkno)
1282 cluster = ocfs2_blocks_to_clusters(inode->i_sb, bg_blkno);
1283 cluster += (u32) bg_bit_off;
1284 return cluster;
1285}
1286
1287/* given a cluster offset, calculate which block group it belongs to
1288 * and return that block offset. */
1289static inline u64 ocfs2_which_cluster_group(struct inode *inode,
1290 u32 cluster)
1291{
1292 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
1293 u32 group_no;
1294
1295 BUG_ON(!ocfs2_is_cluster_bitmap(inode));
1296
1297 group_no = cluster / osb->bitmap_cpg;
1298 if (!group_no)
1299 return osb->first_cluster_group_blkno;
1300 return ocfs2_clusters_to_blocks(inode->i_sb,
1301 group_no * osb->bitmap_cpg);
1302}
1303
1304/* given the block number of a cluster start, calculate which cluster
1305 * group and descriptor bitmap offset that corresponds to. */
1306static inline void ocfs2_block_to_cluster_group(struct inode *inode,
1307 u64 data_blkno,
1308 u64 *bg_blkno,
1309 u16 *bg_bit_off)
1310{
1311 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
1312 u32 data_cluster = ocfs2_blocks_to_clusters(osb->sb, data_blkno);
1313
1314 BUG_ON(!ocfs2_is_cluster_bitmap(inode));
1315
1316 *bg_blkno = ocfs2_which_cluster_group(inode,
1317 data_cluster);
1318
1319 if (*bg_blkno == osb->first_cluster_group_blkno)
1320 *bg_bit_off = (u16) data_cluster;
1321 else
1322 *bg_bit_off = (u16) ocfs2_blocks_to_clusters(osb->sb,
1323 data_blkno - *bg_blkno);
1324}
1325
1326/*
1327 * min_bits - minimum contiguous chunk from this total allocation we
1328 * can handle. set to what we asked for originally for a full
1329 * contig. allocation, set to '1' to indicate we can deal with extents
1330 * of any size.
1331 */
1332int ocfs2_claim_clusters(struct ocfs2_super *osb,
1333 struct ocfs2_journal_handle *handle,
1334 struct ocfs2_alloc_context *ac,
1335 u32 min_clusters,
1336 u32 *cluster_start,
1337 u32 *num_clusters)
1338{
1339 int status;
1340 unsigned int bits_wanted = ac->ac_bits_wanted - ac->ac_bits_given;
1341 u64 bg_blkno;
1342 u16 bg_bit_off;
1343
1344 mlog_entry_void();
1345
1346 BUG_ON(!ac);
1347 BUG_ON(ac->ac_bits_given >= ac->ac_bits_wanted);
1348
1349 BUG_ON(ac->ac_which != OCFS2_AC_USE_LOCAL
1350 && ac->ac_which != OCFS2_AC_USE_MAIN);
1351 BUG_ON(ac->ac_handle != handle);
1352
1353 if (ac->ac_which == OCFS2_AC_USE_LOCAL) {
1354 status = ocfs2_claim_local_alloc_bits(osb,
1355 handle,
1356 ac,
1357 bits_wanted,
1358 cluster_start,
1359 num_clusters);
1360 if (!status)
1361 atomic_inc(&osb->alloc_stats.local_data);
1362 } else {
1363 if (min_clusters > (osb->bitmap_cpg - 1)) {
1364 /* The only paths asking for contiguousness
1365 * should know about this already. */
1366 mlog(ML_ERROR, "minimum allocation requested exceeds "
1367 "group bitmap size!");
1368 status = -ENOSPC;
1369 goto bail;
1370 }
1371 /* clamp the current request down to a realistic size. */
1372 if (bits_wanted > (osb->bitmap_cpg - 1))
1373 bits_wanted = osb->bitmap_cpg - 1;
1374
1375 status = ocfs2_claim_suballoc_bits(osb,
1376 ac,
1377 bits_wanted,
1378 min_clusters,
1379 &bg_bit_off,
1380 num_clusters,
1381 &bg_blkno);
1382 if (!status) {
1383 *cluster_start =
1384 ocfs2_desc_bitmap_to_cluster_off(ac->ac_inode,
1385 bg_blkno,
1386 bg_bit_off);
1387 atomic_inc(&osb->alloc_stats.bitmap_data);
1388 }
1389 }
1390 if (status < 0) {
1391 if (status != -ENOSPC)
1392 mlog_errno(status);
1393 goto bail;
1394 }
1395
1396 ac->ac_bits_given += *num_clusters;
1397
1398bail:
1399 mlog_exit(status);
1400 return status;
1401}
1402
1403static inline int ocfs2_block_group_clear_bits(struct ocfs2_journal_handle *handle,
1404 struct inode *alloc_inode,
1405 struct ocfs2_group_desc *bg,
1406 struct buffer_head *group_bh,
1407 unsigned int bit_off,
1408 unsigned int num_bits)
1409{
1410 int status;
1411 unsigned int tmp;
1412 int journal_type = OCFS2_JOURNAL_ACCESS_WRITE;
1413 struct ocfs2_group_desc *undo_bg = NULL;
1414
1415 mlog_entry_void();
1416
1417 if (!OCFS2_IS_VALID_GROUP_DESC(bg)) {
1418 OCFS2_RO_ON_INVALID_GROUP_DESC(alloc_inode->i_sb, bg);
1419 status = -EIO;
1420 goto bail;
1421 }
1422
1423 mlog(0, "off = %u, num = %u\n", bit_off, num_bits);
1424
1425 if (ocfs2_is_cluster_bitmap(alloc_inode))
1426 journal_type = OCFS2_JOURNAL_ACCESS_UNDO;
1427
1428 status = ocfs2_journal_access(handle, alloc_inode, group_bh,
1429 journal_type);
1430 if (status < 0) {
1431 mlog_errno(status);
1432 goto bail;
1433 }
1434
1435 if (ocfs2_is_cluster_bitmap(alloc_inode))
1436 undo_bg = (struct ocfs2_group_desc *) bh2jh(group_bh)->b_committed_data;
1437
1438 tmp = num_bits;
1439 while(tmp--) {
1440 ocfs2_clear_bit((bit_off + tmp),
1441 (unsigned long *) bg->bg_bitmap);
1442 if (ocfs2_is_cluster_bitmap(alloc_inode))
1443 ocfs2_set_bit(bit_off + tmp,
1444 (unsigned long *) undo_bg->bg_bitmap);
1445 }
1446 le16_add_cpu(&bg->bg_free_bits_count, num_bits);
1447
1448 status = ocfs2_journal_dirty(handle, group_bh);
1449 if (status < 0)
1450 mlog_errno(status);
1451bail:
1452 return status;
1453}
1454
1455/*
1456 * expects the suballoc inode to already be locked.
1457 */
1458static int ocfs2_free_suballoc_bits(struct ocfs2_journal_handle *handle,
1459 struct inode *alloc_inode,
1460 struct buffer_head *alloc_bh,
1461 unsigned int start_bit,
1462 u64 bg_blkno,
1463 unsigned int count)
1464{
1465 int status = 0;
1466 u32 tmp_used;
1467 struct ocfs2_super *osb = OCFS2_SB(alloc_inode->i_sb);
1468 struct ocfs2_dinode *fe = (struct ocfs2_dinode *) alloc_bh->b_data;
1469 struct ocfs2_chain_list *cl = &fe->id2.i_chain;
1470 struct buffer_head *group_bh = NULL;
1471 struct ocfs2_group_desc *group;
1472
1473 mlog_entry_void();
1474
1475 if (!OCFS2_IS_VALID_DINODE(fe)) {
1476 OCFS2_RO_ON_INVALID_DINODE(alloc_inode->i_sb, fe);
1477 status = -EIO;
1478 goto bail;
1479 }
1480 BUG_ON((count + start_bit) > ocfs2_bits_per_group(cl));
1481
1482 mlog(0, "suballocator %"MLFu64": freeing %u bits from group %"MLFu64
1483 ", starting at %u\n",
1484 OCFS2_I(alloc_inode)->ip_blkno, count, bg_blkno,
1485 start_bit);
1486
1487 status = ocfs2_read_block(osb, bg_blkno, &group_bh, OCFS2_BH_CACHED,
1488 alloc_inode);
1489 if (status < 0) {
1490 mlog_errno(status);
1491 goto bail;
1492 }
1493
1494 group = (struct ocfs2_group_desc *) group_bh->b_data;
1495 if (!OCFS2_IS_VALID_GROUP_DESC(group)) {
1496 OCFS2_RO_ON_INVALID_GROUP_DESC(alloc_inode->i_sb, group);
1497 status = -EIO;
1498 goto bail;
1499 }
1500 BUG_ON((count + start_bit) > le16_to_cpu(group->bg_bits));
1501
1502 status = ocfs2_block_group_clear_bits(handle, alloc_inode,
1503 group, group_bh,
1504 start_bit, count);
1505 if (status < 0) {
1506 mlog_errno(status);
1507 goto bail;
1508 }
1509
1510 status = ocfs2_journal_access(handle, alloc_inode, alloc_bh,
1511 OCFS2_JOURNAL_ACCESS_WRITE);
1512 if (status < 0) {
1513 mlog_errno(status);
1514 goto bail;
1515 }
1516
1517 le32_add_cpu(&cl->cl_recs[le16_to_cpu(group->bg_chain)].c_free,
1518 count);
1519 tmp_used = le32_to_cpu(fe->id1.bitmap1.i_used);
1520 fe->id1.bitmap1.i_used = cpu_to_le32(tmp_used - count);
1521
1522 status = ocfs2_journal_dirty(handle, alloc_bh);
1523 if (status < 0) {
1524 mlog_errno(status);
1525 goto bail;
1526 }
1527
1528bail:
1529 if (group_bh)
1530 brelse(group_bh);
1531
1532 mlog_exit(status);
1533 return status;
1534}
1535
1536static inline u64 ocfs2_which_suballoc_group(u64 block, unsigned int bit)
1537{
1538 u64 group = block - (u64) bit;
1539
1540 return group;
1541}
1542
1543int ocfs2_free_dinode(struct ocfs2_journal_handle *handle,
1544 struct inode *inode_alloc_inode,
1545 struct buffer_head *inode_alloc_bh,
1546 struct ocfs2_dinode *di)
1547{
1548 u64 blk = le64_to_cpu(di->i_blkno);
1549 u16 bit = le16_to_cpu(di->i_suballoc_bit);
1550 u64 bg_blkno = ocfs2_which_suballoc_group(blk, bit);
1551
1552 return ocfs2_free_suballoc_bits(handle, inode_alloc_inode,
1553 inode_alloc_bh, bit, bg_blkno, 1);
1554}
1555
1556int ocfs2_free_extent_block(struct ocfs2_journal_handle *handle,
1557 struct inode *eb_alloc_inode,
1558 struct buffer_head *eb_alloc_bh,
1559 struct ocfs2_extent_block *eb)
1560{
1561 u64 blk = le64_to_cpu(eb->h_blkno);
1562 u16 bit = le16_to_cpu(eb->h_suballoc_bit);
1563 u64 bg_blkno = ocfs2_which_suballoc_group(blk, bit);
1564
1565 return ocfs2_free_suballoc_bits(handle, eb_alloc_inode, eb_alloc_bh,
1566 bit, bg_blkno, 1);
1567}
1568
1569int ocfs2_free_clusters(struct ocfs2_journal_handle *handle,
1570 struct inode *bitmap_inode,
1571 struct buffer_head *bitmap_bh,
1572 u64 start_blk,
1573 unsigned int num_clusters)
1574{
1575 int status;
1576 u16 bg_start_bit;
1577 u64 bg_blkno;
1578 struct ocfs2_dinode *fe;
1579
1580 /* You can't ever have a contiguous set of clusters
1581 * bigger than a block group bitmap so we never have to worry
1582 * about looping on them. */
1583
1584 mlog_entry_void();
1585
1586 /* This is expensive. We can safely remove once this stuff has
1587 * gotten tested really well. */
1588 BUG_ON(start_blk != ocfs2_clusters_to_blocks(bitmap_inode->i_sb, ocfs2_blocks_to_clusters(bitmap_inode->i_sb, start_blk)));
1589
1590 fe = (struct ocfs2_dinode *) bitmap_bh->b_data;
1591
1592 ocfs2_block_to_cluster_group(bitmap_inode, start_blk, &bg_blkno,
1593 &bg_start_bit);
1594
1595 mlog(0, "want to free %u clusters starting at block %"MLFu64"\n",
1596 num_clusters, start_blk);
1597 mlog(0, "bg_blkno = %"MLFu64", bg_start_bit = %u\n",
1598 bg_blkno, bg_start_bit);
1599
1600 status = ocfs2_free_suballoc_bits(handle, bitmap_inode, bitmap_bh,
1601 bg_start_bit, bg_blkno,
1602 num_clusters);
1603 if (status < 0)
1604 mlog_errno(status);
1605
1606 mlog_exit(status);
1607 return status;
1608}
1609
1610static inline void ocfs2_debug_bg(struct ocfs2_group_desc *bg)
1611{
1612 printk("Block Group:\n");
1613 printk("bg_signature: %s\n", bg->bg_signature);
1614 printk("bg_size: %u\n", bg->bg_size);
1615 printk("bg_bits: %u\n", bg->bg_bits);
1616 printk("bg_free_bits_count: %u\n", bg->bg_free_bits_count);
1617 printk("bg_chain: %u\n", bg->bg_chain);
1618 printk("bg_generation: %u\n", le32_to_cpu(bg->bg_generation));
1619 printk("bg_next_group: %"MLFu64"\n", bg->bg_next_group);
1620 printk("bg_parent_dinode: %"MLFu64"\n", bg->bg_parent_dinode);
1621 printk("bg_blkno: %"MLFu64"\n", bg->bg_blkno);
1622}
1623
1624static inline void ocfs2_debug_suballoc_inode(struct ocfs2_dinode *fe)
1625{
1626 int i;
1627
1628 printk("Suballoc Inode %"MLFu64":\n", fe->i_blkno);
1629 printk("i_signature: %s\n", fe->i_signature);
1630 printk("i_size: %"MLFu64"\n", fe->i_size);
1631 printk("i_clusters: %u\n", fe->i_clusters);
1632 printk("i_generation: %u\n",
1633 le32_to_cpu(fe->i_generation));
1634 printk("id1.bitmap1.i_used: %u\n",
1635 le32_to_cpu(fe->id1.bitmap1.i_used));
1636 printk("id1.bitmap1.i_total: %u\n",
1637 le32_to_cpu(fe->id1.bitmap1.i_total));
1638 printk("id2.i_chain.cl_cpg: %u\n", fe->id2.i_chain.cl_cpg);
1639 printk("id2.i_chain.cl_bpc: %u\n", fe->id2.i_chain.cl_bpc);
1640 printk("id2.i_chain.cl_count: %u\n", fe->id2.i_chain.cl_count);
1641 printk("id2.i_chain.cl_next_free_rec: %u\n",
1642 fe->id2.i_chain.cl_next_free_rec);
1643 for(i = 0; i < fe->id2.i_chain.cl_next_free_rec; i++) {
1644 printk("fe->id2.i_chain.cl_recs[%d].c_free: %u\n", i,
1645 fe->id2.i_chain.cl_recs[i].c_free);
1646 printk("fe->id2.i_chain.cl_recs[%d].c_total: %u\n", i,
1647 fe->id2.i_chain.cl_recs[i].c_total);
1648 printk("fe->id2.i_chain.cl_recs[%d].c_blkno: %"MLFu64"\n", i,
1649 fe->id2.i_chain.cl_recs[i].c_blkno);
1650 }
1651}
diff --git a/fs/ocfs2/suballoc.h b/fs/ocfs2/suballoc.h
new file mode 100644
index 000000000000..a76c82a7ceac
--- /dev/null
+++ b/fs/ocfs2/suballoc.h
@@ -0,0 +1,132 @@
1/* -*- mode: c; c-basic-offset: 8; -*-
2 * vim: noexpandtab sw=8 ts=8 sts=0:
3 *
4 * suballoc.h
5 *
6 * Defines sub allocator api
7 *
8 * Copyright (C) 2003, 2004 Oracle. All rights reserved.
9 *
10 * This program is free software; you can redistribute it and/or
11 * modify it under the terms of the GNU General Public
12 * License as published by the Free Software Foundation; either
13 * version 2 of the License, or (at your option) any later version.
14 *
15 * This program is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
18 * General Public License for more details.
19 *
20 * You should have received a copy of the GNU General Public
21 * License along with this program; if not, write to the
22 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
23 * Boston, MA 02111-1307, USA.
24 */
25
26#ifndef _CHAINALLOC_H_
27#define _CHAINALLOC_H_
28
29typedef int (group_search_t)(struct inode *,
30 struct buffer_head *,
31 u32,
32 u32,
33 u16 *,
34 u16 *);
35
36struct ocfs2_alloc_context {
37 struct inode *ac_inode; /* which bitmap are we allocating from? */
38 struct buffer_head *ac_bh; /* file entry bh */
39 u32 ac_bits_wanted;
40 u32 ac_bits_given;
41#define OCFS2_AC_USE_LOCAL 1
42#define OCFS2_AC_USE_MAIN 2
43#define OCFS2_AC_USE_INODE 3
44#define OCFS2_AC_USE_META 4
45 u32 ac_which;
46 struct ocfs2_journal_handle *ac_handle;
47
48 /* these are used by the chain search */
49 u16 ac_chain;
50 int ac_allow_chain_relink;
51 group_search_t *ac_group_search;
52};
53
54void ocfs2_free_alloc_context(struct ocfs2_alloc_context *ac);
55static inline int ocfs2_alloc_context_bits_left(struct ocfs2_alloc_context *ac)
56{
57 return ac->ac_bits_wanted - ac->ac_bits_given;
58}
59
60int ocfs2_reserve_new_metadata(struct ocfs2_super *osb,
61 struct ocfs2_journal_handle *handle,
62 struct ocfs2_dinode *fe,
63 struct ocfs2_alloc_context **ac);
64int ocfs2_reserve_new_inode(struct ocfs2_super *osb,
65 struct ocfs2_journal_handle *handle,
66 struct ocfs2_alloc_context **ac);
67int ocfs2_reserve_clusters(struct ocfs2_super *osb,
68 struct ocfs2_journal_handle *handle,
69 u32 bits_wanted,
70 struct ocfs2_alloc_context **ac);
71
72int ocfs2_claim_metadata(struct ocfs2_super *osb,
73 struct ocfs2_journal_handle *handle,
74 struct ocfs2_alloc_context *ac,
75 u32 bits_wanted,
76 u16 *suballoc_bit_start,
77 u32 *num_bits,
78 u64 *blkno_start);
79int ocfs2_claim_new_inode(struct ocfs2_super *osb,
80 struct ocfs2_journal_handle *handle,
81 struct ocfs2_alloc_context *ac,
82 u16 *suballoc_bit,
83 u64 *fe_blkno);
84int ocfs2_claim_clusters(struct ocfs2_super *osb,
85 struct ocfs2_journal_handle *handle,
86 struct ocfs2_alloc_context *ac,
87 u32 min_clusters,
88 u32 *cluster_start,
89 u32 *num_clusters);
90
91int ocfs2_free_dinode(struct ocfs2_journal_handle *handle,
92 struct inode *inode_alloc_inode,
93 struct buffer_head *inode_alloc_bh,
94 struct ocfs2_dinode *di);
95int ocfs2_free_extent_block(struct ocfs2_journal_handle *handle,
96 struct inode *eb_alloc_inode,
97 struct buffer_head *eb_alloc_bh,
98 struct ocfs2_extent_block *eb);
99int ocfs2_free_clusters(struct ocfs2_journal_handle *handle,
100 struct inode *bitmap_inode,
101 struct buffer_head *bitmap_bh,
102 u64 start_blk,
103 unsigned int num_clusters);
104
105static inline u32 ocfs2_cluster_from_desc(struct ocfs2_super *osb,
106 u64 bg_blkno)
107{
108 /* This should work for all block group descriptors as only
109 * the 1st group descriptor of the cluster bitmap is
110 * different. */
111
112 if (bg_blkno == osb->first_cluster_group_blkno)
113 return 0;
114
115 /* the rest of the block groups are located at the beginning
116 * of their 1st cluster, so a direct translation just
117 * works. */
118 return ocfs2_blocks_to_clusters(osb->sb, bg_blkno);
119}
120
121static inline int ocfs2_is_cluster_bitmap(struct inode *inode)
122{
123 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
124 return osb->bitmap_blkno == OCFS2_I(inode)->ip_blkno;
125}
126
127/* This is for local alloc ONLY. Others should use the task-specific
128 * apis above. */
129int ocfs2_reserve_cluster_bitmap_bits(struct ocfs2_super *osb,
130 struct ocfs2_alloc_context *ac);
131
132#endif /* _CHAINALLOC_H_ */
diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c
new file mode 100644
index 000000000000..48bf7f0ce544
--- /dev/null
+++ b/fs/ocfs2/super.c
@@ -0,0 +1,1733 @@
1/* -*- mode: c; c-basic-offset: 8; -*-
2 * vim: noexpandtab sw=8 ts=8 sts=0:
3 *
4 * super.c
5 *
6 * load/unload driver, mount/dismount volumes
7 *
8 * Copyright (C) 2002, 2004 Oracle. All rights reserved.
9 *
10 * This program is free software; you can redistribute it and/or
11 * modify it under the terms of the GNU General Public
12 * License as published by the Free Software Foundation; either
13 * version 2 of the License, or (at your option) any later version.
14 *
15 * This program is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
18 * General Public License for more details.
19 *
20 * You should have received a copy of the GNU General Public
21 * License along with this program; if not, write to the
22 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
23 * Boston, MA 02111-1307, USA.
24 */
25
26#include <linux/module.h>
27#include <linux/fs.h>
28#include <linux/types.h>
29#include <linux/slab.h>
30#include <linux/highmem.h>
31#include <linux/utsname.h>
32#include <linux/init.h>
33#include <linux/random.h>
34#include <linux/statfs.h>
35#include <linux/moduleparam.h>
36#include <linux/blkdev.h>
37#include <linux/socket.h>
38#include <linux/inet.h>
39#include <linux/parser.h>
40#include <linux/crc32.h>
41#include <linux/debugfs.h>
42
43#include <cluster/nodemanager.h>
44
45#define MLOG_MASK_PREFIX ML_SUPER
46#include <cluster/masklog.h>
47
48#include "ocfs2.h"
49
50/* this should be the only file to include a version 1 header */
51#include "ocfs1_fs_compat.h"
52
53#include "alloc.h"
54#include "dlmglue.h"
55#include "export.h"
56#include "extent_map.h"
57#include "heartbeat.h"
58#include "inode.h"
59#include "journal.h"
60#include "localalloc.h"
61#include "namei.h"
62#include "slot_map.h"
63#include "super.h"
64#include "sysfile.h"
65#include "uptodate.h"
66#include "ver.h"
67#include "vote.h"
68
69#include "buffer_head_io.h"
70
71/*
72 * Globals
73 */
74static spinlock_t ocfs2_globals_lock = SPIN_LOCK_UNLOCKED;
75
76static u32 osb_id; /* Keeps track of next available OSB Id */
77
78static kmem_cache_t *ocfs2_inode_cachep = NULL;
79
80kmem_cache_t *ocfs2_lock_cache = NULL;
81
82/* OCFS2 needs to schedule several differnt types of work which
83 * require cluster locking, disk I/O, recovery waits, etc. Since these
84 * types of work tend to be heavy we avoid using the kernel events
85 * workqueue and schedule on our own. */
86struct workqueue_struct *ocfs2_wq = NULL;
87
88static struct dentry *ocfs2_debugfs_root = NULL;
89
90MODULE_AUTHOR("Oracle");
91MODULE_LICENSE("GPL");
92
93static int ocfs2_parse_options(struct super_block *sb, char *options,
94 unsigned long *mount_opt, int is_remount);
95static void ocfs2_put_super(struct super_block *sb);
96static int ocfs2_mount_volume(struct super_block *sb);
97static int ocfs2_remount(struct super_block *sb, int *flags, char *data);
98static void ocfs2_dismount_volume(struct super_block *sb, int mnt_err);
99static int ocfs2_initialize_mem_caches(void);
100static void ocfs2_free_mem_caches(void);
101static void ocfs2_delete_osb(struct ocfs2_super *osb);
102
103static int ocfs2_statfs(struct super_block *sb, struct kstatfs *buf);
104
105static int ocfs2_sync_fs(struct super_block *sb, int wait);
106
107static int ocfs2_init_global_system_inodes(struct ocfs2_super *osb);
108static int ocfs2_init_local_system_inodes(struct ocfs2_super *osb);
109static int ocfs2_release_system_inodes(struct ocfs2_super *osb);
110static int ocfs2_fill_local_node_info(struct ocfs2_super *osb);
111static int ocfs2_check_volume(struct ocfs2_super *osb);
112static int ocfs2_verify_volume(struct ocfs2_dinode *di,
113 struct buffer_head *bh,
114 u32 sectsize);
115static int ocfs2_initialize_super(struct super_block *sb,
116 struct buffer_head *bh,
117 int sector_size);
118static int ocfs2_get_sector(struct super_block *sb,
119 struct buffer_head **bh,
120 int block,
121 int sect_size);
122static void ocfs2_write_super(struct super_block *sb);
123static struct inode *ocfs2_alloc_inode(struct super_block *sb);
124static void ocfs2_destroy_inode(struct inode *inode);
125
126static unsigned long long ocfs2_max_file_offset(unsigned int blockshift);
127
128static struct super_operations ocfs2_sops = {
129 .statfs = ocfs2_statfs,
130 .alloc_inode = ocfs2_alloc_inode,
131 .destroy_inode = ocfs2_destroy_inode,
132 .drop_inode = ocfs2_drop_inode,
133 .clear_inode = ocfs2_clear_inode,
134 .delete_inode = ocfs2_delete_inode,
135 .sync_fs = ocfs2_sync_fs,
136 .write_super = ocfs2_write_super,
137 .put_super = ocfs2_put_super,
138 .remount_fs = ocfs2_remount,
139};
140
141enum {
142 Opt_barrier,
143 Opt_err_panic,
144 Opt_err_ro,
145 Opt_intr,
146 Opt_nointr,
147 Opt_hb_none,
148 Opt_hb_local,
149 Opt_data_ordered,
150 Opt_data_writeback,
151 Opt_err,
152};
153
154static match_table_t tokens = {
155 {Opt_barrier, "barrier=%u"},
156 {Opt_err_panic, "errors=panic"},
157 {Opt_err_ro, "errors=remount-ro"},
158 {Opt_intr, "intr"},
159 {Opt_nointr, "nointr"},
160 {Opt_hb_none, OCFS2_HB_NONE},
161 {Opt_hb_local, OCFS2_HB_LOCAL},
162 {Opt_data_ordered, "data=ordered"},
163 {Opt_data_writeback, "data=writeback"},
164 {Opt_err, NULL}
165};
166
167/*
168 * write_super and sync_fs ripped right out of ext3.
169 */
170static void ocfs2_write_super(struct super_block *sb)
171{
172 if (down_trylock(&sb->s_lock) == 0)
173 BUG();
174 sb->s_dirt = 0;
175}
176
177static int ocfs2_sync_fs(struct super_block *sb, int wait)
178{
179 int status = 0;
180 tid_t target;
181 struct ocfs2_super *osb = OCFS2_SB(sb);
182
183 sb->s_dirt = 0;
184
185 if (ocfs2_is_hard_readonly(osb))
186 return -EROFS;
187
188 if (wait) {
189 status = ocfs2_flush_truncate_log(osb);
190 if (status < 0)
191 mlog_errno(status);
192 } else {
193 ocfs2_schedule_truncate_log_flush(osb, 0);
194 }
195
196 if (journal_start_commit(OCFS2_SB(sb)->journal->j_journal, &target)) {
197 if (wait)
198 log_wait_commit(OCFS2_SB(sb)->journal->j_journal,
199 target);
200 }
201 return 0;
202}
203
204static int ocfs2_init_global_system_inodes(struct ocfs2_super *osb)
205{
206 struct inode *new = NULL;
207 int status = 0;
208 int i;
209
210 mlog_entry_void();
211
212 new = ocfs2_iget(osb, osb->root_blkno);
213 if (IS_ERR(new)) {
214 status = PTR_ERR(new);
215 mlog_errno(status);
216 goto bail;
217 }
218 osb->root_inode = new;
219
220 new = ocfs2_iget(osb, osb->system_dir_blkno);
221 if (IS_ERR(new)) {
222 status = PTR_ERR(new);
223 mlog_errno(status);
224 goto bail;
225 }
226 osb->sys_root_inode = new;
227
228 for (i = OCFS2_FIRST_ONLINE_SYSTEM_INODE;
229 i <= OCFS2_LAST_GLOBAL_SYSTEM_INODE; i++) {
230 new = ocfs2_get_system_file_inode(osb, i, osb->slot_num);
231 if (!new) {
232 ocfs2_release_system_inodes(osb);
233 status = -EINVAL;
234 mlog_errno(status);
235 /* FIXME: Should ERROR_RO_FS */
236 mlog(ML_ERROR, "Unable to load system inode %d, "
237 "possibly corrupt fs?", i);
238 goto bail;
239 }
240 // the array now has one ref, so drop this one
241 iput(new);
242 }
243
244bail:
245 mlog_exit(status);
246 return status;
247}
248
249static int ocfs2_init_local_system_inodes(struct ocfs2_super *osb)
250{
251 struct inode *new = NULL;
252 int status = 0;
253 int i;
254
255 mlog_entry_void();
256
257 for (i = OCFS2_LAST_GLOBAL_SYSTEM_INODE + 1;
258 i < NUM_SYSTEM_INODES;
259 i++) {
260 new = ocfs2_get_system_file_inode(osb, i, osb->slot_num);
261 if (!new) {
262 ocfs2_release_system_inodes(osb);
263 status = -EINVAL;
264 mlog(ML_ERROR, "status=%d, sysfile=%d, slot=%d\n",
265 status, i, osb->slot_num);
266 goto bail;
267 }
268 /* the array now has one ref, so drop this one */
269 iput(new);
270 }
271
272bail:
273 mlog_exit(status);
274 return status;
275}
276
277static int ocfs2_release_system_inodes(struct ocfs2_super *osb)
278{
279 int status = 0, i;
280 struct inode *inode;
281
282 mlog_entry_void();
283
284 for (i = 0; i < NUM_SYSTEM_INODES; i++) {
285 inode = osb->system_inodes[i];
286 if (inode) {
287 iput(inode);
288 osb->system_inodes[i] = NULL;
289 }
290 }
291
292 inode = osb->sys_root_inode;
293 if (inode) {
294 iput(inode);
295 osb->sys_root_inode = NULL;
296 }
297
298 inode = osb->root_inode;
299 if (inode) {
300 iput(inode);
301 osb->root_inode = NULL;
302 }
303
304 mlog_exit(status);
305 return status;
306}
307
308/* We're allocating fs objects, use GFP_NOFS */
309static struct inode *ocfs2_alloc_inode(struct super_block *sb)
310{
311 struct ocfs2_inode_info *oi;
312
313 oi = kmem_cache_alloc(ocfs2_inode_cachep, SLAB_NOFS);
314 if (!oi)
315 return NULL;
316
317 return &oi->vfs_inode;
318}
319
320static void ocfs2_destroy_inode(struct inode *inode)
321{
322 kmem_cache_free(ocfs2_inode_cachep, OCFS2_I(inode));
323}
324
325/* From xfs_super.c:xfs_max_file_offset
326 * Copyright (c) 2000-2004 Silicon Graphics, Inc.
327 */
328static unsigned long long ocfs2_max_file_offset(unsigned int blockshift)
329{
330 unsigned int pagefactor = 1;
331 unsigned int bitshift = BITS_PER_LONG - 1;
332
333 /* Figure out maximum filesize, on Linux this can depend on
334 * the filesystem blocksize (on 32 bit platforms).
335 * __block_prepare_write does this in an [unsigned] long...
336 * page->index << (PAGE_CACHE_SHIFT - bbits)
337 * So, for page sized blocks (4K on 32 bit platforms),
338 * this wraps at around 8Tb (hence MAX_LFS_FILESIZE which is
339 * (((u64)PAGE_CACHE_SIZE << (BITS_PER_LONG-1))-1)
340 * but for smaller blocksizes it is less (bbits = log2 bsize).
341 * Note1: get_block_t takes a long (implicit cast from above)
342 * Note2: The Large Block Device (LBD and HAVE_SECTOR_T) patch
343 * can optionally convert the [unsigned] long from above into
344 * an [unsigned] long long.
345 */
346
347#if BITS_PER_LONG == 32
348# if defined(CONFIG_LBD)
349 BUG_ON(sizeof(sector_t) != 8);
350 pagefactor = PAGE_CACHE_SIZE;
351 bitshift = BITS_PER_LONG;
352# else
353 pagefactor = PAGE_CACHE_SIZE >> (PAGE_CACHE_SHIFT - blockshift);
354# endif
355#endif
356
357 return (((unsigned long long)pagefactor) << bitshift) - 1;
358}
359
360static int ocfs2_remount(struct super_block *sb, int *flags, char *data)
361{
362 int incompat_features;
363 int ret = 0;
364 unsigned long parsed_options;
365 struct ocfs2_super *osb = OCFS2_SB(sb);
366
367 if (!ocfs2_parse_options(sb, data, &parsed_options, 1)) {
368 ret = -EINVAL;
369 goto out;
370 }
371
372 if ((osb->s_mount_opt & OCFS2_MOUNT_HB_LOCAL) !=
373 (parsed_options & OCFS2_MOUNT_HB_LOCAL)) {
374 ret = -EINVAL;
375 mlog(ML_ERROR, "Cannot change heartbeat mode on remount\n");
376 goto out;
377 }
378
379 if ((osb->s_mount_opt & OCFS2_MOUNT_DATA_WRITEBACK) !=
380 (parsed_options & OCFS2_MOUNT_DATA_WRITEBACK)) {
381 ret = -EINVAL;
382 mlog(ML_ERROR, "Cannot change data mode on remount\n");
383 goto out;
384 }
385
386 /* We're going to/from readonly mode. */
387 if ((*flags & MS_RDONLY) != (sb->s_flags & MS_RDONLY)) {
388 /* Lock here so the check of HARD_RO and the potential
389 * setting of SOFT_RO is atomic. */
390 spin_lock(&osb->osb_lock);
391 if (osb->osb_flags & OCFS2_OSB_HARD_RO) {
392 mlog(ML_ERROR, "Remount on readonly device is forbidden.\n");
393 ret = -EROFS;
394 goto unlock_osb;
395 }
396
397 if (*flags & MS_RDONLY) {
398 mlog(0, "Going to ro mode.\n");
399 sb->s_flags |= MS_RDONLY;
400 osb->osb_flags |= OCFS2_OSB_SOFT_RO;
401 } else {
402 mlog(0, "Making ro filesystem writeable.\n");
403
404 if (osb->osb_flags & OCFS2_OSB_ERROR_FS) {
405 mlog(ML_ERROR, "Cannot remount RDWR "
406 "filesystem due to previous errors.\n");
407 ret = -EROFS;
408 goto unlock_osb;
409 }
410 incompat_features = OCFS2_HAS_RO_COMPAT_FEATURE(sb, ~OCFS2_FEATURE_RO_COMPAT_SUPP);
411 if (incompat_features) {
412 mlog(ML_ERROR, "Cannot remount RDWR because "
413 "of unsupported optional features "
414 "(%x).\n", incompat_features);
415 ret = -EINVAL;
416 goto unlock_osb;
417 }
418 sb->s_flags &= ~MS_RDONLY;
419 osb->osb_flags &= ~OCFS2_OSB_SOFT_RO;
420 }
421unlock_osb:
422 spin_unlock(&osb->osb_lock);
423 }
424
425 if (!ret) {
426 if (!ocfs2_is_hard_readonly(osb))
427 ocfs2_set_journal_params(osb);
428
429 /* Only save off the new mount options in case of a successful
430 * remount. */
431 osb->s_mount_opt = parsed_options;
432 }
433out:
434 return ret;
435}
436
437static int ocfs2_sb_probe(struct super_block *sb,
438 struct buffer_head **bh,
439 int *sector_size)
440{
441 int status = 0, tmpstat;
442 struct ocfs1_vol_disk_hdr *hdr;
443 struct ocfs2_dinode *di;
444 int blksize;
445
446 *bh = NULL;
447
448 /* may be > 512 */
449 *sector_size = bdev_hardsect_size(sb->s_bdev);
450 if (*sector_size > OCFS2_MAX_BLOCKSIZE) {
451 mlog(ML_ERROR, "Hardware sector size too large: %d (max=%d)\n",
452 *sector_size, OCFS2_MAX_BLOCKSIZE);
453 status = -EINVAL;
454 goto bail;
455 }
456
457 /* Can this really happen? */
458 if (*sector_size < OCFS2_MIN_BLOCKSIZE)
459 *sector_size = OCFS2_MIN_BLOCKSIZE;
460
461 /* check block zero for old format */
462 status = ocfs2_get_sector(sb, bh, 0, *sector_size);
463 if (status < 0) {
464 mlog_errno(status);
465 goto bail;
466 }
467 hdr = (struct ocfs1_vol_disk_hdr *) (*bh)->b_data;
468 if (hdr->major_version == OCFS1_MAJOR_VERSION) {
469 mlog(ML_ERROR, "incompatible version: %u.%u\n",
470 hdr->major_version, hdr->minor_version);
471 status = -EINVAL;
472 }
473 if (memcmp(hdr->signature, OCFS1_VOLUME_SIGNATURE,
474 strlen(OCFS1_VOLUME_SIGNATURE)) == 0) {
475 mlog(ML_ERROR, "incompatible volume signature: %8s\n",
476 hdr->signature);
477 status = -EINVAL;
478 }
479 brelse(*bh);
480 *bh = NULL;
481 if (status < 0) {
482 mlog(ML_ERROR, "This is an ocfs v1 filesystem which must be "
483 "upgraded before mounting with ocfs v2\n");
484 goto bail;
485 }
486
487 /*
488 * Now check at magic offset for 512, 1024, 2048, 4096
489 * blocksizes. 4096 is the maximum blocksize because it is
490 * the minimum clustersize.
491 */
492 status = -EINVAL;
493 for (blksize = *sector_size;
494 blksize <= OCFS2_MAX_BLOCKSIZE;
495 blksize <<= 1) {
496 tmpstat = ocfs2_get_sector(sb, bh,
497 OCFS2_SUPER_BLOCK_BLKNO,
498 blksize);
499 if (tmpstat < 0) {
500 status = tmpstat;
501 mlog_errno(status);
502 goto bail;
503 }
504 di = (struct ocfs2_dinode *) (*bh)->b_data;
505 status = ocfs2_verify_volume(di, *bh, blksize);
506 if (status >= 0)
507 goto bail;
508 brelse(*bh);
509 *bh = NULL;
510 if (status != -EAGAIN)
511 break;
512 }
513
514bail:
515 return status;
516}
517
518static int ocfs2_fill_super(struct super_block *sb, void *data, int silent)
519{
520 struct dentry *root;
521 int status, sector_size;
522 unsigned long parsed_opt;
523 struct inode *inode = NULL;
524 struct ocfs2_super *osb = NULL;
525 struct buffer_head *bh = NULL;
526
527 mlog_entry("%p, %p, %i", sb, data, silent);
528
529 /* for now we only have one cluster/node, make sure we see it
530 * in the heartbeat universe */
531 if (!o2hb_check_local_node_heartbeating()) {
532 status = -EINVAL;
533 goto read_super_error;
534 }
535
536 /* probe for superblock */
537 status = ocfs2_sb_probe(sb, &bh, &sector_size);
538 if (status < 0) {
539 mlog(ML_ERROR, "superblock probe failed!\n");
540 goto read_super_error;
541 }
542
543 status = ocfs2_initialize_super(sb, bh, sector_size);
544 osb = OCFS2_SB(sb);
545 if (status < 0) {
546 mlog_errno(status);
547 goto read_super_error;
548 }
549 brelse(bh);
550 bh = NULL;
551
552 if (!ocfs2_parse_options(sb, data, &parsed_opt, 0)) {
553 status = -EINVAL;
554 goto read_super_error;
555 }
556 osb->s_mount_opt = parsed_opt;
557
558 sb->s_magic = OCFS2_SUPER_MAGIC;
559
560 /* Hard readonly mode only if: bdev_read_only, MS_RDONLY,
561 * heartbeat=none */
562 if (bdev_read_only(sb->s_bdev)) {
563 if (!(sb->s_flags & MS_RDONLY)) {
564 status = -EACCES;
565 mlog(ML_ERROR, "Readonly device detected but readonly "
566 "mount was not specified.\n");
567 goto read_super_error;
568 }
569
570 /* You should not be able to start a local heartbeat
571 * on a readonly device. */
572 if (osb->s_mount_opt & OCFS2_MOUNT_HB_LOCAL) {
573 status = -EROFS;
574 mlog(ML_ERROR, "Local heartbeat specified on readonly "
575 "device.\n");
576 goto read_super_error;
577 }
578
579 status = ocfs2_check_journals_nolocks(osb);
580 if (status < 0) {
581 if (status == -EROFS)
582 mlog(ML_ERROR, "Recovery required on readonly "
583 "file system, but write access is "
584 "unavailable.\n");
585 else
586 mlog_errno(status);
587 goto read_super_error;
588 }
589
590 ocfs2_set_ro_flag(osb, 1);
591
592 printk(KERN_NOTICE "Readonly device detected. No cluster "
593 "services will be utilized for this mount. Recovery "
594 "will be skipped.\n");
595 }
596
597 if (!ocfs2_is_hard_readonly(osb)) {
598 /* If this isn't a hard readonly mount, then we need
599 * to make sure that heartbeat is in a valid state,
600 * and that we mark ourselves soft readonly is -oro
601 * was specified. */
602 if (!(osb->s_mount_opt & OCFS2_MOUNT_HB_LOCAL)) {
603 mlog(ML_ERROR, "No heartbeat for device (%s)\n",
604 sb->s_id);
605 status = -EINVAL;
606 goto read_super_error;
607 }
608
609 if (sb->s_flags & MS_RDONLY)
610 ocfs2_set_ro_flag(osb, 0);
611 }
612
613 osb->osb_debug_root = debugfs_create_dir(osb->uuid_str,
614 ocfs2_debugfs_root);
615 if (!osb->osb_debug_root) {
616 status = -EINVAL;
617 mlog(ML_ERROR, "Unable to create per-mount debugfs root.\n");
618 goto read_super_error;
619 }
620
621 status = ocfs2_mount_volume(sb);
622 if (osb->root_inode)
623 inode = igrab(osb->root_inode);
624
625 if (status < 0)
626 goto read_super_error;
627
628 if (!inode) {
629 status = -EIO;
630 mlog_errno(status);
631 goto read_super_error;
632 }
633
634 root = d_alloc_root(inode);
635 if (!root) {
636 status = -ENOMEM;
637 mlog_errno(status);
638 goto read_super_error;
639 }
640
641 sb->s_root = root;
642
643 ocfs2_complete_mount_recovery(osb);
644
645 printk("ocfs2: Mounting device (%u,%u) on (node %d, slot %d) with %s "
646 "data mode.\n",
647 MAJOR(sb->s_dev), MINOR(sb->s_dev), osb->node_num,
648 osb->slot_num,
649 osb->s_mount_opt & OCFS2_MOUNT_DATA_WRITEBACK ? "writeback" :
650 "ordered");
651
652 atomic_set(&osb->vol_state, VOLUME_MOUNTED);
653 wake_up(&osb->osb_mount_event);
654
655 mlog_exit(status);
656 return status;
657
658read_super_error:
659 if (bh != NULL)
660 brelse(bh);
661
662 if (inode)
663 iput(inode);
664
665 if (osb) {
666 atomic_set(&osb->vol_state, VOLUME_DISABLED);
667 wake_up(&osb->osb_mount_event);
668 ocfs2_dismount_volume(sb, 1);
669 }
670
671 mlog_exit(status);
672 return status;
673}
674
675static struct super_block *ocfs2_get_sb(struct file_system_type *fs_type,
676 int flags,
677 const char *dev_name,
678 void *data)
679{
680 return get_sb_bdev(fs_type, flags, dev_name, data, ocfs2_fill_super);
681}
682
683static struct file_system_type ocfs2_fs_type = {
684 .owner = THIS_MODULE,
685 .name = "ocfs2",
686 .get_sb = ocfs2_get_sb, /* is this called when we mount
687 * the fs? */
688 .kill_sb = kill_block_super, /* set to the generic one
689 * right now, but do we
690 * need to change that? */
691 .fs_flags = FS_REQUIRES_DEV,
692 .next = NULL
693};
694
695static int ocfs2_parse_options(struct super_block *sb,
696 char *options,
697 unsigned long *mount_opt,
698 int is_remount)
699{
700 int status;
701 char *p;
702
703 mlog_entry("remount: %d, options: \"%s\"\n", is_remount,
704 options ? options : "(none)");
705
706 *mount_opt = 0;
707
708 if (!options) {
709 status = 1;
710 goto bail;
711 }
712
713 while ((p = strsep(&options, ",")) != NULL) {
714 int token, option;
715 substring_t args[MAX_OPT_ARGS];
716
717 if (!*p)
718 continue;
719
720 token = match_token(p, tokens, args);
721 switch (token) {
722 case Opt_hb_local:
723 *mount_opt |= OCFS2_MOUNT_HB_LOCAL;
724 break;
725 case Opt_hb_none:
726 *mount_opt &= ~OCFS2_MOUNT_HB_LOCAL;
727 break;
728 case Opt_barrier:
729 if (match_int(&args[0], &option)) {
730 status = 0;
731 goto bail;
732 }
733 if (option)
734 *mount_opt |= OCFS2_MOUNT_BARRIER;
735 else
736 *mount_opt &= ~OCFS2_MOUNT_BARRIER;
737 break;
738 case Opt_intr:
739 *mount_opt &= ~OCFS2_MOUNT_NOINTR;
740 break;
741 case Opt_nointr:
742 *mount_opt |= OCFS2_MOUNT_NOINTR;
743 break;
744 case Opt_err_panic:
745 *mount_opt |= OCFS2_MOUNT_ERRORS_PANIC;
746 break;
747 case Opt_err_ro:
748 *mount_opt &= ~OCFS2_MOUNT_ERRORS_PANIC;
749 break;
750 case Opt_data_ordered:
751 *mount_opt &= ~OCFS2_MOUNT_DATA_WRITEBACK;
752 break;
753 case Opt_data_writeback:
754 *mount_opt |= OCFS2_MOUNT_DATA_WRITEBACK;
755 break;
756 default:
757 mlog(ML_ERROR,
758 "Unrecognized mount option \"%s\" "
759 "or missing value\n", p);
760 status = 0;
761 goto bail;
762 }
763 }
764
765 status = 1;
766
767bail:
768 mlog_exit(status);
769 return status;
770}
771
772static int __init ocfs2_init(void)
773{
774 int status;
775
776 mlog_entry_void();
777
778 ocfs2_print_version();
779
780 if (init_ocfs2_extent_maps())
781 return -ENOMEM;
782
783 status = init_ocfs2_uptodate_cache();
784 if (status < 0) {
785 mlog_errno(status);
786 goto leave;
787 }
788
789 status = ocfs2_initialize_mem_caches();
790 if (status < 0) {
791 mlog_errno(status);
792 goto leave;
793 }
794
795 ocfs2_wq = create_singlethread_workqueue("ocfs2_wq");
796 if (!ocfs2_wq) {
797 status = -ENOMEM;
798 goto leave;
799 }
800
801 spin_lock(&ocfs2_globals_lock);
802 osb_id = 0;
803 spin_unlock(&ocfs2_globals_lock);
804
805 ocfs2_debugfs_root = debugfs_create_dir("ocfs2", NULL);
806 if (!ocfs2_debugfs_root) {
807 status = -EFAULT;
808 mlog(ML_ERROR, "Unable to create ocfs2 debugfs root.\n");
809 }
810
811leave:
812 if (status < 0) {
813 ocfs2_free_mem_caches();
814 exit_ocfs2_uptodate_cache();
815 exit_ocfs2_extent_maps();
816 }
817
818 mlog_exit(status);
819
820 if (status >= 0) {
821 return register_filesystem(&ocfs2_fs_type);
822 } else
823 return -1;
824}
825
826static void __exit ocfs2_exit(void)
827{
828 mlog_entry_void();
829
830 if (ocfs2_wq) {
831 flush_workqueue(ocfs2_wq);
832 destroy_workqueue(ocfs2_wq);
833 }
834
835 debugfs_remove(ocfs2_debugfs_root);
836
837 ocfs2_free_mem_caches();
838
839 unregister_filesystem(&ocfs2_fs_type);
840
841 exit_ocfs2_extent_maps();
842
843 exit_ocfs2_uptodate_cache();
844
845 mlog_exit_void();
846}
847
/*
 * put_super hook: sync the block device, then dismount the volume
 * with mnt_err == 0.
 */
static void ocfs2_put_super(struct super_block *sb)
{
	mlog_entry("(0x%p)\n", sb);

	ocfs2_sync_blockdev(sb);

	/* 0: this is a regular unmount, not a failed mount. */
	ocfs2_dismount_volume(sb, 0);

	mlog_exit_void();
}
857
858static int ocfs2_statfs(struct super_block *sb, struct kstatfs *buf)
859{
860 struct ocfs2_super *osb;
861 u32 numbits, freebits;
862 int status;
863 struct ocfs2_dinode *bm_lock;
864 struct buffer_head *bh = NULL;
865 struct inode *inode = NULL;
866
867 mlog_entry("(%p, %p)\n", sb, buf);
868
869 osb = OCFS2_SB(sb);
870
871 inode = ocfs2_get_system_file_inode(osb,
872 GLOBAL_BITMAP_SYSTEM_INODE,
873 OCFS2_INVALID_SLOT);
874 if (!inode) {
875 mlog(ML_ERROR, "failed to get bitmap inode\n");
876 status = -EIO;
877 goto bail;
878 }
879
880 status = ocfs2_meta_lock(inode, NULL, &bh, 0);
881 if (status < 0) {
882 mlog_errno(status);
883 goto bail;
884 }
885
886 bm_lock = (struct ocfs2_dinode *) bh->b_data;
887
888 numbits = le32_to_cpu(bm_lock->id1.bitmap1.i_total);
889 freebits = numbits - le32_to_cpu(bm_lock->id1.bitmap1.i_used);
890
891 buf->f_type = OCFS2_SUPER_MAGIC;
892 buf->f_bsize = sb->s_blocksize;
893 buf->f_namelen = OCFS2_MAX_FILENAME_LEN;
894 buf->f_blocks = ((sector_t) numbits) *
895 (osb->s_clustersize >> osb->sb->s_blocksize_bits);
896 buf->f_bfree = ((sector_t) freebits) *
897 (osb->s_clustersize >> osb->sb->s_blocksize_bits);
898 buf->f_bavail = buf->f_bfree;
899 buf->f_files = numbits;
900 buf->f_ffree = freebits;
901
902 brelse(bh);
903
904 ocfs2_meta_unlock(inode, 0);
905 status = 0;
906bail:
907 if (inode)
908 iput(inode);
909
910 mlog_exit(status);
911
912 return status;
913}
914
915static void ocfs2_inode_init_once(void *data,
916 kmem_cache_t *cachep,
917 unsigned long flags)
918{
919 struct ocfs2_inode_info *oi = data;
920
921 if ((flags & (SLAB_CTOR_VERIFY|SLAB_CTOR_CONSTRUCTOR)) ==
922 SLAB_CTOR_CONSTRUCTOR) {
923 oi->ip_flags = 0;
924 oi->ip_open_count = 0;
925 spin_lock_init(&oi->ip_lock);
926 ocfs2_extent_map_init(&oi->vfs_inode);
927 INIT_LIST_HEAD(&oi->ip_handle_list);
928 INIT_LIST_HEAD(&oi->ip_io_markers);
929 oi->ip_handle = NULL;
930 oi->ip_created_trans = 0;
931 oi->ip_last_trans = 0;
932 oi->ip_dir_start_lookup = 0;
933
934 init_rwsem(&oi->ip_alloc_sem);
935 init_MUTEX(&(oi->ip_io_sem));
936
937 oi->ip_blkno = 0ULL;
938 oi->ip_clusters = 0;
939
940 ocfs2_lock_res_init_once(&oi->ip_rw_lockres);
941 ocfs2_lock_res_init_once(&oi->ip_meta_lockres);
942 ocfs2_lock_res_init_once(&oi->ip_data_lockres);
943
944 ocfs2_metadata_cache_init(&oi->vfs_inode);
945
946 inode_init_once(&oi->vfs_inode);
947 }
948}
949
950static int ocfs2_initialize_mem_caches(void)
951{
952 ocfs2_inode_cachep = kmem_cache_create("ocfs2_inode_cache",
953 sizeof(struct ocfs2_inode_info),
954 0, SLAB_HWCACHE_ALIGN|SLAB_RECLAIM_ACCOUNT,
955 ocfs2_inode_init_once, NULL);
956 if (!ocfs2_inode_cachep)
957 return -ENOMEM;
958
959 ocfs2_lock_cache = kmem_cache_create("ocfs2_lock",
960 sizeof(struct ocfs2_journal_lock),
961 0,
962 SLAB_NO_REAP|SLAB_HWCACHE_ALIGN,
963 NULL, NULL);
964 if (!ocfs2_lock_cache)
965 return -ENOMEM;
966
967 return 0;
968}
969
970static void ocfs2_free_mem_caches(void)
971{
972 if (ocfs2_inode_cachep)
973 kmem_cache_destroy(ocfs2_inode_cachep);
974 if (ocfs2_lock_cache)
975 kmem_cache_destroy(ocfs2_lock_cache);
976
977 ocfs2_inode_cachep = NULL;
978 ocfs2_lock_cache = NULL;
979}
980
981static int ocfs2_get_sector(struct super_block *sb,
982 struct buffer_head **bh,
983 int block,
984 int sect_size)
985{
986 if (!sb_set_blocksize(sb, sect_size)) {
987 mlog(ML_ERROR, "unable to set blocksize\n");
988 return -EIO;
989 }
990
991 *bh = sb_getblk(sb, block);
992 if (!*bh) {
993 mlog_errno(-EIO);
994 return -EIO;
995 }
996 lock_buffer(*bh);
997 if (!buffer_dirty(*bh))
998 clear_buffer_uptodate(*bh);
999 unlock_buffer(*bh);
1000 ll_rw_block(READ, 1, bh);
1001 wait_on_buffer(*bh);
1002 return 0;
1003}
1004
1005/* ocfs2 1.0 only allows one cluster and node identity per kernel image. */
1006static int ocfs2_fill_local_node_info(struct ocfs2_super *osb)
1007{
1008 int status;
1009
1010 /* XXX hold a ref on the node while mounte? easy enough, if
1011 * desirable. */
1012 osb->node_num = o2nm_this_node();
1013 if (osb->node_num == O2NM_MAX_NODES) {
1014 mlog(ML_ERROR, "could not find this host's node number\n");
1015 status = -ENOENT;
1016 goto bail;
1017 }
1018
1019 mlog(ML_NOTICE, "I am node %d\n", osb->node_num);
1020
1021 status = 0;
1022bail:
1023 return status;
1024}
1025
/*
 * Bring a volume online in the cluster.
 *
 * A hard-readonly volume needs none of this and returns 0 at once.
 * Otherwise, in order: look up the local node number, register the
 * heartbeat callbacks, initialize the DLM, register the network
 * handlers, take the super block lock, find a slot, populate the
 * mounted map, load the node-local system inodes, check the volume,
 * initialize the truncate log and request the mount vote.  The first
 * failing step aborts the sequence.
 *
 * The super block lock, once taken, is dropped before returning.
 * Returns 0 on success or the negative errno of the failed step.
 */
static int ocfs2_mount_volume(struct super_block *sb)
{
	struct ocfs2_super *osb = OCFS2_SB(sb);
	int super_locked = 0;
	int status = 0;

	mlog_entry_void();

	if (ocfs2_is_hard_readonly(osb))
		goto out;

	status = ocfs2_fill_local_node_info(osb);
	if (status < 0) {
		mlog_errno(status);
		goto out;
	}

	status = ocfs2_register_hb_callbacks(osb);
	if (status < 0) {
		mlog_errno(status);
		goto out;
	}

	status = ocfs2_dlm_init(osb);
	if (status < 0) {
		mlog_errno(status);
		goto out;
	}

	/* requires vote_thread to be running. */
	status = ocfs2_register_net_handlers(osb);
	if (status < 0) {
		mlog_errno(status);
		goto out;
	}

	status = ocfs2_super_lock(osb, 1);
	if (status < 0) {
		mlog_errno(status);
		goto out;
	}
	super_locked = 1;

	/* This will load up the node map and add ourselves to it. */
	status = ocfs2_find_slot(osb);
	if (status < 0) {
		mlog_errno(status);
		goto out;
	}

	ocfs2_populate_mounted_map(osb);

	/* load all node-local system inodes */
	status = ocfs2_init_local_system_inodes(osb);
	if (status < 0) {
		mlog_errno(status);
		goto out;
	}

	status = ocfs2_check_volume(osb);
	if (status < 0) {
		mlog_errno(status);
		goto out;
	}

	status = ocfs2_truncate_log_init(osb);
	if (status < 0) {
		mlog_errno(status);
		goto out;
	}

	/* This should be sent *after* we recovered our journal as it
	 * will cause other nodes to unmark us as needing
	 * recovery. However, we need to send it *before* dropping the
	 * super block lock as otherwise their recovery threads might
	 * try to clean us up while we're live! */
	status = ocfs2_request_mount_vote(osb);
	if (status < 0)
		mlog_errno(status);

out:
	if (super_locked)
		ocfs2_super_unlock(osb, 1);

	mlog_exit(status);
	return status;
}
1113
1114/* we can't grab the goofy sem lock from inside wait_event, so we use
1115 * memory barriers to make sure that we'll see the null task before
1116 * being woken up */
1117static int ocfs2_recovery_thread_running(struct ocfs2_super *osb)
1118{
1119 mb();
1120 return osb->recovery_thread_task != NULL;
1121}
1122
1123static void ocfs2_dismount_volume(struct super_block *sb, int mnt_err)
1124{
1125 int tmp;
1126 struct ocfs2_super *osb = NULL;
1127
1128 mlog_entry("(0x%p)\n", sb);
1129
1130 BUG_ON(!sb);
1131 osb = OCFS2_SB(sb);
1132 BUG_ON(!osb);
1133
1134 ocfs2_shutdown_local_alloc(osb);
1135
1136 ocfs2_truncate_log_shutdown(osb);
1137
1138 /* disable any new recovery threads and wait for any currently
1139 * running ones to exit. Do this before setting the vol_state. */
1140 down(&osb->recovery_lock);
1141 osb->disable_recovery = 1;
1142 up(&osb->recovery_lock);
1143 wait_event(osb->recovery_event, !ocfs2_recovery_thread_running(osb));
1144
1145 /* At this point, we know that no more recovery threads can be
1146 * launched, so wait for any recovery completion work to
1147 * complete. */
1148 flush_workqueue(ocfs2_wq);
1149
1150 ocfs2_journal_shutdown(osb);
1151
1152 ocfs2_sync_blockdev(sb);
1153
1154 /* No dlm means we've failed during mount, so skip all the
1155 * steps which depended on that to complete. */
1156 if (osb->dlm) {
1157 tmp = ocfs2_super_lock(osb, 1);
1158 if (tmp < 0) {
1159 mlog_errno(tmp);
1160 return;
1161 }
1162
1163 tmp = ocfs2_request_umount_vote(osb);
1164 if (tmp < 0)
1165 mlog_errno(tmp);
1166
1167 if (osb->slot_num != OCFS2_INVALID_SLOT)
1168 ocfs2_put_slot(osb);
1169
1170 ocfs2_super_unlock(osb, 1);
1171 }
1172
1173 ocfs2_release_system_inodes(osb);
1174
1175 if (osb->dlm) {
1176 ocfs2_unregister_net_handlers(osb);
1177
1178 ocfs2_dlm_shutdown(osb);
1179 }
1180
1181 ocfs2_clear_hb_callbacks(osb);
1182
1183 debugfs_remove(osb->osb_debug_root);
1184
1185 if (!mnt_err)
1186 ocfs2_stop_heartbeat(osb);
1187
1188 atomic_set(&osb->vol_state, VOLUME_DISMOUNTED);
1189
1190 printk("ocfs2: Unmounting device (%u,%u) on (node %d)\n",
1191 MAJOR(osb->sb->s_dev), MINOR(osb->sb->s_dev), osb->node_num);
1192
1193 ocfs2_delete_osb(osb);
1194 kfree(osb);
1195 sb->s_dev = 0;
1196 sb->s_fs_info = NULL;
1197}
1198
1199static int ocfs2_setup_osb_uuid(struct ocfs2_super *osb, const unsigned char *uuid,
1200 unsigned uuid_bytes)
1201{
1202 int i, ret;
1203 char *ptr;
1204
1205 BUG_ON(uuid_bytes != OCFS2_VOL_UUID_LEN);
1206
1207 osb->uuid_str = kcalloc(1, OCFS2_VOL_UUID_LEN * 2 + 1, GFP_KERNEL);
1208 if (osb->uuid_str == NULL)
1209 return -ENOMEM;
1210
1211 memcpy(osb->uuid, uuid, OCFS2_VOL_UUID_LEN);
1212
1213 for (i = 0, ptr = osb->uuid_str; i < OCFS2_VOL_UUID_LEN; i++) {
1214 /* print with null */
1215 ret = snprintf(ptr, 3, "%02X", uuid[i]);
1216 if (ret != 2) /* drop super cleans up */
1217 return -EINVAL;
1218 /* then only advance past the last char */
1219 ptr += 2;
1220 }
1221
1222 return 0;
1223}
1224
1225static int ocfs2_initialize_super(struct super_block *sb,
1226 struct buffer_head *bh,
1227 int sector_size)
1228{
1229 int status = 0;
1230 int i;
1231 struct ocfs2_dinode *di = NULL;
1232 struct inode *inode = NULL;
1233 struct buffer_head *bitmap_bh = NULL;
1234 struct ocfs2_journal *journal;
1235 __le32 uuid_net_key;
1236 struct ocfs2_super *osb;
1237
1238 mlog_entry_void();
1239
1240 osb = kcalloc(1, sizeof(struct ocfs2_super), GFP_KERNEL);
1241 if (!osb) {
1242 status = -ENOMEM;
1243 mlog_errno(status);
1244 goto bail;
1245 }
1246
1247 sb->s_fs_info = osb;
1248 sb->s_op = &ocfs2_sops;
1249 sb->s_export_op = &ocfs2_export_ops;
1250 sb->s_flags |= MS_NOATIME;
1251 /* this is needed to support O_LARGEFILE */
1252 sb->s_maxbytes = ocfs2_max_file_offset(sb->s_blocksize_bits);
1253
1254 osb->sb = sb;
1255 /* Save off for ocfs2_rw_direct */
1256 osb->s_sectsize_bits = blksize_bits(sector_size);
1257 if (!osb->s_sectsize_bits)
1258 BUG();
1259
1260 osb->net_response_ids = 0;
1261 spin_lock_init(&osb->net_response_lock);
1262 INIT_LIST_HEAD(&osb->net_response_list);
1263
1264 INIT_LIST_HEAD(&osb->osb_net_handlers);
1265 init_waitqueue_head(&osb->recovery_event);
1266 spin_lock_init(&osb->vote_task_lock);
1267 init_waitqueue_head(&osb->vote_event);
1268 osb->vote_work_sequence = 0;
1269 osb->vote_wake_sequence = 0;
1270 INIT_LIST_HEAD(&osb->blocked_lock_list);
1271 osb->blocked_lock_count = 0;
1272 INIT_LIST_HEAD(&osb->vote_list);
1273 spin_lock_init(&osb->osb_lock);
1274
1275 atomic_set(&osb->alloc_stats.moves, 0);
1276 atomic_set(&osb->alloc_stats.local_data, 0);
1277 atomic_set(&osb->alloc_stats.bitmap_data, 0);
1278 atomic_set(&osb->alloc_stats.bg_allocs, 0);
1279 atomic_set(&osb->alloc_stats.bg_extends, 0);
1280
1281 ocfs2_init_node_maps(osb);
1282
1283 snprintf(osb->dev_str, sizeof(osb->dev_str), "%u,%u",
1284 MAJOR(osb->sb->s_dev), MINOR(osb->sb->s_dev));
1285
1286 init_MUTEX(&osb->recovery_lock);
1287
1288 osb->disable_recovery = 0;
1289 osb->recovery_thread_task = NULL;
1290
1291 init_waitqueue_head(&osb->checkpoint_event);
1292 atomic_set(&osb->needs_checkpoint, 0);
1293
1294 osb->node_num = O2NM_INVALID_NODE_NUM;
1295 osb->slot_num = OCFS2_INVALID_SLOT;
1296
1297 osb->local_alloc_state = OCFS2_LA_UNUSED;
1298 osb->local_alloc_bh = NULL;
1299
1300 ocfs2_setup_hb_callbacks(osb);
1301
1302 init_waitqueue_head(&osb->osb_mount_event);
1303
1304 osb->vol_label = kmalloc(OCFS2_MAX_VOL_LABEL_LEN, GFP_KERNEL);
1305 if (!osb->vol_label) {
1306 mlog(ML_ERROR, "unable to alloc vol label\n");
1307 status = -ENOMEM;
1308 goto bail;
1309 }
1310
1311 osb->uuid = kmalloc(OCFS2_VOL_UUID_LEN, GFP_KERNEL);
1312 if (!osb->uuid) {
1313 mlog(ML_ERROR, "unable to alloc uuid\n");
1314 status = -ENOMEM;
1315 goto bail;
1316 }
1317
1318 di = (struct ocfs2_dinode *)bh->b_data;
1319
1320 osb->max_slots = le16_to_cpu(di->id2.i_super.s_max_slots);
1321 if (osb->max_slots > OCFS2_MAX_SLOTS || osb->max_slots == 0) {
1322 mlog(ML_ERROR, "Invalid number of node slots (%u)\n",
1323 osb->max_slots);
1324 status = -EINVAL;
1325 goto bail;
1326 }
1327 mlog(ML_NOTICE, "max_slots for this device: %u\n", osb->max_slots);
1328
1329 osb->s_feature_compat =
1330 le32_to_cpu(OCFS2_RAW_SB(di)->s_feature_compat);
1331 osb->s_feature_ro_compat =
1332 le32_to_cpu(OCFS2_RAW_SB(di)->s_feature_ro_compat);
1333 osb->s_feature_incompat =
1334 le32_to_cpu(OCFS2_RAW_SB(di)->s_feature_incompat);
1335
1336 if ((i = OCFS2_HAS_INCOMPAT_FEATURE(osb->sb, ~OCFS2_FEATURE_INCOMPAT_SUPP))) {
1337 mlog(ML_ERROR, "couldn't mount because of unsupported "
1338 "optional features (%x).\n", i);
1339 status = -EINVAL;
1340 goto bail;
1341 }
1342 if (!(osb->sb->s_flags & MS_RDONLY) &&
1343 (i = OCFS2_HAS_RO_COMPAT_FEATURE(osb->sb, ~OCFS2_FEATURE_RO_COMPAT_SUPP))) {
1344 mlog(ML_ERROR, "couldn't mount RDWR because of "
1345 "unsupported optional features (%x).\n", i);
1346 status = -EINVAL;
1347 goto bail;
1348 }
1349
1350 get_random_bytes(&osb->s_next_generation, sizeof(u32));
1351
1352 /* FIXME
1353 * This should be done in ocfs2_journal_init(), but unknown
1354 * ordering issues will cause the filesystem to crash.
1355 * If anyone wants to figure out what part of the code
1356 * refers to osb->journal before ocfs2_journal_init() is run,
1357 * be my guest.
1358 */
1359 /* initialize our journal structure */
1360
1361 journal = kcalloc(1, sizeof(struct ocfs2_journal), GFP_KERNEL);
1362 if (!journal) {
1363 mlog(ML_ERROR, "unable to alloc journal\n");
1364 status = -ENOMEM;
1365 goto bail;
1366 }
1367 osb->journal = journal;
1368 journal->j_osb = osb;
1369
1370 atomic_set(&journal->j_num_trans, 0);
1371 init_rwsem(&journal->j_trans_barrier);
1372 init_waitqueue_head(&journal->j_checkpointed);
1373 spin_lock_init(&journal->j_lock);
1374 journal->j_trans_id = (unsigned long) 1;
1375 INIT_LIST_HEAD(&journal->j_la_cleanups);
1376 INIT_WORK(&journal->j_recovery_work, ocfs2_complete_recovery, osb);
1377 journal->j_state = OCFS2_JOURNAL_FREE;
1378
1379 /* get some pseudo constants for clustersize bits */
1380 osb->s_clustersize_bits =
1381 le32_to_cpu(di->id2.i_super.s_clustersize_bits);
1382 osb->s_clustersize = 1 << osb->s_clustersize_bits;
1383 mlog(0, "clusterbits=%d\n", osb->s_clustersize_bits);
1384
1385 if (osb->s_clustersize < OCFS2_MIN_CLUSTERSIZE ||
1386 osb->s_clustersize > OCFS2_MAX_CLUSTERSIZE) {
1387 mlog(ML_ERROR, "Volume has invalid cluster size (%d)\n",
1388 osb->s_clustersize);
1389 status = -EINVAL;
1390 goto bail;
1391 }
1392
1393 if (ocfs2_clusters_to_blocks(osb->sb, le32_to_cpu(di->i_clusters) - 1)
1394 > (u32)~0UL) {
1395 mlog(ML_ERROR, "Volume might try to write to blocks beyond "
1396 "what jbd can address in 32 bits.\n");
1397 status = -EINVAL;
1398 goto bail;
1399 }
1400
1401 if (ocfs2_setup_osb_uuid(osb, di->id2.i_super.s_uuid,
1402 sizeof(di->id2.i_super.s_uuid))) {
1403 mlog(ML_ERROR, "Out of memory trying to setup our uuid.\n");
1404 status = -ENOMEM;
1405 goto bail;
1406 }
1407
1408 memcpy(&uuid_net_key, &osb->uuid[i], sizeof(osb->net_key));
1409 osb->net_key = le32_to_cpu(uuid_net_key);
1410
1411 strncpy(osb->vol_label, di->id2.i_super.s_label, 63);
1412 osb->vol_label[63] = '\0';
1413 osb->root_blkno = le64_to_cpu(di->id2.i_super.s_root_blkno);
1414 osb->system_dir_blkno = le64_to_cpu(di->id2.i_super.s_system_dir_blkno);
1415 osb->first_cluster_group_blkno =
1416 le64_to_cpu(di->id2.i_super.s_first_cluster_group);
1417 osb->fs_generation = le32_to_cpu(di->i_fs_generation);
1418 mlog(0, "vol_label: %s\n", osb->vol_label);
1419 mlog(0, "uuid: %s\n", osb->uuid_str);
1420 mlog(0, "root_blkno=%"MLFu64", system_dir_blkno=%"MLFu64"\n",
1421 osb->root_blkno, osb->system_dir_blkno);
1422
1423 osb->osb_dlm_debug = ocfs2_new_dlm_debug();
1424 if (!osb->osb_dlm_debug) {
1425 status = -ENOMEM;
1426 mlog_errno(status);
1427 goto bail;
1428 }
1429
1430 atomic_set(&osb->vol_state, VOLUME_INIT);
1431
1432 /* load root, system_dir, and all global system inodes */
1433 status = ocfs2_init_global_system_inodes(osb);
1434 if (status < 0) {
1435 mlog_errno(status);
1436 goto bail;
1437 }
1438
1439 /*
1440 * global bitmap
1441 */
1442 inode = ocfs2_get_system_file_inode(osb, GLOBAL_BITMAP_SYSTEM_INODE,
1443 OCFS2_INVALID_SLOT);
1444 if (!inode) {
1445 status = -EINVAL;
1446 mlog_errno(status);
1447 goto bail;
1448 }
1449
1450 osb->bitmap_blkno = OCFS2_I(inode)->ip_blkno;
1451
1452 status = ocfs2_read_block(osb, osb->bitmap_blkno, &bitmap_bh, 0,
1453 inode);
1454 iput(inode);
1455 if (status < 0) {
1456 mlog_errno(status);
1457 goto bail;
1458 }
1459
1460 di = (struct ocfs2_dinode *) bitmap_bh->b_data;
1461 osb->bitmap_cpg = le16_to_cpu(di->id2.i_chain.cl_cpg);
1462 osb->num_clusters = le32_to_cpu(di->id1.bitmap1.i_total);
1463 brelse(bitmap_bh);
1464 mlog(0, "cluster bitmap inode: %"MLFu64", clusters per group: %u\n",
1465 osb->bitmap_blkno, osb->bitmap_cpg);
1466
1467 status = ocfs2_init_slot_info(osb);
1468 if (status < 0) {
1469 mlog_errno(status);
1470 goto bail;
1471 }
1472
1473 /* Link this osb onto the global linked list of all osb structures. */
1474 /* The Global Link List is mainted for the whole driver . */
1475 spin_lock(&ocfs2_globals_lock);
1476 osb->osb_id = osb_id;
1477 if (osb_id < OCFS2_MAX_OSB_ID)
1478 osb_id++;
1479 else {
1480 mlog(ML_ERROR, "Too many volumes mounted\n");
1481 status = -ENOMEM;
1482 }
1483 spin_unlock(&ocfs2_globals_lock);
1484
1485bail:
1486 mlog_exit(status);
1487 return status;
1488}
1489
1490/*
1491 * will return: -EAGAIN if it is ok to keep searching for superblocks
1492 * -EINVAL if there is a bad superblock
1493 * 0 on success
1494 */
1495static int ocfs2_verify_volume(struct ocfs2_dinode *di,
1496 struct buffer_head *bh,
1497 u32 blksz)
1498{
1499 int status = -EAGAIN;
1500
1501 mlog_entry_void();
1502
1503 if (memcmp(di->i_signature, OCFS2_SUPER_BLOCK_SIGNATURE,
1504 strlen(OCFS2_SUPER_BLOCK_SIGNATURE)) == 0) {
1505 status = -EINVAL;
1506 if ((1 << le32_to_cpu(di->id2.i_super.s_blocksize_bits)) != blksz) {
1507 mlog(ML_ERROR, "found superblock with incorrect block "
1508 "size: found %u, should be %u\n",
1509 1 << le32_to_cpu(di->id2.i_super.s_blocksize_bits),
1510 blksz);
1511 } else if (le16_to_cpu(di->id2.i_super.s_major_rev_level) !=
1512 OCFS2_MAJOR_REV_LEVEL ||
1513 le16_to_cpu(di->id2.i_super.s_minor_rev_level) !=
1514 OCFS2_MINOR_REV_LEVEL) {
1515 mlog(ML_ERROR, "found superblock with bad version: "
1516 "found %u.%u, should be %u.%u\n",
1517 le16_to_cpu(di->id2.i_super.s_major_rev_level),
1518 le16_to_cpu(di->id2.i_super.s_minor_rev_level),
1519 OCFS2_MAJOR_REV_LEVEL,
1520 OCFS2_MINOR_REV_LEVEL);
1521 } else if (bh->b_blocknr != le64_to_cpu(di->i_blkno)) {
1522 mlog(ML_ERROR, "bad block number on superblock: "
1523 "found %"MLFu64", should be %llu\n",
1524 di->i_blkno, (unsigned long long)bh->b_blocknr);
1525 } else if (le32_to_cpu(di->id2.i_super.s_clustersize_bits) < 12 ||
1526 le32_to_cpu(di->id2.i_super.s_clustersize_bits) > 20) {
1527 mlog(ML_ERROR, "bad cluster size found: %u\n",
1528 1 << le32_to_cpu(di->id2.i_super.s_clustersize_bits));
1529 } else if (!le64_to_cpu(di->id2.i_super.s_root_blkno)) {
1530 mlog(ML_ERROR, "bad root_blkno: 0\n");
1531 } else if (!le64_to_cpu(di->id2.i_super.s_system_dir_blkno)) {
1532 mlog(ML_ERROR, "bad system_dir_blkno: 0\n");
1533 } else if (le16_to_cpu(di->id2.i_super.s_max_slots) > OCFS2_MAX_SLOTS) {
1534 mlog(ML_ERROR,
1535 "Superblock slots found greater than file system "
1536 "maximum: found %u, max %u\n",
1537 le16_to_cpu(di->id2.i_super.s_max_slots),
1538 OCFS2_MAX_SLOTS);
1539 } else {
1540 /* found it! */
1541 status = 0;
1542 }
1543 }
1544
1545 mlog_exit(status);
1546 return status;
1547}
1548
1549static int ocfs2_check_volume(struct ocfs2_super *osb)
1550{
1551 int status = 0;
1552 int dirty;
1553 struct ocfs2_dinode *local_alloc = NULL; /* only used if we
1554 * recover
1555 * ourselves. */
1556
1557 mlog_entry_void();
1558
1559 /* Init our journal object. */
1560 status = ocfs2_journal_init(osb->journal, &dirty);
1561 if (status < 0) {
1562 mlog(ML_ERROR, "Could not initialize journal!\n");
1563 goto finally;
1564 }
1565
1566 /* If the journal was unmounted cleanly then we don't want to
1567 * recover anything. Otherwise, journal_load will do that
1568 * dirty work for us :) */
1569 if (!dirty) {
1570 status = ocfs2_journal_wipe(osb->journal, 0);
1571 if (status < 0) {
1572 mlog_errno(status);
1573 goto finally;
1574 }
1575 } else {
1576 mlog(ML_NOTICE, "File system was not unmounted cleanly, "
1577 "recovering volume.\n");
1578 }
1579
1580 /* will play back anything left in the journal. */
1581 ocfs2_journal_load(osb->journal);
1582
1583 if (dirty) {
1584 /* recover my local alloc if we didn't unmount cleanly. */
1585 status = ocfs2_begin_local_alloc_recovery(osb,
1586 osb->slot_num,
1587 &local_alloc);
1588 if (status < 0) {
1589 mlog_errno(status);
1590 goto finally;
1591 }
1592 /* we complete the recovery process after we've marked
1593 * ourselves as mounted. */
1594 }
1595
1596 mlog(0, "Journal loaded.\n");
1597
1598 status = ocfs2_load_local_alloc(osb);
1599 if (status < 0) {
1600 mlog_errno(status);
1601 goto finally;
1602 }
1603
1604 if (dirty) {
1605 /* Recovery will be completed after we've mounted the
1606 * rest of the volume. */
1607 osb->dirty = 1;
1608 osb->local_alloc_copy = local_alloc;
1609 local_alloc = NULL;
1610 }
1611
1612 /* go through each journal, trylock it and if you get the
1613 * lock, and it's marked as dirty, set the bit in the recover
1614 * map and launch a recovery thread for it. */
1615 status = ocfs2_mark_dead_nodes(osb);
1616 if (status < 0)
1617 mlog_errno(status);
1618
1619finally:
1620 if (local_alloc)
1621 kfree(local_alloc);
1622
1623 mlog_exit(status);
1624 return status;
1625}
1626
1627/*
1628 * The routine gets called from dismount or close whenever a dismount on
1629 * volume is requested and the osb open count becomes 1.
1630 * It will remove the osb from the global list and also free up all the
1631 * initialized resources and fileobject.
1632 */
1633static void ocfs2_delete_osb(struct ocfs2_super *osb)
1634{
1635 mlog_entry_void();
1636
1637 /* This function assumes that the caller has the main osb resource */
1638
1639 if (osb->slot_info)
1640 ocfs2_free_slot_info(osb->slot_info);
1641
1642 /* FIXME
1643 * This belongs in journal shutdown, but because we have to
1644 * allocate osb->journal at the start of ocfs2_initalize_osb(),
1645 * we free it here.
1646 */
1647 kfree(osb->journal);
1648 if (osb->local_alloc_copy)
1649 kfree(osb->local_alloc_copy);
1650 kfree(osb->uuid_str);
1651 ocfs2_put_dlm_debug(osb->osb_dlm_debug);
1652 memset(osb, 0, sizeof(struct ocfs2_super));
1653
1654 mlog_exit_void();
1655}
1656
1657/* Put OCFS2 into a readonly state, or (if the user specifies it),
1658 * panic(). We do not support continue-on-error operation. */
1659static void ocfs2_handle_error(struct super_block *sb)
1660{
1661 struct ocfs2_super *osb = OCFS2_SB(sb);
1662
1663 if (osb->s_mount_opt & OCFS2_MOUNT_ERRORS_PANIC)
1664 panic("OCFS2: (device %s): panic forced after error\n",
1665 sb->s_id);
1666
1667 ocfs2_set_osb_flag(osb, OCFS2_OSB_ERROR_FS);
1668
1669 if (sb->s_flags & MS_RDONLY &&
1670 (ocfs2_is_soft_readonly(osb) ||
1671 ocfs2_is_hard_readonly(osb)))
1672 return;
1673
1674 printk(KERN_CRIT "File system is now read-only due to the potential "
1675 "of on-disk corruption. Please run fsck.ocfs2 once the file "
1676 "system is unmounted.\n");
1677 sb->s_flags |= MS_RDONLY;
1678 ocfs2_set_ro_flag(osb, 0);
1679}
1680
1681static char error_buf[1024];
1682
1683void __ocfs2_error(struct super_block *sb,
1684 const char *function,
1685 const char *fmt, ...)
1686{
1687 va_list args;
1688
1689 va_start(args, fmt);
1690 vsprintf(error_buf, fmt, args);
1691 va_end(args);
1692
1693 /* Not using mlog here because we want to show the actual
1694 * function the error came from. */
1695 printk(KERN_CRIT "OCFS2: ERROR (device %s): %s: %s\n",
1696 sb->s_id, function, error_buf);
1697
1698 ocfs2_handle_error(sb);
1699}
1700
1701/* Handle critical errors. This is intentionally more drastic than
1702 * ocfs2_handle_error, so we only use for things like journal errors,
1703 * etc. */
1704void __ocfs2_abort(struct super_block* sb,
1705 const char *function,
1706 const char *fmt, ...)
1707{
1708 va_list args;
1709
1710 va_start(args, fmt);
1711 vsprintf(error_buf, fmt, args);
1712 va_end(args);
1713
1714 printk(KERN_CRIT "OCFS2: abort (device %s): %s: %s\n",
1715 sb->s_id, function, error_buf);
1716
1717 /* We don't have the cluster support yet to go straight to
1718 * hard readonly in here. Until then, we want to keep
1719 * ocfs2_abort() so that we can at least mark critical
1720 * errors.
1721 *
1722 * TODO: This should abort the journal and alert other nodes
1723 * that our slot needs recovery. */
1724
1725 /* Force a panic(). This stinks, but it's better than letting
1726 * things continue without having a proper hard readonly
1727 * here. */
1728 OCFS2_SB(sb)->s_mount_opt |= OCFS2_MOUNT_ERRORS_PANIC;
1729 ocfs2_handle_error(sb);
1730}
1731
1732module_init(ocfs2_init);
1733module_exit(ocfs2_exit);
diff --git a/fs/ocfs2/super.h b/fs/ocfs2/super.h
new file mode 100644
index 000000000000..c564177dfbdc
--- /dev/null
+++ b/fs/ocfs2/super.h
@@ -0,0 +1,44 @@
1/* -*- mode: c; c-basic-offset: 8; -*-
2 * vim: noexpandtab sw=8 ts=8 sts=0:
3 *
4 * super.h
5 *
6 * Function prototypes
7 *
8 * Copyright (C) 2002, 2004 Oracle. All rights reserved.
9 *
10 * This program is free software; you can redistribute it and/or
11 * modify it under the terms of the GNU General Public
12 * License as published by the Free Software Foundation; either
13 * version 2 of the License, or (at your option) any later version.
14 *
15 * This program is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
18 * General Public License for more details.
19 *
20 * You should have received a copy of the GNU General Public
21 * License along with this program; if not, write to the
22 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
23 * Boston, MA 021110-1307, USA.
24 */
25
26#ifndef OCFS2_SUPER_H
27#define OCFS2_SUPER_H
28
29extern struct workqueue_struct *ocfs2_wq;
30
31int ocfs2_publish_get_mount_state(struct ocfs2_super *osb,
32 int node_num);
33
34void __ocfs2_error(struct super_block *sb,
35 const char *function,
36 const char *fmt, ...);
37#define ocfs2_error(sb, fmt, args...) __ocfs2_error(sb, __PRETTY_FUNCTION__, fmt, ##args)
38
39void __ocfs2_abort(struct super_block *sb,
40 const char *function,
41 const char *fmt, ...);
42#define ocfs2_abort(sb, fmt, args...) __ocfs2_abort(sb, __PRETTY_FUNCTION__, fmt, ##args)
43
44#endif /* OCFS2_SUPER_H */
diff --git a/fs/ocfs2/symlink.c b/fs/ocfs2/symlink.c
new file mode 100644
index 000000000000..f6986bd79e75
--- /dev/null
+++ b/fs/ocfs2/symlink.c
@@ -0,0 +1,180 @@
1/* -*- mode: c; c-basic-offset: 8; -*-
2 * vim: noexpandtab sw=8 ts=8 sts=0:
3 *
4 * linux/cluster/ssi/cfs/symlink.c
5 *
6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public License as
8 * published by the Free Software Foundation; either version 2 of
9 * the License, or (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE
14 * or NON INFRINGEMENT. See the GNU General Public License for more
15 * details.
16 *
17 * You should have received a copy of the GNU General Public License
18 * along with this program; if not, write to the Free Software
19 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
20 *
21 * Questions/Comments/Bugfixes to ssic-linux-devel@lists.sourceforge.net
22 *
23 * Copyright (C) 1992 Rick Sladkey
24 *
25 * Optimization changes Copyright (C) 1994 Florian La Roche
26 *
27 * Jun 7 1999, cache symlink lookups in the page cache. -DaveM
28 *
29 * Portions Copyright (C) 2001 Compaq Computer Corporation
30 *
31 * ocfs2 symlink handling code.
32 *
33 * Copyright (C) 2004, 2005 Oracle.
34 *
35 */
36
37#include <linux/fs.h>
38#include <linux/types.h>
39#include <linux/slab.h>
40#include <linux/pagemap.h>
41#include <linux/utsname.h>
42
43#define MLOG_MASK_PREFIX ML_NAMEI
44#include <cluster/masklog.h>
45
46#include "ocfs2.h"
47
48#include "alloc.h"
49#include "file.h"
50#include "inode.h"
51#include "journal.h"
52#include "symlink.h"
53
54#include "buffer_head_io.h"
55
56static char *ocfs2_page_getlink(struct dentry * dentry,
57 struct page **ppage);
58static char *ocfs2_fast_symlink_getlink(struct inode *inode,
59 struct buffer_head **bh);
60
61/* get the link contents into pagecache */
62static char *ocfs2_page_getlink(struct dentry * dentry,
63 struct page **ppage)
64{
65 struct page * page;
66 struct address_space *mapping = dentry->d_inode->i_mapping;
67 page = read_cache_page(mapping, 0,
68 (filler_t *)mapping->a_ops->readpage, NULL);
69 if (IS_ERR(page))
70 goto sync_fail;
71 wait_on_page_locked(page);
72 if (!PageUptodate(page))
73 goto async_fail;
74 *ppage = page;
75 return kmap(page);
76
77async_fail:
78 page_cache_release(page);
79 return ERR_PTR(-EIO);
80
81sync_fail:
82 return (char*)page;
83}
84
85static char *ocfs2_fast_symlink_getlink(struct inode *inode,
86 struct buffer_head **bh)
87{
88 int status;
89 char *link = NULL;
90 struct ocfs2_dinode *fe;
91
92 mlog_entry_void();
93
94 status = ocfs2_read_block(OCFS2_SB(inode->i_sb),
95 OCFS2_I(inode)->ip_blkno,
96 bh,
97 OCFS2_BH_CACHED,
98 inode);
99 if (status < 0) {
100 mlog_errno(status);
101 link = ERR_PTR(status);
102 goto bail;
103 }
104
105 fe = (struct ocfs2_dinode *) (*bh)->b_data;
106 link = (char *) fe->id2.i_symlink;
107bail:
108 mlog_exit(status);
109
110 return link;
111}
112
113static int ocfs2_readlink(struct dentry *dentry,
114 char __user *buffer,
115 int buflen)
116{
117 int ret;
118 char *link;
119 struct buffer_head *bh = NULL;
120 struct inode *inode = dentry->d_inode;
121
122 mlog_entry_void();
123
124 link = ocfs2_fast_symlink_getlink(inode, &bh);
125 if (IS_ERR(link)) {
126 ret = PTR_ERR(link);
127 goto out;
128 }
129
130 ret = vfs_readlink(dentry, buffer, buflen, link);
131
132 brelse(bh);
133out:
134 mlog_exit(ret);
135 return ret;
136}
137
138static void *ocfs2_follow_link(struct dentry *dentry,
139 struct nameidata *nd)
140{
141 int status;
142 char *link;
143 struct inode *inode = dentry->d_inode;
144 struct page *page = NULL;
145 struct buffer_head *bh = NULL;
146
147 if (ocfs2_inode_is_fast_symlink(inode))
148 link = ocfs2_fast_symlink_getlink(inode, &bh);
149 else
150 link = ocfs2_page_getlink(dentry, &page);
151 if (IS_ERR(link)) {
152 status = PTR_ERR(link);
153 mlog_errno(status);
154 goto bail;
155 }
156
157 status = vfs_follow_link(nd, link);
158 if (status)
159 mlog_errno(status);
160bail:
161 if (page) {
162 kunmap(page);
163 page_cache_release(page);
164 }
165 if (bh)
166 brelse(bh);
167
168 return ERR_PTR(status);
169}
170
171struct inode_operations ocfs2_symlink_inode_operations = {
172 .readlink = page_readlink,
173 .follow_link = ocfs2_follow_link,
174 .getattr = ocfs2_getattr,
175};
176struct inode_operations ocfs2_fast_symlink_inode_operations = {
177 .readlink = ocfs2_readlink,
178 .follow_link = ocfs2_follow_link,
179 .getattr = ocfs2_getattr,
180};
diff --git a/fs/ocfs2/symlink.h b/fs/ocfs2/symlink.h
new file mode 100644
index 000000000000..1ea9e4d9e9eb
--- /dev/null
+++ b/fs/ocfs2/symlink.h
@@ -0,0 +1,42 @@
1/* -*- mode: c; c-basic-offset: 8; -*-
2 * vim: noexpandtab sw=8 ts=8 sts=0:
3 *
4 * symlink.h
5 *
6 * Function prototypes
7 *
8 * Copyright (C) 2002, 2004 Oracle. All rights reserved.
9 *
10 * This program is free software; you can redistribute it and/or
11 * modify it under the terms of the GNU General Public
12 * License as published by the Free Software Foundation; either
13 * version 2 of the License, or (at your option) any later version.
14 *
15 * This program is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
18 * General Public License for more details.
19 *
20 * You should have received a copy of the GNU General Public
21 * License along with this program; if not, write to the
22 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
23 * Boston, MA 02111-1307, USA.
24 */
25
26#ifndef OCFS2_SYMLINK_H
27#define OCFS2_SYMLINK_H
28
29extern struct inode_operations ocfs2_symlink_inode_operations;
30extern struct inode_operations ocfs2_fast_symlink_inode_operations;
31
32/*
33 * Test whether an inode is a fast symlink.
34 */
35static inline int ocfs2_inode_is_fast_symlink(struct inode *inode)
36{
37 return (S_ISLNK(inode->i_mode) &&
38 inode->i_blocks == 0);
39}
40
41
42#endif /* OCFS2_SYMLINK_H */
diff --git a/fs/ocfs2/sysfile.c b/fs/ocfs2/sysfile.c
new file mode 100644
index 000000000000..600a8bc5b541
--- /dev/null
+++ b/fs/ocfs2/sysfile.c
@@ -0,0 +1,131 @@
1/* -*- mode: c; c-basic-offset: 8; -*-
2 * vim: noexpandtab sw=8 ts=8 sts=0:
3 *
4 * sysfile.c
5 *
6 * Initialize, read, write, etc. system files.
7 *
8 * Copyright (C) 2002, 2004 Oracle. All rights reserved.
9 *
10 * This program is free software; you can redistribute it and/or
11 * modify it under the terms of the GNU General Public
12 * License as published by the Free Software Foundation; either
13 * version 2 of the License, or (at your option) any later version.
14 *
15 * This program is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
18 * General Public License for more details.
19 *
20 * You should have received a copy of the GNU General Public
21 * License along with this program; if not, write to the
22 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
23 * Boston, MA 02111-1307, USA.
24 */
25
26#include <linux/fs.h>
27#include <linux/types.h>
28#include <linux/slab.h>
29#include <linux/highmem.h>
30
31#include "ocfs2.h"
32
33#define MLOG_MASK_PREFIX ML_INODE
34#include <cluster/masklog.h>
35
36#include "alloc.h"
37#include "dir.h"
38#include "inode.h"
39#include "journal.h"
40#include "sysfile.h"
41
42#include "buffer_head_io.h"
43
44static struct inode * _ocfs2_get_system_file_inode(struct ocfs2_super *osb,
45 int type,
46 u32 slot);
47
48static inline int is_global_system_inode(int type);
49static inline int is_in_system_inode_array(struct ocfs2_super *osb,
50 int type,
51 u32 slot);
52
53static inline int is_global_system_inode(int type)
54{
55 return type >= OCFS2_FIRST_ONLINE_SYSTEM_INODE &&
56 type <= OCFS2_LAST_GLOBAL_SYSTEM_INODE;
57}
58
59static inline int is_in_system_inode_array(struct ocfs2_super *osb,
60 int type,
61 u32 slot)
62{
63 return slot == osb->slot_num || is_global_system_inode(type);
64}
65
66struct inode *ocfs2_get_system_file_inode(struct ocfs2_super *osb,
67 int type,
68 u32 slot)
69{
70 struct inode *inode = NULL;
71 struct inode **arr = NULL;
72
73 /* avoid the lookup if cached in local system file array */
74 if (is_in_system_inode_array(osb, type, slot))
75 arr = &(osb->system_inodes[type]);
76
77 if (arr && ((inode = *arr) != NULL)) {
78 /* get a ref in addition to the array ref */
79 inode = igrab(inode);
80 if (!inode)
81 BUG();
82
83 return inode;
84 }
85
86 /* this gets one ref thru iget */
87 inode = _ocfs2_get_system_file_inode(osb, type, slot);
88
89 /* add one more if putting into array for first time */
90 if (arr && inode) {
91 *arr = igrab(inode);
92 if (!*arr)
93 BUG();
94 }
95 return inode;
96}
97
98static struct inode * _ocfs2_get_system_file_inode(struct ocfs2_super *osb,
99 int type,
100 u32 slot)
101{
102 char namebuf[40];
103 struct inode *inode = NULL;
104 u64 blkno;
105 struct buffer_head *dirent_bh = NULL;
106 struct ocfs2_dir_entry *de = NULL;
107 int status = 0;
108
109 ocfs2_sprintf_system_inode_name(namebuf,
110 sizeof(namebuf),
111 type, slot);
112
113 status = ocfs2_find_files_on_disk(namebuf, strlen(namebuf),
114 &blkno, osb->sys_root_inode,
115 &dirent_bh, &de);
116 if (status < 0) {
117 goto bail;
118 }
119
120 inode = ocfs2_iget(osb, blkno);
121 if (IS_ERR(inode)) {
122 mlog_errno(PTR_ERR(inode));
123 inode = NULL;
124 goto bail;
125 }
126bail:
127 if (dirent_bh)
128 brelse(dirent_bh);
129 return inode;
130}
131
diff --git a/fs/ocfs2/sysfile.h b/fs/ocfs2/sysfile.h
new file mode 100644
index 000000000000..cc9ea661ffc1
--- /dev/null
+++ b/fs/ocfs2/sysfile.h
@@ -0,0 +1,33 @@
1/* -*- mode: c; c-basic-offset: 8; -*-
2 * vim: noexpandtab sw=8 ts=8 sts=0:
3 *
4 * sysfile.h
5 *
6 * Function prototypes
7 *
8 * Copyright (C) 2002, 2004 Oracle. All rights reserved.
9 *
10 * This program is free software; you can redistribute it and/or
11 * modify it under the terms of the GNU General Public
12 * License as published by the Free Software Foundation; either
13 * version 2 of the License, or (at your option) any later version.
14 *
15 * This program is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
18 * General Public License for more details.
19 *
20 * You should have received a copy of the GNU General Public
21 * License along with this program; if not, write to the
22 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
23 * Boston, MA 02111-1307, USA.
24 */
25
26#ifndef OCFS2_SYSFILE_H
27#define OCFS2_SYSFILE_H
28
/* Returns a referenced inode for the system file identified by
 * (type, slot), or NULL if it could not be looked up. */
struct inode * ocfs2_get_system_file_inode(struct ocfs2_super *osb,
					   int type,
					   u32 slot);
32
33#endif /* OCFS2_SYSFILE_H */
diff --git a/fs/ocfs2/uptodate.c b/fs/ocfs2/uptodate.c
new file mode 100644
index 000000000000..3a0458fd3e1b
--- /dev/null
+++ b/fs/ocfs2/uptodate.c
@@ -0,0 +1,544 @@
1/* -*- mode: c; c-basic-offset: 8; -*-
2 * vim: noexpandtab sw=8 ts=8 sts=0:
3 *
4 * uptodate.c
5 *
6 * Tracking the up-to-date-ness of a local buffer_head with respect to
7 * the cluster.
8 *
9 * Copyright (C) 2002, 2004, 2005 Oracle. All rights reserved.
10 *
11 * This program is free software; you can redistribute it and/or
12 * modify it under the terms of the GNU General Public
13 * License as published by the Free Software Foundation; either
14 * version 2 of the License, or (at your option) any later version.
15 *
16 * This program is distributed in the hope that it will be useful,
17 * but WITHOUT ANY WARRANTY; without even the implied warranty of
18 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
19 * General Public License for more details.
20 *
21 * You should have received a copy of the GNU General Public
22 * License along with this program; if not, write to the
23 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
23 * Boston, MA 02111-1307, USA.
25 *
26 * Standard buffer head caching flags (uptodate, etc) are insufficient
27 * in a clustered environment - a buffer may be marked up to date on
28 * our local node but could have been modified by another cluster
29 * member. As a result an additional (and performant) caching scheme
30 * is required. A further requirement is that we consume as little
31 * memory as possible - we never pin buffer_head structures in order
32 * to cache them.
33 *
34 * We track the existence of up to date buffers on the inodes which
35 * are associated with them. Because we don't want to pin
36 * buffer_heads, this is only a (strong) hint and several other checks
37 * are made in the I/O path to ensure that we don't use a stale or
38 * invalid buffer without going to disk:
39 * - buffer_jbd is used liberally - if a bh is in the journal on
40 * this node then it *must* be up to date.
41 * - the standard buffer_uptodate() macro is used to detect buffers
42 * which may be invalid (even if we have an up to date tracking
43 * item for them)
44 *
45 * For a full understanding of how this code works together, one
46 * should read the callers in dlmglue.c, the I/O functions in
47 * buffer_head_io.c and ocfs2_journal_access in journal.c
48 */
49
50#include <linux/fs.h>
51#include <linux/types.h>
52#include <linux/slab.h>
53#include <linux/highmem.h>
54#include <linux/buffer_head.h>
55#include <linux/rbtree.h>
56#include <linux/jbd.h>
57
58#define MLOG_MASK_PREFIX ML_UPTODATE
59
60#include <cluster/masklog.h>
61
62#include "ocfs2.h"
63
64#include "inode.h"
65#include "uptodate.h"
66
/* One tracked block in the tree form of an inode's metadata cache. */
struct ocfs2_meta_cache_item {
	struct rb_node	c_node;		/* linkage into ci_cache.ci_tree */
	sector_t	c_block;	/* block number; the tree is ordered
					 * by this value */
};

/* Slab cache from which every ocfs2_meta_cache_item is allocated.
 * Created by init_ocfs2_uptodate_cache(). */
static kmem_cache_t *ocfs2_uptodate_cachep = NULL;
73
74void ocfs2_metadata_cache_init(struct inode *inode)
75{
76 struct ocfs2_inode_info *oi = OCFS2_I(inode);
77 struct ocfs2_caching_info *ci = &oi->ip_metadata_cache;
78
79 oi->ip_flags |= OCFS2_INODE_CACHE_INLINE;
80 ci->ci_num_cached = 0;
81}
82
83/* No lock taken here as 'root' is not expected to be visible to other
84 * processes. */
85static unsigned int ocfs2_purge_copied_metadata_tree(struct rb_root *root)
86{
87 unsigned int purged = 0;
88 struct rb_node *node;
89 struct ocfs2_meta_cache_item *item;
90
91 while ((node = rb_last(root)) != NULL) {
92 item = rb_entry(node, struct ocfs2_meta_cache_item, c_node);
93
94 mlog(0, "Purge item %llu\n",
95 (unsigned long long) item->c_block);
96
97 rb_erase(&item->c_node, root);
98 kmem_cache_free(ocfs2_uptodate_cachep, item);
99
100 purged++;
101 }
102 return purged;
103}
104
105/* Called from locking and called from ocfs2_clear_inode. Dump the
106 * cache for a given inode.
107 *
108 * This function is a few more lines longer than necessary due to some
109 * accounting done here, but I think it's worth tracking down those
110 * bugs sooner -- Mark */
111void ocfs2_metadata_cache_purge(struct inode *inode)
112{
113 struct ocfs2_inode_info *oi = OCFS2_I(inode);
114 unsigned int tree, to_purge, purged;
115 struct ocfs2_caching_info *ci = &oi->ip_metadata_cache;
116 struct rb_root root = RB_ROOT;
117
118 spin_lock(&oi->ip_lock);
119 tree = !(oi->ip_flags & OCFS2_INODE_CACHE_INLINE);
120 to_purge = ci->ci_num_cached;
121
122 mlog(0, "Purge %u %s items from Inode %"MLFu64"\n", to_purge,
123 tree ? "array" : "tree", oi->ip_blkno);
124
125 /* If we're a tree, save off the root so that we can safely
126 * initialize the cache. We do the work to free tree members
127 * without the spinlock. */
128 if (tree)
129 root = ci->ci_cache.ci_tree;
130
131 ocfs2_metadata_cache_init(inode);
132 spin_unlock(&oi->ip_lock);
133
134 purged = ocfs2_purge_copied_metadata_tree(&root);
135 /* If possible, track the number wiped so that we can more
136 * easily detect counting errors. Unfortunately, this is only
137 * meaningful for trees. */
138 if (tree && purged != to_purge)
139 mlog(ML_ERROR, "Inode %"MLFu64", count = %u, purged = %u\n",
140 oi->ip_blkno, to_purge, purged);
141}
142
143/* Returns the index in the cache array, -1 if not found.
144 * Requires ip_lock. */
145static int ocfs2_search_cache_array(struct ocfs2_caching_info *ci,
146 sector_t item)
147{
148 int i;
149
150 for (i = 0; i < ci->ci_num_cached; i++) {
151 if (item == ci->ci_cache.ci_array[i])
152 return i;
153 }
154
155 return -1;
156}
157
158/* Returns the cache item if found, otherwise NULL.
159 * Requires ip_lock. */
160static struct ocfs2_meta_cache_item *
161ocfs2_search_cache_tree(struct ocfs2_caching_info *ci,
162 sector_t block)
163{
164 struct rb_node * n = ci->ci_cache.ci_tree.rb_node;
165 struct ocfs2_meta_cache_item *item = NULL;
166
167 while (n) {
168 item = rb_entry(n, struct ocfs2_meta_cache_item, c_node);
169
170 if (block < item->c_block)
171 n = n->rb_left;
172 else if (block > item->c_block)
173 n = n->rb_right;
174 else
175 return item;
176 }
177
178 return NULL;
179}
180
181static int ocfs2_buffer_cached(struct ocfs2_inode_info *oi,
182 struct buffer_head *bh)
183{
184 int index = -1;
185 struct ocfs2_meta_cache_item *item = NULL;
186
187 spin_lock(&oi->ip_lock);
188
189 mlog(0, "Inode %"MLFu64", query block %llu (inline = %u)\n",
190 oi->ip_blkno, (unsigned long long) bh->b_blocknr,
191 !!(oi->ip_flags & OCFS2_INODE_CACHE_INLINE));
192
193 if (oi->ip_flags & OCFS2_INODE_CACHE_INLINE)
194 index = ocfs2_search_cache_array(&oi->ip_metadata_cache,
195 bh->b_blocknr);
196 else
197 item = ocfs2_search_cache_tree(&oi->ip_metadata_cache,
198 bh->b_blocknr);
199
200 spin_unlock(&oi->ip_lock);
201
202 mlog(0, "index = %d, item = %p\n", index, item);
203
204 return (index != -1) || (item != NULL);
205}
206
/* Decide whether bh may be used without going back to disk.
 *
 * Warning: even if it returns true, this does *not* guarantee that
 * the block is stored in our inode metadata cache. */
int ocfs2_buffer_uptodate(struct inode *inode,
			  struct buffer_head *bh)
{
	if (buffer_uptodate(bh)) {
		/* OCFS2 does not allow multiple nodes to be changing
		 * the same block at the same time, so a journalled
		 * buffer is trusted as is. */
		if (buffer_jbd(bh))
			return 1;

		/* Locally it looks up to date; our cache decides
		 * whether that can be trusted. */
		return ocfs2_buffer_cached(OCFS2_I(inode), bh);
	}

	/* Whether or not the bh is in our cache, a buffer which is not
	 * marked uptodate can't have correct data. */
	return 0;
}
227
228/* Requires ip_lock */
229static void ocfs2_append_cache_array(struct ocfs2_caching_info *ci,
230 sector_t block)
231{
232 BUG_ON(ci->ci_num_cached >= OCFS2_INODE_MAX_CACHE_ARRAY);
233
234 mlog(0, "block %llu takes position %u\n", (unsigned long long) block,
235 ci->ci_num_cached);
236
237 ci->ci_cache.ci_array[ci->ci_num_cached] = block;
238 ci->ci_num_cached++;
239}
240
241/* By now the caller should have checked that the item does *not*
242 * exist in the tree.
243 * Requires ip_lock. */
244static void __ocfs2_insert_cache_tree(struct ocfs2_caching_info *ci,
245 struct ocfs2_meta_cache_item *new)
246{
247 sector_t block = new->c_block;
248 struct rb_node *parent = NULL;
249 struct rb_node **p = &ci->ci_cache.ci_tree.rb_node;
250 struct ocfs2_meta_cache_item *tmp;
251
252 mlog(0, "Insert block %llu num = %u\n", (unsigned long long) block,
253 ci->ci_num_cached);
254
255 while(*p) {
256 parent = *p;
257
258 tmp = rb_entry(parent, struct ocfs2_meta_cache_item, c_node);
259
260 if (block < tmp->c_block)
261 p = &(*p)->rb_left;
262 else if (block > tmp->c_block)
263 p = &(*p)->rb_right;
264 else {
265 /* This should never happen! */
266 mlog(ML_ERROR, "Duplicate block %llu cached!\n",
267 (unsigned long long) block);
268 BUG();
269 }
270 }
271
272 rb_link_node(&new->c_node, parent, p);
273 rb_insert_color(&new->c_node, &ci->ci_cache.ci_tree);
274 ci->ci_num_cached++;
275}
276
277static inline int ocfs2_insert_can_use_array(struct ocfs2_inode_info *oi,
278 struct ocfs2_caching_info *ci)
279{
280 assert_spin_locked(&oi->ip_lock);
281
282 return (oi->ip_flags & OCFS2_INODE_CACHE_INLINE) &&
283 (ci->ci_num_cached < OCFS2_INODE_MAX_CACHE_ARRAY);
284}
285
286/* tree should be exactly OCFS2_INODE_MAX_CACHE_ARRAY wide. NULL the
287 * pointers in tree after we use them - this allows caller to detect
288 * when to free in case of error. */
289static void ocfs2_expand_cache(struct ocfs2_inode_info *oi,
290 struct ocfs2_meta_cache_item **tree)
291{
292 int i;
293 struct ocfs2_caching_info *ci = &oi->ip_metadata_cache;
294
295 mlog_bug_on_msg(ci->ci_num_cached != OCFS2_INODE_MAX_CACHE_ARRAY,
296 "Inode %"MLFu64", num cached = %u, should be %u\n",
297 oi->ip_blkno, ci->ci_num_cached,
298 OCFS2_INODE_MAX_CACHE_ARRAY);
299 mlog_bug_on_msg(!(oi->ip_flags & OCFS2_INODE_CACHE_INLINE),
300 "Inode %"MLFu64" not marked as inline anymore!\n",
301 oi->ip_blkno);
302 assert_spin_locked(&oi->ip_lock);
303
304 /* Be careful to initialize the tree members *first* because
305 * once the ci_tree is used, the array is junk... */
306 for(i = 0; i < OCFS2_INODE_MAX_CACHE_ARRAY; i++)
307 tree[i]->c_block = ci->ci_cache.ci_array[i];
308
309 oi->ip_flags &= ~OCFS2_INODE_CACHE_INLINE;
310 ci->ci_cache.ci_tree = RB_ROOT;
311 /* this will be set again by __ocfs2_insert_cache_tree */
312 ci->ci_num_cached = 0;
313
314 for(i = 0; i < OCFS2_INODE_MAX_CACHE_ARRAY; i++) {
315 __ocfs2_insert_cache_tree(ci, tree[i]);
316 tree[i] = NULL;
317 }
318
319 mlog(0, "Expanded %"MLFu64" to a tree cache: flags 0x%x, num = %u\n",
320 oi->ip_blkno, oi->ip_flags, ci->ci_num_cached);
321}
322
323/* Slow path function - memory allocation is necessary. See the
324 * comment above ocfs2_set_buffer_uptodate for more information. */
325static void __ocfs2_set_buffer_uptodate(struct ocfs2_inode_info *oi,
326 sector_t block,
327 int expand_tree)
328{
329 int i;
330 struct ocfs2_caching_info *ci = &oi->ip_metadata_cache;
331 struct ocfs2_meta_cache_item *new = NULL;
332 struct ocfs2_meta_cache_item *tree[OCFS2_INODE_MAX_CACHE_ARRAY] =
333 { NULL, };
334
335 mlog(0, "Inode %"MLFu64", block %llu, expand = %d\n",
336 oi->ip_blkno, (unsigned long long) block, expand_tree);
337
338 new = kmem_cache_alloc(ocfs2_uptodate_cachep, GFP_KERNEL);
339 if (!new) {
340 mlog_errno(-ENOMEM);
341 return;
342 }
343 new->c_block = block;
344
345 if (expand_tree) {
346 /* Do *not* allocate an array here - the removal code
347 * has no way of tracking that. */
348 for(i = 0; i < OCFS2_INODE_MAX_CACHE_ARRAY; i++) {
349 tree[i] = kmem_cache_alloc(ocfs2_uptodate_cachep,
350 GFP_KERNEL);
351 if (!tree[i]) {
352 mlog_errno(-ENOMEM);
353 goto out_free;
354 }
355
356 /* These are initialized in ocfs2_expand_cache! */
357 }
358 }
359
360 spin_lock(&oi->ip_lock);
361 if (ocfs2_insert_can_use_array(oi, ci)) {
362 mlog(0, "Someone cleared the tree underneath us\n");
363 /* Ok, items were removed from the cache in between
364 * locks. Detect this and revert back to the fast path */
365 ocfs2_append_cache_array(ci, block);
366 spin_unlock(&oi->ip_lock);
367 goto out_free;
368 }
369
370 if (expand_tree)
371 ocfs2_expand_cache(oi, tree);
372
373 __ocfs2_insert_cache_tree(ci, new);
374 spin_unlock(&oi->ip_lock);
375
376 new = NULL;
377out_free:
378 if (new)
379 kmem_cache_free(ocfs2_uptodate_cachep, new);
380
381 /* If these were used, then ocfs2_expand_cache re-set them to
382 * NULL for us. */
383 if (tree[0]) {
384 for(i = 0; i < OCFS2_INODE_MAX_CACHE_ARRAY; i++)
385 if (tree[i])
386 kmem_cache_free(ocfs2_uptodate_cachep,
387 tree[i]);
388 }
389}
390
391/* Item insertion is guarded by ip_io_sem, so the insertion path takes
392 * advantage of this by not rechecking for a duplicate insert during
393 * the slow case. Additionally, if the cache needs to be bumped up to
394 * a tree, the code will not recheck after acquiring the lock --
395 * multiple paths cannot be expanding to a tree at the same time.
396 *
397 * The slow path takes into account that items can be removed
398 * (including the whole tree wiped and reset) when this process it out
399 * allocating memory. In those cases, it reverts back to the fast
400 * path.
401 *
402 * Note that this function may actually fail to insert the block if
403 * memory cannot be allocated. This is not fatal however (but may
404 * result in a performance penalty) */
405void ocfs2_set_buffer_uptodate(struct inode *inode,
406 struct buffer_head *bh)
407{
408 int expand;
409 struct ocfs2_inode_info *oi = OCFS2_I(inode);
410 struct ocfs2_caching_info *ci = &oi->ip_metadata_cache;
411
412 /* The block may very well exist in our cache already, so avoid
413 * doing any more work in that case. */
414 if (ocfs2_buffer_cached(oi, bh))
415 return;
416
417 mlog(0, "Inode %"MLFu64", inserting block %llu\n", oi->ip_blkno,
418 (unsigned long long) bh->b_blocknr);
419
420 /* No need to recheck under spinlock - insertion is guarded by
421 * ip_io_sem */
422 spin_lock(&oi->ip_lock);
423 if (ocfs2_insert_can_use_array(oi, ci)) {
424 /* Fast case - it's an array and there's a free
425 * spot. */
426 ocfs2_append_cache_array(ci, bh->b_blocknr);
427 spin_unlock(&oi->ip_lock);
428 return;
429 }
430
431 expand = 0;
432 if (oi->ip_flags & OCFS2_INODE_CACHE_INLINE) {
433 /* We need to bump things up to a tree. */
434 expand = 1;
435 }
436 spin_unlock(&oi->ip_lock);
437
438 __ocfs2_set_buffer_uptodate(oi, bh->b_blocknr, expand);
439}
440
441/* Called against a newly allocated buffer. Most likely nobody should
442 * be able to read this sort of metadata while it's still being
443 * allocated, but this is careful to take ip_io_sem anyway. */
444void ocfs2_set_new_buffer_uptodate(struct inode *inode,
445 struct buffer_head *bh)
446{
447 struct ocfs2_inode_info *oi = OCFS2_I(inode);
448
449 /* This should definitely *not* exist in our cache */
450 BUG_ON(ocfs2_buffer_cached(oi, bh));
451
452 set_buffer_uptodate(bh);
453
454 down(&oi->ip_io_sem);
455 ocfs2_set_buffer_uptodate(inode, bh);
456 up(&oi->ip_io_sem);
457}
458
459/* Requires ip_lock. */
460static void ocfs2_remove_metadata_array(struct ocfs2_caching_info *ci,
461 int index)
462{
463 sector_t *array = ci->ci_cache.ci_array;
464 int bytes;
465
466 BUG_ON(index < 0 || index >= OCFS2_INODE_MAX_CACHE_ARRAY);
467 BUG_ON(index >= ci->ci_num_cached);
468 BUG_ON(!ci->ci_num_cached);
469
470 mlog(0, "remove index %d (num_cached = %u\n", index,
471 ci->ci_num_cached);
472
473 ci->ci_num_cached--;
474
475 /* don't need to copy if the array is now empty, or if we
476 * removed at the tail */
477 if (ci->ci_num_cached && index < ci->ci_num_cached) {
478 bytes = sizeof(sector_t) * (ci->ci_num_cached - index);
479 memmove(&array[index], &array[index + 1], bytes);
480 }
481}
482
483/* Requires ip_lock. */
484static void ocfs2_remove_metadata_tree(struct ocfs2_caching_info *ci,
485 struct ocfs2_meta_cache_item *item)
486{
487 mlog(0, "remove block %llu from tree\n",
488 (unsigned long long) item->c_block);
489
490 rb_erase(&item->c_node, &ci->ci_cache.ci_tree);
491 ci->ci_num_cached--;
492}
493
494/* Called when we remove a chunk of metadata from an inode. We don't
495 * bother reverting things to an inlined array in the case of a remove
496 * which moves us back under the limit. */
497void ocfs2_remove_from_cache(struct inode *inode,
498 struct buffer_head *bh)
499{
500 int index;
501 sector_t block = bh->b_blocknr;
502 struct ocfs2_meta_cache_item *item = NULL;
503 struct ocfs2_inode_info *oi = OCFS2_I(inode);
504 struct ocfs2_caching_info *ci = &oi->ip_metadata_cache;
505
506 spin_lock(&oi->ip_lock);
507 mlog(0, "Inode %"MLFu64", remove %llu, items = %u, array = %u\n",
508 oi->ip_blkno, (unsigned long long) block, ci->ci_num_cached,
509 oi->ip_flags & OCFS2_INODE_CACHE_INLINE);
510
511 if (oi->ip_flags & OCFS2_INODE_CACHE_INLINE) {
512 index = ocfs2_search_cache_array(ci, block);
513 if (index != -1)
514 ocfs2_remove_metadata_array(ci, index);
515 } else {
516 item = ocfs2_search_cache_tree(ci, block);
517 if (item)
518 ocfs2_remove_metadata_tree(ci, item);
519 }
520 spin_unlock(&oi->ip_lock);
521
522 if (item)
523 kmem_cache_free(ocfs2_uptodate_cachep, item);
524}
525
526int __init init_ocfs2_uptodate_cache(void)
527{
528 ocfs2_uptodate_cachep = kmem_cache_create("ocfs2_uptodate",
529 sizeof(struct ocfs2_meta_cache_item),
530 0, SLAB_HWCACHE_ALIGN, NULL, NULL);
531 if (!ocfs2_uptodate_cachep)
532 return -ENOMEM;
533
534 mlog(0, "%u inlined cache items per inode.\n",
535 OCFS2_INODE_MAX_CACHE_ARRAY);
536
537 return 0;
538}
539
540void __exit exit_ocfs2_uptodate_cache(void)
541{
542 if (ocfs2_uptodate_cachep)
543 kmem_cache_destroy(ocfs2_uptodate_cachep);
544}
diff --git a/fs/ocfs2/uptodate.h b/fs/ocfs2/uptodate.h
new file mode 100644
index 000000000000..e5aacdf4eabf
--- /dev/null
+++ b/fs/ocfs2/uptodate.h
@@ -0,0 +1,44 @@
1/* -*- mode: c; c-basic-offset: 8; -*-
2 * vim: noexpandtab sw=8 ts=8 sts=0:
3 *
4 * uptodate.h
5 *
6 * Cluster uptodate tracking
7 *
8 * Copyright (C) 2002, 2004, 2005 Oracle. All rights reserved.
9 *
10 * This program is free software; you can redistribute it and/or
11 * modify it under the terms of the GNU General Public
12 * License as published by the Free Software Foundation; either
13 * version 2 of the License, or (at your option) any later version.
14 *
15 * This program is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
18 * General Public License for more details.
19 *
20 * You should have received a copy of the GNU General Public
21 * License along with this program; if not, write to the
22 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
23 * Boston, MA 02111-1307, USA.
24 */
25
26#ifndef OCFS2_UPTODATE_H
27#define OCFS2_UPTODATE_H
28
/* Module init/exit: create and destroy the slab cache backing the
 * tree form of the metadata cache. init returns 0 or -ENOMEM. */
int __init init_ocfs2_uptodate_cache(void);
void __exit exit_ocfs2_uptodate_cache(void);

/* Reset an inode's metadata cache to the empty inline-array state. */
void ocfs2_metadata_cache_init(struct inode *inode);
/* Forget every block tracked for the inode and re-initialize its
 * cache. */
void ocfs2_metadata_cache_purge(struct inode *inode);

/* Non-zero if bh is locally uptodate and is either journalled or
 * tracked in the inode's metadata cache. */
int ocfs2_buffer_uptodate(struct inode *inode,
			  struct buffer_head *bh);
/* Track bh's block in the inode's cache. May silently fail to insert
 * if memory cannot be allocated. */
void ocfs2_set_buffer_uptodate(struct inode *inode,
			       struct buffer_head *bh);
/* For a newly allocated buffer: mark bh uptodate, then track it while
 * holding ip_io_sem. BUGs if the block is already tracked. */
void ocfs2_set_new_buffer_uptodate(struct inode *inode,
				   struct buffer_head *bh);
/* Stop tracking bh's block, if it is tracked. */
void ocfs2_remove_from_cache(struct inode *inode,
			     struct buffer_head *bh);
43
44#endif /* OCFS2_UPTODATE_H */
diff --git a/fs/ocfs2/ver.c b/fs/ocfs2/ver.c
new file mode 100644
index 000000000000..5405ce121c99
--- /dev/null
+++ b/fs/ocfs2/ver.c
@@ -0,0 +1,43 @@
1/* -*- mode: c; c-basic-offset: 8; -*-
2 * vim: noexpandtab sw=8 ts=8 sts=0:
3 *
4 * ver.c
5 *
6 * version string
7 *
8 * Copyright (C) 2002, 2005 Oracle. All rights reserved.
9 *
10 * This program is free software; you can redistribute it and/or
11 * modify it under the terms of the GNU General Public
12 * License as published by the Free Software Foundation; either
13 * version 2 of the License, or (at your option) any later version.
14 *
15 * This program is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
18 * General Public License for more details.
19 *
20 * You should have received a copy of the GNU General Public
21 * License along with this program; if not, write to the
22 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
23 * Boston, MA 02111-1307, USA.
24 */
25
26#include <linux/module.h>
27#include <linux/string.h>
28#include <linux/kernel.h>
29
30#include "ver.h"
31
/* Version number of this OCFS2 build; also passed to MODULE_VERSION. */
#define OCFS2_BUILD_VERSION "1.3.3"

/* Banner string: "OCFS2 " followed by the build version. Also used as
 * the module description. */
#define VERSION_STR	"OCFS2 " OCFS2_BUILD_VERSION

/* Print the version banner to the kernel log at KERN_INFO level. */
void ocfs2_print_version(void)
{
	printk(KERN_INFO "%s\n", VERSION_STR);
}

MODULE_DESCRIPTION(VERSION_STR);

MODULE_VERSION(OCFS2_BUILD_VERSION);
diff --git a/fs/ocfs2/ver.h b/fs/ocfs2/ver.h
new file mode 100644
index 000000000000..d7395cb91d2f
--- /dev/null
+++ b/fs/ocfs2/ver.h
@@ -0,0 +1,31 @@
1/* -*- mode: c; c-basic-offset: 8; -*-
2 * vim: noexpandtab sw=8 ts=8 sts=0:
3 *
4 * ver.h
5 *
6 * Function prototypes
7 *
8 * Copyright (C) 2002, 2004 Oracle. All rights reserved.
9 *
10 * This program is free software; you can redistribute it and/or
11 * modify it under the terms of the GNU General Public
12 * License as published by the Free Software Foundation; either
13 * version 2 of the License, or (at your option) any later version.
14 *
15 * This program is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
18 * General Public License for more details.
19 *
20 * You should have received a copy of the GNU General Public
21 * License along with this program; if not, write to the
22 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
23 * Boston, MA 02111-1307, USA.
24 */
25
26#ifndef OCFS2_VER_H
27#define OCFS2_VER_H
28
/* Print the OCFS2 version banner to the kernel log. */
void ocfs2_print_version(void);
30
31#endif /* OCFS2_VER_H */
diff --git a/fs/ocfs2/vote.c b/fs/ocfs2/vote.c
new file mode 100644
index 000000000000..021978e0576b
--- /dev/null
+++ b/fs/ocfs2/vote.c
@@ -0,0 +1,1202 @@
1/* -*- mode: c; c-basic-offset: 8; -*-
2 * vim: noexpandtab sw=8 ts=8 sts=0:
3 *
4 * vote.c
5 *
6 * Vote request and response messages exchanged between cluster nodes
7 *
8 * Copyright (C) 2003, 2004 Oracle. All rights reserved.
9 *
10 * This program is free software; you can redistribute it and/or
11 * modify it under the terms of the GNU General Public
12 * License as published by the Free Software Foundation; either
13 * version 2 of the License, or (at your option) any later version.
14 *
15 * This program is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
18 * General Public License for more details.
19 *
20 * You should have received a copy of the GNU General Public
21 * License along with this program; if not, write to the
22 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
23 * Boston, MA 02111-1307, USA.
24 */
25
26#include <linux/types.h>
27#include <linux/slab.h>
28#include <linux/highmem.h>
29#include <linux/smp_lock.h>
30#include <linux/kthread.h>
31
32#include <cluster/heartbeat.h>
33#include <cluster/nodemanager.h>
34#include <cluster/tcp.h>
35
36#include <dlm/dlmapi.h>
37
38#define MLOG_MASK_PREFIX ML_VOTE
39#include <cluster/masklog.h>
40
41#include "ocfs2.h"
42
43#include "alloc.h"
44#include "dlmglue.h"
45#include "extent_map.h"
46#include "heartbeat.h"
47#include "inode.h"
48#include "journal.h"
49#include "slot_map.h"
50#include "vote.h"
51
52#include "buffer_head_io.h"
53
/* Message type identifiers, one for vote requests and one for the
 * responses to them. */
#define OCFS2_MESSAGE_TYPE_VOTE     (0x1)
#define OCFS2_MESSAGE_TYPE_RESPONSE (0x2)
/* Header embedded at the start of both vote and response messages.
 * Every field is declared big-endian. */
struct ocfs2_msg_hdr
{
	__be32 h_response_id;  /* used to lookup message handle on sending
				* node. */
	__be32 h_request;      /* presumably an enum ocfs2_vote_request
				* value - TODO confirm against the sender */
	__be64 h_blkno;
	__be32 h_generation;
	__be32 h_node_num;     /* node sending this particular message. */
};
65
/* OCFS2_MAX_FILENAME_LEN is 255 characters, but we want to align this
 * for the network. */
#define OCFS2_VOTE_FILENAME_LEN 256
/* A vote request: the common header followed by one request-specific
 * 32-bit value and the fields describing the name being unlinked. */
struct ocfs2_vote_msg
{
	struct ocfs2_msg_hdr v_hdr;
	union {
		__be32 v_generic1;
		__be32 v_orphaned_slot;	/* Used during delete votes */
		__be32 v_nlink;		/* Used during unlink votes */
	} md1;				/* Message type dependent 1 */
	__be32 v_unlink_namelen;
	__be64 v_unlink_parent;
	u8  v_unlink_dirent[OCFS2_VOTE_FILENAME_LEN];
};
81
/* Responses are given these values to maintain backwards
 * compatibility with older ocfs2 versions.
 * NOTE(review): -16 and -22 look like -EBUSY and -EINVAL spelled out
 * numerically - confirm before relying on that. */
#define OCFS2_RESPONSE_OK		(0)
#define OCFS2_RESPONSE_BUSY		(-16)
#define OCFS2_RESPONSE_BAD_MSG		(-22)

/* Reply to a vote: the common header, the response code and the
 * orphaned slot reported back by the responding node. */
struct ocfs2_response_msg
{
	struct ocfs2_msg_hdr r_hdr;
	__be32 r_response;
	__be32 r_orphaned_slot;
};
94
/* A vote message together with the list linkage needed to queue it.
 * NOTE(review): the queue and its consumer are not visible in this
 * part of the file. */
struct ocfs2_vote_work {
	struct list_head   w_list;
	struct ocfs2_vote_msg w_msg;
};
99
/* Kinds of vote a node can send. OCFS2_VOTE_REQ_INVALID and
 * OCFS2_VOTE_REQ_LAST are exclusive lower/upper bounds, not real
 * requests. */
enum ocfs2_vote_request {
	OCFS2_VOTE_REQ_INVALID = 0,
	OCFS2_VOTE_REQ_DELETE,
	OCFS2_VOTE_REQ_UNLINK,
	OCFS2_VOTE_REQ_RENAME,
	OCFS2_VOTE_REQ_MOUNT,
	OCFS2_VOTE_REQ_UMOUNT,
	OCFS2_VOTE_REQ_LAST
};

/* Returns non-zero when 'request' lies strictly between the two
 * sentinel values of enum ocfs2_vote_request. */
static inline int ocfs2_is_valid_vote_request(int request)
{
	if (request <= OCFS2_VOTE_REQ_INVALID)
		return 0;

	return request < OCFS2_VOTE_REQ_LAST;
}
115
/* Callback signature: receives a caller-supplied private pointer and
 * a response message. */
typedef void (*ocfs2_net_response_callback)(void *priv,
					    struct ocfs2_response_msg *resp);
/* A response callback bundled with the private pointer to hand to it. */
struct ocfs2_net_response_cb {
	ocfs2_net_response_callback	rc_cb;
	void				*rc_priv;
};
122
/* State kept while waiting for responses to a message that was sent.
 * NOTE(review): how the list, wait queue and node map are used is not
 * visible in this part of the file. */
struct ocfs2_net_wait_ctxt {
	struct list_head        n_list;
	u32                     n_response_id;
	wait_queue_head_t       n_event;
	struct ocfs2_node_map   n_node_map;
	int                     n_response; /* an aggregate response. 0 if
					     * all nodes are go, < 0 on any
					     * negative response from any
					     * node or network error. */
	struct ocfs2_net_response_cb *n_callback;
};
134
/* Handle a MOUNT vote from node 'node_num': clear it from the recovery
 * map, mark it in the mounted map and clear it from the umount map. */
static void ocfs2_process_mount_request(struct ocfs2_super *osb,
					unsigned int node_num)
{
	mlog(0, "MOUNT vote from node %u\n", node_num);
	/* The other node only sends us this message when he has an EX
	 * on the superblock, so our recovery threads (if having been
	 * launched) are waiting on it.*/
	ocfs2_recovery_map_clear(osb, node_num);
	ocfs2_node_map_set_bit(osb, &osb->mounted_map, node_num);

	/* We clear the umount map here because a node may have been
	 * previously mounted, safely unmounted but never stopped
	 * heartbeating - in which case we'd have a stale entry. */
	ocfs2_node_map_clear_bit(osb, &osb->umount_map, node_num);
}
150
/* Handle a UMOUNT vote from node 'node_num': clear it from the mounted
 * map and mark it in the umount map. */
static void ocfs2_process_umount_request(struct ocfs2_super *osb,
					 unsigned int node_num)
{
	mlog(0, "UMOUNT vote from node %u\n", node_num);
	ocfs2_node_map_clear_bit(osb, &osb->mounted_map, node_num);
	ocfs2_node_map_set_bit(osb, &osb->umount_map, node_num);
}
158
159void ocfs2_mark_inode_remotely_deleted(struct inode *inode)
160{
161 struct ocfs2_inode_info *oi = OCFS2_I(inode);
162
163 assert_spin_locked(&oi->ip_lock);
164 /* We set the SKIP_DELETE flag on the inode so we don't try to
165 * delete it in delete_inode ourselves, thus avoiding
166 * unecessary lock pinging. If the other node failed to wipe
167 * the inode as a result of a crash, then recovery will pick
168 * up the slack. */
169 oi->ip_flags |= OCFS2_INODE_DELETED|OCFS2_INODE_SKIP_DELETE;
170}
171
172static int ocfs2_process_delete_request(struct inode *inode,
173 int *orphaned_slot)
174{
175 int response = OCFS2_RESPONSE_BUSY;
176
177 mlog(0, "DELETE vote on inode %lu, read lnk_cnt = %u, slot = %d\n",
178 inode->i_ino, inode->i_nlink, *orphaned_slot);
179
180 spin_lock(&OCFS2_I(inode)->ip_lock);
181
182 /* Whatever our vote response is, we want to make sure that
183 * the orphaned slot is recorded properly on this node *and*
184 * on the requesting node. Technically, if the requesting node
185 * did not know which slot the inode is orphaned in but we
186 * respond with BUSY he doesn't actually need the orphaned
187 * slot, but it doesn't hurt to do it here anyway. */
188 if ((*orphaned_slot) != OCFS2_INVALID_SLOT) {
189 mlog_bug_on_msg(OCFS2_I(inode)->ip_orphaned_slot !=
190 OCFS2_INVALID_SLOT &&
191 OCFS2_I(inode)->ip_orphaned_slot !=
192 (*orphaned_slot),
193 "Inode %"MLFu64": This node thinks it's "
194 "orphaned in slot %d, messaged it's in %d\n",
195 OCFS2_I(inode)->ip_blkno,
196 OCFS2_I(inode)->ip_orphaned_slot,
197 *orphaned_slot);
198
199 mlog(0, "Setting orphaned slot for inode %"MLFu64" to %d\n",
200 OCFS2_I(inode)->ip_blkno, *orphaned_slot);
201
202 OCFS2_I(inode)->ip_orphaned_slot = *orphaned_slot;
203 } else {
204 mlog(0, "Sending back orphaned slot %d for inode %"MLFu64"\n",
205 OCFS2_I(inode)->ip_orphaned_slot,
206 OCFS2_I(inode)->ip_blkno);
207
208 *orphaned_slot = OCFS2_I(inode)->ip_orphaned_slot;
209 }
210
211 /* vote no if the file is still open. */
212 if (OCFS2_I(inode)->ip_open_count) {
213 mlog(0, "open count = %u\n",
214 OCFS2_I(inode)->ip_open_count);
215 spin_unlock(&OCFS2_I(inode)->ip_lock);
216 goto done;
217 }
218 spin_unlock(&OCFS2_I(inode)->ip_lock);
219
220 /* directories are a bit ugly... What if someone is sitting in
221 * it? We want to make sure the inode is removed completely as
222 * a result of the iput in process_vote. */
223 if (S_ISDIR(inode->i_mode) && (atomic_read(&inode->i_count) != 1)) {
224 mlog(0, "i_count = %u\n", atomic_read(&inode->i_count));
225 goto done;
226 }
227
228 if (filemap_fdatawrite(inode->i_mapping)) {
229 mlog(ML_ERROR, "Could not sync inode %"MLFu64" for delete!\n",
230 OCFS2_I(inode)->ip_blkno);
231 goto done;
232 }
233 sync_mapping_buffers(inode->i_mapping);
234 truncate_inode_pages(inode->i_mapping, 0);
235 ocfs2_extent_map_trunc(inode, 0);
236
237 spin_lock(&OCFS2_I(inode)->ip_lock);
238 /* double check open count - someone might have raced this
239 * thread into ocfs2_file_open while we were writing out
240 * data. If we're to allow a wipe of this inode now, we *must*
241 * hold the spinlock until we've marked it. */
242 if (OCFS2_I(inode)->ip_open_count) {
243 mlog(0, "Raced to wipe! open count = %u\n",
244 OCFS2_I(inode)->ip_open_count);
245 spin_unlock(&OCFS2_I(inode)->ip_lock);
246 goto done;
247 }
248
249 /* Mark the inode as being wiped from disk. */
250 ocfs2_mark_inode_remotely_deleted(inode);
251 spin_unlock(&OCFS2_I(inode)->ip_lock);
252
253 /* Not sure this is necessary anymore. */
254 d_prune_aliases(inode);
255
256 /* If we get here, then we're voting 'yes', so commit the
257 * delete on our side. */
258 response = OCFS2_RESPONSE_OK;
259done:
260 return response;
261}
262
263static int ocfs2_match_dentry(struct dentry *dentry,
264 u64 parent_blkno,
265 unsigned int namelen,
266 const char *name)
267{
268 struct inode *parent;
269
270 if (!dentry->d_parent) {
271 mlog(0, "Detached from parent.\n");
272 return 0;
273 }
274
275 parent = dentry->d_parent->d_inode;
276 /* Negative parent dentry? */
277 if (!parent)
278 return 0;
279
280 /* Name is in a different directory. */
281 if (OCFS2_I(parent)->ip_blkno != parent_blkno)
282 return 0;
283
284 if (dentry->d_name.len != namelen)
285 return 0;
286
287 /* comparison above guarantees this is safe. */
288 if (memcmp(dentry->d_name.name, name, namelen))
289 return 0;
290
291 return 1;
292}
293
294static void ocfs2_process_dentry_request(struct inode *inode,
295 int rename,
296 unsigned int new_nlink,
297 u64 parent_blkno,
298 unsigned int namelen,
299 const char *name)
300{
301 struct dentry *dentry = NULL;
302 struct list_head *p;
303 struct ocfs2_inode_info *oi = OCFS2_I(inode);
304
305 mlog(0, "parent %"MLFu64", namelen = %u, name = %.*s\n", parent_blkno,
306 namelen, namelen, name);
307
308 spin_lock(&dcache_lock);
309
310 /* Another node is removing this name from the system. It is
311 * up to us to find the corresponding dentry and if it exists,
312 * unhash it from the dcache. */
313 list_for_each(p, &inode->i_dentry) {
314 dentry = list_entry(p, struct dentry, d_alias);
315
316 if (ocfs2_match_dentry(dentry, parent_blkno, namelen, name)) {
317 mlog(0, "dentry found: %.*s\n",
318 dentry->d_name.len, dentry->d_name.name);
319
320 dget_locked(dentry);
321 break;
322 }
323
324 dentry = NULL;
325 }
326
327 spin_unlock(&dcache_lock);
328
329 if (dentry) {
330 d_delete(dentry);
331 dput(dentry);
332 }
333
334 /* rename votes don't send link counts */
335 if (!rename) {
336 mlog(0, "new_nlink = %u\n", new_nlink);
337
338 /* We don't have the proper locks here to directly
339 * change i_nlink and besides, the vote is sent
340 * *before* the operation so it may have failed on the
341 * other node. This passes a hint to ocfs2_drop_inode
342 * to force ocfs2_delete_inode, who will take the
343 * proper cluster locks to sort things out. */
344 if (new_nlink == 0) {
345 spin_lock(&oi->ip_lock);
346 oi->ip_flags |= OCFS2_INODE_MAYBE_ORPHANED;
347 spin_unlock(&OCFS2_I(inode)->ip_lock);
348 }
349 }
350}
351
352static void ocfs2_process_vote(struct ocfs2_super *osb,
353 struct ocfs2_vote_msg *msg)
354{
355 int net_status, vote_response;
356 int orphaned_slot = 0;
357 int rename = 0;
358 unsigned int node_num, generation, new_nlink, namelen;
359 u64 blkno, parent_blkno;
360 enum ocfs2_vote_request request;
361 struct inode *inode = NULL;
362 struct ocfs2_msg_hdr *hdr = &msg->v_hdr;
363 struct ocfs2_response_msg response;
364
365 /* decode the network mumbo jumbo into local variables. */
366 request = be32_to_cpu(hdr->h_request);
367 blkno = be64_to_cpu(hdr->h_blkno);
368 generation = be32_to_cpu(hdr->h_generation);
369 node_num = be32_to_cpu(hdr->h_node_num);
370 if (request == OCFS2_VOTE_REQ_DELETE)
371 orphaned_slot = be32_to_cpu(msg->md1.v_orphaned_slot);
372
373 mlog(0, "processing vote: request = %u, blkno = %"MLFu64", "
374 "generation = %u, node_num = %u, priv1 = %u\n", request,
375 blkno, generation, node_num, be32_to_cpu(msg->md1.v_generic1));
376
377 if (!ocfs2_is_valid_vote_request(request)) {
378 mlog(ML_ERROR, "Invalid vote request %d from node %u\n",
379 request, node_num);
380 vote_response = OCFS2_RESPONSE_BAD_MSG;
381 goto respond;
382 }
383
384 vote_response = OCFS2_RESPONSE_OK;
385
386 switch (request) {
387 case OCFS2_VOTE_REQ_UMOUNT:
388 ocfs2_process_umount_request(osb, node_num);
389 goto respond;
390 case OCFS2_VOTE_REQ_MOUNT:
391 ocfs2_process_mount_request(osb, node_num);
392 goto respond;
393 default:
394 /* avoids a gcc warning */
395 break;
396 }
397
398 /* We cannot process the remaining message types before we're
399 * fully mounted. It's perfectly safe however to send a 'yes'
400 * response as we can't possibly have any of the state they're
401 * asking us to modify yet. */
402 if (atomic_read(&osb->vol_state) == VOLUME_INIT)
403 goto respond;
404
405 /* If we get here, then the request is against an inode. */
406 inode = ocfs2_ilookup_for_vote(osb, blkno,
407 request == OCFS2_VOTE_REQ_DELETE);
408
409 /* Not finding the inode is perfectly valid - it means we're
410 * not interested in what the other node is about to do to it
411 * so in those cases we automatically respond with an
412 * affirmative. Cluster locking ensures that we won't race
413 * interest in the inode with this vote request. */
414 if (!inode)
415 goto respond;
416
417 /* Check generation values. It's possible for us to get a
418 * request against a stale inode. If so then we proceed as if
419 * we had not found an inode in the first place. */
420 if (inode->i_generation != generation) {
421 mlog(0, "generation passed %u != inode generation = %u, "
422 "ip_flags = %x, ip_blkno = %"MLFu64", msg %"MLFu64", "
423 "i_count = %u, message type = %u\n",
424 generation, inode->i_generation, OCFS2_I(inode)->ip_flags,
425 OCFS2_I(inode)->ip_blkno, blkno,
426 atomic_read(&inode->i_count), request);
427 iput(inode);
428 inode = NULL;
429 goto respond;
430 }
431
432 switch (request) {
433 case OCFS2_VOTE_REQ_DELETE:
434 vote_response = ocfs2_process_delete_request(inode,
435 &orphaned_slot);
436 break;
437 case OCFS2_VOTE_REQ_RENAME:
438 rename = 1;
439 /* fall through */
440 case OCFS2_VOTE_REQ_UNLINK:
441 parent_blkno = be64_to_cpu(msg->v_unlink_parent);
442 namelen = be32_to_cpu(msg->v_unlink_namelen);
443 /* new_nlink will be ignored in case of a rename vote */
444 new_nlink = be32_to_cpu(msg->md1.v_nlink);
445 ocfs2_process_dentry_request(inode, rename, new_nlink,
446 parent_blkno, namelen,
447 msg->v_unlink_dirent);
448 break;
449 default:
450 mlog(ML_ERROR, "node %u, invalid request: %u\n",
451 node_num, request);
452 vote_response = OCFS2_RESPONSE_BAD_MSG;
453 }
454
455respond:
456 /* Response struture is small so we just put it on the stack
457 * and stuff it inline. */
458 memset(&response, 0, sizeof(struct ocfs2_response_msg));
459 response.r_hdr.h_response_id = hdr->h_response_id;
460 response.r_hdr.h_blkno = hdr->h_blkno;
461 response.r_hdr.h_generation = hdr->h_generation;
462 response.r_hdr.h_node_num = cpu_to_be32(osb->node_num);
463 response.r_response = cpu_to_be32(vote_response);
464 response.r_orphaned_slot = cpu_to_be32(orphaned_slot);
465
466 net_status = o2net_send_message(OCFS2_MESSAGE_TYPE_RESPONSE,
467 osb->net_key,
468 &response,
469 sizeof(struct ocfs2_response_msg),
470 node_num,
471 NULL);
472 /* We still want to error print for ENOPROTOOPT here. The
473 * sending node shouldn't have unregistered his net handler
474 * without sending an unmount vote 1st */
475 if (net_status < 0
476 && net_status != -ETIMEDOUT
477 && net_status != -ENOTCONN)
478 mlog(ML_ERROR, "message to node %u fails with error %d!\n",
479 node_num, net_status);
480
481 if (inode)
482 iput(inode);
483}
484
485static void ocfs2_vote_thread_do_work(struct ocfs2_super *osb)
486{
487 unsigned long processed;
488 struct ocfs2_lock_res *lockres;
489 struct ocfs2_vote_work *work;
490
491 mlog_entry_void();
492
493 spin_lock(&osb->vote_task_lock);
494 /* grab this early so we know to try again if a state change and
495 * wake happens part-way through our work */
496 osb->vote_work_sequence = osb->vote_wake_sequence;
497
498 processed = osb->blocked_lock_count;
499 while (processed) {
500 BUG_ON(list_empty(&osb->blocked_lock_list));
501
502 lockres = list_entry(osb->blocked_lock_list.next,
503 struct ocfs2_lock_res, l_blocked_list);
504 list_del_init(&lockres->l_blocked_list);
505 osb->blocked_lock_count--;
506 spin_unlock(&osb->vote_task_lock);
507
508 BUG_ON(!processed);
509 processed--;
510
511 ocfs2_process_blocked_lock(osb, lockres);
512
513 spin_lock(&osb->vote_task_lock);
514 }
515
516 while (osb->vote_count) {
517 BUG_ON(list_empty(&osb->vote_list));
518 work = list_entry(osb->vote_list.next,
519 struct ocfs2_vote_work, w_list);
520 list_del(&work->w_list);
521 osb->vote_count--;
522 spin_unlock(&osb->vote_task_lock);
523
524 ocfs2_process_vote(osb, &work->w_msg);
525 kfree(work);
526
527 spin_lock(&osb->vote_task_lock);
528 }
529 spin_unlock(&osb->vote_task_lock);
530
531 mlog_exit_void();
532}
533
534static int ocfs2_vote_thread_lists_empty(struct ocfs2_super *osb)
535{
536 int empty = 0;
537
538 spin_lock(&osb->vote_task_lock);
539 if (list_empty(&osb->blocked_lock_list) &&
540 list_empty(&osb->vote_list))
541 empty = 1;
542
543 spin_unlock(&osb->vote_task_lock);
544 return empty;
545}
546
547static int ocfs2_vote_thread_should_wake(struct ocfs2_super *osb)
548{
549 int should_wake = 0;
550
551 spin_lock(&osb->vote_task_lock);
552 if (osb->vote_work_sequence != osb->vote_wake_sequence)
553 should_wake = 1;
554 spin_unlock(&osb->vote_task_lock);
555
556 return should_wake;
557}
558
559int ocfs2_vote_thread(void *arg)
560{
561 int status = 0;
562 struct ocfs2_super *osb = arg;
563
564 /* only quit once we've been asked to stop and there is no more
565 * work available */
566 while (!(kthread_should_stop() &&
567 ocfs2_vote_thread_lists_empty(osb))) {
568
569 wait_event_interruptible(osb->vote_event,
570 ocfs2_vote_thread_should_wake(osb) ||
571 kthread_should_stop());
572
573 mlog(0, "vote_thread: awoken\n");
574
575 ocfs2_vote_thread_do_work(osb);
576 }
577
578 osb->vote_task = NULL;
579 return status;
580}
581
582static struct ocfs2_net_wait_ctxt *ocfs2_new_net_wait_ctxt(unsigned int response_id)
583{
584 struct ocfs2_net_wait_ctxt *w;
585
586 w = kcalloc(1, sizeof(*w), GFP_KERNEL);
587 if (!w) {
588 mlog_errno(-ENOMEM);
589 goto bail;
590 }
591
592 INIT_LIST_HEAD(&w->n_list);
593 init_waitqueue_head(&w->n_event);
594 ocfs2_node_map_init(&w->n_node_map);
595 w->n_response_id = response_id;
596 w->n_callback = NULL;
597bail:
598 return w;
599}
600
601static unsigned int ocfs2_new_response_id(struct ocfs2_super *osb)
602{
603 unsigned int ret;
604
605 spin_lock(&osb->net_response_lock);
606 ret = ++osb->net_response_ids;
607 spin_unlock(&osb->net_response_lock);
608
609 return ret;
610}
611
612static void ocfs2_dequeue_net_wait_ctxt(struct ocfs2_super *osb,
613 struct ocfs2_net_wait_ctxt *w)
614{
615 spin_lock(&osb->net_response_lock);
616 list_del(&w->n_list);
617 spin_unlock(&osb->net_response_lock);
618}
619
620static void ocfs2_queue_net_wait_ctxt(struct ocfs2_super *osb,
621 struct ocfs2_net_wait_ctxt *w)
622{
623 spin_lock(&osb->net_response_lock);
624 list_add_tail(&w->n_list,
625 &osb->net_response_list);
626 spin_unlock(&osb->net_response_lock);
627}
628
629static void __ocfs2_mark_node_responded(struct ocfs2_super *osb,
630 struct ocfs2_net_wait_ctxt *w,
631 int node_num)
632{
633 assert_spin_locked(&osb->net_response_lock);
634
635 ocfs2_node_map_clear_bit(osb, &w->n_node_map, node_num);
636 if (ocfs2_node_map_is_empty(osb, &w->n_node_map))
637 wake_up(&w->n_event);
638}
639
640/* Intended to be called from the node down callback, we fake remove
641 * the node from all our response contexts */
642void ocfs2_remove_node_from_vote_queues(struct ocfs2_super *osb,
643 int node_num)
644{
645 struct list_head *p;
646 struct ocfs2_net_wait_ctxt *w = NULL;
647
648 spin_lock(&osb->net_response_lock);
649
650 list_for_each(p, &osb->net_response_list) {
651 w = list_entry(p, struct ocfs2_net_wait_ctxt, n_list);
652
653 __ocfs2_mark_node_responded(osb, w, node_num);
654 }
655
656 spin_unlock(&osb->net_response_lock);
657}
658
659static int ocfs2_broadcast_vote(struct ocfs2_super *osb,
660 struct ocfs2_vote_msg *request,
661 unsigned int response_id,
662 int *response,
663 struct ocfs2_net_response_cb *callback)
664{
665 int status, i, remote_err;
666 struct ocfs2_net_wait_ctxt *w = NULL;
667 int dequeued = 0;
668
669 mlog_entry_void();
670
671 w = ocfs2_new_net_wait_ctxt(response_id);
672 if (!w) {
673 status = -ENOMEM;
674 mlog_errno(status);
675 goto bail;
676 }
677 w->n_callback = callback;
678
679 /* we're pretty much ready to go at this point, and this fills
680 * in n_response which we need anyway... */
681 ocfs2_queue_net_wait_ctxt(osb, w);
682
683 i = ocfs2_node_map_iterate(osb, &osb->mounted_map, 0);
684
685 while (i != O2NM_INVALID_NODE_NUM) {
686 if (i != osb->node_num) {
687 mlog(0, "trying to send request to node %i\n", i);
688 ocfs2_node_map_set_bit(osb, &w->n_node_map, i);
689
690 remote_err = 0;
691 status = o2net_send_message(OCFS2_MESSAGE_TYPE_VOTE,
692 osb->net_key,
693 request,
694 sizeof(*request),
695 i,
696 &remote_err);
697 if (status == -ETIMEDOUT) {
698 mlog(0, "remote node %d timed out!\n", i);
699 status = -EAGAIN;
700 goto bail;
701 }
702 if (remote_err < 0) {
703 status = remote_err;
704 mlog(0, "remote error %d on node %d!\n",
705 remote_err, i);
706 mlog_errno(status);
707 goto bail;
708 }
709 if (status < 0) {
710 mlog_errno(status);
711 goto bail;
712 }
713 }
714 i++;
715 i = ocfs2_node_map_iterate(osb, &osb->mounted_map, i);
716 mlog(0, "next is %d, i am %d\n", i, osb->node_num);
717 }
718 mlog(0, "done sending, now waiting on responses...\n");
719
720 wait_event(w->n_event, ocfs2_node_map_is_empty(osb, &w->n_node_map));
721
722 ocfs2_dequeue_net_wait_ctxt(osb, w);
723 dequeued = 1;
724
725 *response = w->n_response;
726 status = 0;
727bail:
728 if (w) {
729 if (!dequeued)
730 ocfs2_dequeue_net_wait_ctxt(osb, w);
731 kfree(w);
732 }
733
734 mlog_exit(status);
735 return status;
736}
737
738static struct ocfs2_vote_msg * ocfs2_new_vote_request(struct ocfs2_super *osb,
739 u64 blkno,
740 unsigned int generation,
741 enum ocfs2_vote_request type,
742 u32 priv)
743{
744 struct ocfs2_vote_msg *request;
745 struct ocfs2_msg_hdr *hdr;
746
747 BUG_ON(!ocfs2_is_valid_vote_request(type));
748
749 request = kcalloc(1, sizeof(*request), GFP_KERNEL);
750 if (!request) {
751 mlog_errno(-ENOMEM);
752 } else {
753 hdr = &request->v_hdr;
754 hdr->h_node_num = cpu_to_be32(osb->node_num);
755 hdr->h_request = cpu_to_be32(type);
756 hdr->h_blkno = cpu_to_be64(blkno);
757 hdr->h_generation = cpu_to_be32(generation);
758
759 request->md1.v_generic1 = cpu_to_be32(priv);
760 }
761
762 return request;
763}
764
765/* Complete the buildup of a new vote request and process the
766 * broadcast return value. */
767static int ocfs2_do_request_vote(struct ocfs2_super *osb,
768 struct ocfs2_vote_msg *request,
769 struct ocfs2_net_response_cb *callback)
770{
771 int status, response;
772 unsigned int response_id;
773 struct ocfs2_msg_hdr *hdr;
774
775 response_id = ocfs2_new_response_id(osb);
776
777 hdr = &request->v_hdr;
778 hdr->h_response_id = cpu_to_be32(response_id);
779
780 status = ocfs2_broadcast_vote(osb, request, response_id, &response,
781 callback);
782 if (status < 0) {
783 mlog_errno(status);
784 goto bail;
785 }
786
787 status = response;
788bail:
789
790 return status;
791}
792
793static int ocfs2_request_vote(struct inode *inode,
794 struct ocfs2_vote_msg *request,
795 struct ocfs2_net_response_cb *callback)
796{
797 int status;
798 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
799
800 if (ocfs2_inode_is_new(inode))
801 return 0;
802
803 status = -EAGAIN;
804 while (status == -EAGAIN) {
805 if (!(osb->s_mount_opt & OCFS2_MOUNT_NOINTR) &&
806 signal_pending(current))
807 return -ERESTARTSYS;
808
809 status = ocfs2_super_lock(osb, 0);
810 if (status < 0) {
811 mlog_errno(status);
812 break;
813 }
814
815 status = 0;
816 if (!ocfs2_node_map_is_only(osb, &osb->mounted_map,
817 osb->node_num))
818 status = ocfs2_do_request_vote(osb, request, callback);
819
820 ocfs2_super_unlock(osb, 0);
821 }
822 return status;
823}
824
825static void ocfs2_delete_response_cb(void *priv,
826 struct ocfs2_response_msg *resp)
827{
828 int orphaned_slot, node;
829 struct inode *inode = priv;
830
831 orphaned_slot = be32_to_cpu(resp->r_orphaned_slot);
832 node = be32_to_cpu(resp->r_hdr.h_node_num);
833 mlog(0, "node %d tells us that inode %"MLFu64" is orphaned in slot "
834 "%d\n", node, OCFS2_I(inode)->ip_blkno, orphaned_slot);
835
836 /* The other node may not actually know which slot the inode
837 * is orphaned in. */
838 if (orphaned_slot == OCFS2_INVALID_SLOT)
839 return;
840
841 /* Ok, the responding node knows which slot this inode is
842 * orphaned in. We verify that the information is correct and
843 * then record this in the inode. ocfs2_delete_inode will use
844 * this information to determine which lock to take. */
845 spin_lock(&OCFS2_I(inode)->ip_lock);
846 mlog_bug_on_msg(OCFS2_I(inode)->ip_orphaned_slot != orphaned_slot &&
847 OCFS2_I(inode)->ip_orphaned_slot
848 != OCFS2_INVALID_SLOT, "Inode %"MLFu64": Node %d "
849 "says it's orphaned in slot %d, we think it's in %d\n",
850 OCFS2_I(inode)->ip_blkno,
851 be32_to_cpu(resp->r_hdr.h_node_num),
852 orphaned_slot, OCFS2_I(inode)->ip_orphaned_slot);
853
854 OCFS2_I(inode)->ip_orphaned_slot = orphaned_slot;
855 spin_unlock(&OCFS2_I(inode)->ip_lock);
856}
857
858int ocfs2_request_delete_vote(struct inode *inode)
859{
860 int orphaned_slot, status;
861 struct ocfs2_net_response_cb delete_cb;
862 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
863 struct ocfs2_vote_msg *request;
864
865 spin_lock(&OCFS2_I(inode)->ip_lock);
866 orphaned_slot = OCFS2_I(inode)->ip_orphaned_slot;
867 spin_unlock(&OCFS2_I(inode)->ip_lock);
868
869 delete_cb.rc_cb = ocfs2_delete_response_cb;
870 delete_cb.rc_priv = inode;
871
872 mlog(0, "Inode %"MLFu64", we start thinking orphaned slot is %d\n",
873 OCFS2_I(inode)->ip_blkno, orphaned_slot);
874
875 status = -ENOMEM;
876 request = ocfs2_new_vote_request(osb, OCFS2_I(inode)->ip_blkno,
877 inode->i_generation,
878 OCFS2_VOTE_REQ_DELETE, orphaned_slot);
879 if (request) {
880 status = ocfs2_request_vote(inode, request, &delete_cb);
881
882 kfree(request);
883 }
884
885 return status;
886}
887
888static void ocfs2_setup_unlink_vote(struct ocfs2_vote_msg *request,
889 struct dentry *dentry)
890{
891 struct inode *parent = dentry->d_parent->d_inode;
892
893 /* We need some values which will uniquely identify a dentry
894 * on the other nodes so that they can find it and run
895 * d_delete against it. Parent directory block and full name
896 * should suffice. */
897
898 mlog(0, "unlink/rename request: parent: %"MLFu64" name: %.*s\n",
899 OCFS2_I(parent)->ip_blkno, dentry->d_name.len,
900 dentry->d_name.name);
901
902 request->v_unlink_parent = cpu_to_be64(OCFS2_I(parent)->ip_blkno);
903 request->v_unlink_namelen = cpu_to_be32(dentry->d_name.len);
904 memcpy(request->v_unlink_dirent, dentry->d_name.name,
905 dentry->d_name.len);
906}
907
908int ocfs2_request_unlink_vote(struct inode *inode,
909 struct dentry *dentry,
910 unsigned int nlink)
911{
912 int status;
913 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
914 struct ocfs2_vote_msg *request;
915
916 if (dentry->d_name.len > OCFS2_VOTE_FILENAME_LEN)
917 return -ENAMETOOLONG;
918
919 status = -ENOMEM;
920 request = ocfs2_new_vote_request(osb, OCFS2_I(inode)->ip_blkno,
921 inode->i_generation,
922 OCFS2_VOTE_REQ_UNLINK, nlink);
923 if (request) {
924 ocfs2_setup_unlink_vote(request, dentry);
925
926 status = ocfs2_request_vote(inode, request, NULL);
927
928 kfree(request);
929 }
930 return status;
931}
932
933int ocfs2_request_rename_vote(struct inode *inode,
934 struct dentry *dentry)
935{
936 int status;
937 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
938 struct ocfs2_vote_msg *request;
939
940 if (dentry->d_name.len > OCFS2_VOTE_FILENAME_LEN)
941 return -ENAMETOOLONG;
942
943 status = -ENOMEM;
944 request = ocfs2_new_vote_request(osb, OCFS2_I(inode)->ip_blkno,
945 inode->i_generation,
946 OCFS2_VOTE_REQ_RENAME, 0);
947 if (request) {
948 ocfs2_setup_unlink_vote(request, dentry);
949
950 status = ocfs2_request_vote(inode, request, NULL);
951
952 kfree(request);
953 }
954 return status;
955}
956
957int ocfs2_request_mount_vote(struct ocfs2_super *osb)
958{
959 int status;
960 struct ocfs2_vote_msg *request = NULL;
961
962 request = ocfs2_new_vote_request(osb, 0ULL, 0,
963 OCFS2_VOTE_REQ_MOUNT, 0);
964 if (!request) {
965 status = -ENOMEM;
966 goto bail;
967 }
968
969 status = -EAGAIN;
970 while (status == -EAGAIN) {
971 if (!(osb->s_mount_opt & OCFS2_MOUNT_NOINTR) &&
972 signal_pending(current)) {
973 status = -ERESTARTSYS;
974 goto bail;
975 }
976
977 if (ocfs2_node_map_is_only(osb, &osb->mounted_map,
978 osb->node_num)) {
979 status = 0;
980 goto bail;
981 }
982
983 status = ocfs2_do_request_vote(osb, request, NULL);
984 }
985
986bail:
987 if (request)
988 kfree(request);
989
990 return status;
991}
992
993int ocfs2_request_umount_vote(struct ocfs2_super *osb)
994{
995 int status;
996 struct ocfs2_vote_msg *request = NULL;
997
998 request = ocfs2_new_vote_request(osb, 0ULL, 0,
999 OCFS2_VOTE_REQ_UMOUNT, 0);
1000 if (!request) {
1001 status = -ENOMEM;
1002 goto bail;
1003 }
1004
1005 status = -EAGAIN;
1006 while (status == -EAGAIN) {
1007 /* Do not check signals on this vote... We really want
1008 * this one to go all the way through. */
1009
1010 if (ocfs2_node_map_is_only(osb, &osb->mounted_map,
1011 osb->node_num)) {
1012 status = 0;
1013 goto bail;
1014 }
1015
1016 status = ocfs2_do_request_vote(osb, request, NULL);
1017 }
1018
1019bail:
1020 if (request)
1021 kfree(request);
1022
1023 return status;
1024}
1025
1026/* TODO: This should eventually be a hash table! */
1027static struct ocfs2_net_wait_ctxt * __ocfs2_find_net_wait_ctxt(struct ocfs2_super *osb,
1028 u32 response_id)
1029{
1030 struct list_head *p;
1031 struct ocfs2_net_wait_ctxt *w = NULL;
1032
1033 list_for_each(p, &osb->net_response_list) {
1034 w = list_entry(p, struct ocfs2_net_wait_ctxt, n_list);
1035 if (response_id == w->n_response_id)
1036 break;
1037 w = NULL;
1038 }
1039
1040 return w;
1041}
1042
1043/* Translate response codes into local node errno values */
1044static inline int ocfs2_translate_response(int response)
1045{
1046 int ret;
1047
1048 switch (response) {
1049 case OCFS2_RESPONSE_OK:
1050 ret = 0;
1051 break;
1052
1053 case OCFS2_RESPONSE_BUSY:
1054 ret = -EBUSY;
1055 break;
1056
1057 default:
1058 ret = -EINVAL;
1059 }
1060
1061 return ret;
1062}
1063
1064static int ocfs2_handle_response_message(struct o2net_msg *msg,
1065 u32 len,
1066 void *data)
1067{
1068 unsigned int response_id, node_num;
1069 int response_status;
1070 struct ocfs2_super *osb = data;
1071 struct ocfs2_response_msg *resp;
1072 struct ocfs2_net_wait_ctxt * w;
1073 struct ocfs2_net_response_cb *resp_cb;
1074
1075 resp = (struct ocfs2_response_msg *) msg->buf;
1076
1077 response_id = be32_to_cpu(resp->r_hdr.h_response_id);
1078 node_num = be32_to_cpu(resp->r_hdr.h_node_num);
1079 response_status =
1080 ocfs2_translate_response(be32_to_cpu(resp->r_response));
1081
1082 mlog(0, "received response message:\n");
1083 mlog(0, "h_response_id = %u\n", response_id);
1084 mlog(0, "h_request = %u\n", be32_to_cpu(resp->r_hdr.h_request));
1085 mlog(0, "h_blkno = %"MLFu64"\n", be64_to_cpu(resp->r_hdr.h_blkno));
1086 mlog(0, "h_generation = %u\n", be32_to_cpu(resp->r_hdr.h_generation));
1087 mlog(0, "h_node_num = %u\n", node_num);
1088 mlog(0, "r_response = %d\n", response_status);
1089
1090 spin_lock(&osb->net_response_lock);
1091 w = __ocfs2_find_net_wait_ctxt(osb, response_id);
1092 if (!w) {
1093 mlog(0, "request not found!\n");
1094 goto bail;
1095 }
1096 resp_cb = w->n_callback;
1097
1098 if (response_status && (!w->n_response)) {
1099 /* we only really need one negative response so don't
1100 * set it twice. */
1101 w->n_response = response_status;
1102 }
1103
1104 if (resp_cb) {
1105 spin_unlock(&osb->net_response_lock);
1106
1107 resp_cb->rc_cb(resp_cb->rc_priv, resp);
1108
1109 spin_lock(&osb->net_response_lock);
1110 }
1111
1112 __ocfs2_mark_node_responded(osb, w, node_num);
1113bail:
1114 spin_unlock(&osb->net_response_lock);
1115
1116 return 0;
1117}
1118
1119static int ocfs2_handle_vote_message(struct o2net_msg *msg,
1120 u32 len,
1121 void *data)
1122{
1123 int status;
1124 struct ocfs2_super *osb = data;
1125 struct ocfs2_vote_work *work;
1126
1127 work = kmalloc(sizeof(struct ocfs2_vote_work), GFP_KERNEL);
1128 if (!work) {
1129 status = -ENOMEM;
1130 mlog_errno(status);
1131 goto bail;
1132 }
1133
1134 INIT_LIST_HEAD(&work->w_list);
1135 memcpy(&work->w_msg, msg->buf, sizeof(struct ocfs2_vote_msg));
1136
1137 mlog(0, "scheduling vote request:\n");
1138 mlog(0, "h_response_id = %u\n",
1139 be32_to_cpu(work->w_msg.v_hdr.h_response_id));
1140 mlog(0, "h_request = %u\n", be32_to_cpu(work->w_msg.v_hdr.h_request));
1141 mlog(0, "h_blkno = %"MLFu64"\n",
1142 be64_to_cpu(work->w_msg.v_hdr.h_blkno));
1143 mlog(0, "h_generation = %u\n",
1144 be32_to_cpu(work->w_msg.v_hdr.h_generation));
1145 mlog(0, "h_node_num = %u\n",
1146 be32_to_cpu(work->w_msg.v_hdr.h_node_num));
1147 mlog(0, "v_generic1 = %u\n", be32_to_cpu(work->w_msg.md1.v_generic1));
1148
1149 spin_lock(&osb->vote_task_lock);
1150 list_add_tail(&work->w_list, &osb->vote_list);
1151 osb->vote_count++;
1152 spin_unlock(&osb->vote_task_lock);
1153
1154 ocfs2_kick_vote_thread(osb);
1155
1156 status = 0;
1157bail:
1158 return status;
1159}
1160
1161void ocfs2_unregister_net_handlers(struct ocfs2_super *osb)
1162{
1163 if (!osb->net_key)
1164 return;
1165
1166 o2net_unregister_handler_list(&osb->osb_net_handlers);
1167
1168 if (!list_empty(&osb->net_response_list))
1169 mlog(ML_ERROR, "net response list not empty!\n");
1170
1171 osb->net_key = 0;
1172}
1173
1174int ocfs2_register_net_handlers(struct ocfs2_super *osb)
1175{
1176 int status = 0;
1177
1178 status = o2net_register_handler(OCFS2_MESSAGE_TYPE_RESPONSE,
1179 osb->net_key,
1180 sizeof(struct ocfs2_response_msg),
1181 ocfs2_handle_response_message,
1182 osb, &osb->osb_net_handlers);
1183 if (status) {
1184 mlog_errno(status);
1185 goto bail;
1186 }
1187
1188 status = o2net_register_handler(OCFS2_MESSAGE_TYPE_VOTE,
1189 osb->net_key,
1190 sizeof(struct ocfs2_vote_msg),
1191 ocfs2_handle_vote_message,
1192 osb, &osb->osb_net_handlers);
1193 if (status) {
1194 mlog_errno(status);
1195 goto bail;
1196 }
1197bail:
1198 if (status < 0)
1199 ocfs2_unregister_net_handlers(osb);
1200
1201 return status;
1202}
diff --git a/fs/ocfs2/vote.h b/fs/ocfs2/vote.h
new file mode 100644
index 000000000000..9cce60703466
--- /dev/null
+++ b/fs/ocfs2/vote.h
@@ -0,0 +1,56 @@
1/* -*- mode: c; c-basic-offset: 8; -*-
2 * vim: noexpandtab sw=8 ts=8 sts=0:
3 *
4 * vote.h
5 *
6 * Declarations for the OCFS2 vote thread, vote requests and net handlers
7 *
8 * Copyright (C) 2002, 2004 Oracle. All rights reserved.
9 *
10 * This program is free software; you can redistribute it and/or
11 * modify it under the terms of the GNU General Public
12 * License as published by the Free Software Foundation; either
13 * version 2 of the License, or (at your option) any later version.
14 *
15 * This program is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
18 * General Public License for more details.
19 *
20 * You should have received a copy of the GNU General Public
21 * License along with this program; if not, write to the
22 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
23 * Boston, MA 021110-1307, USA.
24 */
25
26
27#ifndef VOTE_H
28#define VOTE_H
29
30int ocfs2_vote_thread(void *arg);
31static inline void ocfs2_kick_vote_thread(struct ocfs2_super *osb)
32{
33 spin_lock(&osb->vote_task_lock);
34 /* make sure the voting thread gets a swipe at whatever changes
35 * the caller may have made to the voting state */
36 osb->vote_wake_sequence++;
37 spin_unlock(&osb->vote_task_lock);
38 wake_up(&osb->vote_event);
39}
40
41int ocfs2_request_delete_vote(struct inode *inode);
42int ocfs2_request_unlink_vote(struct inode *inode,
43 struct dentry *dentry,
44 unsigned int nlink);
45int ocfs2_request_rename_vote(struct inode *inode,
46 struct dentry *dentry);
47int ocfs2_request_mount_vote(struct ocfs2_super *osb);
48int ocfs2_request_umount_vote(struct ocfs2_super *osb);
49int ocfs2_register_net_handlers(struct ocfs2_super *osb);
50void ocfs2_unregister_net_handlers(struct ocfs2_super *osb);
51
52void ocfs2_mark_inode_remotely_deleted(struct inode *inode);
53
54void ocfs2_remove_node_from_vote_queues(struct ocfs2_super *osb,
55 int node_num);
56#endif