diff options
author | Mark Fasheh <mark.fasheh@oracle.com> | 2005-12-15 17:31:24 -0500 |
---|---|---|
committer | Joel Becker <joel.becker@oracle.com> | 2006-01-03 14:45:47 -0500 |
commit | ccd979bdbce9fba8412beb3f1de68a9d0171b12c (patch) | |
tree | c50ed941849ce06ccadd4ce27599b3ef9fdbe2ae /fs | |
parent | 8df08c89c668e1bd922a053fdb5ba1fadbecbb38 (diff) |
[PATCH] OCFS2: The Second Oracle Cluster Filesystem
The OCFS2 file system module.
Signed-off-by: Mark Fasheh <mark.fasheh@oracle.com>
Signed-off-by: Kurt Hackel <kurt.hackel@oracle.com>
Diffstat (limited to 'fs')
52 files changed, 24438 insertions, 0 deletions
diff --git a/fs/ocfs2/Makefile b/fs/ocfs2/Makefile new file mode 100644 index 000000000000..7d3be845a614 --- /dev/null +++ b/fs/ocfs2/Makefile | |||
@@ -0,0 +1,33 @@ | |||
# Build rules for the OCFS2 file system module.

# Add fs/ocfs2 to the include path (sources use paths such as
# <cluster/masklog.h>) and define CATCH_BH_JBD_RACES for every object
# compiled from this directory.
EXTRA_CFLAGS += -Ifs/ocfs2 -DCATCH_BH_JBD_RACES

obj-$(CONFIG_OCFS2_FS) += ocfs2.o

# Objects that are linked together to form ocfs2.o.
ocfs2-objs := alloc.o aops.o buffer_head_io.o dcache.o dir.o dlmglue.o \
	      export.o extent_map.o file.o heartbeat.o inode.o journal.o \
	      localalloc.o mmap.o namei.o slot_map.o suballoc.o super.o \
	      symlink.o sysfile.o uptodate.o ver.o vote.o

# Sub-directories that are descended into along with the file system.
obj-$(CONFIG_OCFS2_FS) += cluster/ dlm/
diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c new file mode 100644 index 000000000000..465f797451ee --- /dev/null +++ b/fs/ocfs2/alloc.c | |||
@@ -0,0 +1,2040 @@ | |||
1 | /* -*- mode: c; c-basic-offset: 8; -*- | ||
2 | * vim: noexpandtab sw=8 ts=8 sts=0: | ||
3 | * | ||
4 | * alloc.c | ||
5 | * | ||
6 | * Extent allocs and frees | ||
7 | * | ||
8 | * Copyright (C) 2002, 2004 Oracle. All rights reserved. | ||
9 | * | ||
10 | * This program is free software; you can redistribute it and/or | ||
11 | * modify it under the terms of the GNU General Public | ||
12 | * License as published by the Free Software Foundation; either | ||
13 | * version 2 of the License, or (at your option) any later version. | ||
14 | * | ||
15 | * This program is distributed in the hope that it will be useful, | ||
16 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
17 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | ||
18 | * General Public License for more details. | ||
19 | * | ||
20 | * You should have received a copy of the GNU General Public | ||
21 | * License along with this program; if not, write to the | ||
22 | * Free Software Foundation, Inc., 59 Temple Place - Suite 330, | ||
23 | * Boston, MA 02111-1307, USA. | ||
24 | */ | ||
25 | |||
26 | #include <linux/fs.h> | ||
27 | #include <linux/types.h> | ||
28 | #include <linux/slab.h> | ||
29 | #include <linux/highmem.h> | ||
30 | |||
31 | #define MLOG_MASK_PREFIX ML_DISK_ALLOC | ||
32 | #include <cluster/masklog.h> | ||
33 | |||
34 | #include "ocfs2.h" | ||
35 | |||
36 | #include "alloc.h" | ||
37 | #include "dlmglue.h" | ||
38 | #include "extent_map.h" | ||
39 | #include "inode.h" | ||
40 | #include "journal.h" | ||
41 | #include "localalloc.h" | ||
42 | #include "suballoc.h" | ||
43 | #include "sysfile.h" | ||
44 | #include "file.h" | ||
45 | #include "super.h" | ||
46 | #include "uptodate.h" | ||
47 | |||
48 | #include "buffer_head_io.h" | ||
49 | |||
50 | static int ocfs2_extent_contig(struct inode *inode, | ||
51 | struct ocfs2_extent_rec *ext, | ||
52 | u64 blkno); | ||
53 | |||
54 | static int ocfs2_create_new_meta_bhs(struct ocfs2_super *osb, | ||
55 | struct ocfs2_journal_handle *handle, | ||
56 | struct inode *inode, | ||
57 | int wanted, | ||
58 | struct ocfs2_alloc_context *meta_ac, | ||
59 | struct buffer_head *bhs[]); | ||
60 | |||
61 | static int ocfs2_add_branch(struct ocfs2_super *osb, | ||
62 | struct ocfs2_journal_handle *handle, | ||
63 | struct inode *inode, | ||
64 | struct buffer_head *fe_bh, | ||
65 | struct buffer_head *eb_bh, | ||
66 | struct buffer_head *last_eb_bh, | ||
67 | struct ocfs2_alloc_context *meta_ac); | ||
68 | |||
69 | static int ocfs2_shift_tree_depth(struct ocfs2_super *osb, | ||
70 | struct ocfs2_journal_handle *handle, | ||
71 | struct inode *inode, | ||
72 | struct buffer_head *fe_bh, | ||
73 | struct ocfs2_alloc_context *meta_ac, | ||
74 | struct buffer_head **ret_new_eb_bh); | ||
75 | |||
76 | static int ocfs2_do_insert_extent(struct ocfs2_super *osb, | ||
77 | struct ocfs2_journal_handle *handle, | ||
78 | struct inode *inode, | ||
79 | struct buffer_head *fe_bh, | ||
80 | u64 blkno, | ||
81 | u32 new_clusters); | ||
82 | |||
83 | static int ocfs2_find_branch_target(struct ocfs2_super *osb, | ||
84 | struct inode *inode, | ||
85 | struct buffer_head *fe_bh, | ||
86 | struct buffer_head **target_bh); | ||
87 | |||
88 | static int ocfs2_find_new_last_ext_blk(struct ocfs2_super *osb, | ||
89 | struct inode *inode, | ||
90 | struct ocfs2_dinode *fe, | ||
91 | unsigned int new_i_clusters, | ||
92 | struct buffer_head *old_last_eb, | ||
93 | struct buffer_head **new_last_eb); | ||
94 | |||
95 | static void ocfs2_free_truncate_context(struct ocfs2_truncate_context *tc); | ||
96 | |||
97 | static int ocfs2_extent_contig(struct inode *inode, | ||
98 | struct ocfs2_extent_rec *ext, | ||
99 | u64 blkno) | ||
100 | { | ||
101 | return blkno == (le64_to_cpu(ext->e_blkno) + | ||
102 | ocfs2_clusters_to_blocks(inode->i_sb, | ||
103 | le32_to_cpu(ext->e_clusters))); | ||
104 | } | ||
105 | |||
106 | /* | ||
107 | * How many free extents have we got before we need more meta data? | ||
108 | */ | ||
109 | int ocfs2_num_free_extents(struct ocfs2_super *osb, | ||
110 | struct inode *inode, | ||
111 | struct ocfs2_dinode *fe) | ||
112 | { | ||
113 | int retval; | ||
114 | struct ocfs2_extent_list *el; | ||
115 | struct ocfs2_extent_block *eb; | ||
116 | struct buffer_head *eb_bh = NULL; | ||
117 | |||
118 | mlog_entry_void(); | ||
119 | |||
120 | if (!OCFS2_IS_VALID_DINODE(fe)) { | ||
121 | OCFS2_RO_ON_INVALID_DINODE(inode->i_sb, fe); | ||
122 | retval = -EIO; | ||
123 | goto bail; | ||
124 | } | ||
125 | |||
126 | if (fe->i_last_eb_blk) { | ||
127 | retval = ocfs2_read_block(osb, le64_to_cpu(fe->i_last_eb_blk), | ||
128 | &eb_bh, OCFS2_BH_CACHED, inode); | ||
129 | if (retval < 0) { | ||
130 | mlog_errno(retval); | ||
131 | goto bail; | ||
132 | } | ||
133 | eb = (struct ocfs2_extent_block *) eb_bh->b_data; | ||
134 | el = &eb->h_list; | ||
135 | } else | ||
136 | el = &fe->id2.i_list; | ||
137 | |||
138 | BUG_ON(el->l_tree_depth != 0); | ||
139 | |||
140 | retval = le16_to_cpu(el->l_count) - le16_to_cpu(el->l_next_free_rec); | ||
141 | bail: | ||
142 | if (eb_bh) | ||
143 | brelse(eb_bh); | ||
144 | |||
145 | mlog_exit(retval); | ||
146 | return retval; | ||
147 | } | ||
148 | |||
149 | /* expects array to already be allocated | ||
150 | * | ||
151 | * sets h_signature, h_blkno, h_suballoc_bit, h_suballoc_slot, and | ||
152 | * l_count for you | ||
153 | */ | ||
154 | static int ocfs2_create_new_meta_bhs(struct ocfs2_super *osb, | ||
155 | struct ocfs2_journal_handle *handle, | ||
156 | struct inode *inode, | ||
157 | int wanted, | ||
158 | struct ocfs2_alloc_context *meta_ac, | ||
159 | struct buffer_head *bhs[]) | ||
160 | { | ||
161 | int count, status, i; | ||
162 | u16 suballoc_bit_start; | ||
163 | u32 num_got; | ||
164 | u64 first_blkno; | ||
165 | struct ocfs2_extent_block *eb; | ||
166 | |||
167 | mlog_entry_void(); | ||
168 | |||
169 | count = 0; | ||
170 | while (count < wanted) { | ||
171 | status = ocfs2_claim_metadata(osb, | ||
172 | handle, | ||
173 | meta_ac, | ||
174 | wanted - count, | ||
175 | &suballoc_bit_start, | ||
176 | &num_got, | ||
177 | &first_blkno); | ||
178 | if (status < 0) { | ||
179 | mlog_errno(status); | ||
180 | goto bail; | ||
181 | } | ||
182 | |||
183 | for(i = count; i < (num_got + count); i++) { | ||
184 | bhs[i] = sb_getblk(osb->sb, first_blkno); | ||
185 | if (bhs[i] == NULL) { | ||
186 | status = -EIO; | ||
187 | mlog_errno(status); | ||
188 | goto bail; | ||
189 | } | ||
190 | ocfs2_set_new_buffer_uptodate(inode, bhs[i]); | ||
191 | |||
192 | status = ocfs2_journal_access(handle, inode, bhs[i], | ||
193 | OCFS2_JOURNAL_ACCESS_CREATE); | ||
194 | if (status < 0) { | ||
195 | mlog_errno(status); | ||
196 | goto bail; | ||
197 | } | ||
198 | |||
199 | memset(bhs[i]->b_data, 0, osb->sb->s_blocksize); | ||
200 | eb = (struct ocfs2_extent_block *) bhs[i]->b_data; | ||
201 | /* Ok, setup the minimal stuff here. */ | ||
202 | strcpy(eb->h_signature, OCFS2_EXTENT_BLOCK_SIGNATURE); | ||
203 | eb->h_blkno = cpu_to_le64(first_blkno); | ||
204 | eb->h_fs_generation = cpu_to_le32(osb->fs_generation); | ||
205 | |||
206 | #ifndef OCFS2_USE_ALL_METADATA_SUBALLOCATORS | ||
207 | /* we always use slot zero's suballocator */ | ||
208 | eb->h_suballoc_slot = 0; | ||
209 | #else | ||
210 | eb->h_suballoc_slot = cpu_to_le16(osb->slot_num); | ||
211 | #endif | ||
212 | eb->h_suballoc_bit = cpu_to_le16(suballoc_bit_start); | ||
213 | eb->h_list.l_count = | ||
214 | cpu_to_le16(ocfs2_extent_recs_per_eb(osb->sb)); | ||
215 | |||
216 | suballoc_bit_start++; | ||
217 | first_blkno++; | ||
218 | |||
219 | /* We'll also be dirtied by the caller, so | ||
220 | * this isn't absolutely necessary. */ | ||
221 | status = ocfs2_journal_dirty(handle, bhs[i]); | ||
222 | if (status < 0) { | ||
223 | mlog_errno(status); | ||
224 | goto bail; | ||
225 | } | ||
226 | } | ||
227 | |||
228 | count += num_got; | ||
229 | } | ||
230 | |||
231 | status = 0; | ||
232 | bail: | ||
233 | if (status < 0) { | ||
234 | for(i = 0; i < wanted; i++) { | ||
235 | if (bhs[i]) | ||
236 | brelse(bhs[i]); | ||
237 | bhs[i] = NULL; | ||
238 | } | ||
239 | } | ||
240 | mlog_exit(status); | ||
241 | return status; | ||
242 | } | ||
243 | |||
244 | /* | ||
245 | * Add an entire tree branch to our inode. eb_bh is the extent block | ||
246 | * to start at, if we don't want to start the branch at the dinode | ||
247 | * structure. | ||
248 | * | ||
249 | * last_eb_bh is required as we have to update it's next_leaf pointer | ||
250 | * for the new last extent block. | ||
251 | * | ||
252 | * the new branch will be 'empty' in the sense that every block will | ||
253 | * contain a single record with e_clusters == 0. | ||
254 | */ | ||
255 | static int ocfs2_add_branch(struct ocfs2_super *osb, | ||
256 | struct ocfs2_journal_handle *handle, | ||
257 | struct inode *inode, | ||
258 | struct buffer_head *fe_bh, | ||
259 | struct buffer_head *eb_bh, | ||
260 | struct buffer_head *last_eb_bh, | ||
261 | struct ocfs2_alloc_context *meta_ac) | ||
262 | { | ||
263 | int status, new_blocks, i; | ||
264 | u64 next_blkno, new_last_eb_blk; | ||
265 | struct buffer_head *bh; | ||
266 | struct buffer_head **new_eb_bhs = NULL; | ||
267 | struct ocfs2_dinode *fe; | ||
268 | struct ocfs2_extent_block *eb; | ||
269 | struct ocfs2_extent_list *eb_el; | ||
270 | struct ocfs2_extent_list *el; | ||
271 | |||
272 | mlog_entry_void(); | ||
273 | |||
274 | BUG_ON(!last_eb_bh); | ||
275 | |||
276 | fe = (struct ocfs2_dinode *) fe_bh->b_data; | ||
277 | |||
278 | if (eb_bh) { | ||
279 | eb = (struct ocfs2_extent_block *) eb_bh->b_data; | ||
280 | el = &eb->h_list; | ||
281 | } else | ||
282 | el = &fe->id2.i_list; | ||
283 | |||
284 | /* we never add a branch to a leaf. */ | ||
285 | BUG_ON(!el->l_tree_depth); | ||
286 | |||
287 | new_blocks = le16_to_cpu(el->l_tree_depth); | ||
288 | |||
289 | /* allocate the number of new eb blocks we need */ | ||
290 | new_eb_bhs = kcalloc(new_blocks, sizeof(struct buffer_head *), | ||
291 | GFP_KERNEL); | ||
292 | if (!new_eb_bhs) { | ||
293 | status = -ENOMEM; | ||
294 | mlog_errno(status); | ||
295 | goto bail; | ||
296 | } | ||
297 | |||
298 | status = ocfs2_create_new_meta_bhs(osb, handle, inode, new_blocks, | ||
299 | meta_ac, new_eb_bhs); | ||
300 | if (status < 0) { | ||
301 | mlog_errno(status); | ||
302 | goto bail; | ||
303 | } | ||
304 | |||
305 | /* Note: new_eb_bhs[new_blocks - 1] is the guy which will be | ||
306 | * linked with the rest of the tree. | ||
307 | * conversly, new_eb_bhs[0] is the new bottommost leaf. | ||
308 | * | ||
309 | * when we leave the loop, new_last_eb_blk will point to the | ||
310 | * newest leaf, and next_blkno will point to the topmost extent | ||
311 | * block. */ | ||
312 | next_blkno = new_last_eb_blk = 0; | ||
313 | for(i = 0; i < new_blocks; i++) { | ||
314 | bh = new_eb_bhs[i]; | ||
315 | eb = (struct ocfs2_extent_block *) bh->b_data; | ||
316 | if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) { | ||
317 | OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb, eb); | ||
318 | status = -EIO; | ||
319 | goto bail; | ||
320 | } | ||
321 | eb_el = &eb->h_list; | ||
322 | |||
323 | status = ocfs2_journal_access(handle, inode, bh, | ||
324 | OCFS2_JOURNAL_ACCESS_CREATE); | ||
325 | if (status < 0) { | ||
326 | mlog_errno(status); | ||
327 | goto bail; | ||
328 | } | ||
329 | |||
330 | eb->h_next_leaf_blk = 0; | ||
331 | eb_el->l_tree_depth = cpu_to_le16(i); | ||
332 | eb_el->l_next_free_rec = cpu_to_le16(1); | ||
333 | eb_el->l_recs[0].e_cpos = fe->i_clusters; | ||
334 | eb_el->l_recs[0].e_blkno = cpu_to_le64(next_blkno); | ||
335 | eb_el->l_recs[0].e_clusters = cpu_to_le32(0); | ||
336 | if (!eb_el->l_tree_depth) | ||
337 | new_last_eb_blk = le64_to_cpu(eb->h_blkno); | ||
338 | |||
339 | status = ocfs2_journal_dirty(handle, bh); | ||
340 | if (status < 0) { | ||
341 | mlog_errno(status); | ||
342 | goto bail; | ||
343 | } | ||
344 | |||
345 | next_blkno = le64_to_cpu(eb->h_blkno); | ||
346 | } | ||
347 | |||
348 | /* This is a bit hairy. We want to update up to three blocks | ||
349 | * here without leaving any of them in an inconsistent state | ||
350 | * in case of error. We don't have to worry about | ||
351 | * journal_dirty erroring as it won't unless we've aborted the | ||
352 | * handle (in which case we would never be here) so reserving | ||
353 | * the write with journal_access is all we need to do. */ | ||
354 | status = ocfs2_journal_access(handle, inode, last_eb_bh, | ||
355 | OCFS2_JOURNAL_ACCESS_WRITE); | ||
356 | if (status < 0) { | ||
357 | mlog_errno(status); | ||
358 | goto bail; | ||
359 | } | ||
360 | status = ocfs2_journal_access(handle, inode, fe_bh, | ||
361 | OCFS2_JOURNAL_ACCESS_WRITE); | ||
362 | if (status < 0) { | ||
363 | mlog_errno(status); | ||
364 | goto bail; | ||
365 | } | ||
366 | if (eb_bh) { | ||
367 | status = ocfs2_journal_access(handle, inode, eb_bh, | ||
368 | OCFS2_JOURNAL_ACCESS_WRITE); | ||
369 | if (status < 0) { | ||
370 | mlog_errno(status); | ||
371 | goto bail; | ||
372 | } | ||
373 | } | ||
374 | |||
375 | /* Link the new branch into the rest of the tree (el will | ||
376 | * either be on the fe, or the extent block passed in. */ | ||
377 | i = le16_to_cpu(el->l_next_free_rec); | ||
378 | el->l_recs[i].e_blkno = cpu_to_le64(next_blkno); | ||
379 | el->l_recs[i].e_cpos = fe->i_clusters; | ||
380 | el->l_recs[i].e_clusters = 0; | ||
381 | le16_add_cpu(&el->l_next_free_rec, 1); | ||
382 | |||
383 | /* fe needs a new last extent block pointer, as does the | ||
384 | * next_leaf on the previously last-extent-block. */ | ||
385 | fe->i_last_eb_blk = cpu_to_le64(new_last_eb_blk); | ||
386 | |||
387 | eb = (struct ocfs2_extent_block *) last_eb_bh->b_data; | ||
388 | eb->h_next_leaf_blk = cpu_to_le64(new_last_eb_blk); | ||
389 | |||
390 | status = ocfs2_journal_dirty(handle, last_eb_bh); | ||
391 | if (status < 0) | ||
392 | mlog_errno(status); | ||
393 | status = ocfs2_journal_dirty(handle, fe_bh); | ||
394 | if (status < 0) | ||
395 | mlog_errno(status); | ||
396 | if (eb_bh) { | ||
397 | status = ocfs2_journal_dirty(handle, eb_bh); | ||
398 | if (status < 0) | ||
399 | mlog_errno(status); | ||
400 | } | ||
401 | |||
402 | status = 0; | ||
403 | bail: | ||
404 | if (new_eb_bhs) { | ||
405 | for (i = 0; i < new_blocks; i++) | ||
406 | if (new_eb_bhs[i]) | ||
407 | brelse(new_eb_bhs[i]); | ||
408 | kfree(new_eb_bhs); | ||
409 | } | ||
410 | |||
411 | mlog_exit(status); | ||
412 | return status; | ||
413 | } | ||
414 | |||
415 | /* | ||
416 | * adds another level to the allocation tree. | ||
417 | * returns back the new extent block so you can add a branch to it | ||
418 | * after this call. | ||
419 | */ | ||
420 | static int ocfs2_shift_tree_depth(struct ocfs2_super *osb, | ||
421 | struct ocfs2_journal_handle *handle, | ||
422 | struct inode *inode, | ||
423 | struct buffer_head *fe_bh, | ||
424 | struct ocfs2_alloc_context *meta_ac, | ||
425 | struct buffer_head **ret_new_eb_bh) | ||
426 | { | ||
427 | int status, i; | ||
428 | struct buffer_head *new_eb_bh = NULL; | ||
429 | struct ocfs2_dinode *fe; | ||
430 | struct ocfs2_extent_block *eb; | ||
431 | struct ocfs2_extent_list *fe_el; | ||
432 | struct ocfs2_extent_list *eb_el; | ||
433 | |||
434 | mlog_entry_void(); | ||
435 | |||
436 | status = ocfs2_create_new_meta_bhs(osb, handle, inode, 1, meta_ac, | ||
437 | &new_eb_bh); | ||
438 | if (status < 0) { | ||
439 | mlog_errno(status); | ||
440 | goto bail; | ||
441 | } | ||
442 | |||
443 | eb = (struct ocfs2_extent_block *) new_eb_bh->b_data; | ||
444 | if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) { | ||
445 | OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb, eb); | ||
446 | status = -EIO; | ||
447 | goto bail; | ||
448 | } | ||
449 | |||
450 | eb_el = &eb->h_list; | ||
451 | fe = (struct ocfs2_dinode *) fe_bh->b_data; | ||
452 | fe_el = &fe->id2.i_list; | ||
453 | |||
454 | status = ocfs2_journal_access(handle, inode, new_eb_bh, | ||
455 | OCFS2_JOURNAL_ACCESS_CREATE); | ||
456 | if (status < 0) { | ||
457 | mlog_errno(status); | ||
458 | goto bail; | ||
459 | } | ||
460 | |||
461 | /* copy the fe data into the new extent block */ | ||
462 | eb_el->l_tree_depth = fe_el->l_tree_depth; | ||
463 | eb_el->l_next_free_rec = fe_el->l_next_free_rec; | ||
464 | for(i = 0; i < le16_to_cpu(fe_el->l_next_free_rec); i++) { | ||
465 | eb_el->l_recs[i].e_cpos = fe_el->l_recs[i].e_cpos; | ||
466 | eb_el->l_recs[i].e_clusters = fe_el->l_recs[i].e_clusters; | ||
467 | eb_el->l_recs[i].e_blkno = fe_el->l_recs[i].e_blkno; | ||
468 | } | ||
469 | |||
470 | status = ocfs2_journal_dirty(handle, new_eb_bh); | ||
471 | if (status < 0) { | ||
472 | mlog_errno(status); | ||
473 | goto bail; | ||
474 | } | ||
475 | |||
476 | status = ocfs2_journal_access(handle, inode, fe_bh, | ||
477 | OCFS2_JOURNAL_ACCESS_WRITE); | ||
478 | if (status < 0) { | ||
479 | mlog_errno(status); | ||
480 | goto bail; | ||
481 | } | ||
482 | |||
483 | /* update fe now */ | ||
484 | le16_add_cpu(&fe_el->l_tree_depth, 1); | ||
485 | fe_el->l_recs[0].e_cpos = 0; | ||
486 | fe_el->l_recs[0].e_blkno = eb->h_blkno; | ||
487 | fe_el->l_recs[0].e_clusters = fe->i_clusters; | ||
488 | for(i = 1; i < le16_to_cpu(fe_el->l_next_free_rec); i++) { | ||
489 | fe_el->l_recs[i].e_cpos = 0; | ||
490 | fe_el->l_recs[i].e_clusters = 0; | ||
491 | fe_el->l_recs[i].e_blkno = 0; | ||
492 | } | ||
493 | fe_el->l_next_free_rec = cpu_to_le16(1); | ||
494 | |||
495 | /* If this is our 1st tree depth shift, then last_eb_blk | ||
496 | * becomes the allocated extent block */ | ||
497 | if (fe_el->l_tree_depth == cpu_to_le16(1)) | ||
498 | fe->i_last_eb_blk = eb->h_blkno; | ||
499 | |||
500 | status = ocfs2_journal_dirty(handle, fe_bh); | ||
501 | if (status < 0) { | ||
502 | mlog_errno(status); | ||
503 | goto bail; | ||
504 | } | ||
505 | |||
506 | *ret_new_eb_bh = new_eb_bh; | ||
507 | new_eb_bh = NULL; | ||
508 | status = 0; | ||
509 | bail: | ||
510 | if (new_eb_bh) | ||
511 | brelse(new_eb_bh); | ||
512 | |||
513 | mlog_exit(status); | ||
514 | return status; | ||
515 | } | ||
516 | |||
517 | /* | ||
518 | * Expects the tree to already have room in the rightmost leaf for the | ||
519 | * extent. Updates all the extent blocks (and the dinode) on the way | ||
520 | * down. | ||
521 | */ | ||
522 | static int ocfs2_do_insert_extent(struct ocfs2_super *osb, | ||
523 | struct ocfs2_journal_handle *handle, | ||
524 | struct inode *inode, | ||
525 | struct buffer_head *fe_bh, | ||
526 | u64 start_blk, | ||
527 | u32 new_clusters) | ||
528 | { | ||
529 | int status, i, num_bhs = 0; | ||
530 | u64 next_blkno; | ||
531 | u16 next_free; | ||
532 | struct buffer_head **eb_bhs = NULL; | ||
533 | struct ocfs2_dinode *fe; | ||
534 | struct ocfs2_extent_block *eb; | ||
535 | struct ocfs2_extent_list *el; | ||
536 | |||
537 | mlog_entry_void(); | ||
538 | |||
539 | status = ocfs2_journal_access(handle, inode, fe_bh, | ||
540 | OCFS2_JOURNAL_ACCESS_WRITE); | ||
541 | if (status < 0) { | ||
542 | mlog_errno(status); | ||
543 | goto bail; | ||
544 | } | ||
545 | |||
546 | fe = (struct ocfs2_dinode *) fe_bh->b_data; | ||
547 | el = &fe->id2.i_list; | ||
548 | if (el->l_tree_depth) { | ||
549 | /* This is another operation where we want to be | ||
550 | * careful about our tree updates. An error here means | ||
551 | * none of the previous changes we made should roll | ||
552 | * forward. As a result, we have to record the buffers | ||
553 | * for this part of the tree in an array and reserve a | ||
554 | * journal write to them before making any changes. */ | ||
555 | num_bhs = le16_to_cpu(fe->id2.i_list.l_tree_depth); | ||
556 | eb_bhs = kcalloc(num_bhs, sizeof(struct buffer_head *), | ||
557 | GFP_KERNEL); | ||
558 | if (!eb_bhs) { | ||
559 | status = -ENOMEM; | ||
560 | mlog_errno(status); | ||
561 | goto bail; | ||
562 | } | ||
563 | |||
564 | i = 0; | ||
565 | while(el->l_tree_depth) { | ||
566 | next_free = le16_to_cpu(el->l_next_free_rec); | ||
567 | if (next_free == 0) { | ||
568 | ocfs2_error(inode->i_sb, | ||
569 | "Dinode %"MLFu64" has a bad " | ||
570 | "extent list", | ||
571 | OCFS2_I(inode)->ip_blkno); | ||
572 | status = -EIO; | ||
573 | goto bail; | ||
574 | } | ||
575 | next_blkno = le64_to_cpu(el->l_recs[next_free - 1].e_blkno); | ||
576 | |||
577 | BUG_ON(i >= num_bhs); | ||
578 | status = ocfs2_read_block(osb, next_blkno, &eb_bhs[i], | ||
579 | OCFS2_BH_CACHED, inode); | ||
580 | if (status < 0) { | ||
581 | mlog_errno(status); | ||
582 | goto bail; | ||
583 | } | ||
584 | eb = (struct ocfs2_extent_block *) eb_bhs[i]->b_data; | ||
585 | if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) { | ||
586 | OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb, | ||
587 | eb); | ||
588 | status = -EIO; | ||
589 | goto bail; | ||
590 | } | ||
591 | |||
592 | status = ocfs2_journal_access(handle, inode, eb_bhs[i], | ||
593 | OCFS2_JOURNAL_ACCESS_WRITE); | ||
594 | if (status < 0) { | ||
595 | mlog_errno(status); | ||
596 | goto bail; | ||
597 | } | ||
598 | |||
599 | el = &eb->h_list; | ||
600 | i++; | ||
601 | /* When we leave this loop, eb_bhs[num_bhs - 1] will | ||
602 | * hold the bottom-most leaf extent block. */ | ||
603 | } | ||
604 | BUG_ON(el->l_tree_depth); | ||
605 | |||
606 | el = &fe->id2.i_list; | ||
607 | /* If we have tree depth, then the fe update is | ||
608 | * trivial, and we want to switch el out for the | ||
609 | * bottom-most leaf in order to update it with the | ||
610 | * actual extent data below. */ | ||
611 | next_free = le16_to_cpu(el->l_next_free_rec); | ||
612 | if (next_free == 0) { | ||
613 | ocfs2_error(inode->i_sb, | ||
614 | "Dinode %"MLFu64" has a bad " | ||
615 | "extent list", | ||
616 | OCFS2_I(inode)->ip_blkno); | ||
617 | status = -EIO; | ||
618 | goto bail; | ||
619 | } | ||
620 | le32_add_cpu(&el->l_recs[next_free - 1].e_clusters, | ||
621 | new_clusters); | ||
622 | /* (num_bhs - 1) to avoid the leaf */ | ||
623 | for(i = 0; i < (num_bhs - 1); i++) { | ||
624 | eb = (struct ocfs2_extent_block *) eb_bhs[i]->b_data; | ||
625 | el = &eb->h_list; | ||
626 | |||
627 | /* finally, make our actual change to the | ||
628 | * intermediate extent blocks. */ | ||
629 | next_free = le16_to_cpu(el->l_next_free_rec); | ||
630 | le32_add_cpu(&el->l_recs[next_free - 1].e_clusters, | ||
631 | new_clusters); | ||
632 | |||
633 | status = ocfs2_journal_dirty(handle, eb_bhs[i]); | ||
634 | if (status < 0) | ||
635 | mlog_errno(status); | ||
636 | } | ||
637 | BUG_ON(i != (num_bhs - 1)); | ||
638 | /* note that the leaf block wasn't touched in | ||
639 | * the loop above */ | ||
640 | eb = (struct ocfs2_extent_block *) eb_bhs[num_bhs - 1]->b_data; | ||
641 | el = &eb->h_list; | ||
642 | BUG_ON(el->l_tree_depth); | ||
643 | } | ||
644 | |||
645 | /* yay, we can finally add the actual extent now! */ | ||
646 | i = le16_to_cpu(el->l_next_free_rec) - 1; | ||
647 | if (le16_to_cpu(el->l_next_free_rec) && | ||
648 | ocfs2_extent_contig(inode, &el->l_recs[i], start_blk)) { | ||
649 | le32_add_cpu(&el->l_recs[i].e_clusters, new_clusters); | ||
650 | } else if (le16_to_cpu(el->l_next_free_rec) && | ||
651 | (le32_to_cpu(el->l_recs[i].e_clusters) == 0)) { | ||
652 | /* having an empty extent at eof is legal. */ | ||
653 | if (el->l_recs[i].e_cpos != fe->i_clusters) { | ||
654 | ocfs2_error(inode->i_sb, | ||
655 | "Dinode %"MLFu64" trailing extent is bad: " | ||
656 | "cpos (%u) != number of clusters (%u)", | ||
657 | le32_to_cpu(el->l_recs[i].e_cpos), | ||
658 | le32_to_cpu(fe->i_clusters)); | ||
659 | status = -EIO; | ||
660 | goto bail; | ||
661 | } | ||
662 | el->l_recs[i].e_blkno = cpu_to_le64(start_blk); | ||
663 | el->l_recs[i].e_clusters = cpu_to_le32(new_clusters); | ||
664 | } else { | ||
665 | /* No contiguous record, or no empty record at eof, so | ||
666 | * we add a new one. */ | ||
667 | |||
668 | BUG_ON(le16_to_cpu(el->l_next_free_rec) >= | ||
669 | le16_to_cpu(el->l_count)); | ||
670 | i = le16_to_cpu(el->l_next_free_rec); | ||
671 | |||
672 | el->l_recs[i].e_blkno = cpu_to_le64(start_blk); | ||
673 | el->l_recs[i].e_clusters = cpu_to_le32(new_clusters); | ||
674 | el->l_recs[i].e_cpos = fe->i_clusters; | ||
675 | le16_add_cpu(&el->l_next_free_rec, 1); | ||
676 | } | ||
677 | |||
678 | /* | ||
679 | * extent_map errors are not fatal, so they are ignored outside | ||
680 | * of flushing the thing. | ||
681 | */ | ||
682 | status = ocfs2_extent_map_append(inode, &el->l_recs[i], | ||
683 | new_clusters); | ||
684 | if (status) { | ||
685 | mlog_errno(status); | ||
686 | ocfs2_extent_map_drop(inode, le32_to_cpu(fe->i_clusters)); | ||
687 | } | ||
688 | |||
689 | status = ocfs2_journal_dirty(handle, fe_bh); | ||
690 | if (status < 0) | ||
691 | mlog_errno(status); | ||
692 | if (fe->id2.i_list.l_tree_depth) { | ||
693 | status = ocfs2_journal_dirty(handle, eb_bhs[num_bhs - 1]); | ||
694 | if (status < 0) | ||
695 | mlog_errno(status); | ||
696 | } | ||
697 | |||
698 | status = 0; | ||
699 | bail: | ||
700 | if (eb_bhs) { | ||
701 | for (i = 0; i < num_bhs; i++) | ||
702 | if (eb_bhs[i]) | ||
703 | brelse(eb_bhs[i]); | ||
704 | kfree(eb_bhs); | ||
705 | } | ||
706 | |||
707 | mlog_exit(status); | ||
708 | return status; | ||
709 | } | ||
710 | |||
711 | /* | ||
712 | * Should only be called when there is no space left in any of the | ||
713 | * leaf nodes. What we want to do is find the lowest tree depth | ||
714 | * non-leaf extent block with room for new records. There are three | ||
715 | * valid results of this search: | ||
716 | * | ||
717 | * 1) a lowest extent block is found, then we pass it back in | ||
718 | * *lowest_eb_bh and return '0' | ||
719 | * | ||
720 | * 2) the search fails to find anything, but the dinode has room. We | ||
721 | * pass NULL back in *lowest_eb_bh, but still return '0' | ||
722 | * | ||
723 | * 3) the search fails to find anything AND the dinode is full, in | ||
724 | * which case we return > 0 | ||
725 | * | ||
726 | * return status < 0 indicates an error. | ||
727 | */ | ||
728 | static int ocfs2_find_branch_target(struct ocfs2_super *osb, | ||
729 | struct inode *inode, | ||
730 | struct buffer_head *fe_bh, | ||
731 | struct buffer_head **target_bh) | ||
732 | { | ||
733 | int status = 0, i; | ||
734 | u64 blkno; | ||
735 | struct ocfs2_dinode *fe; | ||
736 | struct ocfs2_extent_block *eb; | ||
737 | struct ocfs2_extent_list *el; | ||
738 | struct buffer_head *bh = NULL; | ||
739 | struct buffer_head *lowest_bh = NULL; | ||
740 | |||
741 | mlog_entry_void(); | ||
742 | |||
743 | *target_bh = NULL; | ||
744 | |||
745 | fe = (struct ocfs2_dinode *) fe_bh->b_data; | ||
746 | el = &fe->id2.i_list; | ||
747 | |||
748 | while(le16_to_cpu(el->l_tree_depth) > 1) { | ||
749 | if (le16_to_cpu(el->l_next_free_rec) == 0) { | ||
750 | ocfs2_error(inode->i_sb, "Dinode %"MLFu64" has empty " | ||
751 | "extent list (next_free_rec == 0)", | ||
752 | OCFS2_I(inode)->ip_blkno); | ||
753 | status = -EIO; | ||
754 | goto bail; | ||
755 | } | ||
756 | i = le16_to_cpu(el->l_next_free_rec) - 1; | ||
757 | blkno = le64_to_cpu(el->l_recs[i].e_blkno); | ||
758 | if (!blkno) { | ||
759 | ocfs2_error(inode->i_sb, "Dinode %"MLFu64" has extent " | ||
760 | "list where extent # %d has no physical " | ||
761 | "block start", | ||
762 | OCFS2_I(inode)->ip_blkno, i); | ||
763 | status = -EIO; | ||
764 | goto bail; | ||
765 | } | ||
766 | |||
767 | if (bh) { | ||
768 | brelse(bh); | ||
769 | bh = NULL; | ||
770 | } | ||
771 | |||
772 | status = ocfs2_read_block(osb, blkno, &bh, OCFS2_BH_CACHED, | ||
773 | inode); | ||
774 | if (status < 0) { | ||
775 | mlog_errno(status); | ||
776 | goto bail; | ||
777 | } | ||
778 | |||
779 | eb = (struct ocfs2_extent_block *) bh->b_data; | ||
780 | if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) { | ||
781 | OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb, eb); | ||
782 | status = -EIO; | ||
783 | goto bail; | ||
784 | } | ||
785 | el = &eb->h_list; | ||
786 | |||
787 | if (le16_to_cpu(el->l_next_free_rec) < | ||
788 | le16_to_cpu(el->l_count)) { | ||
789 | if (lowest_bh) | ||
790 | brelse(lowest_bh); | ||
791 | lowest_bh = bh; | ||
792 | get_bh(lowest_bh); | ||
793 | } | ||
794 | } | ||
795 | |||
796 | /* If we didn't find one and the fe doesn't have any room, | ||
797 | * then return '1' */ | ||
798 | if (!lowest_bh | ||
799 | && (fe->id2.i_list.l_next_free_rec == fe->id2.i_list.l_count)) | ||
800 | status = 1; | ||
801 | |||
802 | *target_bh = lowest_bh; | ||
803 | bail: | ||
804 | if (bh) | ||
805 | brelse(bh); | ||
806 | |||
807 | mlog_exit(status); | ||
808 | return status; | ||
809 | } | ||
810 | |||
811 | /* the caller needs to update fe->i_clusters */ | ||
812 | int ocfs2_insert_extent(struct ocfs2_super *osb, | ||
813 | struct ocfs2_journal_handle *handle, | ||
814 | struct inode *inode, | ||
815 | struct buffer_head *fe_bh, | ||
816 | u64 start_blk, | ||
817 | u32 new_clusters, | ||
818 | struct ocfs2_alloc_context *meta_ac) | ||
819 | { | ||
820 | int status, i, shift; | ||
821 | struct buffer_head *last_eb_bh = NULL; | ||
822 | struct buffer_head *bh = NULL; | ||
823 | struct ocfs2_dinode *fe; | ||
824 | struct ocfs2_extent_block *eb; | ||
825 | struct ocfs2_extent_list *el; | ||
826 | |||
827 | mlog_entry_void(); | ||
828 | |||
829 | mlog(0, "add %u clusters starting at block %"MLFu64" to " | ||
830 | "inode %"MLFu64"\n", | ||
831 | new_clusters, start_blk, OCFS2_I(inode)->ip_blkno); | ||
832 | |||
833 | fe = (struct ocfs2_dinode *) fe_bh->b_data; | ||
834 | el = &fe->id2.i_list; | ||
835 | |||
836 | if (el->l_tree_depth) { | ||
837 | /* jump to end of tree */ | ||
838 | status = ocfs2_read_block(osb, le64_to_cpu(fe->i_last_eb_blk), | ||
839 | &last_eb_bh, OCFS2_BH_CACHED, inode); | ||
840 | if (status < 0) { | ||
841 | mlog_exit(status); | ||
842 | goto bail; | ||
843 | } | ||
844 | eb = (struct ocfs2_extent_block *) last_eb_bh->b_data; | ||
845 | el = &eb->h_list; | ||
846 | } | ||
847 | |||
848 | /* Can we allocate without adding/shifting tree bits? */ | ||
849 | i = le16_to_cpu(el->l_next_free_rec) - 1; | ||
850 | if (le16_to_cpu(el->l_next_free_rec) == 0 | ||
851 | || (le16_to_cpu(el->l_next_free_rec) < le16_to_cpu(el->l_count)) | ||
852 | || le32_to_cpu(el->l_recs[i].e_clusters) == 0 | ||
853 | || ocfs2_extent_contig(inode, &el->l_recs[i], start_blk)) | ||
854 | goto out_add; | ||
855 | |||
856 | mlog(0, "ocfs2_allocate_extent: couldn't do a simple add, traversing " | ||
857 | "tree now.\n"); | ||
858 | |||
859 | shift = ocfs2_find_branch_target(osb, inode, fe_bh, &bh); | ||
860 | if (shift < 0) { | ||
861 | status = shift; | ||
862 | mlog_errno(status); | ||
863 | goto bail; | ||
864 | } | ||
865 | |||
866 | /* We traveled all the way to the bottom of the allocation tree | ||
867 | * and didn't find room for any more extents - we need to add | ||
868 | * another tree level */ | ||
869 | if (shift) { | ||
870 | /* if we hit a leaf, we'd better be empty :) */ | ||
871 | BUG_ON(le16_to_cpu(el->l_next_free_rec) != | ||
872 | le16_to_cpu(el->l_count)); | ||
873 | BUG_ON(bh); | ||
874 | mlog(0, "ocfs2_allocate_extent: need to shift tree depth " | ||
875 | "(current = %u)\n", | ||
876 | le16_to_cpu(fe->id2.i_list.l_tree_depth)); | ||
877 | |||
878 | /* ocfs2_shift_tree_depth will return us a buffer with | ||
879 | * the new extent block (so we can pass that to | ||
880 | * ocfs2_add_branch). */ | ||
881 | status = ocfs2_shift_tree_depth(osb, handle, inode, fe_bh, | ||
882 | meta_ac, &bh); | ||
883 | if (status < 0) { | ||
884 | mlog_errno(status); | ||
885 | goto bail; | ||
886 | } | ||
887 | /* Special case: we have room now if we shifted from | ||
888 | * tree_depth 0 */ | ||
889 | if (fe->id2.i_list.l_tree_depth == cpu_to_le16(1)) | ||
890 | goto out_add; | ||
891 | } | ||
892 | |||
893 | /* call ocfs2_add_branch to add the final part of the tree with | ||
894 | * the new data. */ | ||
895 | mlog(0, "ocfs2_allocate_extent: add branch. bh = %p\n", bh); | ||
896 | status = ocfs2_add_branch(osb, handle, inode, fe_bh, bh, last_eb_bh, | ||
897 | meta_ac); | ||
898 | if (status < 0) { | ||
899 | mlog_errno(status); | ||
900 | goto bail; | ||
901 | } | ||
902 | |||
903 | out_add: | ||
904 | /* Finally, we can add clusters. */ | ||
905 | status = ocfs2_do_insert_extent(osb, handle, inode, fe_bh, | ||
906 | start_blk, new_clusters); | ||
907 | if (status < 0) | ||
908 | mlog_errno(status); | ||
909 | |||
910 | bail: | ||
911 | if (bh) | ||
912 | brelse(bh); | ||
913 | |||
914 | if (last_eb_bh) | ||
915 | brelse(last_eb_bh); | ||
916 | |||
917 | mlog_exit(status); | ||
918 | return status; | ||
919 | } | ||
920 | |||
921 | static inline int ocfs2_truncate_log_needs_flush(struct ocfs2_super *osb) | ||
922 | { | ||
923 | struct buffer_head *tl_bh = osb->osb_tl_bh; | ||
924 | struct ocfs2_dinode *di; | ||
925 | struct ocfs2_truncate_log *tl; | ||
926 | |||
927 | di = (struct ocfs2_dinode *) tl_bh->b_data; | ||
928 | tl = &di->id2.i_dealloc; | ||
929 | |||
930 | mlog_bug_on_msg(le16_to_cpu(tl->tl_used) > le16_to_cpu(tl->tl_count), | ||
931 | "slot %d, invalid truncate log parameters: used = " | ||
932 | "%u, count = %u\n", osb->slot_num, | ||
933 | le16_to_cpu(tl->tl_used), le16_to_cpu(tl->tl_count)); | ||
934 | return le16_to_cpu(tl->tl_used) == le16_to_cpu(tl->tl_count); | ||
935 | } | ||
936 | |||
937 | static int ocfs2_truncate_log_can_coalesce(struct ocfs2_truncate_log *tl, | ||
938 | unsigned int new_start) | ||
939 | { | ||
940 | unsigned int tail_index; | ||
941 | unsigned int current_tail; | ||
942 | |||
943 | /* No records, nothing to coalesce */ | ||
944 | if (!le16_to_cpu(tl->tl_used)) | ||
945 | return 0; | ||
946 | |||
947 | tail_index = le16_to_cpu(tl->tl_used) - 1; | ||
948 | current_tail = le32_to_cpu(tl->tl_recs[tail_index].t_start); | ||
949 | current_tail += le32_to_cpu(tl->tl_recs[tail_index].t_clusters); | ||
950 | |||
951 | return current_tail == new_start; | ||
952 | } | ||
953 | |||
954 | static int ocfs2_truncate_log_append(struct ocfs2_super *osb, | ||
955 | struct ocfs2_journal_handle *handle, | ||
956 | u64 start_blk, | ||
957 | unsigned int num_clusters) | ||
958 | { | ||
959 | int status, index; | ||
960 | unsigned int start_cluster, tl_count; | ||
961 | struct inode *tl_inode = osb->osb_tl_inode; | ||
962 | struct buffer_head *tl_bh = osb->osb_tl_bh; | ||
963 | struct ocfs2_dinode *di; | ||
964 | struct ocfs2_truncate_log *tl; | ||
965 | |||
966 | mlog_entry("start_blk = %"MLFu64", num_clusters = %u\n", start_blk, | ||
967 | num_clusters); | ||
968 | |||
969 | BUG_ON(!down_trylock(&tl_inode->i_sem)); | ||
970 | |||
971 | start_cluster = ocfs2_blocks_to_clusters(osb->sb, start_blk); | ||
972 | |||
973 | di = (struct ocfs2_dinode *) tl_bh->b_data; | ||
974 | tl = &di->id2.i_dealloc; | ||
975 | if (!OCFS2_IS_VALID_DINODE(di)) { | ||
976 | OCFS2_RO_ON_INVALID_DINODE(osb->sb, di); | ||
977 | status = -EIO; | ||
978 | goto bail; | ||
979 | } | ||
980 | |||
981 | tl_count = le16_to_cpu(tl->tl_count); | ||
982 | mlog_bug_on_msg(tl_count > ocfs2_truncate_recs_per_inode(osb->sb) || | ||
983 | tl_count == 0, | ||
984 | "Truncate record count on #%"MLFu64" invalid (" | ||
985 | "wanted %u, actual %u\n", OCFS2_I(tl_inode)->ip_blkno, | ||
986 | ocfs2_truncate_recs_per_inode(osb->sb), | ||
987 | le16_to_cpu(tl->tl_count)); | ||
988 | |||
989 | /* Caller should have known to flush before calling us. */ | ||
990 | index = le16_to_cpu(tl->tl_used); | ||
991 | if (index >= tl_count) { | ||
992 | status = -ENOSPC; | ||
993 | mlog_errno(status); | ||
994 | goto bail; | ||
995 | } | ||
996 | |||
997 | status = ocfs2_journal_access(handle, tl_inode, tl_bh, | ||
998 | OCFS2_JOURNAL_ACCESS_WRITE); | ||
999 | if (status < 0) { | ||
1000 | mlog_errno(status); | ||
1001 | goto bail; | ||
1002 | } | ||
1003 | |||
1004 | mlog(0, "Log truncate of %u clusters starting at cluster %u to " | ||
1005 | "%"MLFu64" (index = %d)\n", num_clusters, start_cluster, | ||
1006 | OCFS2_I(tl_inode)->ip_blkno, index); | ||
1007 | |||
1008 | if (ocfs2_truncate_log_can_coalesce(tl, start_cluster)) { | ||
1009 | /* | ||
1010 | * Move index back to the record we are coalescing with. | ||
1011 | * ocfs2_truncate_log_can_coalesce() guarantees nonzero | ||
1012 | */ | ||
1013 | index--; | ||
1014 | |||
1015 | num_clusters += le32_to_cpu(tl->tl_recs[index].t_clusters); | ||
1016 | mlog(0, "Coalesce with index %u (start = %u, clusters = %u)\n", | ||
1017 | index, le32_to_cpu(tl->tl_recs[index].t_start), | ||
1018 | num_clusters); | ||
1019 | } else { | ||
1020 | tl->tl_recs[index].t_start = cpu_to_le32(start_cluster); | ||
1021 | tl->tl_used = cpu_to_le16(index + 1); | ||
1022 | } | ||
1023 | tl->tl_recs[index].t_clusters = cpu_to_le32(num_clusters); | ||
1024 | |||
1025 | status = ocfs2_journal_dirty(handle, tl_bh); | ||
1026 | if (status < 0) { | ||
1027 | mlog_errno(status); | ||
1028 | goto bail; | ||
1029 | } | ||
1030 | |||
1031 | bail: | ||
1032 | mlog_exit(status); | ||
1033 | return status; | ||
1034 | } | ||
1035 | |||
1036 | static int ocfs2_replay_truncate_records(struct ocfs2_super *osb, | ||
1037 | struct ocfs2_journal_handle *handle, | ||
1038 | struct inode *data_alloc_inode, | ||
1039 | struct buffer_head *data_alloc_bh) | ||
1040 | { | ||
1041 | int status = 0; | ||
1042 | int i; | ||
1043 | unsigned int num_clusters; | ||
1044 | u64 start_blk; | ||
1045 | struct ocfs2_truncate_rec rec; | ||
1046 | struct ocfs2_dinode *di; | ||
1047 | struct ocfs2_truncate_log *tl; | ||
1048 | struct inode *tl_inode = osb->osb_tl_inode; | ||
1049 | struct buffer_head *tl_bh = osb->osb_tl_bh; | ||
1050 | |||
1051 | mlog_entry_void(); | ||
1052 | |||
1053 | di = (struct ocfs2_dinode *) tl_bh->b_data; | ||
1054 | tl = &di->id2.i_dealloc; | ||
1055 | i = le16_to_cpu(tl->tl_used) - 1; | ||
1056 | while (i >= 0) { | ||
1057 | /* Caller has given us at least enough credits to | ||
1058 | * update the truncate log dinode */ | ||
1059 | status = ocfs2_journal_access(handle, tl_inode, tl_bh, | ||
1060 | OCFS2_JOURNAL_ACCESS_WRITE); | ||
1061 | if (status < 0) { | ||
1062 | mlog_errno(status); | ||
1063 | goto bail; | ||
1064 | } | ||
1065 | |||
1066 | tl->tl_used = cpu_to_le16(i); | ||
1067 | |||
1068 | status = ocfs2_journal_dirty(handle, tl_bh); | ||
1069 | if (status < 0) { | ||
1070 | mlog_errno(status); | ||
1071 | goto bail; | ||
1072 | } | ||
1073 | |||
1074 | /* TODO: Perhaps we can calculate the bulk of the | ||
1075 | * credits up front rather than extending like | ||
1076 | * this. */ | ||
1077 | status = ocfs2_extend_trans(handle, | ||
1078 | OCFS2_TRUNCATE_LOG_FLUSH_ONE_REC); | ||
1079 | if (status < 0) { | ||
1080 | mlog_errno(status); | ||
1081 | goto bail; | ||
1082 | } | ||
1083 | |||
1084 | rec = tl->tl_recs[i]; | ||
1085 | start_blk = ocfs2_clusters_to_blocks(data_alloc_inode->i_sb, | ||
1086 | le32_to_cpu(rec.t_start)); | ||
1087 | num_clusters = le32_to_cpu(rec.t_clusters); | ||
1088 | |||
1089 | /* if start_blk is not set, we ignore the record as | ||
1090 | * invalid. */ | ||
1091 | if (start_blk) { | ||
1092 | mlog(0, "free record %d, start = %u, clusters = %u\n", | ||
1093 | i, le32_to_cpu(rec.t_start), num_clusters); | ||
1094 | |||
1095 | status = ocfs2_free_clusters(handle, data_alloc_inode, | ||
1096 | data_alloc_bh, start_blk, | ||
1097 | num_clusters); | ||
1098 | if (status < 0) { | ||
1099 | mlog_errno(status); | ||
1100 | goto bail; | ||
1101 | } | ||
1102 | } | ||
1103 | i--; | ||
1104 | } | ||
1105 | |||
1106 | bail: | ||
1107 | mlog_exit(status); | ||
1108 | return status; | ||
1109 | } | ||
1110 | |||
1111 | /* Expects you to already be holding tl_inode->i_sem */ | ||
1112 | static int __ocfs2_flush_truncate_log(struct ocfs2_super *osb) | ||
1113 | { | ||
1114 | int status; | ||
1115 | unsigned int num_to_flush; | ||
1116 | struct ocfs2_journal_handle *handle = NULL; | ||
1117 | struct inode *tl_inode = osb->osb_tl_inode; | ||
1118 | struct inode *data_alloc_inode = NULL; | ||
1119 | struct buffer_head *tl_bh = osb->osb_tl_bh; | ||
1120 | struct buffer_head *data_alloc_bh = NULL; | ||
1121 | struct ocfs2_dinode *di; | ||
1122 | struct ocfs2_truncate_log *tl; | ||
1123 | |||
1124 | mlog_entry_void(); | ||
1125 | |||
1126 | BUG_ON(!down_trylock(&tl_inode->i_sem)); | ||
1127 | |||
1128 | di = (struct ocfs2_dinode *) tl_bh->b_data; | ||
1129 | tl = &di->id2.i_dealloc; | ||
1130 | if (!OCFS2_IS_VALID_DINODE(di)) { | ||
1131 | OCFS2_RO_ON_INVALID_DINODE(osb->sb, di); | ||
1132 | status = -EIO; | ||
1133 | goto bail; | ||
1134 | } | ||
1135 | |||
1136 | num_to_flush = le16_to_cpu(tl->tl_used); | ||
1137 | mlog(0, "Flush %u records from truncate log #%"MLFu64"\n", | ||
1138 | num_to_flush, OCFS2_I(tl_inode)->ip_blkno); | ||
1139 | if (!num_to_flush) { | ||
1140 | status = 0; | ||
1141 | goto bail; | ||
1142 | } | ||
1143 | |||
1144 | handle = ocfs2_alloc_handle(osb); | ||
1145 | if (!handle) { | ||
1146 | status = -ENOMEM; | ||
1147 | mlog_errno(status); | ||
1148 | goto bail; | ||
1149 | } | ||
1150 | |||
1151 | data_alloc_inode = ocfs2_get_system_file_inode(osb, | ||
1152 | GLOBAL_BITMAP_SYSTEM_INODE, | ||
1153 | OCFS2_INVALID_SLOT); | ||
1154 | if (!data_alloc_inode) { | ||
1155 | status = -EINVAL; | ||
1156 | mlog(ML_ERROR, "Could not get bitmap inode!\n"); | ||
1157 | goto bail; | ||
1158 | } | ||
1159 | |||
1160 | ocfs2_handle_add_inode(handle, data_alloc_inode); | ||
1161 | status = ocfs2_meta_lock(data_alloc_inode, handle, &data_alloc_bh, 1); | ||
1162 | if (status < 0) { | ||
1163 | mlog_errno(status); | ||
1164 | goto bail; | ||
1165 | } | ||
1166 | |||
1167 | handle = ocfs2_start_trans(osb, handle, OCFS2_TRUNCATE_LOG_UPDATE); | ||
1168 | if (IS_ERR(handle)) { | ||
1169 | status = PTR_ERR(handle); | ||
1170 | handle = NULL; | ||
1171 | mlog_errno(status); | ||
1172 | goto bail; | ||
1173 | } | ||
1174 | |||
1175 | status = ocfs2_replay_truncate_records(osb, handle, data_alloc_inode, | ||
1176 | data_alloc_bh); | ||
1177 | if (status < 0) { | ||
1178 | mlog_errno(status); | ||
1179 | goto bail; | ||
1180 | } | ||
1181 | |||
1182 | bail: | ||
1183 | if (handle) | ||
1184 | ocfs2_commit_trans(handle); | ||
1185 | |||
1186 | if (data_alloc_inode) | ||
1187 | iput(data_alloc_inode); | ||
1188 | |||
1189 | if (data_alloc_bh) | ||
1190 | brelse(data_alloc_bh); | ||
1191 | |||
1192 | mlog_exit(status); | ||
1193 | return status; | ||
1194 | } | ||
1195 | |||
1196 | int ocfs2_flush_truncate_log(struct ocfs2_super *osb) | ||
1197 | { | ||
1198 | int status; | ||
1199 | struct inode *tl_inode = osb->osb_tl_inode; | ||
1200 | |||
1201 | down(&tl_inode->i_sem); | ||
1202 | status = __ocfs2_flush_truncate_log(osb); | ||
1203 | up(&tl_inode->i_sem); | ||
1204 | |||
1205 | return status; | ||
1206 | } | ||
1207 | |||
/*
 * Work handler for the truncate log: data is the struct ocfs2_super
 * whose log should be flushed.  A failed flush is only logged.
 */
static void ocfs2_truncate_log_worker(void *data)
{
	struct ocfs2_super *osb = data;
	int ret;

	mlog_entry_void();

	ret = ocfs2_flush_truncate_log(osb);
	if (ret < 0)
		mlog_errno(ret);

	mlog_exit(ret);
}
1221 | |||
1222 | #define OCFS2_TRUNCATE_LOG_FLUSH_INTERVAL (2 * HZ) | ||
1223 | void ocfs2_schedule_truncate_log_flush(struct ocfs2_super *osb, | ||
1224 | int cancel) | ||
1225 | { | ||
1226 | if (osb->osb_tl_inode) { | ||
1227 | /* We want to push off log flushes while truncates are | ||
1228 | * still running. */ | ||
1229 | if (cancel) | ||
1230 | cancel_delayed_work(&osb->osb_truncate_log_wq); | ||
1231 | |||
1232 | queue_delayed_work(ocfs2_wq, &osb->osb_truncate_log_wq, | ||
1233 | OCFS2_TRUNCATE_LOG_FLUSH_INTERVAL); | ||
1234 | } | ||
1235 | } | ||
1236 | |||
1237 | static int ocfs2_get_truncate_log_info(struct ocfs2_super *osb, | ||
1238 | int slot_num, | ||
1239 | struct inode **tl_inode, | ||
1240 | struct buffer_head **tl_bh) | ||
1241 | { | ||
1242 | int status; | ||
1243 | struct inode *inode = NULL; | ||
1244 | struct buffer_head *bh = NULL; | ||
1245 | |||
1246 | inode = ocfs2_get_system_file_inode(osb, | ||
1247 | TRUNCATE_LOG_SYSTEM_INODE, | ||
1248 | slot_num); | ||
1249 | if (!inode) { | ||
1250 | status = -EINVAL; | ||
1251 | mlog(ML_ERROR, "Could not get load truncate log inode!\n"); | ||
1252 | goto bail; | ||
1253 | } | ||
1254 | |||
1255 | status = ocfs2_read_block(osb, OCFS2_I(inode)->ip_blkno, &bh, | ||
1256 | OCFS2_BH_CACHED, inode); | ||
1257 | if (status < 0) { | ||
1258 | iput(inode); | ||
1259 | mlog_errno(status); | ||
1260 | goto bail; | ||
1261 | } | ||
1262 | |||
1263 | *tl_inode = inode; | ||
1264 | *tl_bh = bh; | ||
1265 | bail: | ||
1266 | mlog_exit(status); | ||
1267 | return status; | ||
1268 | } | ||
1269 | |||
1270 | /* called during the 1st stage of node recovery. we stamp a clean | ||
1271 | * truncate log and pass back a copy for processing later. if the | ||
1272 | * truncate log does not require processing, a *tl_copy is set to | ||
1273 | * NULL. */ | ||
1274 | int ocfs2_begin_truncate_log_recovery(struct ocfs2_super *osb, | ||
1275 | int slot_num, | ||
1276 | struct ocfs2_dinode **tl_copy) | ||
1277 | { | ||
1278 | int status; | ||
1279 | struct inode *tl_inode = NULL; | ||
1280 | struct buffer_head *tl_bh = NULL; | ||
1281 | struct ocfs2_dinode *di; | ||
1282 | struct ocfs2_truncate_log *tl; | ||
1283 | |||
1284 | *tl_copy = NULL; | ||
1285 | |||
1286 | mlog(0, "recover truncate log from slot %d\n", slot_num); | ||
1287 | |||
1288 | status = ocfs2_get_truncate_log_info(osb, slot_num, &tl_inode, &tl_bh); | ||
1289 | if (status < 0) { | ||
1290 | mlog_errno(status); | ||
1291 | goto bail; | ||
1292 | } | ||
1293 | |||
1294 | di = (struct ocfs2_dinode *) tl_bh->b_data; | ||
1295 | tl = &di->id2.i_dealloc; | ||
1296 | if (!OCFS2_IS_VALID_DINODE(di)) { | ||
1297 | OCFS2_RO_ON_INVALID_DINODE(tl_inode->i_sb, di); | ||
1298 | status = -EIO; | ||
1299 | goto bail; | ||
1300 | } | ||
1301 | |||
1302 | if (le16_to_cpu(tl->tl_used)) { | ||
1303 | mlog(0, "We'll have %u logs to recover\n", | ||
1304 | le16_to_cpu(tl->tl_used)); | ||
1305 | |||
1306 | *tl_copy = kmalloc(tl_bh->b_size, GFP_KERNEL); | ||
1307 | if (!(*tl_copy)) { | ||
1308 | status = -ENOMEM; | ||
1309 | mlog_errno(status); | ||
1310 | goto bail; | ||
1311 | } | ||
1312 | |||
1313 | /* Assuming the write-out below goes well, this copy | ||
1314 | * will be passed back to recovery for processing. */ | ||
1315 | memcpy(*tl_copy, tl_bh->b_data, tl_bh->b_size); | ||
1316 | |||
1317 | /* All we need to do to clear the truncate log is set | ||
1318 | * tl_used. */ | ||
1319 | tl->tl_used = 0; | ||
1320 | |||
1321 | status = ocfs2_write_block(osb, tl_bh, tl_inode); | ||
1322 | if (status < 0) { | ||
1323 | mlog_errno(status); | ||
1324 | goto bail; | ||
1325 | } | ||
1326 | } | ||
1327 | |||
1328 | bail: | ||
1329 | if (tl_inode) | ||
1330 | iput(tl_inode); | ||
1331 | if (tl_bh) | ||
1332 | brelse(tl_bh); | ||
1333 | |||
1334 | if (status < 0 && (*tl_copy)) { | ||
1335 | kfree(*tl_copy); | ||
1336 | *tl_copy = NULL; | ||
1337 | } | ||
1338 | |||
1339 | mlog_exit(status); | ||
1340 | return status; | ||
1341 | } | ||
1342 | |||
1343 | int ocfs2_complete_truncate_log_recovery(struct ocfs2_super *osb, | ||
1344 | struct ocfs2_dinode *tl_copy) | ||
1345 | { | ||
1346 | int status = 0; | ||
1347 | int i; | ||
1348 | unsigned int clusters, num_recs, start_cluster; | ||
1349 | u64 start_blk; | ||
1350 | struct ocfs2_journal_handle *handle; | ||
1351 | struct inode *tl_inode = osb->osb_tl_inode; | ||
1352 | struct ocfs2_truncate_log *tl; | ||
1353 | |||
1354 | mlog_entry_void(); | ||
1355 | |||
1356 | if (OCFS2_I(tl_inode)->ip_blkno == le64_to_cpu(tl_copy->i_blkno)) { | ||
1357 | mlog(ML_ERROR, "Asked to recover my own truncate log!\n"); | ||
1358 | return -EINVAL; | ||
1359 | } | ||
1360 | |||
1361 | tl = &tl_copy->id2.i_dealloc; | ||
1362 | num_recs = le16_to_cpu(tl->tl_used); | ||
1363 | mlog(0, "cleanup %u records from %"MLFu64"\n", num_recs, | ||
1364 | tl_copy->i_blkno); | ||
1365 | |||
1366 | down(&tl_inode->i_sem); | ||
1367 | for(i = 0; i < num_recs; i++) { | ||
1368 | if (ocfs2_truncate_log_needs_flush(osb)) { | ||
1369 | status = __ocfs2_flush_truncate_log(osb); | ||
1370 | if (status < 0) { | ||
1371 | mlog_errno(status); | ||
1372 | goto bail_up; | ||
1373 | } | ||
1374 | } | ||
1375 | |||
1376 | handle = ocfs2_start_trans(osb, NULL, | ||
1377 | OCFS2_TRUNCATE_LOG_UPDATE); | ||
1378 | if (IS_ERR(handle)) { | ||
1379 | status = PTR_ERR(handle); | ||
1380 | mlog_errno(status); | ||
1381 | goto bail_up; | ||
1382 | } | ||
1383 | |||
1384 | clusters = le32_to_cpu(tl->tl_recs[i].t_clusters); | ||
1385 | start_cluster = le32_to_cpu(tl->tl_recs[i].t_start); | ||
1386 | start_blk = ocfs2_clusters_to_blocks(osb->sb, start_cluster); | ||
1387 | |||
1388 | status = ocfs2_truncate_log_append(osb, handle, | ||
1389 | start_blk, clusters); | ||
1390 | ocfs2_commit_trans(handle); | ||
1391 | if (status < 0) { | ||
1392 | mlog_errno(status); | ||
1393 | goto bail_up; | ||
1394 | } | ||
1395 | } | ||
1396 | |||
1397 | bail_up: | ||
1398 | up(&tl_inode->i_sem); | ||
1399 | |||
1400 | mlog_exit(status); | ||
1401 | return status; | ||
1402 | } | ||
1403 | |||
1404 | void ocfs2_truncate_log_shutdown(struct ocfs2_super *osb) | ||
1405 | { | ||
1406 | int status; | ||
1407 | struct inode *tl_inode = osb->osb_tl_inode; | ||
1408 | |||
1409 | mlog_entry_void(); | ||
1410 | |||
1411 | if (tl_inode) { | ||
1412 | cancel_delayed_work(&osb->osb_truncate_log_wq); | ||
1413 | flush_workqueue(ocfs2_wq); | ||
1414 | |||
1415 | status = ocfs2_flush_truncate_log(osb); | ||
1416 | if (status < 0) | ||
1417 | mlog_errno(status); | ||
1418 | |||
1419 | brelse(osb->osb_tl_bh); | ||
1420 | iput(osb->osb_tl_inode); | ||
1421 | } | ||
1422 | |||
1423 | mlog_exit_void(); | ||
1424 | } | ||
1425 | |||
1426 | int ocfs2_truncate_log_init(struct ocfs2_super *osb) | ||
1427 | { | ||
1428 | int status; | ||
1429 | struct inode *tl_inode = NULL; | ||
1430 | struct buffer_head *tl_bh = NULL; | ||
1431 | |||
1432 | mlog_entry_void(); | ||
1433 | |||
1434 | status = ocfs2_get_truncate_log_info(osb, | ||
1435 | osb->slot_num, | ||
1436 | &tl_inode, | ||
1437 | &tl_bh); | ||
1438 | if (status < 0) | ||
1439 | mlog_errno(status); | ||
1440 | |||
1441 | /* ocfs2_truncate_log_shutdown keys on the existence of | ||
1442 | * osb->osb_tl_inode so we don't set any of the osb variables | ||
1443 | * until we're sure all is well. */ | ||
1444 | INIT_WORK(&osb->osb_truncate_log_wq, ocfs2_truncate_log_worker, osb); | ||
1445 | osb->osb_tl_bh = tl_bh; | ||
1446 | osb->osb_tl_inode = tl_inode; | ||
1447 | |||
1448 | mlog_exit(status); | ||
1449 | return status; | ||
1450 | } | ||
1451 | |||
1452 | /* This function will figure out whether the currently last extent | ||
1453 | * block will be deleted, and if it will, what the new last extent | ||
1454 | * block will be so we can update his h_next_leaf_blk field, as well | ||
1455 | * as the dinodes i_last_eb_blk */ | ||
1456 | static int ocfs2_find_new_last_ext_blk(struct ocfs2_super *osb, | ||
1457 | struct inode *inode, | ||
1458 | struct ocfs2_dinode *fe, | ||
1459 | u32 new_i_clusters, | ||
1460 | struct buffer_head *old_last_eb, | ||
1461 | struct buffer_head **new_last_eb) | ||
1462 | { | ||
1463 | int i, status = 0; | ||
1464 | u64 block = 0; | ||
1465 | struct ocfs2_extent_block *eb; | ||
1466 | struct ocfs2_extent_list *el; | ||
1467 | struct buffer_head *bh = NULL; | ||
1468 | |||
1469 | *new_last_eb = NULL; | ||
1470 | |||
1471 | if (!OCFS2_IS_VALID_DINODE(fe)) { | ||
1472 | OCFS2_RO_ON_INVALID_DINODE(inode->i_sb, fe); | ||
1473 | status = -EIO; | ||
1474 | goto bail; | ||
1475 | } | ||
1476 | |||
1477 | /* we have no tree, so of course, no last_eb. */ | ||
1478 | if (!fe->id2.i_list.l_tree_depth) | ||
1479 | goto bail; | ||
1480 | |||
1481 | /* trunc to zero special case - this makes tree_depth = 0 | ||
1482 | * regardless of what it is. */ | ||
1483 | if (!new_i_clusters) | ||
1484 | goto bail; | ||
1485 | |||
1486 | eb = (struct ocfs2_extent_block *) old_last_eb->b_data; | ||
1487 | el = &(eb->h_list); | ||
1488 | BUG_ON(!el->l_next_free_rec); | ||
1489 | |||
1490 | /* Make sure that this guy will actually be empty after we | ||
1491 | * clear away the data. */ | ||
1492 | if (le32_to_cpu(el->l_recs[0].e_cpos) < new_i_clusters) | ||
1493 | goto bail; | ||
1494 | |||
1495 | /* Ok, at this point, we know that last_eb will definitely | ||
1496 | * change, so lets traverse the tree and find the second to | ||
1497 | * last extent block. */ | ||
1498 | el = &(fe->id2.i_list); | ||
1499 | /* go down the tree, */ | ||
1500 | do { | ||
1501 | for(i = (le16_to_cpu(el->l_next_free_rec) - 1); i >= 0; i--) { | ||
1502 | if (le32_to_cpu(el->l_recs[i].e_cpos) < | ||
1503 | new_i_clusters) { | ||
1504 | block = le64_to_cpu(el->l_recs[i].e_blkno); | ||
1505 | break; | ||
1506 | } | ||
1507 | } | ||
1508 | BUG_ON(i < 0); | ||
1509 | |||
1510 | if (bh) { | ||
1511 | brelse(bh); | ||
1512 | bh = NULL; | ||
1513 | } | ||
1514 | |||
1515 | status = ocfs2_read_block(osb, block, &bh, OCFS2_BH_CACHED, | ||
1516 | inode); | ||
1517 | if (status < 0) { | ||
1518 | mlog_errno(status); | ||
1519 | goto bail; | ||
1520 | } | ||
1521 | eb = (struct ocfs2_extent_block *) bh->b_data; | ||
1522 | el = &eb->h_list; | ||
1523 | if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) { | ||
1524 | OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb, eb); | ||
1525 | status = -EIO; | ||
1526 | goto bail; | ||
1527 | } | ||
1528 | } while (el->l_tree_depth); | ||
1529 | |||
1530 | *new_last_eb = bh; | ||
1531 | get_bh(*new_last_eb); | ||
1532 | mlog(0, "returning block %"MLFu64"\n", le64_to_cpu(eb->h_blkno)); | ||
1533 | bail: | ||
1534 | if (bh) | ||
1535 | brelse(bh); | ||
1536 | |||
1537 | return status; | ||
1538 | } | ||
1539 | |||
1540 | static int ocfs2_do_truncate(struct ocfs2_super *osb, | ||
1541 | unsigned int clusters_to_del, | ||
1542 | struct inode *inode, | ||
1543 | struct buffer_head *fe_bh, | ||
1544 | struct buffer_head *old_last_eb_bh, | ||
1545 | struct ocfs2_journal_handle *handle, | ||
1546 | struct ocfs2_truncate_context *tc) | ||
1547 | { | ||
1548 | int status, i, depth; | ||
1549 | struct ocfs2_dinode *fe; | ||
1550 | struct ocfs2_extent_block *eb; | ||
1551 | struct ocfs2_extent_block *last_eb = NULL; | ||
1552 | struct ocfs2_extent_list *el; | ||
1553 | struct buffer_head *eb_bh = NULL; | ||
1554 | struct buffer_head *last_eb_bh = NULL; | ||
1555 | u64 next_eb = 0; | ||
1556 | u64 delete_blk = 0; | ||
1557 | |||
1558 | fe = (struct ocfs2_dinode *) fe_bh->b_data; | ||
1559 | |||
1560 | status = ocfs2_find_new_last_ext_blk(osb, | ||
1561 | inode, | ||
1562 | fe, | ||
1563 | le32_to_cpu(fe->i_clusters) - | ||
1564 | clusters_to_del, | ||
1565 | old_last_eb_bh, | ||
1566 | &last_eb_bh); | ||
1567 | if (status < 0) { | ||
1568 | mlog_errno(status); | ||
1569 | goto bail; | ||
1570 | } | ||
1571 | if (last_eb_bh) | ||
1572 | last_eb = (struct ocfs2_extent_block *) last_eb_bh->b_data; | ||
1573 | |||
1574 | status = ocfs2_journal_access(handle, inode, fe_bh, | ||
1575 | OCFS2_JOURNAL_ACCESS_WRITE); | ||
1576 | if (status < 0) { | ||
1577 | mlog_errno(status); | ||
1578 | goto bail; | ||
1579 | } | ||
1580 | el = &(fe->id2.i_list); | ||
1581 | |||
1582 | spin_lock(&OCFS2_I(inode)->ip_lock); | ||
1583 | OCFS2_I(inode)->ip_clusters = le32_to_cpu(fe->i_clusters) - | ||
1584 | clusters_to_del; | ||
1585 | spin_unlock(&OCFS2_I(inode)->ip_lock); | ||
1586 | le32_add_cpu(&fe->i_clusters, -clusters_to_del); | ||
1587 | fe->i_mtime = cpu_to_le64(CURRENT_TIME.tv_sec); | ||
1588 | fe->i_mtime_nsec = cpu_to_le32(CURRENT_TIME.tv_nsec); | ||
1589 | |||
1590 | i = le16_to_cpu(el->l_next_free_rec) - 1; | ||
1591 | |||
1592 | BUG_ON(le32_to_cpu(el->l_recs[i].e_clusters) < clusters_to_del); | ||
1593 | le32_add_cpu(&el->l_recs[i].e_clusters, -clusters_to_del); | ||
1594 | /* tree depth zero, we can just delete the clusters, otherwise | ||
1595 | * we need to record the offset of the next level extent block | ||
1596 | * as we may overwrite it. */ | ||
1597 | if (!el->l_tree_depth) | ||
1598 | delete_blk = le64_to_cpu(el->l_recs[i].e_blkno) | ||
1599 | + ocfs2_clusters_to_blocks(osb->sb, | ||
1600 | le32_to_cpu(el->l_recs[i].e_clusters)); | ||
1601 | else | ||
1602 | next_eb = le64_to_cpu(el->l_recs[i].e_blkno); | ||
1603 | |||
1604 | if (!el->l_recs[i].e_clusters) { | ||
1605 | /* if we deleted the whole extent record, then clear | ||
1606 | * out the other fields and update the extent | ||
1607 | * list. For depth > 0 trees, we've already recorded | ||
1608 | * the extent block in 'next_eb' */ | ||
1609 | el->l_recs[i].e_cpos = 0; | ||
1610 | el->l_recs[i].e_blkno = 0; | ||
1611 | BUG_ON(!el->l_next_free_rec); | ||
1612 | le16_add_cpu(&el->l_next_free_rec, -1); | ||
1613 | } | ||
1614 | |||
1615 | depth = le16_to_cpu(el->l_tree_depth); | ||
1616 | if (!fe->i_clusters) { | ||
1617 | /* trunc to zero is a special case. */ | ||
1618 | el->l_tree_depth = 0; | ||
1619 | fe->i_last_eb_blk = 0; | ||
1620 | } else if (last_eb) | ||
1621 | fe->i_last_eb_blk = last_eb->h_blkno; | ||
1622 | |||
1623 | status = ocfs2_journal_dirty(handle, fe_bh); | ||
1624 | if (status < 0) { | ||
1625 | mlog_errno(status); | ||
1626 | goto bail; | ||
1627 | } | ||
1628 | |||
1629 | if (last_eb) { | ||
1630 | /* If there will be a new last extent block, then by | ||
1631 | * definition, there cannot be any leaves to the right of | ||
1632 | * him. */ | ||
1633 | status = ocfs2_journal_access(handle, inode, last_eb_bh, | ||
1634 | OCFS2_JOURNAL_ACCESS_WRITE); | ||
1635 | if (status < 0) { | ||
1636 | mlog_errno(status); | ||
1637 | goto bail; | ||
1638 | } | ||
1639 | last_eb->h_next_leaf_blk = 0; | ||
1640 | status = ocfs2_journal_dirty(handle, last_eb_bh); | ||
1641 | if (status < 0) { | ||
1642 | mlog_errno(status); | ||
1643 | goto bail; | ||
1644 | } | ||
1645 | } | ||
1646 | |||
1647 | /* if our tree depth > 0, update all the tree blocks below us. */ | ||
1648 | while (depth) { | ||
1649 | mlog(0, "traveling tree (depth = %d, next_eb = %"MLFu64")\n", | ||
1650 | depth, next_eb); | ||
1651 | status = ocfs2_read_block(osb, next_eb, &eb_bh, | ||
1652 | OCFS2_BH_CACHED, inode); | ||
1653 | if (status < 0) { | ||
1654 | mlog_errno(status); | ||
1655 | goto bail; | ||
1656 | } | ||
1657 | eb = (struct ocfs2_extent_block *)eb_bh->b_data; | ||
1658 | if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) { | ||
1659 | OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb, eb); | ||
1660 | status = -EIO; | ||
1661 | goto bail; | ||
1662 | } | ||
1663 | el = &(eb->h_list); | ||
1664 | |||
1665 | status = ocfs2_journal_access(handle, inode, eb_bh, | ||
1666 | OCFS2_JOURNAL_ACCESS_WRITE); | ||
1667 | if (status < 0) { | ||
1668 | mlog_errno(status); | ||
1669 | goto bail; | ||
1670 | } | ||
1671 | |||
1672 | BUG_ON(le16_to_cpu(el->l_next_free_rec) == 0); | ||
1673 | BUG_ON(depth != (le16_to_cpu(el->l_tree_depth) + 1)); | ||
1674 | |||
1675 | i = le16_to_cpu(el->l_next_free_rec) - 1; | ||
1676 | |||
1677 | mlog(0, "extent block %"MLFu64", before: record %d: " | ||
1678 | "(%u, %u, %"MLFu64"), next = %u\n", | ||
1679 | le64_to_cpu(eb->h_blkno), i, | ||
1680 | le32_to_cpu(el->l_recs[i].e_cpos), | ||
1681 | le32_to_cpu(el->l_recs[i].e_clusters), | ||
1682 | le64_to_cpu(el->l_recs[i].e_blkno), | ||
1683 | le16_to_cpu(el->l_next_free_rec)); | ||
1684 | |||
1685 | BUG_ON(le32_to_cpu(el->l_recs[i].e_clusters) < clusters_to_del); | ||
1686 | le32_add_cpu(&el->l_recs[i].e_clusters, -clusters_to_del); | ||
1687 | |||
1688 | next_eb = le64_to_cpu(el->l_recs[i].e_blkno); | ||
1689 | /* bottom-most block requires us to delete data.*/ | ||
1690 | if (!el->l_tree_depth) | ||
1691 | delete_blk = le64_to_cpu(el->l_recs[i].e_blkno) | ||
1692 | + ocfs2_clusters_to_blocks(osb->sb, | ||
1693 | le32_to_cpu(el->l_recs[i].e_clusters)); | ||
1694 | if (!el->l_recs[i].e_clusters) { | ||
1695 | el->l_recs[i].e_cpos = 0; | ||
1696 | el->l_recs[i].e_blkno = 0; | ||
1697 | BUG_ON(!el->l_next_free_rec); | ||
1698 | le16_add_cpu(&el->l_next_free_rec, -1); | ||
1699 | } | ||
1700 | mlog(0, "extent block %"MLFu64", after: record %d: " | ||
1701 | "(%u, %u, %"MLFu64"), next = %u\n", | ||
1702 | le64_to_cpu(eb->h_blkno), i, | ||
1703 | le32_to_cpu(el->l_recs[i].e_cpos), | ||
1704 | le32_to_cpu(el->l_recs[i].e_clusters), | ||
1705 | le64_to_cpu(el->l_recs[i].e_blkno), | ||
1706 | le16_to_cpu(el->l_next_free_rec)); | ||
1707 | |||
1708 | status = ocfs2_journal_dirty(handle, eb_bh); | ||
1709 | if (status < 0) { | ||
1710 | mlog_errno(status); | ||
1711 | goto bail; | ||
1712 | } | ||
1713 | |||
1714 | if (!el->l_next_free_rec) { | ||
1715 | mlog(0, "deleting this extent block.\n"); | ||
1716 | |||
1717 | ocfs2_remove_from_cache(inode, eb_bh); | ||
1718 | |||
1719 | BUG_ON(eb->h_suballoc_slot); | ||
1720 | BUG_ON(el->l_recs[0].e_clusters); | ||
1721 | BUG_ON(el->l_recs[0].e_cpos); | ||
1722 | BUG_ON(el->l_recs[0].e_blkno); | ||
1723 | status = ocfs2_free_extent_block(handle, | ||
1724 | tc->tc_ext_alloc_inode, | ||
1725 | tc->tc_ext_alloc_bh, | ||
1726 | eb); | ||
1727 | if (status < 0) { | ||
1728 | mlog_errno(status); | ||
1729 | goto bail; | ||
1730 | } | ||
1731 | } | ||
1732 | brelse(eb_bh); | ||
1733 | eb_bh = NULL; | ||
1734 | depth--; | ||
1735 | } | ||
1736 | |||
1737 | BUG_ON(!delete_blk); | ||
1738 | status = ocfs2_truncate_log_append(osb, handle, delete_blk, | ||
1739 | clusters_to_del); | ||
1740 | if (status < 0) { | ||
1741 | mlog_errno(status); | ||
1742 | goto bail; | ||
1743 | } | ||
1744 | status = 0; | ||
1745 | bail: | ||
1746 | if (!status) | ||
1747 | ocfs2_extent_map_trunc(inode, le32_to_cpu(fe->i_clusters)); | ||
1748 | else | ||
1749 | ocfs2_extent_map_drop(inode, 0); | ||
1750 | mlog_exit(status); | ||
1751 | return status; | ||
1752 | } | ||
1753 | |||
1754 | /* | ||
1755 | * It is expected, that by the time you call this function, | ||
1756 | * inode->i_size and fe->i_size have been adjusted. | ||
1757 | * | ||
1758 | * WARNING: This will kfree the truncate context | ||
1759 | */ | ||
1760 | int ocfs2_commit_truncate(struct ocfs2_super *osb, | ||
1761 | struct inode *inode, | ||
1762 | struct buffer_head *fe_bh, | ||
1763 | struct ocfs2_truncate_context *tc) | ||
1764 | { | ||
1765 | int status, i, credits, tl_sem = 0; | ||
1766 | u32 clusters_to_del, target_i_clusters; | ||
1767 | u64 last_eb = 0; | ||
1768 | struct ocfs2_dinode *fe; | ||
1769 | struct ocfs2_extent_block *eb; | ||
1770 | struct ocfs2_extent_list *el; | ||
1771 | struct buffer_head *last_eb_bh; | ||
1772 | struct ocfs2_journal_handle *handle = NULL; | ||
1773 | struct inode *tl_inode = osb->osb_tl_inode; | ||
1774 | |||
1775 | mlog_entry_void(); | ||
1776 | |||
1777 | down_write(&OCFS2_I(inode)->ip_alloc_sem); | ||
1778 | |||
1779 | target_i_clusters = ocfs2_clusters_for_bytes(osb->sb, | ||
1780 | i_size_read(inode)); | ||
1781 | |||
1782 | last_eb_bh = tc->tc_last_eb_bh; | ||
1783 | tc->tc_last_eb_bh = NULL; | ||
1784 | |||
1785 | fe = (struct ocfs2_dinode *) fe_bh->b_data; | ||
1786 | |||
1787 | if (fe->id2.i_list.l_tree_depth) { | ||
1788 | eb = (struct ocfs2_extent_block *) last_eb_bh->b_data; | ||
1789 | el = &eb->h_list; | ||
1790 | } else | ||
1791 | el = &fe->id2.i_list; | ||
1792 | last_eb = le64_to_cpu(fe->i_last_eb_blk); | ||
1793 | start: | ||
1794 | mlog(0, "ocfs2_commit_truncate: fe->i_clusters = %u, " | ||
1795 | "last_eb = %"MLFu64", fe->i_last_eb_blk = %"MLFu64", " | ||
1796 | "fe->id2.i_list.l_tree_depth = %u last_eb_bh = %p\n", | ||
1797 | le32_to_cpu(fe->i_clusters), last_eb, | ||
1798 | le64_to_cpu(fe->i_last_eb_blk), | ||
1799 | le16_to_cpu(fe->id2.i_list.l_tree_depth), last_eb_bh); | ||
1800 | |||
1801 | if (last_eb != le64_to_cpu(fe->i_last_eb_blk)) { | ||
1802 | mlog(0, "last_eb changed!\n"); | ||
1803 | BUG_ON(!fe->id2.i_list.l_tree_depth); | ||
1804 | last_eb = le64_to_cpu(fe->i_last_eb_blk); | ||
1805 | /* i_last_eb_blk may have changed, read it if | ||
1806 | * necessary. We don't have to worry about the | ||
1807 | * truncate to zero case here (where there becomes no | ||
1808 | * last_eb) because we never loop back after our work | ||
1809 | * is done. */ | ||
1810 | if (last_eb_bh) { | ||
1811 | brelse(last_eb_bh); | ||
1812 | last_eb_bh = NULL; | ||
1813 | } | ||
1814 | |||
1815 | status = ocfs2_read_block(osb, last_eb, | ||
1816 | &last_eb_bh, OCFS2_BH_CACHED, | ||
1817 | inode); | ||
1818 | if (status < 0) { | ||
1819 | mlog_errno(status); | ||
1820 | goto bail; | ||
1821 | } | ||
1822 | eb = (struct ocfs2_extent_block *) last_eb_bh->b_data; | ||
1823 | if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) { | ||
1824 | OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb, eb); | ||
1825 | status = -EIO; | ||
1826 | goto bail; | ||
1827 | } | ||
1828 | el = &(eb->h_list); | ||
1829 | } | ||
1830 | |||
1831 | /* by now, el will point to the extent list on the bottom most | ||
1832 | * portion of this tree. */ | ||
1833 | i = le16_to_cpu(el->l_next_free_rec) - 1; | ||
1834 | if (le32_to_cpu(el->l_recs[i].e_cpos) >= target_i_clusters) | ||
1835 | clusters_to_del = le32_to_cpu(el->l_recs[i].e_clusters); | ||
1836 | else | ||
1837 | clusters_to_del = (le32_to_cpu(el->l_recs[i].e_clusters) + | ||
1838 | le32_to_cpu(el->l_recs[i].e_cpos)) - | ||
1839 | target_i_clusters; | ||
1840 | |||
1841 | mlog(0, "clusters_to_del = %u in this pass\n", clusters_to_del); | ||
1842 | |||
1843 | down(&tl_inode->i_sem); | ||
1844 | tl_sem = 1; | ||
1845 | /* ocfs2_truncate_log_needs_flush guarantees us at least one | ||
1846 | * record is free for use. If there isn't any, we flush to get | ||
1847 | * an empty truncate log. */ | ||
1848 | if (ocfs2_truncate_log_needs_flush(osb)) { | ||
1849 | status = __ocfs2_flush_truncate_log(osb); | ||
1850 | if (status < 0) { | ||
1851 | mlog_errno(status); | ||
1852 | goto bail; | ||
1853 | } | ||
1854 | } | ||
1855 | |||
1856 | credits = ocfs2_calc_tree_trunc_credits(osb->sb, clusters_to_del, | ||
1857 | fe, el); | ||
1858 | handle = ocfs2_start_trans(osb, NULL, credits); | ||
1859 | if (IS_ERR(handle)) { | ||
1860 | status = PTR_ERR(handle); | ||
1861 | handle = NULL; | ||
1862 | mlog_errno(status); | ||
1863 | goto bail; | ||
1864 | } | ||
1865 | |||
1866 | inode->i_ctime = inode->i_mtime = CURRENT_TIME; | ||
1867 | status = ocfs2_mark_inode_dirty(handle, inode, fe_bh); | ||
1868 | if (status < 0) | ||
1869 | mlog_errno(status); | ||
1870 | |||
1871 | status = ocfs2_do_truncate(osb, clusters_to_del, inode, fe_bh, | ||
1872 | last_eb_bh, handle, tc); | ||
1873 | if (status < 0) { | ||
1874 | mlog_errno(status); | ||
1875 | goto bail; | ||
1876 | } | ||
1877 | |||
1878 | up(&tl_inode->i_sem); | ||
1879 | tl_sem = 0; | ||
1880 | |||
1881 | ocfs2_commit_trans(handle); | ||
1882 | handle = NULL; | ||
1883 | |||
1884 | BUG_ON(le32_to_cpu(fe->i_clusters) < target_i_clusters); | ||
1885 | if (le32_to_cpu(fe->i_clusters) > target_i_clusters) | ||
1886 | goto start; | ||
1887 | bail: | ||
1888 | up_write(&OCFS2_I(inode)->ip_alloc_sem); | ||
1889 | |||
1890 | ocfs2_schedule_truncate_log_flush(osb, 1); | ||
1891 | |||
1892 | if (tl_sem) | ||
1893 | up(&tl_inode->i_sem); | ||
1894 | |||
1895 | if (handle) | ||
1896 | ocfs2_commit_trans(handle); | ||
1897 | |||
1898 | if (last_eb_bh) | ||
1899 | brelse(last_eb_bh); | ||
1900 | |||
1901 | /* This will drop the ext_alloc cluster lock for us */ | ||
1902 | ocfs2_free_truncate_context(tc); | ||
1903 | |||
1904 | mlog_exit(status); | ||
1905 | return status; | ||
1906 | } | ||
1907 | |||
1908 | |||
1909 | /* | ||
1910 | * Expects the inode to already be locked. This will figure out which | ||
1911 | * inodes need to be locked and will put them on the returned truncate | ||
1912 | * context. | ||
1913 | */ | ||
1914 | int ocfs2_prepare_truncate(struct ocfs2_super *osb, | ||
1915 | struct inode *inode, | ||
1916 | struct buffer_head *fe_bh, | ||
1917 | struct ocfs2_truncate_context **tc) | ||
1918 | { | ||
1919 | int status, metadata_delete; | ||
1920 | unsigned int new_i_clusters; | ||
1921 | struct ocfs2_dinode *fe; | ||
1922 | struct ocfs2_extent_block *eb; | ||
1923 | struct ocfs2_extent_list *el; | ||
1924 | struct buffer_head *last_eb_bh = NULL; | ||
1925 | struct inode *ext_alloc_inode = NULL; | ||
1926 | struct buffer_head *ext_alloc_bh = NULL; | ||
1927 | |||
1928 | mlog_entry_void(); | ||
1929 | |||
1930 | *tc = NULL; | ||
1931 | |||
1932 | new_i_clusters = ocfs2_clusters_for_bytes(osb->sb, | ||
1933 | i_size_read(inode)); | ||
1934 | fe = (struct ocfs2_dinode *) fe_bh->b_data; | ||
1935 | |||
1936 | mlog(0, "fe->i_clusters = %u, new_i_clusters = %u, fe->i_size =" | ||
1937 | "%"MLFu64"\n", fe->i_clusters, new_i_clusters, fe->i_size); | ||
1938 | |||
1939 | if (le32_to_cpu(fe->i_clusters) <= new_i_clusters) { | ||
1940 | ocfs2_error(inode->i_sb, "Dinode %"MLFu64" has cluster count " | ||
1941 | "%u and size %"MLFu64" whereas struct inode has " | ||
1942 | "cluster count %u and size %llu which caused an " | ||
1943 | "invalid truncate to %u clusters.", | ||
1944 | le64_to_cpu(fe->i_blkno), | ||
1945 | le32_to_cpu(fe->i_clusters), | ||
1946 | le64_to_cpu(fe->i_size), | ||
1947 | OCFS2_I(inode)->ip_clusters, i_size_read(inode), | ||
1948 | new_i_clusters); | ||
1949 | mlog_meta_lvb(ML_ERROR, &OCFS2_I(inode)->ip_meta_lockres); | ||
1950 | status = -EIO; | ||
1951 | goto bail; | ||
1952 | } | ||
1953 | |||
1954 | *tc = kcalloc(1, sizeof(struct ocfs2_truncate_context), GFP_KERNEL); | ||
1955 | if (!(*tc)) { | ||
1956 | status = -ENOMEM; | ||
1957 | mlog_errno(status); | ||
1958 | goto bail; | ||
1959 | } | ||
1960 | |||
1961 | metadata_delete = 0; | ||
1962 | if (fe->id2.i_list.l_tree_depth) { | ||
1963 | /* If we have a tree, then the truncate may result in | ||
1964 | * metadata deletes. Figure this out from the | ||
1965 | * rightmost leaf block.*/ | ||
1966 | status = ocfs2_read_block(osb, le64_to_cpu(fe->i_last_eb_blk), | ||
1967 | &last_eb_bh, OCFS2_BH_CACHED, inode); | ||
1968 | if (status < 0) { | ||
1969 | mlog_errno(status); | ||
1970 | goto bail; | ||
1971 | } | ||
1972 | eb = (struct ocfs2_extent_block *) last_eb_bh->b_data; | ||
1973 | if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) { | ||
1974 | OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb, eb); | ||
1975 | |||
1976 | brelse(last_eb_bh); | ||
1977 | status = -EIO; | ||
1978 | goto bail; | ||
1979 | } | ||
1980 | el = &(eb->h_list); | ||
1981 | if (le32_to_cpu(el->l_recs[0].e_cpos) >= new_i_clusters) | ||
1982 | metadata_delete = 1; | ||
1983 | } | ||
1984 | |||
1985 | (*tc)->tc_last_eb_bh = last_eb_bh; | ||
1986 | |||
1987 | if (metadata_delete) { | ||
1988 | mlog(0, "Will have to delete metadata for this trunc. " | ||
1989 | "locking allocator.\n"); | ||
1990 | ext_alloc_inode = ocfs2_get_system_file_inode(osb, EXTENT_ALLOC_SYSTEM_INODE, 0); | ||
1991 | if (!ext_alloc_inode) { | ||
1992 | status = -ENOMEM; | ||
1993 | mlog_errno(status); | ||
1994 | goto bail; | ||
1995 | } | ||
1996 | |||
1997 | down(&ext_alloc_inode->i_sem); | ||
1998 | (*tc)->tc_ext_alloc_inode = ext_alloc_inode; | ||
1999 | |||
2000 | status = ocfs2_meta_lock(ext_alloc_inode, | ||
2001 | NULL, | ||
2002 | &ext_alloc_bh, | ||
2003 | 1); | ||
2004 | if (status < 0) { | ||
2005 | mlog_errno(status); | ||
2006 | goto bail; | ||
2007 | } | ||
2008 | (*tc)->tc_ext_alloc_bh = ext_alloc_bh; | ||
2009 | (*tc)->tc_ext_alloc_locked = 1; | ||
2010 | } | ||
2011 | |||
2012 | status = 0; | ||
2013 | bail: | ||
2014 | if (status < 0) { | ||
2015 | if (*tc) | ||
2016 | ocfs2_free_truncate_context(*tc); | ||
2017 | *tc = NULL; | ||
2018 | } | ||
2019 | mlog_exit_void(); | ||
2020 | return status; | ||
2021 | } | ||
2022 | |||
2023 | static void ocfs2_free_truncate_context(struct ocfs2_truncate_context *tc) | ||
2024 | { | ||
2025 | if (tc->tc_ext_alloc_inode) { | ||
2026 | if (tc->tc_ext_alloc_locked) | ||
2027 | ocfs2_meta_unlock(tc->tc_ext_alloc_inode, 1); | ||
2028 | |||
2029 | up(&tc->tc_ext_alloc_inode->i_sem); | ||
2030 | iput(tc->tc_ext_alloc_inode); | ||
2031 | } | ||
2032 | |||
2033 | if (tc->tc_ext_alloc_bh) | ||
2034 | brelse(tc->tc_ext_alloc_bh); | ||
2035 | |||
2036 | if (tc->tc_last_eb_bh) | ||
2037 | brelse(tc->tc_last_eb_bh); | ||
2038 | |||
2039 | kfree(tc); | ||
2040 | } | ||
diff --git a/fs/ocfs2/alloc.h b/fs/ocfs2/alloc.h new file mode 100644 index 000000000000..12ba897743f4 --- /dev/null +++ b/fs/ocfs2/alloc.h | |||
@@ -0,0 +1,82 @@ | |||
1 | /* -*- mode: c; c-basic-offset: 8; -*- | ||
2 | * vim: noexpandtab sw=8 ts=8 sts=0: | ||
3 | * | ||
4 | * alloc.h | ||
5 | * | ||
6 | * Function prototypes | ||
7 | * | ||
8 | * Copyright (C) 2002, 2004 Oracle. All rights reserved. | ||
9 | * | ||
10 | * This program is free software; you can redistribute it and/or | ||
11 | * modify it under the terms of the GNU General Public | ||
12 | * License as published by the Free Software Foundation; either | ||
13 | * version 2 of the License, or (at your option) any later version. | ||
14 | * | ||
15 | * This program is distributed in the hope that it will be useful, | ||
16 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
17 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | ||
18 | * General Public License for more details. | ||
19 | * | ||
20 | * You should have received a copy of the GNU General Public | ||
21 | * License along with this program; if not, write to the | ||
22 | * Free Software Foundation, Inc., 59 Temple Place - Suite 330, | ||
23 | * Boston, MA 02111-1307, USA. | ||
24 | */ | ||
25 | |||
26 | #ifndef OCFS2_ALLOC_H | ||
27 | #define OCFS2_ALLOC_H | ||
28 | |||
29 | struct ocfs2_alloc_context; | ||
30 | int ocfs2_insert_extent(struct ocfs2_super *osb, | ||
31 | struct ocfs2_journal_handle *handle, | ||
32 | struct inode *inode, | ||
33 | struct buffer_head *fe_bh, | ||
34 | u64 blkno, | ||
35 | u32 new_clusters, | ||
36 | struct ocfs2_alloc_context *meta_ac); | ||
37 | int ocfs2_num_free_extents(struct ocfs2_super *osb, | ||
38 | struct inode *inode, | ||
39 | struct ocfs2_dinode *fe); | ||
40 | /* how many new metadata chunks would an allocation need at maximum? */ | ||
41 | static inline int ocfs2_extend_meta_needed(struct ocfs2_dinode *fe) | ||
42 | { | ||
43 | /* | ||
44 | * Rather than do all the work of determining how much we need | ||
45 | * (involves a ton of reads and locks), just ask for the | ||
46 | * maximal limit. That's a tree depth shift. So, one block for | ||
47 | * level of the tree (current l_tree_depth), one block for the | ||
48 | * new tree_depth==0 extent_block, and one block at the new | ||
49 | * top-of-the tree. | ||
50 | */ | ||
51 | return le16_to_cpu(fe->id2.i_list.l_tree_depth) + 2; | ||
52 | } | ||
53 | |||
/*
 * Truncate log interface.
 * NOTE(review): the definitions are not visible from this header; the
 * one-line summaries below are inferred from the names and should be
 * confirmed against the definitions.
 */

/* set up / tear down the truncate log for @osb */
int ocfs2_truncate_log_init(struct ocfs2_super *osb);
void ocfs2_truncate_log_shutdown(struct ocfs2_super *osb);

/* request a later flush of the truncate log */
void ocfs2_schedule_truncate_log_flush(struct ocfs2_super *osb,
				       int cancel);

/* flush the truncate log now */
int ocfs2_flush_truncate_log(struct ocfs2_super *osb);

/* two-step recovery of the truncate log belonging to slot @slot_num */
int ocfs2_begin_truncate_log_recovery(struct ocfs2_super *osb,
				      int slot_num,
				      struct ocfs2_dinode **tl_copy);
int ocfs2_complete_truncate_log_recovery(struct ocfs2_super *osb,
					 struct ocfs2_dinode *tl_copy);
64 | |||
/*
 * State built by ocfs2_prepare_truncate() and consumed (and freed) by
 * ocfs2_commit_truncate().
 */
struct ocfs2_truncate_context {
	/* extent allocator system inode; only set when the truncate may
	 * delete metadata.  Its i_sem is held while it is stored here. */
	struct inode *tc_ext_alloc_inode;
	/* buffer head handed back by ocfs2_meta_lock() on that inode */
	struct buffer_head *tc_ext_alloc_bh;
	int tc_ext_alloc_locked; /* is it cluster locked? */
	/* these get destroyed once it's passed to ocfs2_commit_truncate. */
	struct buffer_head *tc_last_eb_bh;
};
72 | |||
/*
 * Allocate a truncate context for @inode and take the locks the
 * truncate will need.  Returns 0 with *tc set, or a negative errno
 * with *tc set to NULL.
 */
int ocfs2_prepare_truncate(struct ocfs2_super *osb,
			   struct inode *inode,
			   struct buffer_head *fe_bh,
			   struct ocfs2_truncate_context **tc);

/*
 * Carry out the truncate described by @tc.  The context is always
 * freed by this call, so @tc must not be used afterwards.
 */
int ocfs2_commit_truncate(struct ocfs2_super *osb,
			  struct inode *inode,
			  struct buffer_head *fe_bh,
			  struct ocfs2_truncate_context *tc);
81 | |||
82 | #endif /* OCFS2_ALLOC_H */ | ||
diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c new file mode 100644 index 000000000000..8f4467a930a5 --- /dev/null +++ b/fs/ocfs2/aops.c | |||
@@ -0,0 +1,643 @@ | |||
1 | /* -*- mode: c; c-basic-offset: 8; -*- | ||
2 | * vim: noexpandtab sw=8 ts=8 sts=0: | ||
3 | * | ||
4 | * Copyright (C) 2002, 2004 Oracle. All rights reserved. | ||
5 | * | ||
6 | * This program is free software; you can redistribute it and/or | ||
7 | * modify it under the terms of the GNU General Public | ||
8 | * License as published by the Free Software Foundation; either | ||
9 | * version 2 of the License, or (at your option) any later version. | ||
10 | * | ||
11 | * This program is distributed in the hope that it will be useful, | ||
12 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
13 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | ||
14 | * General Public License for more details. | ||
15 | * | ||
16 | * You should have received a copy of the GNU General Public | ||
17 | * License along with this program; if not, write to the | ||
18 | * Free Software Foundation, Inc., 59 Temple Place - Suite 330, | ||
19 | * Boston, MA 02111-1307, USA. | ||
20 | */ | ||
21 | |||
22 | #include <linux/fs.h> | ||
23 | #include <linux/slab.h> | ||
24 | #include <linux/highmem.h> | ||
25 | #include <linux/pagemap.h> | ||
26 | #include <asm/byteorder.h> | ||
27 | |||
28 | #define MLOG_MASK_PREFIX ML_FILE_IO | ||
29 | #include <cluster/masklog.h> | ||
30 | |||
31 | #include "ocfs2.h" | ||
32 | |||
33 | #include "alloc.h" | ||
34 | #include "aops.h" | ||
35 | #include "dlmglue.h" | ||
36 | #include "extent_map.h" | ||
37 | #include "file.h" | ||
38 | #include "inode.h" | ||
39 | #include "journal.h" | ||
40 | #include "super.h" | ||
41 | #include "symlink.h" | ||
42 | |||
43 | #include "buffer_head_io.h" | ||
44 | |||
45 | static int ocfs2_symlink_get_block(struct inode *inode, sector_t iblock, | ||
46 | struct buffer_head *bh_result, int create) | ||
47 | { | ||
48 | int err = -EIO; | ||
49 | int status; | ||
50 | struct ocfs2_dinode *fe = NULL; | ||
51 | struct buffer_head *bh = NULL; | ||
52 | struct buffer_head *buffer_cache_bh = NULL; | ||
53 | struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); | ||
54 | void *kaddr; | ||
55 | |||
56 | mlog_entry("(0x%p, %llu, 0x%p, %d)\n", inode, | ||
57 | (unsigned long long)iblock, bh_result, create); | ||
58 | |||
59 | BUG_ON(ocfs2_inode_is_fast_symlink(inode)); | ||
60 | |||
61 | if ((iblock << inode->i_sb->s_blocksize_bits) > PATH_MAX + 1) { | ||
62 | mlog(ML_ERROR, "block offset > PATH_MAX: %llu", | ||
63 | (unsigned long long)iblock); | ||
64 | goto bail; | ||
65 | } | ||
66 | |||
67 | status = ocfs2_read_block(OCFS2_SB(inode->i_sb), | ||
68 | OCFS2_I(inode)->ip_blkno, | ||
69 | &bh, OCFS2_BH_CACHED, inode); | ||
70 | if (status < 0) { | ||
71 | mlog_errno(status); | ||
72 | goto bail; | ||
73 | } | ||
74 | fe = (struct ocfs2_dinode *) bh->b_data; | ||
75 | |||
76 | if (!OCFS2_IS_VALID_DINODE(fe)) { | ||
77 | mlog(ML_ERROR, "Invalid dinode #%"MLFu64": signature = %.*s\n", | ||
78 | fe->i_blkno, 7, fe->i_signature); | ||
79 | goto bail; | ||
80 | } | ||
81 | |||
82 | if ((u64)iblock >= ocfs2_clusters_to_blocks(inode->i_sb, | ||
83 | le32_to_cpu(fe->i_clusters))) { | ||
84 | mlog(ML_ERROR, "block offset is outside the allocated size: " | ||
85 | "%llu\n", (unsigned long long)iblock); | ||
86 | goto bail; | ||
87 | } | ||
88 | |||
89 | /* We don't use the page cache to create symlink data, so if | ||
90 | * need be, copy it over from the buffer cache. */ | ||
91 | if (!buffer_uptodate(bh_result) && ocfs2_inode_is_new(inode)) { | ||
92 | u64 blkno = le64_to_cpu(fe->id2.i_list.l_recs[0].e_blkno) + | ||
93 | iblock; | ||
94 | buffer_cache_bh = sb_getblk(osb->sb, blkno); | ||
95 | if (!buffer_cache_bh) { | ||
96 | mlog(ML_ERROR, "couldn't getblock for symlink!\n"); | ||
97 | goto bail; | ||
98 | } | ||
99 | |||
100 | /* we haven't locked out transactions, so a commit | ||
101 | * could've happened. Since we've got a reference on | ||
102 | * the bh, even if it commits while we're doing the | ||
103 | * copy, the data is still good. */ | ||
104 | if (buffer_jbd(buffer_cache_bh) | ||
105 | && ocfs2_inode_is_new(inode)) { | ||
106 | kaddr = kmap_atomic(bh_result->b_page, KM_USER0); | ||
107 | if (!kaddr) { | ||
108 | mlog(ML_ERROR, "couldn't kmap!\n"); | ||
109 | goto bail; | ||
110 | } | ||
111 | memcpy(kaddr + (bh_result->b_size * iblock), | ||
112 | buffer_cache_bh->b_data, | ||
113 | bh_result->b_size); | ||
114 | kunmap_atomic(kaddr, KM_USER0); | ||
115 | set_buffer_uptodate(bh_result); | ||
116 | } | ||
117 | brelse(buffer_cache_bh); | ||
118 | } | ||
119 | |||
120 | map_bh(bh_result, inode->i_sb, | ||
121 | le64_to_cpu(fe->id2.i_list.l_recs[0].e_blkno) + iblock); | ||
122 | |||
123 | err = 0; | ||
124 | |||
125 | bail: | ||
126 | if (bh) | ||
127 | brelse(bh); | ||
128 | |||
129 | mlog_exit(err); | ||
130 | return err; | ||
131 | } | ||
132 | |||
133 | static int ocfs2_get_block(struct inode *inode, sector_t iblock, | ||
134 | struct buffer_head *bh_result, int create) | ||
135 | { | ||
136 | int err = 0; | ||
137 | u64 p_blkno, past_eof; | ||
138 | |||
139 | mlog_entry("(0x%p, %llu, 0x%p, %d)\n", inode, | ||
140 | (unsigned long long)iblock, bh_result, create); | ||
141 | |||
142 | if (OCFS2_I(inode)->ip_flags & OCFS2_INODE_SYSTEM_FILE) | ||
143 | mlog(ML_NOTICE, "get_block on system inode 0x%p (%lu)\n", | ||
144 | inode, inode->i_ino); | ||
145 | |||
146 | if (S_ISLNK(inode->i_mode)) { | ||
147 | /* this always does I/O for some reason. */ | ||
148 | err = ocfs2_symlink_get_block(inode, iblock, bh_result, create); | ||
149 | goto bail; | ||
150 | } | ||
151 | |||
152 | /* this can happen if another node truncs after our extend! */ | ||
153 | spin_lock(&OCFS2_I(inode)->ip_lock); | ||
154 | if (iblock >= ocfs2_clusters_to_blocks(inode->i_sb, | ||
155 | OCFS2_I(inode)->ip_clusters)) | ||
156 | err = -EIO; | ||
157 | spin_unlock(&OCFS2_I(inode)->ip_lock); | ||
158 | if (err) | ||
159 | goto bail; | ||
160 | |||
161 | err = ocfs2_extent_map_get_blocks(inode, iblock, 1, &p_blkno, | ||
162 | NULL); | ||
163 | if (err) { | ||
164 | mlog(ML_ERROR, "Error %d from get_blocks(0x%p, %llu, 1, " | ||
165 | "%"MLFu64", NULL)\n", err, inode, | ||
166 | (unsigned long long)iblock, p_blkno); | ||
167 | goto bail; | ||
168 | } | ||
169 | |||
170 | map_bh(bh_result, inode->i_sb, p_blkno); | ||
171 | |||
172 | if (bh_result->b_blocknr == 0) { | ||
173 | err = -EIO; | ||
174 | mlog(ML_ERROR, "iblock = %llu p_blkno = %"MLFu64" " | ||
175 | "blkno=(%"MLFu64")\n", (unsigned long long)iblock, | ||
176 | p_blkno, OCFS2_I(inode)->ip_blkno); | ||
177 | } | ||
178 | |||
179 | past_eof = ocfs2_blocks_for_bytes(inode->i_sb, i_size_read(inode)); | ||
180 | mlog(0, "Inode %lu, past_eof = %"MLFu64"\n", inode->i_ino, past_eof); | ||
181 | |||
182 | if (create && (iblock >= past_eof)) | ||
183 | set_buffer_new(bh_result); | ||
184 | |||
185 | bail: | ||
186 | if (err < 0) | ||
187 | err = -EIO; | ||
188 | |||
189 | mlog_exit(err); | ||
190 | return err; | ||
191 | } | ||
192 | |||
193 | static int ocfs2_readpage(struct file *file, struct page *page) | ||
194 | { | ||
195 | struct inode *inode = page->mapping->host; | ||
196 | loff_t start = (loff_t)page->index << PAGE_CACHE_SHIFT; | ||
197 | int ret, unlock = 1; | ||
198 | |||
199 | mlog_entry("(0x%p, %lu)\n", file, (page ? page->index : 0)); | ||
200 | |||
201 | ret = ocfs2_meta_lock_with_page(inode, NULL, NULL, 0, page); | ||
202 | if (ret != 0) { | ||
203 | if (ret == AOP_TRUNCATED_PAGE) | ||
204 | unlock = 0; | ||
205 | mlog_errno(ret); | ||
206 | goto out; | ||
207 | } | ||
208 | |||
209 | down_read(&OCFS2_I(inode)->ip_alloc_sem); | ||
210 | |||
211 | /* | ||
212 | * i_size might have just been updated as we grabed the meta lock. We | ||
213 | * might now be discovering a truncate that hit on another node. | ||
214 | * block_read_full_page->get_block freaks out if it is asked to read | ||
215 | * beyond the end of a file, so we check here. Callers | ||
216 | * (generic_file_read, fault->nopage) are clever enough to check i_size | ||
217 | * and notice that the page they just read isn't needed. | ||
218 | * | ||
219 | * XXX sys_readahead() seems to get that wrong? | ||
220 | */ | ||
221 | if (start >= i_size_read(inode)) { | ||
222 | char *addr = kmap(page); | ||
223 | memset(addr, 0, PAGE_SIZE); | ||
224 | flush_dcache_page(page); | ||
225 | kunmap(page); | ||
226 | SetPageUptodate(page); | ||
227 | ret = 0; | ||
228 | goto out_alloc; | ||
229 | } | ||
230 | |||
231 | ret = ocfs2_data_lock_with_page(inode, 0, page); | ||
232 | if (ret != 0) { | ||
233 | if (ret == AOP_TRUNCATED_PAGE) | ||
234 | unlock = 0; | ||
235 | mlog_errno(ret); | ||
236 | goto out_alloc; | ||
237 | } | ||
238 | |||
239 | ret = block_read_full_page(page, ocfs2_get_block); | ||
240 | unlock = 0; | ||
241 | |||
242 | ocfs2_data_unlock(inode, 0); | ||
243 | out_alloc: | ||
244 | up_read(&OCFS2_I(inode)->ip_alloc_sem); | ||
245 | ocfs2_meta_unlock(inode, 0); | ||
246 | out: | ||
247 | if (unlock) | ||
248 | unlock_page(page); | ||
249 | mlog_exit(ret); | ||
250 | return ret; | ||
251 | } | ||
252 | |||
253 | /* Note: Because we don't support holes, our allocation has | ||
254 | * already happened (allocation writes zeros to the file data) | ||
255 | * so we don't have to worry about ordered writes in | ||
256 | * ocfs2_writepage. | ||
257 | * | ||
258 | * ->writepage is called during the process of invalidating the page cache | ||
259 | * during blocked lock processing. It can't block on any cluster locks | ||
260 | * to during block mapping. It's relying on the fact that the block | ||
261 | * mapping can't have disappeared under the dirty pages that it is | ||
262 | * being asked to write back. | ||
263 | */ | ||
264 | static int ocfs2_writepage(struct page *page, struct writeback_control *wbc) | ||
265 | { | ||
266 | int ret; | ||
267 | |||
268 | mlog_entry("(0x%p)\n", page); | ||
269 | |||
270 | ret = block_write_full_page(page, ocfs2_get_block, wbc); | ||
271 | |||
272 | mlog_exit(ret); | ||
273 | |||
274 | return ret; | ||
275 | } | ||
276 | |||
277 | /* | ||
278 | * ocfs2_prepare_write() can be an outer-most ocfs2 call when it is called | ||
279 | * from loopback. It must be able to perform its own locking around | ||
280 | * ocfs2_get_block(). | ||
281 | */ | ||
282 | int ocfs2_prepare_write(struct file *file, struct page *page, | ||
283 | unsigned from, unsigned to) | ||
284 | { | ||
285 | struct inode *inode = page->mapping->host; | ||
286 | int ret; | ||
287 | |||
288 | mlog_entry("(0x%p, 0x%p, %u, %u)\n", file, page, from, to); | ||
289 | |||
290 | ret = ocfs2_meta_lock_with_page(inode, NULL, NULL, 0, page); | ||
291 | if (ret != 0) { | ||
292 | mlog_errno(ret); | ||
293 | goto out; | ||
294 | } | ||
295 | |||
296 | down_read(&OCFS2_I(inode)->ip_alloc_sem); | ||
297 | |||
298 | ret = block_prepare_write(page, from, to, ocfs2_get_block); | ||
299 | |||
300 | up_read(&OCFS2_I(inode)->ip_alloc_sem); | ||
301 | |||
302 | ocfs2_meta_unlock(inode, 0); | ||
303 | out: | ||
304 | mlog_exit(ret); | ||
305 | return ret; | ||
306 | } | ||
307 | |||
308 | /* Taken from ext3. We don't necessarily need the full blown | ||
309 | * functionality yet, but IMHO it's better to cut and paste the whole | ||
310 | * thing so we can avoid introducing our own bugs (and easily pick up | ||
311 | * their fixes when they happen) --Mark */ | ||
312 | static int walk_page_buffers( handle_t *handle, | ||
313 | struct buffer_head *head, | ||
314 | unsigned from, | ||
315 | unsigned to, | ||
316 | int *partial, | ||
317 | int (*fn)( handle_t *handle, | ||
318 | struct buffer_head *bh)) | ||
319 | { | ||
320 | struct buffer_head *bh; | ||
321 | unsigned block_start, block_end; | ||
322 | unsigned blocksize = head->b_size; | ||
323 | int err, ret = 0; | ||
324 | struct buffer_head *next; | ||
325 | |||
326 | for ( bh = head, block_start = 0; | ||
327 | ret == 0 && (bh != head || !block_start); | ||
328 | block_start = block_end, bh = next) | ||
329 | { | ||
330 | next = bh->b_this_page; | ||
331 | block_end = block_start + blocksize; | ||
332 | if (block_end <= from || block_start >= to) { | ||
333 | if (partial && !buffer_uptodate(bh)) | ||
334 | *partial = 1; | ||
335 | continue; | ||
336 | } | ||
337 | err = (*fn)(handle, bh); | ||
338 | if (!ret) | ||
339 | ret = err; | ||
340 | } | ||
341 | return ret; | ||
342 | } | ||
343 | |||
344 | struct ocfs2_journal_handle *ocfs2_start_walk_page_trans(struct inode *inode, | ||
345 | struct page *page, | ||
346 | unsigned from, | ||
347 | unsigned to) | ||
348 | { | ||
349 | struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); | ||
350 | struct ocfs2_journal_handle *handle = NULL; | ||
351 | int ret = 0; | ||
352 | |||
353 | handle = ocfs2_start_trans(osb, NULL, OCFS2_INODE_UPDATE_CREDITS); | ||
354 | if (!handle) { | ||
355 | ret = -ENOMEM; | ||
356 | mlog_errno(ret); | ||
357 | goto out; | ||
358 | } | ||
359 | |||
360 | if (ocfs2_should_order_data(inode)) { | ||
361 | ret = walk_page_buffers(handle->k_handle, | ||
362 | page_buffers(page), | ||
363 | from, to, NULL, | ||
364 | ocfs2_journal_dirty_data); | ||
365 | if (ret < 0) | ||
366 | mlog_errno(ret); | ||
367 | } | ||
368 | out: | ||
369 | if (ret) { | ||
370 | if (handle) | ||
371 | ocfs2_commit_trans(handle); | ||
372 | handle = ERR_PTR(ret); | ||
373 | } | ||
374 | return handle; | ||
375 | } | ||
376 | |||
377 | static int ocfs2_commit_write(struct file *file, struct page *page, | ||
378 | unsigned from, unsigned to) | ||
379 | { | ||
380 | int ret, extending = 0, locklevel = 0; | ||
381 | loff_t new_i_size; | ||
382 | struct buffer_head *di_bh = NULL; | ||
383 | struct inode *inode = page->mapping->host; | ||
384 | struct ocfs2_journal_handle *handle = NULL; | ||
385 | |||
386 | mlog_entry("(0x%p, 0x%p, %u, %u)\n", file, page, from, to); | ||
387 | |||
388 | /* NOTE: ocfs2_file_aio_write has ensured that it's safe for | ||
389 | * us to sample inode->i_size here without the metadata lock: | ||
390 | * | ||
391 | * 1) We're currently holding the inode alloc lock, so no | ||
392 | * nodes can change it underneath us. | ||
393 | * | ||
394 | * 2) We've had to take the metadata lock at least once | ||
395 | * already to check for extending writes, hence insuring | ||
396 | * that our current copy is also up to date. | ||
397 | */ | ||
398 | new_i_size = ((loff_t)page->index << PAGE_CACHE_SHIFT) + to; | ||
399 | if (new_i_size > i_size_read(inode)) { | ||
400 | extending = 1; | ||
401 | locklevel = 1; | ||
402 | } | ||
403 | |||
404 | ret = ocfs2_meta_lock_with_page(inode, NULL, &di_bh, locklevel, page); | ||
405 | if (ret != 0) { | ||
406 | mlog_errno(ret); | ||
407 | goto out; | ||
408 | } | ||
409 | |||
410 | ret = ocfs2_data_lock_with_page(inode, 1, page); | ||
411 | if (ret != 0) { | ||
412 | mlog_errno(ret); | ||
413 | goto out_unlock_meta; | ||
414 | } | ||
415 | |||
416 | if (extending) { | ||
417 | handle = ocfs2_start_walk_page_trans(inode, page, from, to); | ||
418 | if (IS_ERR(handle)) { | ||
419 | ret = PTR_ERR(handle); | ||
420 | handle = NULL; | ||
421 | goto out_unlock_data; | ||
422 | } | ||
423 | |||
424 | /* Mark our buffer early. We'd rather catch this error up here | ||
425 | * as opposed to after a successful commit_write which would | ||
426 | * require us to set back inode->i_size. */ | ||
427 | ret = ocfs2_journal_access(handle, inode, di_bh, | ||
428 | OCFS2_JOURNAL_ACCESS_WRITE); | ||
429 | if (ret < 0) { | ||
430 | mlog_errno(ret); | ||
431 | goto out_commit; | ||
432 | } | ||
433 | } | ||
434 | |||
435 | /* might update i_size */ | ||
436 | ret = generic_commit_write(file, page, from, to); | ||
437 | if (ret < 0) { | ||
438 | mlog_errno(ret); | ||
439 | goto out_commit; | ||
440 | } | ||
441 | |||
442 | if (extending) { | ||
443 | loff_t size = (u64) i_size_read(inode); | ||
444 | struct ocfs2_dinode *di = | ||
445 | (struct ocfs2_dinode *)di_bh->b_data; | ||
446 | |||
447 | /* ocfs2_mark_inode_dirty is too heavy to use here. */ | ||
448 | inode->i_blocks = ocfs2_align_bytes_to_sectors(size); | ||
449 | inode->i_ctime = inode->i_mtime = CURRENT_TIME; | ||
450 | |||
451 | di->i_size = cpu_to_le64(size); | ||
452 | di->i_ctime = di->i_mtime = | ||
453 | cpu_to_le64(inode->i_mtime.tv_sec); | ||
454 | di->i_ctime_nsec = di->i_mtime_nsec = | ||
455 | cpu_to_le32(inode->i_mtime.tv_nsec); | ||
456 | |||
457 | ret = ocfs2_journal_dirty(handle, di_bh); | ||
458 | if (ret < 0) { | ||
459 | mlog_errno(ret); | ||
460 | goto out_commit; | ||
461 | } | ||
462 | } | ||
463 | |||
464 | BUG_ON(extending && (i_size_read(inode) != new_i_size)); | ||
465 | |||
466 | out_commit: | ||
467 | if (handle) | ||
468 | ocfs2_commit_trans(handle); | ||
469 | out_unlock_data: | ||
470 | ocfs2_data_unlock(inode, 1); | ||
471 | out_unlock_meta: | ||
472 | ocfs2_meta_unlock(inode, locklevel); | ||
473 | out: | ||
474 | if (di_bh) | ||
475 | brelse(di_bh); | ||
476 | |||
477 | mlog_exit(ret); | ||
478 | return ret; | ||
479 | } | ||
480 | |||
481 | static sector_t ocfs2_bmap(struct address_space *mapping, sector_t block) | ||
482 | { | ||
483 | sector_t status; | ||
484 | u64 p_blkno = 0; | ||
485 | int err = 0; | ||
486 | struct inode *inode = mapping->host; | ||
487 | |||
488 | mlog_entry("(block = %llu)\n", (unsigned long long)block); | ||
489 | |||
490 | /* We don't need to lock journal system files, since they aren't | ||
491 | * accessed concurrently from multiple nodes. | ||
492 | */ | ||
493 | if (!INODE_JOURNAL(inode)) { | ||
494 | err = ocfs2_meta_lock(inode, NULL, NULL, 0); | ||
495 | if (err) { | ||
496 | if (err != -ENOENT) | ||
497 | mlog_errno(err); | ||
498 | goto bail; | ||
499 | } | ||
500 | down_read(&OCFS2_I(inode)->ip_alloc_sem); | ||
501 | } | ||
502 | |||
503 | err = ocfs2_extent_map_get_blocks(inode, block, 1, &p_blkno, | ||
504 | NULL); | ||
505 | |||
506 | if (!INODE_JOURNAL(inode)) { | ||
507 | up_read(&OCFS2_I(inode)->ip_alloc_sem); | ||
508 | ocfs2_meta_unlock(inode, 0); | ||
509 | } | ||
510 | |||
511 | if (err) { | ||
512 | mlog(ML_ERROR, "get_blocks() failed, block = %llu\n", | ||
513 | (unsigned long long)block); | ||
514 | mlog_errno(err); | ||
515 | goto bail; | ||
516 | } | ||
517 | |||
518 | |||
519 | bail: | ||
520 | status = err ? 0 : p_blkno; | ||
521 | |||
522 | mlog_exit((int)status); | ||
523 | |||
524 | return status; | ||
525 | } | ||
526 | |||
527 | /* | ||
528 | * TODO: Make this into a generic get_blocks function. | ||
529 | * | ||
530 | * From do_direct_io in direct-io.c: | ||
531 | * "So what we do is to permit the ->get_blocks function to populate | ||
532 | * bh.b_size with the size of IO which is permitted at this offset and | ||
533 | * this i_blkbits." | ||
534 | * | ||
535 | * This function is called directly from get_more_blocks in direct-io.c. | ||
536 | * | ||
537 | * called like this: dio->get_blocks(dio->inode, fs_startblk, | ||
538 | * fs_count, map_bh, dio->rw == WRITE); | ||
539 | */ | ||
540 | static int ocfs2_direct_IO_get_blocks(struct inode *inode, sector_t iblock, | ||
541 | unsigned long max_blocks, | ||
542 | struct buffer_head *bh_result, int create) | ||
543 | { | ||
544 | int ret; | ||
545 | u64 vbo_max; /* file offset, max_blocks from iblock */ | ||
546 | u64 p_blkno; | ||
547 | int contig_blocks; | ||
548 | unsigned char blocksize_bits; | ||
549 | |||
550 | if (!inode || !bh_result) { | ||
551 | mlog(ML_ERROR, "inode or bh_result is null\n"); | ||
552 | return -EIO; | ||
553 | } | ||
554 | |||
555 | blocksize_bits = inode->i_sb->s_blocksize_bits; | ||
556 | |||
557 | /* This function won't even be called if the request isn't all | ||
558 | * nicely aligned and of the right size, so there's no need | ||
559 | * for us to check any of that. */ | ||
560 | |||
561 | vbo_max = ((u64)iblock + max_blocks) << blocksize_bits; | ||
562 | |||
563 | spin_lock(&OCFS2_I(inode)->ip_lock); | ||
564 | if ((iblock + max_blocks) > | ||
565 | ocfs2_clusters_to_blocks(inode->i_sb, | ||
566 | OCFS2_I(inode)->ip_clusters)) { | ||
567 | spin_unlock(&OCFS2_I(inode)->ip_lock); | ||
568 | ret = -EIO; | ||
569 | goto bail; | ||
570 | } | ||
571 | spin_unlock(&OCFS2_I(inode)->ip_lock); | ||
572 | |||
573 | /* This figures out the size of the next contiguous block, and | ||
574 | * our logical offset */ | ||
575 | ret = ocfs2_extent_map_get_blocks(inode, iblock, 1, &p_blkno, | ||
576 | &contig_blocks); | ||
577 | if (ret) { | ||
578 | mlog(ML_ERROR, "get_blocks() failed iblock=%llu\n", | ||
579 | (unsigned long long)iblock); | ||
580 | ret = -EIO; | ||
581 | goto bail; | ||
582 | } | ||
583 | |||
584 | map_bh(bh_result, inode->i_sb, p_blkno); | ||
585 | |||
586 | /* make sure we don't map more than max_blocks blocks here as | ||
587 | that's all the kernel will handle at this point. */ | ||
588 | if (max_blocks < contig_blocks) | ||
589 | contig_blocks = max_blocks; | ||
590 | bh_result->b_size = contig_blocks << blocksize_bits; | ||
591 | bail: | ||
592 | return ret; | ||
593 | } | ||
594 | |||
595 | /* | ||
596 | * ocfs2_dio_end_io is called by the dio core when a dio is finished. We're | ||
597 | * particularly interested in the aio/dio case. Like the core uses | ||
598 | * i_alloc_sem, we use the rw_lock DLM lock to protect io on one node from | ||
599 | * truncation on another. | ||
600 | */ | ||
601 | static void ocfs2_dio_end_io(struct kiocb *iocb, | ||
602 | loff_t offset, | ||
603 | ssize_t bytes, | ||
604 | void *private) | ||
605 | { | ||
606 | struct inode *inode = iocb->ki_filp->f_dentry->d_inode; | ||
607 | |||
608 | /* this io's submitter should not have unlocked this before we could */ | ||
609 | BUG_ON(!ocfs2_iocb_is_rw_locked(iocb)); | ||
610 | ocfs2_iocb_clear_rw_locked(iocb); | ||
611 | up_read(&inode->i_alloc_sem); | ||
612 | ocfs2_rw_unlock(inode, 0); | ||
613 | } | ||
614 | |||
615 | static ssize_t ocfs2_direct_IO(int rw, | ||
616 | struct kiocb *iocb, | ||
617 | const struct iovec *iov, | ||
618 | loff_t offset, | ||
619 | unsigned long nr_segs) | ||
620 | { | ||
621 | struct file *file = iocb->ki_filp; | ||
622 | struct inode *inode = file->f_dentry->d_inode->i_mapping->host; | ||
623 | int ret; | ||
624 | |||
625 | mlog_entry_void(); | ||
626 | ret = blockdev_direct_IO_no_locking(rw, iocb, inode, | ||
627 | inode->i_sb->s_bdev, iov, offset, | ||
628 | nr_segs, | ||
629 | ocfs2_direct_IO_get_blocks, | ||
630 | ocfs2_dio_end_io); | ||
631 | mlog_exit(ret); | ||
632 | return ret; | ||
633 | } | ||
634 | |||
635 | struct address_space_operations ocfs2_aops = { | ||
636 | .readpage = ocfs2_readpage, | ||
637 | .writepage = ocfs2_writepage, | ||
638 | .prepare_write = ocfs2_prepare_write, | ||
639 | .commit_write = ocfs2_commit_write, | ||
640 | .bmap = ocfs2_bmap, | ||
641 | .sync_page = block_sync_page, | ||
642 | .direct_IO = ocfs2_direct_IO | ||
643 | }; | ||
diff --git a/fs/ocfs2/aops.h b/fs/ocfs2/aops.h new file mode 100644 index 000000000000..d40456d509a0 --- /dev/null +++ b/fs/ocfs2/aops.h | |||
@@ -0,0 +1,41 @@ | |||
1 | /* -*- mode: c; c-basic-offset: 8; -*- | ||
2 | * vim: noexpandtab sw=8 ts=8 sts=0: | ||
3 | * | ||
4 | * Copyright (C) 2002, 2004, 2005 Oracle. All rights reserved. | ||
5 | * | ||
6 | * This program is free software; you can redistribute it and/or | ||
7 | * modify it under the terms of the GNU General Public | ||
8 | * License as published by the Free Software Foundation; either | ||
9 | * version 2 of the License, or (at your option) any later version. | ||
10 | * | ||
11 | * This program is distributed in the hope that it will be useful, | ||
12 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
13 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | ||
14 | * General Public License for more details. | ||
15 | * | ||
16 | * You should have received a copy of the GNU General Public | ||
17 | * License along with this program; if not, write to the | ||
18 | * Free Software Foundation, Inc., 59 Temple Place - Suite 330, | ||
19 | * Boston, MA 02111-1307, USA. | ||
20 | */ | ||
21 | |||
22 | #ifndef OCFS2_AOPS_H | ||
23 | #define OCFS2_AOPS_H | ||
24 | |||
25 | int ocfs2_prepare_write(struct file *file, struct page *page, | ||
26 | unsigned from, unsigned to); | ||
27 | |||
28 | struct ocfs2_journal_handle *ocfs2_start_walk_page_trans(struct inode *inode, | ||
29 | struct page *page, | ||
30 | unsigned from, | ||
31 | unsigned to); | ||
32 | |||
/*
 * all ocfs2_dio_end_io()'s fault
 *
 * Bit 0 of the iocb's ->private word records that the rw lock is held on
 * behalf of this iocb, so that ocfs2_dio_end_io() can release it.  The
 * argument is parenthesized so that any pointer expression, not just a
 * plain identifier, can be passed.
 */
#define ocfs2_iocb_is_rw_locked(iocb) \
	test_bit(0, (unsigned long *)&(iocb)->private)
#define ocfs2_iocb_set_rw_locked(iocb) \
	set_bit(0, (unsigned long *)&(iocb)->private)
#define ocfs2_iocb_clear_rw_locked(iocb) \
	clear_bit(0, (unsigned long *)&(iocb)->private)
40 | |||
41 | #endif /* OCFS2_AOPS_H */ | ||
diff --git a/fs/ocfs2/buffer_head_io.c b/fs/ocfs2/buffer_head_io.c new file mode 100644 index 000000000000..d424041b38e9 --- /dev/null +++ b/fs/ocfs2/buffer_head_io.c | |||
@@ -0,0 +1,232 @@ | |||
1 | /* -*- mode: c; c-basic-offset: 8; -*- | ||
2 | * vim: noexpandtab sw=8 ts=8 sts=0: | ||
3 | * | ||
4 | * io.c | ||
5 | * | ||
6 | * Buffer cache handling | ||
7 | * | ||
8 | * Copyright (C) 2002, 2004 Oracle. All rights reserved. | ||
9 | * | ||
10 | * This program is free software; you can redistribute it and/or | ||
11 | * modify it under the terms of the GNU General Public | ||
12 | * License as published by the Free Software Foundation; either | ||
13 | * version 2 of the License, or (at your option) any later version. | ||
14 | * | ||
15 | * This program is distributed in the hope that it will be useful, | ||
16 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
17 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | ||
18 | * General Public License for more details. | ||
19 | * | ||
20 | * You should have received a copy of the GNU General Public | ||
21 | * License along with this program; if not, write to the | ||
22 | * Free Software Foundation, Inc., 59 Temple Place - Suite 330, | ||
23 | * Boston, MA 02111-1307, USA. | ||
24 | */ | ||
25 | |||
26 | #include <linux/fs.h> | ||
27 | #include <linux/types.h> | ||
28 | #include <linux/slab.h> | ||
29 | #include <linux/highmem.h> | ||
30 | |||
31 | #include <cluster/masklog.h> | ||
32 | |||
33 | #include "ocfs2.h" | ||
34 | |||
35 | #include "alloc.h" | ||
36 | #include "inode.h" | ||
37 | #include "journal.h" | ||
38 | #include "uptodate.h" | ||
39 | |||
40 | #include "buffer_head_io.h" | ||
41 | |||
42 | int ocfs2_write_block(struct ocfs2_super *osb, struct buffer_head *bh, | ||
43 | struct inode *inode) | ||
44 | { | ||
45 | int ret = 0; | ||
46 | |||
47 | mlog_entry("(bh->b_blocknr = %llu, inode=%p)\n", | ||
48 | (unsigned long long)bh->b_blocknr, inode); | ||
49 | |||
50 | BUG_ON(bh->b_blocknr < OCFS2_SUPER_BLOCK_BLKNO); | ||
51 | BUG_ON(buffer_jbd(bh)); | ||
52 | |||
53 | /* No need to check for a soft readonly file system here. non | ||
54 | * journalled writes are only ever done on system files which | ||
55 | * can get modified during recovery even if read-only. */ | ||
56 | if (ocfs2_is_hard_readonly(osb)) { | ||
57 | ret = -EROFS; | ||
58 | goto out; | ||
59 | } | ||
60 | |||
61 | down(&OCFS2_I(inode)->ip_io_sem); | ||
62 | |||
63 | lock_buffer(bh); | ||
64 | set_buffer_uptodate(bh); | ||
65 | |||
66 | /* remove from dirty list before I/O. */ | ||
67 | clear_buffer_dirty(bh); | ||
68 | |||
69 | get_bh(bh); /* for end_buffer_write_sync() */ | ||
70 | bh->b_end_io = end_buffer_write_sync; | ||
71 | submit_bh(WRITE, bh); | ||
72 | |||
73 | wait_on_buffer(bh); | ||
74 | |||
75 | if (buffer_uptodate(bh)) { | ||
76 | ocfs2_set_buffer_uptodate(inode, bh); | ||
77 | } else { | ||
78 | /* We don't need to remove the clustered uptodate | ||
79 | * information for this bh as it's not marked locally | ||
80 | * uptodate. */ | ||
81 | ret = -EIO; | ||
82 | brelse(bh); | ||
83 | } | ||
84 | |||
85 | up(&OCFS2_I(inode)->ip_io_sem); | ||
86 | out: | ||
87 | mlog_exit(ret); | ||
88 | return ret; | ||
89 | } | ||
90 | |||
91 | int ocfs2_read_blocks(struct ocfs2_super *osb, u64 block, int nr, | ||
92 | struct buffer_head *bhs[], int flags, | ||
93 | struct inode *inode) | ||
94 | { | ||
95 | int status = 0; | ||
96 | struct super_block *sb; | ||
97 | int i, ignore_cache = 0; | ||
98 | struct buffer_head *bh; | ||
99 | |||
100 | mlog_entry("(block=(%"MLFu64"), nr=(%d), flags=%d, inode=%p)\n", | ||
101 | block, nr, flags, inode); | ||
102 | |||
103 | if (osb == NULL || osb->sb == NULL || bhs == NULL) { | ||
104 | status = -EINVAL; | ||
105 | mlog_errno(status); | ||
106 | goto bail; | ||
107 | } | ||
108 | |||
109 | if (nr < 0) { | ||
110 | mlog(ML_ERROR, "asked to read %d blocks!\n", nr); | ||
111 | status = -EINVAL; | ||
112 | mlog_errno(status); | ||
113 | goto bail; | ||
114 | } | ||
115 | |||
116 | if (nr == 0) { | ||
117 | mlog(ML_BH_IO, "No buffers will be read!\n"); | ||
118 | status = 0; | ||
119 | goto bail; | ||
120 | } | ||
121 | |||
122 | sb = osb->sb; | ||
123 | |||
124 | if (flags & OCFS2_BH_CACHED && !inode) | ||
125 | flags &= ~OCFS2_BH_CACHED; | ||
126 | |||
127 | if (inode) | ||
128 | down(&OCFS2_I(inode)->ip_io_sem); | ||
129 | for (i = 0 ; i < nr ; i++) { | ||
130 | if (bhs[i] == NULL) { | ||
131 | bhs[i] = sb_getblk(sb, block++); | ||
132 | if (bhs[i] == NULL) { | ||
133 | if (inode) | ||
134 | up(&OCFS2_I(inode)->ip_io_sem); | ||
135 | status = -EIO; | ||
136 | mlog_errno(status); | ||
137 | goto bail; | ||
138 | } | ||
139 | } | ||
140 | bh = bhs[i]; | ||
141 | ignore_cache = 0; | ||
142 | |||
143 | if (flags & OCFS2_BH_CACHED && | ||
144 | !ocfs2_buffer_uptodate(inode, bh)) { | ||
145 | mlog(ML_UPTODATE, | ||
146 | "bh (%llu), inode %"MLFu64" not uptodate\n", | ||
147 | (unsigned long long)bh->b_blocknr, | ||
148 | OCFS2_I(inode)->ip_blkno); | ||
149 | ignore_cache = 1; | ||
150 | } | ||
151 | |||
152 | /* XXX: Can we ever get this and *not* have the cached | ||
153 | * flag set? */ | ||
154 | if (buffer_jbd(bh)) { | ||
155 | if (!(flags & OCFS2_BH_CACHED) || ignore_cache) | ||
156 | mlog(ML_BH_IO, "trying to sync read a jbd " | ||
157 | "managed bh (blocknr = %llu)\n", | ||
158 | (unsigned long long)bh->b_blocknr); | ||
159 | continue; | ||
160 | } | ||
161 | |||
162 | if (!(flags & OCFS2_BH_CACHED) || ignore_cache) { | ||
163 | if (buffer_dirty(bh)) { | ||
164 | /* This should probably be a BUG, or | ||
165 | * at least return an error. */ | ||
166 | mlog(ML_BH_IO, "asking me to sync read a dirty " | ||
167 | "buffer! (blocknr = %llu)\n", | ||
168 | (unsigned long long)bh->b_blocknr); | ||
169 | continue; | ||
170 | } | ||
171 | |||
172 | lock_buffer(bh); | ||
173 | if (buffer_jbd(bh)) { | ||
174 | #ifdef CATCH_BH_JBD_RACES | ||
175 | mlog(ML_ERROR, "block %llu had the JBD bit set " | ||
176 | "while I was in lock_buffer!", | ||
177 | (unsigned long long)bh->b_blocknr); | ||
178 | BUG(); | ||
179 | #else | ||
180 | unlock_buffer(bh); | ||
181 | continue; | ||
182 | #endif | ||
183 | } | ||
184 | clear_buffer_uptodate(bh); | ||
185 | get_bh(bh); /* for end_buffer_read_sync() */ | ||
186 | bh->b_end_io = end_buffer_read_sync; | ||
187 | if (flags & OCFS2_BH_READAHEAD) | ||
188 | submit_bh(READA, bh); | ||
189 | else | ||
190 | submit_bh(READ, bh); | ||
191 | continue; | ||
192 | } | ||
193 | } | ||
194 | |||
195 | status = 0; | ||
196 | |||
197 | for (i = (nr - 1); i >= 0; i--) { | ||
198 | bh = bhs[i]; | ||
199 | |||
200 | /* We know this can't have changed as we hold the | ||
201 | * inode sem. Avoid doing any work on the bh if the | ||
202 | * journal has it. */ | ||
203 | if (!buffer_jbd(bh)) | ||
204 | wait_on_buffer(bh); | ||
205 | |||
206 | if (!buffer_uptodate(bh)) { | ||
207 | /* Status won't be cleared from here on out, | ||
208 | * so we can safely record this and loop back | ||
209 | * to cleanup the other buffers. Don't need to | ||
210 | * remove the clustered uptodate information | ||
211 | * for this bh as it's not marked locally | ||
212 | * uptodate. */ | ||
213 | status = -EIO; | ||
214 | brelse(bh); | ||
215 | bhs[i] = NULL; | ||
216 | continue; | ||
217 | } | ||
218 | |||
219 | if (inode) | ||
220 | ocfs2_set_buffer_uptodate(inode, bh); | ||
221 | } | ||
222 | if (inode) | ||
223 | up(&OCFS2_I(inode)->ip_io_sem); | ||
224 | |||
225 | mlog(ML_BH_IO, "block=(%"MLFu64"), nr=(%d), cached=%s\n", block, nr, | ||
226 | (!(flags & OCFS2_BH_CACHED) || ignore_cache) ? "no" : "yes"); | ||
227 | |||
228 | bail: | ||
229 | |||
230 | mlog_exit(status); | ||
231 | return status; | ||
232 | } | ||
diff --git a/fs/ocfs2/buffer_head_io.h b/fs/ocfs2/buffer_head_io.h new file mode 100644 index 000000000000..6ecb90937b68 --- /dev/null +++ b/fs/ocfs2/buffer_head_io.h | |||
@@ -0,0 +1,73 @@ | |||
1 | /* -*- mode: c; c-basic-offset: 8; -*- | ||
2 | * vim: noexpandtab sw=8 ts=8 sts=0: | ||
3 | * | ||
4 | * ocfs2_buffer_head.h | ||
5 | * | ||
6 | * Buffer cache handling functions defined | ||
7 | * | ||
8 | * Copyright (C) 2002, 2004 Oracle. All rights reserved. | ||
9 | * | ||
10 | * This program is free software; you can redistribute it and/or | ||
11 | * modify it under the terms of the GNU General Public | ||
12 | * License as published by the Free Software Foundation; either | ||
13 | * version 2 of the License, or (at your option) any later version. | ||
14 | * | ||
15 | * This program is distributed in the hope that it will be useful, | ||
16 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
17 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | ||
18 | * General Public License for more details. | ||
19 | * | ||
20 | * You should have received a copy of the GNU General Public | ||
21 | * License along with this program; if not, write to the | ||
22 | * Free Software Foundation, Inc., 59 Temple Place - Suite 330, | ||
23 | * Boston, MA 02111-1307, USA. | ||
24 | */ | ||
25 | |||
26 | #ifndef OCFS2_BUFFER_HEAD_IO_H | ||
27 | #define OCFS2_BUFFER_HEAD_IO_H | ||
28 | |||
29 | #include <linux/buffer_head.h> | ||
30 | |||
31 | void ocfs2_end_buffer_io_sync(struct buffer_head *bh, | ||
32 | int uptodate); | ||
33 | |||
34 | static inline int ocfs2_read_block(struct ocfs2_super *osb, | ||
35 | u64 off, | ||
36 | struct buffer_head **bh, | ||
37 | int flags, | ||
38 | struct inode *inode); | ||
39 | |||
40 | int ocfs2_write_block(struct ocfs2_super *osb, | ||
41 | struct buffer_head *bh, | ||
42 | struct inode *inode); | ||
43 | int ocfs2_read_blocks(struct ocfs2_super *osb, | ||
44 | u64 block, | ||
45 | int nr, | ||
46 | struct buffer_head *bhs[], | ||
47 | int flags, | ||
48 | struct inode *inode); | ||
49 | |||
50 | |||
51 | #define OCFS2_BH_CACHED 1 | ||
52 | #define OCFS2_BH_READAHEAD 8 /* use this to pass READA down to submit_bh */ | ||
53 | |||
54 | static inline int ocfs2_read_block(struct ocfs2_super * osb, u64 off, | ||
55 | struct buffer_head **bh, int flags, | ||
56 | struct inode *inode) | ||
57 | { | ||
58 | int status = 0; | ||
59 | |||
60 | if (bh == NULL) { | ||
61 | printk("ocfs2: bh == NULL\n"); | ||
62 | status = -EINVAL; | ||
63 | goto bail; | ||
64 | } | ||
65 | |||
66 | status = ocfs2_read_blocks(osb, off, 1, bh, | ||
67 | flags, inode); | ||
68 | |||
69 | bail: | ||
70 | return status; | ||
71 | } | ||
72 | |||
73 | #endif /* OCFS2_BUFFER_HEAD_IO_H */ | ||
diff --git a/fs/ocfs2/dcache.c b/fs/ocfs2/dcache.c new file mode 100644 index 000000000000..bd85182e97bc --- /dev/null +++ b/fs/ocfs2/dcache.c | |||
@@ -0,0 +1,91 @@ | |||
1 | /* -*- mode: c; c-basic-offset: 8; -*- | ||
2 | * vim: noexpandtab sw=8 ts=8 sts=0: | ||
3 | * | ||
4 | * dcache.c | ||
5 | * | ||
6 | * dentry cache handling code | ||
7 | * | ||
8 | * Copyright (C) 2002, 2004 Oracle. All rights reserved. | ||
9 | * | ||
10 | * This program is free software; you can redistribute it and/or | ||
11 | * modify it under the terms of the GNU General Public | ||
12 | * License as published by the Free Software Foundation; either | ||
13 | * version 2 of the License, or (at your option) any later version. | ||
14 | * | ||
15 | * This program is distributed in the hope that it will be useful, | ||
16 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
17 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | ||
18 | * General Public License for more details. | ||
19 | * | ||
20 | * You should have received a copy of the GNU General Public | ||
21 | * License along with this program; if not, write to the | ||
22 | * Free Software Foundation, Inc., 59 Temple Place - Suite 330, | ||
23 | * Boston, MA 02111-1307, USA. | ||
24 | */ | ||
25 | |||
26 | #include <linux/fs.h> | ||
27 | #include <linux/types.h> | ||
28 | #include <linux/slab.h> | ||
29 | #include <linux/namei.h> | ||
30 | |||
31 | #define MLOG_MASK_PREFIX ML_DCACHE | ||
32 | #include <cluster/masklog.h> | ||
33 | |||
34 | #include "ocfs2.h" | ||
35 | |||
36 | #include "alloc.h" | ||
37 | #include "dcache.h" | ||
38 | #include "file.h" | ||
39 | #include "inode.h" | ||
40 | |||
41 | static int ocfs2_dentry_revalidate(struct dentry *dentry, | ||
42 | struct nameidata *nd) | ||
43 | { | ||
44 | struct inode *inode = dentry->d_inode; | ||
45 | int ret = 0; /* if all else fails, just return false */ | ||
46 | struct ocfs2_super *osb; | ||
47 | |||
48 | mlog_entry("(0x%p, '%.*s')\n", dentry, | ||
49 | dentry->d_name.len, dentry->d_name.name); | ||
50 | |||
51 | /* Never trust a negative dentry - force a new lookup. */ | ||
52 | if (inode == NULL) { | ||
53 | mlog(0, "negative dentry: %.*s\n", dentry->d_name.len, | ||
54 | dentry->d_name.name); | ||
55 | goto bail; | ||
56 | } | ||
57 | |||
58 | osb = OCFS2_SB(inode->i_sb); | ||
59 | |||
60 | BUG_ON(!osb); | ||
61 | |||
62 | if (inode != osb->root_inode) { | ||
63 | spin_lock(&OCFS2_I(inode)->ip_lock); | ||
64 | /* did we or someone else delete this inode? */ | ||
65 | if (OCFS2_I(inode)->ip_flags & OCFS2_INODE_DELETED) { | ||
66 | spin_unlock(&OCFS2_I(inode)->ip_lock); | ||
67 | mlog(0, "inode (%"MLFu64") deleted, returning false\n", | ||
68 | OCFS2_I(inode)->ip_blkno); | ||
69 | goto bail; | ||
70 | } | ||
71 | spin_unlock(&OCFS2_I(inode)->ip_lock); | ||
72 | |||
73 | if (!inode->i_nlink) { | ||
74 | mlog(0, "Inode %"MLFu64" orphaned, returning false " | ||
75 | "dir = %d\n", OCFS2_I(inode)->ip_blkno, | ||
76 | S_ISDIR(inode->i_mode)); | ||
77 | goto bail; | ||
78 | } | ||
79 | } | ||
80 | |||
81 | ret = 1; | ||
82 | |||
83 | bail: | ||
84 | mlog_exit(ret); | ||
85 | |||
86 | return ret; | ||
87 | } | ||
88 | |||
89 | struct dentry_operations ocfs2_dentry_ops = { | ||
90 | .d_revalidate = ocfs2_dentry_revalidate, | ||
91 | }; | ||
diff --git a/fs/ocfs2/dcache.h b/fs/ocfs2/dcache.h new file mode 100644 index 000000000000..90072771114b --- /dev/null +++ b/fs/ocfs2/dcache.h | |||
@@ -0,0 +1,31 @@ | |||
1 | /* -*- mode: c; c-basic-offset: 8; -*- | ||
2 | * vim: noexpandtab sw=8 ts=8 sts=0: | ||
3 | * | ||
4 | * dcache.h | ||
5 | * | ||
6 | * Function prototypes | ||
7 | * | ||
8 | * Copyright (C) 2002, 2004 Oracle. All rights reserved. | ||
9 | * | ||
10 | * This program is free software; you can redistribute it and/or | ||
11 | * modify it under the terms of the GNU General Public | ||
12 | * License as published by the Free Software Foundation; either | ||
13 | * version 2 of the License, or (at your option) any later version. | ||
14 | * | ||
15 | * This program is distributed in the hope that it will be useful, | ||
16 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
17 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | ||
18 | * General Public License for more details. | ||
19 | * | ||
20 | * You should have received a copy of the GNU General Public | ||
21 | * License along with this program; if not, write to the | ||
22 | * Free Software Foundation, Inc., 59 Temple Place - Suite 330, | ||
23 | * Boston, MA 02111-1307, USA. | ||
24 | */ | ||
25 | |||
26 | #ifndef OCFS2_DCACHE_H | ||
27 | #define OCFS2_DCACHE_H | ||
28 | |||
29 | extern struct dentry_operations ocfs2_dentry_ops; | ||
30 | |||
31 | #endif /* OCFS2_DCACHE_H */ | ||
diff --git a/fs/ocfs2/dir.c b/fs/ocfs2/dir.c new file mode 100644 index 000000000000..856e20ae8263 --- /dev/null +++ b/fs/ocfs2/dir.c | |||
@@ -0,0 +1,618 @@ | |||
1 | /* -*- mode: c; c-basic-offset: 8; -*- | ||
2 | * vim: noexpandtab sw=8 ts=8 sts=0: | ||
3 | * | ||
4 | * dir.c | ||
5 | * | ||
6 | * Creates, reads, walks and deletes directory-nodes | ||
7 | * | ||
8 | * Copyright (C) 2002, 2004 Oracle. All rights reserved. | ||
9 | * | ||
10 | * Portions of this code from linux/fs/ext3/dir.c | ||
11 | * | ||
12 | * Copyright (C) 1992, 1993, 1994, 1995 | ||
13 | * Remy Card (card@masi.ibp.fr) | ||
14 | * Laboratoire MASI - Institut Blaise pascal | ||
15 | * Universite Pierre et Marie Curie (Paris VI) | ||
16 | * | ||
17 | * from | ||
18 | * | ||
19 | * linux/fs/minix/dir.c | ||
20 | * | ||
21 | * Copyright (C) 1991, 1992 Linus Torvalds | ||
22 | * | ||
23 | * This program is free software; you can redistribute it and/or | ||
24 | * modify it under the terms of the GNU General Public | ||
25 | * License as published by the Free Software Foundation; either | ||
26 | * version 2 of the License, or (at your option) any later version. | ||
27 | * | ||
28 | * This program is distributed in the hope that it will be useful, | ||
29 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
30 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | ||
31 | * General Public License for more details. | ||
32 | * | ||
33 | * You should have received a copy of the GNU General Public | ||
34 | * License along with this program; if not, write to the | ||
35 | * Free Software Foundation, Inc., 59 Temple Place - Suite 330, | ||
36 | * Boston, MA 02111-1307, USA. | ||
37 | */ | ||
38 | |||
39 | #include <linux/fs.h> | ||
40 | #include <linux/types.h> | ||
41 | #include <linux/slab.h> | ||
42 | #include <linux/highmem.h> | ||
43 | |||
44 | #define MLOG_MASK_PREFIX ML_NAMEI | ||
45 | #include <cluster/masklog.h> | ||
46 | |||
47 | #include "ocfs2.h" | ||
48 | |||
49 | #include "alloc.h" | ||
50 | #include "dir.h" | ||
51 | #include "dlmglue.h" | ||
52 | #include "extent_map.h" | ||
53 | #include "file.h" | ||
54 | #include "inode.h" | ||
55 | #include "journal.h" | ||
56 | #include "namei.h" | ||
57 | #include "suballoc.h" | ||
58 | #include "uptodate.h" | ||
59 | |||
60 | #include "buffer_head_io.h" | ||
61 | |||
62 | static unsigned char ocfs2_filetype_table[] = { | ||
63 | DT_UNKNOWN, DT_REG, DT_DIR, DT_CHR, DT_BLK, DT_FIFO, DT_SOCK, DT_LNK | ||
64 | }; | ||
65 | |||
66 | static int ocfs2_extend_dir(struct ocfs2_super *osb, | ||
67 | struct inode *dir, | ||
68 | struct buffer_head *parent_fe_bh, | ||
69 | struct buffer_head **new_de_bh); | ||
70 | /* | ||
71 | * ocfs2_readdir() | ||
72 | * | ||
73 | */ | ||
74 | int ocfs2_readdir(struct file * filp, void * dirent, filldir_t filldir) | ||
75 | { | ||
76 | int error = 0; | ||
77 | unsigned long offset, blk; | ||
78 | int i, num, stored; | ||
79 | struct buffer_head * bh, * tmp; | ||
80 | struct ocfs2_dir_entry * de; | ||
81 | int err; | ||
82 | struct inode *inode = filp->f_dentry->d_inode; | ||
83 | struct super_block * sb = inode->i_sb; | ||
84 | int have_disk_lock = 0; | ||
85 | |||
86 | mlog_entry("dirino=%"MLFu64"\n", OCFS2_I(inode)->ip_blkno); | ||
87 | |||
88 | stored = 0; | ||
89 | bh = NULL; | ||
90 | |||
91 | error = ocfs2_meta_lock(inode, NULL, NULL, 0); | ||
92 | if (error < 0) { | ||
93 | if (error != -ENOENT) | ||
94 | mlog_errno(error); | ||
95 | /* we haven't got any yet, so propagate the error. */ | ||
96 | stored = error; | ||
97 | goto bail; | ||
98 | } | ||
99 | have_disk_lock = 1; | ||
100 | |||
101 | offset = filp->f_pos & (sb->s_blocksize - 1); | ||
102 | |||
103 | while (!error && !stored && filp->f_pos < i_size_read(inode)) { | ||
104 | blk = (filp->f_pos) >> sb->s_blocksize_bits; | ||
105 | bh = ocfs2_bread(inode, blk, &err, 0); | ||
106 | if (!bh) { | ||
107 | mlog(ML_ERROR, "directory #%"MLFu64" contains a hole " | ||
108 | "at offset %lld\n", | ||
109 | OCFS2_I(inode)->ip_blkno, | ||
110 | filp->f_pos); | ||
111 | filp->f_pos += sb->s_blocksize - offset; | ||
112 | continue; | ||
113 | } | ||
114 | |||
115 | /* | ||
116 | * Do the readahead (8k) | ||
117 | */ | ||
118 | if (!offset) { | ||
119 | for (i = 16 >> (sb->s_blocksize_bits - 9), num = 0; | ||
120 | i > 0; i--) { | ||
121 | tmp = ocfs2_bread(inode, ++blk, &err, 1); | ||
122 | if (tmp) | ||
123 | brelse(tmp); | ||
124 | } | ||
125 | } | ||
126 | |||
127 | revalidate: | ||
128 | /* If the dir block has changed since the last call to | ||
129 | * readdir(2), then we might be pointing to an invalid | ||
130 | * dirent right now. Scan from the start of the block | ||
131 | * to make sure. */ | ||
132 | if (filp->f_version != inode->i_version) { | ||
133 | for (i = 0; i < sb->s_blocksize && i < offset; ) { | ||
134 | de = (struct ocfs2_dir_entry *) (bh->b_data + i); | ||
135 | /* It's too expensive to do a full | ||
136 | * dirent test each time round this | ||
137 | * loop, but we do have to test at | ||
138 | * least that it is non-zero. A | ||
139 | * failure will be detected in the | ||
140 | * dirent test below. */ | ||
141 | if (le16_to_cpu(de->rec_len) < | ||
142 | OCFS2_DIR_REC_LEN(1)) | ||
143 | break; | ||
144 | i += le16_to_cpu(de->rec_len); | ||
145 | } | ||
146 | offset = i; | ||
147 | filp->f_pos = (filp->f_pos & ~(sb->s_blocksize - 1)) | ||
148 | | offset; | ||
149 | filp->f_version = inode->i_version; | ||
150 | } | ||
151 | |||
152 | while (!error && filp->f_pos < i_size_read(inode) | ||
153 | && offset < sb->s_blocksize) { | ||
154 | de = (struct ocfs2_dir_entry *) (bh->b_data + offset); | ||
155 | if (!ocfs2_check_dir_entry(inode, de, bh, offset)) { | ||
156 | /* On error, skip the f_pos to the | ||
157 | next block. */ | ||
158 | filp->f_pos = (filp->f_pos | | ||
159 | (sb->s_blocksize - 1)) + 1; | ||
160 | brelse(bh); | ||
161 | goto bail; | ||
162 | } | ||
163 | offset += le16_to_cpu(de->rec_len); | ||
164 | if (le64_to_cpu(de->inode)) { | ||
165 | /* We might block in the next section | ||
166 | * if the data destination is | ||
167 | * currently swapped out. So, use a | ||
168 | * version stamp to detect whether or | ||
169 | * not the directory has been modified | ||
170 | * during the copy operation. | ||
171 | */ | ||
172 | unsigned long version = filp->f_version; | ||
173 | unsigned char d_type = DT_UNKNOWN; | ||
174 | |||
175 | if (de->file_type < OCFS2_FT_MAX) | ||
176 | d_type = ocfs2_filetype_table[de->file_type]; | ||
177 | error = filldir(dirent, de->name, | ||
178 | de->name_len, | ||
179 | filp->f_pos, | ||
180 | ino_from_blkno(sb, le64_to_cpu(de->inode)), | ||
181 | d_type); | ||
182 | if (error) | ||
183 | break; | ||
184 | if (version != filp->f_version) | ||
185 | goto revalidate; | ||
186 | stored ++; | ||
187 | } | ||
188 | filp->f_pos += le16_to_cpu(de->rec_len); | ||
189 | } | ||
190 | offset = 0; | ||
191 | brelse(bh); | ||
192 | } | ||
193 | |||
194 | stored = 0; | ||
195 | bail: | ||
196 | if (have_disk_lock) | ||
197 | ocfs2_meta_unlock(inode, 0); | ||
198 | |||
199 | mlog_exit(stored); | ||
200 | |||
201 | return stored; | ||
202 | } | ||
203 | |||
204 | /* | ||
205 | * NOTE: this should always be called with parent dir i_sem taken. | ||
206 | */ | ||
207 | int ocfs2_find_files_on_disk(const char *name, | ||
208 | int namelen, | ||
209 | u64 *blkno, | ||
210 | struct inode *inode, | ||
211 | struct buffer_head **dirent_bh, | ||
212 | struct ocfs2_dir_entry **dirent) | ||
213 | { | ||
214 | int status = -ENOENT; | ||
215 | struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); | ||
216 | |||
217 | mlog_entry("(osb=%p, parent=%"MLFu64", name='%.*s', blkno=%p, " | ||
218 | "inode=%p)\n", | ||
219 | osb, OCFS2_I(inode)->ip_blkno, namelen, name, blkno, inode); | ||
220 | |||
221 | *dirent_bh = ocfs2_find_entry(name, namelen, inode, dirent); | ||
222 | if (!*dirent_bh || !*dirent) { | ||
223 | status = -ENOENT; | ||
224 | goto leave; | ||
225 | } | ||
226 | |||
227 | *blkno = le64_to_cpu((*dirent)->inode); | ||
228 | |||
229 | status = 0; | ||
230 | leave: | ||
231 | if (status < 0) { | ||
232 | *dirent = NULL; | ||
233 | if (*dirent_bh) { | ||
234 | brelse(*dirent_bh); | ||
235 | *dirent_bh = NULL; | ||
236 | } | ||
237 | } | ||
238 | |||
239 | mlog_exit(status); | ||
240 | return status; | ||
241 | } | ||
242 | |||
243 | /* Check for a name within a directory. | ||
244 | * | ||
245 | * Return 0 if the name does not exist | ||
246 | * Return -EEXIST if the directory contains the name | ||
247 | * | ||
248 | * Callers should have i_sem + a cluster lock on dir | ||
249 | */ | ||
250 | int ocfs2_check_dir_for_entry(struct inode *dir, | ||
251 | const char *name, | ||
252 | int namelen) | ||
253 | { | ||
254 | int ret; | ||
255 | struct buffer_head *dirent_bh = NULL; | ||
256 | struct ocfs2_dir_entry *dirent = NULL; | ||
257 | |||
258 | mlog_entry("dir %"MLFu64", name '%.*s'\n", OCFS2_I(dir)->ip_blkno, | ||
259 | namelen, name); | ||
260 | |||
261 | ret = -EEXIST; | ||
262 | dirent_bh = ocfs2_find_entry(name, namelen, dir, &dirent); | ||
263 | if (dirent_bh) | ||
264 | goto bail; | ||
265 | |||
266 | ret = 0; | ||
267 | bail: | ||
268 | if (dirent_bh) | ||
269 | brelse(dirent_bh); | ||
270 | |||
271 | mlog_exit(ret); | ||
272 | return ret; | ||
273 | } | ||
274 | |||
275 | /* | ||
276 | * routine to check that the specified directory is empty (for rmdir) | ||
277 | */ | ||
278 | int ocfs2_empty_dir(struct inode *inode) | ||
279 | { | ||
280 | unsigned long offset; | ||
281 | struct buffer_head * bh; | ||
282 | struct ocfs2_dir_entry * de, * de1; | ||
283 | struct super_block * sb; | ||
284 | int err; | ||
285 | |||
286 | sb = inode->i_sb; | ||
287 | if ((i_size_read(inode) < | ||
288 | (OCFS2_DIR_REC_LEN(1) + OCFS2_DIR_REC_LEN(2))) || | ||
289 | !(bh = ocfs2_bread(inode, 0, &err, 0))) { | ||
290 | mlog(ML_ERROR, "bad directory (dir #%"MLFu64") - " | ||
291 | "no data block\n", | ||
292 | OCFS2_I(inode)->ip_blkno); | ||
293 | return 1; | ||
294 | } | ||
295 | |||
296 | de = (struct ocfs2_dir_entry *) bh->b_data; | ||
297 | de1 = (struct ocfs2_dir_entry *) | ||
298 | ((char *)de + le16_to_cpu(de->rec_len)); | ||
299 | if ((le64_to_cpu(de->inode) != OCFS2_I(inode)->ip_blkno) || | ||
300 | !le64_to_cpu(de1->inode) || | ||
301 | strcmp(".", de->name) || | ||
302 | strcmp("..", de1->name)) { | ||
303 | mlog(ML_ERROR, "bad directory (dir #%"MLFu64") - " | ||
304 | "no `.' or `..'\n", | ||
305 | OCFS2_I(inode)->ip_blkno); | ||
306 | brelse(bh); | ||
307 | return 1; | ||
308 | } | ||
309 | offset = le16_to_cpu(de->rec_len) + le16_to_cpu(de1->rec_len); | ||
310 | de = (struct ocfs2_dir_entry *)((char *)de1 + le16_to_cpu(de1->rec_len)); | ||
311 | while (offset < i_size_read(inode) ) { | ||
312 | if (!bh || (void *)de >= (void *)(bh->b_data + sb->s_blocksize)) { | ||
313 | brelse(bh); | ||
314 | bh = ocfs2_bread(inode, | ||
315 | offset >> sb->s_blocksize_bits, &err, 0); | ||
316 | if (!bh) { | ||
317 | mlog(ML_ERROR, "directory #%"MLFu64" contains " | ||
318 | "a hole at offset %lu\n", | ||
319 | OCFS2_I(inode)->ip_blkno, offset); | ||
320 | offset += sb->s_blocksize; | ||
321 | continue; | ||
322 | } | ||
323 | de = (struct ocfs2_dir_entry *) bh->b_data; | ||
324 | } | ||
325 | if (!ocfs2_check_dir_entry(inode, de, bh, offset)) { | ||
326 | brelse(bh); | ||
327 | return 1; | ||
328 | } | ||
329 | if (le64_to_cpu(de->inode)) { | ||
330 | brelse(bh); | ||
331 | return 0; | ||
332 | } | ||
333 | offset += le16_to_cpu(de->rec_len); | ||
334 | de = (struct ocfs2_dir_entry *) | ||
335 | ((char *)de + le16_to_cpu(de->rec_len)); | ||
336 | } | ||
337 | brelse(bh); | ||
338 | return 1; | ||
339 | } | ||
340 | |||
341 | /* returns a bh of the 1st new block in the allocation. */ | ||
342 | int ocfs2_do_extend_dir(struct super_block *sb, | ||
343 | struct ocfs2_journal_handle *handle, | ||
344 | struct inode *dir, | ||
345 | struct buffer_head *parent_fe_bh, | ||
346 | struct ocfs2_alloc_context *data_ac, | ||
347 | struct ocfs2_alloc_context *meta_ac, | ||
348 | struct buffer_head **new_bh) | ||
349 | { | ||
350 | int status; | ||
351 | int extend; | ||
352 | u64 p_blkno; | ||
353 | |||
354 | spin_lock(&OCFS2_I(dir)->ip_lock); | ||
355 | extend = (i_size_read(dir) == ocfs2_clusters_to_bytes(sb, OCFS2_I(dir)->ip_clusters)); | ||
356 | spin_unlock(&OCFS2_I(dir)->ip_lock); | ||
357 | |||
358 | if (extend) { | ||
359 | status = ocfs2_do_extend_allocation(OCFS2_SB(sb), dir, 1, | ||
360 | parent_fe_bh, handle, | ||
361 | data_ac, meta_ac, NULL); | ||
362 | BUG_ON(status == -EAGAIN); | ||
363 | if (status < 0) { | ||
364 | mlog_errno(status); | ||
365 | goto bail; | ||
366 | } | ||
367 | } | ||
368 | |||
369 | status = ocfs2_extent_map_get_blocks(dir, (dir->i_blocks >> | ||
370 | (sb->s_blocksize_bits - 9)), | ||
371 | 1, &p_blkno, NULL); | ||
372 | if (status < 0) { | ||
373 | mlog_errno(status); | ||
374 | goto bail; | ||
375 | } | ||
376 | |||
377 | *new_bh = sb_getblk(sb, p_blkno); | ||
378 | if (!*new_bh) { | ||
379 | status = -EIO; | ||
380 | mlog_errno(status); | ||
381 | goto bail; | ||
382 | } | ||
383 | status = 0; | ||
384 | bail: | ||
385 | mlog_exit(status); | ||
386 | return status; | ||
387 | } | ||
388 | |||
389 | /* assumes you already have a cluster lock on the directory. */ | ||
390 | static int ocfs2_extend_dir(struct ocfs2_super *osb, | ||
391 | struct inode *dir, | ||
392 | struct buffer_head *parent_fe_bh, | ||
393 | struct buffer_head **new_de_bh) | ||
394 | { | ||
395 | int status = 0; | ||
396 | int credits, num_free_extents; | ||
397 | loff_t dir_i_size; | ||
398 | struct ocfs2_dinode *fe = (struct ocfs2_dinode *) parent_fe_bh->b_data; | ||
399 | struct ocfs2_alloc_context *data_ac = NULL; | ||
400 | struct ocfs2_alloc_context *meta_ac = NULL; | ||
401 | struct ocfs2_journal_handle *handle = NULL; | ||
402 | struct buffer_head *new_bh = NULL; | ||
403 | struct ocfs2_dir_entry * de; | ||
404 | struct super_block *sb = osb->sb; | ||
405 | |||
406 | mlog_entry_void(); | ||
407 | |||
408 | dir_i_size = i_size_read(dir); | ||
409 | mlog(0, "extending dir %"MLFu64" (i_size = %lld)\n", | ||
410 | OCFS2_I(dir)->ip_blkno, dir_i_size); | ||
411 | |||
412 | handle = ocfs2_alloc_handle(osb); | ||
413 | if (handle == NULL) { | ||
414 | status = -ENOMEM; | ||
415 | mlog_errno(status); | ||
416 | goto bail; | ||
417 | } | ||
418 | |||
419 | /* dir->i_size is always block aligned. */ | ||
420 | spin_lock(&OCFS2_I(dir)->ip_lock); | ||
421 | if (dir_i_size == ocfs2_clusters_to_bytes(sb, OCFS2_I(dir)->ip_clusters)) { | ||
422 | spin_unlock(&OCFS2_I(dir)->ip_lock); | ||
423 | num_free_extents = ocfs2_num_free_extents(osb, dir, fe); | ||
424 | if (num_free_extents < 0) { | ||
425 | status = num_free_extents; | ||
426 | mlog_errno(status); | ||
427 | goto bail; | ||
428 | } | ||
429 | |||
430 | if (!num_free_extents) { | ||
431 | status = ocfs2_reserve_new_metadata(osb, handle, | ||
432 | fe, &meta_ac); | ||
433 | if (status < 0) { | ||
434 | if (status != -ENOSPC) | ||
435 | mlog_errno(status); | ||
436 | goto bail; | ||
437 | } | ||
438 | } | ||
439 | |||
440 | status = ocfs2_reserve_clusters(osb, handle, 1, &data_ac); | ||
441 | if (status < 0) { | ||
442 | if (status != -ENOSPC) | ||
443 | mlog_errno(status); | ||
444 | goto bail; | ||
445 | } | ||
446 | |||
447 | credits = ocfs2_calc_extend_credits(sb, fe, 1); | ||
448 | } else { | ||
449 | spin_unlock(&OCFS2_I(dir)->ip_lock); | ||
450 | credits = OCFS2_SIMPLE_DIR_EXTEND_CREDITS; | ||
451 | } | ||
452 | |||
453 | handle = ocfs2_start_trans(osb, handle, credits); | ||
454 | if (IS_ERR(handle)) { | ||
455 | status = PTR_ERR(handle); | ||
456 | handle = NULL; | ||
457 | mlog_errno(status); | ||
458 | goto bail; | ||
459 | } | ||
460 | |||
461 | status = ocfs2_do_extend_dir(osb->sb, handle, dir, parent_fe_bh, | ||
462 | data_ac, meta_ac, &new_bh); | ||
463 | if (status < 0) { | ||
464 | mlog_errno(status); | ||
465 | goto bail; | ||
466 | } | ||
467 | |||
468 | ocfs2_set_new_buffer_uptodate(dir, new_bh); | ||
469 | |||
470 | status = ocfs2_journal_access(handle, dir, new_bh, | ||
471 | OCFS2_JOURNAL_ACCESS_CREATE); | ||
472 | if (status < 0) { | ||
473 | mlog_errno(status); | ||
474 | goto bail; | ||
475 | } | ||
476 | memset(new_bh->b_data, 0, sb->s_blocksize); | ||
477 | de = (struct ocfs2_dir_entry *) new_bh->b_data; | ||
478 | de->inode = 0; | ||
479 | de->rec_len = cpu_to_le16(sb->s_blocksize); | ||
480 | status = ocfs2_journal_dirty(handle, new_bh); | ||
481 | if (status < 0) { | ||
482 | mlog_errno(status); | ||
483 | goto bail; | ||
484 | } | ||
485 | |||
486 | dir_i_size += dir->i_sb->s_blocksize; | ||
487 | i_size_write(dir, dir_i_size); | ||
488 | dir->i_blocks = ocfs2_align_bytes_to_sectors(dir_i_size); | ||
489 | status = ocfs2_mark_inode_dirty(handle, dir, parent_fe_bh); | ||
490 | if (status < 0) { | ||
491 | mlog_errno(status); | ||
492 | goto bail; | ||
493 | } | ||
494 | |||
495 | *new_de_bh = new_bh; | ||
496 | get_bh(*new_de_bh); | ||
497 | bail: | ||
498 | if (handle) | ||
499 | ocfs2_commit_trans(handle); | ||
500 | |||
501 | if (data_ac) | ||
502 | ocfs2_free_alloc_context(data_ac); | ||
503 | if (meta_ac) | ||
504 | ocfs2_free_alloc_context(meta_ac); | ||
505 | |||
506 | if (new_bh) | ||
507 | brelse(new_bh); | ||
508 | |||
509 | mlog_exit(status); | ||
510 | return status; | ||
511 | } | ||
512 | |||
513 | /* | ||
514 | * Search the dir for a good spot, extending it if necessary. The | ||
515 | * block containing an appropriate record is returned in ret_de_bh. | ||
516 | */ | ||
517 | int ocfs2_prepare_dir_for_insert(struct ocfs2_super *osb, | ||
518 | struct inode *dir, | ||
519 | struct buffer_head *parent_fe_bh, | ||
520 | const char *name, | ||
521 | int namelen, | ||
522 | struct buffer_head **ret_de_bh) | ||
523 | { | ||
524 | unsigned long offset; | ||
525 | struct buffer_head * bh = NULL; | ||
526 | unsigned short rec_len; | ||
527 | struct ocfs2_dinode *fe; | ||
528 | struct ocfs2_dir_entry *de; | ||
529 | struct super_block *sb; | ||
530 | int status; | ||
531 | |||
532 | mlog_entry_void(); | ||
533 | |||
534 | mlog(0, "getting ready to insert namelen %d into dir %"MLFu64"\n", | ||
535 | namelen, OCFS2_I(dir)->ip_blkno); | ||
536 | |||
537 | BUG_ON(!S_ISDIR(dir->i_mode)); | ||
538 | fe = (struct ocfs2_dinode *) parent_fe_bh->b_data; | ||
539 | BUG_ON(le64_to_cpu(fe->i_size) != i_size_read(dir)); | ||
540 | |||
541 | sb = dir->i_sb; | ||
542 | |||
543 | if (!namelen) { | ||
544 | status = -EINVAL; | ||
545 | mlog_errno(status); | ||
546 | goto bail; | ||
547 | } | ||
548 | |||
549 | bh = ocfs2_bread(dir, 0, &status, 0); | ||
550 | if (!bh) { | ||
551 | mlog_errno(status); | ||
552 | goto bail; | ||
553 | } | ||
554 | |||
555 | rec_len = OCFS2_DIR_REC_LEN(namelen); | ||
556 | offset = 0; | ||
557 | de = (struct ocfs2_dir_entry *) bh->b_data; | ||
558 | while (1) { | ||
559 | if ((char *)de >= sb->s_blocksize + bh->b_data) { | ||
560 | brelse(bh); | ||
561 | bh = NULL; | ||
562 | |||
563 | if (i_size_read(dir) <= offset) { | ||
564 | status = ocfs2_extend_dir(osb, | ||
565 | dir, | ||
566 | parent_fe_bh, | ||
567 | &bh); | ||
568 | if (status < 0) { | ||
569 | mlog_errno(status); | ||
570 | goto bail; | ||
571 | } | ||
572 | BUG_ON(!bh); | ||
573 | *ret_de_bh = bh; | ||
574 | get_bh(*ret_de_bh); | ||
575 | goto bail; | ||
576 | } | ||
577 | bh = ocfs2_bread(dir, | ||
578 | offset >> sb->s_blocksize_bits, | ||
579 | &status, | ||
580 | 0); | ||
581 | if (!bh) { | ||
582 | mlog_errno(status); | ||
583 | goto bail; | ||
584 | } | ||
585 | /* move to next block */ | ||
586 | de = (struct ocfs2_dir_entry *) bh->b_data; | ||
587 | } | ||
588 | if (!ocfs2_check_dir_entry(dir, de, bh, offset)) { | ||
589 | status = -ENOENT; | ||
590 | goto bail; | ||
591 | } | ||
592 | if (ocfs2_match(namelen, name, de)) { | ||
593 | status = -EEXIST; | ||
594 | goto bail; | ||
595 | } | ||
596 | if (((le64_to_cpu(de->inode) == 0) && | ||
597 | (le16_to_cpu(de->rec_len) >= rec_len)) || | ||
598 | (le16_to_cpu(de->rec_len) >= | ||
599 | (OCFS2_DIR_REC_LEN(de->name_len) + rec_len))) { | ||
600 | /* Ok, we found a spot. Return this bh and let | ||
601 | * the caller actually fill it in. */ | ||
602 | *ret_de_bh = bh; | ||
603 | get_bh(*ret_de_bh); | ||
604 | status = 0; | ||
605 | goto bail; | ||
606 | } | ||
607 | offset += le16_to_cpu(de->rec_len); | ||
608 | de = (struct ocfs2_dir_entry *)((char *) de + le16_to_cpu(de->rec_len)); | ||
609 | } | ||
610 | |||
611 | status = 0; | ||
612 | bail: | ||
613 | if (bh) | ||
614 | brelse(bh); | ||
615 | |||
616 | mlog_exit(status); | ||
617 | return status; | ||
618 | } | ||
diff --git a/fs/ocfs2/dir.h b/fs/ocfs2/dir.h new file mode 100644 index 000000000000..5f614ec9649c --- /dev/null +++ b/fs/ocfs2/dir.h | |||
@@ -0,0 +1,54 @@ | |||
1 | /* -*- mode: c; c-basic-offset: 8; -*- | ||
2 | * vim: noexpandtab sw=8 ts=8 sts=0: | ||
3 | * | ||
4 | * dir.h | ||
5 | * | ||
6 | * Function prototypes | ||
7 | * | ||
8 | * Copyright (C) 2002, 2004 Oracle. All rights reserved. | ||
9 | * | ||
10 | * This program is free software; you can redistribute it and/or | ||
11 | * modify it under the terms of the GNU General Public | ||
12 | * License as published by the Free Software Foundation; either | ||
13 | * version 2 of the License, or (at your option) any later version. | ||
14 | * | ||
15 | * This program is distributed in the hope that it will be useful, | ||
16 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
17 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | ||
18 | * General Public License for more details. | ||
19 | * | ||
20 | * You should have received a copy of the GNU General Public | ||
21 | * License along with this program; if not, write to the | ||
22 | * Free Software Foundation, Inc., 59 Temple Place - Suite 330, | ||
23 | * Boston, MA 02111-1307, USA. | ||
24 | */ | ||
25 | |||
26 | #ifndef OCFS2_DIR_H | ||
27 | #define OCFS2_DIR_H | ||
28 | |||
29 | int ocfs2_check_dir_for_entry(struct inode *dir, | ||
30 | const char *name, | ||
31 | int namelen); | ||
32 | int ocfs2_empty_dir(struct inode *inode); /* FIXME: to namei.c */ | ||
33 | int ocfs2_find_files_on_disk(const char *name, | ||
34 | int namelen, | ||
35 | u64 *blkno, | ||
36 | struct inode *inode, | ||
37 | struct buffer_head **dirent_bh, | ||
38 | struct ocfs2_dir_entry **dirent); | ||
39 | int ocfs2_readdir(struct file *filp, void *dirent, filldir_t filldir); | ||
40 | int ocfs2_prepare_dir_for_insert(struct ocfs2_super *osb, | ||
41 | struct inode *dir, | ||
42 | struct buffer_head *parent_fe_bh, | ||
43 | const char *name, | ||
44 | int namelen, | ||
45 | struct buffer_head **ret_de_bh); | ||
46 | struct ocfs2_alloc_context; | ||
47 | int ocfs2_do_extend_dir(struct super_block *sb, | ||
48 | struct ocfs2_journal_handle *handle, | ||
49 | struct inode *dir, | ||
50 | struct buffer_head *parent_fe_bh, | ||
51 | struct ocfs2_alloc_context *data_ac, | ||
52 | struct ocfs2_alloc_context *meta_ac, | ||
53 | struct buffer_head **new_bh); | ||
54 | #endif /* OCFS2_DIR_H */ | ||
diff --git a/fs/ocfs2/dlmglue.c b/fs/ocfs2/dlmglue.c new file mode 100644 index 000000000000..e971ec2f8407 --- /dev/null +++ b/fs/ocfs2/dlmglue.c | |||
@@ -0,0 +1,2904 @@ | |||
1 | /* -*- mode: c; c-basic-offset: 8; -*- | ||
2 | * vim: noexpandtab sw=8 ts=8 sts=0: | ||
3 | * | ||
4 | * dlmglue.c | ||
5 | * | ||
6 | * Code which implements an OCFS2 specific interface to our DLM. | ||
7 | * | ||
8 | * Copyright (C) 2003, 2004 Oracle. All rights reserved. | ||
9 | * | ||
10 | * This program is free software; you can redistribute it and/or | ||
11 | * modify it under the terms of the GNU General Public | ||
12 | * License as published by the Free Software Foundation; either | ||
13 | * version 2 of the License, or (at your option) any later version. | ||
14 | * | ||
15 | * This program is distributed in the hope that it will be useful, | ||
16 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
17 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | ||
18 | * General Public License for more details. | ||
19 | * | ||
20 | * You should have received a copy of the GNU General Public | ||
21 | * License along with this program; if not, write to the | ||
22 | * Free Software Foundation, Inc., 59 Temple Place - Suite 330, | ||
23 | * Boston, MA 02111-1307, USA. | ||
24 | */ | ||
25 | |||
26 | #include <linux/types.h> | ||
27 | #include <linux/slab.h> | ||
28 | #include <linux/highmem.h> | ||
29 | #include <linux/mm.h> | ||
30 | #include <linux/smp_lock.h> | ||
31 | #include <linux/crc32.h> | ||
32 | #include <linux/kthread.h> | ||
33 | #include <linux/pagemap.h> | ||
34 | #include <linux/debugfs.h> | ||
35 | #include <linux/seq_file.h> | ||
36 | |||
37 | #include <cluster/heartbeat.h> | ||
38 | #include <cluster/nodemanager.h> | ||
39 | #include <cluster/tcp.h> | ||
40 | |||
41 | #include <dlm/dlmapi.h> | ||
42 | |||
43 | #define MLOG_MASK_PREFIX ML_DLM_GLUE | ||
44 | #include <cluster/masklog.h> | ||
45 | |||
46 | #include "ocfs2.h" | ||
47 | |||
48 | #include "alloc.h" | ||
49 | #include "dlmglue.h" | ||
50 | #include "extent_map.h" | ||
51 | #include "heartbeat.h" | ||
52 | #include "inode.h" | ||
53 | #include "journal.h" | ||
54 | #include "slot_map.h" | ||
55 | #include "super.h" | ||
56 | #include "uptodate.h" | ||
57 | #include "vote.h" | ||
58 | |||
59 | #include "buffer_head_io.h" | ||
60 | |||
61 | struct ocfs2_mask_waiter { | ||
62 | struct list_head mw_item; | ||
63 | int mw_status; | ||
64 | struct completion mw_complete; | ||
65 | unsigned long mw_mask; | ||
66 | unsigned long mw_goal; | ||
67 | }; | ||
68 | |||
69 | static void ocfs2_inode_ast_func(void *opaque); | ||
70 | static void ocfs2_inode_bast_func(void *opaque, | ||
71 | int level); | ||
72 | static void ocfs2_super_ast_func(void *opaque); | ||
73 | static void ocfs2_super_bast_func(void *opaque, | ||
74 | int level); | ||
75 | static void ocfs2_rename_ast_func(void *opaque); | ||
76 | static void ocfs2_rename_bast_func(void *opaque, | ||
77 | int level); | ||
78 | |||
79 | /* so far, all locks have gotten along with the same unlock ast */ | ||
80 | static void ocfs2_unlock_ast_func(void *opaque, | ||
81 | enum dlm_status status); | ||
82 | static int ocfs2_do_unblock_meta(struct inode *inode, | ||
83 | int *requeue); | ||
84 | static int ocfs2_unblock_meta(struct ocfs2_lock_res *lockres, | ||
85 | int *requeue); | ||
86 | static int ocfs2_unblock_data(struct ocfs2_lock_res *lockres, | ||
87 | int *requeue); | ||
88 | static int ocfs2_unblock_inode_lock(struct ocfs2_lock_res *lockres, | ||
89 | int *requeue); | ||
90 | static int ocfs2_unblock_osb_lock(struct ocfs2_lock_res *lockres, | ||
91 | int *requeue); | ||
92 | typedef void (ocfs2_convert_worker_t)(struct ocfs2_lock_res *, int); | ||
93 | static int ocfs2_generic_unblock_lock(struct ocfs2_super *osb, | ||
94 | struct ocfs2_lock_res *lockres, | ||
95 | int *requeue, | ||
96 | ocfs2_convert_worker_t *worker); | ||
97 | |||
98 | struct ocfs2_lock_res_ops { | ||
99 | void (*ast)(void *); | ||
100 | void (*bast)(void *, int); | ||
101 | void (*unlock_ast)(void *, enum dlm_status); | ||
102 | int (*unblock)(struct ocfs2_lock_res *, int *); | ||
103 | }; | ||
104 | |||
105 | static struct ocfs2_lock_res_ops ocfs2_inode_rw_lops = { | ||
106 | .ast = ocfs2_inode_ast_func, | ||
107 | .bast = ocfs2_inode_bast_func, | ||
108 | .unlock_ast = ocfs2_unlock_ast_func, | ||
109 | .unblock = ocfs2_unblock_inode_lock, | ||
110 | }; | ||
111 | |||
112 | static struct ocfs2_lock_res_ops ocfs2_inode_meta_lops = { | ||
113 | .ast = ocfs2_inode_ast_func, | ||
114 | .bast = ocfs2_inode_bast_func, | ||
115 | .unlock_ast = ocfs2_unlock_ast_func, | ||
116 | .unblock = ocfs2_unblock_meta, | ||
117 | }; | ||
118 | |||
119 | static void ocfs2_data_convert_worker(struct ocfs2_lock_res *lockres, | ||
120 | int blocking); | ||
121 | |||
122 | static struct ocfs2_lock_res_ops ocfs2_inode_data_lops = { | ||
123 | .ast = ocfs2_inode_ast_func, | ||
124 | .bast = ocfs2_inode_bast_func, | ||
125 | .unlock_ast = ocfs2_unlock_ast_func, | ||
126 | .unblock = ocfs2_unblock_data, | ||
127 | }; | ||
128 | |||
129 | static struct ocfs2_lock_res_ops ocfs2_super_lops = { | ||
130 | .ast = ocfs2_super_ast_func, | ||
131 | .bast = ocfs2_super_bast_func, | ||
132 | .unlock_ast = ocfs2_unlock_ast_func, | ||
133 | .unblock = ocfs2_unblock_osb_lock, | ||
134 | }; | ||
135 | |||
136 | static struct ocfs2_lock_res_ops ocfs2_rename_lops = { | ||
137 | .ast = ocfs2_rename_ast_func, | ||
138 | .bast = ocfs2_rename_bast_func, | ||
139 | .unlock_ast = ocfs2_unlock_ast_func, | ||
140 | .unblock = ocfs2_unblock_osb_lock, | ||
141 | }; | ||
142 | |||
143 | static inline int ocfs2_is_inode_lock(struct ocfs2_lock_res *lockres) | ||
144 | { | ||
145 | return lockres->l_type == OCFS2_LOCK_TYPE_META || | ||
146 | lockres->l_type == OCFS2_LOCK_TYPE_DATA || | ||
147 | lockres->l_type == OCFS2_LOCK_TYPE_RW; | ||
148 | } | ||
149 | |||
150 | static inline int ocfs2_is_super_lock(struct ocfs2_lock_res *lockres) | ||
151 | { | ||
152 | return lockres->l_type == OCFS2_LOCK_TYPE_SUPER; | ||
153 | } | ||
154 | |||
155 | static inline int ocfs2_is_rename_lock(struct ocfs2_lock_res *lockres) | ||
156 | { | ||
157 | return lockres->l_type == OCFS2_LOCK_TYPE_RENAME; | ||
158 | } | ||
159 | |||
160 | static inline struct ocfs2_super *ocfs2_lock_res_super(struct ocfs2_lock_res *lockres) | ||
161 | { | ||
162 | BUG_ON(!ocfs2_is_super_lock(lockres) | ||
163 | && !ocfs2_is_rename_lock(lockres)); | ||
164 | |||
165 | return (struct ocfs2_super *) lockres->l_priv; | ||
166 | } | ||
167 | |||
168 | static inline struct inode *ocfs2_lock_res_inode(struct ocfs2_lock_res *lockres) | ||
169 | { | ||
170 | BUG_ON(!ocfs2_is_inode_lock(lockres)); | ||
171 | |||
172 | return (struct inode *) lockres->l_priv; | ||
173 | } | ||
174 | |||
175 | static int ocfs2_lock_create(struct ocfs2_super *osb, | ||
176 | struct ocfs2_lock_res *lockres, | ||
177 | int level, | ||
178 | int dlm_flags); | ||
179 | static inline int ocfs2_may_continue_on_blocked_lock(struct ocfs2_lock_res *lockres, | ||
180 | int wanted); | ||
181 | static void ocfs2_cluster_unlock(struct ocfs2_super *osb, | ||
182 | struct ocfs2_lock_res *lockres, | ||
183 | int level); | ||
184 | static inline void ocfs2_generic_handle_downconvert_action(struct ocfs2_lock_res *lockres); | ||
185 | static inline void ocfs2_generic_handle_convert_action(struct ocfs2_lock_res *lockres); | ||
186 | static inline void ocfs2_generic_handle_attach_action(struct ocfs2_lock_res *lockres); | ||
187 | static int ocfs2_generic_handle_bast(struct ocfs2_lock_res *lockres, int level); | ||
188 | static void ocfs2_schedule_blocked_lock(struct ocfs2_super *osb, | ||
189 | struct ocfs2_lock_res *lockres); | ||
190 | static inline void ocfs2_recover_from_dlm_error(struct ocfs2_lock_res *lockres, | ||
191 | int convert); | ||
192 | #define ocfs2_log_dlm_error(_func, _stat, _lockres) do { \ | ||
193 | mlog(ML_ERROR, "Dlm error \"%s\" while calling %s on " \ | ||
194 | "resource %s: %s\n", dlm_errname(_stat), _func, \ | ||
195 | _lockres->l_name, dlm_errmsg(_stat)); \ | ||
196 | } while (0) | ||
197 | static void ocfs2_vote_on_unlock(struct ocfs2_super *osb, | ||
198 | struct ocfs2_lock_res *lockres); | ||
199 | static int ocfs2_meta_lock_update(struct inode *inode, | ||
200 | struct buffer_head **bh); | ||
201 | static void ocfs2_drop_osb_locks(struct ocfs2_super *osb); | ||
202 | static inline int ocfs2_highest_compat_lock_level(int level); | ||
203 | static inline int ocfs2_can_downconvert_meta_lock(struct inode *inode, | ||
204 | struct ocfs2_lock_res *lockres, | ||
205 | int new_level); | ||
206 | |||
207 | static char *ocfs2_lock_type_strings[] = { | ||
208 | [OCFS2_LOCK_TYPE_META] = "Meta", | ||
209 | [OCFS2_LOCK_TYPE_DATA] = "Data", | ||
210 | [OCFS2_LOCK_TYPE_SUPER] = "Super", | ||
211 | [OCFS2_LOCK_TYPE_RENAME] = "Rename", | ||
212 | /* Need to differentiate from [R]ename.. serializing writes is the | ||
213 | * important job it does, anyway. */ | ||
214 | [OCFS2_LOCK_TYPE_RW] = "Write/Read", | ||
215 | }; | ||
216 | |||
217 | static char *ocfs2_lock_type_string(enum ocfs2_lock_type type) | ||
218 | { | ||
219 | mlog_bug_on_msg(type >= OCFS2_NUM_LOCK_TYPES, "%d\n", type); | ||
220 | return ocfs2_lock_type_strings[type]; | ||
221 | } | ||
222 | |||
223 | static void ocfs2_build_lock_name(enum ocfs2_lock_type type, | ||
224 | u64 blkno, | ||
225 | u32 generation, | ||
226 | char *name) | ||
227 | { | ||
228 | int len; | ||
229 | |||
230 | mlog_entry_void(); | ||
231 | |||
232 | BUG_ON(type >= OCFS2_NUM_LOCK_TYPES); | ||
233 | |||
234 | len = snprintf(name, OCFS2_LOCK_ID_MAX_LEN, "%c%s%016"MLFx64"%08x", | ||
235 | ocfs2_lock_type_char(type), OCFS2_LOCK_ID_PAD, blkno, | ||
236 | generation); | ||
237 | |||
238 | BUG_ON(len != (OCFS2_LOCK_ID_MAX_LEN - 1)); | ||
239 | |||
240 | mlog(0, "built lock resource with name: %s\n", name); | ||
241 | |||
242 | mlog_exit_void(); | ||
243 | } | ||
244 | |||
245 | static spinlock_t ocfs2_dlm_tracking_lock = SPIN_LOCK_UNLOCKED; | ||
246 | |||
247 | static void ocfs2_add_lockres_tracking(struct ocfs2_lock_res *res, | ||
248 | struct ocfs2_dlm_debug *dlm_debug) | ||
249 | { | ||
250 | mlog(0, "Add tracking for lockres %s\n", res->l_name); | ||
251 | |||
252 | spin_lock(&ocfs2_dlm_tracking_lock); | ||
253 | list_add(&res->l_debug_list, &dlm_debug->d_lockres_tracking); | ||
254 | spin_unlock(&ocfs2_dlm_tracking_lock); | ||
255 | } | ||
256 | |||
257 | static void ocfs2_remove_lockres_tracking(struct ocfs2_lock_res *res) | ||
258 | { | ||
259 | spin_lock(&ocfs2_dlm_tracking_lock); | ||
260 | if (!list_empty(&res->l_debug_list)) | ||
261 | list_del_init(&res->l_debug_list); | ||
262 | spin_unlock(&ocfs2_dlm_tracking_lock); | ||
263 | } | ||
264 | |||
265 | static void ocfs2_lock_res_init_common(struct ocfs2_super *osb, | ||
266 | struct ocfs2_lock_res *res, | ||
267 | enum ocfs2_lock_type type, | ||
268 | u64 blkno, | ||
269 | u32 generation, | ||
270 | struct ocfs2_lock_res_ops *ops, | ||
271 | void *priv) | ||
272 | { | ||
273 | ocfs2_build_lock_name(type, blkno, generation, res->l_name); | ||
274 | |||
275 | res->l_type = type; | ||
276 | res->l_ops = ops; | ||
277 | res->l_priv = priv; | ||
278 | |||
279 | res->l_level = LKM_IVMODE; | ||
280 | res->l_requested = LKM_IVMODE; | ||
281 | res->l_blocking = LKM_IVMODE; | ||
282 | res->l_action = OCFS2_AST_INVALID; | ||
283 | res->l_unlock_action = OCFS2_UNLOCK_INVALID; | ||
284 | |||
285 | res->l_flags = OCFS2_LOCK_INITIALIZED; | ||
286 | |||
287 | ocfs2_add_lockres_tracking(res, osb->osb_dlm_debug); | ||
288 | } | ||
289 | |||
290 | void ocfs2_lock_res_init_once(struct ocfs2_lock_res *res) | ||
291 | { | ||
292 | /* This also clears out the lock status block */ | ||
293 | memset(res, 0, sizeof(struct ocfs2_lock_res)); | ||
294 | spin_lock_init(&res->l_lock); | ||
295 | init_waitqueue_head(&res->l_event); | ||
296 | INIT_LIST_HEAD(&res->l_blocked_list); | ||
297 | INIT_LIST_HEAD(&res->l_mask_waiters); | ||
298 | } | ||
299 | |||
300 | void ocfs2_inode_lock_res_init(struct ocfs2_lock_res *res, | ||
301 | enum ocfs2_lock_type type, | ||
302 | struct inode *inode) | ||
303 | { | ||
304 | struct ocfs2_lock_res_ops *ops; | ||
305 | |||
306 | switch(type) { | ||
307 | case OCFS2_LOCK_TYPE_RW: | ||
308 | ops = &ocfs2_inode_rw_lops; | ||
309 | break; | ||
310 | case OCFS2_LOCK_TYPE_META: | ||
311 | ops = &ocfs2_inode_meta_lops; | ||
312 | break; | ||
313 | case OCFS2_LOCK_TYPE_DATA: | ||
314 | ops = &ocfs2_inode_data_lops; | ||
315 | break; | ||
316 | default: | ||
317 | mlog_bug_on_msg(1, "type: %d\n", type); | ||
318 | ops = NULL; /* thanks, gcc */ | ||
319 | break; | ||
320 | }; | ||
321 | |||
322 | ocfs2_lock_res_init_common(OCFS2_SB(inode->i_sb), res, type, | ||
323 | OCFS2_I(inode)->ip_blkno, | ||
324 | inode->i_generation, ops, inode); | ||
325 | } | ||
326 | |||
327 | static void ocfs2_super_lock_res_init(struct ocfs2_lock_res *res, | ||
328 | struct ocfs2_super *osb) | ||
329 | { | ||
330 | /* Superblock lockres doesn't come from a slab so we call init | ||
331 | * once on it manually. */ | ||
332 | ocfs2_lock_res_init_once(res); | ||
333 | ocfs2_lock_res_init_common(osb, res, OCFS2_LOCK_TYPE_SUPER, | ||
334 | OCFS2_SUPER_BLOCK_BLKNO, 0, | ||
335 | &ocfs2_super_lops, osb); | ||
336 | } | ||
337 | |||
338 | static void ocfs2_rename_lock_res_init(struct ocfs2_lock_res *res, | ||
339 | struct ocfs2_super *osb) | ||
340 | { | ||
341 | /* Rename lockres doesn't come from a slab so we call init | ||
342 | * once on it manually. */ | ||
343 | ocfs2_lock_res_init_once(res); | ||
344 | ocfs2_lock_res_init_common(osb, res, OCFS2_LOCK_TYPE_RENAME, 0, 0, | ||
345 | &ocfs2_rename_lops, osb); | ||
346 | } | ||
347 | |||
348 | void ocfs2_lock_res_free(struct ocfs2_lock_res *res) | ||
349 | { | ||
350 | mlog_entry_void(); | ||
351 | |||
352 | if (!(res->l_flags & OCFS2_LOCK_INITIALIZED)) | ||
353 | return; | ||
354 | |||
355 | ocfs2_remove_lockres_tracking(res); | ||
356 | |||
357 | mlog_bug_on_msg(!list_empty(&res->l_blocked_list), | ||
358 | "Lockres %s is on the blocked list\n", | ||
359 | res->l_name); | ||
360 | mlog_bug_on_msg(!list_empty(&res->l_mask_waiters), | ||
361 | "Lockres %s has mask waiters pending\n", | ||
362 | res->l_name); | ||
363 | mlog_bug_on_msg(spin_is_locked(&res->l_lock), | ||
364 | "Lockres %s is locked\n", | ||
365 | res->l_name); | ||
366 | mlog_bug_on_msg(res->l_ro_holders, | ||
367 | "Lockres %s has %u ro holders\n", | ||
368 | res->l_name, res->l_ro_holders); | ||
369 | mlog_bug_on_msg(res->l_ex_holders, | ||
370 | "Lockres %s has %u ex holders\n", | ||
371 | res->l_name, res->l_ex_holders); | ||
372 | |||
373 | /* Need to clear out the lock status block for the dlm */ | ||
374 | memset(&res->l_lksb, 0, sizeof(res->l_lksb)); | ||
375 | |||
376 | res->l_flags = 0UL; | ||
377 | mlog_exit_void(); | ||
378 | } | ||
379 | |||
380 | static inline void ocfs2_inc_holders(struct ocfs2_lock_res *lockres, | ||
381 | int level) | ||
382 | { | ||
383 | mlog_entry_void(); | ||
384 | |||
385 | BUG_ON(!lockres); | ||
386 | |||
387 | switch(level) { | ||
388 | case LKM_EXMODE: | ||
389 | lockres->l_ex_holders++; | ||
390 | break; | ||
391 | case LKM_PRMODE: | ||
392 | lockres->l_ro_holders++; | ||
393 | break; | ||
394 | default: | ||
395 | BUG(); | ||
396 | } | ||
397 | |||
398 | mlog_exit_void(); | ||
399 | } | ||
400 | |||
401 | static inline void ocfs2_dec_holders(struct ocfs2_lock_res *lockres, | ||
402 | int level) | ||
403 | { | ||
404 | mlog_entry_void(); | ||
405 | |||
406 | BUG_ON(!lockres); | ||
407 | |||
408 | switch(level) { | ||
409 | case LKM_EXMODE: | ||
410 | BUG_ON(!lockres->l_ex_holders); | ||
411 | lockres->l_ex_holders--; | ||
412 | break; | ||
413 | case LKM_PRMODE: | ||
414 | BUG_ON(!lockres->l_ro_holders); | ||
415 | lockres->l_ro_holders--; | ||
416 | break; | ||
417 | default: | ||
418 | BUG(); | ||
419 | } | ||
420 | mlog_exit_void(); | ||
421 | } | ||
422 | |||
423 | /* WARNING: This function lives in a world where the only three lock | ||
424 | * levels are EX, PR, and NL. It *will* have to be adjusted when more | ||
425 | * lock types are added. */ | ||
426 | static inline int ocfs2_highest_compat_lock_level(int level) | ||
427 | { | ||
428 | int new_level = LKM_EXMODE; | ||
429 | |||
430 | if (level == LKM_EXMODE) | ||
431 | new_level = LKM_NLMODE; | ||
432 | else if (level == LKM_PRMODE) | ||
433 | new_level = LKM_PRMODE; | ||
434 | return new_level; | ||
435 | } | ||
436 | |||
437 | static void lockres_set_flags(struct ocfs2_lock_res *lockres, | ||
438 | unsigned long newflags) | ||
439 | { | ||
440 | struct list_head *pos, *tmp; | ||
441 | struct ocfs2_mask_waiter *mw; | ||
442 | |||
443 | assert_spin_locked(&lockres->l_lock); | ||
444 | |||
445 | lockres->l_flags = newflags; | ||
446 | |||
447 | list_for_each_safe(pos, tmp, &lockres->l_mask_waiters) { | ||
448 | mw = list_entry(pos, struct ocfs2_mask_waiter, mw_item); | ||
449 | if ((lockres->l_flags & mw->mw_mask) != mw->mw_goal) | ||
450 | continue; | ||
451 | |||
452 | list_del_init(&mw->mw_item); | ||
453 | mw->mw_status = 0; | ||
454 | complete(&mw->mw_complete); | ||
455 | } | ||
456 | } | ||
457 | static void lockres_or_flags(struct ocfs2_lock_res *lockres, unsigned long or) | ||
458 | { | ||
459 | lockres_set_flags(lockres, lockres->l_flags | or); | ||
460 | } | ||
461 | static void lockres_clear_flags(struct ocfs2_lock_res *lockres, | ||
462 | unsigned long clear) | ||
463 | { | ||
464 | lockres_set_flags(lockres, lockres->l_flags & ~clear); | ||
465 | } | ||
466 | |||
467 | static inline void ocfs2_generic_handle_downconvert_action(struct ocfs2_lock_res *lockres) | ||
468 | { | ||
469 | mlog_entry_void(); | ||
470 | |||
471 | BUG_ON(!(lockres->l_flags & OCFS2_LOCK_BUSY)); | ||
472 | BUG_ON(!(lockres->l_flags & OCFS2_LOCK_ATTACHED)); | ||
473 | BUG_ON(!(lockres->l_flags & OCFS2_LOCK_BLOCKED)); | ||
474 | BUG_ON(lockres->l_blocking <= LKM_NLMODE); | ||
475 | |||
476 | lockres->l_level = lockres->l_requested; | ||
477 | if (lockres->l_level <= | ||
478 | ocfs2_highest_compat_lock_level(lockres->l_blocking)) { | ||
479 | lockres->l_blocking = LKM_NLMODE; | ||
480 | lockres_clear_flags(lockres, OCFS2_LOCK_BLOCKED); | ||
481 | } | ||
482 | lockres_clear_flags(lockres, OCFS2_LOCK_BUSY); | ||
483 | |||
484 | mlog_exit_void(); | ||
485 | } | ||
486 | |||
487 | static inline void ocfs2_generic_handle_convert_action(struct ocfs2_lock_res *lockres) | ||
488 | { | ||
489 | mlog_entry_void(); | ||
490 | |||
491 | BUG_ON(!(lockres->l_flags & OCFS2_LOCK_BUSY)); | ||
492 | BUG_ON(!(lockres->l_flags & OCFS2_LOCK_ATTACHED)); | ||
493 | |||
494 | /* Convert from RO to EX doesn't really need anything as our | ||
495 | * information is already up to date. Convert from NL to | ||
496 | * *anything* however should mark ourselves as needing an | ||
497 | * update */ | ||
498 | if (lockres->l_level == LKM_NLMODE) | ||
499 | lockres_or_flags(lockres, OCFS2_LOCK_NEEDS_REFRESH); | ||
500 | |||
501 | lockres->l_level = lockres->l_requested; | ||
502 | lockres_clear_flags(lockres, OCFS2_LOCK_BUSY); | ||
503 | |||
504 | mlog_exit_void(); | ||
505 | } | ||
506 | |||
507 | static inline void ocfs2_generic_handle_attach_action(struct ocfs2_lock_res *lockres) | ||
508 | { | ||
509 | mlog_entry_void(); | ||
510 | /* Precondition: an attach is in flight (BUSY) on a lockres that is not yet ATTACHED. */ | ||
511 | BUG_ON(!(lockres->l_flags & OCFS2_LOCK_BUSY)); | ||
512 | BUG_ON(lockres->l_flags & OCFS2_LOCK_ATTACHED); | ||
513 | |||
514 | if (lockres->l_requested > LKM_NLMODE && | ||
515 | !(lockres->l_flags & OCFS2_LOCK_LOCAL)) | ||
516 | lockres_or_flags(lockres, OCFS2_LOCK_NEEDS_REFRESH); | ||
517 | |||
518 | lockres->l_level = lockres->l_requested; | ||
519 | lockres_or_flags(lockres, OCFS2_LOCK_ATTACHED); | ||
520 | lockres_clear_flags(lockres, OCFS2_LOCK_BUSY); | ||
521 | |||
522 | mlog_exit_void(); | ||
523 | } | ||
524 | |||
525 | static void ocfs2_inode_ast_func(void *opaque) | ||
526 | { | ||
527 | struct ocfs2_lock_res *lockres = opaque; | ||
528 | struct inode *inode; | ||
529 | struct dlm_lockstatus *lksb; | ||
530 | unsigned long flags; | ||
531 | |||
532 | mlog_entry_void(); | ||
533 | |||
534 | inode = ocfs2_lock_res_inode(lockres); | ||
535 | |||
536 | mlog(0, "AST fired for inode %"MLFu64", l_action = %u, type = %s\n", | ||
537 | OCFS2_I(inode)->ip_blkno, lockres->l_action, | ||
538 | ocfs2_lock_type_string(lockres->l_type)); | ||
539 | |||
540 | BUG_ON(!ocfs2_is_inode_lock(lockres)); | ||
541 | |||
542 | spin_lock_irqsave(&lockres->l_lock, flags); | ||
543 | |||
544 | lksb = &(lockres->l_lksb); | ||
545 | if (lksb->status != DLM_NORMAL) { | ||
546 | mlog(ML_ERROR, "ocfs2_inode_ast_func: lksb status value of %u " | ||
547 | "on inode %"MLFu64"\n", lksb->status, | ||
548 | OCFS2_I(inode)->ip_blkno); | ||
549 | spin_unlock_irqrestore(&lockres->l_lock, flags); | ||
550 | mlog_exit_void(); | ||
551 | return; | ||
552 | } | ||
553 | |||
554 | switch(lockres->l_action) { | ||
555 | case OCFS2_AST_ATTACH: | ||
556 | ocfs2_generic_handle_attach_action(lockres); | ||
557 | lockres_clear_flags(lockres, OCFS2_LOCK_LOCAL); | ||
558 | break; | ||
559 | case OCFS2_AST_CONVERT: | ||
560 | ocfs2_generic_handle_convert_action(lockres); | ||
561 | break; | ||
562 | case OCFS2_AST_DOWNCONVERT: | ||
563 | ocfs2_generic_handle_downconvert_action(lockres); | ||
564 | break; | ||
565 | default: | ||
566 | mlog(ML_ERROR, "lockres %s: ast fired with invalid action: %u " | ||
567 | "lockres flags = 0x%lx, unlock action: %u\n", | ||
568 | lockres->l_name, lockres->l_action, lockres->l_flags, | ||
569 | lockres->l_unlock_action); | ||
570 | |||
571 | BUG(); | ||
572 | } | ||
573 | |||
574 | /* data and rw locking ignores refresh flag for now. */ | ||
575 | if (lockres->l_type != OCFS2_LOCK_TYPE_META) | ||
576 | lockres_clear_flags(lockres, OCFS2_LOCK_NEEDS_REFRESH); | ||
577 | |||
578 | /* set it to something invalid so if we get called again we | ||
579 | * can catch it. */ | ||
580 | lockres->l_action = OCFS2_AST_INVALID; | ||
581 | spin_unlock_irqrestore(&lockres->l_lock, flags); | ||
582 | wake_up(&lockres->l_event); | ||
583 | |||
584 | mlog_exit_void(); | ||
585 | } | ||
586 | |||
587 | static int ocfs2_generic_handle_bast(struct ocfs2_lock_res *lockres, | ||
588 | int level) | ||
589 | { | ||
590 | int needs_downconvert = 0; | ||
591 | mlog_entry_void(); | ||
592 | |||
593 | assert_spin_locked(&lockres->l_lock); | ||
594 | |||
595 | lockres_or_flags(lockres, OCFS2_LOCK_BLOCKED); | ||
596 | |||
597 | if (level > lockres->l_blocking) { | ||
598 | /* only schedule a downconvert if we haven't already scheduled | ||
599 | * one that goes low enough to satisfy the level we're | ||
600 | * blocking. this also catches the case where we get | ||
601 | * duplicate BASTs */ | ||
602 | if (ocfs2_highest_compat_lock_level(level) < | ||
603 | ocfs2_highest_compat_lock_level(lockres->l_blocking)) | ||
604 | needs_downconvert = 1; | ||
605 | |||
606 | lockres->l_blocking = level; | ||
607 | } | ||
608 | |||
609 | mlog_exit(needs_downconvert); | ||
610 | return needs_downconvert; | ||
611 | } | ||
612 | |||
613 | static void ocfs2_generic_bast_func(struct ocfs2_super *osb, | ||
614 | struct ocfs2_lock_res *lockres, | ||
615 | int level) | ||
616 | { | ||
617 | int needs_downconvert; | ||
618 | unsigned long flags; | ||
619 | |||
620 | mlog_entry_void(); | ||
621 | |||
622 | BUG_ON(level <= LKM_NLMODE); | ||
623 | |||
624 | spin_lock_irqsave(&lockres->l_lock, flags); | ||
625 | needs_downconvert = ocfs2_generic_handle_bast(lockres, level); | ||
626 | if (needs_downconvert) | ||
627 | ocfs2_schedule_blocked_lock(osb, lockres); | ||
628 | spin_unlock_irqrestore(&lockres->l_lock, flags); | ||
629 | |||
630 | ocfs2_kick_vote_thread(osb); | ||
631 | |||
632 | wake_up(&lockres->l_event); | ||
633 | mlog_exit_void(); | ||
634 | } | ||
635 | |||
636 | static void ocfs2_inode_bast_func(void *opaque, int level) | ||
637 | { | ||
638 | struct ocfs2_lock_res *lockres = opaque; | ||
639 | struct inode *inode; | ||
640 | struct ocfs2_super *osb; | ||
641 | |||
642 | mlog_entry_void(); | ||
643 | |||
644 | BUG_ON(!ocfs2_is_inode_lock(lockres)); | ||
645 | |||
646 | inode = ocfs2_lock_res_inode(lockres); | ||
647 | osb = OCFS2_SB(inode->i_sb); | ||
648 | |||
649 | mlog(0, "BAST fired for inode %"MLFu64", blocking = %d, level = %d " | ||
650 | "type = %s\n", OCFS2_I(inode)->ip_blkno, level, | ||
651 | lockres->l_level, | ||
652 | ocfs2_lock_type_string(lockres->l_type)); | ||
653 | |||
654 | ocfs2_generic_bast_func(osb, lockres, level); | ||
655 | |||
656 | mlog_exit_void(); | ||
657 | } | ||
658 | |||
659 | static void ocfs2_generic_ast_func(struct ocfs2_lock_res *lockres, | ||
660 | int ignore_refresh) | ||
661 | { | ||
662 | struct dlm_lockstatus *lksb = &lockres->l_lksb; | ||
663 | unsigned long flags; | ||
664 | |||
665 | spin_lock_irqsave(&lockres->l_lock, flags); | ||
666 | |||
667 | if (lksb->status != DLM_NORMAL) { | ||
668 | mlog(ML_ERROR, "lockres %s: lksb status value of %u!\n", | ||
669 | lockres->l_name, lksb->status); | ||
670 | spin_unlock_irqrestore(&lockres->l_lock, flags); | ||
671 | return; | ||
672 | } | ||
673 | |||
674 | switch(lockres->l_action) { | ||
675 | case OCFS2_AST_ATTACH: | ||
676 | ocfs2_generic_handle_attach_action(lockres); | ||
677 | break; | ||
678 | case OCFS2_AST_CONVERT: | ||
679 | ocfs2_generic_handle_convert_action(lockres); | ||
680 | break; | ||
681 | case OCFS2_AST_DOWNCONVERT: | ||
682 | ocfs2_generic_handle_downconvert_action(lockres); | ||
683 | break; | ||
684 | default: | ||
685 | BUG(); | ||
686 | } | ||
687 | |||
688 | if (ignore_refresh) | ||
689 | lockres_clear_flags(lockres, OCFS2_LOCK_NEEDS_REFRESH); | ||
690 | |||
691 | /* set it to something invalid so if we get called again we | ||
692 | * can catch it. */ | ||
693 | lockres->l_action = OCFS2_AST_INVALID; | ||
694 | spin_unlock_irqrestore(&lockres->l_lock, flags); | ||
695 | |||
696 | wake_up(&lockres->l_event); | ||
697 | } | ||
698 | |||
/* AST handler for the superblock lock.  Runs the generic AST code
 * with ignore_refresh = 0, so NEEDS_REFRESH is left as the action
 * handlers set it. */
static void ocfs2_super_ast_func(void *opaque)
{
	struct ocfs2_lock_res *res = opaque;

	mlog_entry_void();
	mlog(0, "Superblock AST fired\n");

	BUG_ON(!ocfs2_is_super_lock(res));
	ocfs2_generic_ast_func(res, 0);

	mlog_exit_void();
}
711 | |||
/* BAST handler for the superblock lock.  Looks up the owning super
 * from the lockres and hands off to ocfs2_generic_bast_func(). */
static void ocfs2_super_bast_func(void *opaque,
				  int level)
{
	struct ocfs2_lock_res *res = opaque;

	mlog_entry_void();
	mlog(0, "Superblock BAST fired\n");

	BUG_ON(!ocfs2_is_super_lock(res));
	ocfs2_generic_bast_func(ocfs2_lock_res_super(res), res, level);

	mlog_exit_void();
}
727 | |||
/* AST handler for the rename lock.  Runs the generic AST code with
 * ignore_refresh = 1, so NEEDS_REFRESH is always cleared. */
static void ocfs2_rename_ast_func(void *opaque)
{
	struct ocfs2_lock_res *res = opaque;

	mlog_entry_void();

	mlog(0, "Rename AST fired\n");

	BUG_ON(!ocfs2_is_rename_lock(res));

	ocfs2_generic_ast_func(res, 1);

	mlog_exit_void();
}
742 | |||
/* BAST handler for the rename lock.  Looks up the owning super from
 * the lockres and hands off to ocfs2_generic_bast_func(). */
static void ocfs2_rename_bast_func(void *opaque,
				   int level)
{
	struct ocfs2_lock_res *res = opaque;

	mlog_entry_void();

	mlog(0, "Rename BAST fired\n");

	BUG_ON(!ocfs2_is_rename_lock(res));

	ocfs2_generic_bast_func(ocfs2_lock_res_super(res), res, level);

	mlog_exit_void();
}
760 | |||
761 | static inline void ocfs2_recover_from_dlm_error(struct ocfs2_lock_res *lockres, | ||
762 | int convert) | ||
763 | { | ||
764 | unsigned long flags; | ||
765 | |||
766 | mlog_entry_void(); | ||
767 | spin_lock_irqsave(&lockres->l_lock, flags); | ||
768 | lockres_clear_flags(lockres, OCFS2_LOCK_BUSY); | ||
769 | if (convert) | ||
770 | lockres->l_action = OCFS2_AST_INVALID; | ||
771 | else | ||
772 | lockres->l_unlock_action = OCFS2_UNLOCK_INVALID; | ||
773 | spin_unlock_irqrestore(&lockres->l_lock, flags); | ||
774 | |||
775 | wake_up(&lockres->l_event); | ||
776 | mlog_exit_void(); | ||
777 | } | ||
778 | |||
779 | /* Note: If we detect another process working on the lock (i.e., | ||
780 | * OCFS2_LOCK_BUSY), we'll bail out returning 0. It's up to the caller | ||
781 | * to do the right thing in that case. | ||
782 | */ | ||
783 | static int ocfs2_lock_create(struct ocfs2_super *osb, | ||
784 | struct ocfs2_lock_res *lockres, | ||
785 | int level, | ||
786 | int dlm_flags) | ||
787 | { | ||
788 | int ret = 0; | ||
789 | enum dlm_status status; | ||
790 | unsigned long flags; | ||
791 | |||
792 | mlog_entry_void(); | ||
793 | |||
794 | mlog(0, "lock %s, level = %d, flags = %d\n", lockres->l_name, level, | ||
795 | dlm_flags); | ||
796 | |||
797 | spin_lock_irqsave(&lockres->l_lock, flags); | ||
798 | if ((lockres->l_flags & OCFS2_LOCK_ATTACHED) || | ||
799 | (lockres->l_flags & OCFS2_LOCK_BUSY)) { | ||
800 | spin_unlock_irqrestore(&lockres->l_lock, flags); | ||
801 | goto bail; | ||
802 | } | ||
803 | |||
804 | lockres->l_action = OCFS2_AST_ATTACH; | ||
805 | lockres->l_requested = level; | ||
806 | lockres_or_flags(lockres, OCFS2_LOCK_BUSY); | ||
807 | spin_unlock_irqrestore(&lockres->l_lock, flags); | ||
808 | |||
809 | status = dlmlock(osb->dlm, | ||
810 | level, | ||
811 | &lockres->l_lksb, | ||
812 | dlm_flags, | ||
813 | lockres->l_name, | ||
814 | lockres->l_ops->ast, | ||
815 | lockres, | ||
816 | lockres->l_ops->bast); | ||
817 | if (status != DLM_NORMAL) { | ||
818 | ocfs2_log_dlm_error("dlmlock", status, lockres); | ||
819 | ret = -EINVAL; | ||
820 | ocfs2_recover_from_dlm_error(lockres, 1); | ||
821 | } | ||
822 | |||
823 | mlog(0, "lock %s, successfull return from dlmlock\n", lockres->l_name); | ||
824 | |||
825 | bail: | ||
826 | mlog_exit(ret); | ||
827 | return ret; | ||
828 | } | ||
829 | |||
830 | static inline int ocfs2_check_wait_flag(struct ocfs2_lock_res *lockres, | ||
831 | int flag) | ||
832 | { | ||
833 | unsigned long flags; | ||
834 | int ret; | ||
835 | |||
836 | spin_lock_irqsave(&lockres->l_lock, flags); | ||
837 | ret = lockres->l_flags & flag; | ||
838 | spin_unlock_irqrestore(&lockres->l_lock, flags); | ||
839 | |||
840 | return ret; | ||
841 | } | ||
842 | |||
843 | static inline void ocfs2_wait_on_busy_lock(struct ocfs2_lock_res *lockres) | ||
844 | |||
845 | { | ||
846 | wait_event(lockres->l_event, | ||
847 | !ocfs2_check_wait_flag(lockres, OCFS2_LOCK_BUSY)); | ||
848 | } | ||
849 | |||
850 | static inline void ocfs2_wait_on_refreshing_lock(struct ocfs2_lock_res *lockres) | ||
851 | |||
852 | { | ||
853 | wait_event(lockres->l_event, | ||
854 | !ocfs2_check_wait_flag(lockres, OCFS2_LOCK_REFRESHING)); | ||
855 | } | ||
856 | |||
857 | /* predict what lock level we'll be dropping down to on behalf | ||
858 | * of another node, and return true if the currently wanted | ||
859 | * level will be compatible with it. */ | ||
860 | static inline int ocfs2_may_continue_on_blocked_lock(struct ocfs2_lock_res *lockres, | ||
861 | int wanted) | ||
862 | { | ||
863 | BUG_ON(!(lockres->l_flags & OCFS2_LOCK_BLOCKED)); | ||
864 | |||
865 | return wanted <= ocfs2_highest_compat_lock_level(lockres->l_blocking); | ||
866 | } | ||
867 | |||
868 | static void ocfs2_init_mask_waiter(struct ocfs2_mask_waiter *mw) | ||
869 | { | ||
870 | INIT_LIST_HEAD(&mw->mw_item); | ||
871 | init_completion(&mw->mw_complete); | ||
872 | } | ||
873 | |||
874 | static int ocfs2_wait_for_mask(struct ocfs2_mask_waiter *mw) | ||
875 | { | ||
876 | wait_for_completion(&mw->mw_complete); | ||
877 | /* Re-arm the completion in case we want to wait on it again */ | ||
878 | INIT_COMPLETION(mw->mw_complete); | ||
879 | return mw->mw_status; | ||
880 | } | ||
881 | |||
882 | static void lockres_add_mask_waiter(struct ocfs2_lock_res *lockres, | ||
883 | struct ocfs2_mask_waiter *mw, | ||
884 | unsigned long mask, | ||
885 | unsigned long goal) | ||
886 | { | ||
887 | BUG_ON(!list_empty(&mw->mw_item)); | ||
888 | |||
889 | assert_spin_locked(&lockres->l_lock); | ||
890 | |||
891 | list_add_tail(&mw->mw_item, &lockres->l_mask_waiters); | ||
892 | mw->mw_mask = mask; | ||
893 | mw->mw_goal = goal; | ||
894 | } | ||
895 | |||
896 | /* returns 0 if the mw that was removed was already satisfied, -EBUSY | ||
897 | * if the mask still hadn't reached its goal */ | ||
898 | static int lockres_remove_mask_waiter(struct ocfs2_lock_res *lockres, | ||
899 | struct ocfs2_mask_waiter *mw) | ||
900 | { | ||
901 | unsigned long flags; | ||
902 | int ret = 0; | ||
903 | |||
904 | spin_lock_irqsave(&lockres->l_lock, flags); | ||
905 | if (!list_empty(&mw->mw_item)) { | ||
906 | if ((lockres->l_flags & mw->mw_mask) != mw->mw_goal) | ||
907 | ret = -EBUSY; | ||
908 | |||
909 | list_del_init(&mw->mw_item); | ||
910 | init_completion(&mw->mw_complete); | ||
911 | } | ||
912 | spin_unlock_irqrestore(&lockres->l_lock, flags); | ||
913 | |||
914 | return ret; | ||
915 | |||
916 | } | ||
917 | |||
918 | static int ocfs2_cluster_lock(struct ocfs2_super *osb, | ||
919 | struct ocfs2_lock_res *lockres, | ||
920 | int level, | ||
921 | int lkm_flags, | ||
922 | int arg_flags) | ||
923 | { | ||
924 | struct ocfs2_mask_waiter mw; | ||
925 | enum dlm_status status; | ||
926 | int wait, catch_signals = !(osb->s_mount_opt & OCFS2_MOUNT_NOINTR); | ||
927 | int ret = 0; /* gcc doesn't realize wait = 1 guarantees ret is set */ | ||
928 | unsigned long flags; | ||
929 | |||
930 | mlog_entry_void(); | ||
931 | |||
932 | ocfs2_init_mask_waiter(&mw); | ||
933 | |||
934 | again: | ||
935 | wait = 0; | ||
936 | |||
937 | if (catch_signals && signal_pending(current)) { | ||
938 | ret = -ERESTARTSYS; | ||
939 | goto out; | ||
940 | } | ||
941 | |||
942 | spin_lock_irqsave(&lockres->l_lock, flags); | ||
943 | |||
944 | mlog_bug_on_msg(lockres->l_flags & OCFS2_LOCK_FREEING, | ||
945 | "Cluster lock called on freeing lockres %s! flags " | ||
946 | "0x%lx\n", lockres->l_name, lockres->l_flags); | ||
947 | |||
948 | /* We only compare against the currently granted level | ||
949 | * here. If the lock is blocked waiting on a downconvert, | ||
950 | * we'll get caught below. */ | ||
951 | if (lockres->l_flags & OCFS2_LOCK_BUSY && | ||
952 | level > lockres->l_level) { | ||
953 | /* is someone sitting in dlm_lock? If so, wait on | ||
954 | * them. */ | ||
955 | lockres_add_mask_waiter(lockres, &mw, OCFS2_LOCK_BUSY, 0); | ||
956 | wait = 1; | ||
957 | goto unlock; | ||
958 | } | ||
959 | |||
960 | if (!(lockres->l_flags & OCFS2_LOCK_ATTACHED)) { | ||
961 | /* lock has not been created yet. */ | ||
962 | spin_unlock_irqrestore(&lockres->l_lock, flags); | ||
963 | |||
964 | ret = ocfs2_lock_create(osb, lockres, LKM_NLMODE, 0); | ||
965 | if (ret < 0) { | ||
966 | mlog_errno(ret); | ||
967 | goto out; | ||
968 | } | ||
969 | goto again; | ||
970 | } | ||
971 | |||
972 | if (lockres->l_flags & OCFS2_LOCK_BLOCKED && | ||
973 | !ocfs2_may_continue_on_blocked_lock(lockres, level)) { | ||
974 | /* is the lock is currently blocked on behalf of | ||
975 | * another node */ | ||
976 | lockres_add_mask_waiter(lockres, &mw, OCFS2_LOCK_BLOCKED, 0); | ||
977 | wait = 1; | ||
978 | goto unlock; | ||
979 | } | ||
980 | |||
981 | if (level > lockres->l_level) { | ||
982 | if (lockres->l_action != OCFS2_AST_INVALID) | ||
983 | mlog(ML_ERROR, "lockres %s has action %u pending\n", | ||
984 | lockres->l_name, lockres->l_action); | ||
985 | |||
986 | lockres->l_action = OCFS2_AST_CONVERT; | ||
987 | lockres->l_requested = level; | ||
988 | lockres_or_flags(lockres, OCFS2_LOCK_BUSY); | ||
989 | spin_unlock_irqrestore(&lockres->l_lock, flags); | ||
990 | |||
991 | BUG_ON(level == LKM_IVMODE); | ||
992 | BUG_ON(level == LKM_NLMODE); | ||
993 | |||
994 | mlog(0, "lock %s, convert from %d to level = %d\n", | ||
995 | lockres->l_name, lockres->l_level, level); | ||
996 | |||
997 | /* call dlm_lock to upgrade lock now */ | ||
998 | status = dlmlock(osb->dlm, | ||
999 | level, | ||
1000 | &lockres->l_lksb, | ||
1001 | lkm_flags|LKM_CONVERT|LKM_VALBLK, | ||
1002 | lockres->l_name, | ||
1003 | lockres->l_ops->ast, | ||
1004 | lockres, | ||
1005 | lockres->l_ops->bast); | ||
1006 | if (status != DLM_NORMAL) { | ||
1007 | if ((lkm_flags & LKM_NOQUEUE) && | ||
1008 | (status == DLM_NOTQUEUED)) | ||
1009 | ret = -EAGAIN; | ||
1010 | else { | ||
1011 | ocfs2_log_dlm_error("dlmlock", status, | ||
1012 | lockres); | ||
1013 | ret = -EINVAL; | ||
1014 | } | ||
1015 | ocfs2_recover_from_dlm_error(lockres, 1); | ||
1016 | goto out; | ||
1017 | } | ||
1018 | |||
1019 | mlog(0, "lock %s, successfull return from dlmlock\n", | ||
1020 | lockres->l_name); | ||
1021 | |||
1022 | /* At this point we've gone inside the dlm and need to | ||
1023 | * complete our work regardless. */ | ||
1024 | catch_signals = 0; | ||
1025 | |||
1026 | /* wait for busy to clear and carry on */ | ||
1027 | goto again; | ||
1028 | } | ||
1029 | |||
1030 | /* Ok, if we get here then we're good to go. */ | ||
1031 | ocfs2_inc_holders(lockres, level); | ||
1032 | |||
1033 | ret = 0; | ||
1034 | unlock: | ||
1035 | spin_unlock_irqrestore(&lockres->l_lock, flags); | ||
1036 | out: | ||
1037 | /* | ||
1038 | * This is helping work around a lock inversion between the page lock | ||
1039 | * and dlm locks. One path holds the page lock while calling aops | ||
1040 | * which block acquiring dlm locks. The voting thread holds dlm | ||
1041 | * locks while acquiring page locks while down converting data locks. | ||
1042 | * This block is helping an aop path notice the inversion and back | ||
1043 | * off to unlock its page lock before trying the dlm lock again. | ||
1044 | */ | ||
1045 | if (wait && arg_flags & OCFS2_LOCK_NONBLOCK && | ||
1046 | mw.mw_mask & (OCFS2_LOCK_BUSY|OCFS2_LOCK_BLOCKED)) { | ||
1047 | wait = 0; | ||
1048 | if (lockres_remove_mask_waiter(lockres, &mw)) | ||
1049 | ret = -EAGAIN; | ||
1050 | else | ||
1051 | goto again; | ||
1052 | } | ||
1053 | if (wait) { | ||
1054 | ret = ocfs2_wait_for_mask(&mw); | ||
1055 | if (ret == 0) | ||
1056 | goto again; | ||
1057 | mlog_errno(ret); | ||
1058 | } | ||
1059 | |||
1060 | mlog_exit(ret); | ||
1061 | return ret; | ||
1062 | } | ||
1063 | |||
1064 | static void ocfs2_cluster_unlock(struct ocfs2_super *osb, | ||
1065 | struct ocfs2_lock_res *lockres, | ||
1066 | int level) | ||
1067 | { | ||
1068 | unsigned long flags; | ||
1069 | |||
1070 | mlog_entry_void(); | ||
1071 | spin_lock_irqsave(&lockres->l_lock, flags); | ||
1072 | ocfs2_dec_holders(lockres, level); | ||
1073 | ocfs2_vote_on_unlock(osb, lockres); | ||
1074 | spin_unlock_irqrestore(&lockres->l_lock, flags); | ||
1075 | mlog_exit_void(); | ||
1076 | } | ||
1077 | |||
1078 | static int ocfs2_create_new_inode_lock(struct inode *inode, | ||
1079 | struct ocfs2_lock_res *lockres) | ||
1080 | { | ||
1081 | struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); | ||
1082 | unsigned long flags; | ||
1083 | |||
1084 | spin_lock_irqsave(&lockres->l_lock, flags); | ||
1085 | BUG_ON(lockres->l_flags & OCFS2_LOCK_ATTACHED); | ||
1086 | lockres_or_flags(lockres, OCFS2_LOCK_LOCAL); | ||
1087 | spin_unlock_irqrestore(&lockres->l_lock, flags); | ||
1088 | |||
1089 | return ocfs2_lock_create(osb, lockres, LKM_EXMODE, LKM_LOCAL); | ||
1090 | } | ||
1091 | |||
1092 | /* Grants us an EX lock on the data and metadata resources, skipping | ||
1093 | * the normal cluster directory lookup. Use this ONLY on newly created | ||
1094 | * inodes which other nodes can't possibly see, and which haven't been | ||
1095 | * hashed in the inode hash yet. This can give us a good performance | ||
1096 | * increase as it'll skip the network broadcast normally associated | ||
1097 | * with creating a new lock resource. */ | ||
1098 | int ocfs2_create_new_inode_locks(struct inode *inode) | ||
1099 | { | ||
1100 | int ret; | ||
1101 | |||
1102 | BUG_ON(!inode); | ||
1103 | BUG_ON(!ocfs2_inode_is_new(inode)); | ||
1104 | |||
1105 | mlog_entry_void(); | ||
1106 | |||
1107 | mlog(0, "Inode %"MLFu64"\n", OCFS2_I(inode)->ip_blkno); | ||
1108 | |||
1109 | /* NOTE: That we don't increment any of the holder counts, nor | ||
1110 | * do we add anything to a journal handle. Since this is | ||
1111 | * supposed to be a new inode which the cluster doesn't know | ||
1112 | * about yet, there is no need to. As far as the LVB handling | ||
1113 | * is concerned, this is basically like acquiring an EX lock | ||
1114 | * on a resource which has an invalid one -- we'll set it | ||
1115 | * valid when we release the EX. */ | ||
1116 | |||
1117 | ret = ocfs2_create_new_inode_lock(inode, | ||
1118 | &OCFS2_I(inode)->ip_rw_lockres); | ||
1119 | if (ret) { | ||
1120 | mlog_errno(ret); | ||
1121 | goto bail; | ||
1122 | } | ||
1123 | |||
1124 | ret = ocfs2_create_new_inode_lock(inode, | ||
1125 | &OCFS2_I(inode)->ip_meta_lockres); | ||
1126 | if (ret) { | ||
1127 | mlog_errno(ret); | ||
1128 | goto bail; | ||
1129 | } | ||
1130 | |||
1131 | ret = ocfs2_create_new_inode_lock(inode, | ||
1132 | &OCFS2_I(inode)->ip_data_lockres); | ||
1133 | if (ret) { | ||
1134 | mlog_errno(ret); | ||
1135 | goto bail; | ||
1136 | } | ||
1137 | |||
1138 | bail: | ||
1139 | mlog_exit(ret); | ||
1140 | return ret; | ||
1141 | } | ||
1142 | |||
1143 | int ocfs2_rw_lock(struct inode *inode, int write) | ||
1144 | { | ||
1145 | int status, level; | ||
1146 | struct ocfs2_lock_res *lockres; | ||
1147 | |||
1148 | BUG_ON(!inode); | ||
1149 | |||
1150 | mlog_entry_void(); | ||
1151 | |||
1152 | mlog(0, "inode %"MLFu64" take %s RW lock\n", | ||
1153 | OCFS2_I(inode)->ip_blkno, | ||
1154 | write ? "EXMODE" : "PRMODE"); | ||
1155 | |||
1156 | lockres = &OCFS2_I(inode)->ip_rw_lockres; | ||
1157 | |||
1158 | level = write ? LKM_EXMODE : LKM_PRMODE; | ||
1159 | |||
1160 | status = ocfs2_cluster_lock(OCFS2_SB(inode->i_sb), lockres, level, 0, | ||
1161 | 0); | ||
1162 | if (status < 0) | ||
1163 | mlog_errno(status); | ||
1164 | |||
1165 | mlog_exit(status); | ||
1166 | return status; | ||
1167 | } | ||
1168 | |||
1169 | void ocfs2_rw_unlock(struct inode *inode, int write) | ||
1170 | { | ||
1171 | int level = write ? LKM_EXMODE : LKM_PRMODE; | ||
1172 | struct ocfs2_lock_res *lockres = &OCFS2_I(inode)->ip_rw_lockres; | ||
1173 | |||
1174 | mlog_entry_void(); | ||
1175 | |||
1176 | mlog(0, "inode %"MLFu64" drop %s RW lock\n", | ||
1177 | OCFS2_I(inode)->ip_blkno, | ||
1178 | write ? "EXMODE" : "PRMODE"); | ||
1179 | |||
1180 | ocfs2_cluster_unlock(OCFS2_SB(inode->i_sb), lockres, level); | ||
1181 | |||
1182 | mlog_exit_void(); | ||
1183 | } | ||
1184 | |||
1185 | int ocfs2_data_lock_full(struct inode *inode, | ||
1186 | int write, | ||
1187 | int arg_flags) | ||
1188 | { | ||
1189 | int status = 0, level; | ||
1190 | struct ocfs2_lock_res *lockres; | ||
1191 | |||
1192 | BUG_ON(!inode); | ||
1193 | |||
1194 | mlog_entry_void(); | ||
1195 | |||
1196 | mlog(0, "inode %"MLFu64" take %s DATA lock\n", | ||
1197 | OCFS2_I(inode)->ip_blkno, | ||
1198 | write ? "EXMODE" : "PRMODE"); | ||
1199 | |||
1200 | /* We'll allow faking a readonly data lock for | ||
1201 | * rodevices. */ | ||
1202 | if (ocfs2_is_hard_readonly(OCFS2_SB(inode->i_sb))) { | ||
1203 | if (write) { | ||
1204 | status = -EROFS; | ||
1205 | mlog_errno(status); | ||
1206 | } | ||
1207 | goto out; | ||
1208 | } | ||
1209 | |||
1210 | lockres = &OCFS2_I(inode)->ip_data_lockres; | ||
1211 | |||
1212 | level = write ? LKM_EXMODE : LKM_PRMODE; | ||
1213 | |||
1214 | status = ocfs2_cluster_lock(OCFS2_SB(inode->i_sb), lockres, level, | ||
1215 | 0, arg_flags); | ||
1216 | if (status < 0 && status != -EAGAIN) | ||
1217 | mlog_errno(status); | ||
1218 | |||
1219 | out: | ||
1220 | mlog_exit(status); | ||
1221 | return status; | ||
1222 | } | ||
1223 | |||
1224 | /* see ocfs2_meta_lock_with_page() */ | ||
1225 | int ocfs2_data_lock_with_page(struct inode *inode, | ||
1226 | int write, | ||
1227 | struct page *page) | ||
1228 | { | ||
1229 | int ret; | ||
1230 | |||
1231 | ret = ocfs2_data_lock_full(inode, write, OCFS2_LOCK_NONBLOCK); | ||
1232 | if (ret == -EAGAIN) { | ||
1233 | unlock_page(page); | ||
1234 | if (ocfs2_data_lock(inode, write) == 0) | ||
1235 | ocfs2_data_unlock(inode, write); | ||
1236 | ret = AOP_TRUNCATED_PAGE; | ||
1237 | } | ||
1238 | |||
1239 | return ret; | ||
1240 | } | ||
1241 | |||
1242 | static void ocfs2_vote_on_unlock(struct ocfs2_super *osb, | ||
1243 | struct ocfs2_lock_res *lockres) | ||
1244 | { | ||
1245 | int kick = 0; | ||
1246 | |||
1247 | mlog_entry_void(); | ||
1248 | |||
1249 | /* If we know that another node is waiting on our lock, kick | ||
1250 | * the vote thread * pre-emptively when we reach a release | ||
1251 | * condition. */ | ||
1252 | if (lockres->l_flags & OCFS2_LOCK_BLOCKED) { | ||
1253 | switch(lockres->l_blocking) { | ||
1254 | case LKM_EXMODE: | ||
1255 | if (!lockres->l_ex_holders && !lockres->l_ro_holders) | ||
1256 | kick = 1; | ||
1257 | break; | ||
1258 | case LKM_PRMODE: | ||
1259 | if (!lockres->l_ex_holders) | ||
1260 | kick = 1; | ||
1261 | break; | ||
1262 | default: | ||
1263 | BUG(); | ||
1264 | } | ||
1265 | } | ||
1266 | |||
1267 | if (kick) | ||
1268 | ocfs2_kick_vote_thread(osb); | ||
1269 | |||
1270 | mlog_exit_void(); | ||
1271 | } | ||
1272 | |||
1273 | void ocfs2_data_unlock(struct inode *inode, | ||
1274 | int write) | ||
1275 | { | ||
1276 | int level = write ? LKM_EXMODE : LKM_PRMODE; | ||
1277 | struct ocfs2_lock_res *lockres = &OCFS2_I(inode)->ip_data_lockres; | ||
1278 | |||
1279 | mlog_entry_void(); | ||
1280 | |||
1281 | mlog(0, "inode %"MLFu64" drop %s DATA lock\n", | ||
1282 | OCFS2_I(inode)->ip_blkno, | ||
1283 | write ? "EXMODE" : "PRMODE"); | ||
1284 | |||
1285 | if (!ocfs2_is_hard_readonly(OCFS2_SB(inode->i_sb))) | ||
1286 | ocfs2_cluster_unlock(OCFS2_SB(inode->i_sb), lockres, level); | ||
1287 | |||
1288 | mlog_exit_void(); | ||
1289 | } | ||
1290 | |||
1291 | #define OCFS2_SEC_BITS 34 | ||
1292 | #define OCFS2_SEC_SHIFT (64 - 34) | ||
1293 | #define OCFS2_NSEC_MASK ((1ULL << OCFS2_SEC_SHIFT) - 1) | ||
1294 | |||
1295 | /* LVB only has room for 64 bits of time here so we pack it for | ||
1296 | * now. */ | ||
1297 | static u64 ocfs2_pack_timespec(struct timespec *spec) | ||
1298 | { | ||
1299 | u64 res; | ||
1300 | u64 sec = spec->tv_sec; | ||
1301 | u32 nsec = spec->tv_nsec; | ||
1302 | |||
1303 | res = (sec << OCFS2_SEC_SHIFT) | (nsec & OCFS2_NSEC_MASK); | ||
1304 | |||
1305 | return res; | ||
1306 | } | ||
1307 | |||
1308 | /* Call this with the lockres locked. I am reasonably sure we don't | ||
1309 | * need ip_lock in this function as anyone who would be changing those | ||
1310 | * values is supposed to be blocked in ocfs2_meta_lock right now. */ | ||
1311 | static void __ocfs2_stuff_meta_lvb(struct inode *inode) | ||
1312 | { | ||
1313 | struct ocfs2_inode_info *oi = OCFS2_I(inode); | ||
1314 | struct ocfs2_lock_res *lockres = &oi->ip_meta_lockres; | ||
1315 | struct ocfs2_meta_lvb *lvb; | ||
1316 | |||
1317 | mlog_entry_void(); | ||
1318 | |||
1319 | lvb = (struct ocfs2_meta_lvb *) lockres->l_lksb.lvb; | ||
1320 | |||
1321 | lvb->lvb_version = cpu_to_be32(OCFS2_LVB_VERSION); | ||
1322 | lvb->lvb_isize = cpu_to_be64(i_size_read(inode)); | ||
1323 | lvb->lvb_iclusters = cpu_to_be32(oi->ip_clusters); | ||
1324 | lvb->lvb_iuid = cpu_to_be32(inode->i_uid); | ||
1325 | lvb->lvb_igid = cpu_to_be32(inode->i_gid); | ||
1326 | lvb->lvb_imode = cpu_to_be16(inode->i_mode); | ||
1327 | lvb->lvb_inlink = cpu_to_be16(inode->i_nlink); | ||
1328 | lvb->lvb_iatime_packed = | ||
1329 | cpu_to_be64(ocfs2_pack_timespec(&inode->i_atime)); | ||
1330 | lvb->lvb_ictime_packed = | ||
1331 | cpu_to_be64(ocfs2_pack_timespec(&inode->i_ctime)); | ||
1332 | lvb->lvb_imtime_packed = | ||
1333 | cpu_to_be64(ocfs2_pack_timespec(&inode->i_mtime)); | ||
1334 | |||
1335 | mlog_meta_lvb(0, lockres); | ||
1336 | |||
1337 | mlog_exit_void(); | ||
1338 | } | ||
1339 | |||
1340 | static void ocfs2_unpack_timespec(struct timespec *spec, | ||
1341 | u64 packed_time) | ||
1342 | { | ||
1343 | spec->tv_sec = packed_time >> OCFS2_SEC_SHIFT; | ||
1344 | spec->tv_nsec = packed_time & OCFS2_NSEC_MASK; | ||
1345 | } | ||
1346 | |||
1347 | static void ocfs2_refresh_inode_from_lvb(struct inode *inode) | ||
1348 | { | ||
1349 | struct ocfs2_inode_info *oi = OCFS2_I(inode); | ||
1350 | struct ocfs2_lock_res *lockres = &oi->ip_meta_lockres; | ||
1351 | struct ocfs2_meta_lvb *lvb; | ||
1352 | |||
1353 | mlog_entry_void(); | ||
1354 | |||
1355 | mlog_meta_lvb(0, lockres); | ||
1356 | |||
1357 | lvb = (struct ocfs2_meta_lvb *) lockres->l_lksb.lvb; | ||
1358 | |||
1359 | /* We're safe here without the lockres lock... */ | ||
1360 | spin_lock(&oi->ip_lock); | ||
1361 | oi->ip_clusters = be32_to_cpu(lvb->lvb_iclusters); | ||
1362 | i_size_write(inode, be64_to_cpu(lvb->lvb_isize)); | ||
1363 | |||
1364 | /* fast-symlinks are a special case */ | ||
1365 | if (S_ISLNK(inode->i_mode) && !oi->ip_clusters) | ||
1366 | inode->i_blocks = 0; | ||
1367 | else | ||
1368 | inode->i_blocks = | ||
1369 | ocfs2_align_bytes_to_sectors(i_size_read(inode)); | ||
1370 | |||
1371 | inode->i_uid = be32_to_cpu(lvb->lvb_iuid); | ||
1372 | inode->i_gid = be32_to_cpu(lvb->lvb_igid); | ||
1373 | inode->i_mode = be16_to_cpu(lvb->lvb_imode); | ||
1374 | inode->i_nlink = be16_to_cpu(lvb->lvb_inlink); | ||
1375 | ocfs2_unpack_timespec(&inode->i_atime, | ||
1376 | be64_to_cpu(lvb->lvb_iatime_packed)); | ||
1377 | ocfs2_unpack_timespec(&inode->i_mtime, | ||
1378 | be64_to_cpu(lvb->lvb_imtime_packed)); | ||
1379 | ocfs2_unpack_timespec(&inode->i_ctime, | ||
1380 | be64_to_cpu(lvb->lvb_ictime_packed)); | ||
1381 | spin_unlock(&oi->ip_lock); | ||
1382 | |||
1383 | mlog_exit_void(); | ||
1384 | } | ||
1385 | |||
1386 | static inline int ocfs2_meta_lvb_is_trustable(struct ocfs2_lock_res *lockres) | ||
1387 | { | ||
1388 | struct ocfs2_meta_lvb *lvb = (struct ocfs2_meta_lvb *) lockres->l_lksb.lvb; | ||
1389 | |||
1390 | if (be32_to_cpu(lvb->lvb_version) == OCFS2_LVB_VERSION) | ||
1391 | return 1; | ||
1392 | return 0; | ||
1393 | } | ||
1394 | |||
1395 | /* Determine whether a lock resource needs to be refreshed, and | ||
1396 | * arbitrate who gets to refresh it. | ||
1397 | * | ||
1398 | * 0 means no refresh needed. | ||
1399 | * | ||
1400 | * > 0 means you need to refresh this and you MUST call | ||
1401 | * ocfs2_complete_lock_res_refresh afterwards. */ | ||
1402 | static int ocfs2_should_refresh_lock_res(struct ocfs2_lock_res *lockres) | ||
1403 | { | ||
1404 | unsigned long flags; | ||
1405 | int status = 0; | ||
1406 | |||
1407 | mlog_entry_void(); | ||
1408 | |||
1409 | refresh_check: | ||
1410 | spin_lock_irqsave(&lockres->l_lock, flags); | ||
1411 | if (!(lockres->l_flags & OCFS2_LOCK_NEEDS_REFRESH)) { | ||
1412 | spin_unlock_irqrestore(&lockres->l_lock, flags); | ||
1413 | goto bail; | ||
1414 | } | ||
1415 | |||
1416 | if (lockres->l_flags & OCFS2_LOCK_REFRESHING) { | ||
1417 | spin_unlock_irqrestore(&lockres->l_lock, flags); | ||
1418 | |||
1419 | ocfs2_wait_on_refreshing_lock(lockres); | ||
1420 | goto refresh_check; | ||
1421 | } | ||
1422 | |||
1423 | /* Ok, I'll be the one to refresh this lock. */ | ||
1424 | lockres_or_flags(lockres, OCFS2_LOCK_REFRESHING); | ||
1425 | spin_unlock_irqrestore(&lockres->l_lock, flags); | ||
1426 | |||
1427 | status = 1; | ||
1428 | bail: | ||
1429 | mlog_exit(status); | ||
1430 | return status; | ||
1431 | } | ||
1432 | |||
1433 | /* If status is non zero, I'll mark it as not being in refresh | ||
1434 | * anymroe, but i won't clear the needs refresh flag. */ | ||
1435 | static inline void ocfs2_complete_lock_res_refresh(struct ocfs2_lock_res *lockres, | ||
1436 | int status) | ||
1437 | { | ||
1438 | unsigned long flags; | ||
1439 | mlog_entry_void(); | ||
1440 | |||
1441 | spin_lock_irqsave(&lockres->l_lock, flags); | ||
1442 | lockres_clear_flags(lockres, OCFS2_LOCK_REFRESHING); | ||
1443 | if (!status) | ||
1444 | lockres_clear_flags(lockres, OCFS2_LOCK_NEEDS_REFRESH); | ||
1445 | spin_unlock_irqrestore(&lockres->l_lock, flags); | ||
1446 | |||
1447 | wake_up(&lockres->l_event); | ||
1448 | |||
1449 | mlog_exit_void(); | ||
1450 | } | ||
1451 | |||
1452 | /* may or may not return a bh if it went to disk. */ | ||
1453 | static int ocfs2_meta_lock_update(struct inode *inode, | ||
1454 | struct buffer_head **bh) | ||
1455 | { | ||
1456 | int status = 0; | ||
1457 | struct ocfs2_inode_info *oi = OCFS2_I(inode); | ||
1458 | struct ocfs2_lock_res *lockres; | ||
1459 | struct ocfs2_dinode *fe; | ||
1460 | |||
1461 | mlog_entry_void(); | ||
1462 | |||
1463 | spin_lock(&oi->ip_lock); | ||
1464 | if (oi->ip_flags & OCFS2_INODE_DELETED) { | ||
1465 | mlog(0, "Orphaned inode %"MLFu64" was deleted while we " | ||
1466 | "were waiting on a lock. ip_flags = 0x%x\n", | ||
1467 | oi->ip_blkno, oi->ip_flags); | ||
1468 | spin_unlock(&oi->ip_lock); | ||
1469 | status = -ENOENT; | ||
1470 | goto bail; | ||
1471 | } | ||
1472 | spin_unlock(&oi->ip_lock); | ||
1473 | |||
1474 | lockres = &oi->ip_meta_lockres; | ||
1475 | |||
1476 | if (!ocfs2_should_refresh_lock_res(lockres)) | ||
1477 | goto bail; | ||
1478 | |||
1479 | /* This will discard any caching information we might have had | ||
1480 | * for the inode metadata. */ | ||
1481 | ocfs2_metadata_cache_purge(inode); | ||
1482 | |||
1483 | /* will do nothing for inode types that don't use the extent | ||
1484 | * map (directories, bitmap files, etc) */ | ||
1485 | ocfs2_extent_map_trunc(inode, 0); | ||
1486 | |||
1487 | if (ocfs2_meta_lvb_is_trustable(lockres)) { | ||
1488 | mlog(0, "Trusting LVB on inode %"MLFu64"\n", | ||
1489 | oi->ip_blkno); | ||
1490 | ocfs2_refresh_inode_from_lvb(inode); | ||
1491 | } else { | ||
1492 | /* Boo, we have to go to disk. */ | ||
1493 | /* read bh, cast, ocfs2_refresh_inode */ | ||
1494 | status = ocfs2_read_block(OCFS2_SB(inode->i_sb), oi->ip_blkno, | ||
1495 | bh, OCFS2_BH_CACHED, inode); | ||
1496 | if (status < 0) { | ||
1497 | mlog_errno(status); | ||
1498 | goto bail_refresh; | ||
1499 | } | ||
1500 | fe = (struct ocfs2_dinode *) (*bh)->b_data; | ||
1501 | |||
1502 | /* This is a good chance to make sure we're not | ||
1503 | * locking an invalid object. | ||
1504 | * | ||
1505 | * We bug on a stale inode here because we checked | ||
1506 | * above whether it was wiped from disk. The wiping | ||
1507 | * node provides a guarantee that we receive that | ||
1508 | * message and can mark the inode before dropping any | ||
1509 | * locks associated with it. */ | ||
1510 | if (!OCFS2_IS_VALID_DINODE(fe)) { | ||
1511 | OCFS2_RO_ON_INVALID_DINODE(inode->i_sb, fe); | ||
1512 | status = -EIO; | ||
1513 | goto bail_refresh; | ||
1514 | } | ||
1515 | mlog_bug_on_msg(inode->i_generation != | ||
1516 | le32_to_cpu(fe->i_generation), | ||
1517 | "Invalid dinode %"MLFu64" disk generation: %u " | ||
1518 | "inode->i_generation: %u\n", | ||
1519 | oi->ip_blkno, le32_to_cpu(fe->i_generation), | ||
1520 | inode->i_generation); | ||
1521 | mlog_bug_on_msg(le64_to_cpu(fe->i_dtime) || | ||
1522 | !(fe->i_flags & cpu_to_le32(OCFS2_VALID_FL)), | ||
1523 | "Stale dinode %"MLFu64" dtime: %"MLFu64" " | ||
1524 | "flags: 0x%x\n", oi->ip_blkno, | ||
1525 | le64_to_cpu(fe->i_dtime), | ||
1526 | le32_to_cpu(fe->i_flags)); | ||
1527 | |||
1528 | ocfs2_refresh_inode(inode, fe); | ||
1529 | } | ||
1530 | |||
1531 | status = 0; | ||
1532 | bail_refresh: | ||
1533 | ocfs2_complete_lock_res_refresh(lockres, status); | ||
1534 | bail: | ||
1535 | mlog_exit(status); | ||
1536 | return status; | ||
1537 | } | ||
1538 | |||
1539 | static int ocfs2_assign_bh(struct inode *inode, | ||
1540 | struct buffer_head **ret_bh, | ||
1541 | struct buffer_head *passed_bh) | ||
1542 | { | ||
1543 | int status; | ||
1544 | |||
1545 | if (passed_bh) { | ||
1546 | /* Ok, the update went to disk for us, use the | ||
1547 | * returned bh. */ | ||
1548 | *ret_bh = passed_bh; | ||
1549 | get_bh(*ret_bh); | ||
1550 | |||
1551 | return 0; | ||
1552 | } | ||
1553 | |||
1554 | status = ocfs2_read_block(OCFS2_SB(inode->i_sb), | ||
1555 | OCFS2_I(inode)->ip_blkno, | ||
1556 | ret_bh, | ||
1557 | OCFS2_BH_CACHED, | ||
1558 | inode); | ||
1559 | if (status < 0) | ||
1560 | mlog_errno(status); | ||
1561 | |||
1562 | return status; | ||
1563 | } | ||
1564 | |||
1565 | /* | ||
1566 | * returns < 0 error if the callback will never be called, otherwise | ||
1567 | * the result of the lock will be communicated via the callback. | ||
1568 | */ | ||
1569 | int ocfs2_meta_lock_full(struct inode *inode, | ||
1570 | struct ocfs2_journal_handle *handle, | ||
1571 | struct buffer_head **ret_bh, | ||
1572 | int ex, | ||
1573 | int arg_flags) | ||
1574 | { | ||
1575 | int status, level, dlm_flags, acquired; | ||
1576 | struct ocfs2_lock_res *lockres; | ||
1577 | struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); | ||
1578 | struct buffer_head *local_bh = NULL; | ||
1579 | |||
1580 | BUG_ON(!inode); | ||
1581 | |||
1582 | mlog_entry_void(); | ||
1583 | |||
1584 | mlog(0, "inode %"MLFu64", take %s META lock\n", | ||
1585 | OCFS2_I(inode)->ip_blkno, | ||
1586 | ex ? "EXMODE" : "PRMODE"); | ||
1587 | |||
1588 | status = 0; | ||
1589 | acquired = 0; | ||
1590 | /* We'll allow faking a readonly metadata lock for | ||
1591 | * rodevices. */ | ||
1592 | if (ocfs2_is_hard_readonly(osb)) { | ||
1593 | if (ex) | ||
1594 | status = -EROFS; | ||
1595 | goto bail; | ||
1596 | } | ||
1597 | |||
1598 | if (!(arg_flags & OCFS2_META_LOCK_RECOVERY)) | ||
1599 | wait_event(osb->recovery_event, | ||
1600 | ocfs2_node_map_is_empty(osb, &osb->recovery_map)); | ||
1601 | |||
1602 | acquired = 0; | ||
1603 | lockres = &OCFS2_I(inode)->ip_meta_lockres; | ||
1604 | level = ex ? LKM_EXMODE : LKM_PRMODE; | ||
1605 | dlm_flags = 0; | ||
1606 | if (arg_flags & OCFS2_META_LOCK_NOQUEUE) | ||
1607 | dlm_flags |= LKM_NOQUEUE; | ||
1608 | |||
1609 | status = ocfs2_cluster_lock(osb, lockres, level, dlm_flags, arg_flags); | ||
1610 | if (status < 0) { | ||
1611 | if (status != -EAGAIN && status != -EIOCBRETRY) | ||
1612 | mlog_errno(status); | ||
1613 | goto bail; | ||
1614 | } | ||
1615 | |||
1616 | /* Notify the error cleanup path to drop the cluster lock. */ | ||
1617 | acquired = 1; | ||
1618 | |||
1619 | /* We wait twice because a node may have died while we were in | ||
1620 | * the lower dlm layers. The second time though, we've | ||
1621 | * committed to owning this lock so we don't allow signals to | ||
1622 | * abort the operation. */ | ||
1623 | if (!(arg_flags & OCFS2_META_LOCK_RECOVERY)) | ||
1624 | wait_event(osb->recovery_event, | ||
1625 | ocfs2_node_map_is_empty(osb, &osb->recovery_map)); | ||
1626 | |||
1627 | /* This is fun. The caller may want a bh back, or it may | ||
1628 | * not. ocfs2_meta_lock_update definitely wants one in, but | ||
1629 | * may or may not read one, depending on what's in the | ||
1630 | * LVB. The result of all of this is that we've *only* gone to | ||
1631 | * disk if we have to, so the complexity is worthwhile. */ | ||
1632 | status = ocfs2_meta_lock_update(inode, &local_bh); | ||
1633 | if (status < 0) { | ||
1634 | if (status != -ENOENT) | ||
1635 | mlog_errno(status); | ||
1636 | goto bail; | ||
1637 | } | ||
1638 | |||
1639 | if (ret_bh) { | ||
1640 | status = ocfs2_assign_bh(inode, ret_bh, local_bh); | ||
1641 | if (status < 0) { | ||
1642 | mlog_errno(status); | ||
1643 | goto bail; | ||
1644 | } | ||
1645 | } | ||
1646 | |||
1647 | if (handle) { | ||
1648 | status = ocfs2_handle_add_lock(handle, inode); | ||
1649 | if (status < 0) | ||
1650 | mlog_errno(status); | ||
1651 | } | ||
1652 | |||
1653 | bail: | ||
1654 | if (status < 0) { | ||
1655 | if (ret_bh && (*ret_bh)) { | ||
1656 | brelse(*ret_bh); | ||
1657 | *ret_bh = NULL; | ||
1658 | } | ||
1659 | if (acquired) | ||
1660 | ocfs2_meta_unlock(inode, ex); | ||
1661 | } | ||
1662 | |||
1663 | if (local_bh) | ||
1664 | brelse(local_bh); | ||
1665 | |||
1666 | mlog_exit(status); | ||
1667 | return status; | ||
1668 | } | ||
1669 | |||
1670 | /* | ||
1671 | * This is working around a lock inversion between tasks acquiring DLM locks | ||
1672 | * while holding a page lock and the vote thread which blocks dlm lock acquiry | ||
1673 | * while acquiring page locks. | ||
1674 | * | ||
1675 | * ** These _with_page variantes are only intended to be called from aop | ||
1676 | * methods that hold page locks and return a very specific *positive* error | ||
1677 | * code that aop methods pass up to the VFS -- test for errors with != 0. ** | ||
1678 | * | ||
1679 | * The DLM is called such that it returns -EAGAIN if it would have blocked | ||
1680 | * waiting for the vote thread. In that case we unlock our page so the vote | ||
1681 | * thread can make progress. Once we've done this we have to return | ||
1682 | * AOP_TRUNCATED_PAGE so the aop method that called us can bubble that back up | ||
1683 | * into the VFS who will then immediately retry the aop call. | ||
1684 | * | ||
1685 | * We do a blocking lock and immediate unlock before returning, though, so that | ||
1686 | * the lock has a great chance of being cached on this node by the time the VFS | ||
1687 | * calls back to retry the aop. This has a potential to livelock as nodes | ||
1688 | * ping locks back and forth, but that's a risk we're willing to take to avoid | ||
1689 | * the lock inversion simply. | ||
1690 | */ | ||
1691 | int ocfs2_meta_lock_with_page(struct inode *inode, | ||
1692 | struct ocfs2_journal_handle *handle, | ||
1693 | struct buffer_head **ret_bh, | ||
1694 | int ex, | ||
1695 | struct page *page) | ||
1696 | { | ||
1697 | int ret; | ||
1698 | |||
1699 | ret = ocfs2_meta_lock_full(inode, handle, ret_bh, ex, | ||
1700 | OCFS2_LOCK_NONBLOCK); | ||
1701 | if (ret == -EAGAIN) { | ||
1702 | unlock_page(page); | ||
1703 | if (ocfs2_meta_lock(inode, handle, ret_bh, ex) == 0) | ||
1704 | ocfs2_meta_unlock(inode, ex); | ||
1705 | ret = AOP_TRUNCATED_PAGE; | ||
1706 | } | ||
1707 | |||
1708 | return ret; | ||
1709 | } | ||
1710 | |||
1711 | void ocfs2_meta_unlock(struct inode *inode, | ||
1712 | int ex) | ||
1713 | { | ||
1714 | int level = ex ? LKM_EXMODE : LKM_PRMODE; | ||
1715 | struct ocfs2_lock_res *lockres = &OCFS2_I(inode)->ip_meta_lockres; | ||
1716 | |||
1717 | mlog_entry_void(); | ||
1718 | |||
1719 | mlog(0, "inode %"MLFu64" drop %s META lock\n", | ||
1720 | OCFS2_I(inode)->ip_blkno, | ||
1721 | ex ? "EXMODE" : "PRMODE"); | ||
1722 | |||
1723 | if (!ocfs2_is_hard_readonly(OCFS2_SB(inode->i_sb))) | ||
1724 | ocfs2_cluster_unlock(OCFS2_SB(inode->i_sb), lockres, level); | ||
1725 | |||
1726 | mlog_exit_void(); | ||
1727 | } | ||
1728 | |||
1729 | int ocfs2_super_lock(struct ocfs2_super *osb, | ||
1730 | int ex) | ||
1731 | { | ||
1732 | int status; | ||
1733 | int level = ex ? LKM_EXMODE : LKM_PRMODE; | ||
1734 | struct ocfs2_lock_res *lockres = &osb->osb_super_lockres; | ||
1735 | struct buffer_head *bh; | ||
1736 | struct ocfs2_slot_info *si = osb->slot_info; | ||
1737 | |||
1738 | mlog_entry_void(); | ||
1739 | |||
1740 | if (ocfs2_is_hard_readonly(osb)) | ||
1741 | return -EROFS; | ||
1742 | |||
1743 | status = ocfs2_cluster_lock(osb, lockres, level, 0, 0); | ||
1744 | if (status < 0) { | ||
1745 | mlog_errno(status); | ||
1746 | goto bail; | ||
1747 | } | ||
1748 | |||
1749 | /* The super block lock path is really in the best position to | ||
1750 | * know when resources covered by the lock need to be | ||
1751 | * refreshed, so we do it here. Of course, making sense of | ||
1752 | * everything is up to the caller :) */ | ||
1753 | status = ocfs2_should_refresh_lock_res(lockres); | ||
1754 | if (status < 0) { | ||
1755 | mlog_errno(status); | ||
1756 | goto bail; | ||
1757 | } | ||
1758 | if (status) { | ||
1759 | bh = si->si_bh; | ||
1760 | status = ocfs2_read_block(osb, bh->b_blocknr, &bh, 0, | ||
1761 | si->si_inode); | ||
1762 | if (status == 0) | ||
1763 | ocfs2_update_slot_info(si); | ||
1764 | |||
1765 | ocfs2_complete_lock_res_refresh(lockres, status); | ||
1766 | |||
1767 | if (status < 0) | ||
1768 | mlog_errno(status); | ||
1769 | } | ||
1770 | bail: | ||
1771 | mlog_exit(status); | ||
1772 | return status; | ||
1773 | } | ||
1774 | |||
1775 | void ocfs2_super_unlock(struct ocfs2_super *osb, | ||
1776 | int ex) | ||
1777 | { | ||
1778 | int level = ex ? LKM_EXMODE : LKM_PRMODE; | ||
1779 | struct ocfs2_lock_res *lockres = &osb->osb_super_lockres; | ||
1780 | |||
1781 | ocfs2_cluster_unlock(osb, lockres, level); | ||
1782 | } | ||
1783 | |||
1784 | int ocfs2_rename_lock(struct ocfs2_super *osb) | ||
1785 | { | ||
1786 | int status; | ||
1787 | struct ocfs2_lock_res *lockres = &osb->osb_rename_lockres; | ||
1788 | |||
1789 | if (ocfs2_is_hard_readonly(osb)) | ||
1790 | return -EROFS; | ||
1791 | |||
1792 | status = ocfs2_cluster_lock(osb, lockres, LKM_EXMODE, 0, 0); | ||
1793 | if (status < 0) | ||
1794 | mlog_errno(status); | ||
1795 | |||
1796 | return status; | ||
1797 | } | ||
1798 | |||
1799 | void ocfs2_rename_unlock(struct ocfs2_super *osb) | ||
1800 | { | ||
1801 | struct ocfs2_lock_res *lockres = &osb->osb_rename_lockres; | ||
1802 | |||
1803 | ocfs2_cluster_unlock(osb, lockres, LKM_EXMODE); | ||
1804 | } | ||
1805 | |||
1806 | /* Reference counting of the dlm debug structure. We want this because | ||
1807 | * open references on the debug inodes can live on after a mount, so | ||
1808 | * we can't rely on the ocfs2_super to always exist. */ | ||
1809 | static void ocfs2_dlm_debug_free(struct kref *kref) | ||
1810 | { | ||
1811 | struct ocfs2_dlm_debug *dlm_debug; | ||
1812 | |||
1813 | dlm_debug = container_of(kref, struct ocfs2_dlm_debug, d_refcnt); | ||
1814 | |||
1815 | kfree(dlm_debug); | ||
1816 | } | ||
1817 | |||
1818 | void ocfs2_put_dlm_debug(struct ocfs2_dlm_debug *dlm_debug) | ||
1819 | { | ||
1820 | if (dlm_debug) | ||
1821 | kref_put(&dlm_debug->d_refcnt, ocfs2_dlm_debug_free); | ||
1822 | } | ||
1823 | |||
1824 | static void ocfs2_get_dlm_debug(struct ocfs2_dlm_debug *debug) | ||
1825 | { | ||
1826 | kref_get(&debug->d_refcnt); | ||
1827 | } | ||
1828 | |||
1829 | struct ocfs2_dlm_debug *ocfs2_new_dlm_debug(void) | ||
1830 | { | ||
1831 | struct ocfs2_dlm_debug *dlm_debug; | ||
1832 | |||
1833 | dlm_debug = kmalloc(sizeof(struct ocfs2_dlm_debug), GFP_KERNEL); | ||
1834 | if (!dlm_debug) { | ||
1835 | mlog_errno(-ENOMEM); | ||
1836 | goto out; | ||
1837 | } | ||
1838 | |||
1839 | kref_init(&dlm_debug->d_refcnt); | ||
1840 | INIT_LIST_HEAD(&dlm_debug->d_lockres_tracking); | ||
1841 | dlm_debug->d_locking_state = NULL; | ||
1842 | out: | ||
1843 | return dlm_debug; | ||
1844 | } | ||
1845 | |||
1846 | /* Access to this is arbitrated for us via seq_file->sem. */ | ||
1847 | struct ocfs2_dlm_seq_priv { | ||
1848 | struct ocfs2_dlm_debug *p_dlm_debug; | ||
1849 | struct ocfs2_lock_res p_iter_res; | ||
1850 | struct ocfs2_lock_res p_tmp_res; | ||
1851 | }; | ||
1852 | |||
1853 | static struct ocfs2_lock_res *ocfs2_dlm_next_res(struct ocfs2_lock_res *start, | ||
1854 | struct ocfs2_dlm_seq_priv *priv) | ||
1855 | { | ||
1856 | struct ocfs2_lock_res *iter, *ret = NULL; | ||
1857 | struct ocfs2_dlm_debug *dlm_debug = priv->p_dlm_debug; | ||
1858 | |||
1859 | assert_spin_locked(&ocfs2_dlm_tracking_lock); | ||
1860 | |||
1861 | list_for_each_entry(iter, &start->l_debug_list, l_debug_list) { | ||
1862 | /* discover the head of the list */ | ||
1863 | if (&iter->l_debug_list == &dlm_debug->d_lockres_tracking) { | ||
1864 | mlog(0, "End of list found, %p\n", ret); | ||
1865 | break; | ||
1866 | } | ||
1867 | |||
1868 | /* We track our "dummy" iteration lockres' by a NULL | ||
1869 | * l_ops field. */ | ||
1870 | if (iter->l_ops != NULL) { | ||
1871 | ret = iter; | ||
1872 | break; | ||
1873 | } | ||
1874 | } | ||
1875 | |||
1876 | return ret; | ||
1877 | } | ||
1878 | |||
1879 | static void *ocfs2_dlm_seq_start(struct seq_file *m, loff_t *pos) | ||
1880 | { | ||
1881 | struct ocfs2_dlm_seq_priv *priv = m->private; | ||
1882 | struct ocfs2_lock_res *iter; | ||
1883 | |||
1884 | spin_lock(&ocfs2_dlm_tracking_lock); | ||
1885 | iter = ocfs2_dlm_next_res(&priv->p_iter_res, priv); | ||
1886 | if (iter) { | ||
1887 | /* Since lockres' have the lifetime of their container | ||
1888 | * (which can be inodes, ocfs2_supers, etc) we want to | ||
1889 | * copy this out to a temporary lockres while still | ||
1890 | * under the spinlock. Obviously after this we can't | ||
1891 | * trust any pointers on the copy returned, but that's | ||
1892 | * ok as the information we want isn't typically held | ||
1893 | * in them. */ | ||
1894 | priv->p_tmp_res = *iter; | ||
1895 | iter = &priv->p_tmp_res; | ||
1896 | } | ||
1897 | spin_unlock(&ocfs2_dlm_tracking_lock); | ||
1898 | |||
1899 | return iter; | ||
1900 | } | ||
1901 | |||
/* seq_file ->stop: intentionally empty; ->start and ->next release the
 * tracking lock themselves before returning. */
static void ocfs2_dlm_seq_stop(struct seq_file *m, void *v)
{
}
1905 | |||
1906 | static void *ocfs2_dlm_seq_next(struct seq_file *m, void *v, loff_t *pos) | ||
1907 | { | ||
1908 | struct ocfs2_dlm_seq_priv *priv = m->private; | ||
1909 | struct ocfs2_lock_res *iter = v; | ||
1910 | struct ocfs2_lock_res *dummy = &priv->p_iter_res; | ||
1911 | |||
1912 | spin_lock(&ocfs2_dlm_tracking_lock); | ||
1913 | iter = ocfs2_dlm_next_res(iter, priv); | ||
1914 | list_del_init(&dummy->l_debug_list); | ||
1915 | if (iter) { | ||
1916 | list_add(&dummy->l_debug_list, &iter->l_debug_list); | ||
1917 | priv->p_tmp_res = *iter; | ||
1918 | iter = &priv->p_tmp_res; | ||
1919 | } | ||
1920 | spin_unlock(&ocfs2_dlm_tracking_lock); | ||
1921 | |||
1922 | return iter; | ||
1923 | } | ||
1924 | |||
1925 | /* So that debugfs.ocfs2 can determine which format is being used */ | ||
1926 | #define OCFS2_DLM_DEBUG_STR_VERSION 1 | ||
1927 | static int ocfs2_dlm_seq_show(struct seq_file *m, void *v) | ||
1928 | { | ||
1929 | int i; | ||
1930 | char *lvb; | ||
1931 | struct ocfs2_lock_res *lockres = v; | ||
1932 | |||
1933 | if (!lockres) | ||
1934 | return -EINVAL; | ||
1935 | |||
1936 | seq_printf(m, "0x%x\t" | ||
1937 | "%.*s\t" | ||
1938 | "%d\t" | ||
1939 | "0x%lx\t" | ||
1940 | "0x%x\t" | ||
1941 | "0x%x\t" | ||
1942 | "%u\t" | ||
1943 | "%u\t" | ||
1944 | "%d\t" | ||
1945 | "%d\t", | ||
1946 | OCFS2_DLM_DEBUG_STR_VERSION, | ||
1947 | OCFS2_LOCK_ID_MAX_LEN, lockres->l_name, | ||
1948 | lockres->l_level, | ||
1949 | lockres->l_flags, | ||
1950 | lockres->l_action, | ||
1951 | lockres->l_unlock_action, | ||
1952 | lockres->l_ro_holders, | ||
1953 | lockres->l_ex_holders, | ||
1954 | lockres->l_requested, | ||
1955 | lockres->l_blocking); | ||
1956 | |||
1957 | /* Dump the raw LVB */ | ||
1958 | lvb = lockres->l_lksb.lvb; | ||
1959 | for(i = 0; i < DLM_LVB_LEN; i++) | ||
1960 | seq_printf(m, "0x%x\t", lvb[i]); | ||
1961 | |||
1962 | /* End the line */ | ||
1963 | seq_printf(m, "\n"); | ||
1964 | return 0; | ||
1965 | } | ||
1966 | |||
1967 | static struct seq_operations ocfs2_dlm_seq_ops = { | ||
1968 | .start = ocfs2_dlm_seq_start, | ||
1969 | .stop = ocfs2_dlm_seq_stop, | ||
1970 | .next = ocfs2_dlm_seq_next, | ||
1971 | .show = ocfs2_dlm_seq_show, | ||
1972 | }; | ||
1973 | |||
1974 | static int ocfs2_dlm_debug_release(struct inode *inode, struct file *file) | ||
1975 | { | ||
1976 | struct seq_file *seq = (struct seq_file *) file->private_data; | ||
1977 | struct ocfs2_dlm_seq_priv *priv = seq->private; | ||
1978 | struct ocfs2_lock_res *res = &priv->p_iter_res; | ||
1979 | |||
1980 | ocfs2_remove_lockres_tracking(res); | ||
1981 | ocfs2_put_dlm_debug(priv->p_dlm_debug); | ||
1982 | return seq_release_private(inode, file); | ||
1983 | } | ||
1984 | |||
1985 | static int ocfs2_dlm_debug_open(struct inode *inode, struct file *file) | ||
1986 | { | ||
1987 | int ret; | ||
1988 | struct ocfs2_dlm_seq_priv *priv; | ||
1989 | struct seq_file *seq; | ||
1990 | struct ocfs2_super *osb; | ||
1991 | |||
1992 | priv = kzalloc(sizeof(struct ocfs2_dlm_seq_priv), GFP_KERNEL); | ||
1993 | if (!priv) { | ||
1994 | ret = -ENOMEM; | ||
1995 | mlog_errno(ret); | ||
1996 | goto out; | ||
1997 | } | ||
1998 | osb = (struct ocfs2_super *) inode->u.generic_ip; | ||
1999 | ocfs2_get_dlm_debug(osb->osb_dlm_debug); | ||
2000 | priv->p_dlm_debug = osb->osb_dlm_debug; | ||
2001 | INIT_LIST_HEAD(&priv->p_iter_res.l_debug_list); | ||
2002 | |||
2003 | ret = seq_open(file, &ocfs2_dlm_seq_ops); | ||
2004 | if (ret) { | ||
2005 | kfree(priv); | ||
2006 | mlog_errno(ret); | ||
2007 | goto out; | ||
2008 | } | ||
2009 | |||
2010 | seq = (struct seq_file *) file->private_data; | ||
2011 | seq->private = priv; | ||
2012 | |||
2013 | ocfs2_add_lockres_tracking(&priv->p_iter_res, | ||
2014 | priv->p_dlm_debug); | ||
2015 | |||
2016 | out: | ||
2017 | return ret; | ||
2018 | } | ||
2019 | |||
2020 | static struct file_operations ocfs2_dlm_debug_fops = { | ||
2021 | .open = ocfs2_dlm_debug_open, | ||
2022 | .release = ocfs2_dlm_debug_release, | ||
2023 | .read = seq_read, | ||
2024 | .llseek = seq_lseek, | ||
2025 | }; | ||
2026 | |||
2027 | static int ocfs2_dlm_init_debug(struct ocfs2_super *osb) | ||
2028 | { | ||
2029 | int ret = 0; | ||
2030 | struct ocfs2_dlm_debug *dlm_debug = osb->osb_dlm_debug; | ||
2031 | |||
2032 | dlm_debug->d_locking_state = debugfs_create_file("locking_state", | ||
2033 | S_IFREG|S_IRUSR, | ||
2034 | osb->osb_debug_root, | ||
2035 | osb, | ||
2036 | &ocfs2_dlm_debug_fops); | ||
2037 | if (!dlm_debug->d_locking_state) { | ||
2038 | ret = -EINVAL; | ||
2039 | mlog(ML_ERROR, | ||
2040 | "Unable to create locking state debugfs file.\n"); | ||
2041 | goto out; | ||
2042 | } | ||
2043 | |||
2044 | ocfs2_get_dlm_debug(dlm_debug); | ||
2045 | out: | ||
2046 | return ret; | ||
2047 | } | ||
2048 | |||
2049 | static void ocfs2_dlm_shutdown_debug(struct ocfs2_super *osb) | ||
2050 | { | ||
2051 | struct ocfs2_dlm_debug *dlm_debug = osb->osb_dlm_debug; | ||
2052 | |||
2053 | if (dlm_debug) { | ||
2054 | debugfs_remove(dlm_debug->d_locking_state); | ||
2055 | ocfs2_put_dlm_debug(dlm_debug); | ||
2056 | } | ||
2057 | } | ||
2058 | |||
2059 | int ocfs2_dlm_init(struct ocfs2_super *osb) | ||
2060 | { | ||
2061 | int status; | ||
2062 | u32 dlm_key; | ||
2063 | struct dlm_ctxt *dlm; | ||
2064 | |||
2065 | mlog_entry_void(); | ||
2066 | |||
2067 | status = ocfs2_dlm_init_debug(osb); | ||
2068 | if (status < 0) { | ||
2069 | mlog_errno(status); | ||
2070 | goto bail; | ||
2071 | } | ||
2072 | |||
2073 | /* launch vote thread */ | ||
2074 | osb->vote_task = kthread_run(ocfs2_vote_thread, osb, "ocfs2vote-%d", | ||
2075 | osb->osb_id); | ||
2076 | if (IS_ERR(osb->vote_task)) { | ||
2077 | status = PTR_ERR(osb->vote_task); | ||
2078 | osb->vote_task = NULL; | ||
2079 | mlog_errno(status); | ||
2080 | goto bail; | ||
2081 | } | ||
2082 | |||
2083 | /* used by the dlm code to make message headers unique, each | ||
2084 | * node in this domain must agree on this. */ | ||
2085 | dlm_key = crc32_le(0, osb->uuid_str, strlen(osb->uuid_str)); | ||
2086 | |||
2087 | /* for now, uuid == domain */ | ||
2088 | dlm = dlm_register_domain(osb->uuid_str, dlm_key); | ||
2089 | if (IS_ERR(dlm)) { | ||
2090 | status = PTR_ERR(dlm); | ||
2091 | mlog_errno(status); | ||
2092 | goto bail; | ||
2093 | } | ||
2094 | |||
2095 | ocfs2_super_lock_res_init(&osb->osb_super_lockres, osb); | ||
2096 | ocfs2_rename_lock_res_init(&osb->osb_rename_lockres, osb); | ||
2097 | |||
2098 | dlm_register_eviction_cb(dlm, &osb->osb_eviction_cb); | ||
2099 | |||
2100 | osb->dlm = dlm; | ||
2101 | |||
2102 | status = 0; | ||
2103 | bail: | ||
2104 | if (status < 0) { | ||
2105 | ocfs2_dlm_shutdown_debug(osb); | ||
2106 | if (osb->vote_task) | ||
2107 | kthread_stop(osb->vote_task); | ||
2108 | } | ||
2109 | |||
2110 | mlog_exit(status); | ||
2111 | return status; | ||
2112 | } | ||
2113 | |||
2114 | void ocfs2_dlm_shutdown(struct ocfs2_super *osb) | ||
2115 | { | ||
2116 | mlog_entry_void(); | ||
2117 | |||
2118 | dlm_unregister_eviction_cb(&osb->osb_eviction_cb); | ||
2119 | |||
2120 | ocfs2_drop_osb_locks(osb); | ||
2121 | |||
2122 | if (osb->vote_task) { | ||
2123 | kthread_stop(osb->vote_task); | ||
2124 | osb->vote_task = NULL; | ||
2125 | } | ||
2126 | |||
2127 | ocfs2_lock_res_free(&osb->osb_super_lockres); | ||
2128 | ocfs2_lock_res_free(&osb->osb_rename_lockres); | ||
2129 | |||
2130 | dlm_unregister_domain(osb->dlm); | ||
2131 | osb->dlm = NULL; | ||
2132 | |||
2133 | ocfs2_dlm_shutdown_debug(osb); | ||
2134 | |||
2135 | mlog_exit_void(); | ||
2136 | } | ||
2137 | |||
2138 | static void ocfs2_unlock_ast_func(void *opaque, enum dlm_status status) | ||
2139 | { | ||
2140 | struct ocfs2_lock_res *lockres = opaque; | ||
2141 | unsigned long flags; | ||
2142 | |||
2143 | mlog_entry_void(); | ||
2144 | |||
2145 | mlog(0, "UNLOCK AST called on lock %s, action = %d\n", lockres->l_name, | ||
2146 | lockres->l_unlock_action); | ||
2147 | |||
2148 | spin_lock_irqsave(&lockres->l_lock, flags); | ||
2149 | /* We tried to cancel a convert request, but it was already | ||
2150 | * granted. All we want to do here is clear our unlock | ||
2151 | * state. The wake_up call done at the bottom is redundant | ||
2152 | * (ocfs2_prepare_cancel_convert doesn't sleep on this) but doesn't | ||
2153 | * hurt anything anyway */ | ||
2154 | if (status == DLM_CANCELGRANT && | ||
2155 | lockres->l_unlock_action == OCFS2_UNLOCK_CANCEL_CONVERT) { | ||
2156 | mlog(0, "Got cancelgrant for %s\n", lockres->l_name); | ||
2157 | |||
2158 | /* We don't clear the busy flag in this case as it | ||
2159 | * should have been cleared by the ast which the dlm | ||
2160 | * has called. */ | ||
2161 | goto complete_unlock; | ||
2162 | } | ||
2163 | |||
2164 | if (status != DLM_NORMAL) { | ||
2165 | mlog(ML_ERROR, "Dlm passes status %d for lock %s, " | ||
2166 | "unlock_action %d\n", status, lockres->l_name, | ||
2167 | lockres->l_unlock_action); | ||
2168 | spin_unlock_irqrestore(&lockres->l_lock, flags); | ||
2169 | return; | ||
2170 | } | ||
2171 | |||
2172 | switch(lockres->l_unlock_action) { | ||
2173 | case OCFS2_UNLOCK_CANCEL_CONVERT: | ||
2174 | mlog(0, "Cancel convert success for %s\n", lockres->l_name); | ||
2175 | lockres->l_action = OCFS2_AST_INVALID; | ||
2176 | break; | ||
2177 | case OCFS2_UNLOCK_DROP_LOCK: | ||
2178 | lockres->l_level = LKM_IVMODE; | ||
2179 | break; | ||
2180 | default: | ||
2181 | BUG(); | ||
2182 | } | ||
2183 | |||
2184 | lockres_clear_flags(lockres, OCFS2_LOCK_BUSY); | ||
2185 | complete_unlock: | ||
2186 | lockres->l_unlock_action = OCFS2_UNLOCK_INVALID; | ||
2187 | spin_unlock_irqrestore(&lockres->l_lock, flags); | ||
2188 | |||
2189 | wake_up(&lockres->l_event); | ||
2190 | |||
2191 | mlog_exit_void(); | ||
2192 | } | ||
2193 | |||
/* Hook that ocfs2_drop_lock() runs on a lockres just before dropping
 * it; the second argument is the caller-supplied drop_data. */
typedef void (ocfs2_pre_drop_cb_t)(struct ocfs2_lock_res *, void *);

/* Optional pre-drop callback and its argument, passed to
 * ocfs2_drop_lock(). */
struct drop_lock_cb {
	ocfs2_pre_drop_cb_t	*drop_func;
	void			*drop_data;
};
2200 | |||
2201 | static int ocfs2_drop_lock(struct ocfs2_super *osb, | ||
2202 | struct ocfs2_lock_res *lockres, | ||
2203 | struct drop_lock_cb *dcb) | ||
2204 | { | ||
2205 | enum dlm_status status; | ||
2206 | unsigned long flags; | ||
2207 | |||
2208 | /* We didn't get anywhere near actually using this lockres. */ | ||
2209 | if (!(lockres->l_flags & OCFS2_LOCK_INITIALIZED)) | ||
2210 | goto out; | ||
2211 | |||
2212 | spin_lock_irqsave(&lockres->l_lock, flags); | ||
2213 | |||
2214 | mlog_bug_on_msg(!(lockres->l_flags & OCFS2_LOCK_FREEING), | ||
2215 | "lockres %s, flags 0x%lx\n", | ||
2216 | lockres->l_name, lockres->l_flags); | ||
2217 | |||
2218 | while (lockres->l_flags & OCFS2_LOCK_BUSY) { | ||
2219 | mlog(0, "waiting on busy lock \"%s\": flags = %lx, action = " | ||
2220 | "%u, unlock_action = %u\n", | ||
2221 | lockres->l_name, lockres->l_flags, lockres->l_action, | ||
2222 | lockres->l_unlock_action); | ||
2223 | |||
2224 | spin_unlock_irqrestore(&lockres->l_lock, flags); | ||
2225 | |||
2226 | /* XXX: Today we just wait on any busy | ||
2227 | * locks... Perhaps we need to cancel converts in the | ||
2228 | * future? */ | ||
2229 | ocfs2_wait_on_busy_lock(lockres); | ||
2230 | |||
2231 | spin_lock_irqsave(&lockres->l_lock, flags); | ||
2232 | } | ||
2233 | |||
2234 | if (dcb) | ||
2235 | dcb->drop_func(lockres, dcb->drop_data); | ||
2236 | |||
2237 | if (lockres->l_flags & OCFS2_LOCK_BUSY) | ||
2238 | mlog(ML_ERROR, "destroying busy lock: \"%s\"\n", | ||
2239 | lockres->l_name); | ||
2240 | if (lockres->l_flags & OCFS2_LOCK_BLOCKED) | ||
2241 | mlog(0, "destroying blocked lock: \"%s\"\n", lockres->l_name); | ||
2242 | |||
2243 | if (!(lockres->l_flags & OCFS2_LOCK_ATTACHED)) { | ||
2244 | spin_unlock_irqrestore(&lockres->l_lock, flags); | ||
2245 | goto out; | ||
2246 | } | ||
2247 | |||
2248 | lockres_clear_flags(lockres, OCFS2_LOCK_ATTACHED); | ||
2249 | |||
2250 | /* make sure we never get here while waiting for an ast to | ||
2251 | * fire. */ | ||
2252 | BUG_ON(lockres->l_action != OCFS2_AST_INVALID); | ||
2253 | |||
2254 | /* is this necessary? */ | ||
2255 | lockres_or_flags(lockres, OCFS2_LOCK_BUSY); | ||
2256 | lockres->l_unlock_action = OCFS2_UNLOCK_DROP_LOCK; | ||
2257 | spin_unlock_irqrestore(&lockres->l_lock, flags); | ||
2258 | |||
2259 | mlog(0, "lock %s\n", lockres->l_name); | ||
2260 | |||
2261 | status = dlmunlock(osb->dlm, &lockres->l_lksb, LKM_VALBLK, | ||
2262 | lockres->l_ops->unlock_ast, lockres); | ||
2263 | if (status != DLM_NORMAL) { | ||
2264 | ocfs2_log_dlm_error("dlmunlock", status, lockres); | ||
2265 | mlog(ML_ERROR, "lockres flags: %lu\n", lockres->l_flags); | ||
2266 | dlm_print_one_lock(lockres->l_lksb.lockid); | ||
2267 | BUG(); | ||
2268 | } | ||
2269 | mlog(0, "lock %s, successfull return from dlmunlock\n", | ||
2270 | lockres->l_name); | ||
2271 | |||
2272 | ocfs2_wait_on_busy_lock(lockres); | ||
2273 | out: | ||
2274 | mlog_exit(0); | ||
2275 | return 0; | ||
2276 | } | ||
2277 | |||
2278 | /* Mark the lockres as being dropped. It will no longer be | ||
2279 | * queued if blocking, but we still may have to wait on it | ||
2280 | * being dequeued from the vote thread before we can consider | ||
2281 | * it safe to drop. | ||
2282 | * | ||
2283 | * You can *not* attempt to call cluster_lock on this lockres anymore. */ | ||
2284 | void ocfs2_mark_lockres_freeing(struct ocfs2_lock_res *lockres) | ||
2285 | { | ||
2286 | int status; | ||
2287 | struct ocfs2_mask_waiter mw; | ||
2288 | unsigned long flags; | ||
2289 | |||
2290 | ocfs2_init_mask_waiter(&mw); | ||
2291 | |||
2292 | spin_lock_irqsave(&lockres->l_lock, flags); | ||
2293 | lockres->l_flags |= OCFS2_LOCK_FREEING; | ||
2294 | while (lockres->l_flags & OCFS2_LOCK_QUEUED) { | ||
2295 | lockres_add_mask_waiter(lockres, &mw, OCFS2_LOCK_QUEUED, 0); | ||
2296 | spin_unlock_irqrestore(&lockres->l_lock, flags); | ||
2297 | |||
2298 | mlog(0, "Waiting on lockres %s\n", lockres->l_name); | ||
2299 | |||
2300 | status = ocfs2_wait_for_mask(&mw); | ||
2301 | if (status) | ||
2302 | mlog_errno(status); | ||
2303 | |||
2304 | spin_lock_irqsave(&lockres->l_lock, flags); | ||
2305 | } | ||
2306 | spin_unlock_irqrestore(&lockres->l_lock, flags); | ||
2307 | } | ||
2308 | |||
2309 | static void ocfs2_drop_osb_locks(struct ocfs2_super *osb) | ||
2310 | { | ||
2311 | int status; | ||
2312 | |||
2313 | mlog_entry_void(); | ||
2314 | |||
2315 | ocfs2_mark_lockres_freeing(&osb->osb_super_lockres); | ||
2316 | |||
2317 | status = ocfs2_drop_lock(osb, &osb->osb_super_lockres, NULL); | ||
2318 | if (status < 0) | ||
2319 | mlog_errno(status); | ||
2320 | |||
2321 | ocfs2_mark_lockres_freeing(&osb->osb_rename_lockres); | ||
2322 | |||
2323 | status = ocfs2_drop_lock(osb, &osb->osb_rename_lockres, NULL); | ||
2324 | if (status < 0) | ||
2325 | mlog_errno(status); | ||
2326 | |||
2327 | mlog_exit(status); | ||
2328 | } | ||
2329 | |||
2330 | static void ocfs2_meta_pre_drop(struct ocfs2_lock_res *lockres, void *data) | ||
2331 | { | ||
2332 | struct inode *inode = data; | ||
2333 | |||
2334 | /* the metadata lock requires a bit more work as we have an | ||
2335 | * LVB to worry about. */ | ||
2336 | if (lockres->l_flags & OCFS2_LOCK_ATTACHED && | ||
2337 | lockres->l_level == LKM_EXMODE && | ||
2338 | !(lockres->l_flags & OCFS2_LOCK_NEEDS_REFRESH)) | ||
2339 | __ocfs2_stuff_meta_lvb(inode); | ||
2340 | } | ||
2341 | |||
2342 | int ocfs2_drop_inode_locks(struct inode *inode) | ||
2343 | { | ||
2344 | int status, err; | ||
2345 | struct drop_lock_cb meta_dcb = { ocfs2_meta_pre_drop, inode, }; | ||
2346 | |||
2347 | mlog_entry_void(); | ||
2348 | |||
2349 | /* No need to call ocfs2_mark_lockres_freeing here - | ||
2350 | * ocfs2_clear_inode has done it for us. */ | ||
2351 | |||
2352 | err = ocfs2_drop_lock(OCFS2_SB(inode->i_sb), | ||
2353 | &OCFS2_I(inode)->ip_data_lockres, | ||
2354 | NULL); | ||
2355 | if (err < 0) | ||
2356 | mlog_errno(err); | ||
2357 | |||
2358 | status = err; | ||
2359 | |||
2360 | err = ocfs2_drop_lock(OCFS2_SB(inode->i_sb), | ||
2361 | &OCFS2_I(inode)->ip_meta_lockres, | ||
2362 | &meta_dcb); | ||
2363 | if (err < 0) | ||
2364 | mlog_errno(err); | ||
2365 | if (err < 0 && !status) | ||
2366 | status = err; | ||
2367 | |||
2368 | err = ocfs2_drop_lock(OCFS2_SB(inode->i_sb), | ||
2369 | &OCFS2_I(inode)->ip_rw_lockres, | ||
2370 | NULL); | ||
2371 | if (err < 0) | ||
2372 | mlog_errno(err); | ||
2373 | if (err < 0 && !status) | ||
2374 | status = err; | ||
2375 | |||
2376 | mlog_exit(status); | ||
2377 | return status; | ||
2378 | } | ||
2379 | |||
2380 | static void ocfs2_prepare_downconvert(struct ocfs2_lock_res *lockres, | ||
2381 | int new_level) | ||
2382 | { | ||
2383 | assert_spin_locked(&lockres->l_lock); | ||
2384 | |||
2385 | BUG_ON(lockres->l_blocking <= LKM_NLMODE); | ||
2386 | |||
2387 | if (lockres->l_level <= new_level) { | ||
2388 | mlog(ML_ERROR, "lockres->l_level (%u) <= new_level (%u)\n", | ||
2389 | lockres->l_level, new_level); | ||
2390 | BUG(); | ||
2391 | } | ||
2392 | |||
2393 | mlog(0, "lock %s, new_level = %d, l_blocking = %d\n", | ||
2394 | lockres->l_name, new_level, lockres->l_blocking); | ||
2395 | |||
2396 | lockres->l_action = OCFS2_AST_DOWNCONVERT; | ||
2397 | lockres->l_requested = new_level; | ||
2398 | lockres_or_flags(lockres, OCFS2_LOCK_BUSY); | ||
2399 | } | ||
2400 | |||
2401 | static int ocfs2_downconvert_lock(struct ocfs2_super *osb, | ||
2402 | struct ocfs2_lock_res *lockres, | ||
2403 | int new_level, | ||
2404 | int lvb) | ||
2405 | { | ||
2406 | int ret, dlm_flags = LKM_CONVERT; | ||
2407 | enum dlm_status status; | ||
2408 | |||
2409 | mlog_entry_void(); | ||
2410 | |||
2411 | if (lvb) | ||
2412 | dlm_flags |= LKM_VALBLK; | ||
2413 | |||
2414 | status = dlmlock(osb->dlm, | ||
2415 | new_level, | ||
2416 | &lockres->l_lksb, | ||
2417 | dlm_flags, | ||
2418 | lockres->l_name, | ||
2419 | lockres->l_ops->ast, | ||
2420 | lockres, | ||
2421 | lockres->l_ops->bast); | ||
2422 | if (status != DLM_NORMAL) { | ||
2423 | ocfs2_log_dlm_error("dlmlock", status, lockres); | ||
2424 | ret = -EINVAL; | ||
2425 | ocfs2_recover_from_dlm_error(lockres, 1); | ||
2426 | goto bail; | ||
2427 | } | ||
2428 | |||
2429 | ret = 0; | ||
2430 | bail: | ||
2431 | mlog_exit(ret); | ||
2432 | return ret; | ||
2433 | } | ||
2434 | |||
2435 | /* returns 1 when the caller should unlock and call dlmunlock */ | ||
2436 | static int ocfs2_prepare_cancel_convert(struct ocfs2_super *osb, | ||
2437 | struct ocfs2_lock_res *lockres) | ||
2438 | { | ||
2439 | assert_spin_locked(&lockres->l_lock); | ||
2440 | |||
2441 | mlog_entry_void(); | ||
2442 | mlog(0, "lock %s\n", lockres->l_name); | ||
2443 | |||
2444 | if (lockres->l_unlock_action == OCFS2_UNLOCK_CANCEL_CONVERT) { | ||
2445 | /* If we're already trying to cancel a lock conversion | ||
2446 | * then just drop the spinlock and allow the caller to | ||
2447 | * requeue this lock. */ | ||
2448 | |||
2449 | mlog(0, "Lockres %s, skip convert\n", lockres->l_name); | ||
2450 | return 0; | ||
2451 | } | ||
2452 | |||
2453 | /* were we in a convert when we got the bast fire? */ | ||
2454 | BUG_ON(lockres->l_action != OCFS2_AST_CONVERT && | ||
2455 | lockres->l_action != OCFS2_AST_DOWNCONVERT); | ||
2456 | /* set things up for the unlockast to know to just | ||
2457 | * clear out the ast_action and unset busy, etc. */ | ||
2458 | lockres->l_unlock_action = OCFS2_UNLOCK_CANCEL_CONVERT; | ||
2459 | |||
2460 | mlog_bug_on_msg(!(lockres->l_flags & OCFS2_LOCK_BUSY), | ||
2461 | "lock %s, invalid flags: 0x%lx\n", | ||
2462 | lockres->l_name, lockres->l_flags); | ||
2463 | |||
2464 | return 1; | ||
2465 | } | ||
2466 | |||
2467 | static int ocfs2_cancel_convert(struct ocfs2_super *osb, | ||
2468 | struct ocfs2_lock_res *lockres) | ||
2469 | { | ||
2470 | int ret; | ||
2471 | enum dlm_status status; | ||
2472 | |||
2473 | mlog_entry_void(); | ||
2474 | mlog(0, "lock %s\n", lockres->l_name); | ||
2475 | |||
2476 | ret = 0; | ||
2477 | status = dlmunlock(osb->dlm, | ||
2478 | &lockres->l_lksb, | ||
2479 | LKM_CANCEL, | ||
2480 | lockres->l_ops->unlock_ast, | ||
2481 | lockres); | ||
2482 | if (status != DLM_NORMAL) { | ||
2483 | ocfs2_log_dlm_error("dlmunlock", status, lockres); | ||
2484 | ret = -EINVAL; | ||
2485 | ocfs2_recover_from_dlm_error(lockres, 0); | ||
2486 | } | ||
2487 | |||
2488 | mlog(0, "lock %s return from dlmunlock\n", lockres->l_name); | ||
2489 | |||
2490 | mlog_exit(ret); | ||
2491 | return ret; | ||
2492 | } | ||
2493 | |||
2494 | static inline int ocfs2_can_downconvert_meta_lock(struct inode *inode, | ||
2495 | struct ocfs2_lock_res *lockres, | ||
2496 | int new_level) | ||
2497 | { | ||
2498 | int ret; | ||
2499 | |||
2500 | mlog_entry_void(); | ||
2501 | |||
2502 | BUG_ON(new_level != LKM_NLMODE && new_level != LKM_PRMODE); | ||
2503 | |||
2504 | if (lockres->l_flags & OCFS2_LOCK_REFRESHING) { | ||
2505 | ret = 0; | ||
2506 | mlog(0, "lockres %s currently being refreshed -- backing " | ||
2507 | "off!\n", lockres->l_name); | ||
2508 | } else if (new_level == LKM_PRMODE) | ||
2509 | ret = !lockres->l_ex_holders && | ||
2510 | ocfs2_inode_fully_checkpointed(inode); | ||
2511 | else /* Must be NLMODE we're converting to. */ | ||
2512 | ret = !lockres->l_ro_holders && !lockres->l_ex_holders && | ||
2513 | ocfs2_inode_fully_checkpointed(inode); | ||
2514 | |||
2515 | mlog_exit(ret); | ||
2516 | return ret; | ||
2517 | } | ||
2518 | |||
2519 | static int ocfs2_do_unblock_meta(struct inode *inode, | ||
2520 | int *requeue) | ||
2521 | { | ||
2522 | int new_level; | ||
2523 | int set_lvb = 0; | ||
2524 | int ret = 0; | ||
2525 | struct ocfs2_lock_res *lockres = &OCFS2_I(inode)->ip_meta_lockres; | ||
2526 | unsigned long flags; | ||
2527 | |||
2528 | struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); | ||
2529 | |||
2530 | mlog_entry_void(); | ||
2531 | |||
2532 | spin_lock_irqsave(&lockres->l_lock, flags); | ||
2533 | |||
2534 | BUG_ON(!(lockres->l_flags & OCFS2_LOCK_BLOCKED)); | ||
2535 | |||
2536 | mlog(0, "l_level=%d, l_blocking=%d\n", lockres->l_level, | ||
2537 | lockres->l_blocking); | ||
2538 | |||
2539 | BUG_ON(lockres->l_level != LKM_EXMODE && | ||
2540 | lockres->l_level != LKM_PRMODE); | ||
2541 | |||
2542 | if (lockres->l_flags & OCFS2_LOCK_BUSY) { | ||
2543 | *requeue = 1; | ||
2544 | ret = ocfs2_prepare_cancel_convert(osb, lockres); | ||
2545 | spin_unlock_irqrestore(&lockres->l_lock, flags); | ||
2546 | if (ret) { | ||
2547 | ret = ocfs2_cancel_convert(osb, lockres); | ||
2548 | if (ret < 0) | ||
2549 | mlog_errno(ret); | ||
2550 | } | ||
2551 | goto leave; | ||
2552 | } | ||
2553 | |||
2554 | new_level = ocfs2_highest_compat_lock_level(lockres->l_blocking); | ||
2555 | |||
2556 | mlog(0, "l_level=%d, l_blocking=%d, new_level=%d\n", | ||
2557 | lockres->l_level, lockres->l_blocking, new_level); | ||
2558 | |||
2559 | if (ocfs2_can_downconvert_meta_lock(inode, lockres, new_level)) { | ||
2560 | if (lockres->l_level == LKM_EXMODE) | ||
2561 | set_lvb = 1; | ||
2562 | |||
2563 | /* If the lock hasn't been refreshed yet (rare), then | ||
2564 | * our memory inode values are old and we skip | ||
2565 | * stuffing the lvb. There's no need to actually clear | ||
2566 | * out the lvb here as it's value is still valid. */ | ||
2567 | if (!(lockres->l_flags & OCFS2_LOCK_NEEDS_REFRESH)) { | ||
2568 | if (set_lvb) | ||
2569 | __ocfs2_stuff_meta_lvb(inode); | ||
2570 | } else | ||
2571 | mlog(0, "lockres %s: downconverting stale lock!\n", | ||
2572 | lockres->l_name); | ||
2573 | |||
2574 | mlog(0, "calling ocfs2_downconvert_lock with l_level=%d, " | ||
2575 | "l_blocking=%d, new_level=%d\n", | ||
2576 | lockres->l_level, lockres->l_blocking, new_level); | ||
2577 | |||
2578 | ocfs2_prepare_downconvert(lockres, new_level); | ||
2579 | spin_unlock_irqrestore(&lockres->l_lock, flags); | ||
2580 | ret = ocfs2_downconvert_lock(osb, lockres, new_level, set_lvb); | ||
2581 | goto leave; | ||
2582 | } | ||
2583 | if (!ocfs2_inode_fully_checkpointed(inode)) | ||
2584 | ocfs2_start_checkpoint(osb); | ||
2585 | |||
2586 | *requeue = 1; | ||
2587 | spin_unlock_irqrestore(&lockres->l_lock, flags); | ||
2588 | ret = 0; | ||
2589 | leave: | ||
2590 | mlog_exit(ret); | ||
2591 | return ret; | ||
2592 | } | ||
2593 | |||
2594 | static int ocfs2_generic_unblock_lock(struct ocfs2_super *osb, | ||
2595 | struct ocfs2_lock_res *lockres, | ||
2596 | int *requeue, | ||
2597 | ocfs2_convert_worker_t *worker) | ||
2598 | { | ||
2599 | unsigned long flags; | ||
2600 | int blocking; | ||
2601 | int new_level; | ||
2602 | int ret = 0; | ||
2603 | |||
2604 | mlog_entry_void(); | ||
2605 | |||
2606 | spin_lock_irqsave(&lockres->l_lock, flags); | ||
2607 | |||
2608 | BUG_ON(!(lockres->l_flags & OCFS2_LOCK_BLOCKED)); | ||
2609 | |||
2610 | recheck: | ||
2611 | if (lockres->l_flags & OCFS2_LOCK_BUSY) { | ||
2612 | *requeue = 1; | ||
2613 | ret = ocfs2_prepare_cancel_convert(osb, lockres); | ||
2614 | spin_unlock_irqrestore(&lockres->l_lock, flags); | ||
2615 | if (ret) { | ||
2616 | ret = ocfs2_cancel_convert(osb, lockres); | ||
2617 | if (ret < 0) | ||
2618 | mlog_errno(ret); | ||
2619 | } | ||
2620 | goto leave; | ||
2621 | } | ||
2622 | |||
2623 | /* if we're blocking an exclusive and we have *any* holders, | ||
2624 | * then requeue. */ | ||
2625 | if ((lockres->l_blocking == LKM_EXMODE) | ||
2626 | && (lockres->l_ex_holders || lockres->l_ro_holders)) { | ||
2627 | spin_unlock_irqrestore(&lockres->l_lock, flags); | ||
2628 | *requeue = 1; | ||
2629 | ret = 0; | ||
2630 | goto leave; | ||
2631 | } | ||
2632 | |||
2633 | /* If it's a PR we're blocking, then only | ||
2634 | * requeue if we've got any EX holders */ | ||
2635 | if (lockres->l_blocking == LKM_PRMODE && | ||
2636 | lockres->l_ex_holders) { | ||
2637 | spin_unlock_irqrestore(&lockres->l_lock, flags); | ||
2638 | *requeue = 1; | ||
2639 | ret = 0; | ||
2640 | goto leave; | ||
2641 | } | ||
2642 | |||
2643 | /* If we get here, then we know that there are no more | ||
2644 | * incompatible holders (and anyone asking for an incompatible | ||
2645 | * lock is blocked). We can now downconvert the lock */ | ||
2646 | if (!worker) | ||
2647 | goto downconvert; | ||
2648 | |||
2649 | /* Some lockres types want to do a bit of work before | ||
2650 | * downconverting a lock. Allow that here. The worker function | ||
2651 | * may sleep, so we save off a copy of what we're blocking as | ||
2652 | * it may change while we're not holding the spin lock. */ | ||
2653 | blocking = lockres->l_blocking; | ||
2654 | spin_unlock_irqrestore(&lockres->l_lock, flags); | ||
2655 | |||
2656 | worker(lockres, blocking); | ||
2657 | |||
2658 | spin_lock_irqsave(&lockres->l_lock, flags); | ||
2659 | if (blocking != lockres->l_blocking) { | ||
2660 | /* If this changed underneath us, then we can't drop | ||
2661 | * it just yet. */ | ||
2662 | goto recheck; | ||
2663 | } | ||
2664 | |||
2665 | downconvert: | ||
2666 | *requeue = 0; | ||
2667 | new_level = ocfs2_highest_compat_lock_level(lockres->l_blocking); | ||
2668 | |||
2669 | ocfs2_prepare_downconvert(lockres, new_level); | ||
2670 | spin_unlock_irqrestore(&lockres->l_lock, flags); | ||
2671 | ret = ocfs2_downconvert_lock(osb, lockres, new_level, 0); | ||
2672 | leave: | ||
2673 | mlog_exit(ret); | ||
2674 | return ret; | ||
2675 | } | ||
2676 | |||
2677 | static void ocfs2_data_convert_worker(struct ocfs2_lock_res *lockres, | ||
2678 | int blocking) | ||
2679 | { | ||
2680 | struct inode *inode; | ||
2681 | struct address_space *mapping; | ||
2682 | |||
2683 | mlog_entry_void(); | ||
2684 | |||
2685 | inode = ocfs2_lock_res_inode(lockres); | ||
2686 | mapping = inode->i_mapping; | ||
2687 | |||
2688 | if (filemap_fdatawrite(mapping)) { | ||
2689 | mlog(ML_ERROR, "Could not sync inode %"MLFu64" for downconvert!", | ||
2690 | OCFS2_I(inode)->ip_blkno); | ||
2691 | } | ||
2692 | sync_mapping_buffers(mapping); | ||
2693 | if (blocking == LKM_EXMODE) { | ||
2694 | truncate_inode_pages(mapping, 0); | ||
2695 | unmap_mapping_range(mapping, 0, 0, 0); | ||
2696 | } else { | ||
2697 | /* We only need to wait on the I/O if we're not also | ||
2698 | * truncating pages because truncate_inode_pages waits | ||
2699 | * for us above. We don't truncate pages if we're | ||
2700 | * blocking anything < EXMODE because we want to keep | ||
2701 | * them around in that case. */ | ||
2702 | filemap_fdatawait(mapping); | ||
2703 | } | ||
2704 | |||
2705 | mlog_exit_void(); | ||
2706 | } | ||
2707 | |||
2708 | int ocfs2_unblock_data(struct ocfs2_lock_res *lockres, | ||
2709 | int *requeue) | ||
2710 | { | ||
2711 | int status; | ||
2712 | struct inode *inode; | ||
2713 | struct ocfs2_super *osb; | ||
2714 | |||
2715 | mlog_entry_void(); | ||
2716 | |||
2717 | inode = ocfs2_lock_res_inode(lockres); | ||
2718 | osb = OCFS2_SB(inode->i_sb); | ||
2719 | |||
2720 | mlog(0, "unblock inode %"MLFu64"\n", OCFS2_I(inode)->ip_blkno); | ||
2721 | |||
2722 | status = ocfs2_generic_unblock_lock(osb, | ||
2723 | lockres, | ||
2724 | requeue, | ||
2725 | ocfs2_data_convert_worker); | ||
2726 | if (status < 0) | ||
2727 | mlog_errno(status); | ||
2728 | |||
2729 | mlog(0, "inode %"MLFu64", requeue = %d\n", | ||
2730 | OCFS2_I(inode)->ip_blkno, *requeue); | ||
2731 | |||
2732 | mlog_exit(status); | ||
2733 | return status; | ||
2734 | } | ||
2735 | |||
2736 | static int ocfs2_unblock_inode_lock(struct ocfs2_lock_res *lockres, | ||
2737 | int *requeue) | ||
2738 | { | ||
2739 | int status; | ||
2740 | struct inode *inode; | ||
2741 | |||
2742 | mlog_entry_void(); | ||
2743 | |||
2744 | mlog(0, "Unblock lockres %s\n", lockres->l_name); | ||
2745 | |||
2746 | inode = ocfs2_lock_res_inode(lockres); | ||
2747 | |||
2748 | status = ocfs2_generic_unblock_lock(OCFS2_SB(inode->i_sb), | ||
2749 | lockres, | ||
2750 | requeue, | ||
2751 | NULL); | ||
2752 | if (status < 0) | ||
2753 | mlog_errno(status); | ||
2754 | |||
2755 | mlog_exit(status); | ||
2756 | return status; | ||
2757 | } | ||
2758 | |||
2759 | |||
2760 | int ocfs2_unblock_meta(struct ocfs2_lock_res *lockres, | ||
2761 | int *requeue) | ||
2762 | { | ||
2763 | int status; | ||
2764 | struct inode *inode; | ||
2765 | |||
2766 | mlog_entry_void(); | ||
2767 | |||
2768 | inode = ocfs2_lock_res_inode(lockres); | ||
2769 | |||
2770 | mlog(0, "unblock inode %"MLFu64"\n", OCFS2_I(inode)->ip_blkno); | ||
2771 | |||
2772 | status = ocfs2_do_unblock_meta(inode, requeue); | ||
2773 | if (status < 0) | ||
2774 | mlog_errno(status); | ||
2775 | |||
2776 | mlog(0, "inode %"MLFu64", requeue = %d\n", | ||
2777 | OCFS2_I(inode)->ip_blkno, *requeue); | ||
2778 | |||
2779 | mlog_exit(status); | ||
2780 | return status; | ||
2781 | } | ||
2782 | |||
2783 | /* Generic unblock function for any lockres whose private data is an | ||
2784 | * ocfs2_super pointer. */ | ||
2785 | static int ocfs2_unblock_osb_lock(struct ocfs2_lock_res *lockres, | ||
2786 | int *requeue) | ||
2787 | { | ||
2788 | int status; | ||
2789 | struct ocfs2_super *osb; | ||
2790 | |||
2791 | mlog_entry_void(); | ||
2792 | |||
2793 | mlog(0, "Unblock lockres %s\n", lockres->l_name); | ||
2794 | |||
2795 | osb = ocfs2_lock_res_super(lockres); | ||
2796 | |||
2797 | status = ocfs2_generic_unblock_lock(osb, | ||
2798 | lockres, | ||
2799 | requeue, | ||
2800 | NULL); | ||
2801 | if (status < 0) | ||
2802 | mlog_errno(status); | ||
2803 | |||
2804 | mlog_exit(status); | ||
2805 | return status; | ||
2806 | } | ||
2807 | |||
2808 | void ocfs2_process_blocked_lock(struct ocfs2_super *osb, | ||
2809 | struct ocfs2_lock_res *lockres) | ||
2810 | { | ||
2811 | int status; | ||
2812 | int requeue = 0; | ||
2813 | unsigned long flags; | ||
2814 | |||
2815 | /* Our reference to the lockres in this function can be | ||
2816 | * considered valid until we remove the OCFS2_LOCK_QUEUED | ||
2817 | * flag. */ | ||
2818 | |||
2819 | mlog_entry_void(); | ||
2820 | |||
2821 | BUG_ON(!lockres); | ||
2822 | BUG_ON(!lockres->l_ops); | ||
2823 | BUG_ON(!lockres->l_ops->unblock); | ||
2824 | |||
2825 | mlog(0, "lockres %s blocked.\n", lockres->l_name); | ||
2826 | |||
2827 | /* Detect whether a lock has been marked as going away while | ||
2828 | * the vote thread was processing other things. A lock can | ||
2829 | * still be marked with OCFS2_LOCK_FREEING after this check, | ||
2830 | * but short circuiting here will still save us some | ||
2831 | * performance. */ | ||
2832 | spin_lock_irqsave(&lockres->l_lock, flags); | ||
2833 | if (lockres->l_flags & OCFS2_LOCK_FREEING) | ||
2834 | goto unqueue; | ||
2835 | spin_unlock_irqrestore(&lockres->l_lock, flags); | ||
2836 | |||
2837 | status = lockres->l_ops->unblock(lockres, &requeue); | ||
2838 | if (status < 0) | ||
2839 | mlog_errno(status); | ||
2840 | |||
2841 | spin_lock_irqsave(&lockres->l_lock, flags); | ||
2842 | unqueue: | ||
2843 | if (lockres->l_flags & OCFS2_LOCK_FREEING || !requeue) { | ||
2844 | lockres_clear_flags(lockres, OCFS2_LOCK_QUEUED); | ||
2845 | } else | ||
2846 | ocfs2_schedule_blocked_lock(osb, lockres); | ||
2847 | |||
2848 | mlog(0, "lockres %s, requeue = %s.\n", lockres->l_name, | ||
2849 | requeue ? "yes" : "no"); | ||
2850 | spin_unlock_irqrestore(&lockres->l_lock, flags); | ||
2851 | |||
2852 | mlog_exit_void(); | ||
2853 | } | ||
2854 | |||
2855 | static void ocfs2_schedule_blocked_lock(struct ocfs2_super *osb, | ||
2856 | struct ocfs2_lock_res *lockres) | ||
2857 | { | ||
2858 | mlog_entry_void(); | ||
2859 | |||
2860 | assert_spin_locked(&lockres->l_lock); | ||
2861 | |||
2862 | if (lockres->l_flags & OCFS2_LOCK_FREEING) { | ||
2863 | /* Do not schedule a lock for downconvert when it's on | ||
2864 | * the way to destruction - any nodes wanting access | ||
2865 | * to the resource will get it soon. */ | ||
2866 | mlog(0, "Lockres %s won't be scheduled: flags 0x%lx\n", | ||
2867 | lockres->l_name, lockres->l_flags); | ||
2868 | return; | ||
2869 | } | ||
2870 | |||
2871 | lockres_or_flags(lockres, OCFS2_LOCK_QUEUED); | ||
2872 | |||
2873 | spin_lock(&osb->vote_task_lock); | ||
2874 | if (list_empty(&lockres->l_blocked_list)) { | ||
2875 | list_add_tail(&lockres->l_blocked_list, | ||
2876 | &osb->blocked_lock_list); | ||
2877 | osb->blocked_lock_count++; | ||
2878 | } | ||
2879 | spin_unlock(&osb->vote_task_lock); | ||
2880 | |||
2881 | mlog_exit_void(); | ||
2882 | } | ||
2883 | |||
2884 | /* This aids in debugging situations where a bad LVB might be involved. */ | ||
2885 | void ocfs2_dump_meta_lvb_info(u64 level, | ||
2886 | const char *function, | ||
2887 | unsigned int line, | ||
2888 | struct ocfs2_lock_res *lockres) | ||
2889 | { | ||
2890 | struct ocfs2_meta_lvb *lvb = (struct ocfs2_meta_lvb *) lockres->l_lksb.lvb; | ||
2891 | |||
2892 | mlog(level, "LVB information for %s (called from %s:%u):\n", | ||
2893 | lockres->l_name, function, line); | ||
2894 | mlog(level, "version: %u, clusters: %u\n", | ||
2895 | be32_to_cpu(lvb->lvb_version), be32_to_cpu(lvb->lvb_iclusters)); | ||
2896 | mlog(level, "size: %"MLFu64", uid %u, gid %u, mode 0x%x\n", | ||
2897 | be64_to_cpu(lvb->lvb_isize), be32_to_cpu(lvb->lvb_iuid), | ||
2898 | be32_to_cpu(lvb->lvb_igid), be16_to_cpu(lvb->lvb_imode)); | ||
2899 | mlog(level, "nlink %u, atime_packed 0x%"MLFx64", " | ||
2900 | "ctime_packed 0x%"MLFx64", mtime_packed 0x%"MLFx64"\n", | ||
2901 | be16_to_cpu(lvb->lvb_inlink), be64_to_cpu(lvb->lvb_iatime_packed), | ||
2902 | be64_to_cpu(lvb->lvb_ictime_packed), | ||
2903 | be64_to_cpu(lvb->lvb_imtime_packed)); | ||
2904 | } | ||
diff --git a/fs/ocfs2/dlmglue.h b/fs/ocfs2/dlmglue.h new file mode 100644 index 000000000000..8f2d1db2d9ea --- /dev/null +++ b/fs/ocfs2/dlmglue.h | |||
@@ -0,0 +1,111 @@ | |||
1 | /* -*- mode: c; c-basic-offset: 8; -*- | ||
2 | * vim: noexpandtab sw=8 ts=8 sts=0: | ||
3 | * | ||
4 | * dlmglue.h | ||
5 | * | ||
6 | * Interface to the cluster locking (DLM glue) code: inode meta/data/rw locks, super and rename locks | ||
7 | * | ||
8 | * Copyright (C) 2002, 2004 Oracle. All rights reserved. | ||
9 | * | ||
10 | * This program is free software; you can redistribute it and/or | ||
11 | * modify it under the terms of the GNU General Public | ||
12 | * License as published by the Free Software Foundation; either | ||
13 | * version 2 of the License, or (at your option) any later version. | ||
14 | * | ||
15 | * This program is distributed in the hope that it will be useful, | ||
16 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
17 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | ||
18 | * General Public License for more details. | ||
19 | * | ||
20 | * You should have received a copy of the GNU General Public | ||
21 | * License along with this program; if not, write to the | ||
22 | * Free Software Foundation, Inc., 59 Temple Place - Suite 330, | ||
23 | * Boston, MA 02111-1307, USA. | ||
24 | */ | ||
25 | |||
26 | |||
27 | #ifndef DLMGLUE_H | ||
28 | #define DLMGLUE_H | ||
29 | |||
30 | #define OCFS2_LVB_VERSION 2 | ||
31 | |||
/* Inode metadata as laid out in the lock value block (LVB) of a
 * lockres; ocfs2_dump_meta_lvb_info() reads l_lksb.lvb through this
 * struct. Every field is stored big-endian. */
struct ocfs2_meta_lvb {
	__be32       lvb_version;	/* presumably checked against OCFS2_LVB_VERSION - confirm */
	__be32       lvb_iclusters;	/* logged as "clusters" */
	__be32       lvb_iuid;
	__be32       lvb_igid;
	__be64       lvb_iatime_packed;	/* packed timestamps; packing format not shown here */
	__be64       lvb_ictime_packed;
	__be64       lvb_imtime_packed;
	__be64       lvb_isize;
	__be16       lvb_imode;
	__be16       lvb_inlink;
	__be32       lvb_reserved[3];
};
45 | |||
46 | /* ocfs2_meta_lock_full() and ocfs2_data_lock_full() 'arg_flags' flags */ | ||
47 | /* don't wait on recovery. */ | ||
48 | #define OCFS2_META_LOCK_RECOVERY (0x01) | ||
49 | /* Instruct the dlm not to queue ourselves on the other node. */ | ||
50 | #define OCFS2_META_LOCK_NOQUEUE (0x02) | ||
51 | /* don't block waiting for the vote thread, instead return -EAGAIN */ | ||
52 | #define OCFS2_LOCK_NONBLOCK (0x04) | ||
53 | |||
54 | int ocfs2_dlm_init(struct ocfs2_super *osb); | ||
55 | void ocfs2_dlm_shutdown(struct ocfs2_super *osb); | ||
56 | void ocfs2_lock_res_init_once(struct ocfs2_lock_res *res); | ||
57 | void ocfs2_inode_lock_res_init(struct ocfs2_lock_res *res, | ||
58 | enum ocfs2_lock_type type, | ||
59 | struct inode *inode); | ||
60 | void ocfs2_lock_res_free(struct ocfs2_lock_res *res); | ||
61 | int ocfs2_create_new_inode_locks(struct inode *inode); | ||
62 | int ocfs2_drop_inode_locks(struct inode *inode); | ||
63 | int ocfs2_data_lock_full(struct inode *inode, | ||
64 | int write, | ||
65 | int arg_flags); | ||
66 | #define ocfs2_data_lock(inode, write) ocfs2_data_lock_full(inode, write, 0) | ||
67 | int ocfs2_data_lock_with_page(struct inode *inode, | ||
68 | int write, | ||
69 | struct page *page); | ||
70 | void ocfs2_data_unlock(struct inode *inode, | ||
71 | int write); | ||
72 | int ocfs2_rw_lock(struct inode *inode, int write); | ||
73 | void ocfs2_rw_unlock(struct inode *inode, int write); | ||
74 | int ocfs2_meta_lock_full(struct inode *inode, | ||
75 | struct ocfs2_journal_handle *handle, | ||
76 | struct buffer_head **ret_bh, | ||
77 | int ex, | ||
78 | int arg_flags); | ||
79 | int ocfs2_meta_lock_with_page(struct inode *inode, | ||
80 | struct ocfs2_journal_handle *handle, | ||
81 | struct buffer_head **ret_bh, | ||
82 | int ex, | ||
83 | struct page *page); | ||
84 | /* 99% of the time we don't want to supply any additional flags -- | ||
85 | * those are for very specific cases only. */ | ||
86 | #define ocfs2_meta_lock(i, h, b, e) ocfs2_meta_lock_full(i, h, b, e, 0) | ||
87 | void ocfs2_meta_unlock(struct inode *inode, | ||
88 | int ex); | ||
89 | int ocfs2_super_lock(struct ocfs2_super *osb, | ||
90 | int ex); | ||
91 | void ocfs2_super_unlock(struct ocfs2_super *osb, | ||
92 | int ex); | ||
93 | int ocfs2_rename_lock(struct ocfs2_super *osb); | ||
94 | void ocfs2_rename_unlock(struct ocfs2_super *osb); | ||
95 | void ocfs2_mark_lockres_freeing(struct ocfs2_lock_res *lockres); | ||
96 | |||
97 | /* for the vote thread */ | ||
98 | void ocfs2_process_blocked_lock(struct ocfs2_super *osb, | ||
99 | struct ocfs2_lock_res *lockres); | ||
100 | |||
101 | struct ocfs2_dlm_debug *ocfs2_new_dlm_debug(void); | ||
102 | void ocfs2_put_dlm_debug(struct ocfs2_dlm_debug *dlm_debug); | ||
103 | |||
104 | /* aids in debugging and tracking lvbs */ | ||
105 | void ocfs2_dump_meta_lvb_info(u64 level, | ||
106 | const char *function, | ||
107 | unsigned int line, | ||
108 | struct ocfs2_lock_res *lockres); | ||
109 | #define mlog_meta_lvb(__level, __lockres) ocfs2_dump_meta_lvb_info(__level, __PRETTY_FUNCTION__, __LINE__, __lockres) | ||
110 | |||
111 | #endif /* DLMGLUE_H */ | ||
diff --git a/fs/ocfs2/endian.h b/fs/ocfs2/endian.h new file mode 100644 index 000000000000..f226b2207628 --- /dev/null +++ b/fs/ocfs2/endian.h | |||
@@ -0,0 +1,45 @@ | |||
1 | /* -*- mode: c; c-basic-offset: 8; -*- | ||
2 | * vim: noexpandtab sw=8 ts=8 sts=0: | ||
3 | * | ||
4 | * Copyright (C) 2005 Oracle. All rights reserved. | ||
5 | * | ||
6 | * This program is free software; you can redistribute it and/or | ||
7 | * modify it under the terms of the GNU General Public | ||
8 | * License as published by the Free Software Foundation; either | ||
9 | * version 2 of the License, or (at your option) any later version. | ||
10 | * | ||
11 | * This program is distributed in the hope that it will be useful, | ||
12 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
13 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | ||
14 | * General Public License for more details. | ||
15 | * | ||
16 | * You should have received a copy of the GNU General Public | ||
17 | * License along with this program; if not, write to the | ||
18 | * Free Software Foundation, Inc., 59 Temple Place - Suite 330, | ||
19 | * Boston, MA 02111-1307, USA. | ||
20 | */ | ||
21 | |||
22 | #ifndef OCFS2_ENDIAN_H | ||
23 | #define OCFS2_ENDIAN_H | ||
24 | |||
25 | static inline void le16_add_cpu(__le16 *var, u16 val) | ||
26 | { | ||
27 | *var = cpu_to_le16(le16_to_cpu(*var) + val); | ||
28 | } | ||
29 | |||
30 | static inline void le32_add_cpu(__le32 *var, u32 val) | ||
31 | { | ||
32 | *var = cpu_to_le32(le32_to_cpu(*var) + val); | ||
33 | } | ||
34 | |||
35 | static inline void le32_and_cpu(__le32 *var, u32 val) | ||
36 | { | ||
37 | *var = cpu_to_le32(le32_to_cpu(*var) & val); | ||
38 | } | ||
39 | |||
40 | static inline void be32_add_cpu(__be32 *var, u32 val) | ||
41 | { | ||
42 | *var = cpu_to_be32(be32_to_cpu(*var) + val); | ||
43 | } | ||
44 | |||
45 | #endif /* OCFS2_ENDIAN_H */ | ||
diff --git a/fs/ocfs2/export.c b/fs/ocfs2/export.c new file mode 100644 index 000000000000..5810160d92a8 --- /dev/null +++ b/fs/ocfs2/export.c | |||
@@ -0,0 +1,248 @@ | |||
1 | /* -*- mode: c; c-basic-offset: 8; -*- | ||
2 | * vim: noexpandtab sw=8 ts=8 sts=0: | ||
3 | * | ||
4 | * export.c | ||
5 | * | ||
6 | * Functions to facilitate NFS exporting | ||
7 | * | ||
8 | * Copyright (C) 2002, 2005 Oracle. All rights reserved. | ||
9 | * | ||
10 | * This program is free software; you can redistribute it and/or | ||
11 | * modify it under the terms of the GNU General Public | ||
12 | * License as published by the Free Software Foundation; either | ||
13 | * version 2 of the License, or (at your option) any later version. | ||
14 | * | ||
15 | * This program is distributed in the hope that it will be useful, | ||
16 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
17 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | ||
18 | * General Public License for more details. | ||
19 | * | ||
20 | * You should have received a copy of the GNU General Public | ||
21 | * License along with this program; if not, write to the | ||
22 | * Free Software Foundation, Inc., 59 Temple Place - Suite 330, | ||
23 | * Boston, MA 02111-1307, USA. | ||
24 | */ | ||
25 | |||
26 | #include <linux/fs.h> | ||
27 | #include <linux/types.h> | ||
28 | |||
29 | #define MLOG_MASK_PREFIX ML_EXPORT | ||
30 | #include <cluster/masklog.h> | ||
31 | |||
32 | #include "ocfs2.h" | ||
33 | |||
34 | #include "dir.h" | ||
35 | #include "dlmglue.h" | ||
36 | #include "export.h" | ||
37 | #include "inode.h" | ||
38 | |||
39 | #include "buffer_head_io.h" | ||
40 | |||
/* Identifies an inode for ocfs2_get_dentry(): the block number handed
 * to ocfs2_iget() plus the generation the inode is expected to have. */
struct ocfs2_inode_handle
{
	u64 ih_blkno;		/* inode block number; 0 is rejected as stale */
	u32 ih_generation;	/* must match inode->i_generation */
};
46 | |||
47 | static struct dentry *ocfs2_get_dentry(struct super_block *sb, void *vobjp) | ||
48 | { | ||
49 | struct ocfs2_inode_handle *handle = vobjp; | ||
50 | struct inode *inode; | ||
51 | struct dentry *result; | ||
52 | |||
53 | mlog_entry("(0x%p, 0x%p)\n", sb, handle); | ||
54 | |||
55 | if (handle->ih_blkno == 0) { | ||
56 | mlog_errno(-ESTALE); | ||
57 | return ERR_PTR(-ESTALE); | ||
58 | } | ||
59 | |||
60 | inode = ocfs2_iget(OCFS2_SB(sb), handle->ih_blkno); | ||
61 | |||
62 | if (IS_ERR(inode)) { | ||
63 | mlog_errno(PTR_ERR(inode)); | ||
64 | return (void *)inode; | ||
65 | } | ||
66 | |||
67 | if (handle->ih_generation != inode->i_generation) { | ||
68 | iput(inode); | ||
69 | mlog_errno(-ESTALE); | ||
70 | return ERR_PTR(-ESTALE); | ||
71 | } | ||
72 | |||
73 | result = d_alloc_anon(inode); | ||
74 | |||
75 | if (!result) { | ||
76 | iput(inode); | ||
77 | mlog_errno(-ENOMEM); | ||
78 | return ERR_PTR(-ENOMEM); | ||
79 | } | ||
80 | |||
81 | mlog_exit_ptr(result); | ||
82 | return result; | ||
83 | } | ||
84 | |||
85 | static struct dentry *ocfs2_get_parent(struct dentry *child) | ||
86 | { | ||
87 | int status; | ||
88 | u64 blkno; | ||
89 | struct dentry *parent; | ||
90 | struct inode *inode; | ||
91 | struct inode *dir = child->d_inode; | ||
92 | struct buffer_head *dirent_bh = NULL; | ||
93 | struct ocfs2_dir_entry *dirent; | ||
94 | |||
95 | mlog_entry("(0x%p, '%.*s')\n", child, | ||
96 | child->d_name.len, child->d_name.name); | ||
97 | |||
98 | mlog(0, "find parent of directory %"MLFu64"\n", | ||
99 | OCFS2_I(dir)->ip_blkno); | ||
100 | |||
101 | status = ocfs2_meta_lock(dir, NULL, NULL, 0); | ||
102 | if (status < 0) { | ||
103 | if (status != -ENOENT) | ||
104 | mlog_errno(status); | ||
105 | parent = ERR_PTR(status); | ||
106 | goto bail; | ||
107 | } | ||
108 | |||
109 | status = ocfs2_find_files_on_disk("..", 2, &blkno, dir, &dirent_bh, | ||
110 | &dirent); | ||
111 | if (status < 0) { | ||
112 | parent = ERR_PTR(-ENOENT); | ||
113 | goto bail_unlock; | ||
114 | } | ||
115 | |||
116 | inode = ocfs2_iget(OCFS2_SB(dir->i_sb), blkno); | ||
117 | if (IS_ERR(inode)) { | ||
118 | mlog(ML_ERROR, "Unable to create inode %"MLFu64"\n", blkno); | ||
119 | parent = ERR_PTR(-EACCES); | ||
120 | goto bail_unlock; | ||
121 | } | ||
122 | |||
123 | parent = d_alloc_anon(inode); | ||
124 | if (!parent) { | ||
125 | iput(inode); | ||
126 | parent = ERR_PTR(-ENOMEM); | ||
127 | } | ||
128 | |||
129 | bail_unlock: | ||
130 | ocfs2_meta_unlock(dir, 0); | ||
131 | |||
132 | if (dirent_bh) | ||
133 | brelse(dirent_bh); | ||
134 | |||
135 | bail: | ||
136 | mlog_exit_ptr(parent); | ||
137 | |||
138 | return parent; | ||
139 | } | ||
140 | |||
141 | static int ocfs2_encode_fh(struct dentry *dentry, __be32 *fh, int *max_len, | ||
142 | int connectable) | ||
143 | { | ||
144 | struct inode *inode = dentry->d_inode; | ||
145 | int len = *max_len; | ||
146 | int type = 1; | ||
147 | u64 blkno; | ||
148 | u32 generation; | ||
149 | |||
150 | mlog_entry("(0x%p, '%.*s', 0x%p, %d, %d)\n", dentry, | ||
151 | dentry->d_name.len, dentry->d_name.name, | ||
152 | fh, len, connectable); | ||
153 | |||
154 | if (len < 3 || (connectable && len < 6)) { | ||
155 | mlog(ML_ERROR, "fh buffer is too small for encoding\n"); | ||
156 | type = 255; | ||
157 | goto bail; | ||
158 | } | ||
159 | |||
160 | blkno = OCFS2_I(inode)->ip_blkno; | ||
161 | generation = inode->i_generation; | ||
162 | |||
163 | mlog(0, "Encoding fh: blkno: %"MLFu64", generation: %u\n", | ||
164 | blkno, generation); | ||
165 | |||
166 | len = 3; | ||
167 | fh[0] = cpu_to_le32((u32)(blkno >> 32)); | ||
168 | fh[1] = cpu_to_le32((u32)(blkno & 0xffffffff)); | ||
169 | fh[2] = cpu_to_le32(generation); | ||
170 | |||
171 | if (connectable && !S_ISDIR(inode->i_mode)) { | ||
172 | struct inode *parent; | ||
173 | |||
174 | spin_lock(&dentry->d_lock); | ||
175 | |||
176 | parent = dentry->d_parent->d_inode; | ||
177 | blkno = OCFS2_I(parent)->ip_blkno; | ||
178 | generation = parent->i_generation; | ||
179 | |||
180 | fh[3] = cpu_to_le32((u32)(blkno >> 32)); | ||
181 | fh[4] = cpu_to_le32((u32)(blkno & 0xffffffff)); | ||
182 | fh[5] = cpu_to_le32(generation); | ||
183 | |||
184 | spin_unlock(&dentry->d_lock); | ||
185 | |||
186 | len = 6; | ||
187 | type = 2; | ||
188 | |||
189 | mlog(0, "Encoding parent: blkno: %"MLFu64", generation: %u\n", | ||
190 | blkno, generation); | ||
191 | } | ||
192 | |||
193 | *max_len = len; | ||
194 | |||
195 | bail: | ||
196 | mlog_exit(type); | ||
197 | return type; | ||
198 | } | ||
199 | |||
200 | static struct dentry *ocfs2_decode_fh(struct super_block *sb, __be32 *fh, | ||
201 | int fh_len, int fileid_type, | ||
202 | int (*acceptable)(void *context, | ||
203 | struct dentry *de), | ||
204 | void *context) | ||
205 | { | ||
206 | struct ocfs2_inode_handle handle, parent; | ||
207 | struct dentry *ret = NULL; | ||
208 | |||
209 | mlog_entry("(0x%p, 0x%p, %d, %d, 0x%p, 0x%p)\n", | ||
210 | sb, fh, fh_len, fileid_type, acceptable, context); | ||
211 | |||
212 | if (fh_len < 3 || fileid_type > 2) | ||
213 | goto bail; | ||
214 | |||
215 | if (fileid_type == 2) { | ||
216 | if (fh_len < 6) | ||
217 | goto bail; | ||
218 | |||
219 | parent.ih_blkno = (u64)le32_to_cpu(fh[3]) << 32; | ||
220 | parent.ih_blkno |= (u64)le32_to_cpu(fh[4]); | ||
221 | parent.ih_generation = le32_to_cpu(fh[5]); | ||
222 | |||
223 | mlog(0, "Decoding parent: blkno: %"MLFu64", generation: %u\n", | ||
224 | parent.ih_blkno, parent.ih_generation); | ||
225 | } | ||
226 | |||
227 | handle.ih_blkno = (u64)le32_to_cpu(fh[0]) << 32; | ||
228 | handle.ih_blkno |= (u64)le32_to_cpu(fh[1]); | ||
229 | handle.ih_generation = le32_to_cpu(fh[2]); | ||
230 | |||
231 | mlog(0, "Encoding fh: blkno: %"MLFu64", generation: %u\n", | ||
232 | handle.ih_blkno, handle.ih_generation); | ||
233 | |||
234 | ret = ocfs2_export_ops.find_exported_dentry(sb, &handle, &parent, | ||
235 | acceptable, context); | ||
236 | |||
237 | bail: | ||
238 | mlog_exit_ptr(ret); | ||
239 | return ret; | ||
240 | } | ||
241 | |||
242 | struct export_operations ocfs2_export_ops = { | ||
243 | .decode_fh = ocfs2_decode_fh, | ||
244 | .encode_fh = ocfs2_encode_fh, | ||
245 | |||
246 | .get_parent = ocfs2_get_parent, | ||
247 | .get_dentry = ocfs2_get_dentry, | ||
248 | }; | ||
diff --git a/fs/ocfs2/export.h b/fs/ocfs2/export.h new file mode 100644 index 000000000000..5b77ee7866ef --- /dev/null +++ b/fs/ocfs2/export.h | |||
@@ -0,0 +1,31 @@ | |||
1 | /* -*- mode: c; c-basic-offset: 8; -*- | ||
2 | * vim: noexpandtab sw=8 ts=8 sts=0: | ||
3 | * | ||
4 | * export.h | ||
5 | * | ||
6 | * Function prototypes | ||
7 | * | ||
8 | * Copyright (C) 2002, 2005 Oracle. All rights reserved. | ||
9 | * | ||
10 | * This program is free software; you can redistribute it and/or | ||
11 | * modify it under the terms of the GNU General Public | ||
12 | * License as published by the Free Software Foundation; either | ||
13 | * version 2 of the License, or (at your option) any later version. | ||
14 | * | ||
15 | * This program is distributed in the hope that it will be useful, | ||
16 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
17 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | ||
18 | * General Public License for more details. | ||
19 | * | ||
20 | * You should have received a copy of the GNU General Public | ||
21 | * License along with this program; if not, write to the | ||
22 | * Free Software Foundation, Inc., 59 Temple Place - Suite 330, | ||
23 | * Boston, MA 02111-1307, USA. | ||
24 | */ | ||
25 | |||
26 | #ifndef OCFS2_EXPORT_H | ||
27 | #define OCFS2_EXPORT_H | ||
28 | |||
29 | extern struct export_operations ocfs2_export_ops; | ||
30 | |||
31 | #endif /* OCFS2_EXPORT_H */ | ||
diff --git a/fs/ocfs2/extent_map.c b/fs/ocfs2/extent_map.c new file mode 100644 index 000000000000..f2fb40cd296a --- /dev/null +++ b/fs/ocfs2/extent_map.c | |||
@@ -0,0 +1,994 @@ | |||
1 | /* -*- mode: c; c-basic-offset: 8; -*- | ||
2 | * vim: noexpandtab sw=8 ts=8 sts=0: | ||
3 | * | ||
4 | * extent_map.c | ||
5 | * | ||
6 | * In-memory extent map for OCFS2. Man, this code was prettier in | ||
7 | * the library. | ||
8 | * | ||
9 | * Copyright (C) 2004 Oracle. All rights reserved. | ||
10 | * | ||
11 | * This program is free software; you can redistribute it and/or | ||
12 | * modify it under the terms of the GNU General Public | ||
13 | * License, version 2, as published by the Free Software Foundation. | ||
14 | * | ||
15 | * This program is distributed in the hope that it will be useful, | ||
16 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
17 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | ||
18 | * General Public License for more details. | ||
19 | * | ||
20 | * You should have received a copy of the GNU General Public | ||
21 | * License along with this program; if not, write to the | ||
22 | * Free Software Foundation, Inc., 59 Temple Place - Suite 330, | ||
23 | * Boston, MA 02111-1307, USA. | ||
24 | */ | ||
25 | |||
26 | #include <linux/fs.h> | ||
27 | #include <linux/init.h> | ||
28 | #include <linux/types.h> | ||
29 | #include <linux/slab.h> | ||
30 | #include <linux/rbtree.h> | ||
31 | |||
32 | #define MLOG_MASK_PREFIX ML_EXTENT_MAP | ||
33 | #include <cluster/masklog.h> | ||
34 | |||
35 | #include "ocfs2.h" | ||
36 | |||
37 | #include "extent_map.h" | ||
38 | #include "inode.h" | ||
39 | #include "super.h" | ||
40 | |||
41 | #include "buffer_head_io.h" | ||
42 | |||
43 | |||
44 | /* | ||
45 | * SUCK SUCK SUCK | ||
46 | * Our headers are so bad that struct ocfs2_extent_map is in ocfs.h | ||
47 | */ | ||
48 | |||
49 | struct ocfs2_extent_map_entry { | ||
50 | struct rb_node e_node; | ||
51 | int e_tree_depth; | ||
52 | struct ocfs2_extent_rec e_rec; | ||
53 | }; | ||
54 | |||
/*
 * Scratch state shared between ocfs2_extent_map_insert() and
 * ocfs2_extent_map_try_insert() across -EAGAIN retries.  Any entry
 * still referenced here when insertion finishes is freed by
 * ocfs2_extent_map_insert().
 */
struct ocfs2_em_insert_context {
	int need_left;		/* old entry extends left of the new one */
	int need_right;		/* old entry extends right of the new one */
	struct ocfs2_extent_map_entry *new_ent;	  /* entry being inserted */
	struct ocfs2_extent_map_entry *old_ent;	  /* entry erased from the
						   * tree, to be freed */
	struct ocfs2_extent_map_entry *left_ent;  /* preallocated left piece */
	struct ocfs2_extent_map_entry *right_ent; /* preallocated right piece */
};
63 | |||
64 | static kmem_cache_t *ocfs2_em_ent_cachep = NULL; | ||
65 | |||
66 | |||
67 | static struct ocfs2_extent_map_entry * | ||
68 | ocfs2_extent_map_lookup(struct ocfs2_extent_map *em, | ||
69 | u32 cpos, u32 clusters, | ||
70 | struct rb_node ***ret_p, | ||
71 | struct rb_node **ret_parent); | ||
72 | static int ocfs2_extent_map_insert(struct inode *inode, | ||
73 | struct ocfs2_extent_rec *rec, | ||
74 | int tree_depth); | ||
75 | static int ocfs2_extent_map_insert_entry(struct ocfs2_extent_map *em, | ||
76 | struct ocfs2_extent_map_entry *ent); | ||
77 | static int ocfs2_extent_map_find_leaf(struct inode *inode, | ||
78 | u32 cpos, u32 clusters, | ||
79 | struct ocfs2_extent_list *el); | ||
80 | static int ocfs2_extent_map_lookup_read(struct inode *inode, | ||
81 | u32 cpos, u32 clusters, | ||
82 | struct ocfs2_extent_map_entry **ret_ent); | ||
83 | static int ocfs2_extent_map_try_insert(struct inode *inode, | ||
84 | struct ocfs2_extent_rec *rec, | ||
85 | int tree_depth, | ||
86 | struct ocfs2_em_insert_context *ctxt); | ||
87 | |||
88 | /* returns 1 only if the rec contains all the given clusters -- that is that | ||
89 | * rec's cpos is <= the cluster cpos and that the rec endpoint (cpos + | ||
90 | * clusters) is >= the argument's endpoint */ | ||
91 | static int ocfs2_extent_rec_contains_clusters(struct ocfs2_extent_rec *rec, | ||
92 | u32 cpos, u32 clusters) | ||
93 | { | ||
94 | if (le32_to_cpu(rec->e_cpos) > cpos) | ||
95 | return 0; | ||
96 | if (cpos + clusters > le32_to_cpu(rec->e_cpos) + | ||
97 | le32_to_cpu(rec->e_clusters)) | ||
98 | return 0; | ||
99 | return 1; | ||
100 | } | ||
101 | |||
102 | |||
103 | /* | ||
104 | * Find an entry in the tree that intersects the region passed in. | ||
105 | * Note that this will find straddled intervals, it is up to the | ||
106 | * callers to enforce any boundary conditions. | ||
107 | * | ||
108 | * Callers must hold ip_lock. This lookup is not guaranteed to return | ||
109 | * a tree_depth 0 match, and as such can race inserts if the lock | ||
110 | * were not held. | ||
111 | * | ||
112 | * The rb_node garbage lets insertion share the search. Trivial | ||
113 | * callers pass NULL. | ||
114 | */ | ||
115 | static struct ocfs2_extent_map_entry * | ||
116 | ocfs2_extent_map_lookup(struct ocfs2_extent_map *em, | ||
117 | u32 cpos, u32 clusters, | ||
118 | struct rb_node ***ret_p, | ||
119 | struct rb_node **ret_parent) | ||
120 | { | ||
121 | struct rb_node **p = &em->em_extents.rb_node; | ||
122 | struct rb_node *parent = NULL; | ||
123 | struct ocfs2_extent_map_entry *ent = NULL; | ||
124 | |||
125 | while (*p) | ||
126 | { | ||
127 | parent = *p; | ||
128 | ent = rb_entry(parent, struct ocfs2_extent_map_entry, | ||
129 | e_node); | ||
130 | if ((cpos + clusters) <= le32_to_cpu(ent->e_rec.e_cpos)) { | ||
131 | p = &(*p)->rb_left; | ||
132 | ent = NULL; | ||
133 | } else if (cpos >= (le32_to_cpu(ent->e_rec.e_cpos) + | ||
134 | le32_to_cpu(ent->e_rec.e_clusters))) { | ||
135 | p = &(*p)->rb_right; | ||
136 | ent = NULL; | ||
137 | } else | ||
138 | break; | ||
139 | } | ||
140 | |||
141 | if (ret_p != NULL) | ||
142 | *ret_p = p; | ||
143 | if (ret_parent != NULL) | ||
144 | *ret_parent = parent; | ||
145 | return ent; | ||
146 | } | ||
147 | |||
148 | /* | ||
149 | * Find the leaf containing the interval we want. While we're on our | ||
150 | * way down the tree, fill in every record we see at any depth, because | ||
151 | * we might want it later. | ||
152 | * | ||
153 | * Note that this code is run without ip_lock. That's because it | ||
154 | * sleeps while reading. If someone is also filling the extent list at | ||
155 | * the same time we are, we might have to restart. | ||
156 | */ | ||
157 | static int ocfs2_extent_map_find_leaf(struct inode *inode, | ||
158 | u32 cpos, u32 clusters, | ||
159 | struct ocfs2_extent_list *el) | ||
160 | { | ||
161 | int i, ret; | ||
162 | struct buffer_head *eb_bh = NULL; | ||
163 | u64 blkno; | ||
164 | u32 rec_end; | ||
165 | struct ocfs2_extent_block *eb; | ||
166 | struct ocfs2_extent_rec *rec; | ||
167 | |||
168 | /* | ||
169 | * The bh data containing the el cannot change here, because | ||
170 | * we hold alloc_sem. So we can do this without other | ||
171 | * locks. | ||
172 | */ | ||
173 | while (el->l_tree_depth) | ||
174 | { | ||
175 | blkno = 0; | ||
176 | for (i = 0; i < le16_to_cpu(el->l_next_free_rec); i++) { | ||
177 | rec = &el->l_recs[i]; | ||
178 | rec_end = (le32_to_cpu(rec->e_cpos) + | ||
179 | le32_to_cpu(rec->e_clusters)); | ||
180 | |||
181 | ret = -EBADR; | ||
182 | if (rec_end > OCFS2_I(inode)->ip_clusters) { | ||
183 | mlog_errno(ret); | ||
184 | goto out_free; | ||
185 | } | ||
186 | |||
187 | if (rec_end <= cpos) { | ||
188 | ret = ocfs2_extent_map_insert(inode, rec, | ||
189 | le16_to_cpu(el->l_tree_depth)); | ||
190 | if (ret && (ret != -EEXIST)) { | ||
191 | mlog_errno(ret); | ||
192 | goto out_free; | ||
193 | } | ||
194 | continue; | ||
195 | } | ||
196 | if ((cpos + clusters) <= le32_to_cpu(rec->e_cpos)) { | ||
197 | ret = ocfs2_extent_map_insert(inode, rec, | ||
198 | le16_to_cpu(el->l_tree_depth)); | ||
199 | if (ret && (ret != -EEXIST)) { | ||
200 | mlog_errno(ret); | ||
201 | goto out_free; | ||
202 | } | ||
203 | continue; | ||
204 | } | ||
205 | |||
206 | /* | ||
207 | * We've found a record that matches our | ||
208 | * interval. We don't insert it because we're | ||
209 | * about to traverse it. | ||
210 | */ | ||
211 | |||
212 | /* Check to see if we're stradling */ | ||
213 | ret = -ESRCH; | ||
214 | if (!ocfs2_extent_rec_contains_clusters(rec, | ||
215 | cpos, | ||
216 | clusters)) { | ||
217 | mlog_errno(ret); | ||
218 | goto out_free; | ||
219 | } | ||
220 | |||
221 | /* | ||
222 | * If we've already found a record, the el has | ||
223 | * two records covering the same interval. | ||
224 | * EEEK! | ||
225 | */ | ||
226 | ret = -EBADR; | ||
227 | if (blkno) { | ||
228 | mlog_errno(ret); | ||
229 | goto out_free; | ||
230 | } | ||
231 | |||
232 | blkno = le64_to_cpu(rec->e_blkno); | ||
233 | } | ||
234 | |||
235 | /* | ||
236 | * We don't support holes, and we're still up | ||
237 | * in the branches, so we'd better have found someone | ||
238 | */ | ||
239 | ret = -EBADR; | ||
240 | if (!blkno) { | ||
241 | mlog_errno(ret); | ||
242 | goto out_free; | ||
243 | } | ||
244 | |||
245 | if (eb_bh) { | ||
246 | brelse(eb_bh); | ||
247 | eb_bh = NULL; | ||
248 | } | ||
249 | ret = ocfs2_read_block(OCFS2_SB(inode->i_sb), | ||
250 | blkno, &eb_bh, OCFS2_BH_CACHED, | ||
251 | inode); | ||
252 | if (ret) { | ||
253 | mlog_errno(ret); | ||
254 | goto out_free; | ||
255 | } | ||
256 | eb = (struct ocfs2_extent_block *)eb_bh->b_data; | ||
257 | if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) { | ||
258 | OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb, eb); | ||
259 | ret = -EIO; | ||
260 | goto out_free; | ||
261 | } | ||
262 | el = &eb->h_list; | ||
263 | } | ||
264 | |||
265 | if (el->l_tree_depth) | ||
266 | BUG(); | ||
267 | |||
268 | for (i = 0; i < le16_to_cpu(el->l_next_free_rec); i++) { | ||
269 | rec = &el->l_recs[i]; | ||
270 | ret = ocfs2_extent_map_insert(inode, rec, | ||
271 | le16_to_cpu(el->l_tree_depth)); | ||
272 | if (ret) { | ||
273 | mlog_errno(ret); | ||
274 | goto out_free; | ||
275 | } | ||
276 | } | ||
277 | |||
278 | ret = 0; | ||
279 | |||
280 | out_free: | ||
281 | if (eb_bh) | ||
282 | brelse(eb_bh); | ||
283 | |||
284 | return ret; | ||
285 | } | ||
286 | |||
287 | /* | ||
288 | * This lookup actually will read from disk. It has one invariant: | ||
289 | * It will never re-traverse blocks. This means that all inserts should | ||
290 | * be new regions or more granular regions (both allowed by insert). | ||
291 | */ | ||
292 | static int ocfs2_extent_map_lookup_read(struct inode *inode, | ||
293 | u32 cpos, | ||
294 | u32 clusters, | ||
295 | struct ocfs2_extent_map_entry **ret_ent) | ||
296 | { | ||
297 | int ret; | ||
298 | u64 blkno; | ||
299 | struct ocfs2_extent_map *em = &OCFS2_I(inode)->ip_map; | ||
300 | struct ocfs2_extent_map_entry *ent; | ||
301 | struct buffer_head *bh = NULL; | ||
302 | struct ocfs2_extent_block *eb; | ||
303 | struct ocfs2_dinode *di; | ||
304 | struct ocfs2_extent_list *el; | ||
305 | |||
306 | spin_lock(&OCFS2_I(inode)->ip_lock); | ||
307 | ent = ocfs2_extent_map_lookup(em, cpos, clusters, NULL, NULL); | ||
308 | if (ent) { | ||
309 | if (!ent->e_tree_depth) { | ||
310 | spin_unlock(&OCFS2_I(inode)->ip_lock); | ||
311 | *ret_ent = ent; | ||
312 | return 0; | ||
313 | } | ||
314 | blkno = le64_to_cpu(ent->e_rec.e_blkno); | ||
315 | spin_unlock(&OCFS2_I(inode)->ip_lock); | ||
316 | |||
317 | ret = ocfs2_read_block(OCFS2_SB(inode->i_sb), blkno, &bh, | ||
318 | OCFS2_BH_CACHED, inode); | ||
319 | if (ret) { | ||
320 | mlog_errno(ret); | ||
321 | if (bh) | ||
322 | brelse(bh); | ||
323 | return ret; | ||
324 | } | ||
325 | eb = (struct ocfs2_extent_block *)bh->b_data; | ||
326 | if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) { | ||
327 | OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb, eb); | ||
328 | brelse(bh); | ||
329 | return -EIO; | ||
330 | } | ||
331 | el = &eb->h_list; | ||
332 | } else { | ||
333 | spin_unlock(&OCFS2_I(inode)->ip_lock); | ||
334 | |||
335 | ret = ocfs2_read_block(OCFS2_SB(inode->i_sb), | ||
336 | OCFS2_I(inode)->ip_blkno, &bh, | ||
337 | OCFS2_BH_CACHED, inode); | ||
338 | if (ret) { | ||
339 | mlog_errno(ret); | ||
340 | if (bh) | ||
341 | brelse(bh); | ||
342 | return ret; | ||
343 | } | ||
344 | di = (struct ocfs2_dinode *)bh->b_data; | ||
345 | if (!OCFS2_IS_VALID_DINODE(di)) { | ||
346 | brelse(bh); | ||
347 | OCFS2_RO_ON_INVALID_DINODE(inode->i_sb, di); | ||
348 | return -EIO; | ||
349 | } | ||
350 | el = &di->id2.i_list; | ||
351 | } | ||
352 | |||
353 | ret = ocfs2_extent_map_find_leaf(inode, cpos, clusters, el); | ||
354 | brelse(bh); | ||
355 | if (ret) { | ||
356 | mlog_errno(ret); | ||
357 | return ret; | ||
358 | } | ||
359 | |||
360 | ent = ocfs2_extent_map_lookup(em, cpos, clusters, NULL, NULL); | ||
361 | if (!ent) { | ||
362 | ret = -ESRCH; | ||
363 | mlog_errno(ret); | ||
364 | return ret; | ||
365 | } | ||
366 | |||
367 | if (ent->e_tree_depth) | ||
368 | BUG(); /* FIXME: Make sure this isn't a corruption */ | ||
369 | |||
370 | *ret_ent = ent; | ||
371 | |||
372 | return 0; | ||
373 | } | ||
374 | |||
375 | /* | ||
376 | * Callers must hold ip_lock. This can insert pieces of the tree, | ||
377 | * thus racing lookup if the lock weren't held. | ||
378 | */ | ||
379 | static int ocfs2_extent_map_insert_entry(struct ocfs2_extent_map *em, | ||
380 | struct ocfs2_extent_map_entry *ent) | ||
381 | { | ||
382 | struct rb_node **p, *parent; | ||
383 | struct ocfs2_extent_map_entry *old_ent; | ||
384 | |||
385 | old_ent = ocfs2_extent_map_lookup(em, le32_to_cpu(ent->e_rec.e_cpos), | ||
386 | le32_to_cpu(ent->e_rec.e_clusters), | ||
387 | &p, &parent); | ||
388 | if (old_ent) | ||
389 | return -EEXIST; | ||
390 | |||
391 | rb_link_node(&ent->e_node, parent, p); | ||
392 | rb_insert_color(&ent->e_node, &em->em_extents); | ||
393 | |||
394 | return 0; | ||
395 | } | ||
396 | |||
397 | |||
398 | /* | ||
399 | * Simple rule: on any return code other than -EAGAIN, anything left | ||
400 | * in the insert_context will be freed. | ||
401 | */ | ||
402 | static int ocfs2_extent_map_try_insert(struct inode *inode, | ||
403 | struct ocfs2_extent_rec *rec, | ||
404 | int tree_depth, | ||
405 | struct ocfs2_em_insert_context *ctxt) | ||
406 | { | ||
407 | int ret; | ||
408 | struct ocfs2_extent_map *em = &OCFS2_I(inode)->ip_map; | ||
409 | struct ocfs2_extent_map_entry *old_ent; | ||
410 | |||
411 | ctxt->need_left = 0; | ||
412 | ctxt->need_right = 0; | ||
413 | ctxt->old_ent = NULL; | ||
414 | |||
415 | spin_lock(&OCFS2_I(inode)->ip_lock); | ||
416 | ret = ocfs2_extent_map_insert_entry(em, ctxt->new_ent); | ||
417 | if (!ret) { | ||
418 | ctxt->new_ent = NULL; | ||
419 | goto out_unlock; | ||
420 | } | ||
421 | |||
422 | old_ent = ocfs2_extent_map_lookup(em, le32_to_cpu(rec->e_cpos), | ||
423 | le32_to_cpu(rec->e_clusters), NULL, | ||
424 | NULL); | ||
425 | |||
426 | if (!old_ent) | ||
427 | BUG(); | ||
428 | |||
429 | ret = -EEXIST; | ||
430 | if (old_ent->e_tree_depth < tree_depth) | ||
431 | goto out_unlock; | ||
432 | |||
433 | if (old_ent->e_tree_depth == tree_depth) { | ||
434 | if (!memcmp(rec, &old_ent->e_rec, | ||
435 | sizeof(struct ocfs2_extent_rec))) | ||
436 | ret = 0; | ||
437 | |||
438 | /* FIXME: Should this be ESRCH/EBADR??? */ | ||
439 | goto out_unlock; | ||
440 | } | ||
441 | |||
442 | /* | ||
443 | * We do it in this order specifically so that no actual tree | ||
444 | * changes occur until we have all the pieces we need. We | ||
445 | * don't want malloc failures to leave an inconsistent tree. | ||
446 | * Whenever we drop the lock, another process could be | ||
447 | * inserting. Also note that, if another process just beat us | ||
448 | * to an insert, we might not need the same pieces we needed | ||
449 | * the first go round. In the end, the pieces we need will | ||
450 | * be used, and the pieces we don't will be freed. | ||
451 | */ | ||
452 | ctxt->need_left = !!(le32_to_cpu(rec->e_cpos) > | ||
453 | le32_to_cpu(old_ent->e_rec.e_cpos)); | ||
454 | ctxt->need_right = !!((le32_to_cpu(old_ent->e_rec.e_cpos) + | ||
455 | le32_to_cpu(old_ent->e_rec.e_clusters)) > | ||
456 | (le32_to_cpu(rec->e_cpos) + le32_to_cpu(rec->e_clusters))); | ||
457 | ret = -EAGAIN; | ||
458 | if (ctxt->need_left) { | ||
459 | if (!ctxt->left_ent) | ||
460 | goto out_unlock; | ||
461 | *(ctxt->left_ent) = *old_ent; | ||
462 | ctxt->left_ent->e_rec.e_clusters = | ||
463 | cpu_to_le32(le32_to_cpu(rec->e_cpos) - | ||
464 | le32_to_cpu(ctxt->left_ent->e_rec.e_cpos)); | ||
465 | } | ||
466 | if (ctxt->need_right) { | ||
467 | if (!ctxt->right_ent) | ||
468 | goto out_unlock; | ||
469 | *(ctxt->right_ent) = *old_ent; | ||
470 | ctxt->right_ent->e_rec.e_cpos = | ||
471 | cpu_to_le32(le32_to_cpu(rec->e_cpos) + | ||
472 | le32_to_cpu(rec->e_clusters)); | ||
473 | ctxt->right_ent->e_rec.e_clusters = | ||
474 | cpu_to_le32((le32_to_cpu(old_ent->e_rec.e_cpos) + | ||
475 | le32_to_cpu(old_ent->e_rec.e_clusters)) - | ||
476 | le32_to_cpu(ctxt->right_ent->e_rec.e_cpos)); | ||
477 | } | ||
478 | |||
479 | rb_erase(&old_ent->e_node, &em->em_extents); | ||
480 | /* Now that he's erased, set him up for deletion */ | ||
481 | ctxt->old_ent = old_ent; | ||
482 | |||
483 | if (ctxt->need_left) { | ||
484 | ret = ocfs2_extent_map_insert_entry(em, | ||
485 | ctxt->left_ent); | ||
486 | if (ret) | ||
487 | goto out_unlock; | ||
488 | ctxt->left_ent = NULL; | ||
489 | } | ||
490 | |||
491 | if (ctxt->need_right) { | ||
492 | ret = ocfs2_extent_map_insert_entry(em, | ||
493 | ctxt->right_ent); | ||
494 | if (ret) | ||
495 | goto out_unlock; | ||
496 | ctxt->right_ent = NULL; | ||
497 | } | ||
498 | |||
499 | ret = ocfs2_extent_map_insert_entry(em, ctxt->new_ent); | ||
500 | |||
501 | if (!ret) | ||
502 | ctxt->new_ent = NULL; | ||
503 | |||
504 | out_unlock: | ||
505 | spin_unlock(&OCFS2_I(inode)->ip_lock); | ||
506 | |||
507 | return ret; | ||
508 | } | ||
509 | |||
510 | |||
511 | static int ocfs2_extent_map_insert(struct inode *inode, | ||
512 | struct ocfs2_extent_rec *rec, | ||
513 | int tree_depth) | ||
514 | { | ||
515 | int ret; | ||
516 | struct ocfs2_em_insert_context ctxt = {0, }; | ||
517 | |||
518 | if ((le32_to_cpu(rec->e_cpos) + le32_to_cpu(rec->e_clusters)) > | ||
519 | OCFS2_I(inode)->ip_map.em_clusters) { | ||
520 | ret = -EBADR; | ||
521 | mlog_errno(ret); | ||
522 | return ret; | ||
523 | } | ||
524 | |||
525 | /* Zero e_clusters means a truncated tail record. It better be EOF */ | ||
526 | if (!rec->e_clusters) { | ||
527 | if ((le32_to_cpu(rec->e_cpos) + le32_to_cpu(rec->e_clusters)) != | ||
528 | OCFS2_I(inode)->ip_map.em_clusters) { | ||
529 | ret = -EBADR; | ||
530 | mlog_errno(ret); | ||
531 | return ret; | ||
532 | } | ||
533 | |||
534 | /* Ignore the truncated tail */ | ||
535 | return 0; | ||
536 | } | ||
537 | |||
538 | ret = -ENOMEM; | ||
539 | ctxt.new_ent = kmem_cache_alloc(ocfs2_em_ent_cachep, | ||
540 | GFP_KERNEL); | ||
541 | if (!ctxt.new_ent) { | ||
542 | mlog_errno(ret); | ||
543 | return ret; | ||
544 | } | ||
545 | |||
546 | ctxt.new_ent->e_rec = *rec; | ||
547 | ctxt.new_ent->e_tree_depth = tree_depth; | ||
548 | |||
549 | do { | ||
550 | ret = -ENOMEM; | ||
551 | if (ctxt.need_left && !ctxt.left_ent) { | ||
552 | ctxt.left_ent = | ||
553 | kmem_cache_alloc(ocfs2_em_ent_cachep, | ||
554 | GFP_KERNEL); | ||
555 | if (!ctxt.left_ent) | ||
556 | break; | ||
557 | } | ||
558 | if (ctxt.need_right && !ctxt.right_ent) { | ||
559 | ctxt.right_ent = | ||
560 | kmem_cache_alloc(ocfs2_em_ent_cachep, | ||
561 | GFP_KERNEL); | ||
562 | if (!ctxt.right_ent) | ||
563 | break; | ||
564 | } | ||
565 | |||
566 | ret = ocfs2_extent_map_try_insert(inode, rec, | ||
567 | tree_depth, &ctxt); | ||
568 | } while (ret == -EAGAIN); | ||
569 | |||
570 | if (ret < 0) | ||
571 | mlog_errno(ret); | ||
572 | |||
573 | if (ctxt.left_ent) | ||
574 | kmem_cache_free(ocfs2_em_ent_cachep, ctxt.left_ent); | ||
575 | if (ctxt.right_ent) | ||
576 | kmem_cache_free(ocfs2_em_ent_cachep, ctxt.right_ent); | ||
577 | if (ctxt.old_ent) | ||
578 | kmem_cache_free(ocfs2_em_ent_cachep, ctxt.old_ent); | ||
579 | if (ctxt.new_ent) | ||
580 | kmem_cache_free(ocfs2_em_ent_cachep, ctxt.new_ent); | ||
581 | |||
582 | return ret; | ||
583 | } | ||
584 | |||
585 | /* | ||
586 | * Append this record to the tail of the extent map. It must be | ||
587 | * tree_depth 0. The record might be an extension of an existing | ||
588 | * record, and as such that needs to be handled. eg: | ||
589 | * | ||
590 | * Existing record in the extent map: | ||
591 | * | ||
592 | * cpos = 10, len = 10 | ||
593 | * |---------| | ||
594 | * | ||
595 | * New Record: | ||
596 | * | ||
597 | * cpos = 10, len = 20 | ||
598 | * |------------------| | ||
599 | * | ||
600 | * The passed record is the new on-disk record. The new_clusters value | ||
601 | * is how many clusters were added to the file. If the append is a | ||
602 | * contiguous append, the new_clusters has been added to | ||
603 | * rec->e_clusters. If the append is an entirely new extent, then | ||
604 | * rec->e_clusters is == new_clusters. | ||
605 | */ | ||
606 | int ocfs2_extent_map_append(struct inode *inode, | ||
607 | struct ocfs2_extent_rec *rec, | ||
608 | u32 new_clusters) | ||
609 | { | ||
610 | int ret; | ||
611 | struct ocfs2_extent_map *em = &OCFS2_I(inode)->ip_map; | ||
612 | struct ocfs2_extent_map_entry *ent; | ||
613 | struct ocfs2_extent_rec *old; | ||
614 | |||
615 | BUG_ON(!new_clusters); | ||
616 | BUG_ON(le32_to_cpu(rec->e_clusters) < new_clusters); | ||
617 | |||
618 | if (em->em_clusters < OCFS2_I(inode)->ip_clusters) { | ||
619 | /* | ||
620 | * Size changed underneath us on disk. Drop any | ||
621 | * straddling records and update our idea of | ||
622 | * i_clusters | ||
623 | */ | ||
624 | ocfs2_extent_map_drop(inode, em->em_clusters - 1); | ||
625 | em->em_clusters = OCFS2_I(inode)->ip_clusters; | ||
626 | } | ||
627 | |||
628 | mlog_bug_on_msg((le32_to_cpu(rec->e_cpos) + | ||
629 | le32_to_cpu(rec->e_clusters)) != | ||
630 | (em->em_clusters + new_clusters), | ||
631 | "Inode %"MLFu64":\n" | ||
632 | "rec->e_cpos = %u + rec->e_clusters = %u = %u\n" | ||
633 | "em->em_clusters = %u + new_clusters = %u = %u\n", | ||
634 | OCFS2_I(inode)->ip_blkno, | ||
635 | le32_to_cpu(rec->e_cpos), le32_to_cpu(rec->e_clusters), | ||
636 | le32_to_cpu(rec->e_cpos) + le32_to_cpu(rec->e_clusters), | ||
637 | em->em_clusters, new_clusters, | ||
638 | em->em_clusters + new_clusters); | ||
639 | |||
640 | em->em_clusters += new_clusters; | ||
641 | |||
642 | ret = -ENOENT; | ||
643 | if (le32_to_cpu(rec->e_clusters) > new_clusters) { | ||
644 | /* This is a contiguous append */ | ||
645 | ent = ocfs2_extent_map_lookup(em, le32_to_cpu(rec->e_cpos), 1, | ||
646 | NULL, NULL); | ||
647 | if (ent) { | ||
648 | old = &ent->e_rec; | ||
649 | BUG_ON((le32_to_cpu(rec->e_cpos) + | ||
650 | le32_to_cpu(rec->e_clusters)) != | ||
651 | (le32_to_cpu(old->e_cpos) + | ||
652 | le32_to_cpu(old->e_clusters) + | ||
653 | new_clusters)); | ||
654 | if (ent->e_tree_depth == 0) { | ||
655 | BUG_ON(le32_to_cpu(old->e_cpos) != | ||
656 | le32_to_cpu(rec->e_cpos)); | ||
657 | BUG_ON(le64_to_cpu(old->e_blkno) != | ||
658 | le64_to_cpu(rec->e_blkno)); | ||
659 | ret = 0; | ||
660 | } | ||
661 | /* | ||
662 | * Let non-leafs fall through as -ENOENT to | ||
663 | * force insertion of the new leaf. | ||
664 | */ | ||
665 | le32_add_cpu(&old->e_clusters, new_clusters); | ||
666 | } | ||
667 | } | ||
668 | |||
669 | if (ret == -ENOENT) | ||
670 | ret = ocfs2_extent_map_insert(inode, rec, 0); | ||
671 | if (ret < 0) | ||
672 | mlog_errno(ret); | ||
673 | return ret; | ||
674 | } | ||
675 | |||
676 | #if 0 | ||
677 | /* Code here is included but defined out as it completes the extent | ||
678 | * map api and may be used in the future. */ | ||
679 | |||
680 | /* | ||
681 | * Look up the record containing this cluster offset. This record is | ||
682 | * part of the extent map. Do not free it. Any changes you make to | ||
683 | * it will reflect in the extent map. So, if your last extent | ||
684 | * is (cpos = 10, clusters = 10) and you truncate the file by 5 | ||
685 | * clusters, you can do: | ||
686 | * | ||
687 | * ret = ocfs2_extent_map_get_rec(em, orig_size - 5, &rec); | ||
688 | * rec->e_clusters -= 5; | ||
689 | * | ||
690 | * The lookup does not read from disk. If the map isn't filled in for | ||
691 | * an entry, you won't find it. | ||
692 | * | ||
693 | * Also note that the returned record is valid until alloc_sem is | ||
694 | * dropped. After that, truncate and extend can happen. Caveat Emptor. | ||
695 | */ | ||
696 | int ocfs2_extent_map_get_rec(struct inode *inode, u32 cpos, | ||
697 | struct ocfs2_extent_rec **rec, | ||
698 | int *tree_depth) | ||
699 | { | ||
700 | int ret = -ENOENT; | ||
701 | struct ocfs2_extent_map *em = &OCFS2_I(inode)->ip_map; | ||
702 | struct ocfs2_extent_map_entry *ent; | ||
703 | |||
704 | *rec = NULL; | ||
705 | |||
706 | if (cpos >= OCFS2_I(inode)->ip_clusters) | ||
707 | return -EINVAL; | ||
708 | |||
709 | if (cpos >= em->em_clusters) { | ||
710 | /* | ||
711 | * Size changed underneath us on disk. Drop any | ||
712 | * straddling records and update our idea of | ||
713 | * i_clusters | ||
714 | */ | ||
715 | ocfs2_extent_map_drop(inode, em->em_clusters - 1); | ||
716 | em->em_clusters = OCFS2_I(inode)->ip_clusters ; | ||
717 | } | ||
718 | |||
719 | ent = ocfs2_extent_map_lookup(&OCFS2_I(inode)->ip_map, cpos, 1, | ||
720 | NULL, NULL); | ||
721 | |||
722 | if (ent) { | ||
723 | *rec = &ent->e_rec; | ||
724 | if (tree_depth) | ||
725 | *tree_depth = ent->e_tree_depth; | ||
726 | ret = 0; | ||
727 | } | ||
728 | |||
729 | return ret; | ||
730 | } | ||
731 | |||
732 | int ocfs2_extent_map_get_clusters(struct inode *inode, | ||
733 | u32 v_cpos, int count, | ||
734 | u32 *p_cpos, int *ret_count) | ||
735 | { | ||
736 | int ret; | ||
737 | u32 coff, ccount; | ||
738 | struct ocfs2_extent_map *em = &OCFS2_I(inode)->ip_map; | ||
739 | struct ocfs2_extent_map_entry *ent = NULL; | ||
740 | |||
741 | *p_cpos = ccount = 0; | ||
742 | |||
743 | if ((v_cpos + count) > OCFS2_I(inode)->ip_clusters) | ||
744 | return -EINVAL; | ||
745 | |||
746 | if ((v_cpos + count) > em->em_clusters) { | ||
747 | /* | ||
748 | * Size changed underneath us on disk. Drop any | ||
749 | * straddling records and update our idea of | ||
750 | * i_clusters | ||
751 | */ | ||
752 | ocfs2_extent_map_drop(inode, em->em_clusters - 1); | ||
753 | em->em_clusters = OCFS2_I(inode)->ip_clusters; | ||
754 | } | ||
755 | |||
756 | |||
757 | ret = ocfs2_extent_map_lookup_read(inode, v_cpos, count, &ent); | ||
758 | if (ret) | ||
759 | return ret; | ||
760 | |||
761 | if (ent) { | ||
762 | /* We should never find ourselves straddling an interval */ | ||
763 | if (!ocfs2_extent_rec_contains_clusters(&ent->e_rec, | ||
764 | v_cpos, | ||
765 | count)) | ||
766 | return -ESRCH; | ||
767 | |||
768 | coff = v_cpos - le32_to_cpu(ent->e_rec.e_cpos); | ||
769 | *p_cpos = ocfs2_blocks_to_clusters(inode->i_sb, | ||
770 | le64_to_cpu(ent->e_rec.e_blkno)) + | ||
771 | coff; | ||
772 | |||
773 | if (ret_count) | ||
774 | *ret_count = le32_to_cpu(ent->e_rec.e_clusters) - coff; | ||
775 | |||
776 | return 0; | ||
777 | } | ||
778 | |||
779 | |||
780 | return -ENOENT; | ||
781 | } | ||
782 | |||
783 | #endif /* 0 */ | ||
784 | |||
785 | int ocfs2_extent_map_get_blocks(struct inode *inode, | ||
786 | u64 v_blkno, int count, | ||
787 | u64 *p_blkno, int *ret_count) | ||
788 | { | ||
789 | int ret; | ||
790 | u64 boff; | ||
791 | u32 cpos, clusters; | ||
792 | int bpc = ocfs2_clusters_to_blocks(inode->i_sb, 1); | ||
793 | struct ocfs2_extent_map_entry *ent = NULL; | ||
794 | struct ocfs2_extent_map *em = &OCFS2_I(inode)->ip_map; | ||
795 | struct ocfs2_extent_rec *rec; | ||
796 | |||
797 | *p_blkno = 0; | ||
798 | |||
799 | cpos = ocfs2_blocks_to_clusters(inode->i_sb, v_blkno); | ||
800 | clusters = ocfs2_blocks_to_clusters(inode->i_sb, | ||
801 | (u64)count + bpc - 1); | ||
802 | if ((cpos + clusters) > OCFS2_I(inode)->ip_clusters) { | ||
803 | ret = -EINVAL; | ||
804 | mlog_errno(ret); | ||
805 | return ret; | ||
806 | } | ||
807 | |||
808 | if ((cpos + clusters) > em->em_clusters) { | ||
809 | /* | ||
810 | * Size changed underneath us on disk. Drop any | ||
811 | * straddling records and update our idea of | ||
812 | * i_clusters | ||
813 | */ | ||
814 | ocfs2_extent_map_drop(inode, em->em_clusters - 1); | ||
815 | em->em_clusters = OCFS2_I(inode)->ip_clusters; | ||
816 | } | ||
817 | |||
818 | ret = ocfs2_extent_map_lookup_read(inode, cpos, clusters, &ent); | ||
819 | if (ret) { | ||
820 | mlog_errno(ret); | ||
821 | return ret; | ||
822 | } | ||
823 | |||
824 | if (ent) | ||
825 | { | ||
826 | rec = &ent->e_rec; | ||
827 | |||
828 | /* We should never find ourselves straddling an interval */ | ||
829 | if (!ocfs2_extent_rec_contains_clusters(rec, cpos, clusters)) { | ||
830 | ret = -ESRCH; | ||
831 | mlog_errno(ret); | ||
832 | return ret; | ||
833 | } | ||
834 | |||
835 | boff = ocfs2_clusters_to_blocks(inode->i_sb, cpos - | ||
836 | le32_to_cpu(rec->e_cpos)); | ||
837 | boff += (v_blkno & (u64)(bpc - 1)); | ||
838 | *p_blkno = le64_to_cpu(rec->e_blkno) + boff; | ||
839 | |||
840 | if (ret_count) { | ||
841 | *ret_count = ocfs2_clusters_to_blocks(inode->i_sb, | ||
842 | le32_to_cpu(rec->e_clusters)) - boff; | ||
843 | } | ||
844 | |||
845 | return 0; | ||
846 | } | ||
847 | |||
848 | return -ENOENT; | ||
849 | } | ||
850 | |||
851 | int ocfs2_extent_map_init(struct inode *inode) | ||
852 | { | ||
853 | struct ocfs2_extent_map *em = &OCFS2_I(inode)->ip_map; | ||
854 | |||
855 | em->em_extents = RB_ROOT; | ||
856 | em->em_clusters = 0; | ||
857 | |||
858 | return 0; | ||
859 | } | ||
860 | |||
861 | /* Needs the lock */ | ||
862 | static void __ocfs2_extent_map_drop(struct inode *inode, | ||
863 | u32 new_clusters, | ||
864 | struct rb_node **free_head, | ||
865 | struct ocfs2_extent_map_entry **tail_ent) | ||
866 | { | ||
867 | struct rb_node *node, *next; | ||
868 | struct ocfs2_extent_map *em = &OCFS2_I(inode)->ip_map; | ||
869 | struct ocfs2_extent_map_entry *ent; | ||
870 | |||
871 | *free_head = NULL; | ||
872 | |||
873 | ent = NULL; | ||
874 | node = rb_last(&em->em_extents); | ||
875 | while (node) | ||
876 | { | ||
877 | next = rb_prev(node); | ||
878 | |||
879 | ent = rb_entry(node, struct ocfs2_extent_map_entry, | ||
880 | e_node); | ||
881 | if (le32_to_cpu(ent->e_rec.e_cpos) < new_clusters) | ||
882 | break; | ||
883 | |||
884 | rb_erase(&ent->e_node, &em->em_extents); | ||
885 | |||
886 | node->rb_right = *free_head; | ||
887 | *free_head = node; | ||
888 | |||
889 | ent = NULL; | ||
890 | node = next; | ||
891 | } | ||
892 | |||
893 | /* Do we have an entry straddling new_clusters? */ | ||
894 | if (tail_ent) { | ||
895 | if (ent && | ||
896 | ((le32_to_cpu(ent->e_rec.e_cpos) + | ||
897 | le32_to_cpu(ent->e_rec.e_clusters)) > new_clusters)) | ||
898 | *tail_ent = ent; | ||
899 | else | ||
900 | *tail_ent = NULL; | ||
901 | } | ||
902 | } | ||
903 | |||
904 | static void __ocfs2_extent_map_drop_cleanup(struct rb_node *free_head) | ||
905 | { | ||
906 | struct rb_node *node; | ||
907 | struct ocfs2_extent_map_entry *ent; | ||
908 | |||
909 | while (free_head) { | ||
910 | node = free_head; | ||
911 | free_head = node->rb_right; | ||
912 | |||
913 | ent = rb_entry(node, struct ocfs2_extent_map_entry, | ||
914 | e_node); | ||
915 | kmem_cache_free(ocfs2_em_ent_cachep, ent); | ||
916 | } | ||
917 | } | ||
918 | |||
919 | /* | ||
920 | * Remove all entries past new_clusters, inclusive of an entry that | ||
921 | * contains new_clusters. This is effectively a cache forget. | ||
922 | * | ||
923 | * If you want to also clip the last extent by some number of clusters, | ||
924 | * you need to call ocfs2_extent_map_trunc(). | ||
925 | * This code does not check or modify ip_clusters. | ||
926 | */ | ||
927 | int ocfs2_extent_map_drop(struct inode *inode, u32 new_clusters) | ||
928 | { | ||
929 | struct rb_node *free_head = NULL; | ||
930 | struct ocfs2_extent_map *em = &OCFS2_I(inode)->ip_map; | ||
931 | struct ocfs2_extent_map_entry *ent; | ||
932 | |||
933 | spin_lock(&OCFS2_I(inode)->ip_lock); | ||
934 | |||
935 | __ocfs2_extent_map_drop(inode, new_clusters, &free_head, &ent); | ||
936 | |||
937 | if (ent) { | ||
938 | rb_erase(&ent->e_node, &em->em_extents); | ||
939 | ent->e_node.rb_right = free_head; | ||
940 | free_head = &ent->e_node; | ||
941 | } | ||
942 | |||
943 | spin_unlock(&OCFS2_I(inode)->ip_lock); | ||
944 | |||
945 | if (free_head) | ||
946 | __ocfs2_extent_map_drop_cleanup(free_head); | ||
947 | |||
948 | return 0; | ||
949 | } | ||
950 | |||
951 | /* | ||
952 | * Remove all entries past new_clusters and also clip any extent | ||
953 | * straddling new_clusters, if there is one. This does not check | ||
954 | * or modify ip_clusters | ||
955 | */ | ||
956 | int ocfs2_extent_map_trunc(struct inode *inode, u32 new_clusters) | ||
957 | { | ||
958 | struct rb_node *free_head = NULL; | ||
959 | struct ocfs2_extent_map_entry *ent = NULL; | ||
960 | |||
961 | spin_lock(&OCFS2_I(inode)->ip_lock); | ||
962 | |||
963 | __ocfs2_extent_map_drop(inode, new_clusters, &free_head, &ent); | ||
964 | |||
965 | if (ent) | ||
966 | ent->e_rec.e_clusters = cpu_to_le32(new_clusters - | ||
967 | le32_to_cpu(ent->e_rec.e_cpos)); | ||
968 | |||
969 | OCFS2_I(inode)->ip_map.em_clusters = new_clusters; | ||
970 | |||
971 | spin_unlock(&OCFS2_I(inode)->ip_lock); | ||
972 | |||
973 | if (free_head) | ||
974 | __ocfs2_extent_map_drop_cleanup(free_head); | ||
975 | |||
976 | return 0; | ||
977 | } | ||
978 | |||
979 | int __init init_ocfs2_extent_maps(void) | ||
980 | { | ||
981 | ocfs2_em_ent_cachep = | ||
982 | kmem_cache_create("ocfs2_em_ent", | ||
983 | sizeof(struct ocfs2_extent_map_entry), | ||
984 | 0, SLAB_HWCACHE_ALIGN, NULL, NULL); | ||
985 | if (!ocfs2_em_ent_cachep) | ||
986 | return -ENOMEM; | ||
987 | |||
988 | return 0; | ||
989 | } | ||
990 | |||
991 | void __exit exit_ocfs2_extent_maps(void) | ||
992 | { | ||
993 | kmem_cache_destroy(ocfs2_em_ent_cachep); | ||
994 | } | ||
diff --git a/fs/ocfs2/extent_map.h b/fs/ocfs2/extent_map.h new file mode 100644 index 000000000000..fa3745efa886 --- /dev/null +++ b/fs/ocfs2/extent_map.h | |||
@@ -0,0 +1,46 @@ | |||
1 | /* -*- mode: c; c-basic-offset: 8; -*- | ||
2 | * vim: noexpandtab sw=8 ts=8 sts=0: | ||
3 | * | ||
4 | * extent_map.h | ||
5 | * | ||
6 | * In-memory file extent mappings for OCFS2. | ||
7 | * | ||
8 | * Copyright (C) 2004 Oracle. All rights reserved. | ||
9 | * | ||
10 | * This program is free software; you can redistribute it and/or | ||
11 | * modify it under the terms of the GNU General Public | ||
12 | * License, version 2, as published by the Free Software Foundation. | ||
13 | * | ||
14 | * This program is distributed in the hope that it will be useful, | ||
15 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
16 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | ||
17 | * General Public License for more details. | ||
18 | * | ||
19 | * You should have received a copy of the GNU General Public | ||
20 | * License along with this program; if not, write to the | ||
21 | * Free Software Foundation, Inc., 59 Temple Place - Suite 330, | ||
22 | * Boston, MA 02111-1307, USA. | ||
23 | */ | ||
24 | |||
25 | #ifndef _EXTENT_MAP_H | ||
26 | #define _EXTENT_MAP_H | ||
27 | |||
28 | int init_ocfs2_extent_maps(void); | ||
29 | void exit_ocfs2_extent_maps(void); | ||
30 | |||
31 | /* | ||
32 | * EVERY CALL here except _init, _trunc, and _drop expects alloc_sem | ||
33 | * to be held. The allocation cannot change at all while the map is | ||
34 | * in the process of being updated. | ||
35 | */ | ||
36 | int ocfs2_extent_map_init(struct inode *inode); | ||
37 | int ocfs2_extent_map_append(struct inode *inode, | ||
38 | struct ocfs2_extent_rec *rec, | ||
39 | u32 new_clusters); | ||
40 | int ocfs2_extent_map_get_blocks(struct inode *inode, | ||
41 | u64 v_blkno, int count, | ||
42 | u64 *p_blkno, int *ret_count); | ||
43 | int ocfs2_extent_map_drop(struct inode *inode, u32 new_clusters); | ||
44 | int ocfs2_extent_map_trunc(struct inode *inode, u32 new_clusters); | ||
45 | |||
46 | #endif /* _EXTENT_MAP_H */ | ||
diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c new file mode 100644 index 000000000000..72ae9e3306f4 --- /dev/null +++ b/fs/ocfs2/file.c | |||
@@ -0,0 +1,1237 @@ | |||
1 | /* -*- mode: c; c-basic-offset: 8; -*- | ||
2 | * vim: noexpandtab sw=8 ts=8 sts=0: | ||
3 | * | ||
4 | * file.c | ||
5 | * | ||
6 | * File open, close, extend, truncate | ||
7 | * | ||
8 | * Copyright (C) 2002, 2004 Oracle. All rights reserved. | ||
9 | * | ||
10 | * This program is free software; you can redistribute it and/or | ||
11 | * modify it under the terms of the GNU General Public | ||
12 | * License as published by the Free Software Foundation; either | ||
13 | * version 2 of the License, or (at your option) any later version. | ||
14 | * | ||
15 | * This program is distributed in the hope that it will be useful, | ||
16 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
17 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | ||
18 | * General Public License for more details. | ||
19 | * | ||
20 | * You should have received a copy of the GNU General Public | ||
21 | * License along with this program; if not, write to the | ||
22 | * Free Software Foundation, Inc., 59 Temple Place - Suite 330, | ||
22 | * Boston, MA 02111-1307, USA. | ||
24 | */ | ||
25 | |||
26 | #include <linux/fs.h> | ||
27 | #include <linux/types.h> | ||
28 | #include <linux/slab.h> | ||
29 | #include <linux/highmem.h> | ||
30 | #include <linux/pagemap.h> | ||
31 | #include <linux/uio.h> | ||
32 | |||
33 | #define MLOG_MASK_PREFIX ML_INODE | ||
34 | #include <cluster/masklog.h> | ||
35 | |||
36 | #include "ocfs2.h" | ||
37 | |||
38 | #include "alloc.h" | ||
39 | #include "aops.h" | ||
40 | #include "dir.h" | ||
41 | #include "dlmglue.h" | ||
42 | #include "extent_map.h" | ||
43 | #include "file.h" | ||
44 | #include "sysfile.h" | ||
45 | #include "inode.h" | ||
46 | #include "journal.h" | ||
47 | #include "mmap.h" | ||
48 | #include "suballoc.h" | ||
49 | #include "super.h" | ||
50 | |||
51 | #include "buffer_head_io.h" | ||
52 | |||
53 | static int ocfs2_sync_inode(struct inode *inode) | ||
54 | { | ||
55 | filemap_fdatawrite(inode->i_mapping); | ||
56 | return sync_mapping_buffers(inode->i_mapping); | ||
57 | } | ||
58 | |||
59 | static int ocfs2_file_open(struct inode *inode, struct file *file) | ||
60 | { | ||
61 | int status; | ||
62 | int mode = file->f_flags; | ||
63 | struct ocfs2_inode_info *oi = OCFS2_I(inode); | ||
64 | |||
65 | mlog_entry("(0x%p, 0x%p, '%.*s')\n", inode, file, | ||
66 | file->f_dentry->d_name.len, file->f_dentry->d_name.name); | ||
67 | |||
68 | spin_lock(&oi->ip_lock); | ||
69 | |||
70 | /* Check that the inode hasn't been wiped from disk by another | ||
71 | * node. If it hasn't then we're safe as long as we hold the | ||
72 | * spin lock until our increment of open count. */ | ||
73 | if (OCFS2_I(inode)->ip_flags & OCFS2_INODE_DELETED) { | ||
74 | spin_unlock(&oi->ip_lock); | ||
75 | |||
76 | status = -ENOENT; | ||
77 | goto leave; | ||
78 | } | ||
79 | |||
80 | if (mode & O_DIRECT) | ||
81 | oi->ip_flags |= OCFS2_INODE_OPEN_DIRECT; | ||
82 | |||
83 | oi->ip_open_count++; | ||
84 | spin_unlock(&oi->ip_lock); | ||
85 | status = 0; | ||
86 | leave: | ||
87 | mlog_exit(status); | ||
88 | return status; | ||
89 | } | ||
90 | |||
91 | static int ocfs2_file_release(struct inode *inode, struct file *file) | ||
92 | { | ||
93 | struct ocfs2_inode_info *oi = OCFS2_I(inode); | ||
94 | |||
95 | mlog_entry("(0x%p, 0x%p, '%.*s')\n", inode, file, | ||
96 | file->f_dentry->d_name.len, | ||
97 | file->f_dentry->d_name.name); | ||
98 | |||
99 | spin_lock(&oi->ip_lock); | ||
100 | if (!--oi->ip_open_count) | ||
101 | oi->ip_flags &= ~OCFS2_INODE_OPEN_DIRECT; | ||
102 | spin_unlock(&oi->ip_lock); | ||
103 | |||
104 | mlog_exit(0); | ||
105 | |||
106 | return 0; | ||
107 | } | ||
108 | |||
109 | static int ocfs2_sync_file(struct file *file, | ||
110 | struct dentry *dentry, | ||
111 | int datasync) | ||
112 | { | ||
113 | int err = 0; | ||
114 | journal_t *journal; | ||
115 | struct inode *inode = dentry->d_inode; | ||
116 | struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); | ||
117 | |||
118 | mlog_entry("(0x%p, 0x%p, %d, '%.*s')\n", file, dentry, datasync, | ||
119 | dentry->d_name.len, dentry->d_name.name); | ||
120 | |||
121 | err = ocfs2_sync_inode(dentry->d_inode); | ||
122 | if (err) | ||
123 | goto bail; | ||
124 | |||
125 | journal = osb->journal->j_journal; | ||
126 | err = journal_force_commit(journal); | ||
127 | |||
128 | bail: | ||
129 | mlog_exit(err); | ||
130 | |||
131 | return (err < 0) ? -EIO : 0; | ||
132 | } | ||
133 | |||
134 | int ocfs2_set_inode_size(struct ocfs2_journal_handle *handle, | ||
135 | struct inode *inode, | ||
136 | struct buffer_head *fe_bh, | ||
137 | u64 new_i_size) | ||
138 | { | ||
139 | int status; | ||
140 | |||
141 | mlog_entry_void(); | ||
142 | i_size_write(inode, new_i_size); | ||
143 | inode->i_blocks = ocfs2_align_bytes_to_sectors(new_i_size); | ||
144 | inode->i_ctime = inode->i_mtime = CURRENT_TIME; | ||
145 | |||
146 | status = ocfs2_mark_inode_dirty(handle, inode, fe_bh); | ||
147 | if (status < 0) { | ||
148 | mlog_errno(status); | ||
149 | goto bail; | ||
150 | } | ||
151 | |||
152 | bail: | ||
153 | mlog_exit(status); | ||
154 | return status; | ||
155 | } | ||
156 | |||
157 | static int ocfs2_simple_size_update(struct inode *inode, | ||
158 | struct buffer_head *di_bh, | ||
159 | u64 new_i_size) | ||
160 | { | ||
161 | int ret; | ||
162 | struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); | ||
163 | struct ocfs2_journal_handle *handle = NULL; | ||
164 | |||
165 | handle = ocfs2_start_trans(osb, NULL, | ||
166 | OCFS2_INODE_UPDATE_CREDITS); | ||
167 | if (handle == NULL) { | ||
168 | ret = -ENOMEM; | ||
169 | mlog_errno(ret); | ||
170 | goto out; | ||
171 | } | ||
172 | |||
173 | ret = ocfs2_set_inode_size(handle, inode, di_bh, | ||
174 | new_i_size); | ||
175 | if (ret < 0) | ||
176 | mlog_errno(ret); | ||
177 | |||
178 | ocfs2_commit_trans(handle); | ||
179 | out: | ||
180 | return ret; | ||
181 | } | ||
182 | |||
183 | static int ocfs2_orphan_for_truncate(struct ocfs2_super *osb, | ||
184 | struct inode *inode, | ||
185 | struct buffer_head *fe_bh, | ||
186 | u64 new_i_size) | ||
187 | { | ||
188 | int status; | ||
189 | struct ocfs2_journal_handle *handle; | ||
190 | |||
191 | mlog_entry_void(); | ||
192 | |||
193 | /* TODO: This needs to actually orphan the inode in this | ||
194 | * transaction. */ | ||
195 | |||
196 | handle = ocfs2_start_trans(osb, NULL, OCFS2_INODE_UPDATE_CREDITS); | ||
197 | if (IS_ERR(handle)) { | ||
198 | status = PTR_ERR(handle); | ||
199 | mlog_errno(status); | ||
200 | goto out; | ||
201 | } | ||
202 | |||
203 | status = ocfs2_set_inode_size(handle, inode, fe_bh, new_i_size); | ||
204 | if (status < 0) | ||
205 | mlog_errno(status); | ||
206 | |||
207 | ocfs2_commit_trans(handle); | ||
208 | out: | ||
209 | mlog_exit(status); | ||
210 | return status; | ||
211 | } | ||
212 | |||
213 | static int ocfs2_truncate_file(struct inode *inode, | ||
214 | struct buffer_head *di_bh, | ||
215 | u64 new_i_size) | ||
216 | { | ||
217 | int status = 0; | ||
218 | struct ocfs2_dinode *fe = NULL; | ||
219 | struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); | ||
220 | struct ocfs2_truncate_context *tc = NULL; | ||
221 | |||
222 | mlog_entry("(inode = %"MLFu64", new_i_size = %"MLFu64"\n", | ||
223 | OCFS2_I(inode)->ip_blkno, new_i_size); | ||
224 | |||
225 | truncate_inode_pages(inode->i_mapping, new_i_size); | ||
226 | |||
227 | fe = (struct ocfs2_dinode *) di_bh->b_data; | ||
228 | if (!OCFS2_IS_VALID_DINODE(fe)) { | ||
229 | OCFS2_RO_ON_INVALID_DINODE(inode->i_sb, fe); | ||
230 | status = -EIO; | ||
231 | goto bail; | ||
232 | } | ||
233 | |||
234 | mlog_bug_on_msg(le64_to_cpu(fe->i_size) != i_size_read(inode), | ||
235 | "Inode %"MLFu64", inode i_size = %lld != di " | ||
236 | "i_size = %"MLFu64", i_flags = 0x%x\n", | ||
237 | OCFS2_I(inode)->ip_blkno, | ||
238 | i_size_read(inode), | ||
239 | le64_to_cpu(fe->i_size), le32_to_cpu(fe->i_flags)); | ||
240 | |||
241 | if (new_i_size > le64_to_cpu(fe->i_size)) { | ||
242 | mlog(0, "asked to truncate file with size (%"MLFu64") " | ||
243 | "to size (%"MLFu64")!\n", | ||
244 | le64_to_cpu(fe->i_size), new_i_size); | ||
245 | status = -EINVAL; | ||
246 | mlog_errno(status); | ||
247 | goto bail; | ||
248 | } | ||
249 | |||
250 | mlog(0, "inode %"MLFu64", i_size = %"MLFu64", new_i_size = %"MLFu64"\n", | ||
251 | le64_to_cpu(fe->i_blkno), le64_to_cpu(fe->i_size), new_i_size); | ||
252 | |||
253 | /* lets handle the simple truncate cases before doing any more | ||
254 | * cluster locking. */ | ||
255 | if (new_i_size == le64_to_cpu(fe->i_size)) | ||
256 | goto bail; | ||
257 | |||
258 | if (le32_to_cpu(fe->i_clusters) == | ||
259 | ocfs2_clusters_for_bytes(osb->sb, new_i_size)) { | ||
260 | mlog(0, "fe->i_clusters = %u, so we do a simple truncate\n", | ||
261 | fe->i_clusters); | ||
262 | /* No allocation change is required, so lets fast path | ||
263 | * this truncate. */ | ||
264 | status = ocfs2_simple_size_update(inode, di_bh, new_i_size); | ||
265 | if (status < 0) | ||
266 | mlog_errno(status); | ||
267 | goto bail; | ||
268 | } | ||
269 | |||
270 | /* This forces other nodes to sync and drop their pages */ | ||
271 | status = ocfs2_data_lock(inode, 1); | ||
272 | if (status < 0) { | ||
273 | mlog_errno(status); | ||
274 | goto bail; | ||
275 | } | ||
276 | ocfs2_data_unlock(inode, 1); | ||
277 | |||
278 | /* alright, we're going to need to do a full blown alloc size | ||
279 | * change. Orphan the inode so that recovery can complete the | ||
280 | * truncate if necessary. This does the task of marking | ||
281 | * i_size. */ | ||
282 | status = ocfs2_orphan_for_truncate(osb, inode, di_bh, new_i_size); | ||
283 | if (status < 0) { | ||
284 | mlog_errno(status); | ||
285 | goto bail; | ||
286 | } | ||
287 | |||
288 | status = ocfs2_prepare_truncate(osb, inode, di_bh, &tc); | ||
289 | if (status < 0) { | ||
290 | mlog_errno(status); | ||
291 | goto bail; | ||
292 | } | ||
293 | |||
294 | status = ocfs2_commit_truncate(osb, inode, di_bh, tc); | ||
295 | if (status < 0) { | ||
296 | mlog_errno(status); | ||
297 | goto bail; | ||
298 | } | ||
299 | |||
300 | /* TODO: orphan dir cleanup here. */ | ||
301 | bail: | ||
302 | |||
303 | mlog_exit(status); | ||
304 | return status; | ||
305 | } | ||
306 | |||
307 | /* | ||
308 | * extend allocation only here. | ||
309 | * we'll update all the disk stuff, and oip->alloc_size | ||
310 | * | ||
311 | * expect stuff to be locked, a transaction started and enough data / | ||
312 | * metadata reservations in the contexts. | ||
313 | * | ||
314 | * Will return -EAGAIN, and a reason if a restart is needed. | ||
315 | * If passed in, *reason will always be set, even in error. | ||
316 | */ | ||
317 | int ocfs2_do_extend_allocation(struct ocfs2_super *osb, | ||
318 | struct inode *inode, | ||
319 | u32 clusters_to_add, | ||
320 | struct buffer_head *fe_bh, | ||
321 | struct ocfs2_journal_handle *handle, | ||
322 | struct ocfs2_alloc_context *data_ac, | ||
323 | struct ocfs2_alloc_context *meta_ac, | ||
324 | enum ocfs2_alloc_restarted *reason_ret) | ||
325 | { | ||
326 | int status = 0; | ||
327 | int free_extents; | ||
328 | struct ocfs2_dinode *fe = (struct ocfs2_dinode *) fe_bh->b_data; | ||
329 | enum ocfs2_alloc_restarted reason = RESTART_NONE; | ||
330 | u32 bit_off, num_bits; | ||
331 | u64 block; | ||
332 | |||
333 | BUG_ON(!clusters_to_add); | ||
334 | |||
335 | free_extents = ocfs2_num_free_extents(osb, inode, fe); | ||
336 | if (free_extents < 0) { | ||
337 | status = free_extents; | ||
338 | mlog_errno(status); | ||
339 | goto leave; | ||
340 | } | ||
341 | |||
342 | /* there are two cases which could cause us to EAGAIN in the | ||
343 | * we-need-more-metadata case: | ||
344 | * 1) we haven't reserved *any* | ||
345 | * 2) we are so fragmented, we've needed to add metadata too | ||
346 | * many times. */ | ||
347 | if (!free_extents && !meta_ac) { | ||
348 | mlog(0, "we haven't reserved any metadata!\n"); | ||
349 | status = -EAGAIN; | ||
350 | reason = RESTART_META; | ||
351 | goto leave; | ||
352 | } else if ((!free_extents) | ||
353 | && (ocfs2_alloc_context_bits_left(meta_ac) | ||
354 | < ocfs2_extend_meta_needed(fe))) { | ||
355 | mlog(0, "filesystem is really fragmented...\n"); | ||
356 | status = -EAGAIN; | ||
357 | reason = RESTART_META; | ||
358 | goto leave; | ||
359 | } | ||
360 | |||
361 | status = ocfs2_claim_clusters(osb, handle, data_ac, 1, | ||
362 | &bit_off, &num_bits); | ||
363 | if (status < 0) { | ||
364 | if (status != -ENOSPC) | ||
365 | mlog_errno(status); | ||
366 | goto leave; | ||
367 | } | ||
368 | |||
369 | BUG_ON(num_bits > clusters_to_add); | ||
370 | |||
371 | /* reserve our write early -- insert_extent may update the inode */ | ||
372 | status = ocfs2_journal_access(handle, inode, fe_bh, | ||
373 | OCFS2_JOURNAL_ACCESS_WRITE); | ||
374 | if (status < 0) { | ||
375 | mlog_errno(status); | ||
376 | goto leave; | ||
377 | } | ||
378 | |||
379 | block = ocfs2_clusters_to_blocks(osb->sb, bit_off); | ||
380 | mlog(0, "Allocating %u clusters at block %u for inode %"MLFu64"\n", | ||
381 | num_bits, bit_off, OCFS2_I(inode)->ip_blkno); | ||
382 | status = ocfs2_insert_extent(osb, handle, inode, fe_bh, block, | ||
383 | num_bits, meta_ac); | ||
384 | if (status < 0) { | ||
385 | mlog_errno(status); | ||
386 | goto leave; | ||
387 | } | ||
388 | |||
389 | le32_add_cpu(&fe->i_clusters, num_bits); | ||
390 | spin_lock(&OCFS2_I(inode)->ip_lock); | ||
391 | OCFS2_I(inode)->ip_clusters = le32_to_cpu(fe->i_clusters); | ||
392 | spin_unlock(&OCFS2_I(inode)->ip_lock); | ||
393 | |||
394 | status = ocfs2_journal_dirty(handle, fe_bh); | ||
395 | if (status < 0) { | ||
396 | mlog_errno(status); | ||
397 | goto leave; | ||
398 | } | ||
399 | |||
400 | clusters_to_add -= num_bits; | ||
401 | |||
402 | if (clusters_to_add) { | ||
403 | mlog(0, "need to alloc once more, clusters = %u, wanted = " | ||
404 | "%u\n", fe->i_clusters, clusters_to_add); | ||
405 | status = -EAGAIN; | ||
406 | reason = RESTART_TRANS; | ||
407 | } | ||
408 | |||
409 | leave: | ||
410 | mlog_exit(status); | ||
411 | if (reason_ret) | ||
412 | *reason_ret = reason; | ||
413 | return status; | ||
414 | } | ||
415 | |||
416 | static int ocfs2_extend_allocation(struct inode *inode, | ||
417 | u32 clusters_to_add) | ||
418 | { | ||
419 | int status = 0; | ||
420 | int restart_func = 0; | ||
421 | int drop_alloc_sem = 0; | ||
422 | int credits, num_free_extents; | ||
423 | u32 prev_clusters; | ||
424 | struct buffer_head *bh = NULL; | ||
425 | struct ocfs2_dinode *fe = NULL; | ||
426 | struct ocfs2_journal_handle *handle = NULL; | ||
427 | struct ocfs2_alloc_context *data_ac = NULL; | ||
428 | struct ocfs2_alloc_context *meta_ac = NULL; | ||
429 | enum ocfs2_alloc_restarted why; | ||
430 | struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); | ||
431 | |||
432 | mlog_entry("(clusters_to_add = %u)\n", clusters_to_add); | ||
433 | |||
434 | status = ocfs2_read_block(osb, OCFS2_I(inode)->ip_blkno, &bh, | ||
435 | OCFS2_BH_CACHED, inode); | ||
436 | if (status < 0) { | ||
437 | mlog_errno(status); | ||
438 | goto leave; | ||
439 | } | ||
440 | |||
441 | fe = (struct ocfs2_dinode *) bh->b_data; | ||
442 | if (!OCFS2_IS_VALID_DINODE(fe)) { | ||
443 | OCFS2_RO_ON_INVALID_DINODE(inode->i_sb, fe); | ||
444 | status = -EIO; | ||
445 | goto leave; | ||
446 | } | ||
447 | |||
448 | restart_all: | ||
449 | BUG_ON(le32_to_cpu(fe->i_clusters) != OCFS2_I(inode)->ip_clusters); | ||
450 | |||
451 | mlog(0, "extend inode %"MLFu64", i_size = %lld, fe->i_clusters = %u, " | ||
452 | "clusters_to_add = %u\n", | ||
453 | OCFS2_I(inode)->ip_blkno, i_size_read(inode), | ||
454 | fe->i_clusters, clusters_to_add); | ||
455 | |||
456 | handle = ocfs2_alloc_handle(osb); | ||
457 | if (handle == NULL) { | ||
458 | status = -ENOMEM; | ||
459 | mlog_errno(status); | ||
460 | goto leave; | ||
461 | } | ||
462 | |||
463 | num_free_extents = ocfs2_num_free_extents(osb, | ||
464 | inode, | ||
465 | fe); | ||
466 | if (num_free_extents < 0) { | ||
467 | status = num_free_extents; | ||
468 | mlog_errno(status); | ||
469 | goto leave; | ||
470 | } | ||
471 | |||
472 | if (!num_free_extents) { | ||
473 | status = ocfs2_reserve_new_metadata(osb, | ||
474 | handle, | ||
475 | fe, | ||
476 | &meta_ac); | ||
477 | if (status < 0) { | ||
478 | if (status != -ENOSPC) | ||
479 | mlog_errno(status); | ||
480 | goto leave; | ||
481 | } | ||
482 | } | ||
483 | |||
484 | status = ocfs2_reserve_clusters(osb, | ||
485 | handle, | ||
486 | clusters_to_add, | ||
487 | &data_ac); | ||
488 | if (status < 0) { | ||
489 | if (status != -ENOSPC) | ||
490 | mlog_errno(status); | ||
491 | goto leave; | ||
492 | } | ||
493 | |||
494 | /* blocks peope in read/write from reading our allocation | ||
495 | * until we're done changing it. We depend on i_sem to block | ||
496 | * other extend/truncate calls while we're here. Ordering wrt | ||
497 | * start_trans is important here -- always do it before! */ | ||
498 | down_write(&OCFS2_I(inode)->ip_alloc_sem); | ||
499 | drop_alloc_sem = 1; | ||
500 | |||
501 | credits = ocfs2_calc_extend_credits(osb->sb, fe, clusters_to_add); | ||
502 | handle = ocfs2_start_trans(osb, handle, credits); | ||
503 | if (IS_ERR(handle)) { | ||
504 | status = PTR_ERR(handle); | ||
505 | handle = NULL; | ||
506 | mlog_errno(status); | ||
507 | goto leave; | ||
508 | } | ||
509 | |||
510 | restarted_transaction: | ||
511 | /* reserve a write to the file entry early on - that we if we | ||
512 | * run out of credits in the allocation path, we can still | ||
513 | * update i_size. */ | ||
514 | status = ocfs2_journal_access(handle, inode, bh, | ||
515 | OCFS2_JOURNAL_ACCESS_WRITE); | ||
516 | if (status < 0) { | ||
517 | mlog_errno(status); | ||
518 | goto leave; | ||
519 | } | ||
520 | |||
521 | prev_clusters = OCFS2_I(inode)->ip_clusters; | ||
522 | |||
523 | status = ocfs2_do_extend_allocation(osb, | ||
524 | inode, | ||
525 | clusters_to_add, | ||
526 | bh, | ||
527 | handle, | ||
528 | data_ac, | ||
529 | meta_ac, | ||
530 | &why); | ||
531 | if ((status < 0) && (status != -EAGAIN)) { | ||
532 | if (status != -ENOSPC) | ||
533 | mlog_errno(status); | ||
534 | goto leave; | ||
535 | } | ||
536 | |||
537 | status = ocfs2_journal_dirty(handle, bh); | ||
538 | if (status < 0) { | ||
539 | mlog_errno(status); | ||
540 | goto leave; | ||
541 | } | ||
542 | |||
543 | spin_lock(&OCFS2_I(inode)->ip_lock); | ||
544 | clusters_to_add -= (OCFS2_I(inode)->ip_clusters - prev_clusters); | ||
545 | spin_unlock(&OCFS2_I(inode)->ip_lock); | ||
546 | |||
547 | if (why != RESTART_NONE && clusters_to_add) { | ||
548 | if (why == RESTART_META) { | ||
549 | mlog(0, "restarting function.\n"); | ||
550 | restart_func = 1; | ||
551 | } else { | ||
552 | BUG_ON(why != RESTART_TRANS); | ||
553 | |||
554 | mlog(0, "restarting transaction.\n"); | ||
555 | /* TODO: This can be more intelligent. */ | ||
556 | credits = ocfs2_calc_extend_credits(osb->sb, | ||
557 | fe, | ||
558 | clusters_to_add); | ||
559 | status = ocfs2_extend_trans(handle, credits); | ||
560 | if (status < 0) { | ||
561 | /* handle still has to be committed at | ||
562 | * this point. */ | ||
563 | status = -ENOMEM; | ||
564 | mlog_errno(status); | ||
565 | goto leave; | ||
566 | } | ||
567 | goto restarted_transaction; | ||
568 | } | ||
569 | } | ||
570 | |||
571 | mlog(0, "fe: i_clusters = %u, i_size=%"MLFu64"\n", | ||
572 | fe->i_clusters, fe->i_size); | ||
573 | mlog(0, "inode: ip_clusters=%u, i_size=%lld\n", | ||
574 | OCFS2_I(inode)->ip_clusters, i_size_read(inode)); | ||
575 | |||
576 | leave: | ||
577 | if (drop_alloc_sem) { | ||
578 | up_write(&OCFS2_I(inode)->ip_alloc_sem); | ||
579 | drop_alloc_sem = 0; | ||
580 | } | ||
581 | if (handle) { | ||
582 | ocfs2_commit_trans(handle); | ||
583 | handle = NULL; | ||
584 | } | ||
585 | if (data_ac) { | ||
586 | ocfs2_free_alloc_context(data_ac); | ||
587 | data_ac = NULL; | ||
588 | } | ||
589 | if (meta_ac) { | ||
590 | ocfs2_free_alloc_context(meta_ac); | ||
591 | meta_ac = NULL; | ||
592 | } | ||
593 | if ((!status) && restart_func) { | ||
594 | restart_func = 0; | ||
595 | goto restart_all; | ||
596 | } | ||
597 | if (bh) { | ||
598 | brelse(bh); | ||
599 | bh = NULL; | ||
600 | } | ||
601 | |||
602 | mlog_exit(status); | ||
603 | return status; | ||
604 | } | ||
605 | |||
606 | /* Some parts of this taken from generic_cont_expand, which turned out | ||
607 | * to be too fragile to do exactly what we need without us having to | ||
608 | * worry about recursive locking in ->commit_write(). */ | ||
609 | static int ocfs2_write_zero_page(struct inode *inode, | ||
610 | u64 size) | ||
611 | { | ||
612 | struct address_space *mapping = inode->i_mapping; | ||
613 | struct page *page; | ||
614 | unsigned long index; | ||
615 | unsigned int offset; | ||
616 | struct ocfs2_journal_handle *handle = NULL; | ||
617 | int ret; | ||
618 | |||
619 | offset = (size & (PAGE_CACHE_SIZE-1)); /* Within page */ | ||
620 | /* ugh. in prepare/commit_write, if from==to==start of block, we | ||
621 | ** skip the prepare. make sure we never send an offset for the start | ||
622 | ** of a block | ||
623 | */ | ||
624 | if ((offset & (inode->i_sb->s_blocksize - 1)) == 0) { | ||
625 | offset++; | ||
626 | } | ||
627 | index = size >> PAGE_CACHE_SHIFT; | ||
628 | |||
629 | page = grab_cache_page(mapping, index); | ||
630 | if (!page) { | ||
631 | ret = -ENOMEM; | ||
632 | mlog_errno(ret); | ||
633 | goto out; | ||
634 | } | ||
635 | |||
636 | ret = ocfs2_prepare_write(NULL, page, offset, offset); | ||
637 | if (ret < 0) { | ||
638 | mlog_errno(ret); | ||
639 | goto out_unlock; | ||
640 | } | ||
641 | |||
642 | if (ocfs2_should_order_data(inode)) { | ||
643 | handle = ocfs2_start_walk_page_trans(inode, page, offset, | ||
644 | offset); | ||
645 | if (IS_ERR(handle)) { | ||
646 | ret = PTR_ERR(handle); | ||
647 | handle = NULL; | ||
648 | goto out_unlock; | ||
649 | } | ||
650 | } | ||
651 | |||
652 | /* must not update i_size! */ | ||
653 | ret = block_commit_write(page, offset, offset); | ||
654 | if (ret < 0) | ||
655 | mlog_errno(ret); | ||
656 | else | ||
657 | ret = 0; | ||
658 | |||
659 | if (handle) | ||
660 | ocfs2_commit_trans(handle); | ||
661 | out_unlock: | ||
662 | unlock_page(page); | ||
663 | page_cache_release(page); | ||
664 | out: | ||
665 | return ret; | ||
666 | } | ||
667 | |||
668 | static int ocfs2_zero_extend(struct inode *inode, | ||
669 | u64 zero_to_size) | ||
670 | { | ||
671 | int ret = 0; | ||
672 | u64 start_off; | ||
673 | struct super_block *sb = inode->i_sb; | ||
674 | |||
675 | start_off = ocfs2_align_bytes_to_blocks(sb, i_size_read(inode)); | ||
676 | while (start_off < zero_to_size) { | ||
677 | ret = ocfs2_write_zero_page(inode, start_off); | ||
678 | if (ret < 0) { | ||
679 | mlog_errno(ret); | ||
680 | goto out; | ||
681 | } | ||
682 | |||
683 | start_off += sb->s_blocksize; | ||
684 | } | ||
685 | |||
686 | out: | ||
687 | return ret; | ||
688 | } | ||
689 | |||
690 | static int ocfs2_extend_file(struct inode *inode, | ||
691 | struct buffer_head *di_bh, | ||
692 | u64 new_i_size) | ||
693 | { | ||
694 | int ret = 0; | ||
695 | u32 clusters_to_add; | ||
696 | |||
697 | /* setattr sometimes calls us like this. */ | ||
698 | if (new_i_size == 0) | ||
699 | goto out; | ||
700 | |||
701 | if (i_size_read(inode) == new_i_size) | ||
702 | goto out; | ||
703 | BUG_ON(new_i_size < i_size_read(inode)); | ||
704 | |||
705 | clusters_to_add = ocfs2_clusters_for_bytes(inode->i_sb, new_i_size) - | ||
706 | OCFS2_I(inode)->ip_clusters; | ||
707 | |||
708 | if (clusters_to_add) { | ||
709 | ret = ocfs2_extend_allocation(inode, clusters_to_add); | ||
710 | if (ret < 0) { | ||
711 | mlog_errno(ret); | ||
712 | goto out; | ||
713 | } | ||
714 | |||
715 | ret = ocfs2_zero_extend(inode, new_i_size); | ||
716 | if (ret < 0) { | ||
717 | mlog_errno(ret); | ||
718 | goto out; | ||
719 | } | ||
720 | } | ||
721 | |||
722 | /* No allocation required, we just use this helper to | ||
723 | * do a trivial update of i_size. */ | ||
724 | ret = ocfs2_simple_size_update(inode, di_bh, new_i_size); | ||
725 | if (ret < 0) { | ||
726 | mlog_errno(ret); | ||
727 | goto out; | ||
728 | } | ||
729 | |||
730 | out: | ||
731 | return ret; | ||
732 | } | ||
733 | |||
734 | int ocfs2_setattr(struct dentry *dentry, struct iattr *attr) | ||
735 | { | ||
736 | int status = 0, size_change; | ||
737 | struct inode *inode = dentry->d_inode; | ||
738 | struct super_block *sb = inode->i_sb; | ||
739 | struct ocfs2_super *osb = OCFS2_SB(sb); | ||
740 | struct buffer_head *bh = NULL; | ||
741 | struct ocfs2_journal_handle *handle = NULL; | ||
742 | |||
743 | mlog_entry("(0x%p, '%.*s')\n", dentry, | ||
744 | dentry->d_name.len, dentry->d_name.name); | ||
745 | |||
746 | if (attr->ia_valid & ATTR_MODE) | ||
747 | mlog(0, "mode change: %d\n", attr->ia_mode); | ||
748 | if (attr->ia_valid & ATTR_UID) | ||
749 | mlog(0, "uid change: %d\n", attr->ia_uid); | ||
750 | if (attr->ia_valid & ATTR_GID) | ||
751 | mlog(0, "gid change: %d\n", attr->ia_gid); | ||
752 | if (attr->ia_valid & ATTR_SIZE) | ||
753 | mlog(0, "size change...\n"); | ||
754 | if (attr->ia_valid & (ATTR_ATIME | ATTR_MTIME | ATTR_CTIME)) | ||
755 | mlog(0, "time change...\n"); | ||
756 | |||
757 | #define OCFS2_VALID_ATTRS (ATTR_ATIME | ATTR_MTIME | ATTR_CTIME | ATTR_SIZE \ | ||
758 | | ATTR_GID | ATTR_UID | ATTR_MODE) | ||
759 | if (!(attr->ia_valid & OCFS2_VALID_ATTRS)) { | ||
760 | mlog(0, "can't handle attrs: 0x%x\n", attr->ia_valid); | ||
761 | return 0; | ||
762 | } | ||
763 | |||
764 | status = inode_change_ok(inode, attr); | ||
765 | if (status) | ||
766 | return status; | ||
767 | |||
768 | size_change = S_ISREG(inode->i_mode) && attr->ia_valid & ATTR_SIZE; | ||
769 | if (size_change) { | ||
770 | status = ocfs2_rw_lock(inode, 1); | ||
771 | if (status < 0) { | ||
772 | mlog_errno(status); | ||
773 | goto bail; | ||
774 | } | ||
775 | } | ||
776 | |||
777 | status = ocfs2_meta_lock(inode, NULL, &bh, 1); | ||
778 | if (status < 0) { | ||
779 | if (status != -ENOENT) | ||
780 | mlog_errno(status); | ||
781 | goto bail_unlock_rw; | ||
782 | } | ||
783 | |||
784 | if (size_change && attr->ia_size != i_size_read(inode)) { | ||
785 | if (i_size_read(inode) > attr->ia_size) | ||
786 | status = ocfs2_truncate_file(inode, bh, attr->ia_size); | ||
787 | else | ||
788 | status = ocfs2_extend_file(inode, bh, attr->ia_size); | ||
789 | if (status < 0) { | ||
790 | if (status != -ENOSPC) | ||
791 | mlog_errno(status); | ||
792 | status = -ENOSPC; | ||
793 | goto bail_unlock; | ||
794 | } | ||
795 | } | ||
796 | |||
797 | handle = ocfs2_start_trans(osb, NULL, OCFS2_INODE_UPDATE_CREDITS); | ||
798 | if (IS_ERR(handle)) { | ||
799 | status = PTR_ERR(handle); | ||
800 | mlog_errno(status); | ||
801 | goto bail_unlock; | ||
802 | } | ||
803 | |||
804 | status = inode_setattr(inode, attr); | ||
805 | if (status < 0) { | ||
806 | mlog_errno(status); | ||
807 | goto bail_commit; | ||
808 | } | ||
809 | |||
810 | status = ocfs2_mark_inode_dirty(handle, inode, bh); | ||
811 | if (status < 0) | ||
812 | mlog_errno(status); | ||
813 | |||
814 | bail_commit: | ||
815 | ocfs2_commit_trans(handle); | ||
816 | bail_unlock: | ||
817 | ocfs2_meta_unlock(inode, 1); | ||
818 | bail_unlock_rw: | ||
819 | if (size_change) | ||
820 | ocfs2_rw_unlock(inode, 1); | ||
821 | bail: | ||
822 | if (bh) | ||
823 | brelse(bh); | ||
824 | |||
825 | mlog_exit(status); | ||
826 | return status; | ||
827 | } | ||
828 | |||
829 | int ocfs2_getattr(struct vfsmount *mnt, | ||
830 | struct dentry *dentry, | ||
831 | struct kstat *stat) | ||
832 | { | ||
833 | struct inode *inode = dentry->d_inode; | ||
834 | struct super_block *sb = dentry->d_inode->i_sb; | ||
835 | struct ocfs2_super *osb = sb->s_fs_info; | ||
836 | int err; | ||
837 | |||
838 | mlog_entry_void(); | ||
839 | |||
840 | err = ocfs2_inode_revalidate(dentry); | ||
841 | if (err) { | ||
842 | if (err != -ENOENT) | ||
843 | mlog_errno(err); | ||
844 | goto bail; | ||
845 | } | ||
846 | |||
847 | generic_fillattr(inode, stat); | ||
848 | |||
849 | /* We set the blksize from the cluster size for performance */ | ||
850 | stat->blksize = osb->s_clustersize; | ||
851 | |||
852 | bail: | ||
853 | mlog_exit(err); | ||
854 | |||
855 | return err; | ||
856 | } | ||
857 | |||
858 | static int ocfs2_write_remove_suid(struct inode *inode) | ||
859 | { | ||
860 | int ret; | ||
861 | struct buffer_head *bh = NULL; | ||
862 | struct ocfs2_inode_info *oi = OCFS2_I(inode); | ||
863 | struct ocfs2_journal_handle *handle; | ||
864 | struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); | ||
865 | struct ocfs2_dinode *di; | ||
866 | |||
867 | mlog_entry("(Inode %"MLFu64", mode 0%o)\n", oi->ip_blkno, | ||
868 | inode->i_mode); | ||
869 | |||
870 | handle = ocfs2_start_trans(osb, NULL, OCFS2_INODE_UPDATE_CREDITS); | ||
871 | if (handle == NULL) { | ||
872 | ret = -ENOMEM; | ||
873 | mlog_errno(ret); | ||
874 | goto out; | ||
875 | } | ||
876 | |||
877 | ret = ocfs2_read_block(osb, oi->ip_blkno, &bh, OCFS2_BH_CACHED, inode); | ||
878 | if (ret < 0) { | ||
879 | mlog_errno(ret); | ||
880 | goto out_trans; | ||
881 | } | ||
882 | |||
883 | ret = ocfs2_journal_access(handle, inode, bh, | ||
884 | OCFS2_JOURNAL_ACCESS_WRITE); | ||
885 | if (ret < 0) { | ||
886 | mlog_errno(ret); | ||
887 | goto out_bh; | ||
888 | } | ||
889 | |||
890 | inode->i_mode &= ~S_ISUID; | ||
891 | if ((inode->i_mode & S_ISGID) && (inode->i_mode & S_IXGRP)) | ||
892 | inode->i_mode &= ~S_ISGID; | ||
893 | |||
894 | di = (struct ocfs2_dinode *) bh->b_data; | ||
895 | di->i_mode = cpu_to_le16(inode->i_mode); | ||
896 | |||
897 | ret = ocfs2_journal_dirty(handle, bh); | ||
898 | if (ret < 0) | ||
899 | mlog_errno(ret); | ||
900 | out_bh: | ||
901 | brelse(bh); | ||
902 | out_trans: | ||
903 | ocfs2_commit_trans(handle); | ||
904 | out: | ||
905 | mlog_exit(ret); | ||
906 | return ret; | ||
907 | } | ||
908 | |||
909 | static inline int ocfs2_write_should_remove_suid(struct inode *inode) | ||
910 | { | ||
911 | mode_t mode = inode->i_mode; | ||
912 | |||
913 | if (!capable(CAP_FSETID)) { | ||
914 | if (unlikely(mode & S_ISUID)) | ||
915 | return 1; | ||
916 | |||
917 | if (unlikely((mode & S_ISGID) && (mode & S_IXGRP))) | ||
918 | return 1; | ||
919 | } | ||
920 | return 0; | ||
921 | } | ||
922 | |||
923 | static ssize_t ocfs2_file_aio_write(struct kiocb *iocb, | ||
924 | const char __user *buf, | ||
925 | size_t count, | ||
926 | loff_t pos) | ||
927 | { | ||
928 | struct iovec local_iov = { .iov_base = (void __user *)buf, | ||
929 | .iov_len = count }; | ||
930 | int ret, rw_level = -1, meta_level = -1, have_alloc_sem = 0; | ||
931 | u32 clusters; | ||
932 | struct file *filp = iocb->ki_filp; | ||
933 | struct inode *inode = filp->f_dentry->d_inode; | ||
934 | loff_t newsize, saved_pos; | ||
935 | #ifdef OCFS2_ORACORE_WORKAROUNDS | ||
936 | struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); | ||
937 | #endif | ||
938 | |||
939 | mlog_entry("(0x%p, 0x%p, %u, '%.*s')\n", filp, buf, | ||
940 | (unsigned int)count, | ||
941 | filp->f_dentry->d_name.len, | ||
942 | filp->f_dentry->d_name.name); | ||
943 | |||
944 | /* happy write of zero bytes */ | ||
945 | if (count == 0) | ||
946 | return 0; | ||
947 | |||
948 | if (!inode) { | ||
949 | mlog(0, "bad inode\n"); | ||
950 | return -EIO; | ||
951 | } | ||
952 | |||
953 | #ifdef OCFS2_ORACORE_WORKAROUNDS | ||
954 | /* ugh, work around some applications which open everything O_DIRECT + | ||
955 | * O_APPEND and really don't mean to use O_DIRECT. */ | ||
956 | if (osb->s_mount_opt & OCFS2_MOUNT_COMPAT_OCFS && | ||
957 | (filp->f_flags & O_APPEND) && (filp->f_flags & O_DIRECT)) | ||
958 | filp->f_flags &= ~O_DIRECT; | ||
959 | #endif | ||
960 | |||
961 | down(&inode->i_sem); | ||
962 | /* to match setattr's i_sem -> i_alloc_sem -> rw_lock ordering */ | ||
963 | if (filp->f_flags & O_DIRECT) { | ||
964 | have_alloc_sem = 1; | ||
965 | down_read(&inode->i_alloc_sem); | ||
966 | } | ||
967 | |||
968 | /* concurrent O_DIRECT writes are allowed */ | ||
969 | rw_level = (filp->f_flags & O_DIRECT) ? 0 : 1; | ||
970 | ret = ocfs2_rw_lock(inode, rw_level); | ||
971 | if (ret < 0) { | ||
972 | rw_level = -1; | ||
973 | mlog_errno(ret); | ||
974 | goto out; | ||
975 | } | ||
976 | |||
977 | /* | ||
978 | * We sample i_size under a read level meta lock to see if our write | ||
979 | * is extending the file, if it is we back off and get a write level | ||
980 | * meta lock. | ||
981 | */ | ||
982 | meta_level = (filp->f_flags & O_APPEND) ? 1 : 0; | ||
983 | for(;;) { | ||
984 | ret = ocfs2_meta_lock(inode, NULL, NULL, meta_level); | ||
985 | if (ret < 0) { | ||
986 | meta_level = -1; | ||
987 | mlog_errno(ret); | ||
988 | goto out; | ||
989 | } | ||
990 | |||
991 | /* Clear suid / sgid if necessary. We do this here | ||
992 | * instead of later in the write path because | ||
993 | * remove_suid() calls ->setattr without any hint that | ||
994 | * we may have already done our cluster locking. Since | ||
995 | * ocfs2_setattr() *must* take cluster locks to | ||
996 | * proceeed, this will lead us to recursively lock the | ||
997 | * inode. There's also the dinode i_size state which | ||
998 | * can be lost via setattr during extending writes (we | ||
999 | * set inode->i_size at the end of a write. */ | ||
1000 | if (ocfs2_write_should_remove_suid(inode)) { | ||
1001 | if (meta_level == 0) { | ||
1002 | ocfs2_meta_unlock(inode, meta_level); | ||
1003 | meta_level = 1; | ||
1004 | continue; | ||
1005 | } | ||
1006 | |||
1007 | ret = ocfs2_write_remove_suid(inode); | ||
1008 | if (ret < 0) { | ||
1009 | mlog_errno(ret); | ||
1010 | goto out; | ||
1011 | } | ||
1012 | } | ||
1013 | |||
1014 | /* work on a copy of ppos until we're sure that we won't have | ||
1015 | * to recalculate it due to relocking. */ | ||
1016 | if (filp->f_flags & O_APPEND) { | ||
1017 | saved_pos = i_size_read(inode); | ||
1018 | mlog(0, "O_APPEND: inode->i_size=%llu\n", saved_pos); | ||
1019 | } else { | ||
1020 | saved_pos = iocb->ki_pos; | ||
1021 | } | ||
1022 | newsize = count + saved_pos; | ||
1023 | |||
1024 | mlog(0, "pos=%lld newsize=%"MLFu64" cursize=%lld\n", | ||
1025 | saved_pos, newsize, i_size_read(inode)); | ||
1026 | |||
1027 | /* No need for a higher level metadata lock if we're | ||
1028 | * never going past i_size. */ | ||
1029 | if (newsize <= i_size_read(inode)) | ||
1030 | break; | ||
1031 | |||
1032 | if (meta_level == 0) { | ||
1033 | ocfs2_meta_unlock(inode, meta_level); | ||
1034 | meta_level = 1; | ||
1035 | continue; | ||
1036 | } | ||
1037 | |||
1038 | spin_lock(&OCFS2_I(inode)->ip_lock); | ||
1039 | clusters = ocfs2_clusters_for_bytes(inode->i_sb, newsize) - | ||
1040 | OCFS2_I(inode)->ip_clusters; | ||
1041 | spin_unlock(&OCFS2_I(inode)->ip_lock); | ||
1042 | |||
1043 | mlog(0, "Writing at EOF, may need more allocation: " | ||
1044 | "i_size = %lld, newsize = %"MLFu64", need %u clusters\n", | ||
1045 | i_size_read(inode), newsize, clusters); | ||
1046 | |||
1047 | /* We only want to continue the rest of this loop if | ||
1048 | * our extend will actually require more | ||
1049 | * allocation. */ | ||
1050 | if (!clusters) | ||
1051 | break; | ||
1052 | |||
1053 | ret = ocfs2_extend_allocation(inode, clusters); | ||
1054 | if (ret < 0) { | ||
1055 | if (ret != -ENOSPC) | ||
1056 | mlog_errno(ret); | ||
1057 | goto out; | ||
1058 | } | ||
1059 | |||
1060 | /* Fill any holes which would've been created by this | ||
1061 | * write. If we're O_APPEND, this will wind up | ||
1062 | * (correctly) being a noop. */ | ||
1063 | ret = ocfs2_zero_extend(inode, (u64) newsize - count); | ||
1064 | if (ret < 0) { | ||
1065 | mlog_errno(ret); | ||
1066 | goto out; | ||
1067 | } | ||
1068 | break; | ||
1069 | } | ||
1070 | |||
1071 | /* ok, we're done with i_size and alloc work */ | ||
1072 | iocb->ki_pos = saved_pos; | ||
1073 | ocfs2_meta_unlock(inode, meta_level); | ||
1074 | meta_level = -1; | ||
1075 | |||
1076 | /* communicate with ocfs2_dio_end_io */ | ||
1077 | ocfs2_iocb_set_rw_locked(iocb); | ||
1078 | |||
1079 | #ifdef OCFS2_ORACORE_WORKAROUNDS | ||
1080 | if (osb->s_mount_opt & OCFS2_MOUNT_COMPAT_OCFS && | ||
1081 | filp->f_flags & O_DIRECT) { | ||
1082 | unsigned int saved_flags = filp->f_flags; | ||
1083 | int sector_size = 1 << osb->s_sectsize_bits; | ||
1084 | |||
1085 | if ((saved_pos & (sector_size - 1)) || | ||
1086 | (count & (sector_size - 1)) || | ||
1087 | ((unsigned long)buf & (sector_size - 1))) { | ||
1088 | filp->f_flags |= O_SYNC; | ||
1089 | filp->f_flags &= ~O_DIRECT; | ||
1090 | } | ||
1091 | |||
1092 | ret = generic_file_aio_write_nolock(iocb, &local_iov, 1, | ||
1093 | &iocb->ki_pos); | ||
1094 | |||
1095 | filp->f_flags = saved_flags; | ||
1096 | } else | ||
1097 | #endif | ||
1098 | ret = generic_file_aio_write_nolock(iocb, &local_iov, 1, | ||
1099 | &iocb->ki_pos); | ||
1100 | |||
1101 | /* buffered aio wouldn't have proper lock coverage today */ | ||
1102 | BUG_ON(ret == -EIOCBQUEUED && !(filp->f_flags & O_DIRECT)); | ||
1103 | |||
1104 | /* | ||
1105 | * deep in g_f_a_w_n()->ocfs2_direct_IO we pass in a ocfs2_dio_end_io | ||
1106 | * function pointer which is called when o_direct io completes so that | ||
1107 | * it can unlock our rw lock. (it's the clustered equivalent of | ||
1108 | * i_alloc_sem; protects truncate from racing with pending ios). | ||
1109 | * Unfortunately there are error cases which call end_io and others | ||
1110 | * that don't. so we don't have to unlock the rw_lock if either an | ||
1111 | * async dio is going to do it in the future or an end_io after an | ||
1112 | * error has already done it. | ||
1113 | */ | ||
1114 | if (ret == -EIOCBQUEUED || !ocfs2_iocb_is_rw_locked(iocb)) { | ||
1115 | rw_level = -1; | ||
1116 | have_alloc_sem = 0; | ||
1117 | } | ||
1118 | |||
1119 | out: | ||
1120 | if (meta_level != -1) | ||
1121 | ocfs2_meta_unlock(inode, meta_level); | ||
1122 | if (have_alloc_sem) | ||
1123 | up_read(&inode->i_alloc_sem); | ||
1124 | if (rw_level != -1) | ||
1125 | ocfs2_rw_unlock(inode, rw_level); | ||
1126 | up(&inode->i_sem); | ||
1127 | |||
1128 | mlog_exit(ret); | ||
1129 | return ret; | ||
1130 | } | ||
1131 | |||
1132 | static ssize_t ocfs2_file_aio_read(struct kiocb *iocb, | ||
1133 | char __user *buf, | ||
1134 | size_t count, | ||
1135 | loff_t pos) | ||
1136 | { | ||
1137 | int ret = 0, rw_level = -1, have_alloc_sem = 0; | ||
1138 | struct file *filp = iocb->ki_filp; | ||
1139 | struct inode *inode = filp->f_dentry->d_inode; | ||
1140 | #ifdef OCFS2_ORACORE_WORKAROUNDS | ||
1141 | struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); | ||
1142 | #endif | ||
1143 | |||
1144 | mlog_entry("(0x%p, 0x%p, %u, '%.*s')\n", filp, buf, | ||
1145 | (unsigned int)count, | ||
1146 | filp->f_dentry->d_name.len, | ||
1147 | filp->f_dentry->d_name.name); | ||
1148 | |||
1149 | if (!inode) { | ||
1150 | ret = -EINVAL; | ||
1151 | mlog_errno(ret); | ||
1152 | goto bail; | ||
1153 | } | ||
1154 | |||
1155 | #ifdef OCFS2_ORACORE_WORKAROUNDS | ||
1156 | if (osb->s_mount_opt & OCFS2_MOUNT_COMPAT_OCFS) { | ||
1157 | if (filp->f_flags & O_DIRECT) { | ||
1158 | int sector_size = 1 << osb->s_sectsize_bits; | ||
1159 | |||
1160 | if ((pos & (sector_size - 1)) || | ||
1161 | (count & (sector_size - 1)) || | ||
1162 | ((unsigned long)buf & (sector_size - 1)) || | ||
1163 | (i_size_read(inode) & (sector_size -1))) { | ||
1164 | filp->f_flags &= ~O_DIRECT; | ||
1165 | } | ||
1166 | } | ||
1167 | } | ||
1168 | #endif | ||
1169 | |||
1170 | /* | ||
1171 | * buffered reads protect themselves in ->readpage(). O_DIRECT reads | ||
1172 | * need locks to protect pending reads from racing with truncate. | ||
1173 | */ | ||
1174 | if (filp->f_flags & O_DIRECT) { | ||
1175 | down_read(&inode->i_alloc_sem); | ||
1176 | have_alloc_sem = 1; | ||
1177 | |||
1178 | ret = ocfs2_rw_lock(inode, 0); | ||
1179 | if (ret < 0) { | ||
1180 | mlog_errno(ret); | ||
1181 | goto bail; | ||
1182 | } | ||
1183 | rw_level = 0; | ||
1184 | /* communicate with ocfs2_dio_end_io */ | ||
1185 | ocfs2_iocb_set_rw_locked(iocb); | ||
1186 | } | ||
1187 | |||
1188 | ret = generic_file_aio_read(iocb, buf, count, iocb->ki_pos); | ||
1189 | if (ret == -EINVAL) | ||
1190 | mlog(ML_ERROR, "generic_file_aio_read returned -EINVAL\n"); | ||
1191 | |||
1192 | /* buffered aio wouldn't have proper lock coverage today */ | ||
1193 | BUG_ON(ret == -EIOCBQUEUED && !(filp->f_flags & O_DIRECT)); | ||
1194 | |||
1195 | /* see ocfs2_file_aio_write */ | ||
1196 | if (ret == -EIOCBQUEUED || !ocfs2_iocb_is_rw_locked(iocb)) { | ||
1197 | rw_level = -1; | ||
1198 | have_alloc_sem = 0; | ||
1199 | } | ||
1200 | |||
1201 | bail: | ||
1202 | if (have_alloc_sem) | ||
1203 | up_read(&inode->i_alloc_sem); | ||
1204 | if (rw_level != -1) | ||
1205 | ocfs2_rw_unlock(inode, rw_level); | ||
1206 | mlog_exit(ret); | ||
1207 | |||
1208 | return ret; | ||
1209 | } | ||
1210 | |||
1211 | struct inode_operations ocfs2_file_iops = { | ||
1212 | .setattr = ocfs2_setattr, | ||
1213 | .getattr = ocfs2_getattr, | ||
1214 | }; | ||
1215 | |||
1216 | struct inode_operations ocfs2_special_file_iops = { | ||
1217 | .setattr = ocfs2_setattr, | ||
1218 | .getattr = ocfs2_getattr, | ||
1219 | }; | ||
1220 | |||
1221 | struct file_operations ocfs2_fops = { | ||
1222 | .read = do_sync_read, | ||
1223 | .write = do_sync_write, | ||
1224 | .sendfile = generic_file_sendfile, | ||
1225 | .mmap = ocfs2_mmap, | ||
1226 | .fsync = ocfs2_sync_file, | ||
1227 | .release = ocfs2_file_release, | ||
1228 | .open = ocfs2_file_open, | ||
1229 | .aio_read = ocfs2_file_aio_read, | ||
1230 | .aio_write = ocfs2_file_aio_write, | ||
1231 | }; | ||
1232 | |||
1233 | struct file_operations ocfs2_dops = { | ||
1234 | .read = generic_read_dir, | ||
1235 | .readdir = ocfs2_readdir, | ||
1236 | .fsync = ocfs2_sync_file, | ||
1237 | }; | ||
diff --git a/fs/ocfs2/file.h b/fs/ocfs2/file.h new file mode 100644 index 000000000000..a5ea33b24060 --- /dev/null +++ b/fs/ocfs2/file.h | |||
@@ -0,0 +1,57 @@ | |||
1 | /* -*- mode: c; c-basic-offset: 8; -*- | ||
2 | * vim: noexpandtab sw=8 ts=8 sts=0: | ||
3 | * | ||
4 | * file.h | ||
5 | * | ||
6 | * Function prototypes | ||
7 | * | ||
8 | * Copyright (C) 2002, 2004 Oracle. All rights reserved. | ||
9 | * | ||
10 | * This program is free software; you can redistribute it and/or | ||
11 | * modify it under the terms of the GNU General Public | ||
12 | * License as published by the Free Software Foundation; either | ||
13 | * version 2 of the License, or (at your option) any later version. | ||
14 | * | ||
15 | * This program is distributed in the hope that it will be useful, | ||
16 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
17 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | ||
18 | * General Public License for more details. | ||
19 | * | ||
20 | * You should have received a copy of the GNU General Public | ||
21 | * License along with this program; if not, write to the | ||
22 | * Free Software Foundation, Inc., 59 Temple Place - Suite 330, | ||
23 | * Boston, MA 021110-1307, USA. | ||
24 | */ | ||
25 | |||
26 | #ifndef OCFS2_FILE_H | ||
27 | #define OCFS2_FILE_H | ||
28 | |||
29 | extern struct file_operations ocfs2_fops; | ||
30 | extern struct file_operations ocfs2_dops; | ||
31 | extern struct inode_operations ocfs2_file_iops; | ||
32 | extern struct inode_operations ocfs2_special_file_iops; | ||
33 | struct ocfs2_alloc_context; | ||
34 | |||
35 | enum ocfs2_alloc_restarted { | ||
36 | RESTART_NONE = 0, | ||
37 | RESTART_TRANS, | ||
38 | RESTART_META | ||
39 | }; | ||
40 | int ocfs2_do_extend_allocation(struct ocfs2_super *osb, | ||
41 | struct inode *inode, | ||
42 | u32 clusters_to_add, | ||
43 | struct buffer_head *fe_bh, | ||
44 | struct ocfs2_journal_handle *handle, | ||
45 | struct ocfs2_alloc_context *data_ac, | ||
46 | struct ocfs2_alloc_context *meta_ac, | ||
47 | enum ocfs2_alloc_restarted *reason); | ||
48 | int ocfs2_setattr(struct dentry *dentry, struct iattr *attr); | ||
49 | int ocfs2_getattr(struct vfsmount *mnt, struct dentry *dentry, | ||
50 | struct kstat *stat); | ||
51 | |||
52 | int ocfs2_set_inode_size(struct ocfs2_journal_handle *handle, | ||
53 | struct inode *inode, | ||
54 | struct buffer_head *fe_bh, | ||
55 | u64 new_i_size); | ||
56 | |||
57 | #endif /* OCFS2_FILE_H */ | ||
diff --git a/fs/ocfs2/heartbeat.c b/fs/ocfs2/heartbeat.c new file mode 100644 index 000000000000..0bbd22f46c80 --- /dev/null +++ b/fs/ocfs2/heartbeat.c | |||
@@ -0,0 +1,378 @@ | |||
1 | /* -*- mode: c; c-basic-offset: 8; -*- | ||
2 | * vim: noexpandtab sw=8 ts=8 sts=0: | ||
3 | * | ||
4 | * heartbeat.c | ||
5 | * | ||
6 | * Register ourselves with the heartbeat service, keep our node maps | ||
7 | * up to date, and fire off recovery when needed. | ||
8 | * | ||
9 | * Copyright (C) 2002, 2004 Oracle. All rights reserved. | ||
10 | * | ||
11 | * This program is free software; you can redistribute it and/or | ||
12 | * modify it under the terms of the GNU General Public | ||
13 | * License as published by the Free Software Foundation; either | ||
14 | * version 2 of the License, or (at your option) any later version. | ||
15 | * | ||
16 | * This program is distributed in the hope that it will be useful, | ||
17 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
18 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | ||
19 | * General Public License for more details. | ||
20 | * | ||
21 | * You should have received a copy of the GNU General Public | ||
22 | * License along with this program; if not, write to the | ||
23 | * Free Software Foundation, Inc., 59 Temple Place - Suite 330, | ||
24 | * Boston, MA 021110-1307, USA. | ||
25 | */ | ||
26 | |||
27 | #include <linux/fs.h> | ||
28 | #include <linux/types.h> | ||
29 | #include <linux/slab.h> | ||
30 | #include <linux/highmem.h> | ||
31 | #include <linux/kmod.h> | ||
32 | |||
33 | #include <cluster/heartbeat.h> | ||
34 | #include <cluster/nodemanager.h> | ||
35 | |||
36 | #include <dlm/dlmapi.h> | ||
37 | |||
38 | #define MLOG_MASK_PREFIX ML_SUPER | ||
39 | #include <cluster/masklog.h> | ||
40 | |||
41 | #include "ocfs2.h" | ||
42 | |||
43 | #include "alloc.h" | ||
44 | #include "heartbeat.h" | ||
45 | #include "inode.h" | ||
46 | #include "journal.h" | ||
47 | #include "vote.h" | ||
48 | |||
49 | #include "buffer_head_io.h" | ||
50 | |||
51 | #define OCFS2_HB_NODE_DOWN_PRI (0x0000002) | ||
52 | #define OCFS2_HB_NODE_UP_PRI OCFS2_HB_NODE_DOWN_PRI | ||
53 | |||
/*
 * Forward declarations for the unlocked (double-underscore) node map
 * helpers defined further down in this file.
 */
static inline void __ocfs2_node_map_set_bit(struct ocfs2_node_map *map, int bit);
static inline void __ocfs2_node_map_clear_bit(struct ocfs2_node_map *map, int bit);
static inline int __ocfs2_node_map_is_empty(struct ocfs2_node_map *map);
static void __ocfs2_node_map_dup(struct ocfs2_node_map *target,
				 struct ocfs2_node_map *from);
static void __ocfs2_node_map_set(struct ocfs2_node_map *target,
				 struct ocfs2_node_map *from);
63 | |||
64 | void ocfs2_init_node_maps(struct ocfs2_super *osb) | ||
65 | { | ||
66 | spin_lock_init(&osb->node_map_lock); | ||
67 | ocfs2_node_map_init(&osb->mounted_map); | ||
68 | ocfs2_node_map_init(&osb->recovery_map); | ||
69 | ocfs2_node_map_init(&osb->umount_map); | ||
70 | } | ||
71 | |||
72 | static void ocfs2_do_node_down(int node_num, | ||
73 | struct ocfs2_super *osb) | ||
74 | { | ||
75 | BUG_ON(osb->node_num == node_num); | ||
76 | |||
77 | mlog(0, "ocfs2: node down event for %d\n", node_num); | ||
78 | |||
79 | if (!osb->dlm) { | ||
80 | /* | ||
81 | * No DLM means we're not even ready to participate yet. | ||
82 | * We check the slots after the DLM comes up, so we will | ||
83 | * notice the node death then. We can safely ignore it | ||
84 | * here. | ||
85 | */ | ||
86 | return; | ||
87 | } | ||
88 | |||
89 | if (ocfs2_node_map_test_bit(osb, &osb->umount_map, node_num)) { | ||
90 | /* If a node is in the umount map, then we've been | ||
91 | * expecting him to go down and we know ahead of time | ||
92 | * that recovery is not necessary. */ | ||
93 | ocfs2_node_map_clear_bit(osb, &osb->umount_map, node_num); | ||
94 | return; | ||
95 | } | ||
96 | |||
97 | ocfs2_recovery_thread(osb, node_num); | ||
98 | |||
99 | ocfs2_remove_node_from_vote_queues(osb, node_num); | ||
100 | } | ||
101 | |||
/* Heartbeat "node down" callback; @data is the ocfs2_super registered
 * in ocfs2_setup_hb_callbacks(). */
static void ocfs2_hb_node_down_cb(struct o2nm_node *node,
				  int node_num,
				  void *data)
{
	struct ocfs2_super *osb = (struct ocfs2_super *) data;

	ocfs2_do_node_down(node_num, osb);
}
108 | |||
109 | /* Called from the dlm when it's about to evict a node. We may also | ||
110 | * get a heartbeat callback later. */ | ||
111 | static void ocfs2_dlm_eviction_cb(int node_num, | ||
112 | void *data) | ||
113 | { | ||
114 | struct ocfs2_super *osb = (struct ocfs2_super *) data; | ||
115 | struct super_block *sb = osb->sb; | ||
116 | |||
117 | mlog(ML_NOTICE, "device (%u,%u): dlm has evicted node %d\n", | ||
118 | MAJOR(sb->s_dev), MINOR(sb->s_dev), node_num); | ||
119 | |||
120 | ocfs2_do_node_down(node_num, osb); | ||
121 | } | ||
122 | |||
123 | static void ocfs2_hb_node_up_cb(struct o2nm_node *node, | ||
124 | int node_num, | ||
125 | void *data) | ||
126 | { | ||
127 | struct ocfs2_super *osb = data; | ||
128 | |||
129 | BUG_ON(osb->node_num == node_num); | ||
130 | |||
131 | mlog(0, "node up event for %d\n", node_num); | ||
132 | ocfs2_node_map_clear_bit(osb, &osb->umount_map, node_num); | ||
133 | } | ||
134 | |||
135 | void ocfs2_setup_hb_callbacks(struct ocfs2_super *osb) | ||
136 | { | ||
137 | o2hb_setup_callback(&osb->osb_hb_down, O2HB_NODE_DOWN_CB, | ||
138 | ocfs2_hb_node_down_cb, osb, | ||
139 | OCFS2_HB_NODE_DOWN_PRI); | ||
140 | |||
141 | o2hb_setup_callback(&osb->osb_hb_up, O2HB_NODE_UP_CB, | ||
142 | ocfs2_hb_node_up_cb, osb, OCFS2_HB_NODE_UP_PRI); | ||
143 | |||
144 | /* Not exactly a heartbeat callback, but leads to essentially | ||
145 | * the same path so we set it up here. */ | ||
146 | dlm_setup_eviction_cb(&osb->osb_eviction_cb, | ||
147 | ocfs2_dlm_eviction_cb, | ||
148 | osb); | ||
149 | } | ||
150 | |||
151 | /* Most functions here are just stubs for now... */ | ||
152 | int ocfs2_register_hb_callbacks(struct ocfs2_super *osb) | ||
153 | { | ||
154 | int status; | ||
155 | |||
156 | status = o2hb_register_callback(&osb->osb_hb_down); | ||
157 | if (status < 0) { | ||
158 | mlog_errno(status); | ||
159 | goto bail; | ||
160 | } | ||
161 | |||
162 | status = o2hb_register_callback(&osb->osb_hb_up); | ||
163 | if (status < 0) | ||
164 | mlog_errno(status); | ||
165 | |||
166 | bail: | ||
167 | return status; | ||
168 | } | ||
169 | |||
170 | void ocfs2_clear_hb_callbacks(struct ocfs2_super *osb) | ||
171 | { | ||
172 | int status; | ||
173 | |||
174 | status = o2hb_unregister_callback(&osb->osb_hb_down); | ||
175 | if (status < 0) | ||
176 | mlog_errno(status); | ||
177 | |||
178 | status = o2hb_unregister_callback(&osb->osb_hb_up); | ||
179 | if (status < 0) | ||
180 | mlog_errno(status); | ||
181 | } | ||
182 | |||
183 | void ocfs2_stop_heartbeat(struct ocfs2_super *osb) | ||
184 | { | ||
185 | int ret; | ||
186 | char *argv[5], *envp[3]; | ||
187 | |||
188 | if (!osb->uuid_str) { | ||
189 | /* This can happen if we don't get far enough in mount... */ | ||
190 | mlog(0, "No UUID with which to stop heartbeat!\n\n"); | ||
191 | return; | ||
192 | } | ||
193 | |||
194 | argv[0] = (char *)o2nm_get_hb_ctl_path(); | ||
195 | argv[1] = "-K"; | ||
196 | argv[2] = "-u"; | ||
197 | argv[3] = osb->uuid_str; | ||
198 | argv[4] = NULL; | ||
199 | |||
200 | mlog(0, "Run: %s %s %s %s\n", argv[0], argv[1], argv[2], argv[3]); | ||
201 | |||
202 | /* minimal command environment taken from cpu_run_sbin_hotplug */ | ||
203 | envp[0] = "HOME=/"; | ||
204 | envp[1] = "PATH=/sbin:/bin:/usr/sbin:/usr/bin"; | ||
205 | envp[2] = NULL; | ||
206 | |||
207 | ret = call_usermodehelper(argv[0], argv, envp, 1); | ||
208 | if (ret < 0) | ||
209 | mlog_errno(ret); | ||
210 | } | ||
211 | |||
212 | /* special case -1 for now | ||
213 | * TODO: should *really* make sure the calling func never passes -1!! */ | ||
214 | void ocfs2_node_map_init(struct ocfs2_node_map *map) | ||
215 | { | ||
216 | map->num_nodes = OCFS2_NODE_MAP_MAX_NODES; | ||
217 | memset(map->map, 0, BITS_TO_LONGS(OCFS2_NODE_MAP_MAX_NODES) * | ||
218 | sizeof(unsigned long)); | ||
219 | } | ||
220 | |||
221 | static inline void __ocfs2_node_map_set_bit(struct ocfs2_node_map *map, | ||
222 | int bit) | ||
223 | { | ||
224 | set_bit(bit, map->map); | ||
225 | } | ||
226 | |||
227 | void ocfs2_node_map_set_bit(struct ocfs2_super *osb, | ||
228 | struct ocfs2_node_map *map, | ||
229 | int bit) | ||
230 | { | ||
231 | if (bit==-1) | ||
232 | return; | ||
233 | BUG_ON(bit >= map->num_nodes); | ||
234 | spin_lock(&osb->node_map_lock); | ||
235 | __ocfs2_node_map_set_bit(map, bit); | ||
236 | spin_unlock(&osb->node_map_lock); | ||
237 | } | ||
238 | |||
239 | static inline void __ocfs2_node_map_clear_bit(struct ocfs2_node_map *map, | ||
240 | int bit) | ||
241 | { | ||
242 | clear_bit(bit, map->map); | ||
243 | } | ||
244 | |||
245 | void ocfs2_node_map_clear_bit(struct ocfs2_super *osb, | ||
246 | struct ocfs2_node_map *map, | ||
247 | int bit) | ||
248 | { | ||
249 | if (bit==-1) | ||
250 | return; | ||
251 | BUG_ON(bit >= map->num_nodes); | ||
252 | spin_lock(&osb->node_map_lock); | ||
253 | __ocfs2_node_map_clear_bit(map, bit); | ||
254 | spin_unlock(&osb->node_map_lock); | ||
255 | } | ||
256 | |||
257 | int ocfs2_node_map_test_bit(struct ocfs2_super *osb, | ||
258 | struct ocfs2_node_map *map, | ||
259 | int bit) | ||
260 | { | ||
261 | int ret; | ||
262 | if (bit >= map->num_nodes) { | ||
263 | mlog(ML_ERROR, "bit=%d map->num_nodes=%d\n", bit, map->num_nodes); | ||
264 | BUG(); | ||
265 | } | ||
266 | spin_lock(&osb->node_map_lock); | ||
267 | ret = test_bit(bit, map->map); | ||
268 | spin_unlock(&osb->node_map_lock); | ||
269 | return ret; | ||
270 | } | ||
271 | |||
272 | static inline int __ocfs2_node_map_is_empty(struct ocfs2_node_map *map) | ||
273 | { | ||
274 | int bit; | ||
275 | bit = find_next_bit(map->map, map->num_nodes, 0); | ||
276 | if (bit < map->num_nodes) | ||
277 | return 0; | ||
278 | return 1; | ||
279 | } | ||
280 | |||
281 | int ocfs2_node_map_is_empty(struct ocfs2_super *osb, | ||
282 | struct ocfs2_node_map *map) | ||
283 | { | ||
284 | int ret; | ||
285 | BUG_ON(map->num_nodes == 0); | ||
286 | spin_lock(&osb->node_map_lock); | ||
287 | ret = __ocfs2_node_map_is_empty(map); | ||
288 | spin_unlock(&osb->node_map_lock); | ||
289 | return ret; | ||
290 | } | ||
291 | |||
292 | static void __ocfs2_node_map_dup(struct ocfs2_node_map *target, | ||
293 | struct ocfs2_node_map *from) | ||
294 | { | ||
295 | BUG_ON(from->num_nodes == 0); | ||
296 | ocfs2_node_map_init(target); | ||
297 | __ocfs2_node_map_set(target, from); | ||
298 | } | ||
299 | |||
300 | /* returns 1 if bit is the only bit set in target, 0 otherwise */ | ||
301 | int ocfs2_node_map_is_only(struct ocfs2_super *osb, | ||
302 | struct ocfs2_node_map *target, | ||
303 | int bit) | ||
304 | { | ||
305 | struct ocfs2_node_map temp; | ||
306 | int ret; | ||
307 | |||
308 | spin_lock(&osb->node_map_lock); | ||
309 | __ocfs2_node_map_dup(&temp, target); | ||
310 | __ocfs2_node_map_clear_bit(&temp, bit); | ||
311 | ret = __ocfs2_node_map_is_empty(&temp); | ||
312 | spin_unlock(&osb->node_map_lock); | ||
313 | |||
314 | return ret; | ||
315 | } | ||
316 | |||
317 | static void __ocfs2_node_map_set(struct ocfs2_node_map *target, | ||
318 | struct ocfs2_node_map *from) | ||
319 | { | ||
320 | int num_longs, i; | ||
321 | |||
322 | BUG_ON(target->num_nodes != from->num_nodes); | ||
323 | BUG_ON(target->num_nodes == 0); | ||
324 | |||
325 | num_longs = BITS_TO_LONGS(target->num_nodes); | ||
326 | for (i = 0; i < num_longs; i++) | ||
327 | target->map[i] = from->map[i]; | ||
328 | } | ||
329 | |||
330 | /* Returns whether the recovery bit was actually set - it may not be | ||
331 | * if a node is still marked as needing recovery */ | ||
332 | int ocfs2_recovery_map_set(struct ocfs2_super *osb, | ||
333 | int num) | ||
334 | { | ||
335 | int set = 0; | ||
336 | |||
337 | spin_lock(&osb->node_map_lock); | ||
338 | |||
339 | __ocfs2_node_map_clear_bit(&osb->mounted_map, num); | ||
340 | |||
341 | if (!test_bit(num, osb->recovery_map.map)) { | ||
342 | __ocfs2_node_map_set_bit(&osb->recovery_map, num); | ||
343 | set = 1; | ||
344 | } | ||
345 | |||
346 | spin_unlock(&osb->node_map_lock); | ||
347 | |||
348 | return set; | ||
349 | } | ||
350 | |||
351 | void ocfs2_recovery_map_clear(struct ocfs2_super *osb, | ||
352 | int num) | ||
353 | { | ||
354 | ocfs2_node_map_clear_bit(osb, &osb->recovery_map, num); | ||
355 | } | ||
356 | |||
357 | int ocfs2_node_map_iterate(struct ocfs2_super *osb, | ||
358 | struct ocfs2_node_map *map, | ||
359 | int idx) | ||
360 | { | ||
361 | int i = idx; | ||
362 | |||
363 | idx = O2NM_INVALID_NODE_NUM; | ||
364 | spin_lock(&osb->node_map_lock); | ||
365 | if ((i != O2NM_INVALID_NODE_NUM) && | ||
366 | (i >= 0) && | ||
367 | (i < map->num_nodes)) { | ||
368 | while(i < map->num_nodes) { | ||
369 | if (test_bit(i, map->map)) { | ||
370 | idx = i; | ||
371 | break; | ||
372 | } | ||
373 | i++; | ||
374 | } | ||
375 | } | ||
376 | spin_unlock(&osb->node_map_lock); | ||
377 | return idx; | ||
378 | } | ||
diff --git a/fs/ocfs2/heartbeat.h b/fs/ocfs2/heartbeat.h new file mode 100644 index 000000000000..e8fb079122e4 --- /dev/null +++ b/fs/ocfs2/heartbeat.h | |||
@@ -0,0 +1,67 @@ | |||
1 | /* -*- mode: c; c-basic-offset: 8; -*- | ||
2 | * vim: noexpandtab sw=8 ts=8 sts=0: | ||
3 | * | ||
4 | * heartbeat.h | ||
5 | * | ||
6 | * Function prototypes | ||
7 | * | ||
8 | * Copyright (C) 2002, 2004 Oracle. All rights reserved. | ||
9 | * | ||
10 | * This program is free software; you can redistribute it and/or | ||
11 | * modify it under the terms of the GNU General Public | ||
12 | * License as published by the Free Software Foundation; either | ||
13 | * version 2 of the License, or (at your option) any later version. | ||
14 | * | ||
15 | * This program is distributed in the hope that it will be useful, | ||
16 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
17 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | ||
18 | * General Public License for more details. | ||
19 | * | ||
20 | * You should have received a copy of the GNU General Public | ||
21 | * License along with this program; if not, write to the | ||
22 | * Free Software Foundation, Inc., 59 Temple Place - Suite 330, | ||
23 | * Boston, MA 02111-1307, USA. | ||
24 | */ | ||
25 | |||
#ifndef OCFS2_HEARTBEAT_H
#define OCFS2_HEARTBEAT_H

void ocfs2_init_node_maps(struct ocfs2_super *osb);

/* Heartbeat callback management. */
void ocfs2_setup_hb_callbacks(struct ocfs2_super *osb);
int ocfs2_register_hb_callbacks(struct ocfs2_super *osb);
void ocfs2_clear_hb_callbacks(struct ocfs2_super *osb);
void ocfs2_stop_heartbeat(struct ocfs2_super *osb);

/*
 * Node map functions - used to keep track of mounted and in-recovery
 * nodes.
 */
void ocfs2_node_map_init(struct ocfs2_node_map *map);
int ocfs2_node_map_is_empty(struct ocfs2_super *osb,
			    struct ocfs2_node_map *map);
void ocfs2_node_map_set_bit(struct ocfs2_super *osb,
			    struct ocfs2_node_map *map, int bit);
void ocfs2_node_map_clear_bit(struct ocfs2_super *osb,
			      struct ocfs2_node_map *map, int bit);
int ocfs2_node_map_test_bit(struct ocfs2_super *osb,
			    struct ocfs2_node_map *map, int bit);

/* Returns the first set bit at or after idx, or O2NM_INVALID_NODE_NUM
 * if there is none. */
int ocfs2_node_map_iterate(struct ocfs2_super *osb,
			   struct ocfs2_node_map *map, int idx);

/* Convenience wrapper: ocfs2_node_map_iterate() starting from bit 0. */
static inline int ocfs2_node_map_first_set_bit(struct ocfs2_super *osb,
					       struct ocfs2_node_map *map)
{
	return ocfs2_node_map_iterate(osb, map, 0);
}

/* Returns 1 if this call set the recovery bit for node num, 0 if the
 * bit was already set. */
int ocfs2_recovery_map_set(struct ocfs2_super *osb, int num);
void ocfs2_recovery_map_clear(struct ocfs2_super *osb, int num);

/* returns 1 if bit is the only bit set in target, 0 otherwise */
int ocfs2_node_map_is_only(struct ocfs2_super *osb,
			   struct ocfs2_node_map *target, int bit);

#endif /* OCFS2_HEARTBEAT_H */
diff --git a/fs/ocfs2/inode.c b/fs/ocfs2/inode.c new file mode 100644 index 000000000000..a91ba4dec936 --- /dev/null +++ b/fs/ocfs2/inode.c | |||
@@ -0,0 +1,1140 @@ | |||
1 | /* -*- mode: c; c-basic-offset: 8; -*- | ||
2 | * vim: noexpandtab sw=8 ts=8 sts=0: | ||
3 | * | ||
4 | * inode.c | ||
5 | * | ||
6 | * vfs' aops, fops, dops and iops | ||
7 | * | ||
8 | * Copyright (C) 2002, 2004 Oracle. All rights reserved. | ||
9 | * | ||
10 | * This program is free software; you can redistribute it and/or | ||
11 | * modify it under the terms of the GNU General Public | ||
12 | * License as published by the Free Software Foundation; either | ||
13 | * version 2 of the License, or (at your option) any later version. | ||
14 | * | ||
15 | * This program is distributed in the hope that it will be useful, | ||
16 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
17 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | ||
18 | * General Public License for more details. | ||
19 | * | ||
20 | * You should have received a copy of the GNU General Public | ||
21 | * License along with this program; if not, write to the | ||
22 | * Free Software Foundation, Inc., 59 Temple Place - Suite 330, | ||
23 | * Boston, MA 02111-1307, USA. | ||
24 | */ | ||
25 | |||
26 | #include <linux/fs.h> | ||
27 | #include <linux/types.h> | ||
28 | #include <linux/slab.h> | ||
29 | #include <linux/highmem.h> | ||
30 | #include <linux/pagemap.h> | ||
31 | #include <linux/smp_lock.h> | ||
32 | |||
33 | #include <asm/byteorder.h> | ||
34 | |||
35 | #define MLOG_MASK_PREFIX ML_INODE | ||
36 | #include <cluster/masklog.h> | ||
37 | |||
38 | #include "ocfs2.h" | ||
39 | |||
40 | #include "alloc.h" | ||
41 | #include "dlmglue.h" | ||
42 | #include "extent_map.h" | ||
43 | #include "file.h" | ||
44 | #include "inode.h" | ||
45 | #include "journal.h" | ||
46 | #include "namei.h" | ||
47 | #include "suballoc.h" | ||
48 | #include "super.h" | ||
49 | #include "symlink.h" | ||
50 | #include "sysfile.h" | ||
51 | #include "uptodate.h" | ||
52 | #include "vote.h" | ||
53 | |||
54 | #include "buffer_head_io.h" | ||
55 | |||
56 | #define OCFS2_FI_FLAG_NOWAIT 0x1 | ||
57 | #define OCFS2_FI_FLAG_DELETE 0x2 | ||
58 | struct ocfs2_find_inode_args | ||
59 | { | ||
60 | u64 fi_blkno; | ||
61 | unsigned long fi_ino; | ||
62 | unsigned int fi_flags; | ||
63 | }; | ||
64 | |||
65 | static int ocfs2_read_locked_inode(struct inode *inode, | ||
66 | struct ocfs2_find_inode_args *args); | ||
67 | static int ocfs2_init_locked_inode(struct inode *inode, void *opaque); | ||
68 | static int ocfs2_find_actor(struct inode *inode, void *opaque); | ||
69 | static int ocfs2_truncate_for_delete(struct ocfs2_super *osb, | ||
70 | struct inode *inode, | ||
71 | struct buffer_head *fe_bh); | ||
72 | |||
73 | struct inode *ocfs2_ilookup_for_vote(struct ocfs2_super *osb, | ||
74 | u64 blkno, | ||
75 | int delete_vote) | ||
76 | { | ||
77 | struct ocfs2_find_inode_args args; | ||
78 | |||
79 | /* ocfs2_ilookup_for_vote should *only* be called from the | ||
80 | * vote thread */ | ||
81 | BUG_ON(current != osb->vote_task); | ||
82 | |||
83 | args.fi_blkno = blkno; | ||
84 | args.fi_flags = OCFS2_FI_FLAG_NOWAIT; | ||
85 | if (delete_vote) | ||
86 | args.fi_flags |= OCFS2_FI_FLAG_DELETE; | ||
87 | args.fi_ino = ino_from_blkno(osb->sb, blkno); | ||
88 | return ilookup5(osb->sb, args.fi_ino, ocfs2_find_actor, &args); | ||
89 | } | ||
90 | |||
91 | struct inode *ocfs2_iget(struct ocfs2_super *osb, u64 blkno) | ||
92 | { | ||
93 | struct inode *inode = NULL; | ||
94 | struct super_block *sb = osb->sb; | ||
95 | struct ocfs2_find_inode_args args; | ||
96 | |||
97 | mlog_entry("(blkno = %"MLFu64")\n", blkno); | ||
98 | |||
99 | /* Ok. By now we've either got the offsets passed to us by the | ||
100 | * caller, or we just pulled them off the bh. Lets do some | ||
101 | * sanity checks to make sure they're OK. */ | ||
102 | if (blkno == 0) { | ||
103 | inode = ERR_PTR(-EINVAL); | ||
104 | mlog_errno(PTR_ERR(inode)); | ||
105 | goto bail; | ||
106 | } | ||
107 | |||
108 | args.fi_blkno = blkno; | ||
109 | args.fi_flags = 0; | ||
110 | args.fi_ino = ino_from_blkno(sb, blkno); | ||
111 | |||
112 | inode = iget5_locked(sb, args.fi_ino, ocfs2_find_actor, | ||
113 | ocfs2_init_locked_inode, &args); | ||
114 | /* inode was *not* in the inode cache. 2.6.x requires | ||
115 | * us to do our own read_inode call and unlock it | ||
116 | * afterwards. */ | ||
117 | if (inode && inode->i_state & I_NEW) { | ||
118 | mlog(0, "Inode was not in inode cache, reading it.\n"); | ||
119 | ocfs2_read_locked_inode(inode, &args); | ||
120 | unlock_new_inode(inode); | ||
121 | } | ||
122 | if (inode == NULL) { | ||
123 | inode = ERR_PTR(-ENOMEM); | ||
124 | mlog_errno(PTR_ERR(inode)); | ||
125 | goto bail; | ||
126 | } | ||
127 | if (is_bad_inode(inode)) { | ||
128 | iput(inode); | ||
129 | inode = ERR_PTR(-ESTALE); | ||
130 | mlog_errno(PTR_ERR(inode)); | ||
131 | goto bail; | ||
132 | } | ||
133 | |||
134 | bail: | ||
135 | if (!IS_ERR(inode)) { | ||
136 | mlog(0, "returning inode with number %"MLFu64"\n", | ||
137 | OCFS2_I(inode)->ip_blkno); | ||
138 | mlog_exit_ptr(inode); | ||
139 | } else | ||
140 | mlog_errno(PTR_ERR(inode)); | ||
141 | |||
142 | return inode; | ||
143 | } | ||
144 | |||
145 | |||
146 | /* | ||
147 | * here's how inodes get read from disk: | ||
148 | * iget5_locked -> find_actor -> OCFS2_FIND_ACTOR | ||
149 | * found? : return the in-memory inode | ||
150 | * not found? : get_new_inode -> OCFS2_INIT_LOCKED_INODE | ||
151 | */ | ||
152 | |||
153 | static int ocfs2_find_actor(struct inode *inode, void *opaque) | ||
154 | { | ||
155 | struct ocfs2_find_inode_args *args = NULL; | ||
156 | struct ocfs2_inode_info *oi = OCFS2_I(inode); | ||
157 | int ret = 0; | ||
158 | |||
159 | mlog_entry("(0x%p, %lu, 0x%p)\n", inode, inode->i_ino, opaque); | ||
160 | |||
161 | args = opaque; | ||
162 | |||
163 | mlog_bug_on_msg(!inode, "No inode in find actor!\n"); | ||
164 | |||
165 | if (oi->ip_blkno != args->fi_blkno) | ||
166 | goto bail; | ||
167 | |||
168 | /* OCFS2_FI_FLAG_NOWAIT is *only* set from | ||
169 | * ocfs2_ilookup_for_vote which won't create an inode for one | ||
170 | * that isn't found. The vote thread which doesn't want to get | ||
171 | * an inode which is in the process of going away - otherwise | ||
172 | * the call to __wait_on_freeing_inode in find_inode_fast will | ||
173 | * cause it to deadlock on an inode which may be waiting on a | ||
174 | * vote (or lock release) in delete_inode */ | ||
175 | if ((args->fi_flags & OCFS2_FI_FLAG_NOWAIT) && | ||
176 | (inode->i_state & (I_FREEING|I_CLEAR))) { | ||
177 | /* As stated above, we're not going to return an | ||
178 | * inode. In the case of a delete vote, the voting | ||
179 | * code is going to signal the other node to go | ||
180 | * ahead. Mark that state here, so this freeing inode | ||
181 | * has the state when it gets to delete_inode. */ | ||
182 | if (args->fi_flags & OCFS2_FI_FLAG_DELETE) { | ||
183 | spin_lock(&oi->ip_lock); | ||
184 | ocfs2_mark_inode_remotely_deleted(inode); | ||
185 | spin_unlock(&oi->ip_lock); | ||
186 | } | ||
187 | goto bail; | ||
188 | } | ||
189 | |||
190 | ret = 1; | ||
191 | bail: | ||
192 | mlog_exit(ret); | ||
193 | return ret; | ||
194 | } | ||
195 | |||
196 | /* | ||
197 | * initialize the new inode, but don't do anything that would cause | ||
198 | * us to sleep. | ||
199 | * return 0 on success, 1 on failure | ||
200 | */ | ||
201 | static int ocfs2_init_locked_inode(struct inode *inode, void *opaque) | ||
202 | { | ||
203 | struct ocfs2_find_inode_args *args = opaque; | ||
204 | |||
205 | mlog_entry("inode = %p, opaque = %p\n", inode, opaque); | ||
206 | |||
207 | inode->i_ino = args->fi_ino; | ||
208 | OCFS2_I(inode)->ip_blkno = args->fi_blkno; | ||
209 | |||
210 | mlog_exit(0); | ||
211 | return 0; | ||
212 | } | ||
213 | |||
214 | int ocfs2_populate_inode(struct inode *inode, struct ocfs2_dinode *fe, | ||
215 | int create_ino) | ||
216 | { | ||
217 | struct super_block *sb; | ||
218 | struct ocfs2_super *osb; | ||
219 | int status = -EINVAL; | ||
220 | |||
221 | mlog_entry("(0x%p, size:%"MLFu64")\n", inode, fe->i_size); | ||
222 | |||
223 | sb = inode->i_sb; | ||
224 | osb = OCFS2_SB(sb); | ||
225 | |||
226 | /* this means that read_inode cannot create a superblock inode | ||
227 | * today. change if needed. */ | ||
228 | if (!OCFS2_IS_VALID_DINODE(fe) || | ||
229 | !(fe->i_flags & cpu_to_le32(OCFS2_VALID_FL))) { | ||
230 | mlog(ML_ERROR, "Invalid dinode: i_ino=%lu, i_blkno=%"MLFu64", " | ||
231 | "signature = %.*s, flags = 0x%x\n", | ||
232 | inode->i_ino, le64_to_cpu(fe->i_blkno), 7, | ||
233 | fe->i_signature, le32_to_cpu(fe->i_flags)); | ||
234 | goto bail; | ||
235 | } | ||
236 | |||
237 | if (le32_to_cpu(fe->i_fs_generation) != osb->fs_generation) { | ||
238 | mlog(ML_ERROR, "file entry generation does not match " | ||
239 | "superblock! osb->fs_generation=%x, " | ||
240 | "fe->i_fs_generation=%x\n", | ||
241 | osb->fs_generation, le32_to_cpu(fe->i_fs_generation)); | ||
242 | goto bail; | ||
243 | } | ||
244 | |||
245 | inode->i_version = 1; | ||
246 | inode->i_generation = le32_to_cpu(fe->i_generation); | ||
247 | inode->i_rdev = huge_decode_dev(le64_to_cpu(fe->id1.dev1.i_rdev)); | ||
248 | inode->i_mode = le16_to_cpu(fe->i_mode); | ||
249 | inode->i_uid = le32_to_cpu(fe->i_uid); | ||
250 | inode->i_gid = le32_to_cpu(fe->i_gid); | ||
251 | inode->i_blksize = (u32)osb->s_clustersize; | ||
252 | |||
253 | /* Fast symlinks will have i_size but no allocated clusters. */ | ||
254 | if (S_ISLNK(inode->i_mode) && !fe->i_clusters) | ||
255 | inode->i_blocks = 0; | ||
256 | else | ||
257 | inode->i_blocks = | ||
258 | ocfs2_align_bytes_to_sectors(le64_to_cpu(fe->i_size)); | ||
259 | inode->i_mapping->a_ops = &ocfs2_aops; | ||
260 | inode->i_flags |= S_NOATIME; | ||
261 | inode->i_atime.tv_sec = le64_to_cpu(fe->i_atime); | ||
262 | inode->i_atime.tv_nsec = le32_to_cpu(fe->i_atime_nsec); | ||
263 | inode->i_mtime.tv_sec = le64_to_cpu(fe->i_mtime); | ||
264 | inode->i_mtime.tv_nsec = le32_to_cpu(fe->i_mtime_nsec); | ||
265 | inode->i_ctime.tv_sec = le64_to_cpu(fe->i_ctime); | ||
266 | inode->i_ctime.tv_nsec = le32_to_cpu(fe->i_ctime_nsec); | ||
267 | |||
268 | if (OCFS2_I(inode)->ip_blkno != le64_to_cpu(fe->i_blkno)) | ||
269 | mlog(ML_ERROR, | ||
270 | "ip_blkno %"MLFu64" != i_blkno %"MLFu64"!\n", | ||
271 | OCFS2_I(inode)->ip_blkno, fe->i_blkno); | ||
272 | |||
273 | OCFS2_I(inode)->ip_clusters = le32_to_cpu(fe->i_clusters); | ||
274 | OCFS2_I(inode)->ip_orphaned_slot = OCFS2_INVALID_SLOT; | ||
275 | |||
276 | if (create_ino) | ||
277 | inode->i_ino = ino_from_blkno(inode->i_sb, | ||
278 | le64_to_cpu(fe->i_blkno)); | ||
279 | |||
280 | mlog(0, "blkno = %"MLFu64", ino = %lu, create_ino = %s\n", | ||
281 | fe->i_blkno, inode->i_ino, create_ino ? "true" : "false"); | ||
282 | |||
283 | inode->i_nlink = le16_to_cpu(fe->i_links_count); | ||
284 | |||
285 | if (fe->i_flags & cpu_to_le32(OCFS2_LOCAL_ALLOC_FL)) { | ||
286 | OCFS2_I(inode)->ip_flags |= OCFS2_INODE_BITMAP; | ||
287 | mlog(0, "local alloc inode: i_ino=%lu\n", inode->i_ino); | ||
288 | } else if (fe->i_flags & cpu_to_le32(OCFS2_BITMAP_FL)) { | ||
289 | OCFS2_I(inode)->ip_flags |= OCFS2_INODE_BITMAP; | ||
290 | } else if (fe->i_flags & cpu_to_le32(OCFS2_SUPER_BLOCK_FL)) { | ||
291 | mlog(0, "superblock inode: i_ino=%lu\n", inode->i_ino); | ||
292 | /* we can't actually hit this as read_inode can't | ||
293 | * handle superblocks today ;-) */ | ||
294 | BUG(); | ||
295 | } | ||
296 | |||
297 | switch (inode->i_mode & S_IFMT) { | ||
298 | case S_IFREG: | ||
299 | inode->i_fop = &ocfs2_fops; | ||
300 | inode->i_op = &ocfs2_file_iops; | ||
301 | i_size_write(inode, le64_to_cpu(fe->i_size)); | ||
302 | break; | ||
303 | case S_IFDIR: | ||
304 | inode->i_op = &ocfs2_dir_iops; | ||
305 | inode->i_fop = &ocfs2_dops; | ||
306 | i_size_write(inode, le64_to_cpu(fe->i_size)); | ||
307 | break; | ||
308 | case S_IFLNK: | ||
309 | if (ocfs2_inode_is_fast_symlink(inode)) | ||
310 | inode->i_op = &ocfs2_fast_symlink_inode_operations; | ||
311 | else | ||
312 | inode->i_op = &ocfs2_symlink_inode_operations; | ||
313 | i_size_write(inode, le64_to_cpu(fe->i_size)); | ||
314 | break; | ||
315 | default: | ||
316 | inode->i_op = &ocfs2_special_file_iops; | ||
317 | init_special_inode(inode, inode->i_mode, | ||
318 | inode->i_rdev); | ||
319 | break; | ||
320 | } | ||
321 | |||
322 | ocfs2_inode_lock_res_init(&OCFS2_I(inode)->ip_rw_lockres, | ||
323 | OCFS2_LOCK_TYPE_RW, inode); | ||
324 | ocfs2_inode_lock_res_init(&OCFS2_I(inode)->ip_meta_lockres, | ||
325 | OCFS2_LOCK_TYPE_META, inode); | ||
326 | ocfs2_inode_lock_res_init(&OCFS2_I(inode)->ip_data_lockres, | ||
327 | OCFS2_LOCK_TYPE_DATA, inode); | ||
328 | |||
329 | status = 0; | ||
330 | bail: | ||
331 | mlog_exit(status); | ||
332 | return status; | ||
333 | } | ||
334 | |||
335 | static int ocfs2_read_locked_inode(struct inode *inode, | ||
336 | struct ocfs2_find_inode_args *args) | ||
337 | { | ||
338 | struct super_block *sb; | ||
339 | struct ocfs2_super *osb; | ||
340 | struct ocfs2_dinode *fe; | ||
341 | struct buffer_head *bh = NULL; | ||
342 | int status; | ||
343 | int sysfile = 0; | ||
344 | |||
345 | mlog_entry("(0x%p, 0x%p)\n", inode, args); | ||
346 | |||
347 | status = -EINVAL; | ||
348 | if (inode == NULL || inode->i_sb == NULL) { | ||
349 | mlog(ML_ERROR, "bad inode\n"); | ||
350 | goto bail; | ||
351 | } | ||
352 | sb = inode->i_sb; | ||
353 | osb = OCFS2_SB(sb); | ||
354 | |||
355 | if (!args) { | ||
356 | mlog(ML_ERROR, "bad inode args\n"); | ||
357 | make_bad_inode(inode); | ||
358 | goto bail; | ||
359 | } | ||
360 | |||
361 | /* Read the FE off disk. This is safe because the kernel only | ||
362 | * does one read_inode2 for a new inode, and if it doesn't | ||
363 | * exist yet then nobody can be working on it! */ | ||
364 | status = ocfs2_read_block(osb, args->fi_blkno, &bh, 0, NULL); | ||
365 | if (status < 0) { | ||
366 | mlog_errno(status); | ||
367 | make_bad_inode(inode); | ||
368 | goto bail; | ||
369 | } | ||
370 | |||
371 | fe = (struct ocfs2_dinode *) bh->b_data; | ||
372 | if (!OCFS2_IS_VALID_DINODE(fe)) { | ||
373 | mlog(ML_ERROR, "Invalid dinode #%"MLFu64": signature = %.*s\n", | ||
374 | fe->i_blkno, 7, fe->i_signature); | ||
375 | make_bad_inode(inode); | ||
376 | goto bail; | ||
377 | } | ||
378 | |||
379 | if (fe->i_flags & cpu_to_le32(OCFS2_SYSTEM_FL)) | ||
380 | sysfile = 1; | ||
381 | |||
382 | if (S_ISCHR(le16_to_cpu(fe->i_mode)) || | ||
383 | S_ISBLK(le16_to_cpu(fe->i_mode))) | ||
384 | inode->i_rdev = huge_decode_dev(le64_to_cpu(fe->id1.dev1.i_rdev)); | ||
385 | |||
386 | status = -EINVAL; | ||
387 | if (ocfs2_populate_inode(inode, fe, 0) < 0) { | ||
388 | mlog(ML_ERROR, "populate inode failed! i_blkno=%"MLFu64", " | ||
389 | "i_ino=%lu\n", fe->i_blkno, inode->i_ino); | ||
390 | make_bad_inode(inode); | ||
391 | goto bail; | ||
392 | } | ||
393 | |||
394 | BUG_ON(args->fi_blkno != le64_to_cpu(fe->i_blkno)); | ||
395 | |||
396 | if (sysfile) | ||
397 | OCFS2_I(inode)->ip_flags |= OCFS2_INODE_SYSTEM_FILE; | ||
398 | |||
399 | status = 0; | ||
400 | |||
401 | bail: | ||
402 | if (args && bh) | ||
403 | brelse(bh); | ||
404 | |||
405 | mlog_exit(status); | ||
406 | return status; | ||
407 | } | ||
408 | |||
409 | void ocfs2_sync_blockdev(struct super_block *sb) | ||
410 | { | ||
411 | sync_blockdev(sb->s_bdev); | ||
412 | } | ||
413 | |||
414 | static int ocfs2_truncate_for_delete(struct ocfs2_super *osb, | ||
415 | struct inode *inode, | ||
416 | struct buffer_head *fe_bh) | ||
417 | { | ||
418 | int status = 0; | ||
419 | struct ocfs2_journal_handle *handle = NULL; | ||
420 | struct ocfs2_truncate_context *tc = NULL; | ||
421 | struct ocfs2_dinode *fe; | ||
422 | |||
423 | mlog_entry_void(); | ||
424 | |||
425 | fe = (struct ocfs2_dinode *) fe_bh->b_data; | ||
426 | |||
427 | /* zero allocation, zero truncate :) */ | ||
428 | if (!fe->i_clusters) | ||
429 | goto bail; | ||
430 | |||
431 | handle = ocfs2_start_trans(osb, handle, OCFS2_INODE_UPDATE_CREDITS); | ||
432 | if (IS_ERR(handle)) { | ||
433 | status = PTR_ERR(handle); | ||
434 | handle = NULL; | ||
435 | mlog_errno(status); | ||
436 | goto bail; | ||
437 | } | ||
438 | |||
439 | status = ocfs2_set_inode_size(handle, inode, fe_bh, 0ULL); | ||
440 | if (status < 0) { | ||
441 | mlog_errno(status); | ||
442 | goto bail; | ||
443 | } | ||
444 | |||
445 | ocfs2_commit_trans(handle); | ||
446 | handle = NULL; | ||
447 | |||
448 | status = ocfs2_prepare_truncate(osb, inode, fe_bh, &tc); | ||
449 | if (status < 0) { | ||
450 | mlog_errno(status); | ||
451 | goto bail; | ||
452 | } | ||
453 | |||
454 | status = ocfs2_commit_truncate(osb, inode, fe_bh, tc); | ||
455 | if (status < 0) { | ||
456 | mlog_errno(status); | ||
457 | goto bail; | ||
458 | } | ||
459 | bail: | ||
460 | if (handle) | ||
461 | ocfs2_commit_trans(handle); | ||
462 | |||
463 | mlog_exit(status); | ||
464 | return status; | ||
465 | } | ||
466 | |||
467 | static int ocfs2_remove_inode(struct inode *inode, | ||
468 | struct buffer_head *di_bh, | ||
469 | struct inode *orphan_dir_inode, | ||
470 | struct buffer_head *orphan_dir_bh) | ||
471 | { | ||
472 | int status; | ||
473 | struct inode *inode_alloc_inode = NULL; | ||
474 | struct buffer_head *inode_alloc_bh = NULL; | ||
475 | struct ocfs2_journal_handle *handle; | ||
476 | struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); | ||
477 | struct ocfs2_dinode *di = (struct ocfs2_dinode *) di_bh->b_data; | ||
478 | |||
479 | inode_alloc_inode = | ||
480 | ocfs2_get_system_file_inode(osb, INODE_ALLOC_SYSTEM_INODE, | ||
481 | le16_to_cpu(di->i_suballoc_slot)); | ||
482 | if (!inode_alloc_inode) { | ||
483 | status = -EEXIST; | ||
484 | mlog_errno(status); | ||
485 | goto bail; | ||
486 | } | ||
487 | |||
488 | down(&inode_alloc_inode->i_sem); | ||
489 | status = ocfs2_meta_lock(inode_alloc_inode, NULL, &inode_alloc_bh, 1); | ||
490 | if (status < 0) { | ||
491 | up(&inode_alloc_inode->i_sem); | ||
492 | |||
493 | mlog_errno(status); | ||
494 | goto bail; | ||
495 | } | ||
496 | |||
497 | handle = ocfs2_start_trans(osb, NULL, OCFS2_DELETE_INODE_CREDITS); | ||
498 | if (IS_ERR(handle)) { | ||
499 | status = PTR_ERR(handle); | ||
500 | mlog_errno(status); | ||
501 | goto bail_unlock; | ||
502 | } | ||
503 | |||
504 | status = ocfs2_orphan_del(osb, handle, orphan_dir_inode, inode, | ||
505 | orphan_dir_bh); | ||
506 | if (status < 0) { | ||
507 | mlog_errno(status); | ||
508 | goto bail_commit; | ||
509 | } | ||
510 | |||
511 | /* set the inodes dtime */ | ||
512 | status = ocfs2_journal_access(handle, inode, di_bh, | ||
513 | OCFS2_JOURNAL_ACCESS_WRITE); | ||
514 | if (status < 0) { | ||
515 | mlog_errno(status); | ||
516 | goto bail_commit; | ||
517 | } | ||
518 | |||
519 | di->i_dtime = cpu_to_le64(CURRENT_TIME.tv_sec); | ||
520 | le32_and_cpu(&di->i_flags, ~(OCFS2_VALID_FL | OCFS2_ORPHANED_FL)); | ||
521 | |||
522 | status = ocfs2_journal_dirty(handle, di_bh); | ||
523 | if (status < 0) { | ||
524 | mlog_errno(status); | ||
525 | goto bail_commit; | ||
526 | } | ||
527 | |||
528 | ocfs2_remove_from_cache(inode, di_bh); | ||
529 | |||
530 | status = ocfs2_free_dinode(handle, inode_alloc_inode, | ||
531 | inode_alloc_bh, di); | ||
532 | if (status < 0) | ||
533 | mlog_errno(status); | ||
534 | |||
535 | bail_commit: | ||
536 | ocfs2_commit_trans(handle); | ||
537 | bail_unlock: | ||
538 | ocfs2_meta_unlock(inode_alloc_inode, 1); | ||
539 | up(&inode_alloc_inode->i_sem); | ||
540 | brelse(inode_alloc_bh); | ||
541 | bail: | ||
542 | iput(inode_alloc_inode); | ||
543 | |||
544 | return status; | ||
545 | } | ||
546 | |||
547 | static int ocfs2_wipe_inode(struct inode *inode, | ||
548 | struct buffer_head *di_bh) | ||
549 | { | ||
550 | int status, orphaned_slot; | ||
551 | struct inode *orphan_dir_inode = NULL; | ||
552 | struct buffer_head *orphan_dir_bh = NULL; | ||
553 | struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); | ||
554 | |||
555 | /* We've already voted on this so it should be readonly - no | ||
556 | * spinlock needed. */ | ||
557 | orphaned_slot = OCFS2_I(inode)->ip_orphaned_slot; | ||
558 | orphan_dir_inode = ocfs2_get_system_file_inode(osb, | ||
559 | ORPHAN_DIR_SYSTEM_INODE, | ||
560 | orphaned_slot); | ||
561 | if (!orphan_dir_inode) { | ||
562 | status = -EEXIST; | ||
563 | mlog_errno(status); | ||
564 | goto bail; | ||
565 | } | ||
566 | |||
567 | /* Lock the orphan dir. The lock will be held for the entire | ||
568 | * delete_inode operation. We do this now to avoid races with | ||
569 | * recovery completion on other nodes. */ | ||
570 | down(&orphan_dir_inode->i_sem); | ||
571 | status = ocfs2_meta_lock(orphan_dir_inode, NULL, &orphan_dir_bh, 1); | ||
572 | if (status < 0) { | ||
573 | up(&orphan_dir_inode->i_sem); | ||
574 | |||
575 | mlog_errno(status); | ||
576 | goto bail; | ||
577 | } | ||
578 | |||
579 | /* we do this while holding the orphan dir lock because we | ||
580 | * don't want recovery being run from another node to vote for | ||
581 | * an inode delete on us -- this will result in two nodes | ||
582 | * truncating the same file! */ | ||
583 | status = ocfs2_truncate_for_delete(osb, inode, di_bh); | ||
584 | if (status < 0) { | ||
585 | mlog_errno(status); | ||
586 | goto bail_unlock_dir; | ||
587 | } | ||
588 | |||
589 | status = ocfs2_remove_inode(inode, di_bh, orphan_dir_inode, | ||
590 | orphan_dir_bh); | ||
591 | if (status < 0) | ||
592 | mlog_errno(status); | ||
593 | |||
594 | bail_unlock_dir: | ||
595 | ocfs2_meta_unlock(orphan_dir_inode, 1); | ||
596 | up(&orphan_dir_inode->i_sem); | ||
597 | brelse(orphan_dir_bh); | ||
598 | bail: | ||
599 | iput(orphan_dir_inode); | ||
600 | |||
601 | return status; | ||
602 | } | ||
603 | |||
604 | /* There is a series of simple checks that should be done before a | ||
605 | * vote is even considered. Encapsulate those in this function. */ | ||
606 | static int ocfs2_inode_is_valid_to_delete(struct inode *inode) | ||
607 | { | ||
608 | int ret = 0; | ||
609 | struct ocfs2_inode_info *oi = OCFS2_I(inode); | ||
610 | struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); | ||
611 | |||
612 | /* We shouldn't be getting here for the root directory | ||
613 | * inode.. */ | ||
614 | if (inode == osb->root_inode) { | ||
615 | mlog(ML_ERROR, "Skipping delete of root inode.\n"); | ||
616 | goto bail; | ||
617 | } | ||
618 | |||
619 | /* If we're coming from process_vote we can't go into our own | ||
620 | * voting [hello, deadlock city!], so unforuntately we just | ||
621 | * have to skip deleting this guy. That's OK though because | ||
622 | * the node who's doing the actual deleting should handle it | ||
623 | * anyway. */ | ||
624 | if (current == osb->vote_task) { | ||
625 | mlog(0, "Skipping delete of %lu because we're currently " | ||
626 | "in process_vote\n", inode->i_ino); | ||
627 | goto bail; | ||
628 | } | ||
629 | |||
630 | spin_lock(&oi->ip_lock); | ||
631 | /* OCFS2 *never* deletes system files. This should technically | ||
632 | * never get here as system file inodes should always have a | ||
633 | * positive link count. */ | ||
634 | if (oi->ip_flags & OCFS2_INODE_SYSTEM_FILE) { | ||
635 | mlog(ML_ERROR, "Skipping delete of system file %"MLFu64".\n", | ||
636 | oi->ip_blkno); | ||
637 | goto bail_unlock; | ||
638 | } | ||
639 | |||
640 | /* If we have voted "yes" on the wipe of this inode for | ||
641 | * another node, it will be marked here so we can safely skip | ||
642 | * it. Recovery will cleanup any inodes we might inadvertantly | ||
643 | * skip here. */ | ||
644 | if (oi->ip_flags & OCFS2_INODE_SKIP_DELETE) { | ||
645 | mlog(0, "Skipping delete of %lu because another node " | ||
646 | "has done this for us.\n", inode->i_ino); | ||
647 | goto bail_unlock; | ||
648 | } | ||
649 | |||
650 | ret = 1; | ||
651 | bail_unlock: | ||
652 | spin_unlock(&oi->ip_lock); | ||
653 | bail: | ||
654 | return ret; | ||
655 | } | ||
656 | |||
657 | /* Query the cluster to determine whether we should wipe an inode from | ||
658 | * disk or not. | ||
659 | * | ||
660 | * Requires the inode to have the cluster lock. */ | ||
661 | static int ocfs2_query_inode_wipe(struct inode *inode, | ||
662 | struct buffer_head *di_bh, | ||
663 | int *wipe) | ||
664 | { | ||
665 | int status = 0; | ||
666 | struct ocfs2_inode_info *oi = OCFS2_I(inode); | ||
667 | struct ocfs2_dinode *di; | ||
668 | |||
669 | *wipe = 0; | ||
670 | |||
671 | /* While we were waiting for the cluster lock in | ||
672 | * ocfs2_delete_inode, another node might have asked to delete | ||
673 | * the inode. Recheck our flags to catch this. */ | ||
674 | if (!ocfs2_inode_is_valid_to_delete(inode)) { | ||
675 | mlog(0, "Skipping delete of %"MLFu64" because flags changed\n", | ||
676 | oi->ip_blkno); | ||
677 | goto bail; | ||
678 | } | ||
679 | |||
680 | /* Now that we have an up to date inode, we can double check | ||
681 | * the link count. */ | ||
682 | if (inode->i_nlink) { | ||
683 | mlog(0, "Skipping delete of %"MLFu64" because nlink = %u\n", | ||
684 | oi->ip_blkno, inode->i_nlink); | ||
685 | goto bail; | ||
686 | } | ||
687 | |||
688 | /* Do some basic inode verification... */ | ||
689 | di = (struct ocfs2_dinode *) di_bh->b_data; | ||
690 | if (!(di->i_flags & cpu_to_le32(OCFS2_ORPHANED_FL))) { | ||
691 | /* for lack of a better error? */ | ||
692 | status = -EEXIST; | ||
693 | mlog(ML_ERROR, | ||
694 | "Inode %"MLFu64" (on-disk %"MLFu64") not orphaned! " | ||
695 | "Disk flags 0x%x, inode flags 0x%x\n", | ||
696 | oi->ip_blkno, di->i_blkno, di->i_flags, oi->ip_flags); | ||
697 | goto bail; | ||
698 | } | ||
699 | |||
700 | /* has someone already deleted us?! baaad... */ | ||
701 | if (di->i_dtime) { | ||
702 | status = -EEXIST; | ||
703 | mlog_errno(status); | ||
704 | goto bail; | ||
705 | } | ||
706 | |||
707 | status = ocfs2_request_delete_vote(inode); | ||
708 | /* -EBUSY means that other nodes are still using the | ||
709 | * inode. We're done here though, so avoid doing anything on | ||
710 | * disk and let them worry about deleting it. */ | ||
711 | if (status == -EBUSY) { | ||
712 | status = 0; | ||
713 | mlog(0, "Skipping delete of %"MLFu64" because it is in use on" | ||
714 | "other nodes\n", oi->ip_blkno); | ||
715 | goto bail; | ||
716 | } | ||
717 | if (status < 0) { | ||
718 | mlog_errno(status); | ||
719 | goto bail; | ||
720 | } | ||
721 | |||
722 | spin_lock(&oi->ip_lock); | ||
723 | if (oi->ip_orphaned_slot == OCFS2_INVALID_SLOT) { | ||
724 | /* Nobody knew which slot this inode was orphaned | ||
725 | * into. This may happen during node death and | ||
726 | * recovery knows how to clean it up so we can safely | ||
727 | * ignore this inode for now on. */ | ||
728 | mlog(0, "Nobody knew where inode %"MLFu64" was orphaned!\n", | ||
729 | oi->ip_blkno); | ||
730 | } else { | ||
731 | *wipe = 1; | ||
732 | |||
733 | mlog(0, "Inode %"MLFu64" is ok to wipe from orphan dir %d\n", | ||
734 | oi->ip_blkno, oi->ip_orphaned_slot); | ||
735 | } | ||
736 | spin_unlock(&oi->ip_lock); | ||
737 | |||
738 | bail: | ||
739 | return status; | ||
740 | } | ||
741 | |||
742 | /* Support function for ocfs2_delete_inode. Will help us keep the | ||
743 | * inode data in a consistent state for clear_inode. Always truncates | ||
744 | * pages, optionally sync's them first. */ | ||
745 | static void ocfs2_cleanup_delete_inode(struct inode *inode, | ||
746 | int sync_data) | ||
747 | { | ||
748 | mlog(0, "Cleanup inode %"MLFu64", sync = %d\n", | ||
749 | OCFS2_I(inode)->ip_blkno, sync_data); | ||
750 | if (sync_data) | ||
751 | write_inode_now(inode, 1); | ||
752 | truncate_inode_pages(&inode->i_data, 0); | ||
753 | } | ||
754 | |||
/*
 * Decide whether this node should wipe the inode from disk and, if
 * so, do it. Every path through this function ends by calling
 * clear_inode().
 *
 * Rough sequence:
 *   1. Bail early for bad inodes and for inodes that
 *      ocfs2_inode_is_valid_to_delete() rejects.
 *   2. Block all signals for the duration.
 *   3. Take the meta lock at level 1 and read the on-disk inode
 *      into di_bh.
 *   4. Ask ocfs2_query_inode_wipe() for the final decision.
 *   5. Wipe the inode and flag it OCFS2_INODE_DELETED.
 *
 * The labels at the bottom undo steps 3 and 2 in reverse order.
 */
void ocfs2_delete_inode(struct inode *inode)
{
	int wipe, status;
	sigset_t blocked, oldset;
	struct buffer_head *di_bh = NULL;

	mlog_entry("(inode->i_ino = %lu)\n", inode->i_ino);

	if (is_bad_inode(inode)) {
		mlog(0, "Skipping delete of bad inode\n");
		goto bail;
	}

	if (!ocfs2_inode_is_valid_to_delete(inode)) {
		/* It's probably not necessary to truncate_inode_pages
		 * here but we do it for safety anyway (it will most
		 * likely be a no-op anyway) */
		ocfs2_cleanup_delete_inode(inode, 0);
		goto bail;
	}

	/* We want to block signals in delete_inode as the lock and
	 * messaging paths may return us -ERESTARTSYS. Which would
	 * cause us to exit early, resulting in inodes being orphaned
	 * forever. */
	sigfillset(&blocked);
	status = sigprocmask(SIG_BLOCK, &blocked, &oldset);
	if (status < 0) {
		mlog_errno(status);
		/* The mask was not changed, so skip bail_unblock. */
		ocfs2_cleanup_delete_inode(inode, 1);
		goto bail;
	}

	/* Lock down the inode. This gives us an up to date view of
	 * its metadata (for verification), and allows us to
	 * serialize delete_inode votes.
	 *
	 * Even though we might be doing a truncate, we don't take the
	 * allocation lock here as it won't be needed - nobody will
	 * have the file open.
	 */
	status = ocfs2_meta_lock(inode, NULL, &di_bh, 1);
	if (status < 0) {
		/* -ENOENT is not logged as an error here. */
		if (status != -ENOENT)
			mlog_errno(status);
		ocfs2_cleanup_delete_inode(inode, 0);
		goto bail_unblock;
	}

	/* Query the cluster. This will be the final decision made
	 * before we go ahead and wipe the inode. */
	status = ocfs2_query_inode_wipe(inode, di_bh, &wipe);
	if (!wipe || status < 0) {
		/* Error and inode busy vote both mean we won't be
		 * removing the inode, so they take almost the same
		 * path. */
		if (status < 0)
			mlog_errno(status);

		/* Someone in the cluster has voted to not wipe this
		 * inode, or it was never completely orphaned. Write
		 * out the pages and exit now. */
		ocfs2_cleanup_delete_inode(inode, 1);
		goto bail_unlock_inode;
	}

	/* We are going to wipe: drop the pages without syncing them. */
	ocfs2_cleanup_delete_inode(inode, 0);

	status = ocfs2_wipe_inode(inode, di_bh);
	if (status < 0) {
		mlog_errno(status);
		goto bail_unlock_inode;
	}

	/* Mark the inode as successfully deleted. This is important
	 * for ocfs2_clear_inode as it will check this flag and skip
	 * any checkpointing work */
	OCFS2_I(inode)->ip_flags |= OCFS2_INODE_DELETED;

bail_unlock_inode:
	ocfs2_meta_unlock(inode, 1);
	brelse(di_bh);
bail_unblock:
	/* Restore the signal mask saved before blocking. */
	status = sigprocmask(SIG_SETMASK, &oldset, NULL);
	if (status < 0)
		mlog_errno(status);
bail:
	clear_inode(inode);
	mlog_exit_void();
}
845 | |||
846 | void ocfs2_clear_inode(struct inode *inode) | ||
847 | { | ||
848 | int status; | ||
849 | struct ocfs2_inode_info *oi = OCFS2_I(inode); | ||
850 | |||
851 | mlog_entry_void(); | ||
852 | |||
853 | if (!inode) | ||
854 | goto bail; | ||
855 | |||
856 | mlog(0, "Clearing inode: %"MLFu64", nlink = %u\n", | ||
857 | OCFS2_I(inode)->ip_blkno, inode->i_nlink); | ||
858 | |||
859 | mlog_bug_on_msg(OCFS2_SB(inode->i_sb) == NULL, | ||
860 | "Inode=%lu\n", inode->i_ino); | ||
861 | |||
862 | /* Do these before all the other work so that we don't bounce | ||
863 | * the vote thread while waiting to destroy the locks. */ | ||
864 | ocfs2_mark_lockres_freeing(&oi->ip_rw_lockres); | ||
865 | ocfs2_mark_lockres_freeing(&oi->ip_meta_lockres); | ||
866 | ocfs2_mark_lockres_freeing(&oi->ip_data_lockres); | ||
867 | |||
868 | /* We very well may get a clear_inode before all an inodes | ||
869 | * metadata has hit disk. Of course, we can't drop any cluster | ||
870 | * locks until the journal has finished with it. The only | ||
871 | * exception here are successfully wiped inodes - their | ||
872 | * metadata can now be considered to be part of the system | ||
873 | * inodes from which it came. */ | ||
874 | if (!(OCFS2_I(inode)->ip_flags & OCFS2_INODE_DELETED)) | ||
875 | ocfs2_checkpoint_inode(inode); | ||
876 | |||
877 | mlog_bug_on_msg(!list_empty(&oi->ip_io_markers), | ||
878 | "Clear inode of %"MLFu64", inode has io markers\n", | ||
879 | oi->ip_blkno); | ||
880 | |||
881 | ocfs2_extent_map_drop(inode, 0); | ||
882 | ocfs2_extent_map_init(inode); | ||
883 | |||
884 | status = ocfs2_drop_inode_locks(inode); | ||
885 | if (status < 0) | ||
886 | mlog_errno(status); | ||
887 | |||
888 | ocfs2_lock_res_free(&oi->ip_rw_lockres); | ||
889 | ocfs2_lock_res_free(&oi->ip_meta_lockres); | ||
890 | ocfs2_lock_res_free(&oi->ip_data_lockres); | ||
891 | |||
892 | ocfs2_metadata_cache_purge(inode); | ||
893 | |||
894 | mlog_bug_on_msg(oi->ip_metadata_cache.ci_num_cached, | ||
895 | "Clear inode of %"MLFu64", inode has %u cache items\n", | ||
896 | oi->ip_blkno, oi->ip_metadata_cache.ci_num_cached); | ||
897 | |||
898 | mlog_bug_on_msg(!(oi->ip_flags & OCFS2_INODE_CACHE_INLINE), | ||
899 | "Clear inode of %"MLFu64", inode has a bad flag\n", | ||
900 | oi->ip_blkno); | ||
901 | |||
902 | mlog_bug_on_msg(spin_is_locked(&oi->ip_lock), | ||
903 | "Clear inode of %"MLFu64", inode is locked\n", | ||
904 | oi->ip_blkno); | ||
905 | |||
906 | mlog_bug_on_msg(down_trylock(&oi->ip_io_sem), | ||
907 | "Clear inode of %"MLFu64", io_sem is locked\n", | ||
908 | oi->ip_blkno); | ||
909 | up(&oi->ip_io_sem); | ||
910 | |||
911 | /* | ||
912 | * down_trylock() returns 0, down_write_trylock() returns 1 | ||
913 | * kernel 1, world 0 | ||
914 | */ | ||
915 | mlog_bug_on_msg(!down_write_trylock(&oi->ip_alloc_sem), | ||
916 | "Clear inode of %"MLFu64", alloc_sem is locked\n", | ||
917 | oi->ip_blkno); | ||
918 | up_write(&oi->ip_alloc_sem); | ||
919 | |||
920 | mlog_bug_on_msg(oi->ip_open_count, | ||
921 | "Clear inode of %"MLFu64" has open count %d\n", | ||
922 | oi->ip_blkno, oi->ip_open_count); | ||
923 | mlog_bug_on_msg(!list_empty(&oi->ip_handle_list), | ||
924 | "Clear inode of %"MLFu64" has non empty handle list\n", | ||
925 | oi->ip_blkno); | ||
926 | mlog_bug_on_msg(oi->ip_handle, | ||
927 | "Clear inode of %"MLFu64" has non empty handle pointer\n", | ||
928 | oi->ip_blkno); | ||
929 | |||
930 | /* Clear all other flags. */ | ||
931 | oi->ip_flags = OCFS2_INODE_CACHE_INLINE; | ||
932 | oi->ip_created_trans = 0; | ||
933 | oi->ip_last_trans = 0; | ||
934 | oi->ip_dir_start_lookup = 0; | ||
935 | oi->ip_blkno = 0ULL; | ||
936 | |||
937 | bail: | ||
938 | mlog_exit_void(); | ||
939 | } | ||
940 | |||
941 | /* Called under inode_lock, with no more references on the | ||
942 | * struct inode, so it's safe here to check the flags field | ||
943 | * and to manipulate i_nlink without any other locks. */ | ||
944 | void ocfs2_drop_inode(struct inode *inode) | ||
945 | { | ||
946 | struct ocfs2_inode_info *oi = OCFS2_I(inode); | ||
947 | |||
948 | mlog_entry_void(); | ||
949 | |||
950 | mlog(0, "Drop inode %"MLFu64", nlink = %u, ip_flags = 0x%x\n", | ||
951 | oi->ip_blkno, inode->i_nlink, oi->ip_flags); | ||
952 | |||
953 | /* Testing ip_orphaned_slot here wouldn't work because we may | ||
954 | * not have gotten a delete_inode vote from any other nodes | ||
955 | * yet. */ | ||
956 | if (oi->ip_flags & OCFS2_INODE_MAYBE_ORPHANED) { | ||
957 | mlog(0, "Inode was orphaned on another node, clearing nlink.\n"); | ||
958 | inode->i_nlink = 0; | ||
959 | } | ||
960 | |||
961 | generic_drop_inode(inode); | ||
962 | |||
963 | mlog_exit_void(); | ||
964 | } | ||
965 | |||
966 | /* | ||
967 | * TODO: this should probably be merged into ocfs2_get_block | ||
968 | * | ||
969 | * However, you now need to pay attention to the cont_prepare_write() | ||
970 | * stuff in ocfs2_get_block (that is, ocfs2_get_block pretty much | ||
971 | * expects never to extend). | ||
972 | */ | ||
973 | struct buffer_head *ocfs2_bread(struct inode *inode, | ||
974 | int block, int *err, int reada) | ||
975 | { | ||
976 | struct buffer_head *bh = NULL; | ||
977 | int tmperr; | ||
978 | u64 p_blkno; | ||
979 | int readflags = OCFS2_BH_CACHED; | ||
980 | |||
981 | #if 0 | ||
982 | /* only turn this on if we know we can deal with read_block | ||
983 | * returning nothing */ | ||
984 | if (reada) | ||
985 | readflags |= OCFS2_BH_READAHEAD; | ||
986 | #endif | ||
987 | |||
988 | if (((u64)block << inode->i_sb->s_blocksize_bits) >= | ||
989 | i_size_read(inode)) { | ||
990 | BUG_ON(!reada); | ||
991 | return NULL; | ||
992 | } | ||
993 | |||
994 | tmperr = ocfs2_extent_map_get_blocks(inode, block, 1, | ||
995 | &p_blkno, NULL); | ||
996 | if (tmperr < 0) { | ||
997 | mlog_errno(tmperr); | ||
998 | goto fail; | ||
999 | } | ||
1000 | |||
1001 | tmperr = ocfs2_read_block(OCFS2_SB(inode->i_sb), p_blkno, &bh, | ||
1002 | readflags, inode); | ||
1003 | if (tmperr < 0) | ||
1004 | goto fail; | ||
1005 | |||
1006 | tmperr = 0; | ||
1007 | |||
1008 | *err = 0; | ||
1009 | return bh; | ||
1010 | |||
1011 | fail: | ||
1012 | if (bh) { | ||
1013 | brelse(bh); | ||
1014 | bh = NULL; | ||
1015 | } | ||
1016 | *err = -EIO; | ||
1017 | return NULL; | ||
1018 | } | ||
1019 | |||
1020 | /* | ||
1021 | * This is called from our getattr. | ||
1022 | */ | ||
1023 | int ocfs2_inode_revalidate(struct dentry *dentry) | ||
1024 | { | ||
1025 | struct inode *inode = dentry->d_inode; | ||
1026 | int status = 0; | ||
1027 | |||
1028 | mlog_entry("(inode = 0x%p, ino = %"MLFu64")\n", inode, | ||
1029 | inode ? OCFS2_I(inode)->ip_blkno : 0ULL); | ||
1030 | |||
1031 | if (!inode) { | ||
1032 | mlog(0, "eep, no inode!\n"); | ||
1033 | status = -ENOENT; | ||
1034 | goto bail; | ||
1035 | } | ||
1036 | |||
1037 | spin_lock(&OCFS2_I(inode)->ip_lock); | ||
1038 | if (OCFS2_I(inode)->ip_flags & OCFS2_INODE_DELETED) { | ||
1039 | spin_unlock(&OCFS2_I(inode)->ip_lock); | ||
1040 | mlog(0, "inode deleted!\n"); | ||
1041 | status = -ENOENT; | ||
1042 | goto bail; | ||
1043 | } | ||
1044 | spin_unlock(&OCFS2_I(inode)->ip_lock); | ||
1045 | |||
1046 | /* Let ocfs2_meta_lock do the work of updating our struct | ||
1047 | * inode for us. */ | ||
1048 | status = ocfs2_meta_lock(inode, NULL, NULL, 0); | ||
1049 | if (status < 0) { | ||
1050 | if (status != -ENOENT) | ||
1051 | mlog_errno(status); | ||
1052 | goto bail; | ||
1053 | } | ||
1054 | ocfs2_meta_unlock(inode, 0); | ||
1055 | bail: | ||
1056 | mlog_exit(status); | ||
1057 | |||
1058 | return status; | ||
1059 | } | ||
1060 | |||
1061 | /* | ||
1062 | * Updates a disk inode from a | ||
1063 | * struct inode. | ||
1064 | * Only takes ip_lock. | ||
1065 | */ | ||
1066 | int ocfs2_mark_inode_dirty(struct ocfs2_journal_handle *handle, | ||
1067 | struct inode *inode, | ||
1068 | struct buffer_head *bh) | ||
1069 | { | ||
1070 | int status; | ||
1071 | struct ocfs2_dinode *fe = (struct ocfs2_dinode *) bh->b_data; | ||
1072 | |||
1073 | mlog_entry("(inode %"MLFu64")\n", OCFS2_I(inode)->ip_blkno); | ||
1074 | |||
1075 | status = ocfs2_journal_access(handle, inode, bh, | ||
1076 | OCFS2_JOURNAL_ACCESS_WRITE); | ||
1077 | if (status < 0) { | ||
1078 | mlog_errno(status); | ||
1079 | goto leave; | ||
1080 | } | ||
1081 | |||
1082 | spin_lock(&OCFS2_I(inode)->ip_lock); | ||
1083 | fe->i_clusters = cpu_to_le32(OCFS2_I(inode)->ip_clusters); | ||
1084 | spin_unlock(&OCFS2_I(inode)->ip_lock); | ||
1085 | |||
1086 | fe->i_size = cpu_to_le64(i_size_read(inode)); | ||
1087 | fe->i_links_count = cpu_to_le16(inode->i_nlink); | ||
1088 | fe->i_uid = cpu_to_le32(inode->i_uid); | ||
1089 | fe->i_gid = cpu_to_le32(inode->i_gid); | ||
1090 | fe->i_mode = cpu_to_le16(inode->i_mode); | ||
1091 | fe->i_atime = cpu_to_le64(inode->i_atime.tv_sec); | ||
1092 | fe->i_atime_nsec = cpu_to_le32(inode->i_atime.tv_nsec); | ||
1093 | fe->i_ctime = cpu_to_le64(inode->i_ctime.tv_sec); | ||
1094 | fe->i_ctime_nsec = cpu_to_le32(inode->i_ctime.tv_nsec); | ||
1095 | fe->i_mtime = cpu_to_le64(inode->i_mtime.tv_sec); | ||
1096 | fe->i_mtime_nsec = cpu_to_le32(inode->i_mtime.tv_nsec); | ||
1097 | |||
1098 | status = ocfs2_journal_dirty(handle, bh); | ||
1099 | if (status < 0) | ||
1100 | mlog_errno(status); | ||
1101 | |||
1102 | status = 0; | ||
1103 | leave: | ||
1104 | |||
1105 | mlog_exit(status); | ||
1106 | return status; | ||
1107 | } | ||
1108 | |||
1109 | /* | ||
1110 | * | ||
1111 | * Updates a struct inode from a disk inode. | ||
1112 | * does no i/o, only takes ip_lock. | ||
1113 | */ | ||
1114 | void ocfs2_refresh_inode(struct inode *inode, | ||
1115 | struct ocfs2_dinode *fe) | ||
1116 | { | ||
1117 | struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); | ||
1118 | |||
1119 | spin_lock(&OCFS2_I(inode)->ip_lock); | ||
1120 | |||
1121 | OCFS2_I(inode)->ip_clusters = le32_to_cpu(fe->i_clusters); | ||
1122 | i_size_write(inode, le64_to_cpu(fe->i_size)); | ||
1123 | inode->i_nlink = le16_to_cpu(fe->i_links_count); | ||
1124 | inode->i_uid = le32_to_cpu(fe->i_uid); | ||
1125 | inode->i_gid = le32_to_cpu(fe->i_gid); | ||
1126 | inode->i_mode = le16_to_cpu(fe->i_mode); | ||
1127 | inode->i_blksize = (u32) osb->s_clustersize; | ||
1128 | if (S_ISLNK(inode->i_mode) && le32_to_cpu(fe->i_clusters) == 0) | ||
1129 | inode->i_blocks = 0; | ||
1130 | else | ||
1131 | inode->i_blocks = ocfs2_align_bytes_to_sectors(i_size_read(inode)); | ||
1132 | inode->i_atime.tv_sec = le64_to_cpu(fe->i_atime); | ||
1133 | inode->i_atime.tv_nsec = le32_to_cpu(fe->i_atime_nsec); | ||
1134 | inode->i_mtime.tv_sec = le64_to_cpu(fe->i_mtime); | ||
1135 | inode->i_mtime.tv_nsec = le32_to_cpu(fe->i_mtime_nsec); | ||
1136 | inode->i_ctime.tv_sec = le64_to_cpu(fe->i_ctime); | ||
1137 | inode->i_ctime.tv_nsec = le32_to_cpu(fe->i_ctime_nsec); | ||
1138 | |||
1139 | spin_unlock(&OCFS2_I(inode)->ip_lock); | ||
1140 | } | ||
diff --git a/fs/ocfs2/inode.h b/fs/ocfs2/inode.h new file mode 100644 index 000000000000..9b0177433653 --- /dev/null +++ b/fs/ocfs2/inode.h | |||
@@ -0,0 +1,145 @@ | |||
1 | /* -*- mode: c; c-basic-offset: 8; -*- | ||
2 | * vim: noexpandtab sw=8 ts=8 sts=0: | ||
3 | * | ||
4 | * inode.h | ||
5 | * | ||
6 | * Function prototypes | ||
7 | * | ||
8 | * Copyright (C) 2002, 2004 Oracle. All rights reserved. | ||
9 | * | ||
10 | * This program is free software; you can redistribute it and/or | ||
11 | * modify it under the terms of the GNU General Public | ||
12 | * License as published by the Free Software Foundation; either | ||
13 | * version 2 of the License, or (at your option) any later version. | ||
14 | * | ||
15 | * This program is distributed in the hope that it will be useful, | ||
16 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
17 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | ||
18 | * General Public License for more details. | ||
19 | * | ||
20 | * You should have received a copy of the GNU General Public | ||
21 | * License along with this program; if not, write to the | ||
22 | * Free Software Foundation, Inc., 59 Temple Place - Suite 330, | ||
23 | * Boston, MA 02111-1307, USA. | ||
24 | */ | ||
25 | |||
26 | #ifndef OCFS2_INODE_H | ||
27 | #define OCFS2_INODE_H | ||
28 | |||
/*
 * OCFS2 Inode Private Data
 *
 * The VFS inode is embedded as the vfs_inode member; OCFS2_I() maps a
 * struct inode pointer back to this structure via container_of().
 */
struct ocfs2_inode_info
{
	u64			ip_blkno;

	/* One cluster lock resource each for rw, metadata and data. */
	struct ocfs2_lock_res		ip_rw_lockres;
	struct ocfs2_lock_res		ip_meta_lockres;
	struct ocfs2_lock_res		ip_data_lockres;

	/* protects allocation changes on this inode. */
	struct rw_semaphore		ip_alloc_sem;

	/* These fields are protected by ip_lock */
	spinlock_t			ip_lock;
	u32				ip_open_count;
	u32				ip_clusters;
	struct ocfs2_extent_map		ip_map;
	struct list_head		ip_io_markers;
	int				ip_orphaned_slot;

	struct semaphore		ip_io_sem;

	/* Used by the journalling code to attach an inode to a
	 * handle.  These are protected by ip_io_sem in order to lock
	 * out other I/O to the inode until we either commit or
	 * abort. */
	struct list_head		ip_handle_list;
	struct ocfs2_journal_handle	*ip_handle;

	u32				ip_flags; /* OCFS2_INODE_* bits, see below */

	/* protected by recovery_lock. */
	struct inode			*ip_next_orphan;

	u32				ip_dir_start_lookup;

	/* next two are protected by trans_inc_lock */
	/* which transaction were we created on? Zero if none. */
	unsigned long			ip_created_trans;
	/* last transaction we were a part of. */
	unsigned long			ip_last_trans;

	struct ocfs2_caching_info	ip_metadata_cache;

	/* Embedded VFS inode; see OCFS2_I(). */
	struct inode			vfs_inode;
};
75 | |||
/*
 * Flags for the ip_flags field. Each value is a distinct single bit,
 * so flags can be OR-ed together.
 */
/* System file inodes */
#define OCFS2_INODE_SYSTEM_FILE		0x00000001
#define OCFS2_INODE_JOURNAL		0x00000002
#define OCFS2_INODE_BITMAP		0x00000004
/* This inode has been wiped from disk */
#define OCFS2_INODE_DELETED		0x00000008
/* Another node is deleting, so our delete is a nop */
#define OCFS2_INODE_SKIP_DELETE		0x00000010
/* Has the inode been orphaned on another node?
 *
 * This hints to ocfs2_drop_inode that it should clear i_nlink before
 * continuing.
 *
 * We *only* set this on unlink vote from another node. If the inode
 * was locally orphaned, then we're sure of the state and don't need
 * to twiddle i_nlink later - it's either zero or not depending on
 * whether our unlink succeeded. Otherwise we got this from a node
 * whose intention was to orphan the inode; however, that node may
 * have crashed, failed etc, so we let ocfs2_drop_inode zero the value
 * and rely on ocfs2_delete_inode to sort things out under the proper
 * cluster locks.
 */
#define OCFS2_INODE_MAYBE_ORPHANED	0x00000020
/* Does someone have the file open O_DIRECT */
#define OCFS2_INODE_OPEN_DIRECT		0x00000040
/* Indicates that the metadata cache should be used as an array. */
#define OCFS2_INODE_CACHE_INLINE	0x00000080
106 | |||
107 | static inline struct ocfs2_inode_info *OCFS2_I(struct inode *inode) | ||
108 | { | ||
109 | return container_of(inode, struct ocfs2_inode_info, vfs_inode); | ||
110 | } | ||
111 | |||
/* Test / set the OCFS2_INODE_JOURNAL bit in an inode's ip_flags. */
#define INODE_JOURNAL(i) (OCFS2_I(i)->ip_flags & OCFS2_INODE_JOURNAL)
#define SET_INODE_JOURNAL(i) (OCFS2_I(i)->ip_flags |= OCFS2_INODE_JOURNAL)

extern kmem_cache_t *ocfs2_inode_cache;

extern struct address_space_operations ocfs2_aops;

/* Function prototypes */
struct buffer_head *ocfs2_bread(struct inode *inode, int block,
				int *err, int reada);
void ocfs2_clear_inode(struct inode *inode);
void ocfs2_delete_inode(struct inode *inode);
void ocfs2_drop_inode(struct inode *inode);
struct inode *ocfs2_iget(struct ocfs2_super *osb, u64 feoff);
struct inode *ocfs2_ilookup_for_vote(struct ocfs2_super *osb,
				     u64 blkno,
				     int delete_vote);
int ocfs2_inode_init_private(struct inode *inode);
int ocfs2_inode_revalidate(struct dentry *dentry);
int ocfs2_populate_inode(struct inode *inode, struct ocfs2_dinode *fe,
			 int create_ino);
void ocfs2_read_inode(struct inode *inode);
void ocfs2_read_inode2(struct inode *inode, void *opaque);
ssize_t ocfs2_rw_direct(int rw, struct file *filp, char *buf,
			size_t size, loff_t *offp);
void ocfs2_sync_blockdev(struct super_block *sb);
/* Copy an on-disk inode into a struct inode. */
void ocfs2_refresh_inode(struct inode *inode,
			 struct ocfs2_dinode *fe);
/* Copy a struct inode into its on-disk inode and journal the buffer. */
int ocfs2_mark_inode_dirty(struct ocfs2_journal_handle *handle,
			   struct inode *inode,
			   struct buffer_head *bh);
int ocfs2_aio_read(struct file *file, struct kiocb *req, struct iocb *iocb);
int ocfs2_aio_write(struct file *file, struct kiocb *req, struct iocb *iocb);
144 | |||
145 | #endif /* OCFS2_INODE_H */ | ||
diff --git a/fs/ocfs2/journal.c b/fs/ocfs2/journal.c new file mode 100644 index 000000000000..04428042e5e5 --- /dev/null +++ b/fs/ocfs2/journal.c | |||
@@ -0,0 +1,1652 @@ | |||
1 | /* -*- mode: c; c-basic-offset: 8; -*- | ||
2 | * vim: noexpandtab sw=8 ts=8 sts=0: | ||
3 | * | ||
4 | * journal.c | ||
5 | * | ||
6 | * Defines functions of journalling api | ||
7 | * | ||
8 | * Copyright (C) 2003, 2004 Oracle. All rights reserved. | ||
9 | * | ||
10 | * This program is free software; you can redistribute it and/or | ||
11 | * modify it under the terms of the GNU General Public | ||
12 | * License as published by the Free Software Foundation; either | ||
13 | * version 2 of the License, or (at your option) any later version. | ||
14 | * | ||
15 | * This program is distributed in the hope that it will be useful, | ||
16 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
17 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | ||
18 | * General Public License for more details. | ||
19 | * | ||
20 | * You should have received a copy of the GNU General Public | ||
21 | * License along with this program; if not, write to the | ||
22 | * Free Software Foundation, Inc., 59 Temple Place - Suite 330, | ||
23 | * Boston, MA 02111-1307, USA. | ||
24 | */ | ||
25 | |||
26 | #include <linux/fs.h> | ||
27 | #include <linux/types.h> | ||
28 | #include <linux/slab.h> | ||
29 | #include <linux/highmem.h> | ||
30 | #include <linux/kthread.h> | ||
31 | |||
32 | #define MLOG_MASK_PREFIX ML_JOURNAL | ||
33 | #include <cluster/masklog.h> | ||
34 | |||
35 | #include "ocfs2.h" | ||
36 | |||
37 | #include "alloc.h" | ||
38 | #include "dlmglue.h" | ||
39 | #include "extent_map.h" | ||
40 | #include "heartbeat.h" | ||
41 | #include "inode.h" | ||
42 | #include "journal.h" | ||
43 | #include "localalloc.h" | ||
44 | #include "namei.h" | ||
45 | #include "slot_map.h" | ||
46 | #include "super.h" | ||
47 | #include "vote.h" | ||
48 | #include "sysfile.h" | ||
49 | |||
50 | #include "buffer_head_io.h" | ||
51 | |||
/* NOTE(review): presumably serializes transaction id updates (the
 * ip_created_trans / ip_last_trans fields are documented as protected
 * by it) - confirm against journal.h. */
spinlock_t trans_inc_lock = SPIN_LOCK_UNLOCKED;

/* Forward declarations of file-local helpers. */
static int ocfs2_force_read_journal(struct inode *inode);
static int ocfs2_recover_node(struct ocfs2_super *osb,
			      int node_num);
static int __ocfs2_recovery_thread(void *arg);
static int ocfs2_commit_cache(struct ocfs2_super *osb);
static int ocfs2_wait_on_mount(struct ocfs2_super *osb);
static void ocfs2_handle_cleanup_locks(struct ocfs2_journal *journal,
				       struct ocfs2_journal_handle *handle);
static void ocfs2_commit_unstarted_handle(struct ocfs2_journal_handle *handle);
static int ocfs2_journal_toggle_dirty(struct ocfs2_super *osb,
				      int dirty);
static int ocfs2_trylock_journal(struct ocfs2_super *osb,
				 int slot_num);
static int ocfs2_recover_orphans(struct ocfs2_super *osb,
				 int slot);
static int ocfs2_commit_thread(void *arg);
70 | |||
71 | static int ocfs2_commit_cache(struct ocfs2_super *osb) | ||
72 | { | ||
73 | int status = 0; | ||
74 | unsigned int flushed; | ||
75 | unsigned long old_id; | ||
76 | struct ocfs2_journal *journal = NULL; | ||
77 | |||
78 | mlog_entry_void(); | ||
79 | |||
80 | journal = osb->journal; | ||
81 | |||
82 | /* Flush all pending commits and checkpoint the journal. */ | ||
83 | down_write(&journal->j_trans_barrier); | ||
84 | |||
85 | if (atomic_read(&journal->j_num_trans) == 0) { | ||
86 | up_write(&journal->j_trans_barrier); | ||
87 | mlog(0, "No transactions for me to flush!\n"); | ||
88 | goto finally; | ||
89 | } | ||
90 | |||
91 | journal_lock_updates(journal->j_journal); | ||
92 | status = journal_flush(journal->j_journal); | ||
93 | journal_unlock_updates(journal->j_journal); | ||
94 | if (status < 0) { | ||
95 | up_write(&journal->j_trans_barrier); | ||
96 | mlog_errno(status); | ||
97 | goto finally; | ||
98 | } | ||
99 | |||
100 | old_id = ocfs2_inc_trans_id(journal); | ||
101 | |||
102 | flushed = atomic_read(&journal->j_num_trans); | ||
103 | atomic_set(&journal->j_num_trans, 0); | ||
104 | up_write(&journal->j_trans_barrier); | ||
105 | |||
106 | mlog(0, "commit_thread: flushed transaction %lu (%u handles)\n", | ||
107 | journal->j_trans_id, flushed); | ||
108 | |||
109 | ocfs2_kick_vote_thread(osb); | ||
110 | wake_up(&journal->j_checkpointed); | ||
111 | finally: | ||
112 | mlog_exit(status); | ||
113 | return status; | ||
114 | } | ||
115 | |||
116 | struct ocfs2_journal_handle *ocfs2_alloc_handle(struct ocfs2_super *osb) | ||
117 | { | ||
118 | struct ocfs2_journal_handle *retval = NULL; | ||
119 | |||
120 | retval = kcalloc(1, sizeof(*retval), GFP_KERNEL); | ||
121 | if (!retval) { | ||
122 | mlog(ML_ERROR, "Failed to allocate memory for journal " | ||
123 | "handle!\n"); | ||
124 | return NULL; | ||
125 | } | ||
126 | |||
127 | retval->max_buffs = 0; | ||
128 | retval->num_locks = 0; | ||
129 | retval->k_handle = NULL; | ||
130 | |||
131 | INIT_LIST_HEAD(&retval->locks); | ||
132 | INIT_LIST_HEAD(&retval->inode_list); | ||
133 | retval->journal = osb->journal; | ||
134 | |||
135 | return retval; | ||
136 | } | ||
137 | |||
138 | /* pass it NULL and it will allocate a new handle object for you. If | ||
139 | * you pass it a handle however, it may still return error, in which | ||
140 | * case it has free'd the passed handle for you. */ | ||
141 | struct ocfs2_journal_handle *ocfs2_start_trans(struct ocfs2_super *osb, | ||
142 | struct ocfs2_journal_handle *handle, | ||
143 | int max_buffs) | ||
144 | { | ||
145 | int ret; | ||
146 | journal_t *journal = osb->journal->j_journal; | ||
147 | |||
148 | mlog_entry("(max_buffs = %d)\n", max_buffs); | ||
149 | |||
150 | if (!osb || !osb->journal->j_journal) | ||
151 | BUG(); | ||
152 | |||
153 | if (ocfs2_is_hard_readonly(osb)) { | ||
154 | ret = -EROFS; | ||
155 | goto done_free; | ||
156 | } | ||
157 | |||
158 | BUG_ON(osb->journal->j_state == OCFS2_JOURNAL_FREE); | ||
159 | BUG_ON(max_buffs <= 0); | ||
160 | |||
161 | /* JBD might support this, but our journalling code doesn't yet. */ | ||
162 | if (journal_current_handle()) { | ||
163 | mlog(ML_ERROR, "Recursive transaction attempted!\n"); | ||
164 | BUG(); | ||
165 | } | ||
166 | |||
167 | if (!handle) | ||
168 | handle = ocfs2_alloc_handle(osb); | ||
169 | if (!handle) { | ||
170 | ret = -ENOMEM; | ||
171 | mlog(ML_ERROR, "Failed to allocate memory for journal " | ||
172 | "handle!\n"); | ||
173 | goto done_free; | ||
174 | } | ||
175 | |||
176 | handle->max_buffs = max_buffs; | ||
177 | |||
178 | down_read(&osb->journal->j_trans_barrier); | ||
179 | |||
180 | /* actually start the transaction now */ | ||
181 | handle->k_handle = journal_start(journal, max_buffs); | ||
182 | if (IS_ERR(handle->k_handle)) { | ||
183 | up_read(&osb->journal->j_trans_barrier); | ||
184 | |||
185 | ret = PTR_ERR(handle->k_handle); | ||
186 | handle->k_handle = NULL; | ||
187 | mlog_errno(ret); | ||
188 | |||
189 | if (is_journal_aborted(journal)) { | ||
190 | ocfs2_abort(osb->sb, "Detected aborted journal"); | ||
191 | ret = -EROFS; | ||
192 | } | ||
193 | goto done_free; | ||
194 | } | ||
195 | |||
196 | atomic_inc(&(osb->journal->j_num_trans)); | ||
197 | handle->flags |= OCFS2_HANDLE_STARTED; | ||
198 | |||
199 | mlog_exit_ptr(handle); | ||
200 | return handle; | ||
201 | |||
202 | done_free: | ||
203 | if (handle) | ||
204 | ocfs2_commit_unstarted_handle(handle); /* will kfree handle */ | ||
205 | |||
206 | mlog_exit(ret); | ||
207 | return ERR_PTR(ret); | ||
208 | } | ||
209 | |||
210 | void ocfs2_handle_add_inode(struct ocfs2_journal_handle *handle, | ||
211 | struct inode *inode) | ||
212 | { | ||
213 | BUG_ON(!handle); | ||
214 | BUG_ON(!inode); | ||
215 | |||
216 | atomic_inc(&inode->i_count); | ||
217 | |||
218 | /* we're obviously changing it... */ | ||
219 | down(&inode->i_sem); | ||
220 | |||
221 | /* sanity check */ | ||
222 | BUG_ON(OCFS2_I(inode)->ip_handle); | ||
223 | BUG_ON(!list_empty(&OCFS2_I(inode)->ip_handle_list)); | ||
224 | |||
225 | OCFS2_I(inode)->ip_handle = handle; | ||
226 | list_del(&(OCFS2_I(inode)->ip_handle_list)); | ||
227 | list_add_tail(&(OCFS2_I(inode)->ip_handle_list), &(handle->inode_list)); | ||
228 | } | ||
229 | |||
230 | static void ocfs2_handle_unlock_inodes(struct ocfs2_journal_handle *handle) | ||
231 | { | ||
232 | struct list_head *p, *n; | ||
233 | struct inode *inode; | ||
234 | struct ocfs2_inode_info *oi; | ||
235 | |||
236 | list_for_each_safe(p, n, &handle->inode_list) { | ||
237 | oi = list_entry(p, struct ocfs2_inode_info, | ||
238 | ip_handle_list); | ||
239 | inode = &oi->vfs_inode; | ||
240 | |||
241 | OCFS2_I(inode)->ip_handle = NULL; | ||
242 | list_del_init(&OCFS2_I(inode)->ip_handle_list); | ||
243 | |||
244 | up(&inode->i_sem); | ||
245 | iput(inode); | ||
246 | } | ||
247 | } | ||
248 | |||
249 | /* This is trivial so we do it out of the main commit | ||
250 | * paths. Beware, it can be called from start_trans too! */ | ||
251 | static void ocfs2_commit_unstarted_handle(struct ocfs2_journal_handle *handle) | ||
252 | { | ||
253 | mlog_entry_void(); | ||
254 | |||
255 | BUG_ON(handle->flags & OCFS2_HANDLE_STARTED); | ||
256 | |||
257 | ocfs2_handle_unlock_inodes(handle); | ||
258 | /* You are allowed to add journal locks before the transaction | ||
259 | * has started. */ | ||
260 | ocfs2_handle_cleanup_locks(handle->journal, handle); | ||
261 | |||
262 | kfree(handle); | ||
263 | |||
264 | mlog_exit_void(); | ||
265 | } | ||
266 | |||
267 | void ocfs2_commit_trans(struct ocfs2_journal_handle *handle) | ||
268 | { | ||
269 | handle_t *jbd_handle; | ||
270 | int retval; | ||
271 | struct ocfs2_journal *journal = handle->journal; | ||
272 | |||
273 | mlog_entry_void(); | ||
274 | |||
275 | BUG_ON(!handle); | ||
276 | |||
277 | if (!(handle->flags & OCFS2_HANDLE_STARTED)) { | ||
278 | ocfs2_commit_unstarted_handle(handle); | ||
279 | mlog_exit_void(); | ||
280 | return; | ||
281 | } | ||
282 | |||
283 | /* release inode semaphores we took during this transaction */ | ||
284 | ocfs2_handle_unlock_inodes(handle); | ||
285 | |||
286 | /* ocfs2_extend_trans may have had to call journal_restart | ||
287 | * which will always commit the transaction, but may return | ||
288 | * error for any number of reasons. If this is the case, we | ||
289 | * clear k_handle as it's not valid any more. */ | ||
290 | if (handle->k_handle) { | ||
291 | jbd_handle = handle->k_handle; | ||
292 | |||
293 | if (handle->flags & OCFS2_HANDLE_SYNC) | ||
294 | jbd_handle->h_sync = 1; | ||
295 | else | ||
296 | jbd_handle->h_sync = 0; | ||
297 | |||
298 | /* actually stop the transaction. if we've set h_sync, | ||
299 | * it'll have been committed when we return */ | ||
300 | retval = journal_stop(jbd_handle); | ||
301 | if (retval < 0) { | ||
302 | mlog_errno(retval); | ||
303 | mlog(ML_ERROR, "Could not commit transaction\n"); | ||
304 | BUG(); | ||
305 | } | ||
306 | |||
307 | handle->k_handle = NULL; /* it's been free'd in journal_stop */ | ||
308 | } | ||
309 | |||
310 | ocfs2_handle_cleanup_locks(journal, handle); | ||
311 | |||
312 | up_read(&journal->j_trans_barrier); | ||
313 | |||
314 | kfree(handle); | ||
315 | mlog_exit_void(); | ||
316 | } | ||
317 | |||
318 | /* | ||
319 | * 'nblocks' is what you want to add to the current | ||
320 | * transaction. extend_trans will either extend the current handle by | ||
321 | * nblocks, or commit it and start a new one with nblocks credits. | ||
322 | * | ||
323 | * WARNING: This will not release any semaphores or disk locks taken | ||
324 | * during the transaction, so make sure they were taken *before* | ||
325 | * start_trans or we'll have ordering deadlocks. | ||
326 | * | ||
327 | * WARNING2: Note that we do *not* drop j_trans_barrier here. This is | ||
328 | * good because transaction ids haven't yet been recorded on the | ||
329 | * cluster locks associated with this handle. | ||
330 | */ | ||
331 | int ocfs2_extend_trans(struct ocfs2_journal_handle *handle, | ||
332 | int nblocks) | ||
333 | { | ||
334 | int status; | ||
335 | |||
336 | BUG_ON(!handle); | ||
337 | BUG_ON(!(handle->flags & OCFS2_HANDLE_STARTED)); | ||
338 | BUG_ON(!nblocks); | ||
339 | |||
340 | mlog_entry_void(); | ||
341 | |||
342 | mlog(0, "Trying to extend transaction by %d blocks\n", nblocks); | ||
343 | |||
344 | status = journal_extend(handle->k_handle, nblocks); | ||
345 | if (status < 0) { | ||
346 | mlog_errno(status); | ||
347 | goto bail; | ||
348 | } | ||
349 | |||
350 | if (status > 0) { | ||
351 | mlog(0, "journal_extend failed, trying journal_restart\n"); | ||
352 | status = journal_restart(handle->k_handle, nblocks); | ||
353 | if (status < 0) { | ||
354 | handle->k_handle = NULL; | ||
355 | mlog_errno(status); | ||
356 | goto bail; | ||
357 | } | ||
358 | handle->max_buffs = nblocks; | ||
359 | } else | ||
360 | handle->max_buffs += nblocks; | ||
361 | |||
362 | status = 0; | ||
363 | bail: | ||
364 | |||
365 | mlog_exit(status); | ||
366 | return status; | ||
367 | } | ||
368 | |||
369 | int ocfs2_journal_access(struct ocfs2_journal_handle *handle, | ||
370 | struct inode *inode, | ||
371 | struct buffer_head *bh, | ||
372 | int type) | ||
373 | { | ||
374 | int status; | ||
375 | |||
376 | BUG_ON(!inode); | ||
377 | BUG_ON(!handle); | ||
378 | BUG_ON(!bh); | ||
379 | BUG_ON(!(handle->flags & OCFS2_HANDLE_STARTED)); | ||
380 | |||
381 | mlog_entry("bh->b_blocknr=%llu, type=%d (\"%s\"), bh->b_size = %hu\n", | ||
382 | (unsigned long long)bh->b_blocknr, type, | ||
383 | (type == OCFS2_JOURNAL_ACCESS_CREATE) ? | ||
384 | "OCFS2_JOURNAL_ACCESS_CREATE" : | ||
385 | "OCFS2_JOURNAL_ACCESS_WRITE", | ||
386 | bh->b_size); | ||
387 | |||
388 | /* we can safely remove this assertion after testing. */ | ||
389 | if (!buffer_uptodate(bh)) { | ||
390 | mlog(ML_ERROR, "giving me a buffer that's not uptodate!\n"); | ||
391 | mlog(ML_ERROR, "b_blocknr=%llu\n", | ||
392 | (unsigned long long)bh->b_blocknr); | ||
393 | BUG(); | ||
394 | } | ||
395 | |||
396 | /* Set the current transaction information on the inode so | ||
397 | * that the locking code knows whether it can drop it's locks | ||
398 | * on this inode or not. We're protected from the commit | ||
399 | * thread updating the current transaction id until | ||
400 | * ocfs2_commit_trans() because ocfs2_start_trans() took | ||
401 | * j_trans_barrier for us. */ | ||
402 | ocfs2_set_inode_lock_trans(OCFS2_SB(inode->i_sb)->journal, inode); | ||
403 | |||
404 | down(&OCFS2_I(inode)->ip_io_sem); | ||
405 | switch (type) { | ||
406 | case OCFS2_JOURNAL_ACCESS_CREATE: | ||
407 | case OCFS2_JOURNAL_ACCESS_WRITE: | ||
408 | status = journal_get_write_access(handle->k_handle, bh); | ||
409 | break; | ||
410 | |||
411 | case OCFS2_JOURNAL_ACCESS_UNDO: | ||
412 | status = journal_get_undo_access(handle->k_handle, bh); | ||
413 | break; | ||
414 | |||
415 | default: | ||
416 | status = -EINVAL; | ||
417 | mlog(ML_ERROR, "Uknown access type!\n"); | ||
418 | } | ||
419 | up(&OCFS2_I(inode)->ip_io_sem); | ||
420 | |||
421 | if (status < 0) | ||
422 | mlog(ML_ERROR, "Error %d getting %d access to buffer!\n", | ||
423 | status, type); | ||
424 | |||
425 | mlog_exit(status); | ||
426 | return status; | ||
427 | } | ||
428 | |||
429 | int ocfs2_journal_dirty(struct ocfs2_journal_handle *handle, | ||
430 | struct buffer_head *bh) | ||
431 | { | ||
432 | int status; | ||
433 | |||
434 | BUG_ON(!(handle->flags & OCFS2_HANDLE_STARTED)); | ||
435 | |||
436 | mlog_entry("(bh->b_blocknr=%llu)\n", | ||
437 | (unsigned long long)bh->b_blocknr); | ||
438 | |||
439 | status = journal_dirty_metadata(handle->k_handle, bh); | ||
440 | if (status < 0) | ||
441 | mlog(ML_ERROR, "Could not dirty metadata buffer. " | ||
442 | "(bh->b_blocknr=%llu)\n", | ||
443 | (unsigned long long)bh->b_blocknr); | ||
444 | |||
445 | mlog_exit(status); | ||
446 | return status; | ||
447 | } | ||
448 | |||
449 | int ocfs2_journal_dirty_data(handle_t *handle, | ||
450 | struct buffer_head *bh) | ||
451 | { | ||
452 | int err = journal_dirty_data(handle, bh); | ||
453 | if (err) | ||
454 | mlog_errno(err); | ||
455 | /* TODO: When we can handle it, abort the handle and go RO on | ||
456 | * error here. */ | ||
457 | |||
458 | return err; | ||
459 | } | ||
460 | |||
461 | /* We always assume you're adding a metadata lock at level 'ex' */ | ||
462 | int ocfs2_handle_add_lock(struct ocfs2_journal_handle *handle, | ||
463 | struct inode *inode) | ||
464 | { | ||
465 | int status; | ||
466 | struct ocfs2_journal_lock *lock; | ||
467 | |||
468 | BUG_ON(!inode); | ||
469 | |||
470 | lock = kmem_cache_alloc(ocfs2_lock_cache, GFP_NOFS); | ||
471 | if (!lock) { | ||
472 | status = -ENOMEM; | ||
473 | mlog_errno(-ENOMEM); | ||
474 | goto bail; | ||
475 | } | ||
476 | |||
477 | if (!igrab(inode)) | ||
478 | BUG(); | ||
479 | lock->jl_inode = inode; | ||
480 | |||
481 | list_add_tail(&(lock->jl_lock_list), &(handle->locks)); | ||
482 | handle->num_locks++; | ||
483 | |||
484 | status = 0; | ||
485 | bail: | ||
486 | mlog_exit(status); | ||
487 | return status; | ||
488 | } | ||
489 | |||
490 | static void ocfs2_handle_cleanup_locks(struct ocfs2_journal *journal, | ||
491 | struct ocfs2_journal_handle *handle) | ||
492 | { | ||
493 | struct list_head *p, *n; | ||
494 | struct ocfs2_journal_lock *lock; | ||
495 | struct inode *inode; | ||
496 | |||
497 | list_for_each_safe(p, n, &(handle->locks)) { | ||
498 | lock = list_entry(p, struct ocfs2_journal_lock, | ||
499 | jl_lock_list); | ||
500 | list_del(&lock->jl_lock_list); | ||
501 | handle->num_locks--; | ||
502 | |||
503 | inode = lock->jl_inode; | ||
504 | ocfs2_meta_unlock(inode, 1); | ||
505 | if (atomic_read(&inode->i_count) == 1) | ||
506 | mlog(ML_ERROR, | ||
507 | "Inode %"MLFu64", I'm doing a last iput for!", | ||
508 | OCFS2_I(inode)->ip_blkno); | ||
509 | iput(inode); | ||
510 | kmem_cache_free(ocfs2_lock_cache, lock); | ||
511 | } | ||
512 | } | ||
513 | |||
514 | #define OCFS2_DEFAULT_COMMIT_INTERVAL (HZ * 5) | ||
515 | |||
516 | void ocfs2_set_journal_params(struct ocfs2_super *osb) | ||
517 | { | ||
518 | journal_t *journal = osb->journal->j_journal; | ||
519 | |||
520 | spin_lock(&journal->j_state_lock); | ||
521 | journal->j_commit_interval = OCFS2_DEFAULT_COMMIT_INTERVAL; | ||
522 | if (osb->s_mount_opt & OCFS2_MOUNT_BARRIER) | ||
523 | journal->j_flags |= JFS_BARRIER; | ||
524 | else | ||
525 | journal->j_flags &= ~JFS_BARRIER; | ||
526 | spin_unlock(&journal->j_state_lock); | ||
527 | } | ||
528 | |||
529 | int ocfs2_journal_init(struct ocfs2_journal *journal, int *dirty) | ||
530 | { | ||
531 | int status = -1; | ||
532 | struct inode *inode = NULL; /* the journal inode */ | ||
533 | journal_t *j_journal = NULL; | ||
534 | struct ocfs2_dinode *di = NULL; | ||
535 | struct buffer_head *bh = NULL; | ||
536 | struct ocfs2_super *osb; | ||
537 | int meta_lock = 0; | ||
538 | |||
539 | mlog_entry_void(); | ||
540 | |||
541 | BUG_ON(!journal); | ||
542 | |||
543 | osb = journal->j_osb; | ||
544 | |||
545 | /* already have the inode for our journal */ | ||
546 | inode = ocfs2_get_system_file_inode(osb, JOURNAL_SYSTEM_INODE, | ||
547 | osb->slot_num); | ||
548 | if (inode == NULL) { | ||
549 | status = -EACCES; | ||
550 | mlog_errno(status); | ||
551 | goto done; | ||
552 | } | ||
553 | if (is_bad_inode(inode)) { | ||
554 | mlog(ML_ERROR, "access error (bad inode)\n"); | ||
555 | iput(inode); | ||
556 | inode = NULL; | ||
557 | status = -EACCES; | ||
558 | goto done; | ||
559 | } | ||
560 | |||
561 | SET_INODE_JOURNAL(inode); | ||
562 | OCFS2_I(inode)->ip_open_count++; | ||
563 | |||
564 | status = ocfs2_meta_lock(inode, NULL, &bh, 1); | ||
565 | if (status < 0) { | ||
566 | if (status != -ERESTARTSYS) | ||
567 | mlog(ML_ERROR, "Could not get lock on journal!\n"); | ||
568 | goto done; | ||
569 | } | ||
570 | |||
571 | meta_lock = 1; | ||
572 | di = (struct ocfs2_dinode *)bh->b_data; | ||
573 | |||
574 | if (inode->i_size < OCFS2_MIN_JOURNAL_SIZE) { | ||
575 | mlog(ML_ERROR, "Journal file size (%lld) is too small!\n", | ||
576 | inode->i_size); | ||
577 | status = -EINVAL; | ||
578 | goto done; | ||
579 | } | ||
580 | |||
581 | mlog(0, "inode->i_size = %lld\n", inode->i_size); | ||
582 | mlog(0, "inode->i_blocks = %lu\n", inode->i_blocks); | ||
583 | mlog(0, "inode->ip_clusters = %u\n", OCFS2_I(inode)->ip_clusters); | ||
584 | |||
585 | /* call the kernels journal init function now */ | ||
586 | j_journal = journal_init_inode(inode); | ||
587 | if (j_journal == NULL) { | ||
588 | mlog(ML_ERROR, "Linux journal layer error\n"); | ||
589 | status = -EINVAL; | ||
590 | goto done; | ||
591 | } | ||
592 | |||
593 | mlog(0, "Returned from journal_init_inode\n"); | ||
594 | mlog(0, "j_journal->j_maxlen = %u\n", j_journal->j_maxlen); | ||
595 | |||
596 | *dirty = (le32_to_cpu(di->id1.journal1.ij_flags) & | ||
597 | OCFS2_JOURNAL_DIRTY_FL); | ||
598 | |||
599 | journal->j_journal = j_journal; | ||
600 | journal->j_inode = inode; | ||
601 | journal->j_bh = bh; | ||
602 | |||
603 | ocfs2_set_journal_params(osb); | ||
604 | |||
605 | journal->j_state = OCFS2_JOURNAL_LOADED; | ||
606 | |||
607 | status = 0; | ||
608 | done: | ||
609 | if (status < 0) { | ||
610 | if (meta_lock) | ||
611 | ocfs2_meta_unlock(inode, 1); | ||
612 | if (bh != NULL) | ||
613 | brelse(bh); | ||
614 | if (inode) { | ||
615 | OCFS2_I(inode)->ip_open_count--; | ||
616 | iput(inode); | ||
617 | } | ||
618 | } | ||
619 | |||
620 | mlog_exit(status); | ||
621 | return status; | ||
622 | } | ||
623 | |||
624 | static int ocfs2_journal_toggle_dirty(struct ocfs2_super *osb, | ||
625 | int dirty) | ||
626 | { | ||
627 | int status; | ||
628 | unsigned int flags; | ||
629 | struct ocfs2_journal *journal = osb->journal; | ||
630 | struct buffer_head *bh = journal->j_bh; | ||
631 | struct ocfs2_dinode *fe; | ||
632 | |||
633 | mlog_entry_void(); | ||
634 | |||
635 | fe = (struct ocfs2_dinode *)bh->b_data; | ||
636 | if (!OCFS2_IS_VALID_DINODE(fe)) { | ||
637 | /* This is called from startup/shutdown which will | ||
638 | * handle the errors in a specific manner, so no need | ||
639 | * to call ocfs2_error() here. */ | ||
640 | mlog(ML_ERROR, "Journal dinode %"MLFu64" has invalid " | ||
641 | "signature: %.*s", fe->i_blkno, 7, fe->i_signature); | ||
642 | status = -EIO; | ||
643 | goto out; | ||
644 | } | ||
645 | |||
646 | flags = le32_to_cpu(fe->id1.journal1.ij_flags); | ||
647 | if (dirty) | ||
648 | flags |= OCFS2_JOURNAL_DIRTY_FL; | ||
649 | else | ||
650 | flags &= ~OCFS2_JOURNAL_DIRTY_FL; | ||
651 | fe->id1.journal1.ij_flags = cpu_to_le32(flags); | ||
652 | |||
653 | status = ocfs2_write_block(osb, bh, journal->j_inode); | ||
654 | if (status < 0) | ||
655 | mlog_errno(status); | ||
656 | |||
657 | out: | ||
658 | mlog_exit(status); | ||
659 | return status; | ||
660 | } | ||
661 | |||
662 | /* | ||
663 | * If the journal has been kmalloc'd it needs to be freed after this | ||
664 | * call. | ||
665 | */ | ||
666 | void ocfs2_journal_shutdown(struct ocfs2_super *osb) | ||
667 | { | ||
668 | struct ocfs2_journal *journal = NULL; | ||
669 | int status = 0; | ||
670 | struct inode *inode = NULL; | ||
671 | int num_running_trans = 0; | ||
672 | |||
673 | mlog_entry_void(); | ||
674 | |||
675 | if (!osb) | ||
676 | BUG(); | ||
677 | |||
678 | journal = osb->journal; | ||
679 | if (!journal) | ||
680 | goto done; | ||
681 | |||
682 | inode = journal->j_inode; | ||
683 | |||
684 | if (journal->j_state != OCFS2_JOURNAL_LOADED) | ||
685 | goto done; | ||
686 | |||
687 | /* need to inc inode use count as journal_destroy will iput. */ | ||
688 | if (!igrab(inode)) | ||
689 | BUG(); | ||
690 | |||
691 | num_running_trans = atomic_read(&(osb->journal->j_num_trans)); | ||
692 | if (num_running_trans > 0) | ||
693 | mlog(0, "Shutting down journal: must wait on %d " | ||
694 | "running transactions!\n", | ||
695 | num_running_trans); | ||
696 | |||
697 | /* Do a commit_cache here. It will flush our journal, *and* | ||
698 | * release any locks that are still held. | ||
699 | * set the SHUTDOWN flag and release the trans lock. | ||
700 | * the commit thread will take the trans lock for us below. */ | ||
701 | journal->j_state = OCFS2_JOURNAL_IN_SHUTDOWN; | ||
702 | |||
703 | /* The OCFS2_JOURNAL_IN_SHUTDOWN will signal to commit_cache to not | ||
704 | * drop the trans_lock (which we want to hold until we | ||
705 | * completely destroy the journal. */ | ||
706 | if (osb->commit_task) { | ||
707 | /* Wait for the commit thread */ | ||
708 | mlog(0, "Waiting for ocfs2commit to exit....\n"); | ||
709 | kthread_stop(osb->commit_task); | ||
710 | osb->commit_task = NULL; | ||
711 | } | ||
712 | |||
713 | BUG_ON(atomic_read(&(osb->journal->j_num_trans)) != 0); | ||
714 | |||
715 | status = ocfs2_journal_toggle_dirty(osb, 0); | ||
716 | if (status < 0) | ||
717 | mlog_errno(status); | ||
718 | |||
719 | /* Shutdown the kernel journal system */ | ||
720 | journal_destroy(journal->j_journal); | ||
721 | |||
722 | OCFS2_I(inode)->ip_open_count--; | ||
723 | |||
724 | /* unlock our journal */ | ||
725 | ocfs2_meta_unlock(inode, 1); | ||
726 | |||
727 | brelse(journal->j_bh); | ||
728 | journal->j_bh = NULL; | ||
729 | |||
730 | journal->j_state = OCFS2_JOURNAL_FREE; | ||
731 | |||
732 | // up_write(&journal->j_trans_barrier); | ||
733 | done: | ||
734 | if (inode) | ||
735 | iput(inode); | ||
736 | mlog_exit_void(); | ||
737 | } | ||
738 | |||
739 | static void ocfs2_clear_journal_error(struct super_block *sb, | ||
740 | journal_t *journal, | ||
741 | int slot) | ||
742 | { | ||
743 | int olderr; | ||
744 | |||
745 | olderr = journal_errno(journal); | ||
746 | if (olderr) { | ||
747 | mlog(ML_ERROR, "File system error %d recorded in " | ||
748 | "journal %u.\n", olderr, slot); | ||
749 | mlog(ML_ERROR, "File system on device %s needs checking.\n", | ||
750 | sb->s_id); | ||
751 | |||
752 | journal_ack_err(journal); | ||
753 | journal_clear_err(journal); | ||
754 | } | ||
755 | } | ||
756 | |||
757 | int ocfs2_journal_load(struct ocfs2_journal *journal) | ||
758 | { | ||
759 | int status = 0; | ||
760 | struct ocfs2_super *osb; | ||
761 | |||
762 | mlog_entry_void(); | ||
763 | |||
764 | if (!journal) | ||
765 | BUG(); | ||
766 | |||
767 | osb = journal->j_osb; | ||
768 | |||
769 | status = journal_load(journal->j_journal); | ||
770 | if (status < 0) { | ||
771 | mlog(ML_ERROR, "Failed to load journal!\n"); | ||
772 | goto done; | ||
773 | } | ||
774 | |||
775 | ocfs2_clear_journal_error(osb->sb, journal->j_journal, osb->slot_num); | ||
776 | |||
777 | status = ocfs2_journal_toggle_dirty(osb, 1); | ||
778 | if (status < 0) { | ||
779 | mlog_errno(status); | ||
780 | goto done; | ||
781 | } | ||
782 | |||
783 | /* Launch the commit thread */ | ||
784 | osb->commit_task = kthread_run(ocfs2_commit_thread, osb, "ocfs2cmt-%d", | ||
785 | osb->osb_id); | ||
786 | if (IS_ERR(osb->commit_task)) { | ||
787 | status = PTR_ERR(osb->commit_task); | ||
788 | osb->commit_task = NULL; | ||
789 | mlog(ML_ERROR, "unable to launch ocfs2commit thread, error=%d", | ||
790 | status); | ||
791 | goto done; | ||
792 | } | ||
793 | |||
794 | done: | ||
795 | mlog_exit(status); | ||
796 | return status; | ||
797 | } | ||
798 | |||
799 | |||
800 | /* 'full' flag tells us whether we clear out all blocks or if we just | ||
801 | * mark the journal clean */ | ||
802 | int ocfs2_journal_wipe(struct ocfs2_journal *journal, int full) | ||
803 | { | ||
804 | int status; | ||
805 | |||
806 | mlog_entry_void(); | ||
807 | |||
808 | if (!journal) | ||
809 | BUG(); | ||
810 | |||
811 | status = journal_wipe(journal->j_journal, full); | ||
812 | if (status < 0) { | ||
813 | mlog_errno(status); | ||
814 | goto bail; | ||
815 | } | ||
816 | |||
817 | status = ocfs2_journal_toggle_dirty(journal->j_osb, 0); | ||
818 | if (status < 0) | ||
819 | mlog_errno(status); | ||
820 | |||
821 | bail: | ||
822 | mlog_exit(status); | ||
823 | return status; | ||
824 | } | ||
825 | |||
826 | /* | ||
827 | * JBD Might read a cached version of another nodes journal file. We | ||
828 | * don't want this as this file changes often and we get no | ||
829 | * notification on those changes. The only way to be sure that we've | ||
830 | * got the most up to date version of those blocks then is to force | ||
831 | * read them off disk. Just searching through the buffer cache won't | ||
832 | * work as there may be pages backing this file which are still marked | ||
833 | * up to date. We know things can't change on this file underneath us | ||
834 | * as we have the lock by now :) | ||
835 | */ | ||
836 | static int ocfs2_force_read_journal(struct inode *inode) | ||
837 | { | ||
838 | int status = 0; | ||
839 | int i, p_blocks; | ||
840 | u64 v_blkno, p_blkno; | ||
841 | #define CONCURRENT_JOURNAL_FILL 32 | ||
842 | struct buffer_head *bhs[CONCURRENT_JOURNAL_FILL]; | ||
843 | |||
844 | mlog_entry_void(); | ||
845 | |||
846 | BUG_ON(inode->i_blocks != | ||
847 | ocfs2_align_bytes_to_sectors(i_size_read(inode))); | ||
848 | |||
849 | memset(bhs, 0, sizeof(struct buffer_head *) * CONCURRENT_JOURNAL_FILL); | ||
850 | |||
851 | mlog(0, "Force reading %lu blocks\n", | ||
852 | (inode->i_blocks >> (inode->i_sb->s_blocksize_bits - 9))); | ||
853 | |||
854 | v_blkno = 0; | ||
855 | while (v_blkno < | ||
856 | (inode->i_blocks >> (inode->i_sb->s_blocksize_bits - 9))) { | ||
857 | |||
858 | status = ocfs2_extent_map_get_blocks(inode, v_blkno, | ||
859 | 1, &p_blkno, | ||
860 | &p_blocks); | ||
861 | if (status < 0) { | ||
862 | mlog_errno(status); | ||
863 | goto bail; | ||
864 | } | ||
865 | |||
866 | if (p_blocks > CONCURRENT_JOURNAL_FILL) | ||
867 | p_blocks = CONCURRENT_JOURNAL_FILL; | ||
868 | |||
869 | status = ocfs2_read_blocks(OCFS2_SB(inode->i_sb), | ||
870 | p_blkno, p_blocks, bhs, 0, | ||
871 | inode); | ||
872 | if (status < 0) { | ||
873 | mlog_errno(status); | ||
874 | goto bail; | ||
875 | } | ||
876 | |||
877 | for(i = 0; i < p_blocks; i++) { | ||
878 | brelse(bhs[i]); | ||
879 | bhs[i] = NULL; | ||
880 | } | ||
881 | |||
882 | v_blkno += p_blocks; | ||
883 | } | ||
884 | |||
885 | bail: | ||
886 | for(i = 0; i < CONCURRENT_JOURNAL_FILL; i++) | ||
887 | if (bhs[i]) | ||
888 | brelse(bhs[i]); | ||
889 | mlog_exit(status); | ||
890 | return status; | ||
891 | } | ||
892 | |||
/*
 * One unit of deferred recovery work, queued on the journal's
 * j_la_cleanups list and consumed by ocfs2_complete_recovery().
 */
struct ocfs2_la_recovery_item {
	struct list_head	lri_list;	/* link on journal->j_la_cleanups */
	int			lri_slot;	/* slot whose orphans get recovered */
	struct ocfs2_dinode	*lri_la_dinode;	/* local alloc dinode copy, may be
						 * NULL; kfree'd by the consumer */
	struct ocfs2_dinode	*lri_tl_dinode;	/* truncate log dinode copy, may be
						 * NULL; kfree'd by the consumer */
};
899 | |||
900 | /* Does the second half of the recovery process. By this point, the | ||
901 | * node is marked clean and can actually be considered recovered, | ||
902 | * hence it's no longer in the recovery map, but there's still some | ||
903 | * cleanup we can do which shouldn't happen within the recovery thread | ||
904 | * as locking in that context becomes very difficult if we are to take | ||
905 | * recovering nodes into account. | ||
906 | * | ||
907 | * NOTE: This function can and will sleep on recovery of other nodes | ||
908 | * during cluster locking, just like any other ocfs2 process. | ||
909 | */ | ||
910 | void ocfs2_complete_recovery(void *data) | ||
911 | { | ||
912 | int ret; | ||
913 | struct ocfs2_super *osb = data; | ||
914 | struct ocfs2_journal *journal = osb->journal; | ||
915 | struct ocfs2_dinode *la_dinode, *tl_dinode; | ||
916 | struct ocfs2_la_recovery_item *item; | ||
917 | struct list_head *p, *n; | ||
918 | LIST_HEAD(tmp_la_list); | ||
919 | |||
920 | mlog_entry_void(); | ||
921 | |||
922 | mlog(0, "completing recovery from keventd\n"); | ||
923 | |||
924 | spin_lock(&journal->j_lock); | ||
925 | list_splice_init(&journal->j_la_cleanups, &tmp_la_list); | ||
926 | spin_unlock(&journal->j_lock); | ||
927 | |||
928 | list_for_each_safe(p, n, &tmp_la_list) { | ||
929 | item = list_entry(p, struct ocfs2_la_recovery_item, lri_list); | ||
930 | list_del_init(&item->lri_list); | ||
931 | |||
932 | mlog(0, "Complete recovery for slot %d\n", item->lri_slot); | ||
933 | |||
934 | la_dinode = item->lri_la_dinode; | ||
935 | if (la_dinode) { | ||
936 | mlog(0, "Clean up local alloc %"MLFu64"\n", | ||
937 | la_dinode->i_blkno); | ||
938 | |||
939 | ret = ocfs2_complete_local_alloc_recovery(osb, | ||
940 | la_dinode); | ||
941 | if (ret < 0) | ||
942 | mlog_errno(ret); | ||
943 | |||
944 | kfree(la_dinode); | ||
945 | } | ||
946 | |||
947 | tl_dinode = item->lri_tl_dinode; | ||
948 | if (tl_dinode) { | ||
949 | mlog(0, "Clean up truncate log %"MLFu64"\n", | ||
950 | tl_dinode->i_blkno); | ||
951 | |||
952 | ret = ocfs2_complete_truncate_log_recovery(osb, | ||
953 | tl_dinode); | ||
954 | if (ret < 0) | ||
955 | mlog_errno(ret); | ||
956 | |||
957 | kfree(tl_dinode); | ||
958 | } | ||
959 | |||
960 | ret = ocfs2_recover_orphans(osb, item->lri_slot); | ||
961 | if (ret < 0) | ||
962 | mlog_errno(ret); | ||
963 | |||
964 | kfree(item); | ||
965 | } | ||
966 | |||
967 | mlog(0, "Recovery completion\n"); | ||
968 | mlog_exit_void(); | ||
969 | } | ||
970 | |||
971 | /* NOTE: This function always eats your references to la_dinode and | ||
972 | * tl_dinode, either manually on error, or by passing them to | ||
973 | * ocfs2_complete_recovery */ | ||
974 | static void ocfs2_queue_recovery_completion(struct ocfs2_journal *journal, | ||
975 | int slot_num, | ||
976 | struct ocfs2_dinode *la_dinode, | ||
977 | struct ocfs2_dinode *tl_dinode) | ||
978 | { | ||
979 | struct ocfs2_la_recovery_item *item; | ||
980 | |||
981 | item = kmalloc(sizeof(struct ocfs2_la_recovery_item), GFP_KERNEL); | ||
982 | if (!item) { | ||
983 | /* Though we wish to avoid it, we are in fact safe in | ||
984 | * skipping local alloc cleanup as fsck.ocfs2 is more | ||
985 | * than capable of reclaiming unused space. */ | ||
986 | if (la_dinode) | ||
987 | kfree(la_dinode); | ||
988 | |||
989 | if (tl_dinode) | ||
990 | kfree(tl_dinode); | ||
991 | |||
992 | mlog_errno(-ENOMEM); | ||
993 | return; | ||
994 | } | ||
995 | |||
996 | INIT_LIST_HEAD(&item->lri_list); | ||
997 | item->lri_la_dinode = la_dinode; | ||
998 | item->lri_slot = slot_num; | ||
999 | item->lri_tl_dinode = tl_dinode; | ||
1000 | |||
1001 | spin_lock(&journal->j_lock); | ||
1002 | list_add_tail(&item->lri_list, &journal->j_la_cleanups); | ||
1003 | queue_work(ocfs2_wq, &journal->j_recovery_work); | ||
1004 | spin_unlock(&journal->j_lock); | ||
1005 | } | ||
1006 | |||
1007 | /* Called by the mount code to queue recovery the last part of | ||
1008 | * recovery for it's own slot. */ | ||
1009 | void ocfs2_complete_mount_recovery(struct ocfs2_super *osb) | ||
1010 | { | ||
1011 | struct ocfs2_journal *journal = osb->journal; | ||
1012 | |||
1013 | if (osb->dirty) { | ||
1014 | /* No need to queue up our truncate_log as regular | ||
1015 | * cleanup will catch that. */ | ||
1016 | ocfs2_queue_recovery_completion(journal, | ||
1017 | osb->slot_num, | ||
1018 | osb->local_alloc_copy, | ||
1019 | NULL); | ||
1020 | ocfs2_schedule_truncate_log_flush(osb, 0); | ||
1021 | |||
1022 | osb->local_alloc_copy = NULL; | ||
1023 | osb->dirty = 0; | ||
1024 | } | ||
1025 | } | ||
1026 | |||
1027 | static int __ocfs2_recovery_thread(void *arg) | ||
1028 | { | ||
1029 | int status, node_num; | ||
1030 | struct ocfs2_super *osb = arg; | ||
1031 | |||
1032 | mlog_entry_void(); | ||
1033 | |||
1034 | status = ocfs2_wait_on_mount(osb); | ||
1035 | if (status < 0) { | ||
1036 | goto bail; | ||
1037 | } | ||
1038 | |||
1039 | restart: | ||
1040 | status = ocfs2_super_lock(osb, 1); | ||
1041 | if (status < 0) { | ||
1042 | mlog_errno(status); | ||
1043 | goto bail; | ||
1044 | } | ||
1045 | |||
1046 | while(!ocfs2_node_map_is_empty(osb, &osb->recovery_map)) { | ||
1047 | node_num = ocfs2_node_map_first_set_bit(osb, | ||
1048 | &osb->recovery_map); | ||
1049 | if (node_num == O2NM_INVALID_NODE_NUM) { | ||
1050 | mlog(0, "Out of nodes to recover.\n"); | ||
1051 | break; | ||
1052 | } | ||
1053 | |||
1054 | status = ocfs2_recover_node(osb, node_num); | ||
1055 | if (status < 0) { | ||
1056 | mlog(ML_ERROR, | ||
1057 | "Error %d recovering node %d on device (%u,%u)!\n", | ||
1058 | status, node_num, | ||
1059 | MAJOR(osb->sb->s_dev), MINOR(osb->sb->s_dev)); | ||
1060 | mlog(ML_ERROR, "Volume requires unmount.\n"); | ||
1061 | continue; | ||
1062 | } | ||
1063 | |||
1064 | ocfs2_recovery_map_clear(osb, node_num); | ||
1065 | } | ||
1066 | ocfs2_super_unlock(osb, 1); | ||
1067 | |||
1068 | /* We always run recovery on our own orphan dir - the dead | ||
1069 | * node(s) may have voted "no" on an inode delete earlier. A | ||
1070 | * revote is therefore required. */ | ||
1071 | ocfs2_queue_recovery_completion(osb->journal, osb->slot_num, NULL, | ||
1072 | NULL); | ||
1073 | |||
1074 | bail: | ||
1075 | down(&osb->recovery_lock); | ||
1076 | if (!status && | ||
1077 | !ocfs2_node_map_is_empty(osb, &osb->recovery_map)) { | ||
1078 | up(&osb->recovery_lock); | ||
1079 | goto restart; | ||
1080 | } | ||
1081 | |||
1082 | osb->recovery_thread_task = NULL; | ||
1083 | mb(); /* sync with ocfs2_recovery_thread_running */ | ||
1084 | wake_up(&osb->recovery_event); | ||
1085 | |||
1086 | up(&osb->recovery_lock); | ||
1087 | |||
1088 | mlog_exit(status); | ||
1089 | /* no one is callint kthread_stop() for us so the kthread() api | ||
1090 | * requires that we call do_exit(). And it isn't exported, but | ||
1091 | * complete_and_exit() seems to be a minimal wrapper around it. */ | ||
1092 | complete_and_exit(NULL, status); | ||
1093 | return status; | ||
1094 | } | ||
1095 | |||
1096 | void ocfs2_recovery_thread(struct ocfs2_super *osb, int node_num) | ||
1097 | { | ||
1098 | mlog_entry("(node_num=%d, osb->node_num = %d)\n", | ||
1099 | node_num, osb->node_num); | ||
1100 | |||
1101 | down(&osb->recovery_lock); | ||
1102 | if (osb->disable_recovery) | ||
1103 | goto out; | ||
1104 | |||
1105 | /* People waiting on recovery will wait on | ||
1106 | * the recovery map to empty. */ | ||
1107 | if (!ocfs2_recovery_map_set(osb, node_num)) | ||
1108 | mlog(0, "node %d already be in recovery.\n", node_num); | ||
1109 | |||
1110 | mlog(0, "starting recovery thread...\n"); | ||
1111 | |||
1112 | if (osb->recovery_thread_task) | ||
1113 | goto out; | ||
1114 | |||
1115 | osb->recovery_thread_task = kthread_run(__ocfs2_recovery_thread, osb, | ||
1116 | "ocfs2rec-%d", osb->osb_id); | ||
1117 | if (IS_ERR(osb->recovery_thread_task)) { | ||
1118 | mlog_errno((int)PTR_ERR(osb->recovery_thread_task)); | ||
1119 | osb->recovery_thread_task = NULL; | ||
1120 | } | ||
1121 | |||
1122 | out: | ||
1123 | up(&osb->recovery_lock); | ||
1124 | wake_up(&osb->recovery_event); | ||
1125 | |||
1126 | mlog_exit_void(); | ||
1127 | } | ||
1128 | |||
1129 | /* Does the actual journal replay and marks the journal inode as | ||
1130 | * clean. Will only replay if the journal inode is marked dirty. */ | ||
1131 | static int ocfs2_replay_journal(struct ocfs2_super *osb, | ||
1132 | int node_num, | ||
1133 | int slot_num) | ||
1134 | { | ||
1135 | int status; | ||
1136 | int got_lock = 0; | ||
1137 | unsigned int flags; | ||
1138 | struct inode *inode = NULL; | ||
1139 | struct ocfs2_dinode *fe; | ||
1140 | journal_t *journal = NULL; | ||
1141 | struct buffer_head *bh = NULL; | ||
1142 | |||
1143 | inode = ocfs2_get_system_file_inode(osb, JOURNAL_SYSTEM_INODE, | ||
1144 | slot_num); | ||
1145 | if (inode == NULL) { | ||
1146 | status = -EACCES; | ||
1147 | mlog_errno(status); | ||
1148 | goto done; | ||
1149 | } | ||
1150 | if (is_bad_inode(inode)) { | ||
1151 | status = -EACCES; | ||
1152 | iput(inode); | ||
1153 | inode = NULL; | ||
1154 | mlog_errno(status); | ||
1155 | goto done; | ||
1156 | } | ||
1157 | SET_INODE_JOURNAL(inode); | ||
1158 | |||
1159 | status = ocfs2_meta_lock_full(inode, NULL, &bh, 1, | ||
1160 | OCFS2_META_LOCK_RECOVERY); | ||
1161 | if (status < 0) { | ||
1162 | mlog(0, "status returned from ocfs2_meta_lock=%d\n", status); | ||
1163 | if (status != -ERESTARTSYS) | ||
1164 | mlog(ML_ERROR, "Could not lock journal!\n"); | ||
1165 | goto done; | ||
1166 | } | ||
1167 | got_lock = 1; | ||
1168 | |||
1169 | fe = (struct ocfs2_dinode *) bh->b_data; | ||
1170 | |||
1171 | flags = le32_to_cpu(fe->id1.journal1.ij_flags); | ||
1172 | |||
1173 | if (!(flags & OCFS2_JOURNAL_DIRTY_FL)) { | ||
1174 | mlog(0, "No recovery required for node %d\n", node_num); | ||
1175 | goto done; | ||
1176 | } | ||
1177 | |||
1178 | mlog(ML_NOTICE, "Recovering node %d from slot %d on device (%u,%u)\n", | ||
1179 | node_num, slot_num, | ||
1180 | MAJOR(osb->sb->s_dev), MINOR(osb->sb->s_dev)); | ||
1181 | |||
1182 | OCFS2_I(inode)->ip_clusters = le32_to_cpu(fe->i_clusters); | ||
1183 | |||
1184 | status = ocfs2_force_read_journal(inode); | ||
1185 | if (status < 0) { | ||
1186 | mlog_errno(status); | ||
1187 | goto done; | ||
1188 | } | ||
1189 | |||
1190 | mlog(0, "calling journal_init_inode\n"); | ||
1191 | journal = journal_init_inode(inode); | ||
1192 | if (journal == NULL) { | ||
1193 | mlog(ML_ERROR, "Linux journal layer error\n"); | ||
1194 | status = -EIO; | ||
1195 | goto done; | ||
1196 | } | ||
1197 | |||
1198 | status = journal_load(journal); | ||
1199 | if (status < 0) { | ||
1200 | mlog_errno(status); | ||
1201 | if (!igrab(inode)) | ||
1202 | BUG(); | ||
1203 | journal_destroy(journal); | ||
1204 | goto done; | ||
1205 | } | ||
1206 | |||
1207 | ocfs2_clear_journal_error(osb->sb, journal, slot_num); | ||
1208 | |||
1209 | /* wipe the journal */ | ||
1210 | mlog(0, "flushing the journal.\n"); | ||
1211 | journal_lock_updates(journal); | ||
1212 | status = journal_flush(journal); | ||
1213 | journal_unlock_updates(journal); | ||
1214 | if (status < 0) | ||
1215 | mlog_errno(status); | ||
1216 | |||
1217 | /* This will mark the node clean */ | ||
1218 | flags = le32_to_cpu(fe->id1.journal1.ij_flags); | ||
1219 | flags &= ~OCFS2_JOURNAL_DIRTY_FL; | ||
1220 | fe->id1.journal1.ij_flags = cpu_to_le32(flags); | ||
1221 | |||
1222 | status = ocfs2_write_block(osb, bh, inode); | ||
1223 | if (status < 0) | ||
1224 | mlog_errno(status); | ||
1225 | |||
1226 | if (!igrab(inode)) | ||
1227 | BUG(); | ||
1228 | |||
1229 | journal_destroy(journal); | ||
1230 | |||
1231 | done: | ||
1232 | /* drop the lock on this nodes journal */ | ||
1233 | if (got_lock) | ||
1234 | ocfs2_meta_unlock(inode, 1); | ||
1235 | |||
1236 | if (inode) | ||
1237 | iput(inode); | ||
1238 | |||
1239 | if (bh) | ||
1240 | brelse(bh); | ||
1241 | |||
1242 | mlog_exit(status); | ||
1243 | return status; | ||
1244 | } | ||
1245 | |||
1246 | /* | ||
1247 | * Do the most important parts of node recovery: | ||
1248 | * - Replay it's journal | ||
1249 | * - Stamp a clean local allocator file | ||
1250 | * - Stamp a clean truncate log | ||
1251 | * - Mark the node clean | ||
1252 | * | ||
1253 | * If this function completes without error, a node in OCFS2 can be | ||
1254 | * said to have been safely recovered. As a result, failure during the | ||
1255 | * second part of a nodes recovery process (local alloc recovery) is | ||
1256 | * far less concerning. | ||
1257 | */ | ||
1258 | static int ocfs2_recover_node(struct ocfs2_super *osb, | ||
1259 | int node_num) | ||
1260 | { | ||
1261 | int status = 0; | ||
1262 | int slot_num; | ||
1263 | struct ocfs2_slot_info *si = osb->slot_info; | ||
1264 | struct ocfs2_dinode *la_copy = NULL; | ||
1265 | struct ocfs2_dinode *tl_copy = NULL; | ||
1266 | |||
1267 | mlog_entry("(node_num=%d, osb->node_num = %d)\n", | ||
1268 | node_num, osb->node_num); | ||
1269 | |||
1270 | mlog(0, "checking node %d\n", node_num); | ||
1271 | |||
1272 | /* Should not ever be called to recover ourselves -- in that | ||
1273 | * case we should've called ocfs2_journal_load instead. */ | ||
1274 | if (osb->node_num == node_num) | ||
1275 | BUG(); | ||
1276 | |||
1277 | slot_num = ocfs2_node_num_to_slot(si, node_num); | ||
1278 | if (slot_num == OCFS2_INVALID_SLOT) { | ||
1279 | status = 0; | ||
1280 | mlog(0, "no slot for this node, so no recovery required.\n"); | ||
1281 | goto done; | ||
1282 | } | ||
1283 | |||
1284 | mlog(0, "node %d was using slot %d\n", node_num, slot_num); | ||
1285 | |||
1286 | status = ocfs2_replay_journal(osb, node_num, slot_num); | ||
1287 | if (status < 0) { | ||
1288 | mlog_errno(status); | ||
1289 | goto done; | ||
1290 | } | ||
1291 | |||
1292 | /* Stamp a clean local alloc file AFTER recovering the journal... */ | ||
1293 | status = ocfs2_begin_local_alloc_recovery(osb, slot_num, &la_copy); | ||
1294 | if (status < 0) { | ||
1295 | mlog_errno(status); | ||
1296 | goto done; | ||
1297 | } | ||
1298 | |||
1299 | /* An error from begin_truncate_log_recovery is not | ||
1300 | * serious enough to warrant halting the rest of | ||
1301 | * recovery. */ | ||
1302 | status = ocfs2_begin_truncate_log_recovery(osb, slot_num, &tl_copy); | ||
1303 | if (status < 0) | ||
1304 | mlog_errno(status); | ||
1305 | |||
1306 | /* Likewise, this would be a strange but ultimately not so | ||
1307 | * harmful place to get an error... */ | ||
1308 | ocfs2_clear_slot(si, slot_num); | ||
1309 | status = ocfs2_update_disk_slots(osb, si); | ||
1310 | if (status < 0) | ||
1311 | mlog_errno(status); | ||
1312 | |||
1313 | /* This will kfree the memory pointed to by la_copy and tl_copy */ | ||
1314 | ocfs2_queue_recovery_completion(osb->journal, slot_num, la_copy, | ||
1315 | tl_copy); | ||
1316 | |||
1317 | status = 0; | ||
1318 | done: | ||
1319 | |||
1320 | mlog_exit(status); | ||
1321 | return status; | ||
1322 | } | ||
1323 | |||
1324 | /* Test node liveness by trylocking his journal. If we get the lock, | ||
1325 | * we drop it here. Return 0 if we got the lock, -EAGAIN if node is | ||
1326 | * still alive (we couldn't get the lock) and < 0 on error. */ | ||
1327 | static int ocfs2_trylock_journal(struct ocfs2_super *osb, | ||
1328 | int slot_num) | ||
1329 | { | ||
1330 | int status, flags; | ||
1331 | struct inode *inode = NULL; | ||
1332 | |||
1333 | inode = ocfs2_get_system_file_inode(osb, JOURNAL_SYSTEM_INODE, | ||
1334 | slot_num); | ||
1335 | if (inode == NULL) { | ||
1336 | mlog(ML_ERROR, "access error\n"); | ||
1337 | status = -EACCES; | ||
1338 | goto bail; | ||
1339 | } | ||
1340 | if (is_bad_inode(inode)) { | ||
1341 | mlog(ML_ERROR, "access error (bad inode)\n"); | ||
1342 | iput(inode); | ||
1343 | inode = NULL; | ||
1344 | status = -EACCES; | ||
1345 | goto bail; | ||
1346 | } | ||
1347 | SET_INODE_JOURNAL(inode); | ||
1348 | |||
1349 | flags = OCFS2_META_LOCK_RECOVERY | OCFS2_META_LOCK_NOQUEUE; | ||
1350 | status = ocfs2_meta_lock_full(inode, NULL, NULL, 1, flags); | ||
1351 | if (status < 0) { | ||
1352 | if (status != -EAGAIN) | ||
1353 | mlog_errno(status); | ||
1354 | goto bail; | ||
1355 | } | ||
1356 | |||
1357 | ocfs2_meta_unlock(inode, 1); | ||
1358 | bail: | ||
1359 | if (inode) | ||
1360 | iput(inode); | ||
1361 | |||
1362 | return status; | ||
1363 | } | ||
1364 | |||
1365 | /* Call this underneath ocfs2_super_lock. It also assumes that the | ||
1366 | * slot info struct has been updated from disk. */ | ||
1367 | int ocfs2_mark_dead_nodes(struct ocfs2_super *osb) | ||
1368 | { | ||
1369 | int status, i, node_num; | ||
1370 | struct ocfs2_slot_info *si = osb->slot_info; | ||
1371 | |||
1372 | /* This is called with the super block cluster lock, so we | ||
1373 | * know that the slot map can't change underneath us. */ | ||
1374 | |||
1375 | spin_lock(&si->si_lock); | ||
1376 | for(i = 0; i < si->si_num_slots; i++) { | ||
1377 | if (i == osb->slot_num) | ||
1378 | continue; | ||
1379 | if (ocfs2_is_empty_slot(si, i)) | ||
1380 | continue; | ||
1381 | |||
1382 | node_num = si->si_global_node_nums[i]; | ||
1383 | if (ocfs2_node_map_test_bit(osb, &osb->recovery_map, node_num)) | ||
1384 | continue; | ||
1385 | spin_unlock(&si->si_lock); | ||
1386 | |||
1387 | /* Ok, we have a slot occupied by another node which | ||
1388 | * is not in the recovery map. We trylock his journal | ||
1389 | * file here to test if he's alive. */ | ||
1390 | status = ocfs2_trylock_journal(osb, i); | ||
1391 | if (!status) { | ||
1392 | /* Since we're called from mount, we know that | ||
1393 | * the recovery thread can't race us on | ||
1394 | * setting / checking the recovery bits. */ | ||
1395 | ocfs2_recovery_thread(osb, node_num); | ||
1396 | } else if ((status < 0) && (status != -EAGAIN)) { | ||
1397 | mlog_errno(status); | ||
1398 | goto bail; | ||
1399 | } | ||
1400 | |||
1401 | spin_lock(&si->si_lock); | ||
1402 | } | ||
1403 | spin_unlock(&si->si_lock); | ||
1404 | |||
1405 | status = 0; | ||
1406 | bail: | ||
1407 | mlog_exit(status); | ||
1408 | return status; | ||
1409 | } | ||
1410 | |||
1411 | static int ocfs2_recover_orphans(struct ocfs2_super *osb, | ||
1412 | int slot) | ||
1413 | { | ||
1414 | int status = 0; | ||
1415 | int have_disk_lock = 0; | ||
1416 | struct inode *inode = NULL; | ||
1417 | struct inode *iter; | ||
1418 | struct inode *orphan_dir_inode = NULL; | ||
1419 | unsigned long offset, blk, local; | ||
1420 | struct buffer_head *bh = NULL; | ||
1421 | struct ocfs2_dir_entry *de; | ||
1422 | struct super_block *sb = osb->sb; | ||
1423 | struct ocfs2_inode_info *oi; | ||
1424 | |||
1425 | mlog(0, "Recover inodes from orphan dir in slot %d\n", slot); | ||
1426 | |||
1427 | orphan_dir_inode = ocfs2_get_system_file_inode(osb, | ||
1428 | ORPHAN_DIR_SYSTEM_INODE, | ||
1429 | slot); | ||
1430 | if (!orphan_dir_inode) { | ||
1431 | status = -ENOENT; | ||
1432 | mlog_errno(status); | ||
1433 | goto out; | ||
1434 | } | ||
1435 | |||
1436 | down(&orphan_dir_inode->i_sem); | ||
1437 | status = ocfs2_meta_lock(orphan_dir_inode, NULL, NULL, 0); | ||
1438 | if (status < 0) { | ||
1439 | up(&orphan_dir_inode->i_sem); | ||
1440 | mlog_errno(status); | ||
1441 | goto out; | ||
1442 | } | ||
1443 | have_disk_lock = 1; | ||
1444 | |||
1445 | offset = 0; | ||
1446 | iter = NULL; | ||
1447 | while(offset < i_size_read(orphan_dir_inode)) { | ||
1448 | blk = offset >> sb->s_blocksize_bits; | ||
1449 | |||
1450 | bh = ocfs2_bread(orphan_dir_inode, blk, &status, 0); | ||
1451 | if (!bh) | ||
1452 | status = -EINVAL; | ||
1453 | if (status < 0) { | ||
1454 | up(&orphan_dir_inode->i_sem); | ||
1455 | if (bh) | ||
1456 | brelse(bh); | ||
1457 | mlog_errno(status); | ||
1458 | goto out; | ||
1459 | } | ||
1460 | |||
1461 | local = 0; | ||
1462 | while(offset < i_size_read(orphan_dir_inode) | ||
1463 | && local < sb->s_blocksize) { | ||
1464 | de = (struct ocfs2_dir_entry *) (bh->b_data + local); | ||
1465 | |||
1466 | if (!ocfs2_check_dir_entry(orphan_dir_inode, | ||
1467 | de, bh, local)) { | ||
1468 | up(&orphan_dir_inode->i_sem); | ||
1469 | status = -EINVAL; | ||
1470 | mlog_errno(status); | ||
1471 | brelse(bh); | ||
1472 | goto out; | ||
1473 | } | ||
1474 | |||
1475 | local += le16_to_cpu(de->rec_len); | ||
1476 | offset += le16_to_cpu(de->rec_len); | ||
1477 | |||
1478 | /* I guess we silently fail on no inode? */ | ||
1479 | if (!le64_to_cpu(de->inode)) | ||
1480 | continue; | ||
1481 | if (de->file_type > OCFS2_FT_MAX) { | ||
1482 | mlog(ML_ERROR, | ||
1483 | "block %llu contains invalid de: " | ||
1484 | "inode = %"MLFu64", rec_len = %u, " | ||
1485 | "name_len = %u, file_type = %u, " | ||
1486 | "name='%.*s'\n", | ||
1487 | (unsigned long long)bh->b_blocknr, | ||
1488 | le64_to_cpu(de->inode), | ||
1489 | le16_to_cpu(de->rec_len), | ||
1490 | de->name_len, | ||
1491 | de->file_type, | ||
1492 | de->name_len, | ||
1493 | de->name); | ||
1494 | continue; | ||
1495 | } | ||
1496 | if (de->name_len == 1 && !strncmp(".", de->name, 1)) | ||
1497 | continue; | ||
1498 | if (de->name_len == 2 && !strncmp("..", de->name, 2)) | ||
1499 | continue; | ||
1500 | |||
1501 | iter = ocfs2_iget(osb, le64_to_cpu(de->inode)); | ||
1502 | if (IS_ERR(iter)) | ||
1503 | continue; | ||
1504 | |||
1505 | mlog(0, "queue orphan %"MLFu64"\n", | ||
1506 | OCFS2_I(iter)->ip_blkno); | ||
1507 | OCFS2_I(iter)->ip_next_orphan = inode; | ||
1508 | inode = iter; | ||
1509 | } | ||
1510 | brelse(bh); | ||
1511 | } | ||
1512 | up(&orphan_dir_inode->i_sem); | ||
1513 | |||
1514 | ocfs2_meta_unlock(orphan_dir_inode, 0); | ||
1515 | have_disk_lock = 0; | ||
1516 | |||
1517 | iput(orphan_dir_inode); | ||
1518 | orphan_dir_inode = NULL; | ||
1519 | |||
1520 | while (inode) { | ||
1521 | oi = OCFS2_I(inode); | ||
1522 | mlog(0, "iput orphan %"MLFu64"\n", oi->ip_blkno); | ||
1523 | |||
1524 | iter = oi->ip_next_orphan; | ||
1525 | |||
1526 | spin_lock(&oi->ip_lock); | ||
1527 | /* Delete voting may have set these on the assumption | ||
1528 | * that the other node would wipe them successfully. | ||
1529 | * If they are still in the node's orphan dir, we need | ||
1530 | * to reset that state. */ | ||
1531 | oi->ip_flags &= ~(OCFS2_INODE_DELETED|OCFS2_INODE_SKIP_DELETE); | ||
1532 | |||
1533 | /* Set the proper information to get us going into | ||
1534 | * ocfs2_delete_inode. */ | ||
1535 | oi->ip_flags |= OCFS2_INODE_MAYBE_ORPHANED; | ||
1536 | oi->ip_orphaned_slot = slot; | ||
1537 | spin_unlock(&oi->ip_lock); | ||
1538 | |||
1539 | iput(inode); | ||
1540 | |||
1541 | inode = iter; | ||
1542 | } | ||
1543 | |||
1544 | out: | ||
1545 | if (have_disk_lock) | ||
1546 | ocfs2_meta_unlock(orphan_dir_inode, 0); | ||
1547 | |||
1548 | if (orphan_dir_inode) | ||
1549 | iput(orphan_dir_inode); | ||
1550 | |||
1551 | return status; | ||
1552 | } | ||
1553 | |||
1554 | static int ocfs2_wait_on_mount(struct ocfs2_super *osb) | ||
1555 | { | ||
1556 | /* This check is good because ocfs2 will wait on our recovery | ||
1557 | * thread before changing it to something other than MOUNTED | ||
1558 | * or DISABLED. */ | ||
1559 | wait_event(osb->osb_mount_event, | ||
1560 | atomic_read(&osb->vol_state) == VOLUME_MOUNTED || | ||
1561 | atomic_read(&osb->vol_state) == VOLUME_DISABLED); | ||
1562 | |||
1563 | /* If there's an error on mount, then we may never get to the | ||
1564 | * MOUNTED flag, but this is set right before | ||
1565 | * dismount_volume() so we can trust it. */ | ||
1566 | if (atomic_read(&osb->vol_state) == VOLUME_DISABLED) { | ||
1567 | mlog(0, "mount error, exiting!\n"); | ||
1568 | return -EBUSY; | ||
1569 | } | ||
1570 | |||
1571 | return 0; | ||
1572 | } | ||
1573 | |||
1574 | static int ocfs2_commit_thread(void *arg) | ||
1575 | { | ||
1576 | int status; | ||
1577 | struct ocfs2_super *osb = arg; | ||
1578 | struct ocfs2_journal *journal = osb->journal; | ||
1579 | |||
1580 | /* we can trust j_num_trans here because _should_stop() is only set in | ||
1581 | * shutdown and nobody other than ourselves should be able to start | ||
1582 | * transactions. committing on shutdown might take a few iterations | ||
1583 | * as final transactions put deleted inodes on the list */ | ||
1584 | while (!(kthread_should_stop() && | ||
1585 | atomic_read(&journal->j_num_trans) == 0)) { | ||
1586 | |||
1587 | wait_event_interruptible_timeout(osb->checkpoint_event, | ||
1588 | atomic_read(&journal->j_num_trans) | ||
1589 | || kthread_should_stop(), | ||
1590 | OCFS2_CHECKPOINT_INTERVAL); | ||
1591 | |||
1592 | status = ocfs2_commit_cache(osb); | ||
1593 | if (status < 0) | ||
1594 | mlog_errno(status); | ||
1595 | |||
1596 | if (kthread_should_stop() && atomic_read(&journal->j_num_trans)){ | ||
1597 | mlog(ML_KTHREAD, | ||
1598 | "commit_thread: %u transactions pending on " | ||
1599 | "shutdown\n", | ||
1600 | atomic_read(&journal->j_num_trans)); | ||
1601 | } | ||
1602 | } | ||
1603 | |||
1604 | return 0; | ||
1605 | } | ||
1606 | |||
1607 | /* Look for a dirty journal without taking any cluster locks. Used for | ||
1608 | * hard readonly access to determine whether the file system journals | ||
1609 | * require recovery. */ | ||
1610 | int ocfs2_check_journals_nolocks(struct ocfs2_super *osb) | ||
1611 | { | ||
1612 | int ret = 0; | ||
1613 | unsigned int slot; | ||
1614 | struct buffer_head *di_bh; | ||
1615 | struct ocfs2_dinode *di; | ||
1616 | struct inode *journal = NULL; | ||
1617 | |||
1618 | for(slot = 0; slot < osb->max_slots; slot++) { | ||
1619 | journal = ocfs2_get_system_file_inode(osb, | ||
1620 | JOURNAL_SYSTEM_INODE, | ||
1621 | slot); | ||
1622 | if (!journal || is_bad_inode(journal)) { | ||
1623 | ret = -EACCES; | ||
1624 | mlog_errno(ret); | ||
1625 | goto out; | ||
1626 | } | ||
1627 | |||
1628 | di_bh = NULL; | ||
1629 | ret = ocfs2_read_block(osb, OCFS2_I(journal)->ip_blkno, &di_bh, | ||
1630 | 0, journal); | ||
1631 | if (ret < 0) { | ||
1632 | mlog_errno(ret); | ||
1633 | goto out; | ||
1634 | } | ||
1635 | |||
1636 | di = (struct ocfs2_dinode *) di_bh->b_data; | ||
1637 | |||
1638 | if (le32_to_cpu(di->id1.journal1.ij_flags) & | ||
1639 | OCFS2_JOURNAL_DIRTY_FL) | ||
1640 | ret = -EROFS; | ||
1641 | |||
1642 | brelse(di_bh); | ||
1643 | if (ret) | ||
1644 | break; | ||
1645 | } | ||
1646 | |||
1647 | out: | ||
1648 | if (journal) | ||
1649 | iput(journal); | ||
1650 | |||
1651 | return ret; | ||
1652 | } | ||
diff --git a/fs/ocfs2/journal.h b/fs/ocfs2/journal.h new file mode 100644 index 000000000000..7d0a816184fa --- /dev/null +++ b/fs/ocfs2/journal.h | |||
@@ -0,0 +1,457 @@ | |||
1 | /* -*- mode: c; c-basic-offset: 8; -*- | ||
2 | * vim: noexpandtab sw=8 ts=8 sts=0: | ||
3 | * | ||
4 | * journal.h | ||
5 | * | ||
6 | * Defines journalling api and structures. | ||
7 | * | ||
8 | * Copyright (C) 2003, 2005 Oracle. All rights reserved. | ||
9 | * | ||
10 | * This program is free software; you can redistribute it and/or | ||
11 | * modify it under the terms of the GNU General Public | ||
12 | * License as published by the Free Software Foundation; either | ||
13 | * version 2 of the License, or (at your option) any later version. | ||
14 | * | ||
15 | * This program is distributed in the hope that it will be useful, | ||
16 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
17 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | ||
18 | * General Public License for more details. | ||
19 | * | ||
20 | * You should have received a copy of the GNU General Public | ||
21 | * License along with this program; if not, write to the | ||
22 | * Free Software Foundation, Inc., 59 Temple Place - Suite 330, | ||
23 | * Boston, MA 021110-1307, USA. | ||
24 | */ | ||
25 | |||
26 | #ifndef OCFS2_JOURNAL_H | ||
27 | #define OCFS2_JOURNAL_H | ||
28 | |||
29 | #include <linux/fs.h> | ||
30 | #include <linux/jbd.h> | ||
31 | |||
32 | #define OCFS2_CHECKPOINT_INTERVAL (8 * HZ) | ||
33 | |||
/* State of an ocfs2_journal, stored in its j_state field. */
enum ocfs2_journal_state {
	OCFS2_JOURNAL_FREE        = 0,
	OCFS2_JOURNAL_LOADED      = 1,
	OCFS2_JOURNAL_IN_SHUTDOWN = 2,
};
39 | |||
40 | struct ocfs2_super; | ||
41 | struct ocfs2_dinode; | ||
42 | struct ocfs2_journal_handle; | ||
43 | |||
/* Per-volume journal bookkeeping, reached through osb->journal. */
struct ocfs2_journal {
	enum ocfs2_journal_state   j_state;    /* Journal's current state */

	journal_t                 *j_journal;  /* The kernel's journal type */
	struct inode              *j_inode;    /* Kernel inode pointing to
						* this journal */
	struct ocfs2_super        *j_osb;      /* pointer to the super
						* block for the node
						* we're currently
						* running on -- not
						* necessarily the super
						* block from the node
						* which we usually run
						* from (recovery,
						* etc) */
	struct buffer_head        *j_bh;       /* Journal disk inode block */
	atomic_t                  j_num_trans; /* Number of transactions
						* currently in the system. */
	/* Current transaction id. Advanced by ocfs2_inc_trans_id(),
	 * which never leaves it at zero; read and written under
	 * trans_inc_lock. */
	unsigned long             j_trans_id;
	/* NOTE(review): presumably held around transactions so that a
	 * checkpoint can exclude them -- confirm in journal.c. */
	struct rw_semaphore       j_trans_barrier;
	/* Wait queue that ocfs2_checkpoint_inode() sleeps on until
	 * the inode is fully checkpointed. */
	wait_queue_head_t         j_checkpointed;

	/* NOTE(review): the three fields below look like the queue of
	 * pending recovery completions, its lock and the work item
	 * that processes it -- confirm against
	 * ocfs2_queue_recovery_completion(). */
	spinlock_t                j_lock;
	struct list_head          j_la_cleanups;
	struct work_struct        j_recovery_work;
};
70 | |||
71 | extern spinlock_t trans_inc_lock; | ||
72 | |||
73 | /* wrap j_trans_id so we never have it equal to zero. */ | ||
74 | static inline unsigned long ocfs2_inc_trans_id(struct ocfs2_journal *j) | ||
75 | { | ||
76 | unsigned long old_id; | ||
77 | spin_lock(&trans_inc_lock); | ||
78 | old_id = j->j_trans_id++; | ||
79 | if (unlikely(!j->j_trans_id)) | ||
80 | j->j_trans_id = 1; | ||
81 | spin_unlock(&trans_inc_lock); | ||
82 | return old_id; | ||
83 | } | ||
84 | |||
85 | static inline void ocfs2_set_inode_lock_trans(struct ocfs2_journal *journal, | ||
86 | struct inode *inode) | ||
87 | { | ||
88 | spin_lock(&trans_inc_lock); | ||
89 | OCFS2_I(inode)->ip_last_trans = journal->j_trans_id; | ||
90 | spin_unlock(&trans_inc_lock); | ||
91 | } | ||
92 | |||
93 | /* Used to figure out whether it's safe to drop a metadata lock on an | ||
94 | * inode. Returns true if all the inodes changes have been | ||
95 | * checkpointed to disk. You should be holding the spinlock on the | ||
96 | * metadata lock while calling this to be sure that nobody can take | ||
97 | * the lock and put it on another transaction. */ | ||
98 | static inline int ocfs2_inode_fully_checkpointed(struct inode *inode) | ||
99 | { | ||
100 | int ret; | ||
101 | struct ocfs2_journal *journal = OCFS2_SB(inode->i_sb)->journal; | ||
102 | |||
103 | spin_lock(&trans_inc_lock); | ||
104 | ret = time_after(journal->j_trans_id, OCFS2_I(inode)->ip_last_trans); | ||
105 | spin_unlock(&trans_inc_lock); | ||
106 | return ret; | ||
107 | } | ||
108 | |||
109 | /* convenience function to check if an inode is still new (has never | ||
110 | * hit disk) Will do you a favor and set created_trans = 0 when you've | ||
111 | * been checkpointed. returns '1' if the inode is still new. */ | ||
112 | static inline int ocfs2_inode_is_new(struct inode *inode) | ||
113 | { | ||
114 | int ret; | ||
115 | |||
116 | /* System files are never "new" as they're written out by | ||
117 | * mkfs. This helps us early during mount, before we have the | ||
118 | * journal open and j_trans_id could be junk. */ | ||
119 | if (OCFS2_I(inode)->ip_flags & OCFS2_INODE_SYSTEM_FILE) | ||
120 | return 0; | ||
121 | spin_lock(&trans_inc_lock); | ||
122 | ret = !(time_after(OCFS2_SB(inode->i_sb)->journal->j_trans_id, | ||
123 | OCFS2_I(inode)->ip_created_trans)); | ||
124 | if (!ret) | ||
125 | OCFS2_I(inode)->ip_created_trans = 0; | ||
126 | spin_unlock(&trans_inc_lock); | ||
127 | return ret; | ||
128 | } | ||
129 | |||
/* Mark an inode as new by recording the journal's current transaction
 * id in ip_created_trans. ocfs2_inode_is_new() compares against this
 * value. The id is sampled under trans_inc_lock. */
static inline void ocfs2_inode_set_new(struct ocfs2_super *osb,
				       struct inode *inode)
{
	spin_lock(&trans_inc_lock);
	OCFS2_I(inode)->ip_created_trans = osb->journal->j_trans_id;
	spin_unlock(&trans_inc_lock);
}
137 | |||
138 | extern kmem_cache_t *ocfs2_lock_cache; | ||
139 | |||
/* Associates an inode with a list link.
 * NOTE(review): presumably one of these is allocated from
 * ocfs2_lock_cache and linked onto ocfs2_journal_handle.locks by
 * ocfs2_handle_add_lock() -- confirm in journal.c. */
struct ocfs2_journal_lock {
	struct inode     *jl_inode;	/* inode this entry refers to */
	struct list_head  jl_lock_list;	/* list linkage */
};
144 | |||
/* OCFS2's wrapper around a kernel journal handle, carrying the state
 * of one transaction. */
struct ocfs2_journal_handle {
	handle_t            *k_handle; /* kernel handle. */
	struct ocfs2_journal        *journal;
	u32                 flags;     /* OCFS2_HANDLE_* flags, see
					* below. */
	int                 max_buffs; /* Buffs reserved by this handle */

	/* The following two fields are for ocfs2_handle_add_lock */
	int                 num_locks;
	struct list_head    locks;     /* A bunch of locks to
					* release on commit. */

	/* NOTE(review): presumably the inodes registered through
	 * ocfs2_handle_add_inode() -- confirm in journal.c. */
	struct list_head     inode_list;
};
159 | |||
160 | #define OCFS2_HANDLE_STARTED 1 | ||
161 | /* should we sync-commit this handle? */ | ||
162 | #define OCFS2_HANDLE_SYNC 2 | ||
/* Returns non-zero if OCFS2_HANDLE_STARTED is set in the handle's
 * flags, zero otherwise. */
static inline int ocfs2_handle_started(struct ocfs2_journal_handle *handle)
{
	return handle->flags & OCFS2_HANDLE_STARTED;
}
167 | |||
168 | static inline void ocfs2_handle_set_sync(struct ocfs2_journal_handle *handle, int sync) | ||
169 | { | ||
170 | if (sync) | ||
171 | handle->flags |= OCFS2_HANDLE_SYNC; | ||
172 | else | ||
173 | handle->flags &= ~OCFS2_HANDLE_SYNC; | ||
174 | } | ||
175 | |||
176 | /* Exported only for the journal struct init code in super.c. Do not call. */ | ||
177 | void ocfs2_complete_recovery(void *data); | ||
178 | |||
179 | /* | ||
180 | * Journal Control: | ||
181 | * Initialize, Load, Shutdown, Wipe a journal. | ||
182 | * | ||
183 | * ocfs2_journal_init - Initialize journal structures in the OSB. | ||
184 | * ocfs2_journal_load - Load the given journal off disk. Replay it if | ||
185 | * there's transactions still in there. | ||
186 | * ocfs2_journal_shutdown - Shutdown a journal, this will flush all | ||
187 | * uncommitted, uncheckpointed transactions. | ||
188 | * ocfs2_journal_wipe - Wipe transactions from a journal. Optionally | ||
189 | * zero out each block. | ||
190 | * ocfs2_recovery_thread - Perform recovery on a node. osb is our own osb. | ||
191 | * ocfs2_mark_dead_nodes - Start recovery on nodes we won't get a heartbeat | ||
192 | * event on. | ||
193 | * ocfs2_start_checkpoint - Kick the commit thread to do a checkpoint. | ||
194 | */ | ||
195 | void ocfs2_set_journal_params(struct ocfs2_super *osb); | ||
196 | int ocfs2_journal_init(struct ocfs2_journal *journal, | ||
197 | int *dirty); | ||
198 | void ocfs2_journal_shutdown(struct ocfs2_super *osb); | ||
199 | int ocfs2_journal_wipe(struct ocfs2_journal *journal, | ||
200 | int full); | ||
201 | int ocfs2_journal_load(struct ocfs2_journal *journal); | ||
202 | int ocfs2_check_journals_nolocks(struct ocfs2_super *osb); | ||
203 | void ocfs2_recovery_thread(struct ocfs2_super *osb, | ||
204 | int node_num); | ||
205 | int ocfs2_mark_dead_nodes(struct ocfs2_super *osb); | ||
206 | void ocfs2_complete_mount_recovery(struct ocfs2_super *osb); | ||
207 | |||
/* Kick the commit thread to do a checkpoint: raise
 * osb->needs_checkpoint and wake up whoever sleeps on
 * osb->checkpoint_event. Does not wait for the checkpoint to
 * happen. */
static inline void ocfs2_start_checkpoint(struct ocfs2_super *osb)
{
	atomic_set(&osb->needs_checkpoint, 1);
	wake_up(&osb->checkpoint_event);
}
213 | |||
214 | static inline void ocfs2_checkpoint_inode(struct inode *inode) | ||
215 | { | ||
216 | struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); | ||
217 | |||
218 | if (!ocfs2_inode_fully_checkpointed(inode)) { | ||
219 | /* WARNING: This only kicks off a single | ||
220 | * checkpoint. If someone races you and adds more | ||
221 | * metadata to the journal, you won't know, and will | ||
222 | * wind up waiting *alot* longer than necessary. Right | ||
223 | * now we only use this in clear_inode so that's | ||
224 | * OK. */ | ||
225 | ocfs2_start_checkpoint(osb); | ||
226 | |||
227 | wait_event(osb->journal->j_checkpointed, | ||
228 | ocfs2_inode_fully_checkpointed(inode)); | ||
229 | } | ||
230 | } | ||
231 | |||
232 | /* | ||
233 | * Transaction Handling: | ||
234 | * Manage the lifetime of a transaction handle. | ||
235 | * | ||
236 | * ocfs2_alloc_handle - Only allocate a handle so we can start putting | ||
237 | * cluster locks on it. To actually change blocks, | ||
238 | * call ocfs2_start_trans with the handle returned | ||
239 | * from this function. You may call ocfs2_commit_trans | ||
240 | * at any time in the lifetime of a handle. | ||
241 | * ocfs2_start_trans - Begin a transaction. Give it an upper estimate of | ||
242 | * the number of blocks that will be changed during | ||
243 | * this handle. | ||
244 | * ocfs2_commit_trans - Complete a handle. | ||
245 | * ocfs2_extend_trans - Extend a handle by nblocks credits. This may | ||
246 | * commit the handle to disk in the process, but will | ||
247 | * not release any locks taken during the transaction. | ||
248 | * ocfs2_journal_access - Notify the handle that we want to journal this | ||
249 | * buffer. Will have to call ocfs2_journal_dirty once | ||
250 | * we've actually dirtied it. Type is one of the OCFS2_JOURNAL_ACCESS_* values. | ||
251 | * ocfs2_journal_dirty - Mark a journalled buffer as having dirty data. | ||
252 | * ocfs2_journal_dirty_data - Indicate that a data buffer should go out before | ||
253 | * the current handle commits. | ||
254 | * ocfs2_handle_add_lock - Sometimes we need to delay lock release | ||
255 | * until after a transaction has been completed. Use | ||
256 | * ocfs2_handle_add_lock to indicate that a lock needs | ||
257 | * to be released at the end of that handle. Locks | ||
258 | * will be released in the order that they are added. | ||
259 | * ocfs2_handle_add_inode - Add a locked inode to a transaction. | ||
260 | */ | ||
261 | |||
262 | /* You must always start_trans with a number of buffs > 0, but it's | ||
263 | * perfectly legal to go through an entire transaction without having | ||
264 | * dirtied any buffers. */ | ||
265 | struct ocfs2_journal_handle *ocfs2_alloc_handle(struct ocfs2_super *osb); | ||
266 | struct ocfs2_journal_handle *ocfs2_start_trans(struct ocfs2_super *osb, | ||
267 | struct ocfs2_journal_handle *handle, | ||
268 | int max_buffs); | ||
269 | void ocfs2_commit_trans(struct ocfs2_journal_handle *handle); | ||
270 | int ocfs2_extend_trans(struct ocfs2_journal_handle *handle, | ||
271 | int nblocks); | ||
272 | |||
273 | /* | ||
274 | * Create access is for when we get a newly created buffer and we're | ||
275 | * not gonna read it off disk, but rather fill it ourselves. Right | ||
276 | * now, we don't do anything special with this (it turns into a write | ||
277 | * request), but this is a good placeholder in case we do... | ||
278 | * | ||
279 | * Write access is for when we read a block off disk and are going to | ||
280 | * modify it. This way the journalling layer knows it may need to make | ||
281 | * a copy of that block (if it's part of another, uncommitted | ||
282 | * transaction) before we do so. | ||
283 | */ | ||
284 | #define OCFS2_JOURNAL_ACCESS_CREATE 0 | ||
285 | #define OCFS2_JOURNAL_ACCESS_WRITE 1 | ||
286 | #define OCFS2_JOURNAL_ACCESS_UNDO 2 | ||
287 | |||
288 | int ocfs2_journal_access(struct ocfs2_journal_handle *handle, | ||
289 | struct inode *inode, | ||
290 | struct buffer_head *bh, | ||
291 | int type); | ||
292 | /* | ||
293 | * A word about the journal_access/journal_dirty "dance". It is | ||
294 | * entirely legal to journal_access a buffer more than once (as long | ||
295 | * as the access type is the same -- I'm not sure what will happen if | ||
296 | * access type is different but this should never happen anyway) It is | ||
297 | * also legal to journal_dirty a buffer more than once. In fact, you | ||
298 | * can even journal_access a buffer after you've done a | ||
299 | * journal_access/journal_dirty pair. The only thing you cannot do | ||
300 | * however, is journal_dirty a buffer which you haven't yet passed to | ||
301 | * journal_access at least once. | ||
302 | * | ||
303 | * That said, 99% of the time this doesn't matter and this is what the | ||
304 | * path looks like: | ||
305 | * | ||
306 | * <read a bh> | ||
307 | * ocfs2_journal_access(handle, bh, OCFS2_JOURNAL_ACCESS_WRITE); | ||
308 | * <modify the bh> | ||
309 | * ocfs2_journal_dirty(handle, bh); | ||
310 | */ | ||
311 | int ocfs2_journal_dirty(struct ocfs2_journal_handle *handle, | ||
312 | struct buffer_head *bh); | ||
313 | int ocfs2_journal_dirty_data(handle_t *handle, | ||
314 | struct buffer_head *bh); | ||
315 | int ocfs2_handle_add_lock(struct ocfs2_journal_handle *handle, | ||
316 | struct inode *inode); | ||
317 | /* | ||
318 | * Use this to protect from other processes reading buffer state while | ||
319 | * it's in flight. | ||
320 | */ | ||
321 | void ocfs2_handle_add_inode(struct ocfs2_journal_handle *handle, | ||
322 | struct inode *inode); | ||
323 | |||
324 | /* | ||
325 | * Credit Macros: | ||
326 | * Convenience macros to calculate number of credits needed. | ||
327 | * | ||
328 | * For convenience sake, I have a set of macros here which calculate | ||
329 | * the *maximum* number of sectors which will be changed for various | ||
330 | * metadata updates. | ||
331 | */ | ||
332 | |||
333 | /* simple file updates like chmod, etc. */ | ||
334 | #define OCFS2_INODE_UPDATE_CREDITS 1 | ||
335 | |||
336 | /* get one bit out of a suballocator: dinode + group descriptor + | ||
337 | * prev. group desc. if we relink. */ | ||
338 | #define OCFS2_SUBALLOC_ALLOC (3) | ||
339 | |||
340 | /* dinode + group descriptor update. We don't relink on free yet. */ | ||
341 | #define OCFS2_SUBALLOC_FREE (2) | ||
342 | |||
343 | #define OCFS2_TRUNCATE_LOG_UPDATE OCFS2_INODE_UPDATE_CREDITS | ||
344 | #define OCFS2_TRUNCATE_LOG_FLUSH_ONE_REC (OCFS2_SUBALLOC_FREE \ | ||
345 | + OCFS2_TRUNCATE_LOG_UPDATE) | ||
346 | |||
347 | /* data block for new dir/symlink, 2 for bitmap updates (bitmap fe + | ||
348 | * bitmap block for the new bit) */ | ||
349 | #define OCFS2_DIR_LINK_ADDITIONAL_CREDITS (1 + 2) | ||
350 | |||
351 | /* parent fe, parent block, new file entry, inode alloc fe, inode alloc | ||
352 | * group descriptor + mkdir/symlink blocks */ | ||
353 | #define OCFS2_MKNOD_CREDITS (3 + OCFS2_SUBALLOC_ALLOC \ | ||
354 | + OCFS2_DIR_LINK_ADDITIONAL_CREDITS) | ||
355 | |||
356 | /* local alloc metadata change + main bitmap updates */ | ||
357 | #define OCFS2_WINDOW_MOVE_CREDITS (OCFS2_INODE_UPDATE_CREDITS \ | ||
358 | + OCFS2_SUBALLOC_ALLOC + OCFS2_SUBALLOC_FREE) | ||
359 | |||
360 | /* used when we don't need an allocation change for a dir extend. One | ||
361 | * for the dinode, one for the new block. */ | ||
362 | #define OCFS2_SIMPLE_DIR_EXTEND_CREDITS (2) | ||
363 | |||
364 | /* file update (nlink, etc) + dir entry block */ | ||
365 | #define OCFS2_LINK_CREDITS (OCFS2_INODE_UPDATE_CREDITS + 1) | ||
366 | |||
367 | /* inode + dir inode (if we unlink a dir), + dir entry block + orphan | ||
368 | * dir inode link */ | ||
369 | #define OCFS2_UNLINK_CREDITS (2 * OCFS2_INODE_UPDATE_CREDITS + 1 \ | ||
370 | + OCFS2_LINK_CREDITS) | ||
371 | |||
372 | /* dinode + orphan dir dinode + inode alloc dinode + orphan dir entry + | ||
373 | * inode alloc group descriptor */ | ||
374 | #define OCFS2_DELETE_INODE_CREDITS (3 * OCFS2_INODE_UPDATE_CREDITS + 1 + 1) | ||
375 | |||
376 | /* dinode update, old dir dinode update, new dir dinode update, old | ||
377 | * dir dir entry, new dir dir entry, dir entry update for renaming | ||
378 | * directory + target unlink */ | ||
379 | #define OCFS2_RENAME_CREDITS (3 * OCFS2_INODE_UPDATE_CREDITS + 3 \ | ||
380 | + OCFS2_UNLINK_CREDITS) | ||
381 | |||
382 | static inline int ocfs2_calc_extend_credits(struct super_block *sb, | ||
383 | struct ocfs2_dinode *fe, | ||
384 | u32 bits_wanted) | ||
385 | { | ||
386 | int bitmap_blocks, sysfile_bitmap_blocks, dinode_blocks; | ||
387 | |||
388 | /* bitmap dinode, group desc. + relinked group. */ | ||
389 | bitmap_blocks = OCFS2_SUBALLOC_ALLOC; | ||
390 | |||
391 | /* we might need to shift tree depth so lets assume an | ||
392 | * absolute worst case of complete fragmentation. Even with | ||
393 | * that, we only need one update for the dinode, and then | ||
394 | * however many metadata chunks needed * a remaining suballoc | ||
395 | * alloc. */ | ||
396 | sysfile_bitmap_blocks = 1 + | ||
397 | (OCFS2_SUBALLOC_ALLOC - 1) * ocfs2_extend_meta_needed(fe); | ||
398 | |||
399 | /* this does not include *new* metadata blocks, which are | ||
400 | * accounted for in sysfile_bitmap_blocks. fe + | ||
401 | * prev. last_eb_blk + blocks along edge of tree. | ||
402 | * calc_symlink_credits passes because we just need 1 | ||
403 | * credit for the dinode there. */ | ||
404 | dinode_blocks = 1 + 1 + le16_to_cpu(fe->id2.i_list.l_tree_depth); | ||
405 | |||
406 | return bitmap_blocks + sysfile_bitmap_blocks + dinode_blocks; | ||
407 | } | ||
408 | |||
409 | static inline int ocfs2_calc_symlink_credits(struct super_block *sb) | ||
410 | { | ||
411 | int blocks = OCFS2_MKNOD_CREDITS; | ||
412 | |||
413 | /* links can be longer than one block so we may update many | ||
414 | * within our single allocated extent. */ | ||
415 | blocks += ocfs2_clusters_to_blocks(sb, 1); | ||
416 | |||
417 | return blocks; | ||
418 | } | ||
419 | |||
420 | static inline int ocfs2_calc_group_alloc_credits(struct super_block *sb, | ||
421 | unsigned int cpg) | ||
422 | { | ||
423 | int blocks; | ||
424 | int bitmap_blocks = OCFS2_SUBALLOC_ALLOC + 1; | ||
425 | /* parent inode update + new block group header + bitmap inode update | ||
426 | + bitmap blocks affected */ | ||
427 | blocks = 1 + 1 + 1 + bitmap_blocks; | ||
428 | return blocks; | ||
429 | } | ||
430 | |||
431 | static inline int ocfs2_calc_tree_trunc_credits(struct super_block *sb, | ||
432 | unsigned int clusters_to_del, | ||
433 | struct ocfs2_dinode *fe, | ||
434 | struct ocfs2_extent_list *last_el) | ||
435 | { | ||
436 | /* for dinode + all headers in this pass + update to next leaf */ | ||
437 | u16 next_free = le16_to_cpu(last_el->l_next_free_rec); | ||
438 | u16 tree_depth = le16_to_cpu(fe->id2.i_list.l_tree_depth); | ||
439 | int credits = 1 + tree_depth + 1; | ||
440 | int i; | ||
441 | |||
442 | i = next_free - 1; | ||
443 | BUG_ON(i < 0); | ||
444 | |||
445 | /* We may be deleting metadata blocks, so metadata alloc dinode + | ||
446 | one desc. block for each possible delete. */ | ||
447 | if (tree_depth && next_free == 1 && | ||
448 | le32_to_cpu(last_el->l_recs[i].e_clusters) == clusters_to_del) | ||
449 | credits += 1 + tree_depth; | ||
450 | |||
451 | /* update to the truncate log. */ | ||
452 | credits += OCFS2_TRUNCATE_LOG_UPDATE; | ||
453 | |||
454 | return credits; | ||
455 | } | ||
456 | |||
457 | #endif /* OCFS2_JOURNAL_H */ | ||
diff --git a/fs/ocfs2/localalloc.c b/fs/ocfs2/localalloc.c new file mode 100644 index 000000000000..fe373a2101d9 --- /dev/null +++ b/fs/ocfs2/localalloc.c | |||
@@ -0,0 +1,983 @@ | |||
1 | /* -*- mode: c; c-basic-offset: 8; -*- | ||
2 | * vim: noexpandtab sw=8 ts=8 sts=0: | ||
3 | * | ||
4 | * localalloc.c | ||
5 | * | ||
6 | * Node local data allocation | ||
7 | * | ||
8 | * Copyright (C) 2002, 2004 Oracle. All rights reserved. | ||
9 | * | ||
10 | * This program is free software; you can redistribute it and/or | ||
11 | * modify it under the terms of the GNU General Public | ||
12 | * License as published by the Free Software Foundation; either | ||
13 | * version 2 of the License, or (at your option) any later version. | ||
14 | * | ||
15 | * This program is distributed in the hope that it will be useful, | ||
16 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
17 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | ||
18 | * General Public License for more details. | ||
19 | * | ||
20 | * You should have received a copy of the GNU General Public | ||
21 | * License along with this program; if not, write to the | ||
22 | * Free Software Foundation, Inc., 59 Temple Place - Suite 330, | ||
23 | * Boston, MA 02111-1307, USA. | ||
24 | */ | ||
25 | |||
26 | #include <linux/fs.h> | ||
27 | #include <linux/types.h> | ||
28 | #include <linux/slab.h> | ||
29 | #include <linux/highmem.h> | ||
30 | #include <linux/bitops.h> | ||
31 | |||
32 | #define MLOG_MASK_PREFIX ML_DISK_ALLOC | ||
33 | #include <cluster/masklog.h> | ||
34 | |||
35 | #include "ocfs2.h" | ||
36 | |||
37 | #include "alloc.h" | ||
38 | #include "dlmglue.h" | ||
39 | #include "inode.h" | ||
40 | #include "journal.h" | ||
41 | #include "localalloc.h" | ||
42 | #include "suballoc.h" | ||
43 | #include "super.h" | ||
44 | #include "sysfile.h" | ||
45 | |||
46 | #include "buffer_head_io.h" | ||
47 | |||
48 | #define OCFS2_LOCAL_ALLOC(dinode) (&((dinode)->id2.i_lab)) | ||
49 | |||
50 | static inline int ocfs2_local_alloc_window_bits(struct ocfs2_super *osb); | ||
51 | |||
52 | static u32 ocfs2_local_alloc_count_bits(struct ocfs2_dinode *alloc); | ||
53 | |||
54 | static int ocfs2_local_alloc_find_clear_bits(struct ocfs2_super *osb, | ||
55 | struct ocfs2_dinode *alloc, | ||
56 | u32 numbits); | ||
57 | |||
58 | static void ocfs2_clear_local_alloc(struct ocfs2_dinode *alloc); | ||
59 | |||
60 | static int ocfs2_sync_local_to_main(struct ocfs2_super *osb, | ||
61 | struct ocfs2_journal_handle *handle, | ||
62 | struct ocfs2_dinode *alloc, | ||
63 | struct inode *main_bm_inode, | ||
64 | struct buffer_head *main_bm_bh); | ||
65 | |||
66 | static int ocfs2_local_alloc_reserve_for_window(struct ocfs2_super *osb, | ||
67 | struct ocfs2_journal_handle *handle, | ||
68 | struct ocfs2_alloc_context **ac, | ||
69 | struct inode **bitmap_inode, | ||
70 | struct buffer_head **bitmap_bh); | ||
71 | |||
72 | static int ocfs2_local_alloc_new_window(struct ocfs2_super *osb, | ||
73 | struct ocfs2_journal_handle *handle, | ||
74 | struct ocfs2_alloc_context *ac); | ||
75 | |||
76 | static int ocfs2_local_alloc_slide_window(struct ocfs2_super *osb, | ||
77 | struct inode *local_alloc_inode); | ||
78 | |||
79 | /* | ||
80 | * Determine how large our local alloc window should be, in bits. | ||
81 | * | ||
82 | * These values (and the behavior in ocfs2_alloc_should_use_local) have | ||
83 | * been chosen so that most allocations, including new block groups go | ||
84 | * through local alloc. | ||
85 | */ | ||
86 | static inline int ocfs2_local_alloc_window_bits(struct ocfs2_super *osb) | ||
87 | { | ||
88 | BUG_ON(osb->s_clustersize_bits < 12); | ||
89 | |||
90 | return 2048 >> (osb->s_clustersize_bits - 12); | ||
91 | } | ||
92 | |||
93 | /* | ||
94 | * Tell us whether a given allocation should use the local alloc | ||
95 | * file. Otherwise, it has to go to the main bitmap. | ||
96 | */ | ||
97 | int ocfs2_alloc_should_use_local(struct ocfs2_super *osb, u64 bits) | ||
98 | { | ||
99 | int la_bits = ocfs2_local_alloc_window_bits(osb); | ||
100 | |||
101 | if (osb->local_alloc_state != OCFS2_LA_ENABLED) | ||
102 | return 0; | ||
103 | |||
104 | /* la_bits should be at least twice the size (in clusters) of | ||
105 | * a new block group. We want to be sure block group | ||
106 | * allocations go through the local alloc, so allow an | ||
107 | * allocation to take up to half the bitmap. */ | ||
108 | if (bits > (la_bits / 2)) | ||
109 | return 0; | ||
110 | |||
111 | return 1; | ||
112 | } | ||
113 | |||
114 | int ocfs2_load_local_alloc(struct ocfs2_super *osb) | ||
115 | { | ||
116 | int status = 0; | ||
117 | struct ocfs2_dinode *alloc = NULL; | ||
118 | struct buffer_head *alloc_bh = NULL; | ||
119 | u32 num_used; | ||
120 | struct inode *inode = NULL; | ||
121 | struct ocfs2_local_alloc *la; | ||
122 | |||
123 | mlog_entry_void(); | ||
124 | |||
125 | /* read the alloc off disk */ | ||
126 | inode = ocfs2_get_system_file_inode(osb, LOCAL_ALLOC_SYSTEM_INODE, | ||
127 | osb->slot_num); | ||
128 | if (!inode) { | ||
129 | status = -EINVAL; | ||
130 | mlog_errno(status); | ||
131 | goto bail; | ||
132 | } | ||
133 | |||
134 | status = ocfs2_read_block(osb, OCFS2_I(inode)->ip_blkno, | ||
135 | &alloc_bh, 0, inode); | ||
136 | if (status < 0) { | ||
137 | mlog_errno(status); | ||
138 | goto bail; | ||
139 | } | ||
140 | |||
141 | alloc = (struct ocfs2_dinode *) alloc_bh->b_data; | ||
142 | la = OCFS2_LOCAL_ALLOC(alloc); | ||
143 | |||
144 | if (!(le32_to_cpu(alloc->i_flags) & | ||
145 | (OCFS2_LOCAL_ALLOC_FL|OCFS2_BITMAP_FL))) { | ||
146 | mlog(ML_ERROR, "Invalid local alloc inode, %"MLFu64"\n", | ||
147 | OCFS2_I(inode)->ip_blkno); | ||
148 | status = -EINVAL; | ||
149 | goto bail; | ||
150 | } | ||
151 | |||
152 | if ((la->la_size == 0) || | ||
153 | (le16_to_cpu(la->la_size) > ocfs2_local_alloc_size(inode->i_sb))) { | ||
154 | mlog(ML_ERROR, "Local alloc size is invalid (la_size = %u)\n", | ||
155 | le16_to_cpu(la->la_size)); | ||
156 | status = -EINVAL; | ||
157 | goto bail; | ||
158 | } | ||
159 | |||
160 | /* do a little verification. */ | ||
161 | num_used = ocfs2_local_alloc_count_bits(alloc); | ||
162 | |||
163 | /* hopefully the local alloc has always been recovered before | ||
164 | * we load it. */ | ||
165 | if (num_used | ||
166 | || alloc->id1.bitmap1.i_used | ||
167 | || alloc->id1.bitmap1.i_total | ||
168 | || la->la_bm_off) | ||
169 | mlog(ML_ERROR, "Local alloc hasn't been recovered!\n" | ||
170 | "found = %u, set = %u, taken = %u, off = %u\n", | ||
171 | num_used, le32_to_cpu(alloc->id1.bitmap1.i_used), | ||
172 | le32_to_cpu(alloc->id1.bitmap1.i_total), | ||
173 | OCFS2_LOCAL_ALLOC(alloc)->la_bm_off); | ||
174 | |||
175 | osb->local_alloc_bh = alloc_bh; | ||
176 | osb->local_alloc_state = OCFS2_LA_ENABLED; | ||
177 | |||
178 | bail: | ||
179 | if (status < 0) | ||
180 | if (alloc_bh) | ||
181 | brelse(alloc_bh); | ||
182 | if (inode) | ||
183 | iput(inode); | ||
184 | |||
185 | mlog_exit(status); | ||
186 | return status; | ||
187 | } | ||
188 | |||
189 | /* | ||
190 | * return any unused bits to the bitmap and write out a clean | ||
191 | * local_alloc. | ||
192 | * | ||
193 | * local_alloc_bh is optional. If not passed, we will simply use the | ||
194 | * one off osb. If you do pass it however, be warned that it *will* be | ||
195 | * returned brelse'd and NULL'd out.*/ | ||
196 | void ocfs2_shutdown_local_alloc(struct ocfs2_super *osb) | ||
197 | { | ||
198 | int status; | ||
199 | struct ocfs2_journal_handle *handle = NULL; | ||
200 | struct inode *local_alloc_inode = NULL; | ||
201 | struct buffer_head *bh = NULL; | ||
202 | struct buffer_head *main_bm_bh = NULL; | ||
203 | struct inode *main_bm_inode = NULL; | ||
204 | struct ocfs2_dinode *alloc_copy = NULL; | ||
205 | struct ocfs2_dinode *alloc = NULL; | ||
206 | |||
207 | mlog_entry_void(); | ||
208 | |||
209 | if (osb->local_alloc_state == OCFS2_LA_UNUSED) | ||
210 | goto bail; | ||
211 | |||
212 | local_alloc_inode = | ||
213 | ocfs2_get_system_file_inode(osb, | ||
214 | LOCAL_ALLOC_SYSTEM_INODE, | ||
215 | osb->slot_num); | ||
216 | if (!local_alloc_inode) { | ||
217 | status = -ENOENT; | ||
218 | mlog_errno(status); | ||
219 | goto bail; | ||
220 | } | ||
221 | |||
222 | osb->local_alloc_state = OCFS2_LA_DISABLED; | ||
223 | |||
224 | handle = ocfs2_alloc_handle(osb); | ||
225 | if (!handle) { | ||
226 | status = -ENOMEM; | ||
227 | mlog_errno(status); | ||
228 | goto bail; | ||
229 | } | ||
230 | |||
231 | main_bm_inode = ocfs2_get_system_file_inode(osb, | ||
232 | GLOBAL_BITMAP_SYSTEM_INODE, | ||
233 | OCFS2_INVALID_SLOT); | ||
234 | if (!main_bm_inode) { | ||
235 | status = -EINVAL; | ||
236 | mlog_errno(status); | ||
237 | goto bail; | ||
238 | } | ||
239 | |||
240 | ocfs2_handle_add_inode(handle, main_bm_inode); | ||
241 | status = ocfs2_meta_lock(main_bm_inode, handle, &main_bm_bh, 1); | ||
242 | if (status < 0) { | ||
243 | mlog_errno(status); | ||
244 | goto bail; | ||
245 | } | ||
246 | |||
247 | /* WINDOW_MOVE_CREDITS is a bit heavy... */ | ||
248 | handle = ocfs2_start_trans(osb, handle, OCFS2_WINDOW_MOVE_CREDITS); | ||
249 | if (IS_ERR(handle)) { | ||
250 | mlog_errno(PTR_ERR(handle)); | ||
251 | handle = NULL; | ||
252 | goto bail; | ||
253 | } | ||
254 | |||
255 | bh = osb->local_alloc_bh; | ||
256 | alloc = (struct ocfs2_dinode *) bh->b_data; | ||
257 | |||
258 | alloc_copy = kmalloc(bh->b_size, GFP_KERNEL); | ||
259 | if (!alloc_copy) { | ||
260 | status = -ENOMEM; | ||
261 | goto bail; | ||
262 | } | ||
263 | memcpy(alloc_copy, alloc, bh->b_size); | ||
264 | |||
265 | status = ocfs2_journal_access(handle, local_alloc_inode, bh, | ||
266 | OCFS2_JOURNAL_ACCESS_WRITE); | ||
267 | if (status < 0) { | ||
268 | mlog_errno(status); | ||
269 | goto bail; | ||
270 | } | ||
271 | |||
272 | ocfs2_clear_local_alloc(alloc); | ||
273 | |||
274 | status = ocfs2_journal_dirty(handle, bh); | ||
275 | if (status < 0) { | ||
276 | mlog_errno(status); | ||
277 | goto bail; | ||
278 | } | ||
279 | |||
280 | brelse(bh); | ||
281 | osb->local_alloc_bh = NULL; | ||
282 | osb->local_alloc_state = OCFS2_LA_UNUSED; | ||
283 | |||
284 | status = ocfs2_sync_local_to_main(osb, handle, alloc_copy, | ||
285 | main_bm_inode, main_bm_bh); | ||
286 | if (status < 0) | ||
287 | mlog_errno(status); | ||
288 | |||
289 | bail: | ||
290 | if (handle) | ||
291 | ocfs2_commit_trans(handle); | ||
292 | |||
293 | if (main_bm_bh) | ||
294 | brelse(main_bm_bh); | ||
295 | |||
296 | if (main_bm_inode) | ||
297 | iput(main_bm_inode); | ||
298 | |||
299 | if (local_alloc_inode) | ||
300 | iput(local_alloc_inode); | ||
301 | |||
302 | if (alloc_copy) | ||
303 | kfree(alloc_copy); | ||
304 | |||
305 | mlog_exit_void(); | ||
306 | } | ||
307 | |||
308 | /* | ||
309 | * We want to free the bitmap bits outside of any recovery context as | ||
310 | * we'll need a cluster lock to do so, but we must clear the local | ||
311 | * alloc before giving up the recovered nodes journal. To solve this, | ||
312 | * we kmalloc a copy of the local alloc before it's change for the | ||
313 | * caller to process with ocfs2_complete_local_alloc_recovery | ||
314 | */ | ||
315 | int ocfs2_begin_local_alloc_recovery(struct ocfs2_super *osb, | ||
316 | int slot_num, | ||
317 | struct ocfs2_dinode **alloc_copy) | ||
318 | { | ||
319 | int status = 0; | ||
320 | struct buffer_head *alloc_bh = NULL; | ||
321 | struct inode *inode = NULL; | ||
322 | struct ocfs2_dinode *alloc; | ||
323 | |||
324 | mlog_entry("(slot_num = %d)\n", slot_num); | ||
325 | |||
326 | *alloc_copy = NULL; | ||
327 | |||
328 | inode = ocfs2_get_system_file_inode(osb, | ||
329 | LOCAL_ALLOC_SYSTEM_INODE, | ||
330 | slot_num); | ||
331 | if (!inode) { | ||
332 | status = -EINVAL; | ||
333 | mlog_errno(status); | ||
334 | goto bail; | ||
335 | } | ||
336 | |||
337 | down(&inode->i_sem); | ||
338 | |||
339 | status = ocfs2_read_block(osb, OCFS2_I(inode)->ip_blkno, | ||
340 | &alloc_bh, 0, inode); | ||
341 | if (status < 0) { | ||
342 | mlog_errno(status); | ||
343 | goto bail; | ||
344 | } | ||
345 | |||
346 | *alloc_copy = kmalloc(alloc_bh->b_size, GFP_KERNEL); | ||
347 | if (!(*alloc_copy)) { | ||
348 | status = -ENOMEM; | ||
349 | goto bail; | ||
350 | } | ||
351 | memcpy((*alloc_copy), alloc_bh->b_data, alloc_bh->b_size); | ||
352 | |||
353 | alloc = (struct ocfs2_dinode *) alloc_bh->b_data; | ||
354 | ocfs2_clear_local_alloc(alloc); | ||
355 | |||
356 | status = ocfs2_write_block(osb, alloc_bh, inode); | ||
357 | if (status < 0) | ||
358 | mlog_errno(status); | ||
359 | |||
360 | bail: | ||
361 | if ((status < 0) && (*alloc_copy)) { | ||
362 | kfree(*alloc_copy); | ||
363 | *alloc_copy = NULL; | ||
364 | } | ||
365 | |||
366 | if (alloc_bh) | ||
367 | brelse(alloc_bh); | ||
368 | |||
369 | if (inode) { | ||
370 | up(&inode->i_sem); | ||
371 | iput(inode); | ||
372 | } | ||
373 | |||
374 | mlog_exit(status); | ||
375 | return status; | ||
376 | } | ||
377 | |||
378 | /* | ||
379 | * Step 2: By now, we've completed the journal recovery, we've stamped | ||
380 | * a clean local alloc on disk and dropped the node out of the | ||
381 | * recovery map. Dlm locks will no longer stall, so lets clear out the | ||
382 | * main bitmap. | ||
383 | */ | ||
384 | int ocfs2_complete_local_alloc_recovery(struct ocfs2_super *osb, | ||
385 | struct ocfs2_dinode *alloc) | ||
386 | { | ||
387 | int status; | ||
388 | struct ocfs2_journal_handle *handle = NULL; | ||
389 | struct buffer_head *main_bm_bh = NULL; | ||
390 | struct inode *main_bm_inode = NULL; | ||
391 | |||
392 | mlog_entry_void(); | ||
393 | |||
394 | handle = ocfs2_alloc_handle(osb); | ||
395 | if (!handle) { | ||
396 | status = -ENOMEM; | ||
397 | mlog_errno(status); | ||
398 | goto bail; | ||
399 | } | ||
400 | |||
401 | main_bm_inode = ocfs2_get_system_file_inode(osb, | ||
402 | GLOBAL_BITMAP_SYSTEM_INODE, | ||
403 | OCFS2_INVALID_SLOT); | ||
404 | if (!main_bm_inode) { | ||
405 | status = -EINVAL; | ||
406 | mlog_errno(status); | ||
407 | goto bail; | ||
408 | } | ||
409 | |||
410 | ocfs2_handle_add_inode(handle, main_bm_inode); | ||
411 | status = ocfs2_meta_lock(main_bm_inode, handle, &main_bm_bh, 1); | ||
412 | if (status < 0) { | ||
413 | mlog_errno(status); | ||
414 | goto bail; | ||
415 | } | ||
416 | |||
417 | handle = ocfs2_start_trans(osb, handle, OCFS2_WINDOW_MOVE_CREDITS); | ||
418 | if (IS_ERR(handle)) { | ||
419 | status = PTR_ERR(handle); | ||
420 | handle = NULL; | ||
421 | mlog_errno(status); | ||
422 | goto bail; | ||
423 | } | ||
424 | |||
425 | /* we want the bitmap change to be recorded on disk asap */ | ||
426 | ocfs2_handle_set_sync(handle, 1); | ||
427 | |||
428 | status = ocfs2_sync_local_to_main(osb, handle, alloc, | ||
429 | main_bm_inode, main_bm_bh); | ||
430 | if (status < 0) | ||
431 | mlog_errno(status); | ||
432 | |||
433 | bail: | ||
434 | if (handle) | ||
435 | ocfs2_commit_trans(handle); | ||
436 | |||
437 | if (main_bm_bh) | ||
438 | brelse(main_bm_bh); | ||
439 | |||
440 | if (main_bm_inode) | ||
441 | iput(main_bm_inode); | ||
442 | |||
443 | mlog_exit(status); | ||
444 | return status; | ||
445 | } | ||
446 | |||
447 | /* | ||
448 | * make sure we've got at least bitswanted contiguous bits in the | ||
449 | * local alloc. You lose them when you drop i_sem. | ||
450 | * | ||
451 | * We will add ourselves to the transaction passed in, but may start | ||
452 | * our own in order to shift windows. | ||
453 | */ | ||
454 | int ocfs2_reserve_local_alloc_bits(struct ocfs2_super *osb, | ||
455 | struct ocfs2_journal_handle *passed_handle, | ||
456 | u32 bits_wanted, | ||
457 | struct ocfs2_alloc_context *ac) | ||
458 | { | ||
459 | int status; | ||
460 | struct ocfs2_dinode *alloc; | ||
461 | struct inode *local_alloc_inode; | ||
462 | unsigned int free_bits; | ||
463 | |||
464 | mlog_entry_void(); | ||
465 | |||
466 | BUG_ON(!passed_handle); | ||
467 | BUG_ON(!ac); | ||
468 | BUG_ON(passed_handle->flags & OCFS2_HANDLE_STARTED); | ||
469 | |||
470 | local_alloc_inode = | ||
471 | ocfs2_get_system_file_inode(osb, | ||
472 | LOCAL_ALLOC_SYSTEM_INODE, | ||
473 | osb->slot_num); | ||
474 | if (!local_alloc_inode) { | ||
475 | status = -ENOENT; | ||
476 | mlog_errno(status); | ||
477 | goto bail; | ||
478 | } | ||
479 | ocfs2_handle_add_inode(passed_handle, local_alloc_inode); | ||
480 | |||
481 | if (osb->local_alloc_state != OCFS2_LA_ENABLED) { | ||
482 | status = -ENOSPC; | ||
483 | goto bail; | ||
484 | } | ||
485 | |||
486 | if (bits_wanted > ocfs2_local_alloc_window_bits(osb)) { | ||
487 | mlog(0, "Asking for more than my max window size!\n"); | ||
488 | status = -ENOSPC; | ||
489 | goto bail; | ||
490 | } | ||
491 | |||
492 | alloc = (struct ocfs2_dinode *) osb->local_alloc_bh->b_data; | ||
493 | |||
494 | if (le32_to_cpu(alloc->id1.bitmap1.i_used) != | ||
495 | ocfs2_local_alloc_count_bits(alloc)) { | ||
496 | ocfs2_error(osb->sb, "local alloc inode %"MLFu64" says it has " | ||
497 | "%u free bits, but a count shows %u", | ||
498 | le64_to_cpu(alloc->i_blkno), | ||
499 | le32_to_cpu(alloc->id1.bitmap1.i_used), | ||
500 | ocfs2_local_alloc_count_bits(alloc)); | ||
501 | status = -EIO; | ||
502 | goto bail; | ||
503 | } | ||
504 | |||
505 | free_bits = le32_to_cpu(alloc->id1.bitmap1.i_total) - | ||
506 | le32_to_cpu(alloc->id1.bitmap1.i_used); | ||
507 | if (bits_wanted > free_bits) { | ||
508 | /* uhoh, window change time. */ | ||
509 | status = | ||
510 | ocfs2_local_alloc_slide_window(osb, local_alloc_inode); | ||
511 | if (status < 0) { | ||
512 | if (status != -ENOSPC) | ||
513 | mlog_errno(status); | ||
514 | goto bail; | ||
515 | } | ||
516 | } | ||
517 | |||
518 | ac->ac_inode = igrab(local_alloc_inode); | ||
519 | get_bh(osb->local_alloc_bh); | ||
520 | ac->ac_bh = osb->local_alloc_bh; | ||
521 | ac->ac_which = OCFS2_AC_USE_LOCAL; | ||
522 | status = 0; | ||
523 | bail: | ||
524 | if (local_alloc_inode) | ||
525 | iput(local_alloc_inode); | ||
526 | |||
527 | mlog_exit(status); | ||
528 | return status; | ||
529 | } | ||
530 | |||
531 | int ocfs2_claim_local_alloc_bits(struct ocfs2_super *osb, | ||
532 | struct ocfs2_journal_handle *handle, | ||
533 | struct ocfs2_alloc_context *ac, | ||
534 | u32 min_bits, | ||
535 | u32 *bit_off, | ||
536 | u32 *num_bits) | ||
537 | { | ||
538 | int status, start; | ||
539 | struct inode *local_alloc_inode; | ||
540 | u32 bits_wanted; | ||
541 | void *bitmap; | ||
542 | struct ocfs2_dinode *alloc; | ||
543 | struct ocfs2_local_alloc *la; | ||
544 | |||
545 | mlog_entry_void(); | ||
546 | BUG_ON(ac->ac_which != OCFS2_AC_USE_LOCAL); | ||
547 | |||
548 | bits_wanted = ac->ac_bits_wanted - ac->ac_bits_given; | ||
549 | local_alloc_inode = ac->ac_inode; | ||
550 | alloc = (struct ocfs2_dinode *) osb->local_alloc_bh->b_data; | ||
551 | la = OCFS2_LOCAL_ALLOC(alloc); | ||
552 | |||
553 | start = ocfs2_local_alloc_find_clear_bits(osb, alloc, bits_wanted); | ||
554 | if (start == -1) { | ||
555 | /* TODO: Shouldn't we just BUG here? */ | ||
556 | status = -ENOSPC; | ||
557 | mlog_errno(status); | ||
558 | goto bail; | ||
559 | } | ||
560 | |||
561 | bitmap = la->la_bitmap; | ||
562 | *bit_off = le32_to_cpu(la->la_bm_off) + start; | ||
563 | /* local alloc is always contiguous by nature -- we never | ||
564 | * delete bits from it! */ | ||
565 | *num_bits = bits_wanted; | ||
566 | |||
567 | status = ocfs2_journal_access(handle, local_alloc_inode, | ||
568 | osb->local_alloc_bh, | ||
569 | OCFS2_JOURNAL_ACCESS_WRITE); | ||
570 | if (status < 0) { | ||
571 | mlog_errno(status); | ||
572 | goto bail; | ||
573 | } | ||
574 | |||
575 | while(bits_wanted--) | ||
576 | ocfs2_set_bit(start++, bitmap); | ||
577 | |||
578 | alloc->id1.bitmap1.i_used = cpu_to_le32(*num_bits + | ||
579 | le32_to_cpu(alloc->id1.bitmap1.i_used)); | ||
580 | |||
581 | status = ocfs2_journal_dirty(handle, osb->local_alloc_bh); | ||
582 | if (status < 0) { | ||
583 | mlog_errno(status); | ||
584 | goto bail; | ||
585 | } | ||
586 | |||
587 | status = 0; | ||
588 | bail: | ||
589 | mlog_exit(status); | ||
590 | return status; | ||
591 | } | ||
592 | |||
593 | static u32 ocfs2_local_alloc_count_bits(struct ocfs2_dinode *alloc) | ||
594 | { | ||
595 | int i; | ||
596 | u8 *buffer; | ||
597 | u32 count = 0; | ||
598 | struct ocfs2_local_alloc *la = OCFS2_LOCAL_ALLOC(alloc); | ||
599 | |||
600 | mlog_entry_void(); | ||
601 | |||
602 | buffer = la->la_bitmap; | ||
603 | for (i = 0; i < le16_to_cpu(la->la_size); i++) | ||
604 | count += hweight8(buffer[i]); | ||
605 | |||
606 | mlog_exit(count); | ||
607 | return count; | ||
608 | } | ||
609 | |||
610 | static int ocfs2_local_alloc_find_clear_bits(struct ocfs2_super *osb, | ||
611 | struct ocfs2_dinode *alloc, | ||
612 | u32 numbits) | ||
613 | { | ||
614 | int numfound, bitoff, left, startoff, lastzero; | ||
615 | void *bitmap = NULL; | ||
616 | |||
617 | mlog_entry("(numbits wanted = %u)\n", numbits); | ||
618 | |||
619 | if (!alloc->id1.bitmap1.i_total) { | ||
620 | mlog(0, "No bits in my window!\n"); | ||
621 | bitoff = -1; | ||
622 | goto bail; | ||
623 | } | ||
624 | |||
625 | bitmap = OCFS2_LOCAL_ALLOC(alloc)->la_bitmap; | ||
626 | |||
627 | numfound = bitoff = startoff = 0; | ||
628 | lastzero = -1; | ||
629 | left = le32_to_cpu(alloc->id1.bitmap1.i_total); | ||
630 | while ((bitoff = ocfs2_find_next_zero_bit(bitmap, left, startoff)) != -1) { | ||
631 | if (bitoff == left) { | ||
632 | /* mlog(0, "bitoff (%d) == left", bitoff); */ | ||
633 | break; | ||
634 | } | ||
635 | /* mlog(0, "Found a zero: bitoff = %d, startoff = %d, " | ||
636 | "numfound = %d\n", bitoff, startoff, numfound);*/ | ||
637 | |||
638 | /* Ok, we found a zero bit... is it contig. or do we | ||
639 | * start over?*/ | ||
640 | if (bitoff == startoff) { | ||
641 | /* we found a zero */ | ||
642 | numfound++; | ||
643 | startoff++; | ||
644 | } else { | ||
645 | /* got a zero after some ones */ | ||
646 | numfound = 1; | ||
647 | startoff = bitoff+1; | ||
648 | } | ||
649 | /* we got everything we needed */ | ||
650 | if (numfound == numbits) { | ||
651 | /* mlog(0, "Found it all!\n"); */ | ||
652 | break; | ||
653 | } | ||
654 | } | ||
655 | |||
656 | mlog(0, "Exiting loop, bitoff = %d, numfound = %d\n", bitoff, | ||
657 | numfound); | ||
658 | |||
659 | if (numfound == numbits) | ||
660 | bitoff = startoff - numfound; | ||
661 | else | ||
662 | bitoff = -1; | ||
663 | |||
664 | bail: | ||
665 | mlog_exit(bitoff); | ||
666 | return bitoff; | ||
667 | } | ||
668 | |||
669 | static void ocfs2_clear_local_alloc(struct ocfs2_dinode *alloc) | ||
670 | { | ||
671 | struct ocfs2_local_alloc *la = OCFS2_LOCAL_ALLOC(alloc); | ||
672 | int i; | ||
673 | mlog_entry_void(); | ||
674 | |||
675 | alloc->id1.bitmap1.i_total = 0; | ||
676 | alloc->id1.bitmap1.i_used = 0; | ||
677 | la->la_bm_off = 0; | ||
678 | for(i = 0; i < le16_to_cpu(la->la_size); i++) | ||
679 | la->la_bitmap[i] = 0; | ||
680 | |||
681 | mlog_exit_void(); | ||
682 | } | ||
683 | |||
#if 0
/*
 * Debugging aid for window shifts, compiled out by default: BUG if any
 * of the count bits starting at start is set.  Bits are checked from
 * the highest one down.
 */
static void ocfs2_verify_zero_bits(unsigned long *bitmap,
				   unsigned int start,
				   unsigned int count)
{
	unsigned int idx;

	for (idx = count; idx-- > 0; ) {
		if (!ocfs2_test_bit(start + idx, bitmap))
			continue;

		printk("ocfs2_verify_zero_bits: start = %u, count = "
		       "%u\n", start, count);
		printk("ocfs2_verify_zero_bits: bit %u is set!",
		       start + idx);
		BUG();
	}
}
#endif
702 | |||
703 | /* | ||
704 | * sync the local alloc to main bitmap. | ||
705 | * | ||
706 | * assumes you've already locked the main bitmap -- the bitmap inode | ||
707 | * passed is used for caching. | ||
708 | */ | ||
709 | static int ocfs2_sync_local_to_main(struct ocfs2_super *osb, | ||
710 | struct ocfs2_journal_handle *handle, | ||
711 | struct ocfs2_dinode *alloc, | ||
712 | struct inode *main_bm_inode, | ||
713 | struct buffer_head *main_bm_bh) | ||
714 | { | ||
715 | int status = 0; | ||
716 | int bit_off, left, count, start; | ||
717 | u64 la_start_blk; | ||
718 | u64 blkno; | ||
719 | void *bitmap; | ||
720 | struct ocfs2_local_alloc *la = OCFS2_LOCAL_ALLOC(alloc); | ||
721 | |||
722 | mlog_entry("total = %u, COUNT = %u, used = %u\n", | ||
723 | le32_to_cpu(alloc->id1.bitmap1.i_total), | ||
724 | ocfs2_local_alloc_count_bits(alloc), | ||
725 | le32_to_cpu(alloc->id1.bitmap1.i_used)); | ||
726 | |||
727 | if (!alloc->id1.bitmap1.i_total) { | ||
728 | mlog(0, "nothing to sync!\n"); | ||
729 | goto bail; | ||
730 | } | ||
731 | |||
732 | if (le32_to_cpu(alloc->id1.bitmap1.i_used) == | ||
733 | le32_to_cpu(alloc->id1.bitmap1.i_total)) { | ||
734 | mlog(0, "all bits were taken!\n"); | ||
735 | goto bail; | ||
736 | } | ||
737 | |||
738 | la_start_blk = ocfs2_clusters_to_blocks(osb->sb, | ||
739 | le32_to_cpu(la->la_bm_off)); | ||
740 | bitmap = la->la_bitmap; | ||
741 | start = count = bit_off = 0; | ||
742 | left = le32_to_cpu(alloc->id1.bitmap1.i_total); | ||
743 | |||
744 | while ((bit_off = ocfs2_find_next_zero_bit(bitmap, left, start)) | ||
745 | != -1) { | ||
746 | if ((bit_off < left) && (bit_off == start)) { | ||
747 | count++; | ||
748 | start++; | ||
749 | continue; | ||
750 | } | ||
751 | if (count) { | ||
752 | blkno = la_start_blk + | ||
753 | ocfs2_clusters_to_blocks(osb->sb, | ||
754 | start - count); | ||
755 | |||
756 | mlog(0, "freeing %u bits starting at local " | ||
757 | "alloc bit %u (la_start_blk = %"MLFu64", " | ||
758 | "blkno = %"MLFu64")\n", count, start - count, | ||
759 | la_start_blk, blkno); | ||
760 | |||
761 | status = ocfs2_free_clusters(handle, main_bm_inode, | ||
762 | main_bm_bh, blkno, count); | ||
763 | if (status < 0) { | ||
764 | mlog_errno(status); | ||
765 | goto bail; | ||
766 | } | ||
767 | } | ||
768 | if (bit_off >= left) | ||
769 | break; | ||
770 | count = 1; | ||
771 | start = bit_off + 1; | ||
772 | } | ||
773 | |||
774 | bail: | ||
775 | mlog_exit(status); | ||
776 | return status; | ||
777 | } | ||
778 | |||
779 | static int ocfs2_local_alloc_reserve_for_window(struct ocfs2_super *osb, | ||
780 | struct ocfs2_journal_handle *handle, | ||
781 | struct ocfs2_alloc_context **ac, | ||
782 | struct inode **bitmap_inode, | ||
783 | struct buffer_head **bitmap_bh) | ||
784 | { | ||
785 | int status; | ||
786 | |||
787 | *ac = kcalloc(1, sizeof(struct ocfs2_alloc_context), GFP_KERNEL); | ||
788 | if (!(*ac)) { | ||
789 | status = -ENOMEM; | ||
790 | mlog_errno(status); | ||
791 | goto bail; | ||
792 | } | ||
793 | |||
794 | (*ac)->ac_handle = handle; | ||
795 | (*ac)->ac_bits_wanted = ocfs2_local_alloc_window_bits(osb); | ||
796 | |||
797 | status = ocfs2_reserve_cluster_bitmap_bits(osb, *ac); | ||
798 | if (status < 0) { | ||
799 | if (status != -ENOSPC) | ||
800 | mlog_errno(status); | ||
801 | goto bail; | ||
802 | } | ||
803 | |||
804 | *bitmap_inode = (*ac)->ac_inode; | ||
805 | igrab(*bitmap_inode); | ||
806 | *bitmap_bh = (*ac)->ac_bh; | ||
807 | get_bh(*bitmap_bh); | ||
808 | status = 0; | ||
809 | bail: | ||
810 | if ((status < 0) && *ac) { | ||
811 | ocfs2_free_alloc_context(*ac); | ||
812 | *ac = NULL; | ||
813 | } | ||
814 | |||
815 | mlog_exit(status); | ||
816 | return status; | ||
817 | } | ||
818 | |||
819 | /* | ||
820 | * pass it the bitmap lock in lock_bh if you have it. | ||
821 | */ | ||
822 | static int ocfs2_local_alloc_new_window(struct ocfs2_super *osb, | ||
823 | struct ocfs2_journal_handle *handle, | ||
824 | struct ocfs2_alloc_context *ac) | ||
825 | { | ||
826 | int status = 0; | ||
827 | u32 cluster_off, cluster_count; | ||
828 | struct ocfs2_dinode *alloc = NULL; | ||
829 | struct ocfs2_local_alloc *la; | ||
830 | |||
831 | mlog_entry_void(); | ||
832 | |||
833 | alloc = (struct ocfs2_dinode *) osb->local_alloc_bh->b_data; | ||
834 | la = OCFS2_LOCAL_ALLOC(alloc); | ||
835 | |||
836 | if (alloc->id1.bitmap1.i_total) | ||
837 | mlog(0, "asking me to alloc a new window over a non-empty " | ||
838 | "one\n"); | ||
839 | |||
840 | mlog(0, "Allocating %u clusters for a new window.\n", | ||
841 | ocfs2_local_alloc_window_bits(osb)); | ||
842 | /* we used the generic suballoc reserve function, but we set | ||
843 | * everything up nicely, so there's no reason why we can't use | ||
844 | * the more specific cluster api to claim bits. */ | ||
845 | status = ocfs2_claim_clusters(osb, handle, ac, | ||
846 | ocfs2_local_alloc_window_bits(osb), | ||
847 | &cluster_off, &cluster_count); | ||
848 | if (status < 0) { | ||
849 | if (status != -ENOSPC) | ||
850 | mlog_errno(status); | ||
851 | goto bail; | ||
852 | } | ||
853 | |||
854 | la->la_bm_off = cpu_to_le32(cluster_off); | ||
855 | alloc->id1.bitmap1.i_total = cpu_to_le32(cluster_count); | ||
856 | /* just in case... In the future when we find space ourselves, | ||
857 | * we don't have to get all contiguous -- but we'll have to | ||
858 | * set all previously used bits in bitmap and update | ||
859 | * la_bits_set before setting the bits in the main bitmap. */ | ||
860 | alloc->id1.bitmap1.i_used = 0; | ||
861 | memset(OCFS2_LOCAL_ALLOC(alloc)->la_bitmap, 0, | ||
862 | le16_to_cpu(la->la_size)); | ||
863 | |||
864 | mlog(0, "New window allocated:\n"); | ||
865 | mlog(0, "window la_bm_off = %u\n", | ||
866 | OCFS2_LOCAL_ALLOC(alloc)->la_bm_off); | ||
867 | mlog(0, "window bits = %u\n", le32_to_cpu(alloc->id1.bitmap1.i_total)); | ||
868 | |||
869 | bail: | ||
870 | mlog_exit(status); | ||
871 | return status; | ||
872 | } | ||
873 | |||
874 | /* Note that we do *NOT* lock the local alloc inode here as | ||
875 | * it's been locked already for us. */ | ||
876 | static int ocfs2_local_alloc_slide_window(struct ocfs2_super *osb, | ||
877 | struct inode *local_alloc_inode) | ||
878 | { | ||
879 | int status = 0; | ||
880 | struct buffer_head *main_bm_bh = NULL; | ||
881 | struct inode *main_bm_inode = NULL; | ||
882 | struct ocfs2_journal_handle *handle = NULL; | ||
883 | struct ocfs2_dinode *alloc; | ||
884 | struct ocfs2_dinode *alloc_copy = NULL; | ||
885 | struct ocfs2_alloc_context *ac = NULL; | ||
886 | |||
887 | mlog_entry_void(); | ||
888 | |||
889 | handle = ocfs2_alloc_handle(osb); | ||
890 | if (!handle) { | ||
891 | status = -ENOMEM; | ||
892 | mlog_errno(status); | ||
893 | goto bail; | ||
894 | } | ||
895 | |||
896 | /* This will lock the main bitmap for us. */ | ||
897 | status = ocfs2_local_alloc_reserve_for_window(osb, | ||
898 | handle, | ||
899 | &ac, | ||
900 | &main_bm_inode, | ||
901 | &main_bm_bh); | ||
902 | if (status < 0) { | ||
903 | if (status != -ENOSPC) | ||
904 | mlog_errno(status); | ||
905 | goto bail; | ||
906 | } | ||
907 | |||
908 | handle = ocfs2_start_trans(osb, handle, OCFS2_WINDOW_MOVE_CREDITS); | ||
909 | if (IS_ERR(handle)) { | ||
910 | status = PTR_ERR(handle); | ||
911 | handle = NULL; | ||
912 | mlog_errno(status); | ||
913 | goto bail; | ||
914 | } | ||
915 | |||
916 | alloc = (struct ocfs2_dinode *) osb->local_alloc_bh->b_data; | ||
917 | |||
918 | /* We want to clear the local alloc before doing anything | ||
919 | * else, so that if we error later during this operation, | ||
920 | * local alloc shutdown won't try to double free main bitmap | ||
921 | * bits. Make a copy so the sync function knows which bits to | ||
922 | * free. */ | ||
923 | alloc_copy = kmalloc(osb->local_alloc_bh->b_size, GFP_KERNEL); | ||
924 | if (!alloc_copy) { | ||
925 | status = -ENOMEM; | ||
926 | mlog_errno(status); | ||
927 | goto bail; | ||
928 | } | ||
929 | memcpy(alloc_copy, alloc, osb->local_alloc_bh->b_size); | ||
930 | |||
931 | status = ocfs2_journal_access(handle, local_alloc_inode, | ||
932 | osb->local_alloc_bh, | ||
933 | OCFS2_JOURNAL_ACCESS_WRITE); | ||
934 | if (status < 0) { | ||
935 | mlog_errno(status); | ||
936 | goto bail; | ||
937 | } | ||
938 | |||
939 | ocfs2_clear_local_alloc(alloc); | ||
940 | |||
941 | status = ocfs2_journal_dirty(handle, osb->local_alloc_bh); | ||
942 | if (status < 0) { | ||
943 | mlog_errno(status); | ||
944 | goto bail; | ||
945 | } | ||
946 | |||
947 | status = ocfs2_sync_local_to_main(osb, handle, alloc_copy, | ||
948 | main_bm_inode, main_bm_bh); | ||
949 | if (status < 0) { | ||
950 | mlog_errno(status); | ||
951 | goto bail; | ||
952 | } | ||
953 | |||
954 | status = ocfs2_local_alloc_new_window(osb, handle, ac); | ||
955 | if (status < 0) { | ||
956 | if (status != -ENOSPC) | ||
957 | mlog_errno(status); | ||
958 | goto bail; | ||
959 | } | ||
960 | |||
961 | atomic_inc(&osb->alloc_stats.moves); | ||
962 | |||
963 | status = 0; | ||
964 | bail: | ||
965 | if (handle) | ||
966 | ocfs2_commit_trans(handle); | ||
967 | |||
968 | if (main_bm_bh) | ||
969 | brelse(main_bm_bh); | ||
970 | |||
971 | if (main_bm_inode) | ||
972 | iput(main_bm_inode); | ||
973 | |||
974 | if (alloc_copy) | ||
975 | kfree(alloc_copy); | ||
976 | |||
977 | if (ac) | ||
978 | ocfs2_free_alloc_context(ac); | ||
979 | |||
980 | mlog_exit(status); | ||
981 | return status; | ||
982 | } | ||
983 | |||
diff --git a/fs/ocfs2/localalloc.h b/fs/ocfs2/localalloc.h new file mode 100644 index 000000000000..30f88ce14e46 --- /dev/null +++ b/fs/ocfs2/localalloc.h | |||
@@ -0,0 +1,56 @@ | |||
1 | /* -*- mode: c; c-basic-offset: 8; -*- | ||
2 | * vim: noexpandtab sw=8 ts=8 sts=0: | ||
3 | * | ||
4 | * localalloc.h | ||
5 | * | ||
6 | * Function prototypes | ||
7 | * | ||
8 | * Copyright (C) 2002, 2004 Oracle. All rights reserved. | ||
9 | * | ||
10 | * This program is free software; you can redistribute it and/or | ||
11 | * modify it under the terms of the GNU General Public | ||
12 | * License as published by the Free Software Foundation; either | ||
13 | * version 2 of the License, or (at your option) any later version. | ||
14 | * | ||
15 | * This program is distributed in the hope that it will be useful, | ||
16 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
17 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | ||
18 | * General Public License for more details. | ||
19 | * | ||
20 | * You should have received a copy of the GNU General Public | ||
21 | * License along with this program; if not, write to the | ||
22 | * Free Software Foundation, Inc., 59 Temple Place - Suite 330, | ||
23 | * Boston, MA 02111-1307, USA. | ||
24 | */ | ||
25 | |||
26 | #ifndef OCFS2_LOCALALLOC_H | ||
27 | #define OCFS2_LOCALALLOC_H | ||
28 | |||
29 | int ocfs2_load_local_alloc(struct ocfs2_super *osb); | ||
30 | |||
31 | void ocfs2_shutdown_local_alloc(struct ocfs2_super *osb); | ||
32 | |||
33 | int ocfs2_begin_local_alloc_recovery(struct ocfs2_super *osb, | ||
34 | int node_num, | ||
35 | struct ocfs2_dinode **alloc_copy); | ||
36 | |||
37 | int ocfs2_complete_local_alloc_recovery(struct ocfs2_super *osb, | ||
38 | struct ocfs2_dinode *alloc); | ||
39 | |||
40 | int ocfs2_alloc_should_use_local(struct ocfs2_super *osb, | ||
41 | u64 bits); | ||
42 | |||
43 | struct ocfs2_alloc_context; | ||
44 | int ocfs2_reserve_local_alloc_bits(struct ocfs2_super *osb, | ||
45 | struct ocfs2_journal_handle *passed_handle, | ||
46 | u32 bits_wanted, | ||
47 | struct ocfs2_alloc_context *ac); | ||
48 | |||
49 | int ocfs2_claim_local_alloc_bits(struct ocfs2_super *osb, | ||
50 | struct ocfs2_journal_handle *handle, | ||
51 | struct ocfs2_alloc_context *ac, | ||
52 | u32 min_bits, | ||
53 | u32 *bit_off, | ||
54 | u32 *num_bits); | ||
55 | |||
56 | #endif /* OCFS2_LOCALALLOC_H */ | ||
diff --git a/fs/ocfs2/mmap.c b/fs/ocfs2/mmap.c new file mode 100644 index 000000000000..afdeec4b0eef --- /dev/null +++ b/fs/ocfs2/mmap.c | |||
@@ -0,0 +1,102 @@ | |||
1 | /* -*- mode: c; c-basic-offset: 8; -*- | ||
2 | * vim: noexpandtab sw=8 ts=8 sts=0: | ||
3 | * | ||
4 | * mmap.c | ||
5 | * | ||
6 | * Code to deal with the mess that is clustered mmap. | ||
7 | * | ||
8 | * Copyright (C) 2002, 2004 Oracle. All rights reserved. | ||
9 | * | ||
10 | * This program is free software; you can redistribute it and/or | ||
11 | * modify it under the terms of the GNU General Public | ||
12 | * License as published by the Free Software Foundation; either | ||
13 | * version 2 of the License, or (at your option) any later version. | ||
14 | * | ||
15 | * This program is distributed in the hope that it will be useful, | ||
16 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
17 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | ||
18 | * General Public License for more details. | ||
19 | * | ||
20 | * You should have received a copy of the GNU General Public | ||
21 | * License along with this program; if not, write to the | ||
22 | * Free Software Foundation, Inc., 59 Temple Place - Suite 330, | ||
23 | * Boston, MA 02111-1307, USA. | ||
24 | */ | ||
25 | |||
26 | #include <linux/fs.h> | ||
27 | #include <linux/types.h> | ||
28 | #include <linux/slab.h> | ||
29 | #include <linux/highmem.h> | ||
30 | #include <linux/pagemap.h> | ||
31 | #include <linux/uio.h> | ||
32 | #include <linux/signal.h> | ||
33 | #include <linux/rbtree.h> | ||
34 | |||
35 | #define MLOG_MASK_PREFIX ML_FILE_IO | ||
36 | #include <cluster/masklog.h> | ||
37 | |||
38 | #include "ocfs2.h" | ||
39 | |||
40 | #include "dlmglue.h" | ||
41 | #include "file.h" | ||
42 | #include "inode.h" | ||
43 | #include "mmap.h" | ||
44 | |||
45 | static struct page *ocfs2_nopage(struct vm_area_struct * area, | ||
46 | unsigned long address, | ||
47 | int *type) | ||
48 | { | ||
49 | struct inode *inode = area->vm_file->f_dentry->d_inode; | ||
50 | struct page *page = NOPAGE_SIGBUS; | ||
51 | sigset_t blocked, oldset; | ||
52 | int ret; | ||
53 | |||
54 | mlog_entry("(inode %lu, address %lu)\n", inode->i_ino, address); | ||
55 | |||
56 | /* The best way to deal with signals in this path is | ||
57 | * to block them upfront, rather than allowing the | ||
58 | * locking paths to return -ERESTARTSYS. */ | ||
59 | sigfillset(&blocked); | ||
60 | |||
61 | /* We should technically never get a bad ret return | ||
62 | * from sigprocmask */ | ||
63 | ret = sigprocmask(SIG_BLOCK, &blocked, &oldset); | ||
64 | if (ret < 0) { | ||
65 | mlog_errno(ret); | ||
66 | goto out; | ||
67 | } | ||
68 | |||
69 | page = filemap_nopage(area, address, type); | ||
70 | |||
71 | ret = sigprocmask(SIG_SETMASK, &oldset, NULL); | ||
72 | if (ret < 0) | ||
73 | mlog_errno(ret); | ||
74 | out: | ||
75 | mlog_exit_ptr(page); | ||
76 | return page; | ||
77 | } | ||
78 | |||
79 | static struct vm_operations_struct ocfs2_file_vm_ops = { | ||
80 | .nopage = ocfs2_nopage, | ||
81 | }; | ||
82 | |||
83 | int ocfs2_mmap(struct file *file, | ||
84 | struct vm_area_struct *vma) | ||
85 | { | ||
86 | struct address_space *mapping = file->f_dentry->d_inode->i_mapping; | ||
87 | struct inode *inode = mapping->host; | ||
88 | |||
89 | /* We don't want to support shared writable mappings yet. */ | ||
90 | if (((vma->vm_flags & VM_SHARED) || (vma->vm_flags & VM_MAYSHARE)) | ||
91 | && ((vma->vm_flags & VM_WRITE) || (vma->vm_flags & VM_MAYWRITE))) { | ||
92 | mlog(0, "disallow shared writable mmaps %lx\n", vma->vm_flags); | ||
93 | /* This is -EINVAL because generic_file_readonly_mmap | ||
94 | * returns it in a similar situation. */ | ||
95 | return -EINVAL; | ||
96 | } | ||
97 | |||
98 | update_atime(inode); | ||
99 | vma->vm_ops = &ocfs2_file_vm_ops; | ||
100 | return 0; | ||
101 | } | ||
102 | |||
diff --git a/fs/ocfs2/mmap.h b/fs/ocfs2/mmap.h new file mode 100644 index 000000000000..1274ee0f1fe2 --- /dev/null +++ b/fs/ocfs2/mmap.h | |||
@@ -0,0 +1,6 @@ | |||
1 | #ifndef OCFS2_MMAP_H | ||
2 | #define OCFS2_MMAP_H | ||
3 | |||
4 | int ocfs2_mmap(struct file *file, struct vm_area_struct *vma); | ||
5 | |||
6 | #endif /* OCFS2_MMAP_H */ | ||
diff --git a/fs/ocfs2/namei.c b/fs/ocfs2/namei.c new file mode 100644 index 000000000000..f6b77ff1d2bf --- /dev/null +++ b/fs/ocfs2/namei.c | |||
@@ -0,0 +1,2264 @@ | |||
1 | /* -*- mode: c; c-basic-offset: 8; -*- | ||
2 | * vim: noexpandtab sw=8 ts=8 sts=0: | ||
3 | * | ||
4 | * namei.c | ||
5 | * | ||
6 | * Create and rename file, directory, symlinks | ||
7 | * | ||
8 | * Copyright (C) 2002, 2004 Oracle. All rights reserved. | ||
9 | * | ||
10 | * Portions of this code from linux/fs/ext3/dir.c | ||
11 | * | ||
12 | * Copyright (C) 1992, 1993, 1994, 1995 | ||
13 | * Remy Card (card@masi.ibp.fr) | ||
14 | * Laboratoire MASI - Institut Blaise Pascal | ||
15 | * Universite Pierre et Marie Curie (Paris VI) | ||
16 | * | ||
17 | * from | ||
18 | * | ||
19 | * linux/fs/minix/dir.c | ||
20 | * | ||
21 | * Copyright (C) 1991, 1992 Linus Torvalds | ||
22 | * | ||
23 | * This program is free software; you can redistribute it and/or | ||
24 | * modify it under the terms of the GNU General Public | ||
25 | * License as published by the Free Software Foundation; either | ||
26 | * version 2 of the License, or (at your option) any later version. | ||
27 | * | ||
28 | * This program is distributed in the hope that it will be useful, | ||
29 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
30 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | ||
31 | * General Public License for more details. | ||
32 | * | ||
33 | * You should have received a copy of the GNU General Public | ||
34 | * License along with this program; if not, write to the | ||
35 | * Free Software Foundation, Inc., 59 Temple Place - Suite 330, | ||
36 | * Boston, MA 02111-1307, USA. | ||
37 | */ | ||
38 | |||
39 | #include <linux/fs.h> | ||
40 | #include <linux/types.h> | ||
41 | #include <linux/slab.h> | ||
42 | #include <linux/highmem.h> | ||
43 | |||
44 | #define MLOG_MASK_PREFIX ML_NAMEI | ||
45 | #include <cluster/masklog.h> | ||
46 | |||
47 | #include "ocfs2.h" | ||
48 | |||
49 | #include "alloc.h" | ||
50 | #include "dcache.h" | ||
51 | #include "dir.h" | ||
52 | #include "dlmglue.h" | ||
53 | #include "extent_map.h" | ||
54 | #include "file.h" | ||
55 | #include "inode.h" | ||
56 | #include "journal.h" | ||
57 | #include "namei.h" | ||
58 | #include "suballoc.h" | ||
59 | #include "symlink.h" | ||
60 | #include "sysfile.h" | ||
61 | #include "uptodate.h" | ||
62 | #include "vote.h" | ||
63 | |||
64 | #include "buffer_head_io.h" | ||
65 | |||
66 | #define NAMEI_RA_CHUNKS 2 | ||
67 | #define NAMEI_RA_BLOCKS 4 | ||
68 | #define NAMEI_RA_SIZE (NAMEI_RA_CHUNKS * NAMEI_RA_BLOCKS) | ||
69 | #define NAMEI_RA_INDEX(c,b) (((c) * NAMEI_RA_BLOCKS) + (b)) | ||
70 | |||
71 | static int inline ocfs2_search_dirblock(struct buffer_head *bh, | ||
72 | struct inode *dir, | ||
73 | const char *name, int namelen, | ||
74 | unsigned long offset, | ||
75 | struct ocfs2_dir_entry **res_dir); | ||
76 | |||
77 | static int ocfs2_delete_entry(struct ocfs2_journal_handle *handle, | ||
78 | struct inode *dir, | ||
79 | struct ocfs2_dir_entry *de_del, | ||
80 | struct buffer_head *bh); | ||
81 | |||
82 | static int __ocfs2_add_entry(struct ocfs2_journal_handle *handle, | ||
83 | struct inode *dir, | ||
84 | const char *name, int namelen, | ||
85 | struct inode *inode, u64 blkno, | ||
86 | struct buffer_head *parent_fe_bh, | ||
87 | struct buffer_head *insert_bh); | ||
88 | |||
89 | static int ocfs2_mknod_locked(struct ocfs2_super *osb, | ||
90 | struct inode *dir, | ||
91 | struct dentry *dentry, int mode, | ||
92 | dev_t dev, | ||
93 | struct buffer_head **new_fe_bh, | ||
94 | struct buffer_head *parent_fe_bh, | ||
95 | struct ocfs2_journal_handle *handle, | ||
96 | struct inode **ret_inode, | ||
97 | struct ocfs2_alloc_context *inode_ac); | ||
98 | |||
99 | static int ocfs2_fill_new_dir(struct ocfs2_super *osb, | ||
100 | struct ocfs2_journal_handle *handle, | ||
101 | struct inode *parent, | ||
102 | struct inode *inode, | ||
103 | struct buffer_head *fe_bh, | ||
104 | struct ocfs2_alloc_context *data_ac); | ||
105 | |||
106 | static int ocfs2_double_lock(struct ocfs2_super *osb, | ||
107 | struct ocfs2_journal_handle *handle, | ||
108 | struct buffer_head **bh1, | ||
109 | struct inode *inode1, | ||
110 | struct buffer_head **bh2, | ||
111 | struct inode *inode2); | ||
112 | |||
113 | static int ocfs2_prepare_orphan_dir(struct ocfs2_super *osb, | ||
114 | struct ocfs2_journal_handle *handle, | ||
115 | struct inode *inode, | ||
116 | char *name, | ||
117 | struct buffer_head **de_bh); | ||
118 | |||
119 | static int ocfs2_orphan_add(struct ocfs2_super *osb, | ||
120 | struct ocfs2_journal_handle *handle, | ||
121 | struct inode *inode, | ||
122 | struct ocfs2_dinode *fe, | ||
123 | char *name, | ||
124 | struct buffer_head *de_bh); | ||
125 | |||
126 | static int ocfs2_create_symlink_data(struct ocfs2_super *osb, | ||
127 | struct ocfs2_journal_handle *handle, | ||
128 | struct inode *inode, | ||
129 | const char *symname); | ||
130 | |||
131 | static inline int ocfs2_add_entry(struct ocfs2_journal_handle *handle, | ||
132 | struct dentry *dentry, | ||
133 | struct inode *inode, u64 blkno, | ||
134 | struct buffer_head *parent_fe_bh, | ||
135 | struct buffer_head *insert_bh) | ||
136 | { | ||
137 | return __ocfs2_add_entry(handle, dentry->d_parent->d_inode, | ||
138 | dentry->d_name.name, dentry->d_name.len, | ||
139 | inode, blkno, parent_fe_bh, insert_bh); | ||
140 | } | ||
141 | |||
142 | /* An orphan dir name is an 8 byte value, printed as a hex string */ | ||
143 | #define OCFS2_ORPHAN_NAMELEN ((int)(2 * sizeof(u64))) | ||
144 | |||
145 | static struct dentry *ocfs2_lookup(struct inode *dir, struct dentry *dentry, | ||
146 | struct nameidata *nd) | ||
147 | { | ||
148 | int status; | ||
149 | u64 blkno; | ||
150 | struct buffer_head *dirent_bh = NULL; | ||
151 | struct inode *inode = NULL; | ||
152 | struct dentry *ret; | ||
153 | struct ocfs2_dir_entry *dirent; | ||
154 | struct ocfs2_inode_info *oi; | ||
155 | |||
156 | mlog_entry("(0x%p, 0x%p, '%.*s')\n", dir, dentry, | ||
157 | dentry->d_name.len, dentry->d_name.name); | ||
158 | |||
159 | if (dentry->d_name.len > OCFS2_MAX_FILENAME_LEN) { | ||
160 | ret = ERR_PTR(-ENAMETOOLONG); | ||
161 | goto bail; | ||
162 | } | ||
163 | |||
164 | mlog(0, "find name %.*s in directory %"MLFu64"\n", dentry->d_name.len, | ||
165 | dentry->d_name.name, OCFS2_I(dir)->ip_blkno); | ||
166 | |||
167 | status = ocfs2_meta_lock(dir, NULL, NULL, 0); | ||
168 | if (status < 0) { | ||
169 | if (status != -ENOENT) | ||
170 | mlog_errno(status); | ||
171 | ret = ERR_PTR(status); | ||
172 | goto bail; | ||
173 | } | ||
174 | |||
175 | status = ocfs2_find_files_on_disk(dentry->d_name.name, | ||
176 | dentry->d_name.len, &blkno, | ||
177 | dir, &dirent_bh, &dirent); | ||
178 | if (status < 0) | ||
179 | goto bail_add; | ||
180 | |||
181 | inode = ocfs2_iget(OCFS2_SB(dir->i_sb), blkno); | ||
182 | if (IS_ERR(inode)) { | ||
183 | mlog(ML_ERROR, "Unable to create inode %"MLFu64"\n", blkno); | ||
184 | ret = ERR_PTR(-EACCES); | ||
185 | goto bail_unlock; | ||
186 | } | ||
187 | |||
188 | oi = OCFS2_I(inode); | ||
189 | /* Clear any orphaned state... If we were able to look up the | ||
190 | * inode from a directory, it certainly can't be orphaned. We | ||
191 | * might have the bad state from a node which intended to | ||
192 | * orphan this inode but crashed before it could commit the | ||
193 | * unlink. */ | ||
194 | spin_lock(&oi->ip_lock); | ||
195 | oi->ip_flags &= ~OCFS2_INODE_MAYBE_ORPHANED; | ||
196 | oi->ip_orphaned_slot = OCFS2_INVALID_SLOT; | ||
197 | spin_unlock(&oi->ip_lock); | ||
198 | |||
199 | bail_add: | ||
200 | |||
201 | dentry->d_op = &ocfs2_dentry_ops; | ||
202 | ret = d_splice_alias(inode, dentry); | ||
203 | |||
204 | bail_unlock: | ||
205 | /* Don't drop the cluster lock until *after* the d_add -- | ||
206 | * unlink on another node will message us to remove that | ||
207 | * dentry under this lock so otherwise we can race this with | ||
208 | * the vote thread and have a stale dentry. */ | ||
209 | ocfs2_meta_unlock(dir, 0); | ||
210 | |||
211 | bail: | ||
212 | if (dirent_bh) | ||
213 | brelse(dirent_bh); | ||
214 | |||
215 | mlog_exit_ptr(ret); | ||
216 | |||
217 | return ret; | ||
218 | } | ||
219 | |||
220 | static int ocfs2_fill_new_dir(struct ocfs2_super *osb, | ||
221 | struct ocfs2_journal_handle *handle, | ||
222 | struct inode *parent, | ||
223 | struct inode *inode, | ||
224 | struct buffer_head *fe_bh, | ||
225 | struct ocfs2_alloc_context *data_ac) | ||
226 | { | ||
227 | int status; | ||
228 | struct buffer_head *new_bh = NULL; | ||
229 | struct ocfs2_dir_entry *de = NULL; | ||
230 | |||
231 | mlog_entry_void(); | ||
232 | |||
233 | status = ocfs2_do_extend_dir(osb->sb, handle, inode, fe_bh, | ||
234 | data_ac, NULL, &new_bh); | ||
235 | if (status < 0) { | ||
236 | mlog_errno(status); | ||
237 | goto bail; | ||
238 | } | ||
239 | |||
240 | ocfs2_set_new_buffer_uptodate(inode, new_bh); | ||
241 | |||
242 | status = ocfs2_journal_access(handle, inode, new_bh, | ||
243 | OCFS2_JOURNAL_ACCESS_CREATE); | ||
244 | if (status < 0) { | ||
245 | mlog_errno(status); | ||
246 | goto bail; | ||
247 | } | ||
248 | memset(new_bh->b_data, 0, osb->sb->s_blocksize); | ||
249 | |||
250 | de = (struct ocfs2_dir_entry *) new_bh->b_data; | ||
251 | de->inode = cpu_to_le64(OCFS2_I(inode)->ip_blkno); | ||
252 | de->name_len = 1; | ||
253 | de->rec_len = | ||
254 | cpu_to_le16(OCFS2_DIR_REC_LEN(de->name_len)); | ||
255 | strcpy(de->name, "."); | ||
256 | ocfs2_set_de_type(de, S_IFDIR); | ||
257 | de = (struct ocfs2_dir_entry *) ((char *)de + le16_to_cpu(de->rec_len)); | ||
258 | de->inode = cpu_to_le64(OCFS2_I(parent)->ip_blkno); | ||
259 | de->rec_len = cpu_to_le16(inode->i_sb->s_blocksize - | ||
260 | OCFS2_DIR_REC_LEN(1)); | ||
261 | de->name_len = 2; | ||
262 | strcpy(de->name, ".."); | ||
263 | ocfs2_set_de_type(de, S_IFDIR); | ||
264 | |||
265 | status = ocfs2_journal_dirty(handle, new_bh); | ||
266 | if (status < 0) { | ||
267 | mlog_errno(status); | ||
268 | goto bail; | ||
269 | } | ||
270 | |||
271 | i_size_write(inode, inode->i_sb->s_blocksize); | ||
272 | inode->i_nlink = 2; | ||
273 | inode->i_blocks = ocfs2_align_bytes_to_sectors(inode->i_sb->s_blocksize); | ||
274 | status = ocfs2_mark_inode_dirty(handle, inode, fe_bh); | ||
275 | if (status < 0) { | ||
276 | mlog_errno(status); | ||
277 | goto bail; | ||
278 | } | ||
279 | |||
280 | status = 0; | ||
281 | bail: | ||
282 | if (new_bh) | ||
283 | brelse(new_bh); | ||
284 | |||
285 | mlog_exit(status); | ||
286 | return status; | ||
287 | } | ||
288 | |||
289 | static int ocfs2_mknod(struct inode *dir, | ||
290 | struct dentry *dentry, | ||
291 | int mode, | ||
292 | dev_t dev) | ||
293 | { | ||
294 | int status = 0; | ||
295 | struct buffer_head *parent_fe_bh = NULL; | ||
296 | struct ocfs2_journal_handle *handle = NULL; | ||
297 | struct ocfs2_super *osb; | ||
298 | struct ocfs2_dinode *dirfe; | ||
299 | struct buffer_head *new_fe_bh = NULL; | ||
300 | struct buffer_head *de_bh = NULL; | ||
301 | struct inode *inode = NULL; | ||
302 | struct ocfs2_alloc_context *inode_ac = NULL; | ||
303 | struct ocfs2_alloc_context *data_ac = NULL; | ||
304 | |||
305 | mlog_entry("(0x%p, 0x%p, %d, %lu, '%.*s')\n", dir, dentry, mode, | ||
306 | (unsigned long)dev, dentry->d_name.len, | ||
307 | dentry->d_name.name); | ||
308 | |||
309 | /* get our super block */ | ||
310 | osb = OCFS2_SB(dir->i_sb); | ||
311 | |||
312 | if (S_ISDIR(mode) && (dir->i_nlink >= OCFS2_LINK_MAX)) { | ||
313 | mlog(ML_ERROR, "inode %"MLFu64" has i_nlink of %u\n", | ||
314 | OCFS2_I(dir)->ip_blkno, dir->i_nlink); | ||
315 | status = -EMLINK; | ||
316 | goto leave; | ||
317 | } | ||
318 | |||
319 | handle = ocfs2_alloc_handle(osb); | ||
320 | if (handle == NULL) { | ||
321 | status = -ENOMEM; | ||
322 | mlog_errno(status); | ||
323 | goto leave; | ||
324 | } | ||
325 | |||
326 | status = ocfs2_meta_lock(dir, handle, &parent_fe_bh, 1); | ||
327 | if (status < 0) { | ||
328 | if (status != -ENOENT) | ||
329 | mlog_errno(status); | ||
330 | goto leave; | ||
331 | } | ||
332 | |||
333 | dirfe = (struct ocfs2_dinode *) parent_fe_bh->b_data; | ||
334 | if (!dirfe->i_links_count) { | ||
335 | /* can't make a file in a deleted directory. */ | ||
336 | status = -ENOENT; | ||
337 | goto leave; | ||
338 | } | ||
339 | |||
340 | status = ocfs2_check_dir_for_entry(dir, dentry->d_name.name, | ||
341 | dentry->d_name.len); | ||
342 | if (status) | ||
343 | goto leave; | ||
344 | |||
345 | /* get a spot inside the dir. */ | ||
346 | status = ocfs2_prepare_dir_for_insert(osb, dir, parent_fe_bh, | ||
347 | dentry->d_name.name, | ||
348 | dentry->d_name.len, &de_bh); | ||
349 | if (status < 0) { | ||
350 | mlog_errno(status); | ||
351 | goto leave; | ||
352 | } | ||
353 | |||
354 | /* reserve an inode spot */ | ||
355 | status = ocfs2_reserve_new_inode(osb, handle, &inode_ac); | ||
356 | if (status < 0) { | ||
357 | if (status != -ENOSPC) | ||
358 | mlog_errno(status); | ||
359 | goto leave; | ||
360 | } | ||
361 | |||
362 | /* are we making a directory? If so, reserve a cluster for his | ||
363 | * 1st extent. */ | ||
364 | if (S_ISDIR(mode)) { | ||
365 | status = ocfs2_reserve_clusters(osb, handle, 1, &data_ac); | ||
366 | if (status < 0) { | ||
367 | if (status != -ENOSPC) | ||
368 | mlog_errno(status); | ||
369 | goto leave; | ||
370 | } | ||
371 | } | ||
372 | |||
373 | handle = ocfs2_start_trans(osb, handle, OCFS2_MKNOD_CREDITS); | ||
374 | if (IS_ERR(handle)) { | ||
375 | status = PTR_ERR(handle); | ||
376 | handle = NULL; | ||
377 | mlog_errno(status); | ||
378 | goto leave; | ||
379 | } | ||
380 | |||
381 | /* do the real work now. */ | ||
382 | status = ocfs2_mknod_locked(osb, dir, dentry, mode, dev, | ||
383 | &new_fe_bh, parent_fe_bh, handle, | ||
384 | &inode, inode_ac); | ||
385 | if (status < 0) { | ||
386 | mlog_errno(status); | ||
387 | goto leave; | ||
388 | } | ||
389 | |||
390 | if (S_ISDIR(mode)) { | ||
391 | status = ocfs2_fill_new_dir(osb, handle, dir, inode, | ||
392 | new_fe_bh, data_ac); | ||
393 | if (status < 0) { | ||
394 | mlog_errno(status); | ||
395 | goto leave; | ||
396 | } | ||
397 | |||
398 | status = ocfs2_journal_access(handle, dir, parent_fe_bh, | ||
399 | OCFS2_JOURNAL_ACCESS_WRITE); | ||
400 | if (status < 0) { | ||
401 | mlog_errno(status); | ||
402 | goto leave; | ||
403 | } | ||
404 | le16_add_cpu(&dirfe->i_links_count, 1); | ||
405 | status = ocfs2_journal_dirty(handle, parent_fe_bh); | ||
406 | if (status < 0) { | ||
407 | mlog_errno(status); | ||
408 | goto leave; | ||
409 | } | ||
410 | dir->i_nlink++; | ||
411 | } | ||
412 | |||
413 | status = ocfs2_add_entry(handle, dentry, inode, | ||
414 | OCFS2_I(inode)->ip_blkno, parent_fe_bh, | ||
415 | de_bh); | ||
416 | if (status < 0) { | ||
417 | mlog_errno(status); | ||
418 | goto leave; | ||
419 | } | ||
420 | |||
421 | insert_inode_hash(inode); | ||
422 | dentry->d_op = &ocfs2_dentry_ops; | ||
423 | d_instantiate(dentry, inode); | ||
424 | status = 0; | ||
425 | leave: | ||
426 | if (handle) | ||
427 | ocfs2_commit_trans(handle); | ||
428 | |||
429 | if (status == -ENOSPC) | ||
430 | mlog(0, "Disk is full\n"); | ||
431 | |||
432 | if (new_fe_bh) | ||
433 | brelse(new_fe_bh); | ||
434 | |||
435 | if (de_bh) | ||
436 | brelse(de_bh); | ||
437 | |||
438 | if (parent_fe_bh) | ||
439 | brelse(parent_fe_bh); | ||
440 | |||
441 | if ((status < 0) && inode) | ||
442 | iput(inode); | ||
443 | |||
444 | if (inode_ac) | ||
445 | ocfs2_free_alloc_context(inode_ac); | ||
446 | |||
447 | if (data_ac) | ||
448 | ocfs2_free_alloc_context(data_ac); | ||
449 | |||
450 | mlog_exit(status); | ||
451 | |||
452 | return status; | ||
453 | } | ||
454 | |||
455 | static int ocfs2_mknod_locked(struct ocfs2_super *osb, | ||
456 | struct inode *dir, | ||
457 | struct dentry *dentry, int mode, | ||
458 | dev_t dev, | ||
459 | struct buffer_head **new_fe_bh, | ||
460 | struct buffer_head *parent_fe_bh, | ||
461 | struct ocfs2_journal_handle *handle, | ||
462 | struct inode **ret_inode, | ||
463 | struct ocfs2_alloc_context *inode_ac) | ||
464 | { | ||
465 | int status = 0; | ||
466 | struct ocfs2_dinode *fe = NULL; | ||
467 | struct ocfs2_extent_list *fel; | ||
468 | u64 fe_blkno = 0; | ||
469 | u16 suballoc_bit; | ||
470 | struct inode *inode = NULL; | ||
471 | |||
472 | mlog_entry("(0x%p, 0x%p, %d, %lu, '%.*s')\n", dir, dentry, mode, | ||
473 | (unsigned long)dev, dentry->d_name.len, | ||
474 | dentry->d_name.name); | ||
475 | |||
476 | *new_fe_bh = NULL; | ||
477 | *ret_inode = NULL; | ||
478 | |||
479 | status = ocfs2_claim_new_inode(osb, handle, inode_ac, &suballoc_bit, | ||
480 | &fe_blkno); | ||
481 | if (status < 0) { | ||
482 | mlog_errno(status); | ||
483 | goto leave; | ||
484 | } | ||
485 | |||
486 | inode = new_inode(dir->i_sb); | ||
487 | if (IS_ERR(inode)) { | ||
488 | status = PTR_ERR(inode); | ||
489 | mlog(ML_ERROR, "new_inode failed!\n"); | ||
490 | goto leave; | ||
491 | } | ||
492 | |||
493 | /* populate as many fields early on as possible - many of | ||
494 | * these are used by the support functions here and in | ||
495 | * callers. */ | ||
496 | inode->i_ino = ino_from_blkno(osb->sb, fe_blkno); | ||
497 | OCFS2_I(inode)->ip_blkno = fe_blkno; | ||
498 | if (S_ISDIR(mode)) | ||
499 | inode->i_nlink = 2; | ||
500 | else | ||
501 | inode->i_nlink = 1; | ||
502 | inode->i_mode = mode; | ||
503 | spin_lock(&osb->osb_lock); | ||
504 | inode->i_generation = osb->s_next_generation++; | ||
505 | spin_unlock(&osb->osb_lock); | ||
506 | |||
507 | *new_fe_bh = sb_getblk(osb->sb, fe_blkno); | ||
508 | if (!*new_fe_bh) { | ||
509 | status = -EIO; | ||
510 | mlog_errno(status); | ||
511 | goto leave; | ||
512 | } | ||
513 | ocfs2_set_new_buffer_uptodate(inode, *new_fe_bh); | ||
514 | |||
515 | status = ocfs2_journal_access(handle, inode, *new_fe_bh, | ||
516 | OCFS2_JOURNAL_ACCESS_CREATE); | ||
517 | if (status < 0) { | ||
518 | mlog_errno(status); | ||
519 | goto leave; | ||
520 | } | ||
521 | |||
522 | fe = (struct ocfs2_dinode *) (*new_fe_bh)->b_data; | ||
523 | memset(fe, 0, osb->sb->s_blocksize); | ||
524 | |||
525 | fe->i_generation = cpu_to_le32(inode->i_generation); | ||
526 | fe->i_fs_generation = cpu_to_le32(osb->fs_generation); | ||
527 | fe->i_blkno = cpu_to_le64(fe_blkno); | ||
528 | fe->i_suballoc_bit = cpu_to_le16(suballoc_bit); | ||
529 | fe->i_suballoc_slot = cpu_to_le16(osb->slot_num); | ||
530 | fe->i_uid = cpu_to_le32(current->fsuid); | ||
531 | if (dir->i_mode & S_ISGID) { | ||
532 | fe->i_gid = cpu_to_le32(dir->i_gid); | ||
533 | if (S_ISDIR(mode)) | ||
534 | mode |= S_ISGID; | ||
535 | } else | ||
536 | fe->i_gid = cpu_to_le32(current->fsgid); | ||
537 | fe->i_mode = cpu_to_le16(mode); | ||
538 | if (S_ISCHR(mode) || S_ISBLK(mode)) | ||
539 | fe->id1.dev1.i_rdev = cpu_to_le64(huge_encode_dev(dev)); | ||
540 | |||
541 | fe->i_links_count = cpu_to_le16(inode->i_nlink); | ||
542 | |||
543 | fe->i_last_eb_blk = 0; | ||
544 | strcpy(fe->i_signature, OCFS2_INODE_SIGNATURE); | ||
545 | le32_add_cpu(&fe->i_flags, OCFS2_VALID_FL); | ||
546 | fe->i_atime = fe->i_ctime = fe->i_mtime = | ||
547 | cpu_to_le64(CURRENT_TIME.tv_sec); | ||
548 | fe->i_mtime_nsec = fe->i_ctime_nsec = fe->i_atime_nsec = | ||
549 | cpu_to_le32(CURRENT_TIME.tv_nsec); | ||
550 | fe->i_dtime = 0; | ||
551 | |||
552 | fel = &fe->id2.i_list; | ||
553 | fel->l_tree_depth = 0; | ||
554 | fel->l_next_free_rec = 0; | ||
555 | fel->l_count = cpu_to_le16(ocfs2_extent_recs_per_inode(osb->sb)); | ||
556 | |||
557 | status = ocfs2_journal_dirty(handle, *new_fe_bh); | ||
558 | if (status < 0) { | ||
559 | mlog_errno(status); | ||
560 | goto leave; | ||
561 | } | ||
562 | |||
563 | if (ocfs2_populate_inode(inode, fe, 1) < 0) { | ||
564 | mlog(ML_ERROR, "populate inode failed! bh->b_blocknr=%llu, " | ||
565 | "i_blkno=%"MLFu64", i_ino=%lu\n", | ||
566 | (unsigned long long) (*new_fe_bh)->b_blocknr, | ||
567 | fe->i_blkno, inode->i_ino); | ||
568 | BUG(); | ||
569 | } | ||
570 | |||
571 | ocfs2_inode_set_new(osb, inode); | ||
572 | status = ocfs2_create_new_inode_locks(inode); | ||
573 | if (status < 0) | ||
574 | mlog_errno(status); | ||
575 | |||
576 | status = 0; /* error in ocfs2_create_new_inode_locks is not | ||
577 | * critical */ | ||
578 | |||
579 | *ret_inode = inode; | ||
580 | leave: | ||
581 | if (status < 0) { | ||
582 | if (*new_fe_bh) { | ||
583 | brelse(*new_fe_bh); | ||
584 | *new_fe_bh = NULL; | ||
585 | } | ||
586 | if (inode) | ||
587 | iput(inode); | ||
588 | } | ||
589 | |||
590 | mlog_exit(status); | ||
591 | return status; | ||
592 | } | ||
593 | |||
594 | static int ocfs2_mkdir(struct inode *dir, | ||
595 | struct dentry *dentry, | ||
596 | int mode) | ||
597 | { | ||
598 | int ret; | ||
599 | |||
600 | mlog_entry("(0x%p, 0x%p, %d, '%.*s')\n", dir, dentry, mode, | ||
601 | dentry->d_name.len, dentry->d_name.name); | ||
602 | ret = ocfs2_mknod(dir, dentry, mode | S_IFDIR, 0); | ||
603 | mlog_exit(ret); | ||
604 | |||
605 | return ret; | ||
606 | } | ||
607 | |||
608 | static int ocfs2_create(struct inode *dir, | ||
609 | struct dentry *dentry, | ||
610 | int mode, | ||
611 | struct nameidata *nd) | ||
612 | { | ||
613 | int ret; | ||
614 | |||
615 | mlog_entry("(0x%p, 0x%p, %d, '%.*s')\n", dir, dentry, mode, | ||
616 | dentry->d_name.len, dentry->d_name.name); | ||
617 | ret = ocfs2_mknod(dir, dentry, mode | S_IFREG, 0); | ||
618 | mlog_exit(ret); | ||
619 | |||
620 | return ret; | ||
621 | } | ||
622 | |||
623 | static int ocfs2_link(struct dentry *old_dentry, | ||
624 | struct inode *dir, | ||
625 | struct dentry *dentry) | ||
626 | { | ||
627 | struct ocfs2_journal_handle *handle = NULL; | ||
628 | struct inode *inode = old_dentry->d_inode; | ||
629 | int err; | ||
630 | struct buffer_head *fe_bh = NULL; | ||
631 | struct buffer_head *parent_fe_bh = NULL; | ||
632 | struct buffer_head *de_bh = NULL; | ||
633 | struct ocfs2_dinode *fe = NULL; | ||
634 | struct ocfs2_super *osb = OCFS2_SB(dir->i_sb); | ||
635 | |||
636 | mlog_entry("(inode=%lu, old='%.*s' new='%.*s')\n", inode->i_ino, | ||
637 | old_dentry->d_name.len, old_dentry->d_name.name, | ||
638 | dentry->d_name.len, dentry->d_name.name); | ||
639 | |||
640 | if (S_ISDIR(inode->i_mode)) { | ||
641 | err = -EPERM; | ||
642 | goto bail; | ||
643 | } | ||
644 | |||
645 | if (inode->i_nlink >= OCFS2_LINK_MAX) { | ||
646 | err = -EMLINK; | ||
647 | goto bail; | ||
648 | } | ||
649 | |||
650 | handle = ocfs2_alloc_handle(osb); | ||
651 | if (handle == NULL) { | ||
652 | err = -ENOMEM; | ||
653 | goto bail; | ||
654 | } | ||
655 | |||
656 | err = ocfs2_meta_lock(dir, handle, &parent_fe_bh, 1); | ||
657 | if (err < 0) { | ||
658 | if (err != -ENOENT) | ||
659 | mlog_errno(err); | ||
660 | goto bail; | ||
661 | } | ||
662 | |||
663 | err = ocfs2_check_dir_for_entry(dir, dentry->d_name.name, | ||
664 | dentry->d_name.len); | ||
665 | if (err) | ||
666 | goto bail; | ||
667 | |||
668 | err = ocfs2_prepare_dir_for_insert(osb, dir, parent_fe_bh, | ||
669 | dentry->d_name.name, | ||
670 | dentry->d_name.len, &de_bh); | ||
671 | if (err < 0) { | ||
672 | mlog_errno(err); | ||
673 | goto bail; | ||
674 | } | ||
675 | |||
676 | err = ocfs2_meta_lock(inode, handle, &fe_bh, 1); | ||
677 | if (err < 0) { | ||
678 | if (err != -ENOENT) | ||
679 | mlog_errno(err); | ||
680 | goto bail; | ||
681 | } | ||
682 | |||
683 | fe = (struct ocfs2_dinode *) fe_bh->b_data; | ||
684 | if (le16_to_cpu(fe->i_links_count) >= OCFS2_LINK_MAX) { | ||
685 | err = -EMLINK; | ||
686 | goto bail; | ||
687 | } | ||
688 | |||
689 | handle = ocfs2_start_trans(osb, handle, OCFS2_LINK_CREDITS); | ||
690 | if (IS_ERR(handle)) { | ||
691 | err = PTR_ERR(handle); | ||
692 | handle = NULL; | ||
693 | mlog_errno(err); | ||
694 | goto bail; | ||
695 | } | ||
696 | |||
697 | err = ocfs2_journal_access(handle, inode, fe_bh, | ||
698 | OCFS2_JOURNAL_ACCESS_WRITE); | ||
699 | if (err < 0) { | ||
700 | mlog_errno(err); | ||
701 | goto bail; | ||
702 | } | ||
703 | |||
704 | inode->i_nlink++; | ||
705 | inode->i_ctime = CURRENT_TIME; | ||
706 | fe->i_links_count = cpu_to_le16(inode->i_nlink); | ||
707 | fe->i_ctime = cpu_to_le64(inode->i_ctime.tv_sec); | ||
708 | fe->i_ctime_nsec = cpu_to_le32(inode->i_ctime.tv_nsec); | ||
709 | |||
710 | err = ocfs2_journal_dirty(handle, fe_bh); | ||
711 | if (err < 0) { | ||
712 | le16_add_cpu(&fe->i_links_count, -1); | ||
713 | inode->i_nlink--; | ||
714 | mlog_errno(err); | ||
715 | goto bail; | ||
716 | } | ||
717 | |||
718 | err = ocfs2_add_entry(handle, dentry, inode, | ||
719 | OCFS2_I(inode)->ip_blkno, | ||
720 | parent_fe_bh, de_bh); | ||
721 | if (err) { | ||
722 | le16_add_cpu(&fe->i_links_count, -1); | ||
723 | inode->i_nlink--; | ||
724 | mlog_errno(err); | ||
725 | goto bail; | ||
726 | } | ||
727 | |||
728 | atomic_inc(&inode->i_count); | ||
729 | dentry->d_op = &ocfs2_dentry_ops; | ||
730 | d_instantiate(dentry, inode); | ||
731 | bail: | ||
732 | if (handle) | ||
733 | ocfs2_commit_trans(handle); | ||
734 | if (de_bh) | ||
735 | brelse(de_bh); | ||
736 | if (fe_bh) | ||
737 | brelse(fe_bh); | ||
738 | if (parent_fe_bh) | ||
739 | brelse(parent_fe_bh); | ||
740 | |||
741 | mlog_exit(err); | ||
742 | |||
743 | return err; | ||
744 | } | ||
745 | |||
/*
 * ocfs2_unlink() - remove the name 'dentry' from directory 'dir'.
 *
 * @dir:    directory that holds the name; must be dentry's parent (BUG_ON).
 * @dentry: the name being removed; dentry->d_inode is the target inode.
 *
 * Returns 0 on success or a negative errno:
 *   -EPERM      the target is the file system root inode
 *   -ENOMEM     no journal handle could be allocated
 *   -ENOENT     the on-disk directory entry does not point at this inode
 *   -ENOTEMPTY  the target is a directory that is not empty (or whose
 *               link count is not exactly 2)
 *   or whatever the lock / vote / journal helpers return.
 *
 * The in-memory link count is dropped before the transaction starts and
 * is put back at 'leave' if anything fails before the new count has been
 * written into the on-disk inode.
 */
static int ocfs2_unlink(struct inode *dir,
			struct dentry *dentry)
{
	int status;
	/* Non-zero only while inode->i_nlink has been modified but the
	 * change has not yet been copied into the dinode. */
	unsigned int saved_nlink = 0;
	struct inode *inode = dentry->d_inode;
	struct ocfs2_super *osb = OCFS2_SB(dir->i_sb);
	u64 blkno;
	struct ocfs2_dinode *fe = NULL;
	struct buffer_head *fe_bh = NULL;
	struct buffer_head *parent_node_bh = NULL;
	struct ocfs2_journal_handle *handle = NULL;
	struct ocfs2_dir_entry *dirent = NULL;
	struct buffer_head *dirent_bh = NULL;
	char orphan_name[OCFS2_ORPHAN_NAMELEN + 1];
	struct buffer_head *orphan_entry_bh = NULL;

	mlog_entry("(0x%p, 0x%p, '%.*s')\n", dir, dentry,
		   dentry->d_name.len, dentry->d_name.name);

	BUG_ON(dentry->d_parent->d_inode != dir);

	mlog(0, "ino = %"MLFu64"\n", OCFS2_I(inode)->ip_blkno);

	if (inode == osb->root_inode) {
		mlog(0, "Cannot delete the root directory\n");
		status = -EPERM;
		goto leave;
	}

	/* The handle is allocated up front because the meta locks below
	 * are taken against it; the transaction itself is only started
	 * later by ocfs2_start_trans(). */
	handle = ocfs2_alloc_handle(osb);
	if (handle == NULL) {
		status = -ENOMEM;
		mlog_errno(status);
		goto leave;
	}

	/* Lock the parent directory first.
	 * NOTE(review): the trailing '1' presumably requests an exclusive
	 * lock -- confirm against ocfs2_meta_lock(). */
	status = ocfs2_meta_lock(dir, handle, &parent_node_bh, 1);
	if (status < 0) {
		if (status != -ENOENT)
			mlog_errno(status);
		goto leave;
	}

	/* Re-read the name from disk; this yields the directory entry,
	 * its buffer and the block number the entry points at. */
	status = ocfs2_find_files_on_disk(dentry->d_name.name,
					  dentry->d_name.len, &blkno,
					  dir, &dirent_bh, &dirent);
	if (status < 0) {
		if (status != -ENOENT)
			mlog_errno(status);
		goto leave;
	}

	/* The on-disk entry must still refer to the inode the VFS handed
	 * us; otherwise treat the name as gone. */
	if (OCFS2_I(inode)->ip_blkno != blkno) {
		status = -ENOENT;

		mlog(0, "ip_blkno (%"MLFu64") != dirent blkno (%"MLFu64") "
		     "ip_flags = %x\n", OCFS2_I(inode)->ip_blkno, blkno,
		     OCFS2_I(inode)->ip_flags);
		goto leave;
	}

	/* Now lock the inode being unlinked and fetch its dinode buffer. */
	status = ocfs2_meta_lock(inode, handle, &fe_bh, 1);
	if (status < 0) {
		if (status != -ENOENT)
			mlog_errno(status);
		goto leave;
	}

	/* A directory may only be removed when it is empty and carries
	 * exactly two links. */
	if (S_ISDIR(inode->i_mode)) {
	       	if (!ocfs2_empty_dir(inode)) {
			status = -ENOTEMPTY;
			goto leave;
		} else if (inode->i_nlink != 2) {
			status = -ENOTEMPTY;
			goto leave;
		}
	}

	/* There are still a few steps left until we can consider the
	 * unlink to have succeeded. Save off nlink here before
	 * modification so we can set it back in case we hit an issue
	 * before commit. */
	saved_nlink = inode->i_nlink;
	if (S_ISDIR(inode->i_mode))
		inode->i_nlink = 0;
	else
		inode->i_nlink--;

	/* Send the unlink vote, passing the link count the inode will
	 * have once this unlink completes. */
	status = ocfs2_request_unlink_vote(inode, dentry,
					   (unsigned int) inode->i_nlink);
	if (status < 0) {
		/* This vote should succeed under all normal
		 * circumstances. */
		mlog_errno(status);
		goto leave;
	}

	/* Last link going away: reserve a slot in the orphan directory
	 * before the transaction is started. */
	if (!inode->i_nlink) {
		status = ocfs2_prepare_orphan_dir(osb, handle, inode,
						  orphan_name,
						  &orphan_entry_bh);
		if (status < 0) {
			mlog_errno(status);
			goto leave;
		}
	}

	/* On failure ocfs2_start_trans() returns an ERR_PTR; the local
	 * handle is cleared so 'leave' does not try to commit it. */
	handle = ocfs2_start_trans(osb, handle, OCFS2_UNLINK_CREDITS);
	if (IS_ERR(handle)) {
		status = PTR_ERR(handle);
		handle = NULL;
		mlog_errno(status);
		goto leave;
	}

	status = ocfs2_journal_access(handle, inode, fe_bh,
				      OCFS2_JOURNAL_ACCESS_WRITE);
	if (status < 0) {
		mlog_errno(status);
		goto leave;
	}

	fe = (struct ocfs2_dinode *) fe_bh->b_data;

	if (!inode->i_nlink) {
		status = ocfs2_orphan_add(osb, handle, inode, fe, orphan_name,
					  orphan_entry_bh);
		if (status < 0) {
			mlog_errno(status);
			goto leave;
		}
	}

	/* delete the name from the parent dir */
	status = ocfs2_delete_entry(handle, dir, dirent, dirent_bh);
	if (status < 0) {
		mlog_errno(status);
		goto leave;
	}

	/* We can set nlink on the dinode now. clear the saved version
	 * so that it doesn't get set later. */
	fe->i_links_count = cpu_to_le16(inode->i_nlink);
	saved_nlink = 0;

	status = ocfs2_journal_dirty(handle, fe_bh);
	if (status < 0) {
		mlog_errno(status);
		goto leave;
	}

	/* Removing a subdirectory drops one link on the parent; undo the
	 * in-memory decrement if the parent cannot be marked dirty. */
	if (S_ISDIR(inode->i_mode)) {
		dir->i_nlink--;
		status = ocfs2_mark_inode_dirty(handle, dir,
						parent_node_bh);
		if (status < 0) {
			mlog_errno(status);
			dir->i_nlink++;
		}
	}

leave:
	/* Roll back the in-memory link count if we failed before it was
	 * written to the dinode. */
	if (status < 0 && saved_nlink)
		inode->i_nlink = saved_nlink;

	/* Reached with a non-NULL handle both when a transaction was
	 * started and when the handle was only allocated. */
	if (handle)
		ocfs2_commit_trans(handle);

	if (fe_bh)
		brelse(fe_bh);

	if (dirent_bh)
		brelse(dirent_bh);

	if (parent_node_bh)
		brelse(parent_node_bh);

	if (orphan_entry_bh)
		brelse(orphan_entry_bh);

	mlog_exit(status);

	return status;
}
931 | |||
932 | /* | ||
933 | * The only place this should be used is rename! | ||
934 | * if they have the same id, then the 1st one is the only one locked. | ||
935 | */ | ||
936 | static int ocfs2_double_lock(struct ocfs2_super *osb, | ||
937 | struct ocfs2_journal_handle *handle, | ||
938 | struct buffer_head **bh1, | ||
939 | struct inode *inode1, | ||
940 | struct buffer_head **bh2, | ||
941 | struct inode *inode2) | ||
942 | { | ||
943 | int status; | ||
944 | struct ocfs2_inode_info *oi1 = OCFS2_I(inode1); | ||
945 | struct ocfs2_inode_info *oi2 = OCFS2_I(inode2); | ||
946 | struct buffer_head **tmpbh; | ||
947 | struct inode *tmpinode; | ||
948 | |||
949 | mlog_entry("(inode1 = %"MLFu64", inode2 = %"MLFu64")\n", | ||
950 | oi1->ip_blkno, oi2->ip_blkno); | ||
951 | |||
952 | BUG_ON(!handle); | ||
953 | |||
954 | if (*bh1) | ||
955 | *bh1 = NULL; | ||
956 | if (*bh2) | ||
957 | *bh2 = NULL; | ||
958 | |||
959 | /* we always want to lock the one with the lower lockid first. */ | ||
960 | if (oi1->ip_blkno != oi2->ip_blkno) { | ||
961 | if (oi1->ip_blkno < oi2->ip_blkno) { | ||
962 | /* switch id1 and id2 around */ | ||
963 | mlog(0, "switching them around...\n"); | ||
964 | tmpbh = bh2; | ||
965 | bh2 = bh1; | ||
966 | bh1 = tmpbh; | ||
967 | |||
968 | tmpinode = inode2; | ||
969 | inode2 = inode1; | ||
970 | inode1 = tmpinode; | ||
971 | } | ||
972 | /* lock id2 */ | ||
973 | status = ocfs2_meta_lock(inode2, handle, bh2, 1); | ||
974 | if (status < 0) { | ||
975 | if (status != -ENOENT) | ||
976 | mlog_errno(status); | ||
977 | goto bail; | ||
978 | } | ||
979 | } | ||
980 | /* lock id1 */ | ||
981 | status = ocfs2_meta_lock(inode1, handle, bh1, 1); | ||
982 | if (status < 0) { | ||
983 | if (status != -ENOENT) | ||
984 | mlog_errno(status); | ||
985 | goto bail; | ||
986 | } | ||
987 | bail: | ||
988 | mlog_exit(status); | ||
989 | return status; | ||
990 | } | ||
991 | |||
/*
 * PARENT_INO(buffer) - the on-disk 'inode' field of the directory entry
 * that immediately follows the first entry in 'buffer'.
 *
 * 'buffer' points at the first struct ocfs2_dir_entry of a block; the
 * first entry's rec_len is used to step to the next one. The result is
 * an lvalue holding the raw on-disk value, so callers convert with
 * le64_to_cpu() / cpu_to_le64().
 *
 * The argument is evaluated twice, so it must have no side effects. It
 * is fully parenthesised, and so is the expansion, so any pointer
 * expression can be passed safely.
 */
#define PARENT_INO(buffer) \
	(((struct ocfs2_dir_entry *) \
	  ((char *)(buffer) + \
	   le16_to_cpu(((struct ocfs2_dir_entry *)(buffer))->rec_len)))->inode)
996 | |||
997 | static int ocfs2_rename(struct inode *old_dir, | ||
998 | struct dentry *old_dentry, | ||
999 | struct inode *new_dir, | ||
1000 | struct dentry *new_dentry) | ||
1001 | { | ||
1002 | int status = 0, rename_lock = 0; | ||
1003 | struct inode *old_inode = old_dentry->d_inode; | ||
1004 | struct inode *new_inode = new_dentry->d_inode; | ||
1005 | struct ocfs2_dinode *newfe = NULL; | ||
1006 | char orphan_name[OCFS2_ORPHAN_NAMELEN + 1]; | ||
1007 | struct buffer_head *orphan_entry_bh = NULL; | ||
1008 | struct buffer_head *newfe_bh = NULL; | ||
1009 | struct buffer_head *insert_entry_bh = NULL; | ||
1010 | struct ocfs2_super *osb = NULL; | ||
1011 | u64 newfe_blkno; | ||
1012 | struct ocfs2_journal_handle *handle = NULL; | ||
1013 | struct buffer_head *old_dir_bh = NULL; | ||
1014 | struct buffer_head *new_dir_bh = NULL; | ||
1015 | struct ocfs2_dir_entry *old_de = NULL, *new_de = NULL; // dirent for old_dentry | ||
1016 | // and new_dentry | ||
1017 | struct buffer_head *new_de_bh = NULL, *old_de_bh = NULL; // bhs for above | ||
1018 | struct buffer_head *old_inode_de_bh = NULL; // if old_dentry is a dir, | ||
1019 | // this is the 1st dirent bh | ||
1020 | nlink_t old_dir_nlink = old_dir->i_nlink, new_dir_nlink = new_dir->i_nlink; | ||
1021 | unsigned int links_count; | ||
1022 | |||
1023 | /* At some point it might be nice to break this function up a | ||
1024 | * bit. */ | ||
1025 | |||
1026 | mlog_entry("(0x%p, 0x%p, 0x%p, 0x%p, from='%.*s' to='%.*s')\n", | ||
1027 | old_dir, old_dentry, new_dir, new_dentry, | ||
1028 | old_dentry->d_name.len, old_dentry->d_name.name, | ||
1029 | new_dentry->d_name.len, new_dentry->d_name.name); | ||
1030 | |||
1031 | osb = OCFS2_SB(old_dir->i_sb); | ||
1032 | |||
1033 | if (new_inode) { | ||
1034 | if (!igrab(new_inode)) | ||
1035 | BUG(); | ||
1036 | } | ||
1037 | |||
1038 | if (atomic_read(&old_dentry->d_count) > 2) { | ||
1039 | shrink_dcache_parent(old_dentry); | ||
1040 | if (atomic_read(&old_dentry->d_count) > 2) { | ||
1041 | status = -EBUSY; | ||
1042 | goto bail; | ||
1043 | } | ||
1044 | } | ||
1045 | |||
1046 | /* Assume a directory heirarchy thusly: | ||
1047 | * a/b/c | ||
1048 | * a/d | ||
1049 | * a,b,c, and d are all directories. | ||
1050 | * | ||
1051 | * from cwd of 'a' on both nodes: | ||
1052 | * node1: mv b/c d | ||
1053 | * node2: mv d b/c | ||
1054 | * | ||
1055 | * And that's why, just like the VFS, we need a file system | ||
1056 | * rename lock. */ | ||
1057 | if (old_dentry != new_dentry) { | ||
1058 | status = ocfs2_rename_lock(osb); | ||
1059 | if (status < 0) { | ||
1060 | mlog_errno(status); | ||
1061 | goto bail; | ||
1062 | } | ||
1063 | rename_lock = 1; | ||
1064 | } | ||
1065 | |||
1066 | handle = ocfs2_alloc_handle(osb); | ||
1067 | if (handle == NULL) { | ||
1068 | status = -ENOMEM; | ||
1069 | mlog_errno(status); | ||
1070 | goto bail; | ||
1071 | } | ||
1072 | |||
1073 | /* if old and new are the same, this'll just do one lock. */ | ||
1074 | status = ocfs2_double_lock(osb, handle, | ||
1075 | &old_dir_bh, old_dir, | ||
1076 | &new_dir_bh, new_dir); | ||
1077 | if (status < 0) { | ||
1078 | mlog_errno(status); | ||
1079 | goto bail; | ||
1080 | } | ||
1081 | |||
1082 | /* make sure both dirs have bhs | ||
1083 | * get an extra ref on old_dir_bh if old==new */ | ||
1084 | if (!new_dir_bh) { | ||
1085 | if (old_dir_bh) { | ||
1086 | new_dir_bh = old_dir_bh; | ||
1087 | get_bh(new_dir_bh); | ||
1088 | } else { | ||
1089 | mlog(ML_ERROR, "no old_dir_bh!\n"); | ||
1090 | status = -EIO; | ||
1091 | goto bail; | ||
1092 | } | ||
1093 | } | ||
1094 | |||
1095 | if (S_ISDIR(old_inode->i_mode)) { | ||
1096 | /* Directories actually require metadata updates to | ||
1097 | * the directory info so we can't get away with not | ||
1098 | * doing node locking on it. */ | ||
1099 | status = ocfs2_meta_lock(old_inode, handle, NULL, 1); | ||
1100 | if (status < 0) { | ||
1101 | if (status != -ENOENT) | ||
1102 | mlog_errno(status); | ||
1103 | goto bail; | ||
1104 | } | ||
1105 | |||
1106 | status = ocfs2_request_rename_vote(old_inode, old_dentry); | ||
1107 | if (status < 0) { | ||
1108 | mlog_errno(status); | ||
1109 | goto bail; | ||
1110 | } | ||
1111 | |||
1112 | status = -EIO; | ||
1113 | old_inode_de_bh = ocfs2_bread(old_inode, 0, &status, 0); | ||
1114 | if (!old_inode_de_bh) | ||
1115 | goto bail; | ||
1116 | |||
1117 | status = -EIO; | ||
1118 | if (le64_to_cpu(PARENT_INO(old_inode_de_bh->b_data)) != | ||
1119 | OCFS2_I(old_dir)->ip_blkno) | ||
1120 | goto bail; | ||
1121 | status = -EMLINK; | ||
1122 | if (!new_inode && new_dir!=old_dir && | ||
1123 | new_dir->i_nlink >= OCFS2_LINK_MAX) | ||
1124 | goto bail; | ||
1125 | } else { | ||
1126 | /* Ah, the simple case - we're a file so just send a | ||
1127 | * message. */ | ||
1128 | status = ocfs2_request_rename_vote(old_inode, old_dentry); | ||
1129 | if (status < 0) { | ||
1130 | mlog_errno(status); | ||
1131 | goto bail; | ||
1132 | } | ||
1133 | } | ||
1134 | |||
1135 | status = -ENOENT; | ||
1136 | old_de_bh = ocfs2_find_entry(old_dentry->d_name.name, | ||
1137 | old_dentry->d_name.len, | ||
1138 | old_dir, &old_de); | ||
1139 | if (!old_de_bh) | ||
1140 | goto bail; | ||
1141 | |||
1142 | /* | ||
1143 | * Check for inode number is _not_ due to possible IO errors. | ||
1144 | * We might rmdir the source, keep it as pwd of some process | ||
1145 | * and merrily kill the link to whatever was created under the | ||
1146 | * same name. Goodbye sticky bit ;-< | ||
1147 | */ | ||
1148 | if (le64_to_cpu(old_de->inode) != OCFS2_I(old_inode)->ip_blkno) | ||
1149 | goto bail; | ||
1150 | |||
1151 | /* check if the target already exists (in which case we need | ||
1152 | * to delete it */ | ||
1153 | status = ocfs2_find_files_on_disk(new_dentry->d_name.name, | ||
1154 | new_dentry->d_name.len, | ||
1155 | &newfe_blkno, new_dir, &new_de_bh, | ||
1156 | &new_de); | ||
1157 | /* The only error we allow here is -ENOENT because the new | ||
1158 | * file not existing is perfectly valid. */ | ||
1159 | if ((status < 0) && (status != -ENOENT)) { | ||
1160 | /* If we cannot find the file specified we should just */ | ||
1161 | /* return the error... */ | ||
1162 | mlog_errno(status); | ||
1163 | goto bail; | ||
1164 | } | ||
1165 | |||
1166 | if (!new_de && new_inode) | ||
1167 | mlog(ML_ERROR, "inode %lu does not exist in it's parent " | ||
1168 | "directory!", new_inode->i_ino); | ||
1169 | |||
1170 | /* In case we need to overwrite an existing file, we blow it | ||
1171 | * away first */ | ||
1172 | if (new_de) { | ||
1173 | /* VFS didn't think there existed an inode here, but | ||
1174 | * someone else in the cluster must have raced our | ||
1175 | * rename to create one. Today we error cleanly, in | ||
1176 | * the future we should consider calling iget to build | ||
1177 | * a new struct inode for this entry. */ | ||
1178 | if (!new_inode) { | ||
1179 | status = -EACCES; | ||
1180 | |||
1181 | mlog(0, "We found an inode for name %.*s but VFS " | ||
1182 | "didn't give us one.\n", new_dentry->d_name.len, | ||
1183 | new_dentry->d_name.name); | ||
1184 | goto bail; | ||
1185 | } | ||
1186 | |||
1187 | if (OCFS2_I(new_inode)->ip_blkno != newfe_blkno) { | ||
1188 | status = -EACCES; | ||
1189 | |||
1190 | mlog(0, "Inode blkno (%"MLFu64") and dir (%"MLFu64") " | ||
1191 | "disagree. ip_flags = %x\n", | ||
1192 | OCFS2_I(new_inode)->ip_blkno, newfe_blkno, | ||
1193 | OCFS2_I(new_inode)->ip_flags); | ||
1194 | goto bail; | ||
1195 | } | ||
1196 | |||
1197 | status = ocfs2_meta_lock(new_inode, handle, &newfe_bh, 1); | ||
1198 | if (status < 0) { | ||
1199 | if (status != -ENOENT) | ||
1200 | mlog_errno(status); | ||
1201 | goto bail; | ||
1202 | } | ||
1203 | |||
1204 | if (S_ISDIR(new_inode->i_mode)) | ||
1205 | links_count = 0; | ||
1206 | else | ||
1207 | links_count = (unsigned int) (new_inode->i_nlink - 1); | ||
1208 | |||
1209 | status = ocfs2_request_unlink_vote(new_inode, new_dentry, | ||
1210 | links_count); | ||
1211 | if (status < 0) { | ||
1212 | mlog_errno(status); | ||
1213 | goto bail; | ||
1214 | } | ||
1215 | |||
1216 | newfe = (struct ocfs2_dinode *) newfe_bh->b_data; | ||
1217 | |||
1218 | mlog(0, "aha rename over existing... new_de=%p " | ||
1219 | "new_blkno=%"MLFu64" newfebh=%p bhblocknr=%llu\n", | ||
1220 | new_de, newfe_blkno, newfe_bh, newfe_bh ? | ||
1221 | (unsigned long long)newfe_bh->b_blocknr : 0ULL); | ||
1222 | |||
1223 | if (S_ISDIR(new_inode->i_mode) || (new_inode->i_nlink == 1)) { | ||
1224 | status = ocfs2_prepare_orphan_dir(osb, handle, | ||
1225 | new_inode, | ||
1226 | orphan_name, | ||
1227 | &orphan_entry_bh); | ||
1228 | if (status < 0) { | ||
1229 | mlog_errno(status); | ||
1230 | goto bail; | ||
1231 | } | ||
1232 | } | ||
1233 | } else { | ||
1234 | BUG_ON(new_dentry->d_parent->d_inode != new_dir); | ||
1235 | |||
1236 | status = ocfs2_check_dir_for_entry(new_dir, | ||
1237 | new_dentry->d_name.name, | ||
1238 | new_dentry->d_name.len); | ||
1239 | if (status) | ||
1240 | goto bail; | ||
1241 | |||
1242 | status = ocfs2_prepare_dir_for_insert(osb, new_dir, new_dir_bh, | ||
1243 | new_dentry->d_name.name, | ||
1244 | new_dentry->d_name.len, | ||
1245 | &insert_entry_bh); | ||
1246 | if (status < 0) { | ||
1247 | mlog_errno(status); | ||
1248 | goto bail; | ||
1249 | } | ||
1250 | } | ||
1251 | |||
1252 | handle = ocfs2_start_trans(osb, handle, OCFS2_RENAME_CREDITS); | ||
1253 | if (IS_ERR(handle)) { | ||
1254 | status = PTR_ERR(handle); | ||
1255 | handle = NULL; | ||
1256 | mlog_errno(status); | ||
1257 | goto bail; | ||
1258 | } | ||
1259 | |||
1260 | if (new_de) { | ||
1261 | if (S_ISDIR(new_inode->i_mode)) { | ||
1262 | if (!ocfs2_empty_dir(new_inode) || | ||
1263 | new_inode->i_nlink != 2) { | ||
1264 | status = -ENOTEMPTY; | ||
1265 | goto bail; | ||
1266 | } | ||
1267 | } | ||
1268 | status = ocfs2_journal_access(handle, new_inode, newfe_bh, | ||
1269 | OCFS2_JOURNAL_ACCESS_WRITE); | ||
1270 | if (status < 0) { | ||
1271 | mlog_errno(status); | ||
1272 | goto bail; | ||
1273 | } | ||
1274 | |||
1275 | if (S_ISDIR(new_inode->i_mode) || | ||
1276 | (newfe->i_links_count == cpu_to_le16(1))){ | ||
1277 | status = ocfs2_orphan_add(osb, handle, new_inode, | ||
1278 | newfe, orphan_name, | ||
1279 | orphan_entry_bh); | ||
1280 | if (status < 0) { | ||
1281 | mlog_errno(status); | ||
1282 | goto bail; | ||
1283 | } | ||
1284 | } | ||
1285 | |||
1286 | /* change the dirent to point to the correct inode */ | ||
1287 | status = ocfs2_journal_access(handle, new_dir, new_de_bh, | ||
1288 | OCFS2_JOURNAL_ACCESS_WRITE); | ||
1289 | if (status < 0) { | ||
1290 | mlog_errno(status); | ||
1291 | goto bail; | ||
1292 | } | ||
1293 | new_de->inode = cpu_to_le64(OCFS2_I(old_inode)->ip_blkno); | ||
1294 | new_de->file_type = old_de->file_type; | ||
1295 | new_dir->i_version++; | ||
1296 | status = ocfs2_journal_dirty(handle, new_de_bh); | ||
1297 | if (status < 0) { | ||
1298 | mlog_errno(status); | ||
1299 | goto bail; | ||
1300 | } | ||
1301 | |||
1302 | if (S_ISDIR(new_inode->i_mode)) | ||
1303 | newfe->i_links_count = 0; | ||
1304 | else | ||
1305 | le16_add_cpu(&newfe->i_links_count, -1); | ||
1306 | |||
1307 | status = ocfs2_journal_dirty(handle, newfe_bh); | ||
1308 | if (status < 0) { | ||
1309 | mlog_errno(status); | ||
1310 | goto bail; | ||
1311 | } | ||
1312 | } else { | ||
1313 | /* if the name was not found in new_dir, add it now */ | ||
1314 | status = ocfs2_add_entry(handle, new_dentry, old_inode, | ||
1315 | OCFS2_I(old_inode)->ip_blkno, | ||
1316 | new_dir_bh, insert_entry_bh); | ||
1317 | } | ||
1318 | |||
1319 | old_inode->i_ctime = CURRENT_TIME; | ||
1320 | mark_inode_dirty(old_inode); | ||
1321 | |||
1322 | /* now that the name has been added to new_dir, remove the old name */ | ||
1323 | status = ocfs2_delete_entry(handle, old_dir, old_de, old_de_bh); | ||
1324 | if (status < 0) { | ||
1325 | mlog_errno(status); | ||
1326 | goto bail; | ||
1327 | } | ||
1328 | |||
1329 | if (new_inode) { | ||
1330 | new_inode->i_nlink--; | ||
1331 | new_inode->i_ctime = CURRENT_TIME; | ||
1332 | } | ||
1333 | old_dir->i_ctime = old_dir->i_mtime = CURRENT_TIME; | ||
1334 | if (old_inode_de_bh) { | ||
1335 | status = ocfs2_journal_access(handle, old_inode, | ||
1336 | old_inode_de_bh, | ||
1337 | OCFS2_JOURNAL_ACCESS_WRITE); | ||
1338 | PARENT_INO(old_inode_de_bh->b_data) = | ||
1339 | cpu_to_le64(OCFS2_I(new_dir)->ip_blkno); | ||
1340 | status = ocfs2_journal_dirty(handle, old_inode_de_bh); | ||
1341 | old_dir->i_nlink--; | ||
1342 | if (new_inode) { | ||
1343 | new_inode->i_nlink--; | ||
1344 | } else { | ||
1345 | new_dir->i_nlink++; | ||
1346 | mark_inode_dirty(new_dir); | ||
1347 | } | ||
1348 | } | ||
1349 | mark_inode_dirty(old_dir); | ||
1350 | if (new_inode) | ||
1351 | mark_inode_dirty(new_inode); | ||
1352 | |||
1353 | if (old_dir != new_dir) | ||
1354 | if (new_dir_nlink != new_dir->i_nlink) { | ||
1355 | if (!new_dir_bh) { | ||
1356 | mlog(ML_ERROR, "need to change nlink for new " | ||
1357 | "dir %"MLFu64" from %d to %d but bh is " | ||
1358 | "NULL\n", OCFS2_I(new_dir)->ip_blkno, | ||
1359 | (int)new_dir_nlink, new_dir->i_nlink); | ||
1360 | } else { | ||
1361 | struct ocfs2_dinode *fe; | ||
1362 | status = ocfs2_journal_access(handle, | ||
1363 | new_dir, | ||
1364 | new_dir_bh, | ||
1365 | OCFS2_JOURNAL_ACCESS_WRITE); | ||
1366 | fe = (struct ocfs2_dinode *) new_dir_bh->b_data; | ||
1367 | fe->i_links_count = cpu_to_le16(new_dir->i_nlink); | ||
1368 | status = ocfs2_journal_dirty(handle, new_dir_bh); | ||
1369 | } | ||
1370 | } | ||
1371 | |||
1372 | if (old_dir_nlink != old_dir->i_nlink) { | ||
1373 | if (!old_dir_bh) { | ||
1374 | mlog(ML_ERROR, "need to change nlink for old dir " | ||
1375 | "%"MLFu64" from %d to %d but bh is NULL!\n", | ||
1376 | OCFS2_I(old_dir)->ip_blkno, | ||
1377 | (int)old_dir_nlink, | ||
1378 | old_dir->i_nlink); | ||
1379 | } else { | ||
1380 | struct ocfs2_dinode *fe; | ||
1381 | status = ocfs2_journal_access(handle, old_dir, | ||
1382 | old_dir_bh, | ||
1383 | OCFS2_JOURNAL_ACCESS_WRITE); | ||
1384 | fe = (struct ocfs2_dinode *) old_dir_bh->b_data; | ||
1385 | fe->i_links_count = cpu_to_le16(old_dir->i_nlink); | ||
1386 | status = ocfs2_journal_dirty(handle, old_dir_bh); | ||
1387 | } | ||
1388 | } | ||
1389 | |||
1390 | status = 0; | ||
1391 | bail: | ||
1392 | if (rename_lock) | ||
1393 | ocfs2_rename_unlock(osb); | ||
1394 | |||
1395 | if (handle) | ||
1396 | ocfs2_commit_trans(handle); | ||
1397 | |||
1398 | if (new_inode) | ||
1399 | sync_mapping_buffers(old_inode->i_mapping); | ||
1400 | |||
1401 | if (new_inode) | ||
1402 | iput(new_inode); | ||
1403 | if (newfe_bh) | ||
1404 | brelse(newfe_bh); | ||
1405 | if (old_dir_bh) | ||
1406 | brelse(old_dir_bh); | ||
1407 | if (new_dir_bh) | ||
1408 | brelse(new_dir_bh); | ||
1409 | if (new_de_bh) | ||
1410 | brelse(new_de_bh); | ||
1411 | if (old_de_bh) | ||
1412 | brelse(old_de_bh); | ||
1413 | if (old_inode_de_bh) | ||
1414 | brelse(old_inode_de_bh); | ||
1415 | if (orphan_entry_bh) | ||
1416 | brelse(orphan_entry_bh); | ||
1417 | if (insert_entry_bh) | ||
1418 | brelse(insert_entry_bh); | ||
1419 | |||
1420 | mlog_exit(status); | ||
1421 | |||
1422 | return status; | ||
1423 | } | ||
1424 | |||
1425 | /* | ||
1426 | * we expect i_size = strlen(symname). Copy symname into the file | ||
1427 | * data, including the null terminator. | ||
1428 | */ | ||
1429 | static int ocfs2_create_symlink_data(struct ocfs2_super *osb, | ||
1430 | struct ocfs2_journal_handle *handle, | ||
1431 | struct inode *inode, | ||
1432 | const char *symname) | ||
1433 | { | ||
1434 | struct buffer_head **bhs = NULL; | ||
1435 | const char *c; | ||
1436 | struct super_block *sb = osb->sb; | ||
1437 | u64 p_blkno; | ||
1438 | int p_blocks; | ||
1439 | int virtual, blocks, status, i, bytes_left; | ||
1440 | |||
1441 | bytes_left = i_size_read(inode) + 1; | ||
1442 | /* we can't trust i_blocks because we're actually going to | ||
1443 | * write i_size + 1 bytes. */ | ||
1444 | blocks = (bytes_left + sb->s_blocksize - 1) >> sb->s_blocksize_bits; | ||
1445 | |||
1446 | mlog_entry("i_blocks = %lu, i_size = %llu, blocks = %d\n", | ||
1447 | inode->i_blocks, i_size_read(inode), blocks); | ||
1448 | |||
1449 | /* Sanity check -- make sure we're going to fit. */ | ||
1450 | if (bytes_left > | ||
1451 | ocfs2_clusters_to_bytes(sb, OCFS2_I(inode)->ip_clusters)) { | ||
1452 | status = -EIO; | ||
1453 | mlog_errno(status); | ||
1454 | goto bail; | ||
1455 | } | ||
1456 | |||
1457 | bhs = kcalloc(blocks, sizeof(struct buffer_head *), GFP_KERNEL); | ||
1458 | if (!bhs) { | ||
1459 | status = -ENOMEM; | ||
1460 | mlog_errno(status); | ||
1461 | goto bail; | ||
1462 | } | ||
1463 | |||
1464 | status = ocfs2_extent_map_get_blocks(inode, 0, 1, &p_blkno, | ||
1465 | &p_blocks); | ||
1466 | if (status < 0) { | ||
1467 | mlog_errno(status); | ||
1468 | goto bail; | ||
1469 | } | ||
1470 | |||
1471 | /* links can never be larger than one cluster so we know this | ||
1472 | * is all going to be contiguous, but do a sanity check | ||
1473 | * anyway. */ | ||
1474 | if ((p_blocks << sb->s_blocksize_bits) < bytes_left) { | ||
1475 | status = -EIO; | ||
1476 | mlog_errno(status); | ||
1477 | goto bail; | ||
1478 | } | ||
1479 | |||
1480 | virtual = 0; | ||
1481 | while(bytes_left > 0) { | ||
1482 | c = &symname[virtual * sb->s_blocksize]; | ||
1483 | |||
1484 | bhs[virtual] = sb_getblk(sb, p_blkno); | ||
1485 | if (!bhs[virtual]) { | ||
1486 | status = -ENOMEM; | ||
1487 | mlog_errno(status); | ||
1488 | goto bail; | ||
1489 | } | ||
1490 | ocfs2_set_new_buffer_uptodate(inode, bhs[virtual]); | ||
1491 | |||
1492 | status = ocfs2_journal_access(handle, inode, bhs[virtual], | ||
1493 | OCFS2_JOURNAL_ACCESS_CREATE); | ||
1494 | if (status < 0) { | ||
1495 | mlog_errno(status); | ||
1496 | goto bail; | ||
1497 | } | ||
1498 | |||
1499 | memset(bhs[virtual]->b_data, 0, sb->s_blocksize); | ||
1500 | |||
1501 | memcpy(bhs[virtual]->b_data, c, | ||
1502 | (bytes_left > sb->s_blocksize) ? sb->s_blocksize : | ||
1503 | bytes_left); | ||
1504 | |||
1505 | status = ocfs2_journal_dirty(handle, bhs[virtual]); | ||
1506 | if (status < 0) { | ||
1507 | mlog_errno(status); | ||
1508 | goto bail; | ||
1509 | } | ||
1510 | |||
1511 | virtual++; | ||
1512 | p_blkno++; | ||
1513 | bytes_left -= sb->s_blocksize; | ||
1514 | } | ||
1515 | |||
1516 | status = 0; | ||
1517 | bail: | ||
1518 | |||
1519 | if (bhs) { | ||
1520 | for(i = 0; i < blocks; i++) | ||
1521 | if (bhs[i]) | ||
1522 | brelse(bhs[i]); | ||
1523 | kfree(bhs); | ||
1524 | } | ||
1525 | |||
1526 | mlog_exit(status); | ||
1527 | return status; | ||
1528 | } | ||
1529 | |||
/*
 * ocfs2_symlink() - create the symbolic link 'dentry' in 'dir' whose
 * target string is 'symname'.
 *
 * A target that fits (including its NUL) within
 * ocfs2_fast_symlink_chars(sb) is stored inside the dinode itself
 * (fe->id2.i_symlink); a longer one gets one cluster reserved and is
 * written out by ocfs2_create_symlink_data().
 *
 * Returns 0 on success or a negative errno. -ENOENT is returned when
 * the parent directory's on-disk link count is already zero. On failure
 * a newly created inode, if any, is released with iput().
 */
static int ocfs2_symlink(struct inode *dir,
			 struct dentry *dentry,
			 const char *symname)
{
	/* 'l' is the target length including the terminating NUL. */
	int status, l, credits;
	u64 newsize;
	struct ocfs2_super *osb = NULL;
	struct inode *inode = NULL;
	struct super_block *sb;
	struct buffer_head *new_fe_bh = NULL;
	struct buffer_head *de_bh = NULL;
	struct buffer_head *parent_fe_bh = NULL;
	struct ocfs2_dinode *fe = NULL;
	struct ocfs2_dinode *dirfe;
	struct ocfs2_journal_handle *handle = NULL;
	struct ocfs2_alloc_context *inode_ac = NULL;
	struct ocfs2_alloc_context *data_ac = NULL;

	mlog_entry("(0x%p, 0x%p, symname='%s' actual='%.*s')\n", dir,
		   dentry, symname, dentry->d_name.len, dentry->d_name.name);

	sb = dir->i_sb;
	osb = OCFS2_SB(sb);

	l = strlen(symname) + 1;

	credits = ocfs2_calc_symlink_credits(sb);

	/* The handle is allocated first because the lock and the
	 * reservations below are made against it; the transaction is
	 * started later by ocfs2_start_trans(). */
	handle = ocfs2_alloc_handle(osb);
	if (handle == NULL) {
		status = -ENOMEM;
		mlog_errno(status);
		goto bail;
	}

	/* lock the parent directory */
	status = ocfs2_meta_lock(dir, handle, &parent_fe_bh, 1);
	if (status < 0) {
		if (status != -ENOENT)
			mlog_errno(status);
		goto bail;
	}

	dirfe = (struct ocfs2_dinode *) parent_fe_bh->b_data;
	if (!dirfe->i_links_count) {
		/* can't make a file in a deleted directory. */
		status = -ENOENT;
		goto bail;
	}

	/* Any non-zero result (e.g. the name is already taken) aborts. */
	status = ocfs2_check_dir_for_entry(dir, dentry->d_name.name,
					   dentry->d_name.len);
	if (status)
		goto bail;

	/* Find (or make) room for the new entry; de_bh is later handed
	 * to ocfs2_add_entry(). */
	status = ocfs2_prepare_dir_for_insert(osb, dir, parent_fe_bh,
					      dentry->d_name.name,
					      dentry->d_name.len, &de_bh);
	if (status < 0) {
		mlog_errno(status);
		goto bail;
	}

	/* Running out of space is an expected outcome, so -ENOSPC is not
	 * logged as an error. */
	status = ocfs2_reserve_new_inode(osb, handle, &inode_ac);
	if (status < 0) {
		if (status != -ENOSPC)
			mlog_errno(status);
		goto bail;
	}

	/* don't reserve bitmap space for fast symlinks. */
	if (l > ocfs2_fast_symlink_chars(sb)) {
		status = ocfs2_reserve_clusters(osb, handle, 1, &data_ac);
		if (status < 0) {
			if (status != -ENOSPC)
				mlog_errno(status);
			goto bail;
		}
	}

	/* On failure the local handle is cleared so 'bail' does not try
	 * to commit an ERR_PTR. */
	handle = ocfs2_start_trans(osb, handle, credits);
	if (IS_ERR(handle)) {
		status = PTR_ERR(handle);
		handle = NULL;
		mlog_errno(status);
		goto bail;
	}

	/* Create the new inode as a symlink with rwx for everyone. */
	status = ocfs2_mknod_locked(osb, dir, dentry,
				    S_IFLNK | S_IRWXUGO, 0,
				    &new_fe_bh, parent_fe_bh, handle,
				    &inode, inode_ac);
	if (status < 0) {
		mlog_errno(status);
		goto bail;
	}

	fe = (struct ocfs2_dinode *) new_fe_bh->b_data;
	inode->i_rdev = 0;
	/* i_size is the string length without the NUL. */
	newsize = l - 1;
	if (l > ocfs2_fast_symlink_chars(sb)) {
		/* Too long for the dinode: allocate one cluster for the
		 * link data. */
		inode->i_op = &ocfs2_symlink_inode_operations;
		status = ocfs2_do_extend_allocation(osb, inode, 1, new_fe_bh,
						    handle, data_ac, NULL,
						    NULL);
		if (status < 0) {
			/* Any failure other than -ENOSPC / -EINTR is
			 * logged and then reported as -ENOSPC. */
			if (status != -ENOSPC && status != -EINTR) {
				mlog(ML_ERROR, "Failed to extend file to "
					       "%"MLFu64"\n",
				     newsize);
				mlog_errno(status);
				status = -ENOSPC;
			}
			goto bail;
		}
		i_size_write(inode, newsize);
		inode->i_blocks = ocfs2_align_bytes_to_sectors(newsize);
	} else {
		/* Fast symlink: the string, NUL included, is copied
		 * straight into the dinode. */
		inode->i_op = &ocfs2_fast_symlink_inode_operations;
		memcpy((char *) fe->id2.i_symlink, symname, l);
		i_size_write(inode, newsize);
		inode->i_blocks = 0;
	}

	status = ocfs2_mark_inode_dirty(handle, inode, new_fe_bh);
	if (status < 0) {
		mlog_errno(status);
		goto bail;
	}

	if (!ocfs2_inode_is_fast_symlink(inode)) {
		status = ocfs2_create_symlink_data(osb, handle, inode,
						   symname);
		if (status < 0) {
			mlog_errno(status);
			goto bail;
		}
	}

	status = ocfs2_add_entry(handle, dentry, inode,
				 le64_to_cpu(fe->i_blkno), parent_fe_bh,
				 de_bh);
	if (status < 0) {
		mlog_errno(status);
		goto bail;
	}

	/* Only now is the inode hashed and attached to the dentry. */
	insert_inode_hash(inode);
	dentry->d_op = &ocfs2_dentry_ops;
	d_instantiate(dentry, inode);
bail:
	/* Reached with a non-NULL handle both when a transaction was
	 * started and when the handle was only allocated. */
	if (handle)
		ocfs2_commit_trans(handle);
	if (new_fe_bh)
		brelse(new_fe_bh);
	if (parent_fe_bh)
		brelse(parent_fe_bh);
	if (de_bh)
		brelse(de_bh);
	if (inode_ac)
		ocfs2_free_alloc_context(inode_ac);
	if (data_ac)
		ocfs2_free_alloc_context(data_ac);
	/* Drop the half-built inode if we created one and then failed. */
	if ((status < 0) && inode)
		iput(inode);

	mlog_exit(status);

	return status;
}
1700 | |||
1701 | int ocfs2_check_dir_entry(struct inode * dir, | ||
1702 | struct ocfs2_dir_entry * de, | ||
1703 | struct buffer_head * bh, | ||
1704 | unsigned long offset) | ||
1705 | { | ||
1706 | const char *error_msg = NULL; | ||
1707 | const int rlen = le16_to_cpu(de->rec_len); | ||
1708 | |||
1709 | if (rlen < OCFS2_DIR_REC_LEN(1)) | ||
1710 | error_msg = "rec_len is smaller than minimal"; | ||
1711 | else if (rlen % 4 != 0) | ||
1712 | error_msg = "rec_len % 4 != 0"; | ||
1713 | else if (rlen < OCFS2_DIR_REC_LEN(de->name_len)) | ||
1714 | error_msg = "rec_len is too small for name_len"; | ||
1715 | else if (((char *) de - bh->b_data) + rlen > dir->i_sb->s_blocksize) | ||
1716 | error_msg = "directory entry across blocks"; | ||
1717 | |||
1718 | if (error_msg != NULL) | ||
1719 | mlog(ML_ERROR, "bad entry in directory #%"MLFu64": %s - " | ||
1720 | "offset=%lu, inode=%"MLFu64", rec_len=%d, name_len=%d\n", | ||
1721 | OCFS2_I(dir)->ip_blkno, error_msg, offset, | ||
1722 | le64_to_cpu(de->inode), rlen, de->name_len); | ||
1723 | return error_msg == NULL ? 1 : 0; | ||
1724 | } | ||
1725 | |||
1726 | /* we don't always have a dentry for what we want to add, so people | ||
1727 | * like orphan dir can call this instead. | ||
1728 | * | ||
1729 | * If you pass me insert_bh, I'll skip the search of the other dir | ||
1730 | * blocks and put the record in there. | ||
1731 | */ | ||
1732 | static int __ocfs2_add_entry(struct ocfs2_journal_handle *handle, | ||
1733 | struct inode *dir, | ||
1734 | const char *name, int namelen, | ||
1735 | struct inode *inode, u64 blkno, | ||
1736 | struct buffer_head *parent_fe_bh, | ||
1737 | struct buffer_head *insert_bh) | ||
1738 | { | ||
1739 | unsigned long offset; | ||
1740 | unsigned short rec_len; | ||
1741 | struct ocfs2_dir_entry *de, *de1; | ||
1742 | struct super_block *sb; | ||
1743 | int retval, status; | ||
1744 | |||
1745 | mlog_entry_void(); | ||
1746 | |||
1747 | sb = dir->i_sb; | ||
1748 | |||
1749 | if (!namelen) | ||
1750 | return -EINVAL; | ||
1751 | |||
1752 | rec_len = OCFS2_DIR_REC_LEN(namelen); | ||
1753 | offset = 0; | ||
1754 | de = (struct ocfs2_dir_entry *) insert_bh->b_data; | ||
1755 | while (1) { | ||
1756 | BUG_ON((char *)de >= sb->s_blocksize + insert_bh->b_data); | ||
1757 | /* These checks should've already been passed by the | ||
1758 | * prepare function, but I guess we can leave them | ||
1759 | * here anyway. */ | ||
1760 | if (!ocfs2_check_dir_entry(dir, de, insert_bh, offset)) { | ||
1761 | retval = -ENOENT; | ||
1762 | goto bail; | ||
1763 | } | ||
1764 | if (ocfs2_match(namelen, name, de)) { | ||
1765 | retval = -EEXIST; | ||
1766 | goto bail; | ||
1767 | } | ||
1768 | if (((le64_to_cpu(de->inode) == 0) && | ||
1769 | (le16_to_cpu(de->rec_len) >= rec_len)) || | ||
1770 | (le16_to_cpu(de->rec_len) >= | ||
1771 | (OCFS2_DIR_REC_LEN(de->name_len) + rec_len))) { | ||
1772 | status = ocfs2_journal_access(handle, dir, insert_bh, | ||
1773 | OCFS2_JOURNAL_ACCESS_WRITE); | ||
1774 | /* By now the buffer is marked for journaling */ | ||
1775 | offset += le16_to_cpu(de->rec_len); | ||
1776 | if (le64_to_cpu(de->inode)) { | ||
1777 | de1 = (struct ocfs2_dir_entry *)((char *) de + | ||
1778 | OCFS2_DIR_REC_LEN(de->name_len)); | ||
1779 | de1->rec_len = | ||
1780 | cpu_to_le16(le16_to_cpu(de->rec_len) - | ||
1781 | OCFS2_DIR_REC_LEN(de->name_len)); | ||
1782 | de->rec_len = cpu_to_le16(OCFS2_DIR_REC_LEN(de->name_len)); | ||
1783 | de = de1; | ||
1784 | } | ||
1785 | de->file_type = OCFS2_FT_UNKNOWN; | ||
1786 | if (blkno) { | ||
1787 | de->inode = cpu_to_le64(blkno); | ||
1788 | ocfs2_set_de_type(de, inode->i_mode); | ||
1789 | } else | ||
1790 | de->inode = 0; | ||
1791 | de->name_len = namelen; | ||
1792 | memcpy(de->name, name, namelen); | ||
1793 | |||
1794 | dir->i_mtime = dir->i_ctime = CURRENT_TIME; | ||
1795 | dir->i_version++; | ||
1796 | status = ocfs2_journal_dirty(handle, insert_bh); | ||
1797 | retval = 0; | ||
1798 | goto bail; | ||
1799 | } | ||
1800 | offset += le16_to_cpu(de->rec_len); | ||
1801 | de = (struct ocfs2_dir_entry *) ((char *) de + le16_to_cpu(de->rec_len)); | ||
1802 | } | ||
1803 | |||
1804 | /* when you think about it, the assert above should prevent us | ||
1805 | * from ever getting here. */ | ||
1806 | retval = -ENOSPC; | ||
1807 | bail: | ||
1808 | |||
1809 | mlog_exit(retval); | ||
1810 | return retval; | ||
1811 | } | ||
1812 | |||
1813 | |||
1814 | /* | ||
1815 | * ocfs2_delete_entry deletes a directory entry by merging it with the | ||
1816 | * previous entry | ||
1817 | */ | ||
1818 | static int ocfs2_delete_entry(struct ocfs2_journal_handle *handle, | ||
1819 | struct inode *dir, | ||
1820 | struct ocfs2_dir_entry *de_del, | ||
1821 | struct buffer_head *bh) | ||
1822 | { | ||
1823 | struct ocfs2_dir_entry *de, *pde; | ||
1824 | int i, status = -ENOENT; | ||
1825 | |||
1826 | mlog_entry("(0x%p, 0x%p, 0x%p, 0x%p)\n", handle, dir, de_del, bh); | ||
1827 | |||
1828 | i = 0; | ||
1829 | pde = NULL; | ||
1830 | de = (struct ocfs2_dir_entry *) bh->b_data; | ||
1831 | while (i < bh->b_size) { | ||
1832 | if (!ocfs2_check_dir_entry(dir, de, bh, i)) { | ||
1833 | status = -EIO; | ||
1834 | mlog_errno(status); | ||
1835 | goto bail; | ||
1836 | } | ||
1837 | if (de == de_del) { | ||
1838 | status = ocfs2_journal_access(handle, dir, bh, | ||
1839 | OCFS2_JOURNAL_ACCESS_WRITE); | ||
1840 | if (status < 0) { | ||
1841 | status = -EIO; | ||
1842 | mlog_errno(status); | ||
1843 | goto bail; | ||
1844 | } | ||
1845 | if (pde) | ||
1846 | pde->rec_len = | ||
1847 | cpu_to_le16(le16_to_cpu(pde->rec_len) + | ||
1848 | le16_to_cpu(de->rec_len)); | ||
1849 | else | ||
1850 | de->inode = 0; | ||
1851 | dir->i_version++; | ||
1852 | status = ocfs2_journal_dirty(handle, bh); | ||
1853 | goto bail; | ||
1854 | } | ||
1855 | i += le16_to_cpu(de->rec_len); | ||
1856 | pde = de; | ||
1857 | de = (struct ocfs2_dir_entry *)((char *)de + le16_to_cpu(de->rec_len)); | ||
1858 | } | ||
1859 | bail: | ||
1860 | mlog_exit(status); | ||
1861 | return status; | ||
1862 | } | ||
1863 | |||
1864 | /* | ||
1865 | * Returns 0 if not found, -1 on failure, and 1 on success | ||
1866 | */ | ||
1867 | static int inline ocfs2_search_dirblock(struct buffer_head *bh, | ||
1868 | struct inode *dir, | ||
1869 | const char *name, int namelen, | ||
1870 | unsigned long offset, | ||
1871 | struct ocfs2_dir_entry **res_dir) | ||
1872 | { | ||
1873 | struct ocfs2_dir_entry *de; | ||
1874 | char *dlimit, *de_buf; | ||
1875 | int de_len; | ||
1876 | int ret = 0; | ||
1877 | |||
1878 | mlog_entry_void(); | ||
1879 | |||
1880 | de_buf = bh->b_data; | ||
1881 | dlimit = de_buf + dir->i_sb->s_blocksize; | ||
1882 | |||
1883 | while (de_buf < dlimit) { | ||
1884 | /* this code is executed quadratically often */ | ||
1885 | /* do minimal checking `by hand' */ | ||
1886 | |||
1887 | de = (struct ocfs2_dir_entry *) de_buf; | ||
1888 | |||
1889 | if (de_buf + namelen <= dlimit && | ||
1890 | ocfs2_match(namelen, name, de)) { | ||
1891 | /* found a match - just to be sure, do a full check */ | ||
1892 | if (!ocfs2_check_dir_entry(dir, de, bh, offset)) { | ||
1893 | ret = -1; | ||
1894 | goto bail; | ||
1895 | } | ||
1896 | *res_dir = de; | ||
1897 | ret = 1; | ||
1898 | goto bail; | ||
1899 | } | ||
1900 | |||
1901 | /* prevent looping on a bad block */ | ||
1902 | de_len = le16_to_cpu(de->rec_len); | ||
1903 | if (de_len <= 0) { | ||
1904 | ret = -1; | ||
1905 | goto bail; | ||
1906 | } | ||
1907 | |||
1908 | de_buf += de_len; | ||
1909 | offset += de_len; | ||
1910 | } | ||
1911 | |||
1912 | bail: | ||
1913 | mlog_exit(ret); | ||
1914 | return ret; | ||
1915 | } | ||
1916 | |||
1917 | struct buffer_head *ocfs2_find_entry(const char *name, int namelen, | ||
1918 | struct inode *dir, | ||
1919 | struct ocfs2_dir_entry **res_dir) | ||
1920 | { | ||
1921 | struct super_block *sb; | ||
1922 | struct buffer_head *bh_use[NAMEI_RA_SIZE]; | ||
1923 | struct buffer_head *bh, *ret = NULL; | ||
1924 | unsigned long start, block, b; | ||
1925 | int ra_max = 0; /* Number of bh's in the readahead | ||
1926 | buffer, bh_use[] */ | ||
1927 | int ra_ptr = 0; /* Current index into readahead | ||
1928 | buffer */ | ||
1929 | int num = 0; | ||
1930 | int nblocks, i, err; | ||
1931 | |||
1932 | mlog_entry_void(); | ||
1933 | |||
1934 | *res_dir = NULL; | ||
1935 | sb = dir->i_sb; | ||
1936 | |||
1937 | nblocks = i_size_read(dir) >> sb->s_blocksize_bits; | ||
1938 | start = OCFS2_I(dir)->ip_dir_start_lookup; | ||
1939 | if (start >= nblocks) | ||
1940 | start = 0; | ||
1941 | block = start; | ||
1942 | |||
1943 | restart: | ||
1944 | do { | ||
1945 | /* | ||
1946 | * We deal with the read-ahead logic here. | ||
1947 | */ | ||
1948 | if (ra_ptr >= ra_max) { | ||
1949 | /* Refill the readahead buffer */ | ||
1950 | ra_ptr = 0; | ||
1951 | b = block; | ||
1952 | for (ra_max = 0; ra_max < NAMEI_RA_SIZE; ra_max++) { | ||
1953 | /* | ||
1954 | * Terminate if we reach the end of the | ||
1955 | * directory and must wrap, or if our | ||
1956 | * search has finished at this block. | ||
1957 | */ | ||
1958 | if (b >= nblocks || (num && block == start)) { | ||
1959 | bh_use[ra_max] = NULL; | ||
1960 | break; | ||
1961 | } | ||
1962 | num++; | ||
1963 | |||
1964 | /* XXX: questionable readahead stuff here */ | ||
1965 | bh = ocfs2_bread(dir, b++, &err, 1); | ||
1966 | bh_use[ra_max] = bh; | ||
1967 | #if 0 // ??? | ||
1968 | if (bh) | ||
1969 | ll_rw_block(READ, 1, &bh); | ||
1970 | #endif | ||
1971 | } | ||
1972 | } | ||
1973 | if ((bh = bh_use[ra_ptr++]) == NULL) | ||
1974 | goto next; | ||
1975 | wait_on_buffer(bh); | ||
1976 | if (!buffer_uptodate(bh)) { | ||
1977 | /* read error, skip block & hope for the best */ | ||
1978 | brelse(bh); | ||
1979 | goto next; | ||
1980 | } | ||
1981 | i = ocfs2_search_dirblock(bh, dir, name, namelen, | ||
1982 | block << sb->s_blocksize_bits, | ||
1983 | res_dir); | ||
1984 | if (i == 1) { | ||
1985 | OCFS2_I(dir)->ip_dir_start_lookup = block; | ||
1986 | ret = bh; | ||
1987 | goto cleanup_and_exit; | ||
1988 | } else { | ||
1989 | brelse(bh); | ||
1990 | if (i < 0) | ||
1991 | goto cleanup_and_exit; | ||
1992 | } | ||
1993 | next: | ||
1994 | if (++block >= nblocks) | ||
1995 | block = 0; | ||
1996 | } while (block != start); | ||
1997 | |||
1998 | /* | ||
1999 | * If the directory has grown while we were searching, then | ||
2000 | * search the last part of the directory before giving up. | ||
2001 | */ | ||
2002 | block = nblocks; | ||
2003 | nblocks = i_size_read(dir) >> sb->s_blocksize_bits; | ||
2004 | if (block < nblocks) { | ||
2005 | start = 0; | ||
2006 | goto restart; | ||
2007 | } | ||
2008 | |||
2009 | cleanup_and_exit: | ||
2010 | /* Clean up the read-ahead blocks */ | ||
2011 | for (; ra_ptr < ra_max; ra_ptr++) | ||
2012 | brelse(bh_use[ra_ptr]); | ||
2013 | |||
2014 | mlog_exit_ptr(ret); | ||
2015 | return ret; | ||
2016 | } | ||
2017 | |||
2018 | static int ocfs2_blkno_stringify(u64 blkno, char *name) | ||
2019 | { | ||
2020 | int status, namelen; | ||
2021 | |||
2022 | mlog_entry_void(); | ||
2023 | |||
2024 | namelen = snprintf(name, OCFS2_ORPHAN_NAMELEN + 1, "%016"MLFx64, | ||
2025 | blkno); | ||
2026 | if (namelen <= 0) { | ||
2027 | if (namelen) | ||
2028 | status = namelen; | ||
2029 | else | ||
2030 | status = -EINVAL; | ||
2031 | mlog_errno(status); | ||
2032 | goto bail; | ||
2033 | } | ||
2034 | if (namelen != OCFS2_ORPHAN_NAMELEN) { | ||
2035 | status = -EINVAL; | ||
2036 | mlog_errno(status); | ||
2037 | goto bail; | ||
2038 | } | ||
2039 | |||
2040 | mlog(0, "built filename '%s' for orphan dir (len=%d)\n", name, | ||
2041 | namelen); | ||
2042 | |||
2043 | status = 0; | ||
2044 | bail: | ||
2045 | mlog_exit(status); | ||
2046 | return status; | ||
2047 | } | ||
2048 | |||
2049 | static int ocfs2_prepare_orphan_dir(struct ocfs2_super *osb, | ||
2050 | struct ocfs2_journal_handle *handle, | ||
2051 | struct inode *inode, | ||
2052 | char *name, | ||
2053 | struct buffer_head **de_bh) | ||
2054 | { | ||
2055 | struct inode *orphan_dir_inode = NULL; | ||
2056 | struct buffer_head *orphan_dir_bh = NULL; | ||
2057 | int status = 0; | ||
2058 | |||
2059 | status = ocfs2_blkno_stringify(OCFS2_I(inode)->ip_blkno, name); | ||
2060 | if (status < 0) { | ||
2061 | mlog_errno(status); | ||
2062 | goto leave; | ||
2063 | } | ||
2064 | |||
2065 | orphan_dir_inode = ocfs2_get_system_file_inode(osb, | ||
2066 | ORPHAN_DIR_SYSTEM_INODE, | ||
2067 | osb->slot_num); | ||
2068 | if (!orphan_dir_inode) { | ||
2069 | status = -ENOENT; | ||
2070 | mlog_errno(status); | ||
2071 | goto leave; | ||
2072 | } | ||
2073 | |||
2074 | ocfs2_handle_add_inode(handle, orphan_dir_inode); | ||
2075 | status = ocfs2_meta_lock(orphan_dir_inode, handle, &orphan_dir_bh, 1); | ||
2076 | if (status < 0) { | ||
2077 | mlog_errno(status); | ||
2078 | goto leave; | ||
2079 | } | ||
2080 | |||
2081 | status = ocfs2_prepare_dir_for_insert(osb, orphan_dir_inode, | ||
2082 | orphan_dir_bh, name, | ||
2083 | OCFS2_ORPHAN_NAMELEN, de_bh); | ||
2084 | if (status < 0) { | ||
2085 | mlog_errno(status); | ||
2086 | goto leave; | ||
2087 | } | ||
2088 | |||
2089 | leave: | ||
2090 | if (orphan_dir_inode) | ||
2091 | iput(orphan_dir_inode); | ||
2092 | |||
2093 | if (orphan_dir_bh) | ||
2094 | brelse(orphan_dir_bh); | ||
2095 | |||
2096 | mlog_exit(status); | ||
2097 | return status; | ||
2098 | } | ||
2099 | |||
2100 | static int ocfs2_orphan_add(struct ocfs2_super *osb, | ||
2101 | struct ocfs2_journal_handle *handle, | ||
2102 | struct inode *inode, | ||
2103 | struct ocfs2_dinode *fe, | ||
2104 | char *name, | ||
2105 | struct buffer_head *de_bh) | ||
2106 | { | ||
2107 | struct inode *orphan_dir_inode = NULL; | ||
2108 | struct buffer_head *orphan_dir_bh = NULL; | ||
2109 | int status = 0; | ||
2110 | struct ocfs2_dinode *orphan_fe; | ||
2111 | |||
2112 | mlog_entry("(inode->i_ino = %lu)\n", inode->i_ino); | ||
2113 | |||
2114 | orphan_dir_inode = ocfs2_get_system_file_inode(osb, | ||
2115 | ORPHAN_DIR_SYSTEM_INODE, | ||
2116 | osb->slot_num); | ||
2117 | if (!orphan_dir_inode) { | ||
2118 | status = -ENOENT; | ||
2119 | mlog_errno(status); | ||
2120 | goto leave; | ||
2121 | } | ||
2122 | |||
2123 | status = ocfs2_read_block(osb, | ||
2124 | OCFS2_I(orphan_dir_inode)->ip_blkno, | ||
2125 | &orphan_dir_bh, OCFS2_BH_CACHED, | ||
2126 | orphan_dir_inode); | ||
2127 | if (status < 0) { | ||
2128 | mlog_errno(status); | ||
2129 | goto leave; | ||
2130 | } | ||
2131 | |||
2132 | status = ocfs2_journal_access(handle, orphan_dir_inode, orphan_dir_bh, | ||
2133 | OCFS2_JOURNAL_ACCESS_WRITE); | ||
2134 | if (status < 0) { | ||
2135 | mlog_errno(status); | ||
2136 | goto leave; | ||
2137 | } | ||
2138 | |||
2139 | /* we're a cluster, and nlink can change on disk from | ||
2140 | * underneath us... */ | ||
2141 | orphan_fe = (struct ocfs2_dinode *) orphan_dir_bh->b_data; | ||
2142 | if (S_ISDIR(inode->i_mode)) | ||
2143 | le16_add_cpu(&orphan_fe->i_links_count, 1); | ||
2144 | orphan_dir_inode->i_nlink = le16_to_cpu(orphan_fe->i_links_count); | ||
2145 | |||
2146 | status = ocfs2_journal_dirty(handle, orphan_dir_bh); | ||
2147 | if (status < 0) { | ||
2148 | mlog_errno(status); | ||
2149 | goto leave; | ||
2150 | } | ||
2151 | |||
2152 | status = __ocfs2_add_entry(handle, orphan_dir_inode, name, | ||
2153 | OCFS2_ORPHAN_NAMELEN, inode, | ||
2154 | OCFS2_I(inode)->ip_blkno, | ||
2155 | orphan_dir_bh, de_bh); | ||
2156 | if (status < 0) { | ||
2157 | mlog_errno(status); | ||
2158 | goto leave; | ||
2159 | } | ||
2160 | |||
2161 | le32_add_cpu(&fe->i_flags, OCFS2_ORPHANED_FL); | ||
2162 | |||
2163 | /* Record which orphan dir our inode now resides | ||
2164 | * in. delete_inode will use this to determine which orphan | ||
2165 | * dir to lock. */ | ||
2166 | spin_lock(&OCFS2_I(inode)->ip_lock); | ||
2167 | OCFS2_I(inode)->ip_orphaned_slot = osb->slot_num; | ||
2168 | spin_unlock(&OCFS2_I(inode)->ip_lock); | ||
2169 | |||
2170 | mlog(0, "Inode %"MLFu64" orphaned in slot %d\n", | ||
2171 | OCFS2_I(inode)->ip_blkno, osb->slot_num); | ||
2172 | |||
2173 | leave: | ||
2174 | if (orphan_dir_inode) | ||
2175 | iput(orphan_dir_inode); | ||
2176 | |||
2177 | if (orphan_dir_bh) | ||
2178 | brelse(orphan_dir_bh); | ||
2179 | |||
2180 | mlog_exit(status); | ||
2181 | return status; | ||
2182 | } | ||
2183 | |||
2184 | /* unlike orphan_add, we expect the orphan dir to already be locked here. */ | ||
2185 | int ocfs2_orphan_del(struct ocfs2_super *osb, | ||
2186 | struct ocfs2_journal_handle *handle, | ||
2187 | struct inode *orphan_dir_inode, | ||
2188 | struct inode *inode, | ||
2189 | struct buffer_head *orphan_dir_bh) | ||
2190 | { | ||
2191 | char name[OCFS2_ORPHAN_NAMELEN + 1]; | ||
2192 | struct ocfs2_dinode *orphan_fe; | ||
2193 | int status = 0; | ||
2194 | struct buffer_head *target_de_bh = NULL; | ||
2195 | struct ocfs2_dir_entry *target_de = NULL; | ||
2196 | |||
2197 | mlog_entry_void(); | ||
2198 | |||
2199 | status = ocfs2_blkno_stringify(OCFS2_I(inode)->ip_blkno, name); | ||
2200 | if (status < 0) { | ||
2201 | mlog_errno(status); | ||
2202 | goto leave; | ||
2203 | } | ||
2204 | |||
2205 | mlog(0, "removing '%s' from orphan dir %"MLFu64" (namelen=%d)\n", | ||
2206 | name, OCFS2_I(orphan_dir_inode)->ip_blkno, OCFS2_ORPHAN_NAMELEN); | ||
2207 | |||
2208 | /* find it's spot in the orphan directory */ | ||
2209 | target_de_bh = ocfs2_find_entry(name, OCFS2_ORPHAN_NAMELEN, | ||
2210 | orphan_dir_inode, &target_de); | ||
2211 | if (!target_de_bh) { | ||
2212 | status = -ENOENT; | ||
2213 | mlog_errno(status); | ||
2214 | goto leave; | ||
2215 | } | ||
2216 | |||
2217 | /* remove it from the orphan directory */ | ||
2218 | status = ocfs2_delete_entry(handle, orphan_dir_inode, target_de, | ||
2219 | target_de_bh); | ||
2220 | if (status < 0) { | ||
2221 | mlog_errno(status); | ||
2222 | goto leave; | ||
2223 | } | ||
2224 | |||
2225 | status = ocfs2_journal_access(handle,orphan_dir_inode, orphan_dir_bh, | ||
2226 | OCFS2_JOURNAL_ACCESS_WRITE); | ||
2227 | if (status < 0) { | ||
2228 | mlog_errno(status); | ||
2229 | goto leave; | ||
2230 | } | ||
2231 | |||
2232 | /* do the i_nlink dance! :) */ | ||
2233 | orphan_fe = (struct ocfs2_dinode *) orphan_dir_bh->b_data; | ||
2234 | if (S_ISDIR(inode->i_mode)) | ||
2235 | le16_add_cpu(&orphan_fe->i_links_count, -1); | ||
2236 | orphan_dir_inode->i_nlink = le16_to_cpu(orphan_fe->i_links_count); | ||
2237 | |||
2238 | status = ocfs2_journal_dirty(handle, orphan_dir_bh); | ||
2239 | if (status < 0) { | ||
2240 | mlog_errno(status); | ||
2241 | goto leave; | ||
2242 | } | ||
2243 | |||
2244 | leave: | ||
2245 | if (target_de_bh) | ||
2246 | brelse(target_de_bh); | ||
2247 | |||
2248 | mlog_exit(status); | ||
2249 | return status; | ||
2250 | } | ||
2251 | |||
2252 | struct inode_operations ocfs2_dir_iops = { | ||
2253 | .create = ocfs2_create, | ||
2254 | .lookup = ocfs2_lookup, | ||
2255 | .link = ocfs2_link, | ||
2256 | .unlink = ocfs2_unlink, | ||
2257 | .rmdir = ocfs2_unlink, | ||
2258 | .symlink = ocfs2_symlink, | ||
2259 | .mkdir = ocfs2_mkdir, | ||
2260 | .mknod = ocfs2_mknod, | ||
2261 | .rename = ocfs2_rename, | ||
2262 | .setattr = ocfs2_setattr, | ||
2263 | .getattr = ocfs2_getattr, | ||
2264 | }; | ||
diff --git a/fs/ocfs2/namei.h b/fs/ocfs2/namei.h new file mode 100644 index 000000000000..deaaa97dbf0b --- /dev/null +++ b/fs/ocfs2/namei.h | |||
@@ -0,0 +1,58 @@ | |||
1 | /* -*- mode: c; c-basic-offset: 8; -*- | ||
2 | * vim: noexpandtab sw=8 ts=8 sts=0: | ||
3 | * | ||
4 | * namei.h | ||
5 | * | ||
6 | * Function prototypes | ||
7 | * | ||
8 | * Copyright (C) 2002, 2004 Oracle. All rights reserved. | ||
9 | * | ||
10 | * This program is free software; you can redistribute it and/or | ||
11 | * modify it under the terms of the GNU General Public | ||
12 | * License as published by the Free Software Foundation; either | ||
13 | * version 2 of the License, or (at your option) any later version. | ||
14 | * | ||
15 | * This program is distributed in the hope that it will be useful, | ||
16 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
17 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | ||
18 | * General Public License for more details. | ||
19 | * | ||
20 | * You should have received a copy of the GNU General Public | ||
21 | * License along with this program; if not, write to the | ||
22 | * Free Software Foundation, Inc., 59 Temple Place - Suite 330, | ||
23 | * Boston, MA 021110-1307, USA. | ||
24 | */ | ||
25 | |||
26 | #ifndef OCFS2_NAMEI_H | ||
27 | #define OCFS2_NAMEI_H | ||
28 | |||
/* Inode operation table named ocfs2_dir_iops. */
extern struct inode_operations ocfs2_dir_iops;

struct dentry *ocfs2_get_parent(struct dentry *child);

/* Returns 1 when the directory entry de passes validation, 0 otherwise. */
int ocfs2_check_dir_entry(struct inode *dir,
			  struct ocfs2_dir_entry *de,
			  struct buffer_head *bh,
			  unsigned long offset);

/* Returns the buffer holding the matching entry and sets *res_dir,
 * or returns NULL when the name is not found. */
struct buffer_head *ocfs2_find_entry(const char *name, int namelen,
				     struct inode *dir,
				     struct ocfs2_dir_entry **res_dir);

/* Returns 0 on success or a negative error. */
int ocfs2_orphan_del(struct ocfs2_super *osb,
		     struct ocfs2_journal_handle *handle,
		     struct inode *orphan_dir_inode,
		     struct inode *inode,
		     struct buffer_head *orphan_dir_bh);
46 | |||
47 | static inline int ocfs2_match(int len, | ||
48 | const char * const name, | ||
49 | struct ocfs2_dir_entry *de) | ||
50 | { | ||
51 | if (len != de->name_len) | ||
52 | return 0; | ||
53 | if (!de->inode) | ||
54 | return 0; | ||
55 | return !memcmp(name, de->name, len); | ||
56 | } | ||
57 | |||
58 | #endif /* OCFS2_NAMEI_H */ | ||
diff --git a/fs/ocfs2/ocfs1_fs_compat.h b/fs/ocfs2/ocfs1_fs_compat.h new file mode 100644 index 000000000000..0b499bccec5a --- /dev/null +++ b/fs/ocfs2/ocfs1_fs_compat.h | |||
@@ -0,0 +1,109 @@ | |||
1 | /* -*- mode: c; c-basic-offset: 8; -*- | ||
2 | * vim: noexpandtab sw=8 ts=8 sts=0: | ||
3 | * | ||
4 | * ocfs1_fs_compat.h | ||
5 | * | ||
6 | * OCFS1 volume header definitions. OCFS2 creates valid but unmountable | ||
7 | * OCFS1 volume headers on the first two sectors of an OCFS2 volume. | ||
8 | * This allows an OCFS1 volume to see the partition and cleanly fail to | ||
9 | * mount it. | ||
10 | * | ||
11 | * Copyright (C) 2002, 2004 Oracle. All rights reserved. | ||
12 | * | ||
13 | * This program is free software; you can redistribute it and/or | ||
14 | * modify it under the terms of the GNU General Public | ||
15 | * License, version 2, as published by the Free Software Foundation. | ||
16 | * | ||
17 | * This program is distributed in the hope that it will be useful, | ||
18 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
19 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | ||
20 | * General Public License for more details. | ||
21 | * | ||
22 | * You should have received a copy of the GNU General Public | ||
23 | * License along with this program; if not, write to the | ||
24 | * Free Software Foundation, Inc., 59 Temple Place - Suite 330, | ||
25 | * Boston, MA 021110-1307, USA. | ||
26 | */ | ||
27 | |||
28 | #ifndef _OCFS1_FS_COMPAT_H | ||
29 | #define _OCFS1_FS_COMPAT_H | ||
30 | |||
/* Sizes, in bytes, of the fixed-length byte arrays in the structures below. */
#define OCFS1_MAX_VOL_SIGNATURE_LEN	128
#define OCFS1_MAX_MOUNT_POINT_LEN	128
#define OCFS1_MAX_VOL_ID_LENGTH		16
#define OCFS1_MAX_VOL_LABEL_LEN		64
#define OCFS1_MAX_CLUSTER_NAME_LEN	64

/* Version numbers and signature string defined for the OCFS1 header. */
#define OCFS1_MAJOR_VERSION		(2)
#define OCFS1_MINOR_VERSION		(0)
#define OCFS1_VOLUME_SIGNATURE		"OracleCFS"
40 | |||
41 | /* | ||
42 | * OCFS1 superblock. Lives at sector 0. | ||
43 | */ | ||
44 | struct ocfs1_vol_disk_hdr | ||
45 | { | ||
46 | /*00*/ __u32 minor_version; | ||
47 | __u32 major_version; | ||
48 | /*08*/ __u8 signature[OCFS1_MAX_VOL_SIGNATURE_LEN]; | ||
49 | /*88*/ __u8 mount_point[OCFS1_MAX_MOUNT_POINT_LEN]; | ||
50 | /*108*/ __u64 serial_num; | ||
51 | /*110*/ __u64 device_size; | ||
52 | __u64 start_off; | ||
53 | /*120*/ __u64 bitmap_off; | ||
54 | __u64 publ_off; | ||
55 | /*130*/ __u64 vote_off; | ||
56 | __u64 root_bitmap_off; | ||
57 | /*140*/ __u64 data_start_off; | ||
58 | __u64 root_bitmap_size; | ||
59 | /*150*/ __u64 root_off; | ||
60 | __u64 root_size; | ||
61 | /*160*/ __u64 cluster_size; | ||
62 | __u64 num_nodes; | ||
63 | /*170*/ __u64 num_clusters; | ||
64 | __u64 dir_node_size; | ||
65 | /*180*/ __u64 file_node_size; | ||
66 | __u64 internal_off; | ||
67 | /*190*/ __u64 node_cfg_off; | ||
68 | __u64 node_cfg_size; | ||
69 | /*1A0*/ __u64 new_cfg_off; | ||
70 | __u32 prot_bits; | ||
71 | __s32 excl_mount; | ||
72 | /*1B0*/ | ||
73 | }; | ||
74 | |||
75 | |||
76 | struct ocfs1_disk_lock | ||
77 | { | ||
78 | /*00*/ __u32 curr_master; | ||
79 | __u8 file_lock; | ||
80 | __u8 compat_pad[3]; /* Not in orignal definition. Used to | ||
81 | make the already existing alignment | ||
82 | explicit */ | ||
83 | __u64 last_write_time; | ||
84 | /*10*/ __u64 last_read_time; | ||
85 | __u32 writer_node_num; | ||
86 | __u32 reader_node_num; | ||
87 | /*20*/ __u64 oin_node_map; | ||
88 | __u64 dlock_seq_num; | ||
89 | /*30*/ | ||
90 | }; | ||
91 | |||
92 | /* | ||
93 | * OCFS1 volume label. Lives at sector 1. | ||
94 | */ | ||
95 | struct ocfs1_vol_label | ||
96 | { | ||
97 | /*00*/ struct ocfs1_disk_lock disk_lock; | ||
98 | /*30*/ __u8 label[OCFS1_MAX_VOL_LABEL_LEN]; | ||
99 | /*70*/ __u16 label_len; | ||
100 | /*72*/ __u8 vol_id[OCFS1_MAX_VOL_ID_LENGTH]; | ||
101 | /*82*/ __u16 vol_id_len; | ||
102 | /*84*/ __u8 cluster_name[OCFS1_MAX_CLUSTER_NAME_LEN]; | ||
103 | /*A4*/ __u16 cluster_name_len; | ||
104 | /*A6*/ | ||
105 | }; | ||
106 | |||
107 | |||
108 | #endif /* _OCFS1_FS_COMPAT_H */ | ||
109 | |||
diff --git a/fs/ocfs2/ocfs2.h b/fs/ocfs2/ocfs2.h new file mode 100644 index 000000000000..f468c600cf92 --- /dev/null +++ b/fs/ocfs2/ocfs2.h | |||
@@ -0,0 +1,464 @@ | |||
1 | /* -*- mode: c; c-basic-offset: 8; -*- | ||
2 | * vim: noexpandtab sw=8 ts=8 sts=0: | ||
3 | * | ||
4 | * ocfs2.h | ||
5 | * | ||
6 | * Defines macros and structures used in OCFS2 | ||
7 | * | ||
8 | * Copyright (C) 2002, 2004 Oracle. All rights reserved. | ||
9 | * | ||
10 | * This program is free software; you can redistribute it and/or | ||
11 | * modify it under the terms of the GNU General Public | ||
12 | * License as published by the Free Software Foundation; either | ||
13 | * version 2 of the License, or (at your option) any later version. | ||
14 | * | ||
15 | * This program is distributed in the hope that it will be useful, | ||
16 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
17 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | ||
18 | * General Public License for more details. | ||
19 | * | ||
20 | * You should have received a copy of the GNU General Public | ||
21 | * License along with this program; if not, write to the | ||
22 | * Free Software Foundation, Inc., 59 Temple Place - Suite 330, | ||
23 | * Boston, MA 02111-1307, USA. | ||
24 | */ | ||
25 | |||
26 | #ifndef OCFS2_H | ||
27 | #define OCFS2_H | ||
28 | |||
29 | #include <linux/spinlock.h> | ||
30 | #include <linux/sched.h> | ||
31 | #include <linux/wait.h> | ||
32 | #include <linux/list.h> | ||
33 | #include <linux/rbtree.h> | ||
34 | #include <linux/workqueue.h> | ||
35 | #include <linux/kref.h> | ||
36 | |||
37 | #include "cluster/nodemanager.h" | ||
38 | #include "cluster/heartbeat.h" | ||
39 | #include "cluster/tcp.h" | ||
40 | |||
41 | #include "dlm/dlmapi.h" | ||
42 | |||
43 | #include "ocfs2_fs.h" | ||
44 | #include "endian.h" | ||
45 | #include "ocfs2_lockid.h" | ||
46 | |||
47 | struct ocfs2_extent_map { | ||
48 | u32 em_clusters; | ||
49 | struct rb_root em_extents; | ||
50 | }; | ||
51 | |||
52 | /* Most user visible OCFS2 inodes will have very few pieces of | ||
53 | * metadata, but larger files (including bitmaps, etc) must be taken | ||
54 | * into account when designing an access scheme. We allow a small | ||
55 | * amount of inlined blocks to be stored on an array and grow the | ||
56 | * structure into a rb tree when necessary. */ | ||
57 | #define OCFS2_INODE_MAX_CACHE_ARRAY 2 | ||
58 | |||
59 | struct ocfs2_caching_info { | ||
60 | unsigned int ci_num_cached; | ||
61 | union { | ||
62 | sector_t ci_array[OCFS2_INODE_MAX_CACHE_ARRAY]; | ||
63 | struct rb_root ci_tree; | ||
64 | } ci_cache; | ||
65 | }; | ||
66 | |||
67 | /* this limits us to 256 nodes | ||
68 | * if we need more, we can do a kmalloc for the map */ | ||
69 | #define OCFS2_NODE_MAP_MAX_NODES 256 | ||
70 | struct ocfs2_node_map { | ||
71 | u16 num_nodes; | ||
72 | unsigned long map[BITS_TO_LONGS(OCFS2_NODE_MAP_MAX_NODES)]; /* fixed-size bitmap with room for OCFS2_NODE_MAP_MAX_NODES bits */ | ||
73 | }; | ||
74 | |||
75 | enum ocfs2_ast_action { | ||
76 | OCFS2_AST_INVALID = 0, | ||
77 | OCFS2_AST_ATTACH, | ||
78 | OCFS2_AST_CONVERT, | ||
79 | OCFS2_AST_DOWNCONVERT, | ||
80 | }; | ||
81 | |||
82 | /* actions for an unlockast function to take. */ | ||
83 | enum ocfs2_unlock_action { | ||
84 | OCFS2_UNLOCK_INVALID = 0, | ||
85 | OCFS2_UNLOCK_CANCEL_CONVERT, | ||
86 | OCFS2_UNLOCK_DROP_LOCK, | ||
87 | }; | ||
88 | |||
89 | /* ocfs2_lock_res->l_flags flags. */ | ||
90 | #define OCFS2_LOCK_ATTACHED (0x00000001) /* have we initialized | ||
91 | * the lvb */ | ||
92 | #define OCFS2_LOCK_BUSY (0x00000002) /* we are currently in | ||
93 | * dlm_lock */ | ||
94 | #define OCFS2_LOCK_BLOCKED (0x00000004) /* blocked waiting to | ||
95 | * downconvert*/ | ||
96 | #define OCFS2_LOCK_LOCAL (0x00000008) /* newly created inode */ | ||
97 | #define OCFS2_LOCK_NEEDS_REFRESH (0x00000010) | ||
98 | #define OCFS2_LOCK_REFRESHING (0x00000020) | ||
99 | #define OCFS2_LOCK_INITIALIZED (0x00000040) /* track initialization | ||
100 | * for shutdown paths */ | ||
101 | #define OCFS2_LOCK_FREEING (0x00000080) /* help dlmglue track | ||
102 | * when to skip queueing | ||
103 | * a lock because it's | ||
104 | * about to be | ||
105 | * dropped. */ | ||
106 | #define OCFS2_LOCK_QUEUED (0x00000100) /* queued for downconvert */ | ||
107 | |||
108 | struct ocfs2_lock_res_ops; | ||
109 | |||
110 | typedef void (*ocfs2_lock_callback)(int status, unsigned long data); | ||
111 | |||
112 | struct ocfs2_lock_res { | ||
113 | void *l_priv; | ||
114 | struct ocfs2_lock_res_ops *l_ops; | ||
115 | spinlock_t l_lock; | ||
116 | |||
117 | struct list_head l_blocked_list; | ||
118 | struct list_head l_mask_waiters; | ||
119 | |||
120 | enum ocfs2_lock_type l_type; | ||
121 | unsigned long l_flags; | ||
122 | char l_name[OCFS2_LOCK_ID_MAX_LEN]; | ||
123 | int l_level; | ||
124 | unsigned int l_ro_holders; | ||
125 | unsigned int l_ex_holders; | ||
126 | struct dlm_lockstatus l_lksb; | ||
127 | |||
128 | /* used from AST/BAST funcs. */ | ||
129 | enum ocfs2_ast_action l_action; | ||
130 | enum ocfs2_unlock_action l_unlock_action; | ||
131 | int l_requested; | ||
132 | int l_blocking; | ||
133 | |||
134 | wait_queue_head_t l_event; | ||
135 | |||
136 | struct list_head l_debug_list; | ||
137 | }; | ||
138 | |||
139 | struct ocfs2_dlm_debug { | ||
140 | struct kref d_refcnt; | ||
141 | struct dentry *d_locking_state; | ||
142 | struct list_head d_lockres_tracking; | ||
143 | }; | ||
144 | |||
145 | enum ocfs2_vol_state | ||
146 | { | ||
147 | VOLUME_INIT = 0, | ||
148 | VOLUME_MOUNTED, | ||
149 | VOLUME_DISMOUNTED, | ||
150 | VOLUME_DISABLED | ||
151 | }; | ||
152 | |||
153 | struct ocfs2_alloc_stats | ||
154 | { | ||
155 | atomic_t moves; | ||
156 | atomic_t local_data; | ||
157 | atomic_t bitmap_data; | ||
158 | atomic_t bg_allocs; | ||
159 | atomic_t bg_extends; | ||
160 | }; | ||
161 | |||
162 | enum ocfs2_local_alloc_state | ||
163 | { | ||
164 | OCFS2_LA_UNUSED = 0, | ||
165 | OCFS2_LA_ENABLED, | ||
166 | OCFS2_LA_DISABLED | ||
167 | }; | ||
168 | |||
169 | enum ocfs2_mount_options | ||
170 | { | ||
171 | OCFS2_MOUNT_HB_LOCAL = 1 << 0, /* Heartbeat started in local mode */ | ||
172 | OCFS2_MOUNT_BARRIER = 1 << 1, /* Use block barriers */ | ||
173 | OCFS2_MOUNT_NOINTR = 1 << 2, /* Don't catch signals */ | ||
174 | OCFS2_MOUNT_ERRORS_PANIC = 1 << 3, /* Panic on errors */ | ||
175 | OCFS2_MOUNT_DATA_WRITEBACK = 1 << 4, /* No data ordering */ | ||
176 | #ifdef OCFS2_ORACORE_WORKAROUNDS | ||
177 | OCFS2_MOUNT_COMPAT_OCFS = 1 << 30, /* ocfs1 compatibility mode */ | ||
178 | #endif | ||
179 | }; | ||
180 | |||
181 | #define OCFS2_OSB_SOFT_RO 0x0001 | ||
182 | #define OCFS2_OSB_HARD_RO 0x0002 | ||
183 | #define OCFS2_OSB_ERROR_FS 0x0004 | ||
184 | |||
185 | struct ocfs2_journal; | ||
186 | struct ocfs2_journal_handle; | ||
187 | struct ocfs2_super | ||
188 | { | ||
189 | u32 osb_id; /* id used by the proc interface */ | ||
190 | struct task_struct *commit_task; | ||
191 | struct super_block *sb; | ||
192 | struct inode *root_inode; | ||
193 | struct inode *sys_root_inode; | ||
194 | struct inode *system_inodes[NUM_SYSTEM_INODES]; | ||
195 | |||
196 | struct ocfs2_slot_info *slot_info; | ||
197 | |||
198 | spinlock_t node_map_lock; | ||
199 | struct ocfs2_node_map mounted_map; | ||
200 | struct ocfs2_node_map recovery_map; | ||
201 | struct ocfs2_node_map umount_map; | ||
202 | |||
203 | u32 num_clusters; | ||
204 | u64 root_blkno; | ||
205 | u64 system_dir_blkno; | ||
206 | u64 bitmap_blkno; | ||
207 | u32 bitmap_cpg; | ||
208 | u8 *uuid; | ||
209 | char *uuid_str; | ||
210 | u8 *vol_label; | ||
211 | u64 first_cluster_group_blkno; | ||
212 | u32 fs_generation; | ||
213 | |||
214 | u32 s_feature_compat; | ||
215 | u32 s_feature_incompat; | ||
216 | u32 s_feature_ro_compat; | ||
217 | |||
218 | /* Protects s_next_generation, osb_flags. Could protect more on | ||
219 | * osb as it's very short lived. */ | ||
220 | spinlock_t osb_lock; | ||
221 | u32 s_next_generation; | ||
222 | unsigned long osb_flags; | ||
223 | |||
224 | unsigned long s_mount_opt; | ||
225 | |||
226 | u16 max_slots; | ||
227 | u16 num_nodes; | ||
228 | s16 node_num; | ||
229 | s16 slot_num; | ||
230 | int s_sectsize_bits; | ||
231 | int s_clustersize; | ||
232 | int s_clustersize_bits; | ||
233 | struct proc_dir_entry *proc_sub_dir; /* points to /proc/fs/ocfs2/<maj_min> */ | ||
234 | |||
235 | atomic_t vol_state; | ||
236 | struct semaphore recovery_lock; | ||
237 | struct task_struct *recovery_thread_task; | ||
238 | int disable_recovery; | ||
239 | wait_queue_head_t checkpoint_event; | ||
240 | atomic_t needs_checkpoint; | ||
241 | struct ocfs2_journal *journal; | ||
242 | |||
243 | enum ocfs2_local_alloc_state local_alloc_state; | ||
244 | struct buffer_head *local_alloc_bh; | ||
245 | |||
246 | /* Next two fields are for local node slot recovery during | ||
247 | * mount. */ | ||
248 | int dirty; | ||
249 | struct ocfs2_dinode *local_alloc_copy; | ||
250 | |||
251 | struct ocfs2_alloc_stats alloc_stats; | ||
252 | char dev_str[20]; /* "major,minor" of the device */ | ||
253 | |||
254 | struct dlm_ctxt *dlm; | ||
255 | struct ocfs2_lock_res osb_super_lockres; | ||
256 | struct ocfs2_lock_res osb_rename_lockres; | ||
257 | struct dlm_eviction_cb osb_eviction_cb; | ||
258 | struct ocfs2_dlm_debug *osb_dlm_debug; | ||
259 | |||
260 | struct dentry *osb_debug_root; | ||
261 | |||
262 | wait_queue_head_t recovery_event; | ||
263 | |||
264 | spinlock_t vote_task_lock; | ||
265 | struct task_struct *vote_task; | ||
266 | wait_queue_head_t vote_event; | ||
267 | unsigned long vote_wake_sequence; | ||
268 | unsigned long vote_work_sequence; | ||
269 | |||
270 | struct list_head blocked_lock_list; | ||
271 | unsigned long blocked_lock_count; | ||
272 | |||
273 | struct list_head vote_list; | ||
274 | int vote_count; | ||
275 | |||
276 | u32 net_key; | ||
277 | spinlock_t net_response_lock; | ||
278 | unsigned int net_response_ids; | ||
279 | struct list_head net_response_list; | ||
280 | |||
281 | struct o2hb_callback_func osb_hb_up; | ||
282 | struct o2hb_callback_func osb_hb_down; | ||
283 | |||
284 | struct list_head osb_net_handlers; | ||
285 | |||
286 | wait_queue_head_t osb_mount_event; | ||
287 | |||
288 | /* Truncate log info */ | ||
289 | struct inode *osb_tl_inode; | ||
290 | struct buffer_head *osb_tl_bh; | ||
291 | struct work_struct osb_truncate_log_wq; | ||
292 | }; | ||
293 | |||
294 | #define OCFS2_SB(sb) ((struct ocfs2_super *)(sb)->s_fs_info) | ||
295 | #define OCFS2_MAX_OSB_ID 65536 | ||
296 | |||
297 | static inline int ocfs2_should_order_data(struct inode *inode) /* returns 1 for a regular file unless the mount has OCFS2_MOUNT_DATA_WRITEBACK set, otherwise 0 */ | ||
298 | { | ||
299 | if (!S_ISREG(inode->i_mode)) /* anything that is not a regular file: no */ | ||
300 | return 0; | ||
301 | if (OCFS2_SB(inode->i_sb)->s_mount_opt & OCFS2_MOUNT_DATA_WRITEBACK) /* "No data ordering" mount option */ | ||
302 | return 0; | ||
303 | return 1; | ||
304 | } | ||
305 | |||
306 | /* Set / clear helpers: cluster events can trigger these in | ||
307 | * parallel, so the transitions must be atomic. This also means | ||
308 | * that any future osb_flags bits must be protected by the | ||
309 | * spinlock too! */ | ||
310 | static inline void ocfs2_set_osb_flag(struct ocfs2_super *osb, | ||
311 | unsigned long flag) /* flag: bit(s) to OR into osb->osb_flags */ | ||
312 | { | ||
313 | spin_lock(&osb->osb_lock); /* the read-modify-write of osb_flags happens under osb_lock */ | ||
314 | osb->osb_flags |= flag; | ||
315 | spin_unlock(&osb->osb_lock); | ||
316 | } | ||
317 | |||
318 | static inline void ocfs2_set_ro_flag(struct ocfs2_super *osb, | ||
319 | int hard) /* nonzero selects OCFS2_OSB_HARD_RO, zero selects OCFS2_OSB_SOFT_RO */ | ||
320 | { | ||
321 | spin_lock(&osb->osb_lock); | ||
322 | osb->osb_flags &= ~(OCFS2_OSB_SOFT_RO|OCFS2_OSB_HARD_RO); /* clear both bits first so exactly one is set afterwards */ | ||
323 | if (hard) | ||
324 | osb->osb_flags |= OCFS2_OSB_HARD_RO; | ||
325 | else | ||
326 | osb->osb_flags |= OCFS2_OSB_SOFT_RO; | ||
327 | spin_unlock(&osb->osb_lock); | ||
328 | } | ||
329 | |||
330 | static inline int ocfs2_is_hard_readonly(struct ocfs2_super *osb) /* nonzero if OCFS2_OSB_HARD_RO is set in osb_flags */ | ||
331 | { | ||
332 | int ret; | ||
333 | |||
334 | spin_lock(&osb->osb_lock); | ||
335 | ret = osb->osb_flags & OCFS2_OSB_HARD_RO; /* the masked bit itself, not normalized to 0/1 */ | ||
336 | spin_unlock(&osb->osb_lock); | ||
337 | |||
338 | return ret; | ||
339 | } | ||
340 | |||
341 | static inline int ocfs2_is_soft_readonly(struct ocfs2_super *osb) /* nonzero if OCFS2_OSB_SOFT_RO is set in osb_flags */ | ||
342 | { | ||
343 | int ret; | ||
344 | |||
345 | spin_lock(&osb->osb_lock); | ||
346 | ret = osb->osb_flags & OCFS2_OSB_SOFT_RO; /* the masked bit itself, not normalized to 0/1 */ | ||
347 | spin_unlock(&osb->osb_lock); | ||
348 | |||
349 | return ret; | ||
350 | } | ||
351 | |||
352 | #define OCFS2_IS_VALID_DINODE(ptr) \ | ||
353 | (!strcmp((ptr)->i_signature, OCFS2_INODE_SIGNATURE)) /* nonzero when i_signature compares equal, as a C string, to OCFS2_INODE_SIGNATURE */ | ||
354 | |||
355 | #define OCFS2_RO_ON_INVALID_DINODE(__sb, __di) do { /* reports "bad signature" for dinode __di through ocfs2_error() */ \ | ||
356 | typeof(__di) ____di = (__di); /* evaluate __di exactly once */ \ | ||
357 | ocfs2_error((__sb), \ | ||
358 | "Dinode # %"MLFu64" has bad signature %.*s", \ | ||
359 | (____di)->i_blkno, 7, /* precision 7: print at most 7 signature bytes */ \ | ||
360 | (____di)->i_signature); \ | ||
361 | } while (0) /* no trailing ';': the caller supplies it, so the macro is one statement even before an else */ | ||
362 | |||
363 | #define OCFS2_IS_VALID_EXTENT_BLOCK(ptr) \ | ||
364 | (!strcmp((ptr)->h_signature, OCFS2_EXTENT_BLOCK_SIGNATURE)) /* nonzero when h_signature compares equal, as a C string, to OCFS2_EXTENT_BLOCK_SIGNATURE */ | ||
365 | |||
366 | #define OCFS2_RO_ON_INVALID_EXTENT_BLOCK(__sb, __eb) do { /* reports "bad signature" for extent block __eb through ocfs2_error() */ \ | ||
367 | typeof(__eb) ____eb = (__eb); /* evaluate __eb exactly once */ \ | ||
368 | ocfs2_error((__sb), \ | ||
369 | "Extent Block # %"MLFu64" has bad signature %.*s", \ | ||
370 | (____eb)->h_blkno, 7, /* precision 7: print at most 7 signature bytes */ \ | ||
371 | (____eb)->h_signature); \ | ||
372 | } while (0) /* no trailing ';': the caller supplies it, so the macro is one statement even before an else */ | ||
373 | |||
374 | #define OCFS2_IS_VALID_GROUP_DESC(ptr) \ | ||
375 | (!strcmp((ptr)->bg_signature, OCFS2_GROUP_DESC_SIGNATURE)) /* nonzero when bg_signature compares equal, as a C string, to OCFS2_GROUP_DESC_SIGNATURE */ | ||
376 | |||
377 | #define OCFS2_RO_ON_INVALID_GROUP_DESC(__sb, __gd) do { /* reports "bad signature" for group descriptor __gd through ocfs2_error() */ \ | ||
378 | typeof(__gd) ____gd = (__gd); /* evaluate __gd exactly once */ \ | ||
379 | ocfs2_error((__sb), \ | ||
380 | "Group Descriptor # %"MLFu64" has bad signature %.*s", \ | ||
381 | (____gd)->bg_blkno, 7, /* precision 7: print at most 7 signature bytes */ \ | ||
382 | (____gd)->bg_signature); \ | ||
383 | } while (0) /* no trailing ';': the caller supplies it, so the macro is one statement even before an else */ | ||
384 | |||
385 | static inline unsigned long ino_from_blkno(struct super_block *sb, | ||
386 | u64 blkno) /* sb is not used by this implementation */ | ||
387 | { | ||
388 | return (unsigned long)(blkno & (u64)ULONG_MAX); /* keep only the low bits of blkno that fit in an unsigned long */ | ||
389 | } | ||
390 | |||
391 | static inline u64 ocfs2_clusters_to_blocks(struct super_block *sb, | ||
392 | u32 clusters) /* returns the cluster count converted to a block count */ | ||
393 | { | ||
394 | int c_to_b_bits = OCFS2_SB(sb)->s_clustersize_bits - | ||
395 | sb->s_blocksize_bits; /* log2 of blocks per cluster */ | ||
396 | |||
397 | return (u64)clusters << c_to_b_bits; /* widen to 64 bits before shifting so the result is not truncated */ | ||
398 | } | ||
399 | |||
400 | static inline u32 ocfs2_blocks_to_clusters(struct super_block *sb, | ||
401 | u64 blocks) /* returns the block count converted to clusters, rounding down */ | ||
402 | { | ||
403 | int b_to_c_bits = OCFS2_SB(sb)->s_clustersize_bits - | ||
404 | sb->s_blocksize_bits; /* log2 of blocks per cluster */ | ||
405 | |||
406 | return (u32)(blocks >> b_to_c_bits); /* the shift discards any partial cluster; the result is narrowed to 32 bits */ | ||
407 | } | ||
408 | |||
409 | static inline unsigned int ocfs2_clusters_for_bytes(struct super_block *sb, | ||
410 | u64 bytes) /* returns the number of clusters needed to hold bytes, rounding up */ | ||
411 | { | ||
412 | int cl_bits = OCFS2_SB(sb)->s_clustersize_bits; | ||
413 | unsigned int clusters; | ||
414 | |||
415 | bytes += OCFS2_SB(sb)->s_clustersize - 1; /* bias so the shift rounds up; assumes s_clustersize == 1 << s_clustersize_bits (confirm at mount code) */ | ||
416 | /* OCFS2 just cannot have enough clusters to overflow this */ | ||
417 | clusters = (unsigned int)(bytes >> cl_bits); | ||
418 | |||
419 | return clusters; | ||
420 | } | ||
421 | |||
422 | static inline u64 ocfs2_blocks_for_bytes(struct super_block *sb, | ||
423 | u64 bytes) /* returns the number of blocks needed to hold bytes, rounding up */ | ||
424 | { | ||
425 | bytes += sb->s_blocksize - 1; /* bias so the shift below rounds up */ | ||
426 | return bytes >> sb->s_blocksize_bits; | ||
427 | } | ||
428 | |||
429 | static inline u64 ocfs2_clusters_to_bytes(struct super_block *sb, | ||
430 | u32 clusters) /* returns the cluster count converted to a byte count */ | ||
431 | { | ||
432 | return (u64)clusters << OCFS2_SB(sb)->s_clustersize_bits; /* widen to 64 bits before shifting */ | ||
433 | } | ||
434 | |||
435 | static inline u64 ocfs2_align_bytes_to_clusters(struct super_block *sb, | ||
436 | u64 bytes) /* returns bytes rounded up to a whole number of clusters, still in bytes */ | ||
437 | { | ||
438 | int cl_bits = OCFS2_SB(sb)->s_clustersize_bits; | ||
439 | unsigned int clusters; | ||
440 | |||
441 | clusters = ocfs2_clusters_for_bytes(sb, bytes); /* rounded-up cluster count */ | ||
442 | return (u64)clusters << cl_bits; /* back to bytes */ | ||
443 | } | ||
444 | |||
445 | static inline u64 ocfs2_align_bytes_to_blocks(struct super_block *sb, | ||
446 | u64 bytes) /* returns bytes rounded up to a whole number of blocks, still in bytes */ | ||
447 | { | ||
448 | u64 blocks; | ||
449 | |||
450 | blocks = ocfs2_blocks_for_bytes(sb, bytes); /* rounded-up block count */ | ||
451 | return blocks << sb->s_blocksize_bits; /* back to bytes */ | ||
452 | } | ||
453 | |||
454 | static inline unsigned long ocfs2_align_bytes_to_sectors(u64 bytes) /* NOTE: despite the name, returns a count of 512-byte sectors, not a byte count */ | ||
455 | { | ||
456 | return (unsigned long)((bytes + 511) >> 9); /* round up to whole 512-byte sectors */ | ||
457 | } | ||
458 | |||
459 | #define ocfs2_set_bit ext2_set_bit | ||
460 | #define ocfs2_clear_bit ext2_clear_bit | ||
461 | #define ocfs2_test_bit ext2_test_bit | ||
462 | #define ocfs2_find_next_zero_bit ext2_find_next_zero_bit | ||
463 | #endif /* OCFS2_H */ | ||
464 | |||
diff --git a/fs/ocfs2/ocfs2_fs.h b/fs/ocfs2/ocfs2_fs.h new file mode 100644 index 000000000000..dfb8a5bedfc8 --- /dev/null +++ b/fs/ocfs2/ocfs2_fs.h | |||
@@ -0,0 +1,638 @@ | |||
1 | /* -*- mode: c; c-basic-offset: 8; -*- | ||
2 | * vim: noexpandtab sw=8 ts=8 sts=0: | ||
3 | * | ||
4 | * ocfs2_fs.h | ||
5 | * | ||
6 | * On-disk structures for OCFS2. | ||
7 | * | ||
8 | * Copyright (C) 2002, 2004 Oracle. All rights reserved. | ||
9 | * | ||
10 | * This program is free software; you can redistribute it and/or | ||
11 | * modify it under the terms of the GNU General Public | ||
12 | * License, version 2, as published by the Free Software Foundation. | ||
13 | * | ||
14 | * This program is distributed in the hope that it will be useful, | ||
15 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
16 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | ||
17 | * General Public License for more details. | ||
18 | * | ||
19 | * You should have received a copy of the GNU General Public | ||
20 | * License along with this program; if not, write to the | ||
21 | * Free Software Foundation, Inc., 59 Temple Place - Suite 330, | ||
22 | * Boston, MA 02111-1307, USA. | ||
23 | */ | ||
24 | |||
25 | #ifndef _OCFS2_FS_H | ||
26 | #define _OCFS2_FS_H | ||
27 | |||
28 | /* Version */ | ||
29 | #define OCFS2_MAJOR_REV_LEVEL 0 | ||
30 | #define OCFS2_MINOR_REV_LEVEL 90 | ||
31 | |||
32 | /* | ||
33 | * An OCFS2 volume starts this way: | ||
34 | * Sector 0: Valid ocfs1_vol_disk_hdr that cleanly fails to mount OCFS. | ||
35 | * Sector 1: Valid ocfs1_vol_label that cleanly fails to mount OCFS. | ||
36 | * Block OCFS2_SUPER_BLOCK_BLKNO: OCFS2 superblock. | ||
37 | * | ||
38 | * All other structures are found from the superblock information. | ||
39 | * | ||
40 | * OCFS2_SUPER_BLOCK_BLKNO is in blocks, not sectors. eg, for a | ||
41 | * blocksize of 2K, it is 4096 bytes into disk. | ||
42 | */ | ||
43 | #define OCFS2_SUPER_BLOCK_BLKNO 2 | ||
44 | |||
45 | /* | ||
46 | * Cluster size limits. The maximum is kept arbitrarily at 1 MB, and could | ||
47 | * grow if needed. | ||
48 | */ | ||
49 | #define OCFS2_MIN_CLUSTERSIZE 4096 | ||
50 | #define OCFS2_MAX_CLUSTERSIZE 1048576 | ||
51 | |||
52 | /* | ||
53 | * Blocks cannot be bigger than clusters, so the maximum blocksize is the | ||
54 | * minimum cluster size. | ||
55 | */ | ||
56 | #define OCFS2_MIN_BLOCKSIZE 512 | ||
57 | #define OCFS2_MAX_BLOCKSIZE OCFS2_MIN_CLUSTERSIZE | ||
58 | |||
59 | /* Filesystem magic number */ | ||
60 | #define OCFS2_SUPER_MAGIC 0x7461636f | ||
61 | |||
62 | /* Object signatures */ | ||
63 | #define OCFS2_SUPER_BLOCK_SIGNATURE "OCFSV2" | ||
64 | #define OCFS2_INODE_SIGNATURE "INODE01" | ||
65 | #define OCFS2_EXTENT_BLOCK_SIGNATURE "EXBLK01" | ||
66 | #define OCFS2_GROUP_DESC_SIGNATURE "GROUP01" | ||
67 | |||
68 | /* Compatibility flags */ | ||
69 | #define OCFS2_HAS_COMPAT_FEATURE(sb,mask) \ | ||
70 | ( OCFS2_SB(sb)->s_feature_compat & (mask) ) | ||
71 | #define OCFS2_HAS_RO_COMPAT_FEATURE(sb,mask) \ | ||
72 | ( OCFS2_SB(sb)->s_feature_ro_compat & (mask) ) | ||
73 | #define OCFS2_HAS_INCOMPAT_FEATURE(sb,mask) \ | ||
74 | ( OCFS2_SB(sb)->s_feature_incompat & (mask) ) | ||
75 | #define OCFS2_SET_COMPAT_FEATURE(sb,mask) \ | ||
76 | OCFS2_SB(sb)->s_feature_compat |= (mask) | ||
77 | #define OCFS2_SET_RO_COMPAT_FEATURE(sb,mask) \ | ||
78 | OCFS2_SB(sb)->s_feature_ro_compat |= (mask) | ||
79 | #define OCFS2_SET_INCOMPAT_FEATURE(sb,mask) \ | ||
80 | OCFS2_SB(sb)->s_feature_incompat |= (mask) | ||
81 | #define OCFS2_CLEAR_COMPAT_FEATURE(sb,mask) \ | ||
82 | OCFS2_SB(sb)->s_feature_compat &= ~(mask) | ||
83 | #define OCFS2_CLEAR_RO_COMPAT_FEATURE(sb,mask) \ | ||
84 | OCFS2_SB(sb)->s_feature_ro_compat &= ~(mask) | ||
85 | #define OCFS2_CLEAR_INCOMPAT_FEATURE(sb,mask) \ | ||
86 | OCFS2_SB(sb)->s_feature_incompat &= ~(mask) | ||
87 | |||
88 | #define OCFS2_FEATURE_COMPAT_SUPP 0 | ||
89 | #define OCFS2_FEATURE_INCOMPAT_SUPP 0 | ||
90 | #define OCFS2_FEATURE_RO_COMPAT_SUPP 0 | ||
91 | |||
92 | /* | ||
93 | * Heartbeat-only devices are missing journals and other files. The | ||
94 | * filesystem driver can't load them, but the library can. Never put | ||
95 | * this in OCFS2_FEATURE_INCOMPAT_SUPP, *ever*. | ||
96 | */ | ||
97 | #define OCFS2_FEATURE_INCOMPAT_HEARTBEAT_DEV 0x0002 | ||
98 | |||
99 | |||
100 | /* | ||
101 | * Flags on ocfs2_dinode.i_flags | ||
102 | */ | ||
103 | #define OCFS2_VALID_FL (0x00000001) /* Inode is valid */ | ||
104 | #define OCFS2_UNUSED2_FL (0x00000002) | ||
105 | #define OCFS2_ORPHANED_FL (0x00000004) /* On the orphan list */ | ||
106 | #define OCFS2_UNUSED3_FL (0x00000008) | ||
107 | /* System inode flags */ | ||
108 | #define OCFS2_SYSTEM_FL (0x00000010) /* System inode */ | ||
109 | #define OCFS2_SUPER_BLOCK_FL (0x00000020) /* Super block */ | ||
110 | #define OCFS2_LOCAL_ALLOC_FL (0x00000040) /* Slot local alloc bitmap */ | ||
111 | #define OCFS2_BITMAP_FL (0x00000080) /* Allocation bitmap */ | ||
112 | #define OCFS2_JOURNAL_FL (0x00000100) /* Slot local journal */ | ||
113 | #define OCFS2_HEARTBEAT_FL (0x00000200) /* Heartbeat area */ | ||
114 | #define OCFS2_CHAIN_FL (0x00000400) /* Chain allocator */ | ||
115 | #define OCFS2_DEALLOC_FL (0x00000800) /* Truncate log */ | ||
116 | |||
117 | /* | ||
118 | * Journal Flags (ocfs2_dinode.id1.journal1.i_flags) | ||
119 | */ | ||
120 | #define OCFS2_JOURNAL_DIRTY_FL (0x00000001) /* Journal needs recovery */ | ||
121 | |||
122 | /* | ||
123 | * superblock s_state flags | ||
124 | */ | ||
125 | #define OCFS2_ERROR_FS (0x00000001) /* FS saw errors */ | ||
126 | |||
127 | /* Limit of space in ocfs2_dir_entry */ | ||
128 | #define OCFS2_MAX_FILENAME_LEN 255 | ||
129 | |||
130 | /* Maximum slots on an ocfs2 file system */ | ||
131 | #define OCFS2_MAX_SLOTS 255 | ||
132 | |||
133 | /* Slot map indicator for an empty slot */ | ||
134 | #define OCFS2_INVALID_SLOT -1 | ||
135 | |||
136 | #define OCFS2_VOL_UUID_LEN 16 | ||
137 | #define OCFS2_MAX_VOL_LABEL_LEN 64 | ||
138 | |||
139 | /* Journal limits (in bytes) */ | ||
140 | #define OCFS2_MIN_JOURNAL_SIZE (4 * 1024 * 1024) | ||
141 | #define OCFS2_MAX_JOURNAL_SIZE (500 * 1024 * 1024) | ||
142 | |||
143 | struct ocfs2_system_inode_info { | ||
144 | char *si_name; | ||
145 | int si_iflags; | ||
146 | int si_mode; | ||
147 | }; | ||
148 | |||
149 | /* System file index */ | ||
150 | enum { | ||
151 | BAD_BLOCK_SYSTEM_INODE = 0, | ||
152 | GLOBAL_INODE_ALLOC_SYSTEM_INODE, | ||
153 | SLOT_MAP_SYSTEM_INODE, | ||
154 | #define OCFS2_FIRST_ONLINE_SYSTEM_INODE SLOT_MAP_SYSTEM_INODE | ||
155 | HEARTBEAT_SYSTEM_INODE, | ||
156 | GLOBAL_BITMAP_SYSTEM_INODE, | ||
157 | #define OCFS2_LAST_GLOBAL_SYSTEM_INODE GLOBAL_BITMAP_SYSTEM_INODE | ||
158 | ORPHAN_DIR_SYSTEM_INODE, | ||
159 | EXTENT_ALLOC_SYSTEM_INODE, | ||
160 | INODE_ALLOC_SYSTEM_INODE, | ||
161 | JOURNAL_SYSTEM_INODE, | ||
162 | LOCAL_ALLOC_SYSTEM_INODE, | ||
163 | TRUNCATE_LOG_SYSTEM_INODE, | ||
164 | NUM_SYSTEM_INODES | ||
165 | }; | ||
166 | |||
167 | static struct ocfs2_system_inode_info ocfs2_system_inodes[NUM_SYSTEM_INODES] = { | ||
168 | /* Global system inodes (single copy) */ | ||
169 | /* The first two are only used from userspace mkfs/tunefs */ | ||
170 | [BAD_BLOCK_SYSTEM_INODE] = { "bad_blocks", 0, S_IFREG | 0644 }, | ||
171 | [GLOBAL_INODE_ALLOC_SYSTEM_INODE] = { "global_inode_alloc", OCFS2_BITMAP_FL | OCFS2_CHAIN_FL, S_IFREG | 0644 }, | ||
172 | |||
173 | /* These are used by the running filesystem */ | ||
174 | [SLOT_MAP_SYSTEM_INODE] = { "slot_map", 0, S_IFREG | 0644 }, | ||
175 | [HEARTBEAT_SYSTEM_INODE] = { "heartbeat", OCFS2_HEARTBEAT_FL, S_IFREG | 0644 }, | ||
176 | [GLOBAL_BITMAP_SYSTEM_INODE] = { "global_bitmap", 0, S_IFREG | 0644 }, | ||
177 | |||
178 | /* Slot-specific system inodes (one copy per slot) */ | ||
179 | [ORPHAN_DIR_SYSTEM_INODE] = { "orphan_dir:%04d", 0, S_IFDIR | 0755 }, | ||
180 | [EXTENT_ALLOC_SYSTEM_INODE] = { "extent_alloc:%04d", OCFS2_BITMAP_FL | OCFS2_CHAIN_FL, S_IFREG | 0644 }, | ||
181 | [INODE_ALLOC_SYSTEM_INODE] = { "inode_alloc:%04d", OCFS2_BITMAP_FL | OCFS2_CHAIN_FL, S_IFREG | 0644 }, | ||
182 | [JOURNAL_SYSTEM_INODE] = { "journal:%04d", OCFS2_JOURNAL_FL, S_IFREG | 0644 }, | ||
183 | [LOCAL_ALLOC_SYSTEM_INODE] = { "local_alloc:%04d", OCFS2_BITMAP_FL | OCFS2_LOCAL_ALLOC_FL, S_IFREG | 0644 }, | ||
184 | [TRUNCATE_LOG_SYSTEM_INODE] = { "truncate_log:%04d", OCFS2_DEALLOC_FL, S_IFREG | 0644 } | ||
185 | }; | ||
186 | |||
187 | /* Parameter passed from mount.ocfs2 to module */ | ||
188 | #define OCFS2_HB_NONE "heartbeat=none" | ||
189 | #define OCFS2_HB_LOCAL "heartbeat=local" | ||
190 | |||
191 | /* | ||
192 | * OCFS2 directory file types. Only the low 3 bits are used. The | ||
193 | * other bits are reserved for now. | ||
194 | */ | ||
195 | #define OCFS2_FT_UNKNOWN 0 | ||
196 | #define OCFS2_FT_REG_FILE 1 | ||
197 | #define OCFS2_FT_DIR 2 | ||
198 | #define OCFS2_FT_CHRDEV 3 | ||
199 | #define OCFS2_FT_BLKDEV 4 | ||
200 | #define OCFS2_FT_FIFO 5 | ||
201 | #define OCFS2_FT_SOCK 6 | ||
202 | #define OCFS2_FT_SYMLINK 7 | ||
203 | |||
204 | #define OCFS2_FT_MAX 8 | ||
205 | |||
206 | /* | ||
207 | * OCFS2_DIR_PAD defines the directory entries boundaries | ||
208 | * | ||
209 | * NOTE: It must be a multiple of 4 | ||
210 | */ | ||
211 | #define OCFS2_DIR_PAD 4 | ||
212 | #define OCFS2_DIR_ROUND (OCFS2_DIR_PAD - 1) | ||
213 | #define OCFS2_DIR_MEMBER_LEN offsetof(struct ocfs2_dir_entry, name) | ||
214 | #define OCFS2_DIR_REC_LEN(name_len) (((name_len) + OCFS2_DIR_MEMBER_LEN + \ | ||
215 | OCFS2_DIR_ROUND) & \ | ||
216 | ~OCFS2_DIR_ROUND) | ||
217 | |||
218 | #define OCFS2_LINK_MAX 32000 | ||
219 | |||
220 | #define S_SHIFT 12 | ||
221 | static unsigned char ocfs2_type_by_mode[S_IFMT >> S_SHIFT] = { | ||
222 | [S_IFREG >> S_SHIFT] = OCFS2_FT_REG_FILE, | ||
223 | [S_IFDIR >> S_SHIFT] = OCFS2_FT_DIR, | ||
224 | [S_IFCHR >> S_SHIFT] = OCFS2_FT_CHRDEV, | ||
225 | [S_IFBLK >> S_SHIFT] = OCFS2_FT_BLKDEV, | ||
226 | [S_IFIFO >> S_SHIFT] = OCFS2_FT_FIFO, | ||
227 | [S_IFSOCK >> S_SHIFT] = OCFS2_FT_SOCK, | ||
228 | [S_IFLNK >> S_SHIFT] = OCFS2_FT_SYMLINK, | ||
229 | }; | ||
230 | |||
231 | |||
232 | /* | ||
233 | * Convenience casts | ||
234 | */ | ||
235 | #define OCFS2_RAW_SB(dinode) (&((dinode)->id2.i_super)) | ||
236 | |||
237 | /* | ||
238 | * On disk extent record for OCFS2 | ||
239 | * It describes a range of clusters on disk. | ||
240 | */ | ||
241 | struct ocfs2_extent_rec { | ||
242 | /*00*/ __le32 e_cpos; /* Offset into the file, in clusters */ | ||
243 | __le32 e_clusters; /* Clusters covered by this extent */ | ||
244 | __le64 e_blkno; /* Physical disk offset, in blocks */ | ||
245 | /*10*/ | ||
246 | }; | ||
247 | |||
248 | struct ocfs2_chain_rec { | ||
249 | __le32 c_free; /* Number of free bits in this chain. */ | ||
250 | __le32 c_total; /* Number of total bits in this chain */ | ||
251 | __le64 c_blkno; /* Physical disk offset (blocks) of 1st group */ | ||
252 | }; | ||
253 | |||
254 | struct ocfs2_truncate_rec { | ||
255 | __le32 t_start; /* 1st cluster in this log */ | ||
256 | __le32 t_clusters; /* Number of total clusters covered */ | ||
257 | }; | ||
258 | |||
259 | /* | ||
260 | * On disk extent list for OCFS2 (node in the tree). Note that this | ||
261 | * is contained inside ocfs2_dinode or ocfs2_extent_block, so the | ||
262 | * offsets are relative to ocfs2_dinode.id2.i_list or | ||
263 | * ocfs2_extent_block.h_list, respectively. | ||
264 | */ | ||
265 | struct ocfs2_extent_list { | ||
266 | /*00*/ __le16 l_tree_depth; /* Extent tree depth from this | ||
267 | point. 0 means data extents | ||
268 | hang directly off this | ||
269 | header (a leaf) */ | ||
270 | __le16 l_count; /* Number of extent records */ | ||
271 | __le16 l_next_free_rec; /* Next unused extent slot */ | ||
272 | __le16 l_reserved1; | ||
273 | __le64 l_reserved2; /* Pad to | ||
274 | sizeof(ocfs2_extent_rec) */ | ||
275 | /*10*/ struct ocfs2_extent_rec l_recs[0]; /* Extent records */ | ||
276 | }; | ||
277 | |||
278 | /* | ||
279 | * On disk allocation chain list for OCFS2. Note that this is | ||
280 | * contained inside ocfs2_dinode, so the offsets are relative to | ||
281 | * ocfs2_dinode.id2.i_chain. | ||
282 | */ | ||
283 | struct ocfs2_chain_list { | ||
284 | /*00*/ __le16 cl_cpg; /* Clusters per Block Group */ | ||
285 | __le16 cl_bpc; /* Bits per cluster */ | ||
286 | __le16 cl_count; /* Total chains in this list */ | ||
287 | __le16 cl_next_free_rec; /* Next unused chain slot */ | ||
288 | __le64 cl_reserved1; | ||
289 | /*10*/ struct ocfs2_chain_rec cl_recs[0]; /* Chain records */ | ||
290 | }; | ||
291 | |||
292 | /* | ||
293 | * On disk deallocation log for OCFS2. Note that this is | ||
294 | * contained inside ocfs2_dinode, so the offsets are relative to | ||
295 | * ocfs2_dinode.id2.i_dealloc. | ||
296 | */ | ||
297 | struct ocfs2_truncate_log { | ||
298 | /*00*/ __le16 tl_count; /* Total records in this log */ | ||
299 | __le16 tl_used; /* Number of records in use */ | ||
300 | __le32 tl_reserved1; | ||
301 | /*08*/ struct ocfs2_truncate_rec tl_recs[0]; /* Truncate records */ | ||
302 | }; | ||
303 | |||
304 | /* | ||
305 | * On disk extent block (indirect block) for OCFS2 | ||
306 | */ | ||
307 | struct ocfs2_extent_block | ||
308 | { | ||
309 | /*00*/ __u8 h_signature[8]; /* Signature for verification */ | ||
310 | __le64 h_reserved1; | ||
311 | /*10*/ __le16 h_suballoc_slot; /* Slot suballocator this | ||
312 | extent_header belongs to */ | ||
313 | __le16 h_suballoc_bit; /* Bit offset in suballocator | ||
314 | block group */ | ||
315 | __le32 h_fs_generation; /* Must match super block */ | ||
316 | __le64 h_blkno; /* Offset on disk, in blocks */ | ||
317 | /*20*/ __le64 h_reserved3; | ||
318 | __le64 h_next_leaf_blk; /* Offset on disk, in blocks, | ||
319 | of next leaf header pointing | ||
320 | to data */ | ||
321 | /*30*/ struct ocfs2_extent_list h_list; /* Extent record list */ | ||
322 | /* Actual on-disk size is one block */ | ||
323 | }; | ||
324 | |||
325 | /* | ||
326 | * On disk superblock for OCFS2 | ||
327 | * Note that it is contained inside an ocfs2_dinode, so all offsets | ||
328 | * are relative to the start of ocfs2_dinode.id2. | ||
329 | */ | ||
330 | struct ocfs2_super_block { | ||
331 | /*00*/ __le16 s_major_rev_level; | ||
332 | __le16 s_minor_rev_level; | ||
333 | __le16 s_mnt_count; | ||
334 | __le16 s_max_mnt_count; | ||
335 | __le16 s_state; /* File system state */ | ||
336 | __le16 s_errors; /* Behaviour when detecting errors */ | ||
337 | __le32 s_checkinterval; /* Max time between checks */ | ||
338 | /*10*/ __le64 s_lastcheck; /* Time of last check */ | ||
339 | __le32 s_creator_os; /* OS */ | ||
340 | __le32 s_feature_compat; /* Compatible feature set */ | ||
341 | /*20*/ __le32 s_feature_incompat; /* Incompatible feature set */ | ||
342 | __le32 s_feature_ro_compat; /* Readonly-compatible feature set */ | ||
343 | __le64 s_root_blkno; /* Offset, in blocks, of root directory | ||
344 | dinode */ | ||
345 | /*30*/ __le64 s_system_dir_blkno; /* Offset, in blocks, of system | ||
346 | directory dinode */ | ||
347 | __le32 s_blocksize_bits; /* Blocksize for this fs */ | ||
348 | __le32 s_clustersize_bits; /* Clustersize for this fs */ | ||
349 | /*40*/ __le16 s_max_slots; /* Max number of simultaneous mounts | ||
350 | before tunefs required */ | ||
351 | __le16 s_reserved1; | ||
352 | __le32 s_reserved2; | ||
353 | __le64 s_first_cluster_group; /* Block offset of 1st cluster | ||
354 | * group header */ | ||
355 | /*50*/ __u8 s_label[OCFS2_MAX_VOL_LABEL_LEN]; /* Label for mounting, etc. */ | ||
356 | /*90*/ __u8 s_uuid[OCFS2_VOL_UUID_LEN]; /* 128-bit uuid */ | ||
357 | /*A0*/ | ||
358 | }; | ||
359 | |||
360 | /* | ||
361 | * Local allocation bitmap for OCFS2 slots | ||
362 | * Note that it exists inside an ocfs2_dinode, so all offsets are | ||
363 | * relative to the start of ocfs2_dinode.id2. | ||
364 | */ | ||
365 | struct ocfs2_local_alloc | ||
366 | { | ||
367 | /*00*/ __le32 la_bm_off; /* Starting bit offset in main bitmap */ | ||
368 | __le16 la_size; /* Size of included bitmap, in bytes */ | ||
369 | __le16 la_reserved1; | ||
370 | __le64 la_reserved2; | ||
371 | /*10*/ __u8 la_bitmap[0]; | ||
372 | }; | ||
373 | |||
374 | /* | ||
375 | * On disk inode for OCFS2 | ||
376 | */ | ||
377 | struct ocfs2_dinode { | ||
378 | /*00*/ __u8 i_signature[8]; /* Signature for validation */ | ||
379 | __le32 i_generation; /* Generation number */ | ||
380 | __le16 i_suballoc_slot; /* Slot suballocator this inode | ||
381 | belongs to */ | ||
382 | __le16 i_suballoc_bit; /* Bit offset in suballocator | ||
383 | block group */ | ||
384 | /*10*/ __le32 i_reserved0; | ||
385 | __le32 i_clusters; /* Cluster count */ | ||
386 | __le32 i_uid; /* Owner UID */ | ||
387 | __le32 i_gid; /* Owning GID */ | ||
388 | /*20*/ __le64 i_size; /* Size in bytes */ | ||
389 | __le16 i_mode; /* File mode */ | ||
390 | __le16 i_links_count; /* Links count */ | ||
391 | __le32 i_flags; /* File flags */ | ||
392 | /*30*/ __le64 i_atime; /* Access time */ | ||
393 | __le64 i_ctime; /* Creation time */ | ||
394 | /*40*/ __le64 i_mtime; /* Modification time */ | ||
395 | __le64 i_dtime; /* Deletion time */ | ||
396 | /*50*/ __le64 i_blkno; /* Offset on disk, in blocks */ | ||
397 | __le64 i_last_eb_blk; /* Pointer to last extent | ||
398 | block */ | ||
399 | /*60*/ __le32 i_fs_generation; /* Generation per fs-instance */ | ||
400 | __le32 i_atime_nsec; | ||
401 | __le32 i_ctime_nsec; | ||
402 | __le32 i_mtime_nsec; | ||
403 | /*70*/ __le64 i_reserved1[9]; | ||
404 | /*B8*/ union { | ||
405 | __le64 i_pad1; /* Generic way to refer to this | ||
406 | 64bit union */ | ||
407 | struct { | ||
408 | __le64 i_rdev; /* Device number */ | ||
409 | } dev1; | ||
410 | struct { /* Info for bitmap system | ||
411 | inodes */ | ||
412 | __le32 i_used; /* Bits (ie, clusters) used */ | ||
413 | __le32 i_total; /* Total bits (clusters) | ||
414 | available */ | ||
415 | } bitmap1; | ||
416 | struct { /* Info for journal system | ||
417 | inodes */ | ||
418 | __le32 ij_flags; /* Mounted, version, etc. */ | ||
419 | __le32 ij_pad; | ||
420 | } journal1; | ||
421 | } id1; /* Inode type dependant 1 */ | ||
422 | /*C0*/ union { | ||
423 | struct ocfs2_super_block i_super; | ||
424 | struct ocfs2_local_alloc i_lab; | ||
425 | struct ocfs2_chain_list i_chain; | ||
426 | struct ocfs2_extent_list i_list; | ||
427 | struct ocfs2_truncate_log i_dealloc; | ||
428 | __u8 i_symlink[0]; | ||
429 | } id2; | ||
430 | /* Actual on-disk size is one block */ | ||
431 | }; | ||
432 | |||
433 | /* | ||
434 | * On-disk directory entry structure for OCFS2 | ||
435 | * | ||
436 | * Packed as this structure could be accessed unaligned on 64-bit platforms | ||
437 | */ | ||
438 | struct ocfs2_dir_entry { | ||
439 | /*00*/ __le64 inode; /* Inode number */ | ||
440 | __le16 rec_len; /* Directory entry length */ | ||
441 | __u8 name_len; /* Name length */ | ||
442 | __u8 file_type; | ||
443 | /*0C*/ char name[OCFS2_MAX_FILENAME_LEN]; /* File name */ | ||
444 | /* Actual on-disk length specified by rec_len */ | ||
445 | } __attribute__ ((packed)); | ||
446 | |||
447 | /* | ||
448 | * On disk allocator group structure for OCFS2 | ||
449 | */ | ||
450 | struct ocfs2_group_desc | ||
451 | { | ||
452 | /*00*/ __u8 bg_signature[8]; /* Signature for validation */ | ||
453 | __le16 bg_size; /* Size of included bitmap in | ||
454 | bytes. */ | ||
455 | __le16 bg_bits; /* Bits represented by this | ||
456 | group. */ | ||
457 | __le16 bg_free_bits_count; /* Free bits count */ | ||
458 | __le16 bg_chain; /* What chain I am in. */ | ||
459 | /*10*/ __le32 bg_generation; | ||
460 | __le32 bg_reserved1; | ||
461 | __le64 bg_next_group; /* Next group in my list, in | ||
462 | blocks */ | ||
463 | /*20*/ __le64 bg_parent_dinode; /* dinode which owns me, in | ||
464 | blocks */ | ||
465 | __le64 bg_blkno; /* Offset on disk, in blocks */ | ||
466 | /*30*/ __le64 bg_reserved2[2]; | ||
467 | /*40*/ __u8 bg_bitmap[0]; | ||
468 | }; | ||
469 | |||
470 | #ifdef __KERNEL__ | ||
471 | static inline int ocfs2_fast_symlink_chars(struct super_block *sb) | ||
472 | { | ||
473 | return sb->s_blocksize - | ||
474 | offsetof(struct ocfs2_dinode, id2.i_symlink); | ||
475 | } | ||
476 | |||
477 | static inline int ocfs2_extent_recs_per_inode(struct super_block *sb) | ||
478 | { | ||
479 | int size; | ||
480 | |||
481 | size = sb->s_blocksize - | ||
482 | offsetof(struct ocfs2_dinode, id2.i_list.l_recs); | ||
483 | |||
484 | return size / sizeof(struct ocfs2_extent_rec); | ||
485 | } | ||
486 | |||
487 | static inline int ocfs2_chain_recs_per_inode(struct super_block *sb) | ||
488 | { | ||
489 | int size; | ||
490 | |||
491 | size = sb->s_blocksize - | ||
492 | offsetof(struct ocfs2_dinode, id2.i_chain.cl_recs); | ||
493 | |||
494 | return size / sizeof(struct ocfs2_chain_rec); | ||
495 | } | ||
496 | |||
497 | static inline u16 ocfs2_extent_recs_per_eb(struct super_block *sb) | ||
498 | { | ||
499 | int size; | ||
500 | |||
501 | size = sb->s_blocksize - | ||
502 | offsetof(struct ocfs2_extent_block, h_list.l_recs); | ||
503 | |||
504 | return size / sizeof(struct ocfs2_extent_rec); | ||
505 | } | ||
506 | |||
507 | static inline u16 ocfs2_local_alloc_size(struct super_block *sb) | ||
508 | { | ||
509 | u16 size; | ||
510 | |||
511 | size = sb->s_blocksize - | ||
512 | offsetof(struct ocfs2_dinode, id2.i_lab.la_bitmap); | ||
513 | |||
514 | return size; | ||
515 | } | ||
516 | |||
517 | static inline int ocfs2_group_bitmap_size(struct super_block *sb) | ||
518 | { | ||
519 | int size; | ||
520 | |||
521 | size = sb->s_blocksize - | ||
522 | offsetof(struct ocfs2_group_desc, bg_bitmap); | ||
523 | |||
524 | return size; | ||
525 | } | ||
526 | |||
527 | static inline int ocfs2_truncate_recs_per_inode(struct super_block *sb) | ||
528 | { | ||
529 | int size; | ||
530 | |||
531 | size = sb->s_blocksize - | ||
532 | offsetof(struct ocfs2_dinode, id2.i_dealloc.tl_recs); | ||
533 | |||
534 | return size / sizeof(struct ocfs2_truncate_rec); | ||
535 | } | ||
536 | #else | ||
537 | static inline int ocfs2_fast_symlink_chars(int blocksize) | ||
538 | { | ||
539 | return blocksize - offsetof(struct ocfs2_dinode, id2.i_symlink); | ||
540 | } | ||
541 | |||
542 | static inline int ocfs2_extent_recs_per_inode(int blocksize) | ||
543 | { | ||
544 | int size; | ||
545 | |||
546 | size = blocksize - | ||
547 | offsetof(struct ocfs2_dinode, id2.i_list.l_recs); | ||
548 | |||
549 | return size / sizeof(struct ocfs2_extent_rec); | ||
550 | } | ||
551 | |||
552 | static inline int ocfs2_chain_recs_per_inode(int blocksize) | ||
553 | { | ||
554 | int size; | ||
555 | |||
556 | size = blocksize - | ||
557 | offsetof(struct ocfs2_dinode, id2.i_chain.cl_recs); | ||
558 | |||
559 | return size / sizeof(struct ocfs2_chain_rec); | ||
560 | } | ||
561 | |||
562 | static inline int ocfs2_extent_recs_per_eb(int blocksize) | ||
563 | { | ||
564 | int size; | ||
565 | |||
566 | size = blocksize - | ||
567 | offsetof(struct ocfs2_extent_block, h_list.l_recs); | ||
568 | |||
569 | return size / sizeof(struct ocfs2_extent_rec); | ||
570 | } | ||
571 | |||
572 | static inline int ocfs2_local_alloc_size(int blocksize) | ||
573 | { | ||
574 | int size; | ||
575 | |||
576 | size = blocksize - | ||
577 | offsetof(struct ocfs2_dinode, id2.i_lab.la_bitmap); | ||
578 | |||
579 | return size; | ||
580 | } | ||
581 | |||
582 | static inline int ocfs2_group_bitmap_size(int blocksize) | ||
583 | { | ||
584 | int size; | ||
585 | |||
586 | size = blocksize - | ||
587 | offsetof(struct ocfs2_group_desc, bg_bitmap); | ||
588 | |||
589 | return size; | ||
590 | } | ||
591 | |||
592 | static inline int ocfs2_truncate_recs_per_inode(int blocksize) | ||
593 | { | ||
594 | int size; | ||
595 | |||
596 | size = blocksize - | ||
597 | offsetof(struct ocfs2_dinode, id2.i_dealloc.tl_recs); | ||
598 | |||
599 | return size / sizeof(struct ocfs2_truncate_rec); | ||
600 | } | ||
601 | #endif /* __KERNEL__ */ | ||
602 | |||
603 | |||
604 | static inline int ocfs2_system_inode_is_global(int type) | ||
605 | { | ||
606 | return ((type >= 0) && | ||
607 | (type <= OCFS2_LAST_GLOBAL_SYSTEM_INODE)); | ||
608 | } | ||
609 | |||
610 | static inline int ocfs2_sprintf_system_inode_name(char *buf, int len, | ||
611 | int type, int slot) | ||
612 | { | ||
613 | int chars; | ||
614 | |||
615 | /* | ||
616 | * Global system inodes can only have one copy. Everything | ||
617 | * after OCFS2_LAST_GLOBAL_SYSTEM_INODE in the system inode | ||
618 | * list has a copy per slot. | ||
619 | */ | ||
620 | if (type <= OCFS2_LAST_GLOBAL_SYSTEM_INODE) | ||
621 | chars = snprintf(buf, len, | ||
622 | ocfs2_system_inodes[type].si_name); | ||
623 | else | ||
624 | chars = snprintf(buf, len, | ||
625 | ocfs2_system_inodes[type].si_name, | ||
626 | slot); | ||
627 | |||
628 | return chars; | ||
629 | } | ||
630 | |||
631 | static inline void ocfs2_set_de_type(struct ocfs2_dir_entry *de, | ||
632 | umode_t mode) | ||
633 | { | ||
634 | de->file_type = ocfs2_type_by_mode[(mode & S_IFMT)>>S_SHIFT]; | ||
635 | } | ||
636 | |||
637 | #endif /* _OCFS2_FS_H */ | ||
638 | |||
diff --git a/fs/ocfs2/ocfs2_lockid.h b/fs/ocfs2/ocfs2_lockid.h new file mode 100644 index 000000000000..7dd9e1e705b0 --- /dev/null +++ b/fs/ocfs2/ocfs2_lockid.h | |||
@@ -0,0 +1,73 @@ | |||
1 | /* -*- mode: c; c-basic-offset: 8; -*- | ||
2 | * vim: noexpandtab sw=8 ts=8 sts=0: | ||
3 | * | ||
4 | * ocfs2_lockid.h | ||
5 | * | ||
6 | * Defines OCFS2 lockid bits. | ||
7 | * | ||
8 | * Copyright (C) 2002, 2005 Oracle. All rights reserved. | ||
9 | * | ||
10 | * This program is free software; you can redistribute it and/or | ||
11 | * modify it under the terms of the GNU General Public | ||
12 | * License as published by the Free Software Foundation; either | ||
13 | * version 2 of the License, or (at your option) any later version. | ||
14 | * | ||
15 | * This program is distributed in the hope that it will be useful, | ||
16 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
17 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | ||
18 | * General Public License for more details. | ||
19 | * | ||
20 | * You should have received a copy of the GNU General Public | ||
21 | * License along with this program; if not, write to the | ||
22 | * Free Software Foundation, Inc., 59 Temple Place - Suite 330, | ||
23 | * Boston, MA 021110-1307, USA. | ||
24 | */ | ||
25 | |||
26 | #ifndef OCFS2_LOCKID_H | ||
27 | #define OCFS2_LOCKID_H | ||
28 | |||
29 | /* lock ids are made up in the following manner: | ||
30 | * name[0] --> type | ||
31 | * name[1-6] --> 6 pad characters, reserved for now | ||
32 | * name[7-22] --> block number, expressed in hex as 16 chars | ||
33 | * name[23-30] --> i_generation, expressed in hex 8 chars | ||
34 | * name[31] --> '\0' */ | ||
35 | #define OCFS2_LOCK_ID_MAX_LEN 32 | ||
36 | #define OCFS2_LOCK_ID_PAD "000000" | ||
37 | |||
/*
 * Kinds of OCFS2 locks.  OCFS2_NUM_LOCK_TYPES is not a lock type; it
 * is the count of the types listed before it.
 */
enum ocfs2_lock_type {
	OCFS2_LOCK_TYPE_META	= 0,
	OCFS2_LOCK_TYPE_DATA	= 1,
	OCFS2_LOCK_TYPE_SUPER	= 2,
	OCFS2_LOCK_TYPE_RENAME	= 3,
	OCFS2_LOCK_TYPE_RW	= 4,
	OCFS2_NUM_LOCK_TYPES
};
46 | |||
47 | static inline char ocfs2_lock_type_char(enum ocfs2_lock_type type) | ||
48 | { | ||
49 | char c; | ||
50 | switch (type) { | ||
51 | case OCFS2_LOCK_TYPE_META: | ||
52 | c = 'M'; | ||
53 | break; | ||
54 | case OCFS2_LOCK_TYPE_DATA: | ||
55 | c = 'D'; | ||
56 | break; | ||
57 | case OCFS2_LOCK_TYPE_SUPER: | ||
58 | c = 'S'; | ||
59 | break; | ||
60 | case OCFS2_LOCK_TYPE_RENAME: | ||
61 | c = 'R'; | ||
62 | break; | ||
63 | case OCFS2_LOCK_TYPE_RW: | ||
64 | c = 'W'; | ||
65 | break; | ||
66 | default: | ||
67 | c = '\0'; | ||
68 | } | ||
69 | |||
70 | return c; | ||
71 | } | ||
72 | |||
73 | #endif /* OCFS2_LOCKID_H */ | ||
diff --git a/fs/ocfs2/slot_map.c b/fs/ocfs2/slot_map.c new file mode 100644 index 000000000000..871627961d6d --- /dev/null +++ b/fs/ocfs2/slot_map.c | |||
@@ -0,0 +1,303 @@ | |||
1 | /* -*- mode: c; c-basic-offset: 8; -*- | ||
2 | * vim: noexpandtab sw=8 ts=8 sts=0: | ||
3 | * | ||
4 | * slot_map.c | ||
5 | * | ||
6 | * | ||
7 | * | ||
8 | * Copyright (C) 2002, 2004 Oracle. All rights reserved. | ||
9 | * | ||
10 | * This program is free software; you can redistribute it and/or | ||
11 | * modify it under the terms of the GNU General Public | ||
12 | * License as published by the Free Software Foundation; either | ||
13 | * version 2 of the License, or (at your option) any later version. | ||
14 | * | ||
15 | * This program is distributed in the hope that it will be useful, | ||
16 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
17 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | ||
18 | * General Public License for more details. | ||
19 | * | ||
20 | * You should have received a copy of the GNU General Public | ||
21 | * License along with this program; if not, write to the | ||
22 | * Free Software Foundation, Inc., 59 Temple Place - Suite 330, | ||
23 | * Boston, MA 021110-1307, USA. | ||
24 | */ | ||
25 | |||
26 | #include <linux/types.h> | ||
27 | #include <linux/slab.h> | ||
28 | #include <linux/highmem.h> | ||
29 | #include <linux/smp_lock.h> | ||
30 | |||
31 | #define MLOG_MASK_PREFIX ML_SUPER | ||
32 | #include <cluster/masklog.h> | ||
33 | |||
34 | #include "ocfs2.h" | ||
35 | |||
36 | #include "dlmglue.h" | ||
37 | #include "extent_map.h" | ||
38 | #include "heartbeat.h" | ||
39 | #include "inode.h" | ||
40 | #include "slot_map.h" | ||
41 | #include "super.h" | ||
42 | #include "sysfile.h" | ||
43 | |||
44 | #include "buffer_head_io.h" | ||
45 | |||
46 | static s16 __ocfs2_node_num_to_slot(struct ocfs2_slot_info *si, | ||
47 | s16 global); | ||
48 | static void __ocfs2_fill_slot(struct ocfs2_slot_info *si, | ||
49 | s16 slot_num, | ||
50 | s16 node_num); | ||
51 | |||
52 | /* Use the slot information we've collected to create a map of mounted | ||
53 | * nodes. Should be holding an EX on super block. assumes slot info is | ||
54 | * up to date. Note that we call this *after* we find a slot, so our | ||
55 | * own node should be set in the map too... */ | ||
56 | void ocfs2_populate_mounted_map(struct ocfs2_super *osb) | ||
57 | { | ||
58 | int i; | ||
59 | struct ocfs2_slot_info *si = osb->slot_info; | ||
60 | |||
61 | spin_lock(&si->si_lock); | ||
62 | |||
63 | for (i = 0; i < si->si_size; i++) | ||
64 | if (si->si_global_node_nums[i] != OCFS2_INVALID_SLOT) | ||
65 | ocfs2_node_map_set_bit(osb, &osb->mounted_map, | ||
66 | si->si_global_node_nums[i]); | ||
67 | |||
68 | spin_unlock(&si->si_lock); | ||
69 | } | ||
70 | |||
71 | /* post the slot information on disk into our slot_info struct. */ | ||
72 | void ocfs2_update_slot_info(struct ocfs2_slot_info *si) | ||
73 | { | ||
74 | int i; | ||
75 | __le16 *disk_info; | ||
76 | |||
77 | /* we don't read the slot block here as ocfs2_super_lock | ||
78 | * should've made sure we have the most recent copy. */ | ||
79 | spin_lock(&si->si_lock); | ||
80 | disk_info = (__le16 *) si->si_bh->b_data; | ||
81 | |||
82 | for (i = 0; i < si->si_size; i++) | ||
83 | si->si_global_node_nums[i] = le16_to_cpu(disk_info[i]); | ||
84 | |||
85 | spin_unlock(&si->si_lock); | ||
86 | } | ||
87 | |||
88 | /* post the our slot info stuff into it's destination bh and write it | ||
89 | * out. */ | ||
90 | int ocfs2_update_disk_slots(struct ocfs2_super *osb, | ||
91 | struct ocfs2_slot_info *si) | ||
92 | { | ||
93 | int status, i; | ||
94 | __le16 *disk_info = (__le16 *) si->si_bh->b_data; | ||
95 | |||
96 | spin_lock(&si->si_lock); | ||
97 | for (i = 0; i < si->si_size; i++) | ||
98 | disk_info[i] = cpu_to_le16(si->si_global_node_nums[i]); | ||
99 | spin_unlock(&si->si_lock); | ||
100 | |||
101 | status = ocfs2_write_block(osb, si->si_bh, si->si_inode); | ||
102 | if (status < 0) | ||
103 | mlog_errno(status); | ||
104 | |||
105 | return status; | ||
106 | } | ||
107 | |||
108 | /* try to find global node in the slot info. Returns | ||
109 | * OCFS2_INVALID_SLOT if nothing is found. */ | ||
110 | static s16 __ocfs2_node_num_to_slot(struct ocfs2_slot_info *si, | ||
111 | s16 global) | ||
112 | { | ||
113 | int i; | ||
114 | s16 ret = OCFS2_INVALID_SLOT; | ||
115 | |||
116 | for(i = 0; i < si->si_num_slots; i++) { | ||
117 | if (global == si->si_global_node_nums[i]) { | ||
118 | ret = (s16) i; | ||
119 | break; | ||
120 | } | ||
121 | } | ||
122 | return ret; | ||
123 | } | ||
124 | |||
125 | static s16 __ocfs2_find_empty_slot(struct ocfs2_slot_info *si) | ||
126 | { | ||
127 | int i; | ||
128 | s16 ret = OCFS2_INVALID_SLOT; | ||
129 | |||
130 | for(i = 0; i < si->si_num_slots; i++) { | ||
131 | if (OCFS2_INVALID_SLOT == si->si_global_node_nums[i]) { | ||
132 | ret = (s16) i; | ||
133 | break; | ||
134 | } | ||
135 | } | ||
136 | return ret; | ||
137 | } | ||
138 | |||
139 | s16 ocfs2_node_num_to_slot(struct ocfs2_slot_info *si, | ||
140 | s16 global) | ||
141 | { | ||
142 | s16 ret; | ||
143 | |||
144 | spin_lock(&si->si_lock); | ||
145 | ret = __ocfs2_node_num_to_slot(si, global); | ||
146 | spin_unlock(&si->si_lock); | ||
147 | return ret; | ||
148 | } | ||
149 | |||
150 | static void __ocfs2_fill_slot(struct ocfs2_slot_info *si, | ||
151 | s16 slot_num, | ||
152 | s16 node_num) | ||
153 | { | ||
154 | BUG_ON(slot_num == OCFS2_INVALID_SLOT); | ||
155 | BUG_ON(slot_num >= si->si_num_slots); | ||
156 | BUG_ON((node_num != O2NM_INVALID_NODE_NUM) && | ||
157 | (node_num >= O2NM_MAX_NODES)); | ||
158 | |||
159 | si->si_global_node_nums[slot_num] = node_num; | ||
160 | } | ||
161 | |||
162 | void ocfs2_clear_slot(struct ocfs2_slot_info *si, | ||
163 | s16 slot_num) | ||
164 | { | ||
165 | spin_lock(&si->si_lock); | ||
166 | __ocfs2_fill_slot(si, slot_num, OCFS2_INVALID_SLOT); | ||
167 | spin_unlock(&si->si_lock); | ||
168 | } | ||
169 | |||
170 | int ocfs2_init_slot_info(struct ocfs2_super *osb) | ||
171 | { | ||
172 | int status, i; | ||
173 | u64 blkno; | ||
174 | struct inode *inode = NULL; | ||
175 | struct buffer_head *bh = NULL; | ||
176 | struct ocfs2_slot_info *si; | ||
177 | |||
178 | si = kcalloc(1, sizeof(struct ocfs2_slot_info), GFP_KERNEL); | ||
179 | if (!si) { | ||
180 | status = -ENOMEM; | ||
181 | mlog_errno(status); | ||
182 | goto bail; | ||
183 | } | ||
184 | |||
185 | spin_lock_init(&si->si_lock); | ||
186 | si->si_num_slots = osb->max_slots; | ||
187 | si->si_size = OCFS2_MAX_SLOTS; | ||
188 | |||
189 | for(i = 0; i < si->si_num_slots; i++) | ||
190 | si->si_global_node_nums[i] = OCFS2_INVALID_SLOT; | ||
191 | |||
192 | inode = ocfs2_get_system_file_inode(osb, SLOT_MAP_SYSTEM_INODE, | ||
193 | OCFS2_INVALID_SLOT); | ||
194 | if (!inode) { | ||
195 | status = -EINVAL; | ||
196 | mlog_errno(status); | ||
197 | goto bail; | ||
198 | } | ||
199 | |||
200 | status = ocfs2_extent_map_get_blocks(inode, 0ULL, 1, &blkno, NULL); | ||
201 | if (status < 0) { | ||
202 | mlog_errno(status); | ||
203 | goto bail; | ||
204 | } | ||
205 | |||
206 | status = ocfs2_read_block(osb, blkno, &bh, 0, inode); | ||
207 | if (status < 0) { | ||
208 | mlog_errno(status); | ||
209 | goto bail; | ||
210 | } | ||
211 | |||
212 | si->si_inode = inode; | ||
213 | si->si_bh = bh; | ||
214 | osb->slot_info = si; | ||
215 | bail: | ||
216 | if (status < 0 && si) | ||
217 | ocfs2_free_slot_info(si); | ||
218 | |||
219 | return status; | ||
220 | } | ||
221 | |||
222 | void ocfs2_free_slot_info(struct ocfs2_slot_info *si) | ||
223 | { | ||
224 | if (si->si_inode) | ||
225 | iput(si->si_inode); | ||
226 | if (si->si_bh) | ||
227 | brelse(si->si_bh); | ||
228 | kfree(si); | ||
229 | } | ||
230 | |||
231 | int ocfs2_find_slot(struct ocfs2_super *osb) | ||
232 | { | ||
233 | int status; | ||
234 | s16 slot; | ||
235 | struct ocfs2_slot_info *si; | ||
236 | |||
237 | mlog_entry_void(); | ||
238 | |||
239 | si = osb->slot_info; | ||
240 | |||
241 | ocfs2_update_slot_info(si); | ||
242 | |||
243 | spin_lock(&si->si_lock); | ||
244 | /* search for ourselves first and take the slot if it already | ||
245 | * exists. Perhaps we need to mark this in a variable for our | ||
246 | * own journal recovery? Possibly not, though we certainly | ||
247 | * need to warn to the user */ | ||
248 | slot = __ocfs2_node_num_to_slot(si, osb->node_num); | ||
249 | if (slot == OCFS2_INVALID_SLOT) { | ||
250 | /* if no slot yet, then just take 1st available | ||
251 | * one. */ | ||
252 | slot = __ocfs2_find_empty_slot(si); | ||
253 | if (slot == OCFS2_INVALID_SLOT) { | ||
254 | spin_unlock(&si->si_lock); | ||
255 | mlog(ML_ERROR, "no free slots available!\n"); | ||
256 | status = -EINVAL; | ||
257 | goto bail; | ||
258 | } | ||
259 | } else | ||
260 | mlog(ML_NOTICE, "slot %d is already allocated to this node!\n", | ||
261 | slot); | ||
262 | |||
263 | __ocfs2_fill_slot(si, slot, osb->node_num); | ||
264 | osb->slot_num = slot; | ||
265 | spin_unlock(&si->si_lock); | ||
266 | |||
267 | mlog(ML_NOTICE, "taking node slot %d\n", osb->slot_num); | ||
268 | |||
269 | status = ocfs2_update_disk_slots(osb, si); | ||
270 | if (status < 0) | ||
271 | mlog_errno(status); | ||
272 | |||
273 | bail: | ||
274 | mlog_exit(status); | ||
275 | return status; | ||
276 | } | ||
277 | |||
278 | void ocfs2_put_slot(struct ocfs2_super *osb) | ||
279 | { | ||
280 | int status; | ||
281 | struct ocfs2_slot_info *si = osb->slot_info; | ||
282 | |||
283 | if (!si) | ||
284 | return; | ||
285 | |||
286 | ocfs2_update_slot_info(si); | ||
287 | |||
288 | spin_lock(&si->si_lock); | ||
289 | __ocfs2_fill_slot(si, osb->slot_num, OCFS2_INVALID_SLOT); | ||
290 | osb->slot_num = OCFS2_INVALID_SLOT; | ||
291 | spin_unlock(&si->si_lock); | ||
292 | |||
293 | status = ocfs2_update_disk_slots(osb, si); | ||
294 | if (status < 0) { | ||
295 | mlog_errno(status); | ||
296 | goto bail; | ||
297 | } | ||
298 | |||
299 | bail: | ||
300 | osb->slot_info = NULL; | ||
301 | ocfs2_free_slot_info(si); | ||
302 | } | ||
303 | |||
diff --git a/fs/ocfs2/slot_map.h b/fs/ocfs2/slot_map.h new file mode 100644 index 000000000000..d8c8ceed031b --- /dev/null +++ b/fs/ocfs2/slot_map.h | |||
@@ -0,0 +1,66 @@ | |||
1 | /* -*- mode: c; c-basic-offset: 8; -*- | ||
2 | * vim: noexpandtab sw=8 ts=8 sts=0: | ||
3 | * | ||
4 | * slot_map.h | ||
5 | * | ||
6 | * Slot map data structure and function prototypes | ||
7 | * | ||
8 | * Copyright (C) 2002, 2004 Oracle. All rights reserved. | ||
9 | * | ||
10 | * This program is free software; you can redistribute it and/or | ||
11 | * modify it under the terms of the GNU General Public | ||
12 | * License as published by the Free Software Foundation; either | ||
13 | * version 2 of the License, or (at your option) any later version. | ||
14 | * | ||
15 | * This program is distributed in the hope that it will be useful, | ||
16 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
17 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | ||
18 | * General Public License for more details. | ||
19 | * | ||
20 | * You should have received a copy of the GNU General Public | ||
21 | * License along with this program; if not, write to the | ||
22 | * Free Software Foundation, Inc., 59 Temple Place - Suite 330, | ||
23 | * Boston, MA 021110-1307, USA. | ||
24 | */ | ||
25 | |||
26 | |||
27 | #ifndef SLOTMAP_H | ||
28 | #define SLOTMAP_H | ||
29 | |||
30 | struct ocfs2_slot_info { | ||
31 | spinlock_t si_lock; | ||
32 | |||
33 | struct inode *si_inode; | ||
34 | struct buffer_head *si_bh; | ||
35 | unsigned int si_num_slots; | ||
36 | unsigned int si_size; | ||
37 | s16 si_global_node_nums[OCFS2_MAX_SLOTS]; | ||
38 | }; | ||
39 | |||
40 | int ocfs2_init_slot_info(struct ocfs2_super *osb); | ||
41 | void ocfs2_free_slot_info(struct ocfs2_slot_info *si); | ||
42 | |||
43 | int ocfs2_find_slot(struct ocfs2_super *osb); | ||
44 | void ocfs2_put_slot(struct ocfs2_super *osb); | ||
45 | |||
46 | void ocfs2_update_slot_info(struct ocfs2_slot_info *si); | ||
47 | int ocfs2_update_disk_slots(struct ocfs2_super *osb, | ||
48 | struct ocfs2_slot_info *si); | ||
49 | |||
50 | s16 ocfs2_node_num_to_slot(struct ocfs2_slot_info *si, | ||
51 | s16 global); | ||
52 | void ocfs2_clear_slot(struct ocfs2_slot_info *si, | ||
53 | s16 slot_num); | ||
54 | |||
55 | void ocfs2_populate_mounted_map(struct ocfs2_super *osb); | ||
56 | |||
57 | static inline int ocfs2_is_empty_slot(struct ocfs2_slot_info *si, | ||
58 | int slot_num) | ||
59 | { | ||
60 | BUG_ON(slot_num == OCFS2_INVALID_SLOT); | ||
61 | assert_spin_locked(&si->si_lock); | ||
62 | |||
63 | return si->si_global_node_nums[slot_num] == OCFS2_INVALID_SLOT; | ||
64 | } | ||
65 | |||
66 | #endif | ||
diff --git a/fs/ocfs2/suballoc.c b/fs/ocfs2/suballoc.c new file mode 100644 index 000000000000..c46c164aefbb --- /dev/null +++ b/fs/ocfs2/suballoc.c | |||
@@ -0,0 +1,1651 @@ | |||
1 | /* -*- mode: c; c-basic-offset: 8; -*- | ||
2 | * vim: noexpandtab sw=8 ts=8 sts=0: | ||
3 | * | ||
4 | * suballoc.c | ||
5 | * | ||
6 | * metadata alloc and free | ||
7 | * Inspired by ext3 block groups. | ||
8 | * | ||
9 | * Copyright (C) 2002, 2004 Oracle. All rights reserved. | ||
10 | * | ||
11 | * This program is free software; you can redistribute it and/or | ||
12 | * modify it under the terms of the GNU General Public | ||
13 | * License as published by the Free Software Foundation; either | ||
14 | * version 2 of the License, or (at your option) any later version. | ||
15 | * | ||
16 | * This program is distributed in the hope that it will be useful, | ||
17 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
18 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | ||
19 | * General Public License for more details. | ||
20 | * | ||
21 | * You should have received a copy of the GNU General Public | ||
22 | * License along with this program; if not, write to the | ||
23 | * Free Software Foundation, Inc., 59 Temple Place - Suite 330, | ||
24 | * Boston, MA 02111-1307, USA. | ||
25 | */ | ||
26 | |||
27 | #include <linux/fs.h> | ||
28 | #include <linux/types.h> | ||
29 | #include <linux/slab.h> | ||
30 | #include <linux/highmem.h> | ||
31 | |||
32 | #define MLOG_MASK_PREFIX ML_DISK_ALLOC | ||
33 | #include <cluster/masklog.h> | ||
34 | |||
35 | #include "ocfs2.h" | ||
36 | |||
37 | #include "alloc.h" | ||
38 | #include "dlmglue.h" | ||
39 | #include "inode.h" | ||
40 | #include "journal.h" | ||
41 | #include "localalloc.h" | ||
42 | #include "suballoc.h" | ||
43 | #include "super.h" | ||
44 | #include "sysfile.h" | ||
45 | #include "uptodate.h" | ||
46 | |||
47 | #include "buffer_head_io.h" | ||
48 | |||
49 | static inline void ocfs2_debug_bg(struct ocfs2_group_desc *bg); | ||
50 | static inline void ocfs2_debug_suballoc_inode(struct ocfs2_dinode *fe); | ||
51 | static inline u16 ocfs2_find_victim_chain(struct ocfs2_chain_list *cl); | ||
52 | static int ocfs2_block_group_fill(struct ocfs2_journal_handle *handle, | ||
53 | struct inode *alloc_inode, | ||
54 | struct buffer_head *bg_bh, | ||
55 | u64 group_blkno, | ||
56 | u16 my_chain, | ||
57 | struct ocfs2_chain_list *cl); | ||
58 | static int ocfs2_block_group_alloc(struct ocfs2_super *osb, | ||
59 | struct inode *alloc_inode, | ||
60 | struct buffer_head *bh); | ||
61 | |||
62 | static int ocfs2_reserve_suballoc_bits(struct ocfs2_super *osb, | ||
63 | struct ocfs2_alloc_context *ac); | ||
64 | |||
65 | static int ocfs2_cluster_group_search(struct inode *inode, | ||
66 | struct buffer_head *group_bh, | ||
67 | u32 bits_wanted, u32 min_bits, | ||
68 | u16 *bit_off, u16 *bits_found); | ||
69 | static int ocfs2_block_group_search(struct inode *inode, | ||
70 | struct buffer_head *group_bh, | ||
71 | u32 bits_wanted, u32 min_bits, | ||
72 | u16 *bit_off, u16 *bits_found); | ||
73 | static int ocfs2_search_chain(struct ocfs2_alloc_context *ac, | ||
74 | u32 bits_wanted, | ||
75 | u32 min_bits, | ||
76 | u16 *bit_off, | ||
77 | unsigned int *num_bits, | ||
78 | u64 *bg_blkno); | ||
79 | static int ocfs2_claim_suballoc_bits(struct ocfs2_super *osb, | ||
80 | struct ocfs2_alloc_context *ac, | ||
81 | u32 bits_wanted, | ||
82 | u32 min_bits, | ||
83 | u16 *bit_off, | ||
84 | unsigned int *num_bits, | ||
85 | u64 *bg_blkno); | ||
86 | static int ocfs2_test_bg_bit_allocatable(struct buffer_head *bg_bh, | ||
87 | int nr); | ||
88 | static int ocfs2_block_group_find_clear_bits(struct ocfs2_super *osb, | ||
89 | struct buffer_head *bg_bh, | ||
90 | unsigned int bits_wanted, | ||
91 | u16 *bit_off, | ||
92 | u16 *bits_found); | ||
93 | static inline int ocfs2_block_group_set_bits(struct ocfs2_journal_handle *handle, | ||
94 | struct inode *alloc_inode, | ||
95 | struct ocfs2_group_desc *bg, | ||
96 | struct buffer_head *group_bh, | ||
97 | unsigned int bit_off, | ||
98 | unsigned int num_bits); | ||
99 | static inline int ocfs2_block_group_clear_bits(struct ocfs2_journal_handle *handle, | ||
100 | struct inode *alloc_inode, | ||
101 | struct ocfs2_group_desc *bg, | ||
102 | struct buffer_head *group_bh, | ||
103 | unsigned int bit_off, | ||
104 | unsigned int num_bits); | ||
105 | |||
106 | static int ocfs2_relink_block_group(struct ocfs2_journal_handle *handle, | ||
107 | struct inode *alloc_inode, | ||
108 | struct buffer_head *fe_bh, | ||
109 | struct buffer_head *bg_bh, | ||
110 | struct buffer_head *prev_bg_bh, | ||
111 | u16 chain); | ||
112 | static inline int ocfs2_block_group_reasonably_empty(struct ocfs2_group_desc *bg, | ||
113 | u32 wanted); | ||
114 | static int ocfs2_free_suballoc_bits(struct ocfs2_journal_handle *handle, | ||
115 | struct inode *alloc_inode, | ||
116 | struct buffer_head *alloc_bh, | ||
117 | unsigned int start_bit, | ||
118 | u64 bg_blkno, | ||
119 | unsigned int count); | ||
120 | static inline u64 ocfs2_which_suballoc_group(u64 block, | ||
121 | unsigned int bit); | ||
122 | static inline u32 ocfs2_desc_bitmap_to_cluster_off(struct inode *inode, | ||
123 | u64 bg_blkno, | ||
124 | u16 bg_bit_off); | ||
125 | static inline u64 ocfs2_which_cluster_group(struct inode *inode, | ||
126 | u32 cluster); | ||
127 | static inline void ocfs2_block_to_cluster_group(struct inode *inode, | ||
128 | u64 data_blkno, | ||
129 | u64 *bg_blkno, | ||
130 | u16 *bg_bit_off); | ||
131 | |||
132 | void ocfs2_free_alloc_context(struct ocfs2_alloc_context *ac) | ||
133 | { | ||
134 | if (ac->ac_inode) | ||
135 | iput(ac->ac_inode); | ||
136 | if (ac->ac_bh) | ||
137 | brelse(ac->ac_bh); | ||
138 | kfree(ac); | ||
139 | } | ||
140 | |||
141 | static u32 ocfs2_bits_per_group(struct ocfs2_chain_list *cl) | ||
142 | { | ||
143 | return (u32)le16_to_cpu(cl->cl_cpg) * (u32)le16_to_cpu(cl->cl_bpc); | ||
144 | } | ||
145 | |||
146 | static int ocfs2_block_group_fill(struct ocfs2_journal_handle *handle, | ||
147 | struct inode *alloc_inode, | ||
148 | struct buffer_head *bg_bh, | ||
149 | u64 group_blkno, | ||
150 | u16 my_chain, | ||
151 | struct ocfs2_chain_list *cl) | ||
152 | { | ||
153 | int status = 0; | ||
154 | struct ocfs2_group_desc *bg = (struct ocfs2_group_desc *) bg_bh->b_data; | ||
155 | struct super_block * sb = alloc_inode->i_sb; | ||
156 | |||
157 | mlog_entry_void(); | ||
158 | |||
159 | if (((unsigned long long) bg_bh->b_blocknr) != group_blkno) { | ||
160 | ocfs2_error(alloc_inode->i_sb, "group block (%"MLFu64") " | ||
161 | "!= b_blocknr (%llu)", group_blkno, | ||
162 | (unsigned long long) bg_bh->b_blocknr); | ||
163 | status = -EIO; | ||
164 | goto bail; | ||
165 | } | ||
166 | |||
167 | status = ocfs2_journal_access(handle, | ||
168 | alloc_inode, | ||
169 | bg_bh, | ||
170 | OCFS2_JOURNAL_ACCESS_CREATE); | ||
171 | if (status < 0) { | ||
172 | mlog_errno(status); | ||
173 | goto bail; | ||
174 | } | ||
175 | |||
176 | memset(bg, 0, sb->s_blocksize); | ||
177 | strcpy(bg->bg_signature, OCFS2_GROUP_DESC_SIGNATURE); | ||
178 | bg->bg_generation = cpu_to_le32(OCFS2_SB(sb)->fs_generation); | ||
179 | bg->bg_size = cpu_to_le16(ocfs2_group_bitmap_size(sb)); | ||
180 | bg->bg_bits = cpu_to_le16(ocfs2_bits_per_group(cl)); | ||
181 | bg->bg_chain = cpu_to_le16(my_chain); | ||
182 | bg->bg_next_group = cl->cl_recs[my_chain].c_blkno; | ||
183 | bg->bg_parent_dinode = cpu_to_le64(OCFS2_I(alloc_inode)->ip_blkno); | ||
184 | bg->bg_blkno = cpu_to_le64(group_blkno); | ||
185 | /* set the 1st bit in the bitmap to account for the descriptor block */ | ||
186 | ocfs2_set_bit(0, (unsigned long *)bg->bg_bitmap); | ||
187 | bg->bg_free_bits_count = cpu_to_le16(le16_to_cpu(bg->bg_bits) - 1); | ||
188 | |||
189 | status = ocfs2_journal_dirty(handle, bg_bh); | ||
190 | if (status < 0) | ||
191 | mlog_errno(status); | ||
192 | |||
193 | /* There is no need to zero out or otherwise initialize the | ||
194 | * other blocks in a group - All valid FS metadata in a block | ||
195 | * group stores the superblock fs_generation value at | ||
196 | * allocation time. */ | ||
197 | |||
198 | bail: | ||
199 | mlog_exit(status); | ||
200 | return status; | ||
201 | } | ||
202 | |||
203 | static inline u16 ocfs2_find_smallest_chain(struct ocfs2_chain_list *cl) | ||
204 | { | ||
205 | u16 curr, best; | ||
206 | |||
207 | best = curr = 0; | ||
208 | while (curr < le16_to_cpu(cl->cl_count)) { | ||
209 | if (le32_to_cpu(cl->cl_recs[best].c_total) > | ||
210 | le32_to_cpu(cl->cl_recs[curr].c_total)) | ||
211 | best = curr; | ||
212 | curr++; | ||
213 | } | ||
214 | return best; | ||
215 | } | ||
216 | |||
217 | /* | ||
218 | * We expect the block group allocator to already be locked. | ||
219 | */ | ||
220 | static int ocfs2_block_group_alloc(struct ocfs2_super *osb, | ||
221 | struct inode *alloc_inode, | ||
222 | struct buffer_head *bh) | ||
223 | { | ||
224 | int status, credits; | ||
225 | struct ocfs2_dinode *fe = (struct ocfs2_dinode *) bh->b_data; | ||
226 | struct ocfs2_chain_list *cl; | ||
227 | struct ocfs2_alloc_context *ac = NULL; | ||
228 | struct ocfs2_journal_handle *handle = NULL; | ||
229 | u32 bit_off, num_bits; | ||
230 | u16 alloc_rec; | ||
231 | u64 bg_blkno; | ||
232 | struct buffer_head *bg_bh = NULL; | ||
233 | struct ocfs2_group_desc *bg; | ||
234 | |||
235 | BUG_ON(ocfs2_is_cluster_bitmap(alloc_inode)); | ||
236 | |||
237 | mlog_entry_void(); | ||
238 | |||
239 | handle = ocfs2_alloc_handle(osb); | ||
240 | if (!handle) { | ||
241 | status = -ENOMEM; | ||
242 | mlog_errno(status); | ||
243 | goto bail; | ||
244 | } | ||
245 | |||
246 | cl = &fe->id2.i_chain; | ||
247 | status = ocfs2_reserve_clusters(osb, | ||
248 | handle, | ||
249 | le16_to_cpu(cl->cl_cpg), | ||
250 | &ac); | ||
251 | if (status < 0) { | ||
252 | if (status != -ENOSPC) | ||
253 | mlog_errno(status); | ||
254 | goto bail; | ||
255 | } | ||
256 | |||
257 | credits = ocfs2_calc_group_alloc_credits(osb->sb, | ||
258 | le16_to_cpu(cl->cl_cpg)); | ||
259 | handle = ocfs2_start_trans(osb, handle, credits); | ||
260 | if (IS_ERR(handle)) { | ||
261 | status = PTR_ERR(handle); | ||
262 | handle = NULL; | ||
263 | mlog_errno(status); | ||
264 | goto bail; | ||
265 | } | ||
266 | |||
267 | status = ocfs2_claim_clusters(osb, | ||
268 | handle, | ||
269 | ac, | ||
270 | le16_to_cpu(cl->cl_cpg), | ||
271 | &bit_off, | ||
272 | &num_bits); | ||
273 | if (status < 0) { | ||
274 | if (status != -ENOSPC) | ||
275 | mlog_errno(status); | ||
276 | goto bail; | ||
277 | } | ||
278 | |||
279 | alloc_rec = ocfs2_find_smallest_chain(cl); | ||
280 | |||
281 | /* setup the group */ | ||
282 | bg_blkno = ocfs2_clusters_to_blocks(osb->sb, bit_off); | ||
283 | mlog(0, "new descriptor, record %u, at block %"MLFu64"\n", | ||
284 | alloc_rec, bg_blkno); | ||
285 | |||
286 | bg_bh = sb_getblk(osb->sb, bg_blkno); | ||
287 | if (!bg_bh) { | ||
288 | status = -EIO; | ||
289 | mlog_errno(status); | ||
290 | goto bail; | ||
291 | } | ||
292 | ocfs2_set_new_buffer_uptodate(alloc_inode, bg_bh); | ||
293 | |||
294 | status = ocfs2_block_group_fill(handle, | ||
295 | alloc_inode, | ||
296 | bg_bh, | ||
297 | bg_blkno, | ||
298 | alloc_rec, | ||
299 | cl); | ||
300 | if (status < 0) { | ||
301 | mlog_errno(status); | ||
302 | goto bail; | ||
303 | } | ||
304 | |||
305 | bg = (struct ocfs2_group_desc *) bg_bh->b_data; | ||
306 | |||
307 | status = ocfs2_journal_access(handle, alloc_inode, | ||
308 | bh, OCFS2_JOURNAL_ACCESS_WRITE); | ||
309 | if (status < 0) { | ||
310 | mlog_errno(status); | ||
311 | goto bail; | ||
312 | } | ||
313 | |||
314 | le32_add_cpu(&cl->cl_recs[alloc_rec].c_free, | ||
315 | le16_to_cpu(bg->bg_free_bits_count)); | ||
316 | le32_add_cpu(&cl->cl_recs[alloc_rec].c_total, le16_to_cpu(bg->bg_bits)); | ||
317 | cl->cl_recs[alloc_rec].c_blkno = cpu_to_le64(bg_blkno); | ||
318 | if (le16_to_cpu(cl->cl_next_free_rec) < le16_to_cpu(cl->cl_count)) | ||
319 | le16_add_cpu(&cl->cl_next_free_rec, 1); | ||
320 | |||
321 | le32_add_cpu(&fe->id1.bitmap1.i_used, le16_to_cpu(bg->bg_bits) - | ||
322 | le16_to_cpu(bg->bg_free_bits_count)); | ||
323 | le32_add_cpu(&fe->id1.bitmap1.i_total, le16_to_cpu(bg->bg_bits)); | ||
324 | le32_add_cpu(&fe->i_clusters, le16_to_cpu(cl->cl_cpg)); | ||
325 | |||
326 | status = ocfs2_journal_dirty(handle, bh); | ||
327 | if (status < 0) { | ||
328 | mlog_errno(status); | ||
329 | goto bail; | ||
330 | } | ||
331 | |||
332 | spin_lock(&OCFS2_I(alloc_inode)->ip_lock); | ||
333 | OCFS2_I(alloc_inode)->ip_clusters = le32_to_cpu(fe->i_clusters); | ||
334 | fe->i_size = cpu_to_le64(ocfs2_clusters_to_bytes(alloc_inode->i_sb, | ||
335 | le32_to_cpu(fe->i_clusters))); | ||
336 | spin_unlock(&OCFS2_I(alloc_inode)->ip_lock); | ||
337 | i_size_write(alloc_inode, le64_to_cpu(fe->i_size)); | ||
338 | alloc_inode->i_blocks = | ||
339 | ocfs2_align_bytes_to_sectors(i_size_read(alloc_inode)); | ||
340 | |||
341 | status = 0; | ||
342 | bail: | ||
343 | if (handle) | ||
344 | ocfs2_commit_trans(handle); | ||
345 | |||
346 | if (ac) | ||
347 | ocfs2_free_alloc_context(ac); | ||
348 | |||
349 | if (bg_bh) | ||
350 | brelse(bg_bh); | ||
351 | |||
352 | mlog_exit(status); | ||
353 | return status; | ||
354 | } | ||
355 | |||
356 | static int ocfs2_reserve_suballoc_bits(struct ocfs2_super *osb, | ||
357 | struct ocfs2_alloc_context *ac) | ||
358 | { | ||
359 | int status; | ||
360 | u32 bits_wanted = ac->ac_bits_wanted; | ||
361 | struct inode *alloc_inode = ac->ac_inode; | ||
362 | struct buffer_head *bh = NULL; | ||
363 | struct ocfs2_journal_handle *handle = ac->ac_handle; | ||
364 | struct ocfs2_dinode *fe; | ||
365 | u32 free_bits; | ||
366 | |||
367 | mlog_entry_void(); | ||
368 | |||
369 | BUG_ON(handle->flags & OCFS2_HANDLE_STARTED); | ||
370 | |||
371 | ocfs2_handle_add_inode(handle, alloc_inode); | ||
372 | status = ocfs2_meta_lock(alloc_inode, handle, &bh, 1); | ||
373 | if (status < 0) { | ||
374 | mlog_errno(status); | ||
375 | goto bail; | ||
376 | } | ||
377 | |||
378 | fe = (struct ocfs2_dinode *) bh->b_data; | ||
379 | if (!OCFS2_IS_VALID_DINODE(fe)) { | ||
380 | OCFS2_RO_ON_INVALID_DINODE(alloc_inode->i_sb, fe); | ||
381 | status = -EIO; | ||
382 | goto bail; | ||
383 | } | ||
384 | if (!(fe->i_flags & cpu_to_le32(OCFS2_CHAIN_FL))) { | ||
385 | ocfs2_error(alloc_inode->i_sb, "Invalid chain allocator " | ||
386 | "# %"MLFu64, le64_to_cpu(fe->i_blkno)); | ||
387 | status = -EIO; | ||
388 | goto bail; | ||
389 | } | ||
390 | |||
391 | free_bits = le32_to_cpu(fe->id1.bitmap1.i_total) - | ||
392 | le32_to_cpu(fe->id1.bitmap1.i_used); | ||
393 | |||
394 | if (bits_wanted > free_bits) { | ||
395 | /* cluster bitmap never grows */ | ||
396 | if (ocfs2_is_cluster_bitmap(alloc_inode)) { | ||
397 | mlog(0, "Disk Full: wanted=%u, free_bits=%u\n", | ||
398 | bits_wanted, free_bits); | ||
399 | status = -ENOSPC; | ||
400 | goto bail; | ||
401 | } | ||
402 | |||
403 | status = ocfs2_block_group_alloc(osb, alloc_inode, bh); | ||
404 | if (status < 0) { | ||
405 | if (status != -ENOSPC) | ||
406 | mlog_errno(status); | ||
407 | goto bail; | ||
408 | } | ||
409 | atomic_inc(&osb->alloc_stats.bg_extends); | ||
410 | |||
411 | /* You should never ask for this much metadata */ | ||
412 | BUG_ON(bits_wanted > | ||
413 | (le32_to_cpu(fe->id1.bitmap1.i_total) | ||
414 | - le32_to_cpu(fe->id1.bitmap1.i_used))); | ||
415 | } | ||
416 | |||
417 | get_bh(bh); | ||
418 | ac->ac_bh = bh; | ||
419 | bail: | ||
420 | if (bh) | ||
421 | brelse(bh); | ||
422 | |||
423 | mlog_exit(status); | ||
424 | return status; | ||
425 | } | ||
426 | |||
427 | int ocfs2_reserve_new_metadata(struct ocfs2_super *osb, | ||
428 | struct ocfs2_journal_handle *handle, | ||
429 | struct ocfs2_dinode *fe, | ||
430 | struct ocfs2_alloc_context **ac) | ||
431 | { | ||
432 | int status; | ||
433 | struct inode *alloc_inode = NULL; | ||
434 | |||
435 | *ac = kcalloc(1, sizeof(struct ocfs2_alloc_context), GFP_KERNEL); | ||
436 | if (!(*ac)) { | ||
437 | status = -ENOMEM; | ||
438 | mlog_errno(status); | ||
439 | goto bail; | ||
440 | } | ||
441 | |||
442 | (*ac)->ac_bits_wanted = ocfs2_extend_meta_needed(fe); | ||
443 | (*ac)->ac_handle = handle; | ||
444 | (*ac)->ac_which = OCFS2_AC_USE_META; | ||
445 | |||
446 | #ifndef OCFS2_USE_ALL_METADATA_SUBALLOCATORS | ||
447 | alloc_inode = ocfs2_get_system_file_inode(osb, | ||
448 | EXTENT_ALLOC_SYSTEM_INODE, | ||
449 | 0); | ||
450 | #else | ||
451 | alloc_inode = ocfs2_get_system_file_inode(osb, | ||
452 | EXTENT_ALLOC_SYSTEM_INODE, | ||
453 | osb->slot_num); | ||
454 | #endif | ||
455 | if (!alloc_inode) { | ||
456 | status = -ENOMEM; | ||
457 | mlog_errno(status); | ||
458 | goto bail; | ||
459 | } | ||
460 | |||
461 | (*ac)->ac_inode = igrab(alloc_inode); | ||
462 | (*ac)->ac_group_search = ocfs2_block_group_search; | ||
463 | |||
464 | status = ocfs2_reserve_suballoc_bits(osb, (*ac)); | ||
465 | if (status < 0) { | ||
466 | if (status != -ENOSPC) | ||
467 | mlog_errno(status); | ||
468 | goto bail; | ||
469 | } | ||
470 | |||
471 | status = 0; | ||
472 | bail: | ||
473 | if ((status < 0) && *ac) { | ||
474 | ocfs2_free_alloc_context(*ac); | ||
475 | *ac = NULL; | ||
476 | } | ||
477 | |||
478 | if (alloc_inode) | ||
479 | iput(alloc_inode); | ||
480 | |||
481 | mlog_exit(status); | ||
482 | return status; | ||
483 | } | ||
484 | |||
485 | int ocfs2_reserve_new_inode(struct ocfs2_super *osb, | ||
486 | struct ocfs2_journal_handle *handle, | ||
487 | struct ocfs2_alloc_context **ac) | ||
488 | { | ||
489 | int status; | ||
490 | struct inode *alloc_inode = NULL; | ||
491 | |||
492 | *ac = kcalloc(1, sizeof(struct ocfs2_alloc_context), GFP_KERNEL); | ||
493 | if (!(*ac)) { | ||
494 | status = -ENOMEM; | ||
495 | mlog_errno(status); | ||
496 | goto bail; | ||
497 | } | ||
498 | |||
499 | (*ac)->ac_bits_wanted = 1; | ||
500 | (*ac)->ac_handle = handle; | ||
501 | (*ac)->ac_which = OCFS2_AC_USE_INODE; | ||
502 | |||
503 | alloc_inode = ocfs2_get_system_file_inode(osb, | ||
504 | INODE_ALLOC_SYSTEM_INODE, | ||
505 | osb->slot_num); | ||
506 | if (!alloc_inode) { | ||
507 | status = -ENOMEM; | ||
508 | mlog_errno(status); | ||
509 | goto bail; | ||
510 | } | ||
511 | |||
512 | (*ac)->ac_inode = igrab(alloc_inode); | ||
513 | (*ac)->ac_group_search = ocfs2_block_group_search; | ||
514 | |||
515 | status = ocfs2_reserve_suballoc_bits(osb, *ac); | ||
516 | if (status < 0) { | ||
517 | if (status != -ENOSPC) | ||
518 | mlog_errno(status); | ||
519 | goto bail; | ||
520 | } | ||
521 | |||
522 | status = 0; | ||
523 | bail: | ||
524 | if ((status < 0) && *ac) { | ||
525 | ocfs2_free_alloc_context(*ac); | ||
526 | *ac = NULL; | ||
527 | } | ||
528 | |||
529 | if (alloc_inode) | ||
530 | iput(alloc_inode); | ||
531 | |||
532 | mlog_exit(status); | ||
533 | return status; | ||
534 | } | ||
535 | |||
536 | /* local alloc code has to do the same thing, so rather than do this | ||
537 | * twice.. */ | ||
538 | int ocfs2_reserve_cluster_bitmap_bits(struct ocfs2_super *osb, | ||
539 | struct ocfs2_alloc_context *ac) | ||
540 | { | ||
541 | int status; | ||
542 | |||
543 | ac->ac_inode = ocfs2_get_system_file_inode(osb, | ||
544 | GLOBAL_BITMAP_SYSTEM_INODE, | ||
545 | OCFS2_INVALID_SLOT); | ||
546 | if (!ac->ac_inode) { | ||
547 | status = -EINVAL; | ||
548 | mlog(ML_ERROR, "Could not get bitmap inode!\n"); | ||
549 | goto bail; | ||
550 | } | ||
551 | ac->ac_which = OCFS2_AC_USE_MAIN; | ||
552 | ac->ac_group_search = ocfs2_cluster_group_search; | ||
553 | |||
554 | status = ocfs2_reserve_suballoc_bits(osb, ac); | ||
555 | if (status < 0 && status != -ENOSPC) | ||
556 | mlog_errno(status); | ||
557 | bail: | ||
558 | return status; | ||
559 | } | ||
560 | |||
561 | /* Callers don't need to care which bitmap (local alloc or main) to | ||
562 | * use so we figure it out for them, but unfortunately this clutters | ||
563 | * things a bit. */ | ||
564 | int ocfs2_reserve_clusters(struct ocfs2_super *osb, | ||
565 | struct ocfs2_journal_handle *handle, | ||
566 | u32 bits_wanted, | ||
567 | struct ocfs2_alloc_context **ac) | ||
568 | { | ||
569 | int status; | ||
570 | |||
571 | mlog_entry_void(); | ||
572 | |||
573 | BUG_ON(!handle); | ||
574 | |||
575 | *ac = kcalloc(1, sizeof(struct ocfs2_alloc_context), GFP_KERNEL); | ||
576 | if (!(*ac)) { | ||
577 | status = -ENOMEM; | ||
578 | mlog_errno(status); | ||
579 | goto bail; | ||
580 | } | ||
581 | |||
582 | (*ac)->ac_bits_wanted = bits_wanted; | ||
583 | (*ac)->ac_handle = handle; | ||
584 | |||
585 | status = -ENOSPC; | ||
586 | if (ocfs2_alloc_should_use_local(osb, bits_wanted)) { | ||
587 | status = ocfs2_reserve_local_alloc_bits(osb, | ||
588 | handle, | ||
589 | bits_wanted, | ||
590 | *ac); | ||
591 | if ((status < 0) && (status != -ENOSPC)) { | ||
592 | mlog_errno(status); | ||
593 | goto bail; | ||
594 | } else if (status == -ENOSPC) { | ||
595 | /* reserve_local_bits will return enospc with | ||
596 | * the local alloc inode still locked, so we | ||
597 | * can change this safely here. */ | ||
598 | mlog(0, "Disabling local alloc\n"); | ||
599 | /* We set to OCFS2_LA_DISABLED so that umount | ||
600 | * can clean up what's left of the local | ||
601 | * allocation */ | ||
602 | osb->local_alloc_state = OCFS2_LA_DISABLED; | ||
603 | } | ||
604 | } | ||
605 | |||
606 | if (status == -ENOSPC) { | ||
607 | status = ocfs2_reserve_cluster_bitmap_bits(osb, *ac); | ||
608 | if (status < 0) { | ||
609 | if (status != -ENOSPC) | ||
610 | mlog_errno(status); | ||
611 | goto bail; | ||
612 | } | ||
613 | } | ||
614 | |||
615 | status = 0; | ||
616 | bail: | ||
617 | if ((status < 0) && *ac) { | ||
618 | ocfs2_free_alloc_context(*ac); | ||
619 | *ac = NULL; | ||
620 | } | ||
621 | |||
622 | mlog_exit(status); | ||
623 | return status; | ||
624 | } | ||
625 | |||
626 | /* | ||
627 | * More or less lifted from ext3. I'll leave their description below: | ||
628 | * | ||
629 | * "For ext3 allocations, we must not reuse any blocks which are | ||
630 | * allocated in the bitmap buffer's "last committed data" copy. This | ||
631 | * prevents deletes from freeing up the page for reuse until we have | ||
632 | * committed the delete transaction. | ||
633 | * | ||
634 | * If we didn't do this, then deleting something and reallocating it as | ||
635 | * data would allow the old block to be overwritten before the | ||
636 | * transaction committed (because we force data to disk before commit). | ||
637 | * This would lead to corruption if we crashed between overwriting the | ||
638 | * data and committing the delete. | ||
639 | * | ||
640 | * @@@ We may want to make this allocation behaviour conditional on | ||
641 | * data-writes at some point, and disable it for metadata allocations or | ||
642 | * sync-data inodes." | ||
643 | * | ||
644 | * Note: OCFS2 already does this differently for metadata vs data | ||
645 | * allocations, as those bitmaps are seperate and undo access is never | ||
646 | * called on a metadata group descriptor. | ||
647 | */ | ||
648 | static int ocfs2_test_bg_bit_allocatable(struct buffer_head *bg_bh, | ||
649 | int nr) | ||
650 | { | ||
651 | struct ocfs2_group_desc *bg = (struct ocfs2_group_desc *) bg_bh->b_data; | ||
652 | |||
653 | if (ocfs2_test_bit(nr, (unsigned long *)bg->bg_bitmap)) | ||
654 | return 0; | ||
655 | if (!buffer_jbd(bg_bh) || !bh2jh(bg_bh)->b_committed_data) | ||
656 | return 1; | ||
657 | |||
658 | bg = (struct ocfs2_group_desc *) bh2jh(bg_bh)->b_committed_data; | ||
659 | return !ocfs2_test_bit(nr, (unsigned long *)bg->bg_bitmap); | ||
660 | } | ||
661 | |||
662 | static int ocfs2_block_group_find_clear_bits(struct ocfs2_super *osb, | ||
663 | struct buffer_head *bg_bh, | ||
664 | unsigned int bits_wanted, | ||
665 | u16 *bit_off, | ||
666 | u16 *bits_found) | ||
667 | { | ||
668 | void *bitmap; | ||
669 | u16 best_offset, best_size; | ||
670 | int offset, start, found, status = 0; | ||
671 | struct ocfs2_group_desc *bg = (struct ocfs2_group_desc *) bg_bh->b_data; | ||
672 | |||
673 | if (!OCFS2_IS_VALID_GROUP_DESC(bg)) { | ||
674 | OCFS2_RO_ON_INVALID_GROUP_DESC(osb->sb, bg); | ||
675 | return -EIO; | ||
676 | } | ||
677 | |||
678 | found = start = best_offset = best_size = 0; | ||
679 | bitmap = bg->bg_bitmap; | ||
680 | |||
681 | while((offset = ocfs2_find_next_zero_bit(bitmap, | ||
682 | le16_to_cpu(bg->bg_bits), | ||
683 | start)) != -1) { | ||
684 | if (offset == le16_to_cpu(bg->bg_bits)) | ||
685 | break; | ||
686 | |||
687 | if (!ocfs2_test_bg_bit_allocatable(bg_bh, offset)) { | ||
688 | /* We found a zero, but we can't use it as it | ||
689 | * hasn't been put to disk yet! */ | ||
690 | found = 0; | ||
691 | start = offset + 1; | ||
692 | } else if (offset == start) { | ||
693 | /* we found a zero */ | ||
694 | found++; | ||
695 | /* move start to the next bit to test */ | ||
696 | start++; | ||
697 | } else { | ||
698 | /* got a zero after some ones */ | ||
699 | found = 1; | ||
700 | start = offset + 1; | ||
701 | } | ||
702 | if (found > best_size) { | ||
703 | best_size = found; | ||
704 | best_offset = start - found; | ||
705 | } | ||
706 | /* we got everything we needed */ | ||
707 | if (found == bits_wanted) { | ||
708 | /* mlog(0, "Found it all!\n"); */ | ||
709 | break; | ||
710 | } | ||
711 | } | ||
712 | |||
713 | /* XXX: I think the first clause is equivalent to the second | ||
714 | * - jlbec */ | ||
715 | if (found == bits_wanted) { | ||
716 | *bit_off = start - found; | ||
717 | *bits_found = found; | ||
718 | } else if (best_size) { | ||
719 | *bit_off = best_offset; | ||
720 | *bits_found = best_size; | ||
721 | } else { | ||
722 | status = -ENOSPC; | ||
723 | /* No error log here -- see the comment above | ||
724 | * ocfs2_test_bg_bit_allocatable */ | ||
725 | } | ||
726 | |||
727 | return status; | ||
728 | } | ||
729 | |||
730 | static inline int ocfs2_block_group_set_bits(struct ocfs2_journal_handle *handle, | ||
731 | struct inode *alloc_inode, | ||
732 | struct ocfs2_group_desc *bg, | ||
733 | struct buffer_head *group_bh, | ||
734 | unsigned int bit_off, | ||
735 | unsigned int num_bits) | ||
736 | { | ||
737 | int status; | ||
738 | void *bitmap = bg->bg_bitmap; | ||
739 | int journal_type = OCFS2_JOURNAL_ACCESS_WRITE; | ||
740 | |||
741 | mlog_entry_void(); | ||
742 | |||
743 | if (!OCFS2_IS_VALID_GROUP_DESC(bg)) { | ||
744 | OCFS2_RO_ON_INVALID_GROUP_DESC(alloc_inode->i_sb, bg); | ||
745 | status = -EIO; | ||
746 | goto bail; | ||
747 | } | ||
748 | BUG_ON(le16_to_cpu(bg->bg_free_bits_count) < num_bits); | ||
749 | |||
750 | mlog(0, "block_group_set_bits: off = %u, num = %u\n", bit_off, | ||
751 | num_bits); | ||
752 | |||
753 | if (ocfs2_is_cluster_bitmap(alloc_inode)) | ||
754 | journal_type = OCFS2_JOURNAL_ACCESS_UNDO; | ||
755 | |||
756 | status = ocfs2_journal_access(handle, | ||
757 | alloc_inode, | ||
758 | group_bh, | ||
759 | journal_type); | ||
760 | if (status < 0) { | ||
761 | mlog_errno(status); | ||
762 | goto bail; | ||
763 | } | ||
764 | |||
765 | le16_add_cpu(&bg->bg_free_bits_count, -num_bits); | ||
766 | |||
767 | while(num_bits--) | ||
768 | ocfs2_set_bit(bit_off++, bitmap); | ||
769 | |||
770 | status = ocfs2_journal_dirty(handle, | ||
771 | group_bh); | ||
772 | if (status < 0) { | ||
773 | mlog_errno(status); | ||
774 | goto bail; | ||
775 | } | ||
776 | |||
777 | bail: | ||
778 | mlog_exit(status); | ||
779 | return status; | ||
780 | } | ||
781 | |||
782 | /* find the one with the most empty bits */ | ||
783 | static inline u16 ocfs2_find_victim_chain(struct ocfs2_chain_list *cl) | ||
784 | { | ||
785 | u16 curr, best; | ||
786 | |||
787 | BUG_ON(!cl->cl_next_free_rec); | ||
788 | |||
789 | best = curr = 0; | ||
790 | while (curr < le16_to_cpu(cl->cl_next_free_rec)) { | ||
791 | if (le32_to_cpu(cl->cl_recs[curr].c_free) > | ||
792 | le32_to_cpu(cl->cl_recs[best].c_free)) | ||
793 | best = curr; | ||
794 | curr++; | ||
795 | } | ||
796 | |||
797 | BUG_ON(best >= le16_to_cpu(cl->cl_next_free_rec)); | ||
798 | return best; | ||
799 | } | ||
800 | |||
801 | static int ocfs2_relink_block_group(struct ocfs2_journal_handle *handle, | ||
802 | struct inode *alloc_inode, | ||
803 | struct buffer_head *fe_bh, | ||
804 | struct buffer_head *bg_bh, | ||
805 | struct buffer_head *prev_bg_bh, | ||
806 | u16 chain) | ||
807 | { | ||
808 | int status; | ||
809 | /* there is a really tiny chance the journal calls could fail, | ||
810 | * but we wouldn't want inconsistent blocks in *any* case. */ | ||
811 | u64 fe_ptr, bg_ptr, prev_bg_ptr; | ||
812 | struct ocfs2_dinode *fe = (struct ocfs2_dinode *) fe_bh->b_data; | ||
813 | struct ocfs2_group_desc *bg = (struct ocfs2_group_desc *) bg_bh->b_data; | ||
814 | struct ocfs2_group_desc *prev_bg = (struct ocfs2_group_desc *) prev_bg_bh->b_data; | ||
815 | |||
816 | if (!OCFS2_IS_VALID_DINODE(fe)) { | ||
817 | OCFS2_RO_ON_INVALID_DINODE(alloc_inode->i_sb, fe); | ||
818 | status = -EIO; | ||
819 | goto out; | ||
820 | } | ||
821 | if (!OCFS2_IS_VALID_GROUP_DESC(bg)) { | ||
822 | OCFS2_RO_ON_INVALID_GROUP_DESC(alloc_inode->i_sb, bg); | ||
823 | status = -EIO; | ||
824 | goto out; | ||
825 | } | ||
826 | if (!OCFS2_IS_VALID_GROUP_DESC(prev_bg)) { | ||
827 | OCFS2_RO_ON_INVALID_GROUP_DESC(alloc_inode->i_sb, prev_bg); | ||
828 | status = -EIO; | ||
829 | goto out; | ||
830 | } | ||
831 | |||
832 | mlog(0, "In suballoc %"MLFu64", chain %u, move group %"MLFu64" to " | ||
833 | "top, prev = %"MLFu64"\n", | ||
834 | fe->i_blkno, chain, bg->bg_blkno, prev_bg->bg_blkno); | ||
835 | |||
836 | fe_ptr = le64_to_cpu(fe->id2.i_chain.cl_recs[chain].c_blkno); | ||
837 | bg_ptr = le64_to_cpu(bg->bg_next_group); | ||
838 | prev_bg_ptr = le64_to_cpu(prev_bg->bg_next_group); | ||
839 | |||
840 | status = ocfs2_journal_access(handle, alloc_inode, prev_bg_bh, | ||
841 | OCFS2_JOURNAL_ACCESS_WRITE); | ||
842 | if (status < 0) { | ||
843 | mlog_errno(status); | ||
844 | goto out_rollback; | ||
845 | } | ||
846 | |||
847 | prev_bg->bg_next_group = bg->bg_next_group; | ||
848 | |||
849 | status = ocfs2_journal_dirty(handle, prev_bg_bh); | ||
850 | if (status < 0) { | ||
851 | mlog_errno(status); | ||
852 | goto out_rollback; | ||
853 | } | ||
854 | |||
855 | status = ocfs2_journal_access(handle, alloc_inode, bg_bh, | ||
856 | OCFS2_JOURNAL_ACCESS_WRITE); | ||
857 | if (status < 0) { | ||
858 | mlog_errno(status); | ||
859 | goto out_rollback; | ||
860 | } | ||
861 | |||
862 | bg->bg_next_group = fe->id2.i_chain.cl_recs[chain].c_blkno; | ||
863 | |||
864 | status = ocfs2_journal_dirty(handle, bg_bh); | ||
865 | if (status < 0) { | ||
866 | mlog_errno(status); | ||
867 | goto out_rollback; | ||
868 | } | ||
869 | |||
870 | status = ocfs2_journal_access(handle, alloc_inode, fe_bh, | ||
871 | OCFS2_JOURNAL_ACCESS_WRITE); | ||
872 | if (status < 0) { | ||
873 | mlog_errno(status); | ||
874 | goto out_rollback; | ||
875 | } | ||
876 | |||
877 | fe->id2.i_chain.cl_recs[chain].c_blkno = bg->bg_blkno; | ||
878 | |||
879 | status = ocfs2_journal_dirty(handle, fe_bh); | ||
880 | if (status < 0) { | ||
881 | mlog_errno(status); | ||
882 | goto out_rollback; | ||
883 | } | ||
884 | |||
885 | status = 0; | ||
886 | out_rollback: | ||
887 | if (status < 0) { | ||
888 | fe->id2.i_chain.cl_recs[chain].c_blkno = cpu_to_le64(fe_ptr); | ||
889 | bg->bg_next_group = cpu_to_le64(bg_ptr); | ||
890 | prev_bg->bg_next_group = cpu_to_le64(prev_bg_ptr); | ||
891 | } | ||
892 | out: | ||
893 | mlog_exit(status); | ||
894 | return status; | ||
895 | } | ||
896 | |||
897 | static inline int ocfs2_block_group_reasonably_empty(struct ocfs2_group_desc *bg, | ||
898 | u32 wanted) | ||
899 | { | ||
900 | return le16_to_cpu(bg->bg_free_bits_count) > wanted; | ||
901 | } | ||
902 | |||
903 | /* return 0 on success, -ENOSPC to keep searching and any other < 0 | ||
904 | * value on error. */ | ||
905 | static int ocfs2_cluster_group_search(struct inode *inode, | ||
906 | struct buffer_head *group_bh, | ||
907 | u32 bits_wanted, u32 min_bits, | ||
908 | u16 *bit_off, u16 *bits_found) | ||
909 | { | ||
910 | int search = -ENOSPC; | ||
911 | int ret; | ||
912 | struct ocfs2_group_desc *bg = (struct ocfs2_group_desc *) group_bh->b_data; | ||
913 | u16 tmp_off, tmp_found; | ||
914 | |||
915 | BUG_ON(!ocfs2_is_cluster_bitmap(inode)); | ||
916 | |||
917 | if (bg->bg_free_bits_count) { | ||
918 | ret = ocfs2_block_group_find_clear_bits(OCFS2_SB(inode->i_sb), | ||
919 | group_bh, bits_wanted, | ||
920 | &tmp_off, &tmp_found); | ||
921 | if (ret) | ||
922 | return ret; | ||
923 | |||
924 | /* ocfs2_block_group_find_clear_bits() might | ||
925 | * return success, but we still want to return | ||
926 | * -ENOSPC unless it found the minimum number | ||
927 | * of bits. */ | ||
928 | if (min_bits <= tmp_found) { | ||
929 | *bit_off = tmp_off; | ||
930 | *bits_found = tmp_found; | ||
931 | search = 0; /* success */ | ||
932 | } | ||
933 | } | ||
934 | |||
935 | return search; | ||
936 | } | ||
937 | |||
938 | static int ocfs2_block_group_search(struct inode *inode, | ||
939 | struct buffer_head *group_bh, | ||
940 | u32 bits_wanted, u32 min_bits, | ||
941 | u16 *bit_off, u16 *bits_found) | ||
942 | { | ||
943 | int ret = -ENOSPC; | ||
944 | struct ocfs2_group_desc *bg = (struct ocfs2_group_desc *) group_bh->b_data; | ||
945 | |||
946 | BUG_ON(min_bits != 1); | ||
947 | BUG_ON(ocfs2_is_cluster_bitmap(inode)); | ||
948 | |||
949 | if (bg->bg_free_bits_count) | ||
950 | ret = ocfs2_block_group_find_clear_bits(OCFS2_SB(inode->i_sb), | ||
951 | group_bh, bits_wanted, | ||
952 | bit_off, bits_found); | ||
953 | |||
954 | return ret; | ||
955 | } | ||
956 | |||
957 | static int ocfs2_search_chain(struct ocfs2_alloc_context *ac, | ||
958 | u32 bits_wanted, | ||
959 | u32 min_bits, | ||
960 | u16 *bit_off, | ||
961 | unsigned int *num_bits, | ||
962 | u64 *bg_blkno) | ||
963 | { | ||
964 | int status; | ||
965 | u16 chain, tmp_bits; | ||
966 | u32 tmp_used; | ||
967 | u64 next_group; | ||
968 | struct ocfs2_journal_handle *handle = ac->ac_handle; | ||
969 | struct inode *alloc_inode = ac->ac_inode; | ||
970 | struct buffer_head *group_bh = NULL; | ||
971 | struct buffer_head *prev_group_bh = NULL; | ||
972 | struct ocfs2_dinode *fe = (struct ocfs2_dinode *) ac->ac_bh->b_data; | ||
973 | struct ocfs2_chain_list *cl = (struct ocfs2_chain_list *) &fe->id2.i_chain; | ||
974 | struct ocfs2_group_desc *bg; | ||
975 | |||
976 | chain = ac->ac_chain; | ||
977 | mlog(0, "trying to alloc %u bits from chain %u, inode %"MLFu64"\n", | ||
978 | bits_wanted, chain, OCFS2_I(alloc_inode)->ip_blkno); | ||
979 | |||
980 | status = ocfs2_read_block(OCFS2_SB(alloc_inode->i_sb), | ||
981 | le64_to_cpu(cl->cl_recs[chain].c_blkno), | ||
982 | &group_bh, OCFS2_BH_CACHED, alloc_inode); | ||
983 | if (status < 0) { | ||
984 | mlog_errno(status); | ||
985 | goto bail; | ||
986 | } | ||
987 | bg = (struct ocfs2_group_desc *) group_bh->b_data; | ||
988 | if (!OCFS2_IS_VALID_GROUP_DESC(bg)) { | ||
989 | OCFS2_RO_ON_INVALID_GROUP_DESC(alloc_inode->i_sb, bg); | ||
990 | status = -EIO; | ||
991 | goto bail; | ||
992 | } | ||
993 | |||
994 | status = -ENOSPC; | ||
995 | /* for now, the chain search is a bit simplistic. We just use | ||
996 | * the 1st group with any empty bits. */ | ||
997 | while ((status = ac->ac_group_search(alloc_inode, group_bh, | ||
998 | bits_wanted, min_bits, bit_off, | ||
999 | &tmp_bits)) == -ENOSPC) { | ||
1000 | if (!bg->bg_next_group) | ||
1001 | break; | ||
1002 | |||
1003 | if (prev_group_bh) { | ||
1004 | brelse(prev_group_bh); | ||
1005 | prev_group_bh = NULL; | ||
1006 | } | ||
1007 | next_group = le64_to_cpu(bg->bg_next_group); | ||
1008 | prev_group_bh = group_bh; | ||
1009 | group_bh = NULL; | ||
1010 | status = ocfs2_read_block(OCFS2_SB(alloc_inode->i_sb), | ||
1011 | next_group, &group_bh, | ||
1012 | OCFS2_BH_CACHED, alloc_inode); | ||
1013 | if (status < 0) { | ||
1014 | mlog_errno(status); | ||
1015 | goto bail; | ||
1016 | } | ||
1017 | bg = (struct ocfs2_group_desc *) group_bh->b_data; | ||
1018 | if (!OCFS2_IS_VALID_GROUP_DESC(bg)) { | ||
1019 | OCFS2_RO_ON_INVALID_GROUP_DESC(alloc_inode->i_sb, bg); | ||
1020 | status = -EIO; | ||
1021 | goto bail; | ||
1022 | } | ||
1023 | } | ||
1024 | if (status < 0) { | ||
1025 | if (status != -ENOSPC) | ||
1026 | mlog_errno(status); | ||
1027 | goto bail; | ||
1028 | } | ||
1029 | |||
1030 | mlog(0, "alloc succeeds: we give %u bits from block group %"MLFu64"\n", | ||
1031 | tmp_bits, bg->bg_blkno); | ||
1032 | |||
1033 | *num_bits = tmp_bits; | ||
1034 | |||
1035 | BUG_ON(*num_bits == 0); | ||
1036 | |||
1037 | /* | ||
1038 | * Keep track of previous block descriptor read. When | ||
1039 | * we find a target, if we have read more than X | ||
1040 | * number of descriptors, and the target is reasonably | ||
1041 | * empty, relink him to top of his chain. | ||
1042 | * | ||
1043 | * We've read 0 extra blocks and only send one more to | ||
1044 | * the transaction, yet the next guy to search has a | ||
1045 | * much easier time. | ||
1046 | * | ||
1047 | * Do this *after* figuring out how many bits we're taking out | ||
1048 | * of our target group. | ||
1049 | */ | ||
1050 | if (ac->ac_allow_chain_relink && | ||
1051 | (prev_group_bh) && | ||
1052 | (ocfs2_block_group_reasonably_empty(bg, *num_bits))) { | ||
1053 | status = ocfs2_relink_block_group(handle, alloc_inode, | ||
1054 | ac->ac_bh, group_bh, | ||
1055 | prev_group_bh, chain); | ||
1056 | if (status < 0) { | ||
1057 | mlog_errno(status); | ||
1058 | goto bail; | ||
1059 | } | ||
1060 | } | ||
1061 | |||
1062 | /* Ok, claim our bits now: set the info on dinode, chainlist | ||
1063 | * and then the group */ | ||
1064 | status = ocfs2_journal_access(handle, | ||
1065 | alloc_inode, | ||
1066 | ac->ac_bh, | ||
1067 | OCFS2_JOURNAL_ACCESS_WRITE); | ||
1068 | if (status < 0) { | ||
1069 | mlog_errno(status); | ||
1070 | goto bail; | ||
1071 | } | ||
1072 | |||
1073 | tmp_used = le32_to_cpu(fe->id1.bitmap1.i_used); | ||
1074 | fe->id1.bitmap1.i_used = cpu_to_le32(*num_bits + tmp_used); | ||
1075 | le32_add_cpu(&cl->cl_recs[chain].c_free, -(*num_bits)); | ||
1076 | |||
1077 | status = ocfs2_journal_dirty(handle, | ||
1078 | ac->ac_bh); | ||
1079 | if (status < 0) { | ||
1080 | mlog_errno(status); | ||
1081 | goto bail; | ||
1082 | } | ||
1083 | |||
1084 | status = ocfs2_block_group_set_bits(handle, | ||
1085 | alloc_inode, | ||
1086 | bg, | ||
1087 | group_bh, | ||
1088 | *bit_off, | ||
1089 | *num_bits); | ||
1090 | if (status < 0) { | ||
1091 | mlog_errno(status); | ||
1092 | goto bail; | ||
1093 | } | ||
1094 | |||
1095 | mlog(0, "Allocated %u bits from suballocator %"MLFu64"\n", | ||
1096 | *num_bits, fe->i_blkno); | ||
1097 | |||
1098 | *bg_blkno = le64_to_cpu(bg->bg_blkno); | ||
1099 | bail: | ||
1100 | if (group_bh) | ||
1101 | brelse(group_bh); | ||
1102 | if (prev_group_bh) | ||
1103 | brelse(prev_group_bh); | ||
1104 | |||
1105 | mlog_exit(status); | ||
1106 | return status; | ||
1107 | } | ||
1108 | |||
1109 | /* will give out up to bits_wanted contiguous bits. */ | ||
1110 | static int ocfs2_claim_suballoc_bits(struct ocfs2_super *osb, | ||
1111 | struct ocfs2_alloc_context *ac, | ||
1112 | u32 bits_wanted, | ||
1113 | u32 min_bits, | ||
1114 | u16 *bit_off, | ||
1115 | unsigned int *num_bits, | ||
1116 | u64 *bg_blkno) | ||
1117 | { | ||
1118 | int status; | ||
1119 | u16 victim, i; | ||
1120 | struct ocfs2_chain_list *cl; | ||
1121 | struct ocfs2_dinode *fe; | ||
1122 | |||
1123 | mlog_entry_void(); | ||
1124 | |||
1125 | BUG_ON(ac->ac_bits_given >= ac->ac_bits_wanted); | ||
1126 | BUG_ON(bits_wanted > (ac->ac_bits_wanted - ac->ac_bits_given)); | ||
1127 | BUG_ON(!ac->ac_bh); | ||
1128 | |||
1129 | fe = (struct ocfs2_dinode *) ac->ac_bh->b_data; | ||
1130 | if (!OCFS2_IS_VALID_DINODE(fe)) { | ||
1131 | OCFS2_RO_ON_INVALID_DINODE(osb->sb, fe); | ||
1132 | status = -EIO; | ||
1133 | goto bail; | ||
1134 | } | ||
1135 | if (le32_to_cpu(fe->id1.bitmap1.i_used) >= | ||
1136 | le32_to_cpu(fe->id1.bitmap1.i_total)) { | ||
1137 | ocfs2_error(osb->sb, "Chain allocator dinode %"MLFu64" has %u" | ||
1138 | "used bits but only %u total.", | ||
1139 | le64_to_cpu(fe->i_blkno), | ||
1140 | le32_to_cpu(fe->id1.bitmap1.i_used), | ||
1141 | le32_to_cpu(fe->id1.bitmap1.i_total)); | ||
1142 | status = -EIO; | ||
1143 | goto bail; | ||
1144 | } | ||
1145 | |||
1146 | cl = (struct ocfs2_chain_list *) &fe->id2.i_chain; | ||
1147 | |||
1148 | victim = ocfs2_find_victim_chain(cl); | ||
1149 | ac->ac_chain = victim; | ||
1150 | ac->ac_allow_chain_relink = 1; | ||
1151 | |||
1152 | status = ocfs2_search_chain(ac, bits_wanted, min_bits, bit_off, | ||
1153 | num_bits, bg_blkno); | ||
1154 | if (!status) | ||
1155 | goto bail; | ||
1156 | if (status < 0 && status != -ENOSPC) { | ||
1157 | mlog_errno(status); | ||
1158 | goto bail; | ||
1159 | } | ||
1160 | |||
1161 | mlog(0, "Search of victim chain %u came up with nothing, " | ||
1162 | "trying all chains now.\n", victim); | ||
1163 | |||
1164 | /* If we didn't pick a good victim, then just default to | ||
1165 | * searching each chain in order. Don't allow chain relinking | ||
1166 | * because we only calculate enough journal credits for one | ||
1167 | * relink per alloc. */ | ||
1168 | ac->ac_allow_chain_relink = 0; | ||
1169 | for (i = 0; i < le16_to_cpu(cl->cl_next_free_rec); i ++) { | ||
1170 | if (i == victim) | ||
1171 | continue; | ||
1172 | if (!cl->cl_recs[i].c_free) | ||
1173 | continue; | ||
1174 | |||
1175 | ac->ac_chain = i; | ||
1176 | status = ocfs2_search_chain(ac, bits_wanted, min_bits, | ||
1177 | bit_off, num_bits, | ||
1178 | bg_blkno); | ||
1179 | if (!status) | ||
1180 | break; | ||
1181 | if (status < 0 && status != -ENOSPC) { | ||
1182 | mlog_errno(status); | ||
1183 | goto bail; | ||
1184 | } | ||
1185 | } | ||
1186 | bail: | ||
1187 | |||
1188 | mlog_exit(status); | ||
1189 | return status; | ||
1190 | } | ||
1191 | |||
1192 | int ocfs2_claim_metadata(struct ocfs2_super *osb, | ||
1193 | struct ocfs2_journal_handle *handle, | ||
1194 | struct ocfs2_alloc_context *ac, | ||
1195 | u32 bits_wanted, | ||
1196 | u16 *suballoc_bit_start, | ||
1197 | unsigned int *num_bits, | ||
1198 | u64 *blkno_start) | ||
1199 | { | ||
1200 | int status; | ||
1201 | u64 bg_blkno; | ||
1202 | |||
1203 | BUG_ON(!ac); | ||
1204 | BUG_ON(ac->ac_bits_wanted < (ac->ac_bits_given + bits_wanted)); | ||
1205 | BUG_ON(ac->ac_which != OCFS2_AC_USE_META); | ||
1206 | BUG_ON(ac->ac_handle != handle); | ||
1207 | |||
1208 | status = ocfs2_claim_suballoc_bits(osb, | ||
1209 | ac, | ||
1210 | bits_wanted, | ||
1211 | 1, | ||
1212 | suballoc_bit_start, | ||
1213 | num_bits, | ||
1214 | &bg_blkno); | ||
1215 | if (status < 0) { | ||
1216 | mlog_errno(status); | ||
1217 | goto bail; | ||
1218 | } | ||
1219 | atomic_inc(&osb->alloc_stats.bg_allocs); | ||
1220 | |||
1221 | *blkno_start = bg_blkno + (u64) *suballoc_bit_start; | ||
1222 | ac->ac_bits_given += (*num_bits); | ||
1223 | status = 0; | ||
1224 | bail: | ||
1225 | mlog_exit(status); | ||
1226 | return status; | ||
1227 | } | ||
1228 | |||
1229 | int ocfs2_claim_new_inode(struct ocfs2_super *osb, | ||
1230 | struct ocfs2_journal_handle *handle, | ||
1231 | struct ocfs2_alloc_context *ac, | ||
1232 | u16 *suballoc_bit, | ||
1233 | u64 *fe_blkno) | ||
1234 | { | ||
1235 | int status; | ||
1236 | unsigned int num_bits; | ||
1237 | u64 bg_blkno; | ||
1238 | |||
1239 | mlog_entry_void(); | ||
1240 | |||
1241 | BUG_ON(!ac); | ||
1242 | BUG_ON(ac->ac_bits_given != 0); | ||
1243 | BUG_ON(ac->ac_bits_wanted != 1); | ||
1244 | BUG_ON(ac->ac_which != OCFS2_AC_USE_INODE); | ||
1245 | BUG_ON(ac->ac_handle != handle); | ||
1246 | |||
1247 | status = ocfs2_claim_suballoc_bits(osb, | ||
1248 | ac, | ||
1249 | 1, | ||
1250 | 1, | ||
1251 | suballoc_bit, | ||
1252 | &num_bits, | ||
1253 | &bg_blkno); | ||
1254 | if (status < 0) { | ||
1255 | mlog_errno(status); | ||
1256 | goto bail; | ||
1257 | } | ||
1258 | atomic_inc(&osb->alloc_stats.bg_allocs); | ||
1259 | |||
1260 | BUG_ON(num_bits != 1); | ||
1261 | |||
1262 | *fe_blkno = bg_blkno + (u64) (*suballoc_bit); | ||
1263 | ac->ac_bits_given++; | ||
1264 | status = 0; | ||
1265 | bail: | ||
1266 | mlog_exit(status); | ||
1267 | return status; | ||
1268 | } | ||
1269 | |||
1270 | /* translate a group desc. blkno and it's bitmap offset into | ||
1271 | * disk cluster offset. */ | ||
1272 | static inline u32 ocfs2_desc_bitmap_to_cluster_off(struct inode *inode, | ||
1273 | u64 bg_blkno, | ||
1274 | u16 bg_bit_off) | ||
1275 | { | ||
1276 | struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); | ||
1277 | u32 cluster = 0; | ||
1278 | |||
1279 | BUG_ON(!ocfs2_is_cluster_bitmap(inode)); | ||
1280 | |||
1281 | if (bg_blkno != osb->first_cluster_group_blkno) | ||
1282 | cluster = ocfs2_blocks_to_clusters(inode->i_sb, bg_blkno); | ||
1283 | cluster += (u32) bg_bit_off; | ||
1284 | return cluster; | ||
1285 | } | ||
1286 | |||
1287 | /* given a cluster offset, calculate which block group it belongs to | ||
1288 | * and return that block offset. */ | ||
1289 | static inline u64 ocfs2_which_cluster_group(struct inode *inode, | ||
1290 | u32 cluster) | ||
1291 | { | ||
1292 | struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); | ||
1293 | u32 group_no; | ||
1294 | |||
1295 | BUG_ON(!ocfs2_is_cluster_bitmap(inode)); | ||
1296 | |||
1297 | group_no = cluster / osb->bitmap_cpg; | ||
1298 | if (!group_no) | ||
1299 | return osb->first_cluster_group_blkno; | ||
1300 | return ocfs2_clusters_to_blocks(inode->i_sb, | ||
1301 | group_no * osb->bitmap_cpg); | ||
1302 | } | ||
1303 | |||
1304 | /* given the block number of a cluster start, calculate which cluster | ||
1305 | * group and descriptor bitmap offset that corresponds to. */ | ||
1306 | static inline void ocfs2_block_to_cluster_group(struct inode *inode, | ||
1307 | u64 data_blkno, | ||
1308 | u64 *bg_blkno, | ||
1309 | u16 *bg_bit_off) | ||
1310 | { | ||
1311 | struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); | ||
1312 | u32 data_cluster = ocfs2_blocks_to_clusters(osb->sb, data_blkno); | ||
1313 | |||
1314 | BUG_ON(!ocfs2_is_cluster_bitmap(inode)); | ||
1315 | |||
1316 | *bg_blkno = ocfs2_which_cluster_group(inode, | ||
1317 | data_cluster); | ||
1318 | |||
1319 | if (*bg_blkno == osb->first_cluster_group_blkno) | ||
1320 | *bg_bit_off = (u16) data_cluster; | ||
1321 | else | ||
1322 | *bg_bit_off = (u16) ocfs2_blocks_to_clusters(osb->sb, | ||
1323 | data_blkno - *bg_blkno); | ||
1324 | } | ||
1325 | |||
1326 | /* | ||
1327 | * min_bits - minimum contiguous chunk from this total allocation we | ||
1328 | * can handle. set to what we asked for originally for a full | ||
1329 | * contig. allocation, set to '1' to indicate we can deal with extents | ||
1330 | * of any size. | ||
1331 | */ | ||
1332 | int ocfs2_claim_clusters(struct ocfs2_super *osb, | ||
1333 | struct ocfs2_journal_handle *handle, | ||
1334 | struct ocfs2_alloc_context *ac, | ||
1335 | u32 min_clusters, | ||
1336 | u32 *cluster_start, | ||
1337 | u32 *num_clusters) | ||
1338 | { | ||
1339 | int status; | ||
1340 | unsigned int bits_wanted = ac->ac_bits_wanted - ac->ac_bits_given; | ||
1341 | u64 bg_blkno; | ||
1342 | u16 bg_bit_off; | ||
1343 | |||
1344 | mlog_entry_void(); | ||
1345 | |||
1346 | BUG_ON(!ac); | ||
1347 | BUG_ON(ac->ac_bits_given >= ac->ac_bits_wanted); | ||
1348 | |||
1349 | BUG_ON(ac->ac_which != OCFS2_AC_USE_LOCAL | ||
1350 | && ac->ac_which != OCFS2_AC_USE_MAIN); | ||
1351 | BUG_ON(ac->ac_handle != handle); | ||
1352 | |||
1353 | if (ac->ac_which == OCFS2_AC_USE_LOCAL) { | ||
1354 | status = ocfs2_claim_local_alloc_bits(osb, | ||
1355 | handle, | ||
1356 | ac, | ||
1357 | bits_wanted, | ||
1358 | cluster_start, | ||
1359 | num_clusters); | ||
1360 | if (!status) | ||
1361 | atomic_inc(&osb->alloc_stats.local_data); | ||
1362 | } else { | ||
1363 | if (min_clusters > (osb->bitmap_cpg - 1)) { | ||
1364 | /* The only paths asking for contiguousness | ||
1365 | * should know about this already. */ | ||
1366 | mlog(ML_ERROR, "minimum allocation requested exceeds " | ||
1367 | "group bitmap size!"); | ||
1368 | status = -ENOSPC; | ||
1369 | goto bail; | ||
1370 | } | ||
1371 | /* clamp the current request down to a realistic size. */ | ||
1372 | if (bits_wanted > (osb->bitmap_cpg - 1)) | ||
1373 | bits_wanted = osb->bitmap_cpg - 1; | ||
1374 | |||
1375 | status = ocfs2_claim_suballoc_bits(osb, | ||
1376 | ac, | ||
1377 | bits_wanted, | ||
1378 | min_clusters, | ||
1379 | &bg_bit_off, | ||
1380 | num_clusters, | ||
1381 | &bg_blkno); | ||
1382 | if (!status) { | ||
1383 | *cluster_start = | ||
1384 | ocfs2_desc_bitmap_to_cluster_off(ac->ac_inode, | ||
1385 | bg_blkno, | ||
1386 | bg_bit_off); | ||
1387 | atomic_inc(&osb->alloc_stats.bitmap_data); | ||
1388 | } | ||
1389 | } | ||
1390 | if (status < 0) { | ||
1391 | if (status != -ENOSPC) | ||
1392 | mlog_errno(status); | ||
1393 | goto bail; | ||
1394 | } | ||
1395 | |||
1396 | ac->ac_bits_given += *num_clusters; | ||
1397 | |||
1398 | bail: | ||
1399 | mlog_exit(status); | ||
1400 | return status; | ||
1401 | } | ||
1402 | |||
1403 | static inline int ocfs2_block_group_clear_bits(struct ocfs2_journal_handle *handle, | ||
1404 | struct inode *alloc_inode, | ||
1405 | struct ocfs2_group_desc *bg, | ||
1406 | struct buffer_head *group_bh, | ||
1407 | unsigned int bit_off, | ||
1408 | unsigned int num_bits) | ||
1409 | { | ||
1410 | int status; | ||
1411 | unsigned int tmp; | ||
1412 | int journal_type = OCFS2_JOURNAL_ACCESS_WRITE; | ||
1413 | struct ocfs2_group_desc *undo_bg = NULL; | ||
1414 | |||
1415 | mlog_entry_void(); | ||
1416 | |||
1417 | if (!OCFS2_IS_VALID_GROUP_DESC(bg)) { | ||
1418 | OCFS2_RO_ON_INVALID_GROUP_DESC(alloc_inode->i_sb, bg); | ||
1419 | status = -EIO; | ||
1420 | goto bail; | ||
1421 | } | ||
1422 | |||
1423 | mlog(0, "off = %u, num = %u\n", bit_off, num_bits); | ||
1424 | |||
1425 | if (ocfs2_is_cluster_bitmap(alloc_inode)) | ||
1426 | journal_type = OCFS2_JOURNAL_ACCESS_UNDO; | ||
1427 | |||
1428 | status = ocfs2_journal_access(handle, alloc_inode, group_bh, | ||
1429 | journal_type); | ||
1430 | if (status < 0) { | ||
1431 | mlog_errno(status); | ||
1432 | goto bail; | ||
1433 | } | ||
1434 | |||
1435 | if (ocfs2_is_cluster_bitmap(alloc_inode)) | ||
1436 | undo_bg = (struct ocfs2_group_desc *) bh2jh(group_bh)->b_committed_data; | ||
1437 | |||
1438 | tmp = num_bits; | ||
1439 | while(tmp--) { | ||
1440 | ocfs2_clear_bit((bit_off + tmp), | ||
1441 | (unsigned long *) bg->bg_bitmap); | ||
1442 | if (ocfs2_is_cluster_bitmap(alloc_inode)) | ||
1443 | ocfs2_set_bit(bit_off + tmp, | ||
1444 | (unsigned long *) undo_bg->bg_bitmap); | ||
1445 | } | ||
1446 | le16_add_cpu(&bg->bg_free_bits_count, num_bits); | ||
1447 | |||
1448 | status = ocfs2_journal_dirty(handle, group_bh); | ||
1449 | if (status < 0) | ||
1450 | mlog_errno(status); | ||
1451 | bail: | ||
1452 | return status; | ||
1453 | } | ||
1454 | |||
1455 | /* | ||
1456 | * expects the suballoc inode to already be locked. | ||
1457 | */ | ||
1458 | static int ocfs2_free_suballoc_bits(struct ocfs2_journal_handle *handle, | ||
1459 | struct inode *alloc_inode, | ||
1460 | struct buffer_head *alloc_bh, | ||
1461 | unsigned int start_bit, | ||
1462 | u64 bg_blkno, | ||
1463 | unsigned int count) | ||
1464 | { | ||
1465 | int status = 0; | ||
1466 | u32 tmp_used; | ||
1467 | struct ocfs2_super *osb = OCFS2_SB(alloc_inode->i_sb); | ||
1468 | struct ocfs2_dinode *fe = (struct ocfs2_dinode *) alloc_bh->b_data; | ||
1469 | struct ocfs2_chain_list *cl = &fe->id2.i_chain; | ||
1470 | struct buffer_head *group_bh = NULL; | ||
1471 | struct ocfs2_group_desc *group; | ||
1472 | |||
1473 | mlog_entry_void(); | ||
1474 | |||
1475 | if (!OCFS2_IS_VALID_DINODE(fe)) { | ||
1476 | OCFS2_RO_ON_INVALID_DINODE(alloc_inode->i_sb, fe); | ||
1477 | status = -EIO; | ||
1478 | goto bail; | ||
1479 | } | ||
1480 | BUG_ON((count + start_bit) > ocfs2_bits_per_group(cl)); | ||
1481 | |||
1482 | mlog(0, "suballocator %"MLFu64": freeing %u bits from group %"MLFu64 | ||
1483 | ", starting at %u\n", | ||
1484 | OCFS2_I(alloc_inode)->ip_blkno, count, bg_blkno, | ||
1485 | start_bit); | ||
1486 | |||
1487 | status = ocfs2_read_block(osb, bg_blkno, &group_bh, OCFS2_BH_CACHED, | ||
1488 | alloc_inode); | ||
1489 | if (status < 0) { | ||
1490 | mlog_errno(status); | ||
1491 | goto bail; | ||
1492 | } | ||
1493 | |||
1494 | group = (struct ocfs2_group_desc *) group_bh->b_data; | ||
1495 | if (!OCFS2_IS_VALID_GROUP_DESC(group)) { | ||
1496 | OCFS2_RO_ON_INVALID_GROUP_DESC(alloc_inode->i_sb, group); | ||
1497 | status = -EIO; | ||
1498 | goto bail; | ||
1499 | } | ||
1500 | BUG_ON((count + start_bit) > le16_to_cpu(group->bg_bits)); | ||
1501 | |||
1502 | status = ocfs2_block_group_clear_bits(handle, alloc_inode, | ||
1503 | group, group_bh, | ||
1504 | start_bit, count); | ||
1505 | if (status < 0) { | ||
1506 | mlog_errno(status); | ||
1507 | goto bail; | ||
1508 | } | ||
1509 | |||
1510 | status = ocfs2_journal_access(handle, alloc_inode, alloc_bh, | ||
1511 | OCFS2_JOURNAL_ACCESS_WRITE); | ||
1512 | if (status < 0) { | ||
1513 | mlog_errno(status); | ||
1514 | goto bail; | ||
1515 | } | ||
1516 | |||
1517 | le32_add_cpu(&cl->cl_recs[le16_to_cpu(group->bg_chain)].c_free, | ||
1518 | count); | ||
1519 | tmp_used = le32_to_cpu(fe->id1.bitmap1.i_used); | ||
1520 | fe->id1.bitmap1.i_used = cpu_to_le32(tmp_used - count); | ||
1521 | |||
1522 | status = ocfs2_journal_dirty(handle, alloc_bh); | ||
1523 | if (status < 0) { | ||
1524 | mlog_errno(status); | ||
1525 | goto bail; | ||
1526 | } | ||
1527 | |||
1528 | bail: | ||
1529 | if (group_bh) | ||
1530 | brelse(group_bh); | ||
1531 | |||
1532 | mlog_exit(status); | ||
1533 | return status; | ||
1534 | } | ||
1535 | |||
1536 | static inline u64 ocfs2_which_suballoc_group(u64 block, unsigned int bit) | ||
1537 | { | ||
1538 | u64 group = block - (u64) bit; | ||
1539 | |||
1540 | return group; | ||
1541 | } | ||
1542 | |||
1543 | int ocfs2_free_dinode(struct ocfs2_journal_handle *handle, | ||
1544 | struct inode *inode_alloc_inode, | ||
1545 | struct buffer_head *inode_alloc_bh, | ||
1546 | struct ocfs2_dinode *di) | ||
1547 | { | ||
1548 | u64 blk = le64_to_cpu(di->i_blkno); | ||
1549 | u16 bit = le16_to_cpu(di->i_suballoc_bit); | ||
1550 | u64 bg_blkno = ocfs2_which_suballoc_group(blk, bit); | ||
1551 | |||
1552 | return ocfs2_free_suballoc_bits(handle, inode_alloc_inode, | ||
1553 | inode_alloc_bh, bit, bg_blkno, 1); | ||
1554 | } | ||
1555 | |||
1556 | int ocfs2_free_extent_block(struct ocfs2_journal_handle *handle, | ||
1557 | struct inode *eb_alloc_inode, | ||
1558 | struct buffer_head *eb_alloc_bh, | ||
1559 | struct ocfs2_extent_block *eb) | ||
1560 | { | ||
1561 | u64 blk = le64_to_cpu(eb->h_blkno); | ||
1562 | u16 bit = le16_to_cpu(eb->h_suballoc_bit); | ||
1563 | u64 bg_blkno = ocfs2_which_suballoc_group(blk, bit); | ||
1564 | |||
1565 | return ocfs2_free_suballoc_bits(handle, eb_alloc_inode, eb_alloc_bh, | ||
1566 | bit, bg_blkno, 1); | ||
1567 | } | ||
1568 | |||
1569 | int ocfs2_free_clusters(struct ocfs2_journal_handle *handle, | ||
1570 | struct inode *bitmap_inode, | ||
1571 | struct buffer_head *bitmap_bh, | ||
1572 | u64 start_blk, | ||
1573 | unsigned int num_clusters) | ||
1574 | { | ||
1575 | int status; | ||
1576 | u16 bg_start_bit; | ||
1577 | u64 bg_blkno; | ||
1578 | struct ocfs2_dinode *fe; | ||
1579 | |||
1580 | /* You can't ever have a contiguous set of clusters | ||
1581 | * bigger than a block group bitmap so we never have to worry | ||
1582 | * about looping on them. */ | ||
1583 | |||
1584 | mlog_entry_void(); | ||
1585 | |||
1586 | /* This is expensive. We can safely remove once this stuff has | ||
1587 | * gotten tested really well. */ | ||
1588 | BUG_ON(start_blk != ocfs2_clusters_to_blocks(bitmap_inode->i_sb, ocfs2_blocks_to_clusters(bitmap_inode->i_sb, start_blk))); | ||
1589 | |||
1590 | fe = (struct ocfs2_dinode *) bitmap_bh->b_data; | ||
1591 | |||
1592 | ocfs2_block_to_cluster_group(bitmap_inode, start_blk, &bg_blkno, | ||
1593 | &bg_start_bit); | ||
1594 | |||
1595 | mlog(0, "want to free %u clusters starting at block %"MLFu64"\n", | ||
1596 | num_clusters, start_blk); | ||
1597 | mlog(0, "bg_blkno = %"MLFu64", bg_start_bit = %u\n", | ||
1598 | bg_blkno, bg_start_bit); | ||
1599 | |||
1600 | status = ocfs2_free_suballoc_bits(handle, bitmap_inode, bitmap_bh, | ||
1601 | bg_start_bit, bg_blkno, | ||
1602 | num_clusters); | ||
1603 | if (status < 0) | ||
1604 | mlog_errno(status); | ||
1605 | |||
1606 | mlog_exit(status); | ||
1607 | return status; | ||
1608 | } | ||
1609 | |||
1610 | static inline void ocfs2_debug_bg(struct ocfs2_group_desc *bg) | ||
1611 | { | ||
1612 | printk("Block Group:\n"); | ||
1613 | printk("bg_signature: %s\n", bg->bg_signature); | ||
1614 | printk("bg_size: %u\n", bg->bg_size); | ||
1615 | printk("bg_bits: %u\n", bg->bg_bits); | ||
1616 | printk("bg_free_bits_count: %u\n", bg->bg_free_bits_count); | ||
1617 | printk("bg_chain: %u\n", bg->bg_chain); | ||
1618 | printk("bg_generation: %u\n", le32_to_cpu(bg->bg_generation)); | ||
1619 | printk("bg_next_group: %"MLFu64"\n", bg->bg_next_group); | ||
1620 | printk("bg_parent_dinode: %"MLFu64"\n", bg->bg_parent_dinode); | ||
1621 | printk("bg_blkno: %"MLFu64"\n", bg->bg_blkno); | ||
1622 | } | ||
1623 | |||
1624 | static inline void ocfs2_debug_suballoc_inode(struct ocfs2_dinode *fe) | ||
1625 | { | ||
1626 | int i; | ||
1627 | |||
1628 | printk("Suballoc Inode %"MLFu64":\n", fe->i_blkno); | ||
1629 | printk("i_signature: %s\n", fe->i_signature); | ||
1630 | printk("i_size: %"MLFu64"\n", fe->i_size); | ||
1631 | printk("i_clusters: %u\n", fe->i_clusters); | ||
1632 | printk("i_generation: %u\n", | ||
1633 | le32_to_cpu(fe->i_generation)); | ||
1634 | printk("id1.bitmap1.i_used: %u\n", | ||
1635 | le32_to_cpu(fe->id1.bitmap1.i_used)); | ||
1636 | printk("id1.bitmap1.i_total: %u\n", | ||
1637 | le32_to_cpu(fe->id1.bitmap1.i_total)); | ||
1638 | printk("id2.i_chain.cl_cpg: %u\n", fe->id2.i_chain.cl_cpg); | ||
1639 | printk("id2.i_chain.cl_bpc: %u\n", fe->id2.i_chain.cl_bpc); | ||
1640 | printk("id2.i_chain.cl_count: %u\n", fe->id2.i_chain.cl_count); | ||
1641 | printk("id2.i_chain.cl_next_free_rec: %u\n", | ||
1642 | fe->id2.i_chain.cl_next_free_rec); | ||
1643 | for(i = 0; i < fe->id2.i_chain.cl_next_free_rec; i++) { | ||
1644 | printk("fe->id2.i_chain.cl_recs[%d].c_free: %u\n", i, | ||
1645 | fe->id2.i_chain.cl_recs[i].c_free); | ||
1646 | printk("fe->id2.i_chain.cl_recs[%d].c_total: %u\n", i, | ||
1647 | fe->id2.i_chain.cl_recs[i].c_total); | ||
1648 | printk("fe->id2.i_chain.cl_recs[%d].c_blkno: %"MLFu64"\n", i, | ||
1649 | fe->id2.i_chain.cl_recs[i].c_blkno); | ||
1650 | } | ||
1651 | } | ||
diff --git a/fs/ocfs2/suballoc.h b/fs/ocfs2/suballoc.h new file mode 100644 index 000000000000..a76c82a7ceac --- /dev/null +++ b/fs/ocfs2/suballoc.h | |||
@@ -0,0 +1,132 @@ | |||
1 | /* -*- mode: c; c-basic-offset: 8; -*- | ||
2 | * vim: noexpandtab sw=8 ts=8 sts=0: | ||
3 | * | ||
4 | * suballoc.h | ||
5 | * | ||
6 | * Defines sub allocator api | ||
7 | * | ||
8 | * Copyright (C) 2003, 2004 Oracle. All rights reserved. | ||
9 | * | ||
10 | * This program is free software; you can redistribute it and/or | ||
11 | * modify it under the terms of the GNU General Public | ||
12 | * License as published by the Free Software Foundation; either | ||
13 | * version 2 of the License, or (at your option) any later version. | ||
14 | * | ||
15 | * This program is distributed in the hope that it will be useful, | ||
16 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
17 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | ||
18 | * General Public License for more details. | ||
19 | * | ||
20 | * You should have received a copy of the GNU General Public | ||
21 | * License along with this program; if not, write to the | ||
22 | * Free Software Foundation, Inc., 59 Temple Place - Suite 330, | ||
23 | * Boston, MA 02111-1307, USA. | ||
24 | */ | ||
25 | |||
26 | #ifndef _CHAINALLOC_H_ | ||
27 | #define _CHAINALLOC_H_ | ||
28 | |||
29 | typedef int (group_search_t)(struct inode *, | ||
30 | struct buffer_head *, | ||
31 | u32, | ||
32 | u32, | ||
33 | u16 *, | ||
34 | u16 *); | ||
35 | |||
36 | struct ocfs2_alloc_context { | ||
37 | struct inode *ac_inode; /* which bitmap are we allocating from? */ | ||
38 | struct buffer_head *ac_bh; /* file entry bh */ | ||
39 | u32 ac_bits_wanted; | ||
40 | u32 ac_bits_given; | ||
41 | #define OCFS2_AC_USE_LOCAL 1 | ||
42 | #define OCFS2_AC_USE_MAIN 2 | ||
43 | #define OCFS2_AC_USE_INODE 3 | ||
44 | #define OCFS2_AC_USE_META 4 | ||
45 | u32 ac_which; | ||
46 | struct ocfs2_journal_handle *ac_handle; | ||
47 | |||
48 | /* these are used by the chain search */ | ||
49 | u16 ac_chain; | ||
50 | int ac_allow_chain_relink; | ||
51 | group_search_t *ac_group_search; | ||
52 | }; | ||
53 | |||
54 | void ocfs2_free_alloc_context(struct ocfs2_alloc_context *ac); | ||
55 | static inline int ocfs2_alloc_context_bits_left(struct ocfs2_alloc_context *ac) | ||
56 | { | ||
57 | return ac->ac_bits_wanted - ac->ac_bits_given; | ||
58 | } | ||
59 | |||
60 | int ocfs2_reserve_new_metadata(struct ocfs2_super *osb, | ||
61 | struct ocfs2_journal_handle *handle, | ||
62 | struct ocfs2_dinode *fe, | ||
63 | struct ocfs2_alloc_context **ac); | ||
64 | int ocfs2_reserve_new_inode(struct ocfs2_super *osb, | ||
65 | struct ocfs2_journal_handle *handle, | ||
66 | struct ocfs2_alloc_context **ac); | ||
67 | int ocfs2_reserve_clusters(struct ocfs2_super *osb, | ||
68 | struct ocfs2_journal_handle *handle, | ||
69 | u32 bits_wanted, | ||
70 | struct ocfs2_alloc_context **ac); | ||
71 | |||
72 | int ocfs2_claim_metadata(struct ocfs2_super *osb, | ||
73 | struct ocfs2_journal_handle *handle, | ||
74 | struct ocfs2_alloc_context *ac, | ||
75 | u32 bits_wanted, | ||
76 | u16 *suballoc_bit_start, | ||
77 | u32 *num_bits, | ||
78 | u64 *blkno_start); | ||
79 | int ocfs2_claim_new_inode(struct ocfs2_super *osb, | ||
80 | struct ocfs2_journal_handle *handle, | ||
81 | struct ocfs2_alloc_context *ac, | ||
82 | u16 *suballoc_bit, | ||
83 | u64 *fe_blkno); | ||
84 | int ocfs2_claim_clusters(struct ocfs2_super *osb, | ||
85 | struct ocfs2_journal_handle *handle, | ||
86 | struct ocfs2_alloc_context *ac, | ||
87 | u32 min_clusters, | ||
88 | u32 *cluster_start, | ||
89 | u32 *num_clusters); | ||
90 | |||
91 | int ocfs2_free_dinode(struct ocfs2_journal_handle *handle, | ||
92 | struct inode *inode_alloc_inode, | ||
93 | struct buffer_head *inode_alloc_bh, | ||
94 | struct ocfs2_dinode *di); | ||
95 | int ocfs2_free_extent_block(struct ocfs2_journal_handle *handle, | ||
96 | struct inode *eb_alloc_inode, | ||
97 | struct buffer_head *eb_alloc_bh, | ||
98 | struct ocfs2_extent_block *eb); | ||
99 | int ocfs2_free_clusters(struct ocfs2_journal_handle *handle, | ||
100 | struct inode *bitmap_inode, | ||
101 | struct buffer_head *bitmap_bh, | ||
102 | u64 start_blk, | ||
103 | unsigned int num_clusters); | ||
104 | |||
105 | static inline u32 ocfs2_cluster_from_desc(struct ocfs2_super *osb, | ||
106 | u64 bg_blkno) | ||
107 | { | ||
108 | /* This should work for all block group descriptors as only | ||
109 | * the 1st group descriptor of the cluster bitmap is | ||
110 | * different. */ | ||
111 | |||
112 | if (bg_blkno == osb->first_cluster_group_blkno) | ||
113 | return 0; | ||
114 | |||
115 | /* the rest of the block groups are located at the beginning | ||
116 | * of their 1st cluster, so a direct translation just | ||
117 | * works. */ | ||
118 | return ocfs2_blocks_to_clusters(osb->sb, bg_blkno); | ||
119 | } | ||
120 | |||
121 | static inline int ocfs2_is_cluster_bitmap(struct inode *inode) | ||
122 | { | ||
123 | struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); | ||
124 | return osb->bitmap_blkno == OCFS2_I(inode)->ip_blkno; | ||
125 | } | ||
126 | |||
/* For use by the local alloc code ONLY. Everything else should go
 * through the task-specific reserve/claim APIs declared above. */
int ocfs2_reserve_cluster_bitmap_bits(struct ocfs2_super *osb,
				      struct ocfs2_alloc_context *ac);
131 | |||
132 | #endif /* _CHAINALLOC_H_ */ | ||
diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c new file mode 100644 index 000000000000..48bf7f0ce544 --- /dev/null +++ b/fs/ocfs2/super.c | |||
@@ -0,0 +1,1733 @@ | |||
1 | /* -*- mode: c; c-basic-offset: 8; -*- | ||
2 | * vim: noexpandtab sw=8 ts=8 sts=0: | ||
3 | * | ||
4 | * super.c | ||
5 | * | ||
6 | * load/unload driver, mount/dismount volumes | ||
7 | * | ||
8 | * Copyright (C) 2002, 2004 Oracle. All rights reserved. | ||
9 | * | ||
10 | * This program is free software; you can redistribute it and/or | ||
11 | * modify it under the terms of the GNU General Public | ||
12 | * License as published by the Free Software Foundation; either | ||
13 | * version 2 of the License, or (at your option) any later version. | ||
14 | * | ||
15 | * This program is distributed in the hope that it will be useful, | ||
16 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
17 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | ||
18 | * General Public License for more details. | ||
19 | * | ||
20 | * You should have received a copy of the GNU General Public | ||
21 | * License along with this program; if not, write to the | ||
22 | * Free Software Foundation, Inc., 59 Temple Place - Suite 330, | ||
23 | * Boston, MA 021110-1307, USA. | ||
24 | */ | ||
25 | |||
26 | #include <linux/module.h> | ||
27 | #include <linux/fs.h> | ||
28 | #include <linux/types.h> | ||
29 | #include <linux/slab.h> | ||
30 | #include <linux/highmem.h> | ||
31 | #include <linux/utsname.h> | ||
32 | #include <linux/init.h> | ||
33 | #include <linux/random.h> | ||
34 | #include <linux/statfs.h> | ||
35 | #include <linux/moduleparam.h> | ||
36 | #include <linux/blkdev.h> | ||
37 | #include <linux/socket.h> | ||
38 | #include <linux/inet.h> | ||
39 | #include <linux/parser.h> | ||
40 | #include <linux/crc32.h> | ||
41 | #include <linux/debugfs.h> | ||
42 | |||
43 | #include <cluster/nodemanager.h> | ||
44 | |||
45 | #define MLOG_MASK_PREFIX ML_SUPER | ||
46 | #include <cluster/masklog.h> | ||
47 | |||
48 | #include "ocfs2.h" | ||
49 | |||
50 | /* this should be the only file to include a version 1 header */ | ||
51 | #include "ocfs1_fs_compat.h" | ||
52 | |||
53 | #include "alloc.h" | ||
54 | #include "dlmglue.h" | ||
55 | #include "export.h" | ||
56 | #include "extent_map.h" | ||
57 | #include "heartbeat.h" | ||
58 | #include "inode.h" | ||
59 | #include "journal.h" | ||
60 | #include "localalloc.h" | ||
61 | #include "namei.h" | ||
62 | #include "slot_map.h" | ||
63 | #include "super.h" | ||
64 | #include "sysfile.h" | ||
65 | #include "uptodate.h" | ||
66 | #include "ver.h" | ||
67 | #include "vote.h" | ||
68 | |||
69 | #include "buffer_head_io.h" | ||
70 | |||
71 | /* | ||
72 | * Globals | ||
73 | */ | ||
74 | static spinlock_t ocfs2_globals_lock = SPIN_LOCK_UNLOCKED; | ||
75 | |||
76 | static u32 osb_id; /* Keeps track of next available OSB Id */ | ||
77 | |||
78 | static kmem_cache_t *ocfs2_inode_cachep = NULL; | ||
79 | |||
80 | kmem_cache_t *ocfs2_lock_cache = NULL; | ||
81 | |||
82 | /* OCFS2 needs to schedule several different types of work which | ||
83 | * require cluster locking, disk I/O, recovery waits, etc. Since these | ||
84 | * types of work tend to be heavy we avoid using the kernel events | ||
85 | * workqueue and schedule on our own. */ | ||
86 | struct workqueue_struct *ocfs2_wq = NULL; | ||
87 | |||
88 | static struct dentry *ocfs2_debugfs_root = NULL; | ||
89 | |||
90 | MODULE_AUTHOR("Oracle"); | ||
91 | MODULE_LICENSE("GPL"); | ||
92 | |||
93 | static int ocfs2_parse_options(struct super_block *sb, char *options, | ||
94 | unsigned long *mount_opt, int is_remount); | ||
95 | static void ocfs2_put_super(struct super_block *sb); | ||
96 | static int ocfs2_mount_volume(struct super_block *sb); | ||
97 | static int ocfs2_remount(struct super_block *sb, int *flags, char *data); | ||
98 | static void ocfs2_dismount_volume(struct super_block *sb, int mnt_err); | ||
99 | static int ocfs2_initialize_mem_caches(void); | ||
100 | static void ocfs2_free_mem_caches(void); | ||
101 | static void ocfs2_delete_osb(struct ocfs2_super *osb); | ||
102 | |||
103 | static int ocfs2_statfs(struct super_block *sb, struct kstatfs *buf); | ||
104 | |||
105 | static int ocfs2_sync_fs(struct super_block *sb, int wait); | ||
106 | |||
107 | static int ocfs2_init_global_system_inodes(struct ocfs2_super *osb); | ||
108 | static int ocfs2_init_local_system_inodes(struct ocfs2_super *osb); | ||
109 | static int ocfs2_release_system_inodes(struct ocfs2_super *osb); | ||
110 | static int ocfs2_fill_local_node_info(struct ocfs2_super *osb); | ||
111 | static int ocfs2_check_volume(struct ocfs2_super *osb); | ||
112 | static int ocfs2_verify_volume(struct ocfs2_dinode *di, | ||
113 | struct buffer_head *bh, | ||
114 | u32 sectsize); | ||
115 | static int ocfs2_initialize_super(struct super_block *sb, | ||
116 | struct buffer_head *bh, | ||
117 | int sector_size); | ||
118 | static int ocfs2_get_sector(struct super_block *sb, | ||
119 | struct buffer_head **bh, | ||
120 | int block, | ||
121 | int sect_size); | ||
122 | static void ocfs2_write_super(struct super_block *sb); | ||
123 | static struct inode *ocfs2_alloc_inode(struct super_block *sb); | ||
124 | static void ocfs2_destroy_inode(struct inode *inode); | ||
125 | |||
126 | static unsigned long long ocfs2_max_file_offset(unsigned int blockshift); | ||
127 | |||
128 | static struct super_operations ocfs2_sops = { | ||
129 | .statfs = ocfs2_statfs, | ||
130 | .alloc_inode = ocfs2_alloc_inode, | ||
131 | .destroy_inode = ocfs2_destroy_inode, | ||
132 | .drop_inode = ocfs2_drop_inode, | ||
133 | .clear_inode = ocfs2_clear_inode, | ||
134 | .delete_inode = ocfs2_delete_inode, | ||
135 | .sync_fs = ocfs2_sync_fs, | ||
136 | .write_super = ocfs2_write_super, | ||
137 | .put_super = ocfs2_put_super, | ||
138 | .remount_fs = ocfs2_remount, | ||
139 | }; | ||
140 | |||
141 | enum { | ||
142 | Opt_barrier, | ||
143 | Opt_err_panic, | ||
144 | Opt_err_ro, | ||
145 | Opt_intr, | ||
146 | Opt_nointr, | ||
147 | Opt_hb_none, | ||
148 | Opt_hb_local, | ||
149 | Opt_data_ordered, | ||
150 | Opt_data_writeback, | ||
151 | Opt_err, | ||
152 | }; | ||
153 | |||
154 | static match_table_t tokens = { | ||
155 | {Opt_barrier, "barrier=%u"}, | ||
156 | {Opt_err_panic, "errors=panic"}, | ||
157 | {Opt_err_ro, "errors=remount-ro"}, | ||
158 | {Opt_intr, "intr"}, | ||
159 | {Opt_nointr, "nointr"}, | ||
160 | {Opt_hb_none, OCFS2_HB_NONE}, | ||
161 | {Opt_hb_local, OCFS2_HB_LOCAL}, | ||
162 | {Opt_data_ordered, "data=ordered"}, | ||
163 | {Opt_data_writeback, "data=writeback"}, | ||
164 | {Opt_err, NULL} | ||
165 | }; | ||
166 | |||
167 | /* | ||
168 | * write_super and sync_fs ripped right out of ext3. | ||
169 | */ | ||
170 | static void ocfs2_write_super(struct super_block *sb) | ||
171 | { | ||
172 | if (down_trylock(&sb->s_lock) == 0) | ||
173 | BUG(); | ||
174 | sb->s_dirt = 0; | ||
175 | } | ||
176 | |||
177 | static int ocfs2_sync_fs(struct super_block *sb, int wait) | ||
178 | { | ||
179 | int status = 0; | ||
180 | tid_t target; | ||
181 | struct ocfs2_super *osb = OCFS2_SB(sb); | ||
182 | |||
183 | sb->s_dirt = 0; | ||
184 | |||
185 | if (ocfs2_is_hard_readonly(osb)) | ||
186 | return -EROFS; | ||
187 | |||
188 | if (wait) { | ||
189 | status = ocfs2_flush_truncate_log(osb); | ||
190 | if (status < 0) | ||
191 | mlog_errno(status); | ||
192 | } else { | ||
193 | ocfs2_schedule_truncate_log_flush(osb, 0); | ||
194 | } | ||
195 | |||
196 | if (journal_start_commit(OCFS2_SB(sb)->journal->j_journal, &target)) { | ||
197 | if (wait) | ||
198 | log_wait_commit(OCFS2_SB(sb)->journal->j_journal, | ||
199 | target); | ||
200 | } | ||
201 | return 0; | ||
202 | } | ||
203 | |||
204 | static int ocfs2_init_global_system_inodes(struct ocfs2_super *osb) | ||
205 | { | ||
206 | struct inode *new = NULL; | ||
207 | int status = 0; | ||
208 | int i; | ||
209 | |||
210 | mlog_entry_void(); | ||
211 | |||
212 | new = ocfs2_iget(osb, osb->root_blkno); | ||
213 | if (IS_ERR(new)) { | ||
214 | status = PTR_ERR(new); | ||
215 | mlog_errno(status); | ||
216 | goto bail; | ||
217 | } | ||
218 | osb->root_inode = new; | ||
219 | |||
220 | new = ocfs2_iget(osb, osb->system_dir_blkno); | ||
221 | if (IS_ERR(new)) { | ||
222 | status = PTR_ERR(new); | ||
223 | mlog_errno(status); | ||
224 | goto bail; | ||
225 | } | ||
226 | osb->sys_root_inode = new; | ||
227 | |||
228 | for (i = OCFS2_FIRST_ONLINE_SYSTEM_INODE; | ||
229 | i <= OCFS2_LAST_GLOBAL_SYSTEM_INODE; i++) { | ||
230 | new = ocfs2_get_system_file_inode(osb, i, osb->slot_num); | ||
231 | if (!new) { | ||
232 | ocfs2_release_system_inodes(osb); | ||
233 | status = -EINVAL; | ||
234 | mlog_errno(status); | ||
235 | /* FIXME: Should ERROR_RO_FS */ | ||
236 | mlog(ML_ERROR, "Unable to load system inode %d, " | ||
237 | "possibly corrupt fs?", i); | ||
238 | goto bail; | ||
239 | } | ||
240 | // the array now has one ref, so drop this one | ||
241 | iput(new); | ||
242 | } | ||
243 | |||
244 | bail: | ||
245 | mlog_exit(status); | ||
246 | return status; | ||
247 | } | ||
248 | |||
249 | static int ocfs2_init_local_system_inodes(struct ocfs2_super *osb) | ||
250 | { | ||
251 | struct inode *new = NULL; | ||
252 | int status = 0; | ||
253 | int i; | ||
254 | |||
255 | mlog_entry_void(); | ||
256 | |||
257 | for (i = OCFS2_LAST_GLOBAL_SYSTEM_INODE + 1; | ||
258 | i < NUM_SYSTEM_INODES; | ||
259 | i++) { | ||
260 | new = ocfs2_get_system_file_inode(osb, i, osb->slot_num); | ||
261 | if (!new) { | ||
262 | ocfs2_release_system_inodes(osb); | ||
263 | status = -EINVAL; | ||
264 | mlog(ML_ERROR, "status=%d, sysfile=%d, slot=%d\n", | ||
265 | status, i, osb->slot_num); | ||
266 | goto bail; | ||
267 | } | ||
268 | /* the array now has one ref, so drop this one */ | ||
269 | iput(new); | ||
270 | } | ||
271 | |||
272 | bail: | ||
273 | mlog_exit(status); | ||
274 | return status; | ||
275 | } | ||
276 | |||
277 | static int ocfs2_release_system_inodes(struct ocfs2_super *osb) | ||
278 | { | ||
279 | int status = 0, i; | ||
280 | struct inode *inode; | ||
281 | |||
282 | mlog_entry_void(); | ||
283 | |||
284 | for (i = 0; i < NUM_SYSTEM_INODES; i++) { | ||
285 | inode = osb->system_inodes[i]; | ||
286 | if (inode) { | ||
287 | iput(inode); | ||
288 | osb->system_inodes[i] = NULL; | ||
289 | } | ||
290 | } | ||
291 | |||
292 | inode = osb->sys_root_inode; | ||
293 | if (inode) { | ||
294 | iput(inode); | ||
295 | osb->sys_root_inode = NULL; | ||
296 | } | ||
297 | |||
298 | inode = osb->root_inode; | ||
299 | if (inode) { | ||
300 | iput(inode); | ||
301 | osb->root_inode = NULL; | ||
302 | } | ||
303 | |||
304 | mlog_exit(status); | ||
305 | return status; | ||
306 | } | ||
307 | |||
308 | /* We're allocating fs objects, use GFP_NOFS */ | ||
309 | static struct inode *ocfs2_alloc_inode(struct super_block *sb) | ||
310 | { | ||
311 | struct ocfs2_inode_info *oi; | ||
312 | |||
313 | oi = kmem_cache_alloc(ocfs2_inode_cachep, SLAB_NOFS); | ||
314 | if (!oi) | ||
315 | return NULL; | ||
316 | |||
317 | return &oi->vfs_inode; | ||
318 | } | ||
319 | |||
320 | static void ocfs2_destroy_inode(struct inode *inode) | ||
321 | { | ||
322 | kmem_cache_free(ocfs2_inode_cachep, OCFS2_I(inode)); | ||
323 | } | ||
324 | |||
325 | /* From xfs_super.c:xfs_max_file_offset | ||
326 | * Copyright (c) 2000-2004 Silicon Graphics, Inc. | ||
327 | */ | ||
328 | static unsigned long long ocfs2_max_file_offset(unsigned int blockshift) | ||
329 | { | ||
330 | unsigned int pagefactor = 1; | ||
331 | unsigned int bitshift = BITS_PER_LONG - 1; | ||
332 | |||
333 | /* Figure out maximum filesize, on Linux this can depend on | ||
334 | * the filesystem blocksize (on 32 bit platforms). | ||
335 | * __block_prepare_write does this in an [unsigned] long... | ||
336 | * page->index << (PAGE_CACHE_SHIFT - bbits) | ||
337 | * So, for page sized blocks (4K on 32 bit platforms), | ||
338 | * this wraps at around 8Tb (hence MAX_LFS_FILESIZE which is | ||
339 | * (((u64)PAGE_CACHE_SIZE << (BITS_PER_LONG-1))-1) | ||
340 | * but for smaller blocksizes it is less (bbits = log2 bsize). | ||
341 | * Note1: get_block_t takes a long (implicit cast from above) | ||
342 | * Note2: The Large Block Device (LBD and HAVE_SECTOR_T) patch | ||
343 | * can optionally convert the [unsigned] long from above into | ||
344 | * an [unsigned] long long. | ||
345 | */ | ||
346 | |||
347 | #if BITS_PER_LONG == 32 | ||
348 | # if defined(CONFIG_LBD) | ||
349 | BUG_ON(sizeof(sector_t) != 8); | ||
350 | pagefactor = PAGE_CACHE_SIZE; | ||
351 | bitshift = BITS_PER_LONG; | ||
352 | # else | ||
353 | pagefactor = PAGE_CACHE_SIZE >> (PAGE_CACHE_SHIFT - blockshift); | ||
354 | # endif | ||
355 | #endif | ||
356 | |||
357 | return (((unsigned long long)pagefactor) << bitshift) - 1; | ||
358 | } | ||
359 | |||
360 | static int ocfs2_remount(struct super_block *sb, int *flags, char *data) | ||
361 | { | ||
362 | int incompat_features; | ||
363 | int ret = 0; | ||
364 | unsigned long parsed_options; | ||
365 | struct ocfs2_super *osb = OCFS2_SB(sb); | ||
366 | |||
367 | if (!ocfs2_parse_options(sb, data, &parsed_options, 1)) { | ||
368 | ret = -EINVAL; | ||
369 | goto out; | ||
370 | } | ||
371 | |||
372 | if ((osb->s_mount_opt & OCFS2_MOUNT_HB_LOCAL) != | ||
373 | (parsed_options & OCFS2_MOUNT_HB_LOCAL)) { | ||
374 | ret = -EINVAL; | ||
375 | mlog(ML_ERROR, "Cannot change heartbeat mode on remount\n"); | ||
376 | goto out; | ||
377 | } | ||
378 | |||
379 | if ((osb->s_mount_opt & OCFS2_MOUNT_DATA_WRITEBACK) != | ||
380 | (parsed_options & OCFS2_MOUNT_DATA_WRITEBACK)) { | ||
381 | ret = -EINVAL; | ||
382 | mlog(ML_ERROR, "Cannot change data mode on remount\n"); | ||
383 | goto out; | ||
384 | } | ||
385 | |||
386 | /* We're going to/from readonly mode. */ | ||
387 | if ((*flags & MS_RDONLY) != (sb->s_flags & MS_RDONLY)) { | ||
388 | /* Lock here so the check of HARD_RO and the potential | ||
389 | * setting of SOFT_RO is atomic. */ | ||
390 | spin_lock(&osb->osb_lock); | ||
391 | if (osb->osb_flags & OCFS2_OSB_HARD_RO) { | ||
392 | mlog(ML_ERROR, "Remount on readonly device is forbidden.\n"); | ||
393 | ret = -EROFS; | ||
394 | goto unlock_osb; | ||
395 | } | ||
396 | |||
397 | if (*flags & MS_RDONLY) { | ||
398 | mlog(0, "Going to ro mode.\n"); | ||
399 | sb->s_flags |= MS_RDONLY; | ||
400 | osb->osb_flags |= OCFS2_OSB_SOFT_RO; | ||
401 | } else { | ||
402 | mlog(0, "Making ro filesystem writeable.\n"); | ||
403 | |||
404 | if (osb->osb_flags & OCFS2_OSB_ERROR_FS) { | ||
405 | mlog(ML_ERROR, "Cannot remount RDWR " | ||
406 | "filesystem due to previous errors.\n"); | ||
407 | ret = -EROFS; | ||
408 | goto unlock_osb; | ||
409 | } | ||
410 | incompat_features = OCFS2_HAS_RO_COMPAT_FEATURE(sb, ~OCFS2_FEATURE_RO_COMPAT_SUPP); | ||
411 | if (incompat_features) { | ||
412 | mlog(ML_ERROR, "Cannot remount RDWR because " | ||
413 | "of unsupported optional features " | ||
414 | "(%x).\n", incompat_features); | ||
415 | ret = -EINVAL; | ||
416 | goto unlock_osb; | ||
417 | } | ||
418 | sb->s_flags &= ~MS_RDONLY; | ||
419 | osb->osb_flags &= ~OCFS2_OSB_SOFT_RO; | ||
420 | } | ||
421 | unlock_osb: | ||
422 | spin_unlock(&osb->osb_lock); | ||
423 | } | ||
424 | |||
425 | if (!ret) { | ||
426 | if (!ocfs2_is_hard_readonly(osb)) | ||
427 | ocfs2_set_journal_params(osb); | ||
428 | |||
429 | /* Only save off the new mount options in case of a successful | ||
430 | * remount. */ | ||
431 | osb->s_mount_opt = parsed_options; | ||
432 | } | ||
433 | out: | ||
434 | return ret; | ||
435 | } | ||
436 | |||
437 | static int ocfs2_sb_probe(struct super_block *sb, | ||
438 | struct buffer_head **bh, | ||
439 | int *sector_size) | ||
440 | { | ||
441 | int status = 0, tmpstat; | ||
442 | struct ocfs1_vol_disk_hdr *hdr; | ||
443 | struct ocfs2_dinode *di; | ||
444 | int blksize; | ||
445 | |||
446 | *bh = NULL; | ||
447 | |||
448 | /* may be > 512 */ | ||
449 | *sector_size = bdev_hardsect_size(sb->s_bdev); | ||
450 | if (*sector_size > OCFS2_MAX_BLOCKSIZE) { | ||
451 | mlog(ML_ERROR, "Hardware sector size too large: %d (max=%d)\n", | ||
452 | *sector_size, OCFS2_MAX_BLOCKSIZE); | ||
453 | status = -EINVAL; | ||
454 | goto bail; | ||
455 | } | ||
456 | |||
457 | /* Can this really happen? */ | ||
458 | if (*sector_size < OCFS2_MIN_BLOCKSIZE) | ||
459 | *sector_size = OCFS2_MIN_BLOCKSIZE; | ||
460 | |||
461 | /* check block zero for old format */ | ||
462 | status = ocfs2_get_sector(sb, bh, 0, *sector_size); | ||
463 | if (status < 0) { | ||
464 | mlog_errno(status); | ||
465 | goto bail; | ||
466 | } | ||
467 | hdr = (struct ocfs1_vol_disk_hdr *) (*bh)->b_data; | ||
468 | if (hdr->major_version == OCFS1_MAJOR_VERSION) { | ||
469 | mlog(ML_ERROR, "incompatible version: %u.%u\n", | ||
470 | hdr->major_version, hdr->minor_version); | ||
471 | status = -EINVAL; | ||
472 | } | ||
473 | if (memcmp(hdr->signature, OCFS1_VOLUME_SIGNATURE, | ||
474 | strlen(OCFS1_VOLUME_SIGNATURE)) == 0) { | ||
475 | mlog(ML_ERROR, "incompatible volume signature: %8s\n", | ||
476 | hdr->signature); | ||
477 | status = -EINVAL; | ||
478 | } | ||
479 | brelse(*bh); | ||
480 | *bh = NULL; | ||
481 | if (status < 0) { | ||
482 | mlog(ML_ERROR, "This is an ocfs v1 filesystem which must be " | ||
483 | "upgraded before mounting with ocfs v2\n"); | ||
484 | goto bail; | ||
485 | } | ||
486 | |||
487 | /* | ||
488 | * Now check at magic offset for 512, 1024, 2048, 4096 | ||
489 | * blocksizes. 4096 is the maximum blocksize because it is | ||
490 | * the minimum clustersize. | ||
491 | */ | ||
492 | status = -EINVAL; | ||
493 | for (blksize = *sector_size; | ||
494 | blksize <= OCFS2_MAX_BLOCKSIZE; | ||
495 | blksize <<= 1) { | ||
496 | tmpstat = ocfs2_get_sector(sb, bh, | ||
497 | OCFS2_SUPER_BLOCK_BLKNO, | ||
498 | blksize); | ||
499 | if (tmpstat < 0) { | ||
500 | status = tmpstat; | ||
501 | mlog_errno(status); | ||
502 | goto bail; | ||
503 | } | ||
504 | di = (struct ocfs2_dinode *) (*bh)->b_data; | ||
505 | status = ocfs2_verify_volume(di, *bh, blksize); | ||
506 | if (status >= 0) | ||
507 | goto bail; | ||
508 | brelse(*bh); | ||
509 | *bh = NULL; | ||
510 | if (status != -EAGAIN) | ||
511 | break; | ||
512 | } | ||
513 | |||
514 | bail: | ||
515 | return status; | ||
516 | } | ||
517 | |||
518 | static int ocfs2_fill_super(struct super_block *sb, void *data, int silent) | ||
519 | { | ||
520 | struct dentry *root; | ||
521 | int status, sector_size; | ||
522 | unsigned long parsed_opt; | ||
523 | struct inode *inode = NULL; | ||
524 | struct ocfs2_super *osb = NULL; | ||
525 | struct buffer_head *bh = NULL; | ||
526 | |||
527 | mlog_entry("%p, %p, %i", sb, data, silent); | ||
528 | |||
529 | /* for now we only have one cluster/node, make sure we see it | ||
530 | * in the heartbeat universe */ | ||
531 | if (!o2hb_check_local_node_heartbeating()) { | ||
532 | status = -EINVAL; | ||
533 | goto read_super_error; | ||
534 | } | ||
535 | |||
536 | /* probe for superblock */ | ||
537 | status = ocfs2_sb_probe(sb, &bh, §or_size); | ||
538 | if (status < 0) { | ||
539 | mlog(ML_ERROR, "superblock probe failed!\n"); | ||
540 | goto read_super_error; | ||
541 | } | ||
542 | |||
543 | status = ocfs2_initialize_super(sb, bh, sector_size); | ||
544 | osb = OCFS2_SB(sb); | ||
545 | if (status < 0) { | ||
546 | mlog_errno(status); | ||
547 | goto read_super_error; | ||
548 | } | ||
549 | brelse(bh); | ||
550 | bh = NULL; | ||
551 | |||
552 | if (!ocfs2_parse_options(sb, data, &parsed_opt, 0)) { | ||
553 | status = -EINVAL; | ||
554 | goto read_super_error; | ||
555 | } | ||
556 | osb->s_mount_opt = parsed_opt; | ||
557 | |||
558 | sb->s_magic = OCFS2_SUPER_MAGIC; | ||
559 | |||
560 | /* Hard readonly mode only if: bdev_read_only, MS_RDONLY, | ||
561 | * heartbeat=none */ | ||
562 | if (bdev_read_only(sb->s_bdev)) { | ||
563 | if (!(sb->s_flags & MS_RDONLY)) { | ||
564 | status = -EACCES; | ||
565 | mlog(ML_ERROR, "Readonly device detected but readonly " | ||
566 | "mount was not specified.\n"); | ||
567 | goto read_super_error; | ||
568 | } | ||
569 | |||
570 | /* You should not be able to start a local heartbeat | ||
571 | * on a readonly device. */ | ||
572 | if (osb->s_mount_opt & OCFS2_MOUNT_HB_LOCAL) { | ||
573 | status = -EROFS; | ||
574 | mlog(ML_ERROR, "Local heartbeat specified on readonly " | ||
575 | "device.\n"); | ||
576 | goto read_super_error; | ||
577 | } | ||
578 | |||
579 | status = ocfs2_check_journals_nolocks(osb); | ||
580 | if (status < 0) { | ||
581 | if (status == -EROFS) | ||
582 | mlog(ML_ERROR, "Recovery required on readonly " | ||
583 | "file system, but write access is " | ||
584 | "unavailable.\n"); | ||
585 | else | ||
586 | mlog_errno(status); | ||
587 | goto read_super_error; | ||
588 | } | ||
589 | |||
590 | ocfs2_set_ro_flag(osb, 1); | ||
591 | |||
592 | printk(KERN_NOTICE "Readonly device detected. No cluster " | ||
593 | "services will be utilized for this mount. Recovery " | ||
594 | "will be skipped.\n"); | ||
595 | } | ||
596 | |||
597 | if (!ocfs2_is_hard_readonly(osb)) { | ||
598 | /* If this isn't a hard readonly mount, then we need | ||
599 | * to make sure that heartbeat is in a valid state, | ||
600 | * and that we mark ourselves soft readonly is -oro | ||
601 | * was specified. */ | ||
602 | if (!(osb->s_mount_opt & OCFS2_MOUNT_HB_LOCAL)) { | ||
603 | mlog(ML_ERROR, "No heartbeat for device (%s)\n", | ||
604 | sb->s_id); | ||
605 | status = -EINVAL; | ||
606 | goto read_super_error; | ||
607 | } | ||
608 | |||
609 | if (sb->s_flags & MS_RDONLY) | ||
610 | ocfs2_set_ro_flag(osb, 0); | ||
611 | } | ||
612 | |||
613 | osb->osb_debug_root = debugfs_create_dir(osb->uuid_str, | ||
614 | ocfs2_debugfs_root); | ||
615 | if (!osb->osb_debug_root) { | ||
616 | status = -EINVAL; | ||
617 | mlog(ML_ERROR, "Unable to create per-mount debugfs root.\n"); | ||
618 | goto read_super_error; | ||
619 | } | ||
620 | |||
621 | status = ocfs2_mount_volume(sb); | ||
622 | if (osb->root_inode) | ||
623 | inode = igrab(osb->root_inode); | ||
624 | |||
625 | if (status < 0) | ||
626 | goto read_super_error; | ||
627 | |||
628 | if (!inode) { | ||
629 | status = -EIO; | ||
630 | mlog_errno(status); | ||
631 | goto read_super_error; | ||
632 | } | ||
633 | |||
634 | root = d_alloc_root(inode); | ||
635 | if (!root) { | ||
636 | status = -ENOMEM; | ||
637 | mlog_errno(status); | ||
638 | goto read_super_error; | ||
639 | } | ||
640 | |||
641 | sb->s_root = root; | ||
642 | |||
643 | ocfs2_complete_mount_recovery(osb); | ||
644 | |||
645 | printk("ocfs2: Mounting device (%u,%u) on (node %d, slot %d) with %s " | ||
646 | "data mode.\n", | ||
647 | MAJOR(sb->s_dev), MINOR(sb->s_dev), osb->node_num, | ||
648 | osb->slot_num, | ||
649 | osb->s_mount_opt & OCFS2_MOUNT_DATA_WRITEBACK ? "writeback" : | ||
650 | "ordered"); | ||
651 | |||
652 | atomic_set(&osb->vol_state, VOLUME_MOUNTED); | ||
653 | wake_up(&osb->osb_mount_event); | ||
654 | |||
655 | mlog_exit(status); | ||
656 | return status; | ||
657 | |||
658 | read_super_error: | ||
659 | if (bh != NULL) | ||
660 | brelse(bh); | ||
661 | |||
662 | if (inode) | ||
663 | iput(inode); | ||
664 | |||
665 | if (osb) { | ||
666 | atomic_set(&osb->vol_state, VOLUME_DISABLED); | ||
667 | wake_up(&osb->osb_mount_event); | ||
668 | ocfs2_dismount_volume(sb, 1); | ||
669 | } | ||
670 | |||
671 | mlog_exit(status); | ||
672 | return status; | ||
673 | } | ||
674 | |||
675 | static struct super_block *ocfs2_get_sb(struct file_system_type *fs_type, | ||
676 | int flags, | ||
677 | const char *dev_name, | ||
678 | void *data) | ||
679 | { | ||
680 | return get_sb_bdev(fs_type, flags, dev_name, data, ocfs2_fill_super); | ||
681 | } | ||
682 | |||
683 | static struct file_system_type ocfs2_fs_type = { | ||
684 | .owner = THIS_MODULE, | ||
685 | .name = "ocfs2", | ||
686 | .get_sb = ocfs2_get_sb, /* is this called when we mount | ||
687 | * the fs? */ | ||
688 | .kill_sb = kill_block_super, /* set to the generic one | ||
689 | * right now, but do we | ||
690 | * need to change that? */ | ||
691 | .fs_flags = FS_REQUIRES_DEV, | ||
692 | .next = NULL | ||
693 | }; | ||
694 | |||
695 | static int ocfs2_parse_options(struct super_block *sb, | ||
696 | char *options, | ||
697 | unsigned long *mount_opt, | ||
698 | int is_remount) | ||
699 | { | ||
700 | int status; | ||
701 | char *p; | ||
702 | |||
703 | mlog_entry("remount: %d, options: \"%s\"\n", is_remount, | ||
704 | options ? options : "(none)"); | ||
705 | |||
706 | *mount_opt = 0; | ||
707 | |||
708 | if (!options) { | ||
709 | status = 1; | ||
710 | goto bail; | ||
711 | } | ||
712 | |||
713 | while ((p = strsep(&options, ",")) != NULL) { | ||
714 | int token, option; | ||
715 | substring_t args[MAX_OPT_ARGS]; | ||
716 | |||
717 | if (!*p) | ||
718 | continue; | ||
719 | |||
720 | token = match_token(p, tokens, args); | ||
721 | switch (token) { | ||
722 | case Opt_hb_local: | ||
723 | *mount_opt |= OCFS2_MOUNT_HB_LOCAL; | ||
724 | break; | ||
725 | case Opt_hb_none: | ||
726 | *mount_opt &= ~OCFS2_MOUNT_HB_LOCAL; | ||
727 | break; | ||
728 | case Opt_barrier: | ||
729 | if (match_int(&args[0], &option)) { | ||
730 | status = 0; | ||
731 | goto bail; | ||
732 | } | ||
733 | if (option) | ||
734 | *mount_opt |= OCFS2_MOUNT_BARRIER; | ||
735 | else | ||
736 | *mount_opt &= ~OCFS2_MOUNT_BARRIER; | ||
737 | break; | ||
738 | case Opt_intr: | ||
739 | *mount_opt &= ~OCFS2_MOUNT_NOINTR; | ||
740 | break; | ||
741 | case Opt_nointr: | ||
742 | *mount_opt |= OCFS2_MOUNT_NOINTR; | ||
743 | break; | ||
744 | case Opt_err_panic: | ||
745 | *mount_opt |= OCFS2_MOUNT_ERRORS_PANIC; | ||
746 | break; | ||
747 | case Opt_err_ro: | ||
748 | *mount_opt &= ~OCFS2_MOUNT_ERRORS_PANIC; | ||
749 | break; | ||
750 | case Opt_data_ordered: | ||
751 | *mount_opt &= ~OCFS2_MOUNT_DATA_WRITEBACK; | ||
752 | break; | ||
753 | case Opt_data_writeback: | ||
754 | *mount_opt |= OCFS2_MOUNT_DATA_WRITEBACK; | ||
755 | break; | ||
756 | default: | ||
757 | mlog(ML_ERROR, | ||
758 | "Unrecognized mount option \"%s\" " | ||
759 | "or missing value\n", p); | ||
760 | status = 0; | ||
761 | goto bail; | ||
762 | } | ||
763 | } | ||
764 | |||
765 | status = 1; | ||
766 | |||
767 | bail: | ||
768 | mlog_exit(status); | ||
769 | return status; | ||
770 | } | ||
771 | |||
772 | static int __init ocfs2_init(void) | ||
773 | { | ||
774 | int status; | ||
775 | |||
776 | mlog_entry_void(); | ||
777 | |||
778 | ocfs2_print_version(); | ||
779 | |||
780 | if (init_ocfs2_extent_maps()) | ||
781 | return -ENOMEM; | ||
782 | |||
783 | status = init_ocfs2_uptodate_cache(); | ||
784 | if (status < 0) { | ||
785 | mlog_errno(status); | ||
786 | goto leave; | ||
787 | } | ||
788 | |||
789 | status = ocfs2_initialize_mem_caches(); | ||
790 | if (status < 0) { | ||
791 | mlog_errno(status); | ||
792 | goto leave; | ||
793 | } | ||
794 | |||
795 | ocfs2_wq = create_singlethread_workqueue("ocfs2_wq"); | ||
796 | if (!ocfs2_wq) { | ||
797 | status = -ENOMEM; | ||
798 | goto leave; | ||
799 | } | ||
800 | |||
801 | spin_lock(&ocfs2_globals_lock); | ||
802 | osb_id = 0; | ||
803 | spin_unlock(&ocfs2_globals_lock); | ||
804 | |||
805 | ocfs2_debugfs_root = debugfs_create_dir("ocfs2", NULL); | ||
806 | if (!ocfs2_debugfs_root) { | ||
807 | status = -EFAULT; | ||
808 | mlog(ML_ERROR, "Unable to create ocfs2 debugfs root.\n"); | ||
809 | } | ||
810 | |||
811 | leave: | ||
812 | if (status < 0) { | ||
813 | ocfs2_free_mem_caches(); | ||
814 | exit_ocfs2_uptodate_cache(); | ||
815 | exit_ocfs2_extent_maps(); | ||
816 | } | ||
817 | |||
818 | mlog_exit(status); | ||
819 | |||
820 | if (status >= 0) { | ||
821 | return register_filesystem(&ocfs2_fs_type); | ||
822 | } else | ||
823 | return -1; | ||
824 | } | ||
825 | |||
826 | static void __exit ocfs2_exit(void) | ||
827 | { | ||
828 | mlog_entry_void(); | ||
829 | |||
830 | if (ocfs2_wq) { | ||
831 | flush_workqueue(ocfs2_wq); | ||
832 | destroy_workqueue(ocfs2_wq); | ||
833 | } | ||
834 | |||
835 | debugfs_remove(ocfs2_debugfs_root); | ||
836 | |||
837 | ocfs2_free_mem_caches(); | ||
838 | |||
839 | unregister_filesystem(&ocfs2_fs_type); | ||
840 | |||
841 | exit_ocfs2_extent_maps(); | ||
842 | |||
843 | exit_ocfs2_uptodate_cache(); | ||
844 | |||
845 | mlog_exit_void(); | ||
846 | } | ||
847 | |||
/*
 * .put_super hook: sync the block device, then dismount the volume
 * (mnt_err == 0).
 */
static void ocfs2_put_super(struct super_block *sb)
{
	mlog_entry("(0x%p)\n", sb);

	ocfs2_sync_blockdev(sb);

	ocfs2_dismount_volume(sb, 0);

	mlog_exit_void();
}
857 | |||
858 | static int ocfs2_statfs(struct super_block *sb, struct kstatfs *buf) | ||
859 | { | ||
860 | struct ocfs2_super *osb; | ||
861 | u32 numbits, freebits; | ||
862 | int status; | ||
863 | struct ocfs2_dinode *bm_lock; | ||
864 | struct buffer_head *bh = NULL; | ||
865 | struct inode *inode = NULL; | ||
866 | |||
867 | mlog_entry("(%p, %p)\n", sb, buf); | ||
868 | |||
869 | osb = OCFS2_SB(sb); | ||
870 | |||
871 | inode = ocfs2_get_system_file_inode(osb, | ||
872 | GLOBAL_BITMAP_SYSTEM_INODE, | ||
873 | OCFS2_INVALID_SLOT); | ||
874 | if (!inode) { | ||
875 | mlog(ML_ERROR, "failed to get bitmap inode\n"); | ||
876 | status = -EIO; | ||
877 | goto bail; | ||
878 | } | ||
879 | |||
880 | status = ocfs2_meta_lock(inode, NULL, &bh, 0); | ||
881 | if (status < 0) { | ||
882 | mlog_errno(status); | ||
883 | goto bail; | ||
884 | } | ||
885 | |||
886 | bm_lock = (struct ocfs2_dinode *) bh->b_data; | ||
887 | |||
888 | numbits = le32_to_cpu(bm_lock->id1.bitmap1.i_total); | ||
889 | freebits = numbits - le32_to_cpu(bm_lock->id1.bitmap1.i_used); | ||
890 | |||
891 | buf->f_type = OCFS2_SUPER_MAGIC; | ||
892 | buf->f_bsize = sb->s_blocksize; | ||
893 | buf->f_namelen = OCFS2_MAX_FILENAME_LEN; | ||
894 | buf->f_blocks = ((sector_t) numbits) * | ||
895 | (osb->s_clustersize >> osb->sb->s_blocksize_bits); | ||
896 | buf->f_bfree = ((sector_t) freebits) * | ||
897 | (osb->s_clustersize >> osb->sb->s_blocksize_bits); | ||
898 | buf->f_bavail = buf->f_bfree; | ||
899 | buf->f_files = numbits; | ||
900 | buf->f_ffree = freebits; | ||
901 | |||
902 | brelse(bh); | ||
903 | |||
904 | ocfs2_meta_unlock(inode, 0); | ||
905 | status = 0; | ||
906 | bail: | ||
907 | if (inode) | ||
908 | iput(inode); | ||
909 | |||
910 | mlog_exit(status); | ||
911 | |||
912 | return status; | ||
913 | } | ||
914 | |||
915 | static void ocfs2_inode_init_once(void *data, | ||
916 | kmem_cache_t *cachep, | ||
917 | unsigned long flags) | ||
918 | { | ||
919 | struct ocfs2_inode_info *oi = data; | ||
920 | |||
921 | if ((flags & (SLAB_CTOR_VERIFY|SLAB_CTOR_CONSTRUCTOR)) == | ||
922 | SLAB_CTOR_CONSTRUCTOR) { | ||
923 | oi->ip_flags = 0; | ||
924 | oi->ip_open_count = 0; | ||
925 | spin_lock_init(&oi->ip_lock); | ||
926 | ocfs2_extent_map_init(&oi->vfs_inode); | ||
927 | INIT_LIST_HEAD(&oi->ip_handle_list); | ||
928 | INIT_LIST_HEAD(&oi->ip_io_markers); | ||
929 | oi->ip_handle = NULL; | ||
930 | oi->ip_created_trans = 0; | ||
931 | oi->ip_last_trans = 0; | ||
932 | oi->ip_dir_start_lookup = 0; | ||
933 | |||
934 | init_rwsem(&oi->ip_alloc_sem); | ||
935 | init_MUTEX(&(oi->ip_io_sem)); | ||
936 | |||
937 | oi->ip_blkno = 0ULL; | ||
938 | oi->ip_clusters = 0; | ||
939 | |||
940 | ocfs2_lock_res_init_once(&oi->ip_rw_lockres); | ||
941 | ocfs2_lock_res_init_once(&oi->ip_meta_lockres); | ||
942 | ocfs2_lock_res_init_once(&oi->ip_data_lockres); | ||
943 | |||
944 | ocfs2_metadata_cache_init(&oi->vfs_inode); | ||
945 | |||
946 | inode_init_once(&oi->vfs_inode); | ||
947 | } | ||
948 | } | ||
949 | |||
950 | static int ocfs2_initialize_mem_caches(void) | ||
951 | { | ||
952 | ocfs2_inode_cachep = kmem_cache_create("ocfs2_inode_cache", | ||
953 | sizeof(struct ocfs2_inode_info), | ||
954 | 0, SLAB_HWCACHE_ALIGN|SLAB_RECLAIM_ACCOUNT, | ||
955 | ocfs2_inode_init_once, NULL); | ||
956 | if (!ocfs2_inode_cachep) | ||
957 | return -ENOMEM; | ||
958 | |||
959 | ocfs2_lock_cache = kmem_cache_create("ocfs2_lock", | ||
960 | sizeof(struct ocfs2_journal_lock), | ||
961 | 0, | ||
962 | SLAB_NO_REAP|SLAB_HWCACHE_ALIGN, | ||
963 | NULL, NULL); | ||
964 | if (!ocfs2_lock_cache) | ||
965 | return -ENOMEM; | ||
966 | |||
967 | return 0; | ||
968 | } | ||
969 | |||
970 | static void ocfs2_free_mem_caches(void) | ||
971 | { | ||
972 | if (ocfs2_inode_cachep) | ||
973 | kmem_cache_destroy(ocfs2_inode_cachep); | ||
974 | if (ocfs2_lock_cache) | ||
975 | kmem_cache_destroy(ocfs2_lock_cache); | ||
976 | |||
977 | ocfs2_inode_cachep = NULL; | ||
978 | ocfs2_lock_cache = NULL; | ||
979 | } | ||
980 | |||
981 | static int ocfs2_get_sector(struct super_block *sb, | ||
982 | struct buffer_head **bh, | ||
983 | int block, | ||
984 | int sect_size) | ||
985 | { | ||
986 | if (!sb_set_blocksize(sb, sect_size)) { | ||
987 | mlog(ML_ERROR, "unable to set blocksize\n"); | ||
988 | return -EIO; | ||
989 | } | ||
990 | |||
991 | *bh = sb_getblk(sb, block); | ||
992 | if (!*bh) { | ||
993 | mlog_errno(-EIO); | ||
994 | return -EIO; | ||
995 | } | ||
996 | lock_buffer(*bh); | ||
997 | if (!buffer_dirty(*bh)) | ||
998 | clear_buffer_uptodate(*bh); | ||
999 | unlock_buffer(*bh); | ||
1000 | ll_rw_block(READ, 1, bh); | ||
1001 | wait_on_buffer(*bh); | ||
1002 | return 0; | ||
1003 | } | ||
1004 | |||
1005 | /* ocfs2 1.0 only allows one cluster and node identity per kernel image. */ | ||
1006 | static int ocfs2_fill_local_node_info(struct ocfs2_super *osb) | ||
1007 | { | ||
1008 | int status; | ||
1009 | |||
1010 | /* XXX hold a ref on the node while mounte? easy enough, if | ||
1011 | * desirable. */ | ||
1012 | osb->node_num = o2nm_this_node(); | ||
1013 | if (osb->node_num == O2NM_MAX_NODES) { | ||
1014 | mlog(ML_ERROR, "could not find this host's node number\n"); | ||
1015 | status = -ENOENT; | ||
1016 | goto bail; | ||
1017 | } | ||
1018 | |||
1019 | mlog(ML_NOTICE, "I am node %d\n", osb->node_num); | ||
1020 | |||
1021 | status = 0; | ||
1022 | bail: | ||
1023 | return status; | ||
1024 | } | ||
1025 | |||
/*
 * Cluster-side half of a mount: identify the local node, hook up
 * heartbeat callbacks, the dlm and the network handlers, then -- under
 * the exclusive super block lock -- claim a slot, load the node-local
 * system inodes, check/recover the volume and announce the mount to
 * the other nodes.
 *
 * A hard read-only volume skips all of that and returns 0.
 *
 * Returns 0 on success or the negative errno of the first step that
 * failed. Nothing is unwound here apart from dropping the super block
 * lock if it was taken.
 */
static int ocfs2_mount_volume(struct super_block *sb)
{
	struct ocfs2_super *osb = OCFS2_SB(sb);
	int have_super_lock = 0;
	int status = 0;

	mlog_entry_void();

	if (ocfs2_is_hard_readonly(osb))
		goto leave;

	status = ocfs2_fill_local_node_info(osb);
	if (status < 0) {
		mlog_errno(status);
		goto leave;
	}

	status = ocfs2_register_hb_callbacks(osb);
	if (status < 0) {
		mlog_errno(status);
		goto leave;
	}

	status = ocfs2_dlm_init(osb);
	if (status < 0) {
		mlog_errno(status);
		goto leave;
	}

	/* requires vote_thread to be running. */
	status = ocfs2_register_net_handlers(osb);
	if (status < 0) {
		mlog_errno(status);
		goto leave;
	}

	status = ocfs2_super_lock(osb, 1);
	if (status < 0) {
		mlog_errno(status);
		goto leave;
	}
	have_super_lock = 1;

	/* This will load up the node map and add ourselves to it. */
	status = ocfs2_find_slot(osb);
	if (status < 0) {
		mlog_errno(status);
		goto leave;
	}

	ocfs2_populate_mounted_map(osb);

	/* load all node-local system inodes */
	status = ocfs2_init_local_system_inodes(osb);
	if (status < 0) {
		mlog_errno(status);
		goto leave;
	}

	status = ocfs2_check_volume(osb);
	if (status < 0) {
		mlog_errno(status);
		goto leave;
	}

	status = ocfs2_truncate_log_init(osb);
	if (status < 0) {
		mlog_errno(status);
		goto leave;
	}

	/* The mount vote has to go out *after* our journal has been
	 * recovered, because it makes the other nodes stop treating us
	 * as needing recovery. It also has to go out *before* the super
	 * block lock is dropped, otherwise their recovery threads might
	 * try to clean us up while we're live! */
	status = ocfs2_request_mount_vote(osb);
	if (status < 0)
		mlog_errno(status);

leave:
	if (have_super_lock)
		ocfs2_super_unlock(osb, 1);

	mlog_exit(status);
	return status;
}
1113 | |||
1114 | /* we can't grab the goofy sem lock from inside wait_event, so we use | ||
1115 | * memory barriers to make sure that we'll see the null task before | ||
1116 | * being woken up */ | ||
1117 | static int ocfs2_recovery_thread_running(struct ocfs2_super *osb) | ||
1118 | { | ||
1119 | mb(); | ||
1120 | return osb->recovery_thread_task != NULL; | ||
1121 | } | ||
1122 | |||
1123 | static void ocfs2_dismount_volume(struct super_block *sb, int mnt_err) | ||
1124 | { | ||
1125 | int tmp; | ||
1126 | struct ocfs2_super *osb = NULL; | ||
1127 | |||
1128 | mlog_entry("(0x%p)\n", sb); | ||
1129 | |||
1130 | BUG_ON(!sb); | ||
1131 | osb = OCFS2_SB(sb); | ||
1132 | BUG_ON(!osb); | ||
1133 | |||
1134 | ocfs2_shutdown_local_alloc(osb); | ||
1135 | |||
1136 | ocfs2_truncate_log_shutdown(osb); | ||
1137 | |||
1138 | /* disable any new recovery threads and wait for any currently | ||
1139 | * running ones to exit. Do this before setting the vol_state. */ | ||
1140 | down(&osb->recovery_lock); | ||
1141 | osb->disable_recovery = 1; | ||
1142 | up(&osb->recovery_lock); | ||
1143 | wait_event(osb->recovery_event, !ocfs2_recovery_thread_running(osb)); | ||
1144 | |||
1145 | /* At this point, we know that no more recovery threads can be | ||
1146 | * launched, so wait for any recovery completion work to | ||
1147 | * complete. */ | ||
1148 | flush_workqueue(ocfs2_wq); | ||
1149 | |||
1150 | ocfs2_journal_shutdown(osb); | ||
1151 | |||
1152 | ocfs2_sync_blockdev(sb); | ||
1153 | |||
1154 | /* No dlm means we've failed during mount, so skip all the | ||
1155 | * steps which depended on that to complete. */ | ||
1156 | if (osb->dlm) { | ||
1157 | tmp = ocfs2_super_lock(osb, 1); | ||
1158 | if (tmp < 0) { | ||
1159 | mlog_errno(tmp); | ||
1160 | return; | ||
1161 | } | ||
1162 | |||
1163 | tmp = ocfs2_request_umount_vote(osb); | ||
1164 | if (tmp < 0) | ||
1165 | mlog_errno(tmp); | ||
1166 | |||
1167 | if (osb->slot_num != OCFS2_INVALID_SLOT) | ||
1168 | ocfs2_put_slot(osb); | ||
1169 | |||
1170 | ocfs2_super_unlock(osb, 1); | ||
1171 | } | ||
1172 | |||
1173 | ocfs2_release_system_inodes(osb); | ||
1174 | |||
1175 | if (osb->dlm) { | ||
1176 | ocfs2_unregister_net_handlers(osb); | ||
1177 | |||
1178 | ocfs2_dlm_shutdown(osb); | ||
1179 | } | ||
1180 | |||
1181 | ocfs2_clear_hb_callbacks(osb); | ||
1182 | |||
1183 | debugfs_remove(osb->osb_debug_root); | ||
1184 | |||
1185 | if (!mnt_err) | ||
1186 | ocfs2_stop_heartbeat(osb); | ||
1187 | |||
1188 | atomic_set(&osb->vol_state, VOLUME_DISMOUNTED); | ||
1189 | |||
1190 | printk("ocfs2: Unmounting device (%u,%u) on (node %d)\n", | ||
1191 | MAJOR(osb->sb->s_dev), MINOR(osb->sb->s_dev), osb->node_num); | ||
1192 | |||
1193 | ocfs2_delete_osb(osb); | ||
1194 | kfree(osb); | ||
1195 | sb->s_dev = 0; | ||
1196 | sb->s_fs_info = NULL; | ||
1197 | } | ||
1198 | |||
1199 | static int ocfs2_setup_osb_uuid(struct ocfs2_super *osb, const unsigned char *uuid, | ||
1200 | unsigned uuid_bytes) | ||
1201 | { | ||
1202 | int i, ret; | ||
1203 | char *ptr; | ||
1204 | |||
1205 | BUG_ON(uuid_bytes != OCFS2_VOL_UUID_LEN); | ||
1206 | |||
1207 | osb->uuid_str = kcalloc(1, OCFS2_VOL_UUID_LEN * 2 + 1, GFP_KERNEL); | ||
1208 | if (osb->uuid_str == NULL) | ||
1209 | return -ENOMEM; | ||
1210 | |||
1211 | memcpy(osb->uuid, uuid, OCFS2_VOL_UUID_LEN); | ||
1212 | |||
1213 | for (i = 0, ptr = osb->uuid_str; i < OCFS2_VOL_UUID_LEN; i++) { | ||
1214 | /* print with null */ | ||
1215 | ret = snprintf(ptr, 3, "%02X", uuid[i]); | ||
1216 | if (ret != 2) /* drop super cleans up */ | ||
1217 | return -EINVAL; | ||
1218 | /* then only advance past the last char */ | ||
1219 | ptr += 2; | ||
1220 | } | ||
1221 | |||
1222 | return 0; | ||
1223 | } | ||
1224 | |||
1225 | static int ocfs2_initialize_super(struct super_block *sb, | ||
1226 | struct buffer_head *bh, | ||
1227 | int sector_size) | ||
1228 | { | ||
1229 | int status = 0; | ||
1230 | int i; | ||
1231 | struct ocfs2_dinode *di = NULL; | ||
1232 | struct inode *inode = NULL; | ||
1233 | struct buffer_head *bitmap_bh = NULL; | ||
1234 | struct ocfs2_journal *journal; | ||
1235 | __le32 uuid_net_key; | ||
1236 | struct ocfs2_super *osb; | ||
1237 | |||
1238 | mlog_entry_void(); | ||
1239 | |||
1240 | osb = kcalloc(1, sizeof(struct ocfs2_super), GFP_KERNEL); | ||
1241 | if (!osb) { | ||
1242 | status = -ENOMEM; | ||
1243 | mlog_errno(status); | ||
1244 | goto bail; | ||
1245 | } | ||
1246 | |||
1247 | sb->s_fs_info = osb; | ||
1248 | sb->s_op = &ocfs2_sops; | ||
1249 | sb->s_export_op = &ocfs2_export_ops; | ||
1250 | sb->s_flags |= MS_NOATIME; | ||
1251 | /* this is needed to support O_LARGEFILE */ | ||
1252 | sb->s_maxbytes = ocfs2_max_file_offset(sb->s_blocksize_bits); | ||
1253 | |||
1254 | osb->sb = sb; | ||
1255 | /* Save off for ocfs2_rw_direct */ | ||
1256 | osb->s_sectsize_bits = blksize_bits(sector_size); | ||
1257 | if (!osb->s_sectsize_bits) | ||
1258 | BUG(); | ||
1259 | |||
1260 | osb->net_response_ids = 0; | ||
1261 | spin_lock_init(&osb->net_response_lock); | ||
1262 | INIT_LIST_HEAD(&osb->net_response_list); | ||
1263 | |||
1264 | INIT_LIST_HEAD(&osb->osb_net_handlers); | ||
1265 | init_waitqueue_head(&osb->recovery_event); | ||
1266 | spin_lock_init(&osb->vote_task_lock); | ||
1267 | init_waitqueue_head(&osb->vote_event); | ||
1268 | osb->vote_work_sequence = 0; | ||
1269 | osb->vote_wake_sequence = 0; | ||
1270 | INIT_LIST_HEAD(&osb->blocked_lock_list); | ||
1271 | osb->blocked_lock_count = 0; | ||
1272 | INIT_LIST_HEAD(&osb->vote_list); | ||
1273 | spin_lock_init(&osb->osb_lock); | ||
1274 | |||
1275 | atomic_set(&osb->alloc_stats.moves, 0); | ||
1276 | atomic_set(&osb->alloc_stats.local_data, 0); | ||
1277 | atomic_set(&osb->alloc_stats.bitmap_data, 0); | ||
1278 | atomic_set(&osb->alloc_stats.bg_allocs, 0); | ||
1279 | atomic_set(&osb->alloc_stats.bg_extends, 0); | ||
1280 | |||
1281 | ocfs2_init_node_maps(osb); | ||
1282 | |||
1283 | snprintf(osb->dev_str, sizeof(osb->dev_str), "%u,%u", | ||
1284 | MAJOR(osb->sb->s_dev), MINOR(osb->sb->s_dev)); | ||
1285 | |||
1286 | init_MUTEX(&osb->recovery_lock); | ||
1287 | |||
1288 | osb->disable_recovery = 0; | ||
1289 | osb->recovery_thread_task = NULL; | ||
1290 | |||
1291 | init_waitqueue_head(&osb->checkpoint_event); | ||
1292 | atomic_set(&osb->needs_checkpoint, 0); | ||
1293 | |||
1294 | osb->node_num = O2NM_INVALID_NODE_NUM; | ||
1295 | osb->slot_num = OCFS2_INVALID_SLOT; | ||
1296 | |||
1297 | osb->local_alloc_state = OCFS2_LA_UNUSED; | ||
1298 | osb->local_alloc_bh = NULL; | ||
1299 | |||
1300 | ocfs2_setup_hb_callbacks(osb); | ||
1301 | |||
1302 | init_waitqueue_head(&osb->osb_mount_event); | ||
1303 | |||
1304 | osb->vol_label = kmalloc(OCFS2_MAX_VOL_LABEL_LEN, GFP_KERNEL); | ||
1305 | if (!osb->vol_label) { | ||
1306 | mlog(ML_ERROR, "unable to alloc vol label\n"); | ||
1307 | status = -ENOMEM; | ||
1308 | goto bail; | ||
1309 | } | ||
1310 | |||
1311 | osb->uuid = kmalloc(OCFS2_VOL_UUID_LEN, GFP_KERNEL); | ||
1312 | if (!osb->uuid) { | ||
1313 | mlog(ML_ERROR, "unable to alloc uuid\n"); | ||
1314 | status = -ENOMEM; | ||
1315 | goto bail; | ||
1316 | } | ||
1317 | |||
1318 | di = (struct ocfs2_dinode *)bh->b_data; | ||
1319 | |||
1320 | osb->max_slots = le16_to_cpu(di->id2.i_super.s_max_slots); | ||
1321 | if (osb->max_slots > OCFS2_MAX_SLOTS || osb->max_slots == 0) { | ||
1322 | mlog(ML_ERROR, "Invalid number of node slots (%u)\n", | ||
1323 | osb->max_slots); | ||
1324 | status = -EINVAL; | ||
1325 | goto bail; | ||
1326 | } | ||
1327 | mlog(ML_NOTICE, "max_slots for this device: %u\n", osb->max_slots); | ||
1328 | |||
1329 | osb->s_feature_compat = | ||
1330 | le32_to_cpu(OCFS2_RAW_SB(di)->s_feature_compat); | ||
1331 | osb->s_feature_ro_compat = | ||
1332 | le32_to_cpu(OCFS2_RAW_SB(di)->s_feature_ro_compat); | ||
1333 | osb->s_feature_incompat = | ||
1334 | le32_to_cpu(OCFS2_RAW_SB(di)->s_feature_incompat); | ||
1335 | |||
1336 | if ((i = OCFS2_HAS_INCOMPAT_FEATURE(osb->sb, ~OCFS2_FEATURE_INCOMPAT_SUPP))) { | ||
1337 | mlog(ML_ERROR, "couldn't mount because of unsupported " | ||
1338 | "optional features (%x).\n", i); | ||
1339 | status = -EINVAL; | ||
1340 | goto bail; | ||
1341 | } | ||
1342 | if (!(osb->sb->s_flags & MS_RDONLY) && | ||
1343 | (i = OCFS2_HAS_RO_COMPAT_FEATURE(osb->sb, ~OCFS2_FEATURE_RO_COMPAT_SUPP))) { | ||
1344 | mlog(ML_ERROR, "couldn't mount RDWR because of " | ||
1345 | "unsupported optional features (%x).\n", i); | ||
1346 | status = -EINVAL; | ||
1347 | goto bail; | ||
1348 | } | ||
1349 | |||
1350 | get_random_bytes(&osb->s_next_generation, sizeof(u32)); | ||
1351 | |||
1352 | /* FIXME | ||
1353 | * This should be done in ocfs2_journal_init(), but unknown | ||
1354 | * ordering issues will cause the filesystem to crash. | ||
1355 | * If anyone wants to figure out what part of the code | ||
1356 | * refers to osb->journal before ocfs2_journal_init() is run, | ||
1357 | * be my guest. | ||
1358 | */ | ||
1359 | /* initialize our journal structure */ | ||
1360 | |||
1361 | journal = kcalloc(1, sizeof(struct ocfs2_journal), GFP_KERNEL); | ||
1362 | if (!journal) { | ||
1363 | mlog(ML_ERROR, "unable to alloc journal\n"); | ||
1364 | status = -ENOMEM; | ||
1365 | goto bail; | ||
1366 | } | ||
1367 | osb->journal = journal; | ||
1368 | journal->j_osb = osb; | ||
1369 | |||
1370 | atomic_set(&journal->j_num_trans, 0); | ||
1371 | init_rwsem(&journal->j_trans_barrier); | ||
1372 | init_waitqueue_head(&journal->j_checkpointed); | ||
1373 | spin_lock_init(&journal->j_lock); | ||
1374 | journal->j_trans_id = (unsigned long) 1; | ||
1375 | INIT_LIST_HEAD(&journal->j_la_cleanups); | ||
1376 | INIT_WORK(&journal->j_recovery_work, ocfs2_complete_recovery, osb); | ||
1377 | journal->j_state = OCFS2_JOURNAL_FREE; | ||
1378 | |||
1379 | /* get some pseudo constants for clustersize bits */ | ||
1380 | osb->s_clustersize_bits = | ||
1381 | le32_to_cpu(di->id2.i_super.s_clustersize_bits); | ||
1382 | osb->s_clustersize = 1 << osb->s_clustersize_bits; | ||
1383 | mlog(0, "clusterbits=%d\n", osb->s_clustersize_bits); | ||
1384 | |||
1385 | if (osb->s_clustersize < OCFS2_MIN_CLUSTERSIZE || | ||
1386 | osb->s_clustersize > OCFS2_MAX_CLUSTERSIZE) { | ||
1387 | mlog(ML_ERROR, "Volume has invalid cluster size (%d)\n", | ||
1388 | osb->s_clustersize); | ||
1389 | status = -EINVAL; | ||
1390 | goto bail; | ||
1391 | } | ||
1392 | |||
1393 | if (ocfs2_clusters_to_blocks(osb->sb, le32_to_cpu(di->i_clusters) - 1) | ||
1394 | > (u32)~0UL) { | ||
1395 | mlog(ML_ERROR, "Volume might try to write to blocks beyond " | ||
1396 | "what jbd can address in 32 bits.\n"); | ||
1397 | status = -EINVAL; | ||
1398 | goto bail; | ||
1399 | } | ||
1400 | |||
1401 | if (ocfs2_setup_osb_uuid(osb, di->id2.i_super.s_uuid, | ||
1402 | sizeof(di->id2.i_super.s_uuid))) { | ||
1403 | mlog(ML_ERROR, "Out of memory trying to setup our uuid.\n"); | ||
1404 | status = -ENOMEM; | ||
1405 | goto bail; | ||
1406 | } | ||
1407 | |||
1408 | memcpy(&uuid_net_key, &osb->uuid[i], sizeof(osb->net_key)); | ||
1409 | osb->net_key = le32_to_cpu(uuid_net_key); | ||
1410 | |||
1411 | strncpy(osb->vol_label, di->id2.i_super.s_label, 63); | ||
1412 | osb->vol_label[63] = '\0'; | ||
1413 | osb->root_blkno = le64_to_cpu(di->id2.i_super.s_root_blkno); | ||
1414 | osb->system_dir_blkno = le64_to_cpu(di->id2.i_super.s_system_dir_blkno); | ||
1415 | osb->first_cluster_group_blkno = | ||
1416 | le64_to_cpu(di->id2.i_super.s_first_cluster_group); | ||
1417 | osb->fs_generation = le32_to_cpu(di->i_fs_generation); | ||
1418 | mlog(0, "vol_label: %s\n", osb->vol_label); | ||
1419 | mlog(0, "uuid: %s\n", osb->uuid_str); | ||
1420 | mlog(0, "root_blkno=%"MLFu64", system_dir_blkno=%"MLFu64"\n", | ||
1421 | osb->root_blkno, osb->system_dir_blkno); | ||
1422 | |||
1423 | osb->osb_dlm_debug = ocfs2_new_dlm_debug(); | ||
1424 | if (!osb->osb_dlm_debug) { | ||
1425 | status = -ENOMEM; | ||
1426 | mlog_errno(status); | ||
1427 | goto bail; | ||
1428 | } | ||
1429 | |||
1430 | atomic_set(&osb->vol_state, VOLUME_INIT); | ||
1431 | |||
1432 | /* load root, system_dir, and all global system inodes */ | ||
1433 | status = ocfs2_init_global_system_inodes(osb); | ||
1434 | if (status < 0) { | ||
1435 | mlog_errno(status); | ||
1436 | goto bail; | ||
1437 | } | ||
1438 | |||
1439 | /* | ||
1440 | * global bitmap | ||
1441 | */ | ||
1442 | inode = ocfs2_get_system_file_inode(osb, GLOBAL_BITMAP_SYSTEM_INODE, | ||
1443 | OCFS2_INVALID_SLOT); | ||
1444 | if (!inode) { | ||
1445 | status = -EINVAL; | ||
1446 | mlog_errno(status); | ||
1447 | goto bail; | ||
1448 | } | ||
1449 | |||
1450 | osb->bitmap_blkno = OCFS2_I(inode)->ip_blkno; | ||
1451 | |||
1452 | status = ocfs2_read_block(osb, osb->bitmap_blkno, &bitmap_bh, 0, | ||
1453 | inode); | ||
1454 | iput(inode); | ||
1455 | if (status < 0) { | ||
1456 | mlog_errno(status); | ||
1457 | goto bail; | ||
1458 | } | ||
1459 | |||
1460 | di = (struct ocfs2_dinode *) bitmap_bh->b_data; | ||
1461 | osb->bitmap_cpg = le16_to_cpu(di->id2.i_chain.cl_cpg); | ||
1462 | osb->num_clusters = le32_to_cpu(di->id1.bitmap1.i_total); | ||
1463 | brelse(bitmap_bh); | ||
1464 | mlog(0, "cluster bitmap inode: %"MLFu64", clusters per group: %u\n", | ||
1465 | osb->bitmap_blkno, osb->bitmap_cpg); | ||
1466 | |||
1467 | status = ocfs2_init_slot_info(osb); | ||
1468 | if (status < 0) { | ||
1469 | mlog_errno(status); | ||
1470 | goto bail; | ||
1471 | } | ||
1472 | |||
1473 | /* Link this osb onto the global linked list of all osb structures. */ | ||
1474 | /* The Global Link List is mainted for the whole driver . */ | ||
1475 | spin_lock(&ocfs2_globals_lock); | ||
1476 | osb->osb_id = osb_id; | ||
1477 | if (osb_id < OCFS2_MAX_OSB_ID) | ||
1478 | osb_id++; | ||
1479 | else { | ||
1480 | mlog(ML_ERROR, "Too many volumes mounted\n"); | ||
1481 | status = -ENOMEM; | ||
1482 | } | ||
1483 | spin_unlock(&ocfs2_globals_lock); | ||
1484 | |||
1485 | bail: | ||
1486 | mlog_exit(status); | ||
1487 | return status; | ||
1488 | } | ||
1489 | |||
1490 | /* | ||
1491 | * will return: -EAGAIN if it is ok to keep searching for superblocks | ||
1492 | * -EINVAL if there is a bad superblock | ||
1493 | * 0 on success | ||
1494 | */ | ||
1495 | static int ocfs2_verify_volume(struct ocfs2_dinode *di, | ||
1496 | struct buffer_head *bh, | ||
1497 | u32 blksz) | ||
1498 | { | ||
1499 | int status = -EAGAIN; | ||
1500 | |||
1501 | mlog_entry_void(); | ||
1502 | |||
1503 | if (memcmp(di->i_signature, OCFS2_SUPER_BLOCK_SIGNATURE, | ||
1504 | strlen(OCFS2_SUPER_BLOCK_SIGNATURE)) == 0) { | ||
1505 | status = -EINVAL; | ||
1506 | if ((1 << le32_to_cpu(di->id2.i_super.s_blocksize_bits)) != blksz) { | ||
1507 | mlog(ML_ERROR, "found superblock with incorrect block " | ||
1508 | "size: found %u, should be %u\n", | ||
1509 | 1 << le32_to_cpu(di->id2.i_super.s_blocksize_bits), | ||
1510 | blksz); | ||
1511 | } else if (le16_to_cpu(di->id2.i_super.s_major_rev_level) != | ||
1512 | OCFS2_MAJOR_REV_LEVEL || | ||
1513 | le16_to_cpu(di->id2.i_super.s_minor_rev_level) != | ||
1514 | OCFS2_MINOR_REV_LEVEL) { | ||
1515 | mlog(ML_ERROR, "found superblock with bad version: " | ||
1516 | "found %u.%u, should be %u.%u\n", | ||
1517 | le16_to_cpu(di->id2.i_super.s_major_rev_level), | ||
1518 | le16_to_cpu(di->id2.i_super.s_minor_rev_level), | ||
1519 | OCFS2_MAJOR_REV_LEVEL, | ||
1520 | OCFS2_MINOR_REV_LEVEL); | ||
1521 | } else if (bh->b_blocknr != le64_to_cpu(di->i_blkno)) { | ||
1522 | mlog(ML_ERROR, "bad block number on superblock: " | ||
1523 | "found %"MLFu64", should be %llu\n", | ||
1524 | di->i_blkno, (unsigned long long)bh->b_blocknr); | ||
1525 | } else if (le32_to_cpu(di->id2.i_super.s_clustersize_bits) < 12 || | ||
1526 | le32_to_cpu(di->id2.i_super.s_clustersize_bits) > 20) { | ||
1527 | mlog(ML_ERROR, "bad cluster size found: %u\n", | ||
1528 | 1 << le32_to_cpu(di->id2.i_super.s_clustersize_bits)); | ||
1529 | } else if (!le64_to_cpu(di->id2.i_super.s_root_blkno)) { | ||
1530 | mlog(ML_ERROR, "bad root_blkno: 0\n"); | ||
1531 | } else if (!le64_to_cpu(di->id2.i_super.s_system_dir_blkno)) { | ||
1532 | mlog(ML_ERROR, "bad system_dir_blkno: 0\n"); | ||
1533 | } else if (le16_to_cpu(di->id2.i_super.s_max_slots) > OCFS2_MAX_SLOTS) { | ||
1534 | mlog(ML_ERROR, | ||
1535 | "Superblock slots found greater than file system " | ||
1536 | "maximum: found %u, max %u\n", | ||
1537 | le16_to_cpu(di->id2.i_super.s_max_slots), | ||
1538 | OCFS2_MAX_SLOTS); | ||
1539 | } else { | ||
1540 | /* found it! */ | ||
1541 | status = 0; | ||
1542 | } | ||
1543 | } | ||
1544 | |||
1545 | mlog_exit(status); | ||
1546 | return status; | ||
1547 | } | ||
1548 | |||
1549 | static int ocfs2_check_volume(struct ocfs2_super *osb) | ||
1550 | { | ||
1551 | int status = 0; | ||
1552 | int dirty; | ||
1553 | struct ocfs2_dinode *local_alloc = NULL; /* only used if we | ||
1554 | * recover | ||
1555 | * ourselves. */ | ||
1556 | |||
1557 | mlog_entry_void(); | ||
1558 | |||
1559 | /* Init our journal object. */ | ||
1560 | status = ocfs2_journal_init(osb->journal, &dirty); | ||
1561 | if (status < 0) { | ||
1562 | mlog(ML_ERROR, "Could not initialize journal!\n"); | ||
1563 | goto finally; | ||
1564 | } | ||
1565 | |||
1566 | /* If the journal was unmounted cleanly then we don't want to | ||
1567 | * recover anything. Otherwise, journal_load will do that | ||
1568 | * dirty work for us :) */ | ||
1569 | if (!dirty) { | ||
1570 | status = ocfs2_journal_wipe(osb->journal, 0); | ||
1571 | if (status < 0) { | ||
1572 | mlog_errno(status); | ||
1573 | goto finally; | ||
1574 | } | ||
1575 | } else { | ||
1576 | mlog(ML_NOTICE, "File system was not unmounted cleanly, " | ||
1577 | "recovering volume.\n"); | ||
1578 | } | ||
1579 | |||
1580 | /* will play back anything left in the journal. */ | ||
1581 | ocfs2_journal_load(osb->journal); | ||
1582 | |||
1583 | if (dirty) { | ||
1584 | /* recover my local alloc if we didn't unmount cleanly. */ | ||
1585 | status = ocfs2_begin_local_alloc_recovery(osb, | ||
1586 | osb->slot_num, | ||
1587 | &local_alloc); | ||
1588 | if (status < 0) { | ||
1589 | mlog_errno(status); | ||
1590 | goto finally; | ||
1591 | } | ||
1592 | /* we complete the recovery process after we've marked | ||
1593 | * ourselves as mounted. */ | ||
1594 | } | ||
1595 | |||
1596 | mlog(0, "Journal loaded.\n"); | ||
1597 | |||
1598 | status = ocfs2_load_local_alloc(osb); | ||
1599 | if (status < 0) { | ||
1600 | mlog_errno(status); | ||
1601 | goto finally; | ||
1602 | } | ||
1603 | |||
1604 | if (dirty) { | ||
1605 | /* Recovery will be completed after we've mounted the | ||
1606 | * rest of the volume. */ | ||
1607 | osb->dirty = 1; | ||
1608 | osb->local_alloc_copy = local_alloc; | ||
1609 | local_alloc = NULL; | ||
1610 | } | ||
1611 | |||
1612 | /* go through each journal, trylock it and if you get the | ||
1613 | * lock, and it's marked as dirty, set the bit in the recover | ||
1614 | * map and launch a recovery thread for it. */ | ||
1615 | status = ocfs2_mark_dead_nodes(osb); | ||
1616 | if (status < 0) | ||
1617 | mlog_errno(status); | ||
1618 | |||
1619 | finally: | ||
1620 | if (local_alloc) | ||
1621 | kfree(local_alloc); | ||
1622 | |||
1623 | mlog_exit(status); | ||
1624 | return status; | ||
1625 | } | ||
1626 | |||
1627 | /* | ||
1628 | * The routine gets called from dismount or close whenever a dismount on | ||
1629 | * volume is requested and the osb open count becomes 1. | ||
1630 | * It will remove the osb from the global list and also free up all the | ||
1631 | * initialized resources and fileobject. | ||
1632 | */ | ||
1633 | static void ocfs2_delete_osb(struct ocfs2_super *osb) | ||
1634 | { | ||
1635 | mlog_entry_void(); | ||
1636 | |||
1637 | /* This function assumes that the caller has the main osb resource */ | ||
1638 | |||
1639 | if (osb->slot_info) | ||
1640 | ocfs2_free_slot_info(osb->slot_info); | ||
1641 | |||
1642 | /* FIXME | ||
1643 | * This belongs in journal shutdown, but because we have to | ||
1644 | * allocate osb->journal at the start of ocfs2_initalize_osb(), | ||
1645 | * we free it here. | ||
1646 | */ | ||
1647 | kfree(osb->journal); | ||
1648 | if (osb->local_alloc_copy) | ||
1649 | kfree(osb->local_alloc_copy); | ||
1650 | kfree(osb->uuid_str); | ||
1651 | ocfs2_put_dlm_debug(osb->osb_dlm_debug); | ||
1652 | memset(osb, 0, sizeof(struct ocfs2_super)); | ||
1653 | |||
1654 | mlog_exit_void(); | ||
1655 | } | ||
1656 | |||
1657 | /* Put OCFS2 into a readonly state, or (if the user specifies it), | ||
1658 | * panic(). We do not support continue-on-error operation. */ | ||
1659 | static void ocfs2_handle_error(struct super_block *sb) | ||
1660 | { | ||
1661 | struct ocfs2_super *osb = OCFS2_SB(sb); | ||
1662 | |||
1663 | if (osb->s_mount_opt & OCFS2_MOUNT_ERRORS_PANIC) | ||
1664 | panic("OCFS2: (device %s): panic forced after error\n", | ||
1665 | sb->s_id); | ||
1666 | |||
1667 | ocfs2_set_osb_flag(osb, OCFS2_OSB_ERROR_FS); | ||
1668 | |||
1669 | if (sb->s_flags & MS_RDONLY && | ||
1670 | (ocfs2_is_soft_readonly(osb) || | ||
1671 | ocfs2_is_hard_readonly(osb))) | ||
1672 | return; | ||
1673 | |||
1674 | printk(KERN_CRIT "File system is now read-only due to the potential " | ||
1675 | "of on-disk corruption. Please run fsck.ocfs2 once the file " | ||
1676 | "system is unmounted.\n"); | ||
1677 | sb->s_flags |= MS_RDONLY; | ||
1678 | ocfs2_set_ro_flag(osb, 0); | ||
1679 | } | ||
1680 | |||
1681 | static char error_buf[1024]; | ||
1682 | |||
1683 | void __ocfs2_error(struct super_block *sb, | ||
1684 | const char *function, | ||
1685 | const char *fmt, ...) | ||
1686 | { | ||
1687 | va_list args; | ||
1688 | |||
1689 | va_start(args, fmt); | ||
1690 | vsprintf(error_buf, fmt, args); | ||
1691 | va_end(args); | ||
1692 | |||
1693 | /* Not using mlog here because we want to show the actual | ||
1694 | * function the error came from. */ | ||
1695 | printk(KERN_CRIT "OCFS2: ERROR (device %s): %s: %s\n", | ||
1696 | sb->s_id, function, error_buf); | ||
1697 | |||
1698 | ocfs2_handle_error(sb); | ||
1699 | } | ||
1700 | |||
1701 | /* Handle critical errors. This is intentionally more drastic than | ||
1702 | * ocfs2_handle_error, so we only use for things like journal errors, | ||
1703 | * etc. */ | ||
1704 | void __ocfs2_abort(struct super_block* sb, | ||
1705 | const char *function, | ||
1706 | const char *fmt, ...) | ||
1707 | { | ||
1708 | va_list args; | ||
1709 | |||
1710 | va_start(args, fmt); | ||
1711 | vsprintf(error_buf, fmt, args); | ||
1712 | va_end(args); | ||
1713 | |||
1714 | printk(KERN_CRIT "OCFS2: abort (device %s): %s: %s\n", | ||
1715 | sb->s_id, function, error_buf); | ||
1716 | |||
1717 | /* We don't have the cluster support yet to go straight to | ||
1718 | * hard readonly in here. Until then, we want to keep | ||
1719 | * ocfs2_abort() so that we can at least mark critical | ||
1720 | * errors. | ||
1721 | * | ||
1722 | * TODO: This should abort the journal and alert other nodes | ||
1723 | * that our slot needs recovery. */ | ||
1724 | |||
1725 | /* Force a panic(). This stinks, but it's better than letting | ||
1726 | * things continue without having a proper hard readonly | ||
1727 | * here. */ | ||
1728 | OCFS2_SB(sb)->s_mount_opt |= OCFS2_MOUNT_ERRORS_PANIC; | ||
1729 | ocfs2_handle_error(sb); | ||
1730 | } | ||
1731 | |||
1732 | module_init(ocfs2_init); | ||
1733 | module_exit(ocfs2_exit); | ||
diff --git a/fs/ocfs2/super.h b/fs/ocfs2/super.h new file mode 100644 index 000000000000..c564177dfbdc --- /dev/null +++ b/fs/ocfs2/super.h | |||
@@ -0,0 +1,44 @@ | |||
1 | /* -*- mode: c; c-basic-offset: 8; -*- | ||
2 | * vim: noexpandtab sw=8 ts=8 sts=0: | ||
3 | * | ||
4 | * super.h | ||
5 | * | ||
6 | * Function prototypes | ||
7 | * | ||
8 | * Copyright (C) 2002, 2004 Oracle. All rights reserved. | ||
9 | * | ||
10 | * This program is free software; you can redistribute it and/or | ||
11 | * modify it under the terms of the GNU General Public | ||
12 | * License as published by the Free Software Foundation; either | ||
13 | * version 2 of the License, or (at your option) any later version. | ||
14 | * | ||
15 | * This program is distributed in the hope that it will be useful, | ||
16 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
17 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | ||
18 | * General Public License for more details. | ||
19 | * | ||
20 | * You should have received a copy of the GNU General Public | ||
21 | * License along with this program; if not, write to the | ||
22 | * Free Software Foundation, Inc., 59 Temple Place - Suite 330, | ||
23 | * Boston, MA 02111-1307, USA. | ||
24 | */ | ||
25 | |||
#ifndef OCFS2_SUPER_H
#define OCFS2_SUPER_H

/* Forward declarations, so that this header can be included without
 * depending on what was included before it. */
struct ocfs2_super;
struct super_block;
struct workqueue_struct;

/* Workqueue shared across the ocfs2 module. */
extern struct workqueue_struct *ocfs2_wq;

int ocfs2_publish_get_mount_state(struct ocfs2_super *osb,
				  int node_num);

/*
 * Report a file system error on @sb. @function names the reporting
 * function and @fmt is a printf-style format. Use the ocfs2_error()
 * wrapper, which supplies the function name automatically.
 */
void __ocfs2_error(struct super_block *sb,
		   const char *function,
		   const char *fmt, ...)
	__attribute__ ((format (printf, 3, 4)));
#define ocfs2_error(sb, fmt, args...) __ocfs2_error(sb, __PRETTY_FUNCTION__, fmt, ##args)

/*
 * Report a critical error on @sb; more drastic than ocfs2_error().
 * Use the ocfs2_abort() wrapper, which supplies the function name
 * automatically.
 */
void __ocfs2_abort(struct super_block *sb,
		   const char *function,
		   const char *fmt, ...)
	__attribute__ ((format (printf, 3, 4)));
#define ocfs2_abort(sb, fmt, args...) __ocfs2_abort(sb, __PRETTY_FUNCTION__, fmt, ##args)

#endif /* OCFS2_SUPER_H */
diff --git a/fs/ocfs2/symlink.c b/fs/ocfs2/symlink.c new file mode 100644 index 000000000000..f6986bd79e75 --- /dev/null +++ b/fs/ocfs2/symlink.c | |||
@@ -0,0 +1,180 @@ | |||
1 | /* -*- mode: c; c-basic-offset: 8; -*- | ||
2 | * vim: noexpandtab sw=8 ts=8 sts=0: | ||
3 | * | ||
4 | * linux/cluster/ssi/cfs/symlink.c | ||
5 | * | ||
6 | * This program is free software; you can redistribute it and/or | ||
7 | * modify it under the terms of the GNU General Public License as | ||
8 | * published by the Free Software Foundation; either version 2 of | ||
9 | * the License, or (at your option) any later version. | ||
10 | * | ||
11 | * This program is distributed in the hope that it will be useful, | ||
12 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
13 | * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE | ||
14 | * or NON INFRINGEMENT. See the GNU General Public License for more | ||
15 | * details. | ||
16 | * | ||
17 | * You should have received a copy of the GNU General Public License | ||
18 | * along with this program; if not, write to the Free Software | ||
19 | * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. | ||
20 | * | ||
21 | * Questions/Comments/Bugfixes to ssic-linux-devel@lists.sourceforge.net | ||
22 | * | ||
23 | * Copyright (C) 1992 Rick Sladkey | ||
24 | * | ||
25 | * Optimization changes Copyright (C) 1994 Florian La Roche | ||
26 | * | ||
27 | * Jun 7 1999, cache symlink lookups in the page cache. -DaveM | ||
28 | * | ||
29 | * Portions Copyright (C) 2001 Compaq Computer Corporation | ||
30 | * | ||
31 | * ocfs2 symlink handling code. | ||
32 | * | ||
33 | * Copyright (C) 2004, 2005 Oracle. | ||
34 | * | ||
35 | */ | ||
36 | |||
37 | #include <linux/fs.h> | ||
38 | #include <linux/types.h> | ||
39 | #include <linux/slab.h> | ||
40 | #include <linux/pagemap.h> | ||
41 | #include <linux/utsname.h> | ||
42 | |||
43 | #define MLOG_MASK_PREFIX ML_NAMEI | ||
44 | #include <cluster/masklog.h> | ||
45 | |||
46 | #include "ocfs2.h" | ||
47 | |||
48 | #include "alloc.h" | ||
49 | #include "file.h" | ||
50 | #include "inode.h" | ||
51 | #include "journal.h" | ||
52 | #include "symlink.h" | ||
53 | |||
54 | #include "buffer_head_io.h" | ||
55 | |||
56 | static char *ocfs2_page_getlink(struct dentry * dentry, | ||
57 | struct page **ppage); | ||
58 | static char *ocfs2_fast_symlink_getlink(struct inode *inode, | ||
59 | struct buffer_head **bh); | ||
60 | |||
61 | /* Read page 0 of the symlink into the page cache and kmap() it. */ | ||
62 | static char *ocfs2_page_getlink(struct dentry * dentry, | ||
63 | struct page **ppage) | ||
64 | { | ||
65 | struct address_space *as = dentry->d_inode->i_mapping; | ||
66 | struct page *pg; | ||
67 | |||
68 | pg = read_cache_page(as, 0, (filler_t *)as->a_ops->readpage, NULL); | ||
69 | /* read_cache_page() failed outright: hand its ERR_PTR back as-is */ | ||
70 | if (IS_ERR(pg)) | ||
71 | return (char*)pg; | ||
72 | |||
73 | wait_on_page_locked(pg); | ||
74 | if (!PageUptodate(pg)) { | ||
75 | /* the read finished but the page never became uptodate */ | ||
76 | page_cache_release(pg); | ||
77 | return ERR_PTR(-EIO); | ||
78 | } | ||
79 | |||
80 | /* on success the caller gets the still-mapped page through *ppage */ | ||
81 | *ppage = pg; | ||
82 | return kmap(pg); | ||
83 | } | ||
84 | |||
85 | /* Return a pointer to the symlink text held in the inode block itself | ||
86 | * (id2.i_symlink). The pointer aims into *bh's data, so the caller | ||
87 | * must brelse(*bh) once done; an ERR_PTR() comes back on read failure. */ | ||
88 | static char *ocfs2_fast_symlink_getlink(struct inode *inode, | ||
89 | struct buffer_head **bh) | ||
90 | { | ||
91 | int status; | ||
92 | char *target; | ||
93 | struct ocfs2_dinode *di; | ||
94 | |||
95 | mlog_entry_void(); | ||
96 | |||
97 | status = ocfs2_read_block(OCFS2_SB(inode->i_sb), | ||
98 | OCFS2_I(inode)->ip_blkno, bh, | ||
99 | OCFS2_BH_CACHED, inode); | ||
100 | if (status >= 0) { | ||
101 | di = (struct ocfs2_dinode *) (*bh)->b_data; | ||
102 | target = (char *) di->id2.i_symlink; | ||
103 | } else { | ||
104 | mlog_errno(status); | ||
105 | target = ERR_PTR(status); | ||
106 | } | ||
107 | |||
108 | mlog_exit(status); | ||
109 | |||
110 | return target; | ||
111 | } | ||
112 | |||
113 | /* ->readlink for fast symlinks: copy the target string that lives in | ||
114 | * the inode block out to the user buffer via vfs_readlink(). */ | ||
115 | static int ocfs2_readlink(struct dentry *dentry, | ||
116 | char __user *buffer, | ||
117 | int buflen) | ||
118 | { | ||
119 | int ret; | ||
120 | char *target; | ||
121 | struct buffer_head *di_bh = NULL; | ||
122 | |||
123 | mlog_entry_void(); | ||
124 | |||
125 | target = ocfs2_fast_symlink_getlink(dentry->d_inode, &di_bh); | ||
126 | if (!IS_ERR(target)) { | ||
127 | ret = vfs_readlink(dentry, buffer, buflen, target); | ||
128 | /* target points into di_bh, so drop it only after the copy */ | ||
129 | brelse(di_bh); | ||
130 | } else { | ||
131 | ret = PTR_ERR(target); | ||
132 | } | ||
133 | |||
134 | mlog_exit(ret); | ||
135 | return ret; | ||
136 | } | ||
137 | |||
138 | /* ->follow_link for both symlink flavours. Fetch the target from the | ||
139 | * inode block (fast symlink) or from the page cache, feed it to | ||
140 | * vfs_follow_link(), then drop whichever buffer or page backed it. */ | ||
141 | static void *ocfs2_follow_link(struct dentry *dentry, | ||
142 | struct nameidata *nd) | ||
143 | { | ||
144 | int status; | ||
145 | char *target; | ||
146 | struct page *pg = NULL; | ||
147 | struct buffer_head *di_bh = NULL; | ||
148 | struct inode *inode = dentry->d_inode; | ||
149 | |||
150 | target = ocfs2_inode_is_fast_symlink(inode) ? | ||
151 | ocfs2_fast_symlink_getlink(inode, &di_bh) : | ||
152 | ocfs2_page_getlink(dentry, &pg); | ||
153 | if (!IS_ERR(target)) | ||
154 | status = vfs_follow_link(nd, target); | ||
155 | else | ||
156 | status = PTR_ERR(target); | ||
157 | /* a failure from either step is logged; a zero status is not */ | ||
158 | if (status) | ||
159 | mlog_errno(status); | ||
160 | |||
161 | if (pg) { | ||
162 | kunmap(pg); | ||
163 | page_cache_release(pg); | ||
164 | } | ||
165 | if (di_bh) | ||
166 | brelse(di_bh); | ||
167 | |||
168 | return ERR_PTR(status); | ||
169 | } | ||
170 | |||
171 | struct inode_operations ocfs2_symlink_inode_operations = { | ||
172 | .readlink = page_readlink, | ||
173 | .follow_link = ocfs2_follow_link, /* shared: picks the fast or page path itself */ | ||
174 | .getattr = ocfs2_getattr, | ||
175 | }; | ||
176 | struct inode_operations ocfs2_fast_symlink_inode_operations = { | ||
177 | .readlink = ocfs2_readlink, /* reads the target out of the inode block */ | ||
178 | .follow_link = ocfs2_follow_link, /* shared: picks the fast or page path itself */ | ||
179 | .getattr = ocfs2_getattr, | ||
180 | }; | ||
diff --git a/fs/ocfs2/symlink.h b/fs/ocfs2/symlink.h new file mode 100644 index 000000000000..1ea9e4d9e9eb --- /dev/null +++ b/fs/ocfs2/symlink.h | |||
@@ -0,0 +1,42 @@ | |||
1 | /* -*- mode: c; c-basic-offset: 8; -*- | ||
2 | * vim: noexpandtab sw=8 ts=8 sts=0: | ||
3 | * | ||
4 | * symlink.h | ||
5 | * | ||
6 | * Function prototypes | ||
7 | * | ||
8 | * Copyright (C) 2002, 2004 Oracle. All rights reserved. | ||
9 | * | ||
10 | * This program is free software; you can redistribute it and/or | ||
11 | * modify it under the terms of the GNU General Public | ||
12 | * License as published by the Free Software Foundation; either | ||
13 | * version 2 of the License, or (at your option) any later version. | ||
14 | * | ||
15 | * This program is distributed in the hope that it will be useful, | ||
16 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
17 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | ||
18 | * General Public License for more details. | ||
19 | * | ||
20 | * You should have received a copy of the GNU General Public | ||
21 | * License along with this program; if not, write to the | ||
22 | * Free Software Foundation, Inc., 59 Temple Place - Suite 330, | ||
23 | * Boston, MA 02111-1307, USA. | ||
24 | */ | ||
25 | |||
26 | #ifndef OCFS2_SYMLINK_H | ||
27 | #define OCFS2_SYMLINK_H | ||
28 | |||
29 | extern struct inode_operations ocfs2_symlink_inode_operations; | ||
30 | extern struct inode_operations ocfs2_fast_symlink_inode_operations; | ||
31 | |||
32 | /* A fast symlink is a symlink inode that owns no blocks (i_blocks == 0). */ | ||
33 | static inline int ocfs2_inode_is_fast_symlink(struct inode *inode) | ||
34 | { | ||
35 | if (!S_ISLNK(inode->i_mode)) | ||
36 | return 0; | ||
37 | |||
38 | return inode->i_blocks == 0; | ||
39 | } | ||
40 | |||
41 | |||
42 | #endif /* OCFS2_SYMLINK_H */ | ||
diff --git a/fs/ocfs2/sysfile.c b/fs/ocfs2/sysfile.c new file mode 100644 index 000000000000..600a8bc5b541 --- /dev/null +++ b/fs/ocfs2/sysfile.c | |||
@@ -0,0 +1,131 @@ | |||
1 | /* -*- mode: c; c-basic-offset: 8; -*- | ||
2 | * vim: noexpandtab sw=8 ts=8 sts=0: | ||
3 | * | ||
4 | * sysfile.c | ||
5 | * | ||
6 | * Initialize, read, write, etc. system files. | ||
7 | * | ||
8 | * Copyright (C) 2002, 2004 Oracle. All rights reserved. | ||
9 | * | ||
10 | * This program is free software; you can redistribute it and/or | ||
11 | * modify it under the terms of the GNU General Public | ||
12 | * License as published by the Free Software Foundation; either | ||
13 | * version 2 of the License, or (at your option) any later version. | ||
14 | * | ||
15 | * This program is distributed in the hope that it will be useful, | ||
16 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
17 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | ||
18 | * General Public License for more details. | ||
19 | * | ||
20 | * You should have received a copy of the GNU General Public | ||
21 | * License along with this program; if not, write to the | ||
22 | * Free Software Foundation, Inc., 59 Temple Place - Suite 330, | ||
23 | * Boston, MA 02111-1307, USA. | ||
24 | */ | ||
25 | |||
26 | #include <linux/fs.h> | ||
27 | #include <linux/types.h> | ||
28 | #include <linux/slab.h> | ||
29 | #include <linux/highmem.h> | ||
30 | |||
31 | #include "ocfs2.h" | ||
32 | |||
33 | #define MLOG_MASK_PREFIX ML_INODE | ||
34 | #include <cluster/masklog.h> | ||
35 | |||
36 | #include "alloc.h" | ||
37 | #include "dir.h" | ||
38 | #include "inode.h" | ||
39 | #include "journal.h" | ||
40 | #include "sysfile.h" | ||
41 | |||
42 | #include "buffer_head_io.h" | ||
43 | |||
44 | static struct inode * _ocfs2_get_system_file_inode(struct ocfs2_super *osb, | ||
45 | int type, | ||
46 | u32 slot); | ||
47 | |||
48 | static inline int is_global_system_inode(int type); | ||
49 | static inline int is_in_system_inode_array(struct ocfs2_super *osb, | ||
50 | int type, | ||
51 | u32 slot); | ||
52 | /* Global system inode types form one contiguous, inclusive range. */ | ||
53 | static inline int is_global_system_inode(int type) | ||
54 | { | ||
55 | return !(type < OCFS2_FIRST_ONLINE_SYSTEM_INODE || | ||
56 | type > OCFS2_LAST_GLOBAL_SYSTEM_INODE); | ||
57 | } | ||
58 | /* True when (type, slot) is one this node keeps in osb->system_inodes. */ | ||
59 | static inline int is_in_system_inode_array(struct ocfs2_super *osb, | ||
60 | int type, | ||
61 | u32 slot) | ||
62 | { | ||
63 | return is_global_system_inode(type) || slot == osb->slot_num; | ||
64 | } | ||
65 | |||
66 | struct inode *ocfs2_get_system_file_inode(struct ocfs2_super *osb, | ||
67 | int type, | ||
68 | u32 slot) | ||
69 | { | ||
70 | struct inode *inode; | ||
71 | struct inode **cache_slot = NULL; | ||
72 | |||
73 | /* Only some inodes are kept in osb->system_inodes; for the rest | ||
74 | * cache_slot stays NULL and every call does a full lookup. */ | ||
75 | if (is_in_system_inode_array(osb, type, slot)) { | ||
76 | cache_slot = &(osb->system_inodes[type]); | ||
77 | inode = *cache_slot; | ||
78 | /* Cache hit: the array keeps its ref, the caller gets its own. */ | ||
79 | if (inode != NULL) { | ||
80 | inode = igrab(inode); | ||
81 | BUG_ON(!inode); | ||
82 | return inode; | ||
83 | } | ||
84 | } | ||
85 | |||
86 | /* Cache miss (or uncached type): the lookup returns one ref. */ | ||
87 | inode = _ocfs2_get_system_file_inode(osb, type, slot); | ||
88 | if (!inode || !cache_slot) | ||
89 | return inode; | ||
90 | |||
91 | /* First time in the array: take a second ref for the array. */ | ||
92 | *cache_slot = igrab(inode); | ||
93 | BUG_ON(!*cache_slot); | ||
94 | |||
95 | return inode; | ||
96 | } | ||
97 | |||
98 | /* Build the system file's name from (type, slot), look that name up | ||
99 | * under osb->sys_root_inode and iget the block number found there. | ||
100 | * Returns NULL when either the lookup or the iget fails. */ | ||
101 | static struct inode * _ocfs2_get_system_file_inode(struct ocfs2_super *osb, | ||
102 | int type, | ||
103 | u32 slot) | ||
104 | { | ||
105 | int ret; | ||
106 | u64 blkno; | ||
107 | char name[40]; | ||
108 | struct inode *found = NULL; | ||
109 | struct buffer_head *de_bh = NULL; | ||
110 | struct ocfs2_dir_entry *dirent = NULL; | ||
111 | |||
112 | ocfs2_sprintf_system_inode_name(name, sizeof(name), type, slot); | ||
113 | |||
114 | ret = ocfs2_find_files_on_disk(name, strlen(name), &blkno, | ||
115 | osb->sys_root_inode, &de_bh, &dirent); | ||
116 | if (ret >= 0) { | ||
117 | found = ocfs2_iget(osb, blkno); | ||
118 | if (IS_ERR(found)) { | ||
119 | /* log the errno, then report the failure as plain NULL */ | ||
120 | mlog_errno(PTR_ERR(found)); | ||
121 | found = NULL; | ||
122 | } | ||
123 | } | ||
124 | |||
125 | /* drop the directory buffer the lookup may have handed back */ | ||
126 | if (de_bh) | ||
127 | brelse(de_bh); | ||
128 | |||
129 | return found; | ||
130 | } | ||
131 | |||
diff --git a/fs/ocfs2/sysfile.h b/fs/ocfs2/sysfile.h new file mode 100644 index 000000000000..cc9ea661ffc1 --- /dev/null +++ b/fs/ocfs2/sysfile.h | |||
@@ -0,0 +1,33 @@ | |||
1 | /* -*- mode: c; c-basic-offset: 8; -*- | ||
2 | * vim: noexpandtab sw=8 ts=8 sts=0: | ||
3 | * | ||
4 | * sysfile.h | ||
5 | * | ||
6 | * Function prototypes | ||
7 | * | ||
8 | * Copyright (C) 2002, 2004 Oracle. All rights reserved. | ||
9 | * | ||
10 | * This program is free software; you can redistribute it and/or | ||
11 | * modify it under the terms of the GNU General Public | ||
12 | * License as published by the Free Software Foundation; either | ||
13 | * version 2 of the License, or (at your option) any later version. | ||
14 | * | ||
15 | * This program is distributed in the hope that it will be useful, | ||
16 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
17 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | ||
18 | * General Public License for more details. | ||
19 | * | ||
20 | * You should have received a copy of the GNU General Public | ||
21 | * License along with this program; if not, write to the | ||
22 | * Free Software Foundation, Inc., 59 Temple Place - Suite 330, | ||
23 | * Boston, MA 02111-1307, USA. | ||
24 | */ | ||
25 | |||
26 | #ifndef OCFS2_SYSFILE_H | ||
27 | #define OCFS2_SYSFILE_H | ||
28 | |||
29 | struct inode * ocfs2_get_system_file_inode(struct ocfs2_super *osb, | ||
30 | int type, | ||
31 | u32 slot); | ||
32 | |||
33 | #endif /* OCFS2_SYSFILE_H */ | ||
diff --git a/fs/ocfs2/uptodate.c b/fs/ocfs2/uptodate.c new file mode 100644 index 000000000000..3a0458fd3e1b --- /dev/null +++ b/fs/ocfs2/uptodate.c | |||
@@ -0,0 +1,544 @@ | |||
1 | /* -*- mode: c; c-basic-offset: 8; -*- | ||
2 | * vim: noexpandtab sw=8 ts=8 sts=0: | ||
3 | * | ||
4 | * uptodate.c | ||
5 | * | ||
6 | * Tracking the up-to-date-ness of a local buffer_head with respect to | ||
7 | * the cluster. | ||
8 | * | ||
9 | * Copyright (C) 2002, 2004, 2005 Oracle. All rights reserved. | ||
10 | * | ||
11 | * This program is free software; you can redistribute it and/or | ||
12 | * modify it under the terms of the GNU General Public | ||
13 | * License as published by the Free Software Foundation; either | ||
14 | * version 2 of the License, or (at your option) any later version. | ||
15 | * | ||
16 | * This program is distributed in the hope that it will be useful, | ||
17 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
18 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | ||
19 | * General Public License for more details. | ||
20 | * | ||
21 | * You should have received a copy of the GNU General Public | ||
22 | * License along with this program; if not, write to the | ||
23 | * Free Software Foundation, Inc., 59 Temple Place - Suite 330, | ||
24 | * Boston, MA 02111-1307, USA. | ||
25 | * | ||
26 | * Standard buffer head caching flags (uptodate, etc) are insufficient | ||
27 | * in a clustered environment - a buffer may be marked up to date on | ||
28 | * our local node but could have been modified by another cluster | ||
29 | * member. As a result an additional (and performant) caching scheme | ||
30 | * is required. A further requirement is that we consume as little | ||
31 | * memory as possible - we never pin buffer_head structures in order | ||
32 | * to cache them. | ||
33 | * | ||
34 | * We track the existence of up to date buffers on the inodes which | ||
35 | * are associated with them. Because we don't want to pin | ||
36 | * buffer_heads, this is only a (strong) hint and several other checks | ||
37 | * are made in the I/O path to ensure that we don't use a stale or | ||
38 | * invalid buffer without going to disk: | ||
39 | * - buffer_jbd is used liberally - if a bh is in the journal on | ||
40 | * this node then it *must* be up to date. | ||
41 | * - the standard buffer_uptodate() macro is used to detect buffers | ||
42 | * which may be invalid (even if we have an up to date tracking | ||
43 | * item for them) | ||
44 | * | ||
45 | * For a full understanding of how this code works together, one | ||
46 | * should read the callers in dlmglue.c, the I/O functions in | ||
47 | * buffer_head_io.c and ocfs2_journal_access in journal.c | ||
48 | */ | ||
49 | |||
50 | #include <linux/fs.h> | ||
51 | #include <linux/types.h> | ||
52 | #include <linux/slab.h> | ||
53 | #include <linux/highmem.h> | ||
54 | #include <linux/buffer_head.h> | ||
55 | #include <linux/rbtree.h> | ||
56 | #include <linux/jbd.h> | ||
57 | |||
58 | #define MLOG_MASK_PREFIX ML_UPTODATE | ||
59 | |||
60 | #include <cluster/masklog.h> | ||
61 | |||
62 | #include "ocfs2.h" | ||
63 | |||
64 | #include "inode.h" | ||
65 | #include "uptodate.h" | ||
66 | |||
67 | struct ocfs2_meta_cache_item { | ||
68 | struct rb_node c_node; /* links the item into ci_cache.ci_tree */ | ||
69 | sector_t c_block; /* cached block number; the tree's sort key */ | ||
70 | }; | ||
71 | |||
72 | static kmem_cache_t *ocfs2_uptodate_cachep = NULL; | ||
73 | |||
74 | /* (Re)set the inode's metadata cache to an empty inline array. */ | ||
75 | void ocfs2_metadata_cache_init(struct inode *inode) | ||
76 | { | ||
77 | struct ocfs2_inode_info *info = OCFS2_I(inode); | ||
78 | |||
79 | info->ip_flags |= OCFS2_INODE_CACHE_INLINE; | ||
80 | info->ip_metadata_cache.ci_num_cached = 0; | ||
81 | } | ||
82 | |||
83 | /* Free every item of a tree that was detached from its inode. No lock | ||
84 | * is taken because 'root' is not expected to be visible to any other | ||
85 | * process. Returns how many items were freed. */ | ||
86 | static unsigned int ocfs2_purge_copied_metadata_tree(struct rb_root *root) | ||
87 | { | ||
88 | unsigned int count; | ||
89 | struct rb_node *last; | ||
90 | struct ocfs2_meta_cache_item *victim; | ||
91 | |||
92 | for (count = 0; (last = rb_last(root)) != NULL; count++) { | ||
93 | victim = rb_entry(last, struct ocfs2_meta_cache_item, c_node); | ||
94 | |||
95 | mlog(0, "Purge item %llu\n", | ||
96 | (unsigned long long) victim->c_block); | ||
97 | |||
98 | rb_erase(&victim->c_node, root); | ||
99 | kmem_cache_free(ocfs2_uptodate_cachep, victim); | ||
100 | } | ||
101 | |||
102 | return count; | ||
103 | } | ||
104 | |||
105 | /* Called from locking and called from ocfs2_clear_inode. Dump the | ||
106 | * cache for a given inode, leaving it an empty inline array; tree | ||
107 | * items are freed only after ip_lock has been dropped. | ||
108 | * | ||
109 | * The purged-vs-counted accounting makes this a few lines longer than | ||
110 | * necessary, but it helps track down counting bugs sooner -- Mark */ | ||
111 | void ocfs2_metadata_cache_purge(struct inode *inode) | ||
112 | { | ||
113 | struct ocfs2_inode_info *oi = OCFS2_I(inode); | ||
114 | unsigned int tree, to_purge, purged; | ||
115 | struct ocfs2_caching_info *ci = &oi->ip_metadata_cache; | ||
116 | struct rb_root root = RB_ROOT; | ||
117 | |||
118 | spin_lock(&oi->ip_lock); | ||
119 | tree = !(oi->ip_flags & OCFS2_INODE_CACHE_INLINE); | ||
120 | to_purge = ci->ci_num_cached; | ||
121 | |||
122 | mlog(0, "Purge %u %s items from Inode %"MLFu64"\n", to_purge, | ||
123 | tree ? "tree" : "array", oi->ip_blkno); | ||
124 | |||
125 | /* If we're a tree, save off the root so that we can safely | ||
126 | * initialize the cache. We do the work to free tree members | ||
127 | * without the spinlock. */ | ||
128 | if (tree) | ||
129 | root = ci->ci_cache.ci_tree; | ||
130 | |||
131 | ocfs2_metadata_cache_init(inode); | ||
132 | spin_unlock(&oi->ip_lock); | ||
133 | |||
134 | purged = ocfs2_purge_copied_metadata_tree(&root); | ||
135 | /* If possible, track the number wiped so that we can more | ||
136 | * easily detect counting errors. Unfortunately, this is only | ||
137 | * meaningful for trees. */ | ||
138 | if (tree && purged != to_purge) | ||
139 | mlog(ML_ERROR, "Inode %"MLFu64", count = %u, purged = %u\n", | ||
140 | oi->ip_blkno, to_purge, purged); | ||
141 | } | ||
142 | |||
143 | /* Linear scan of the inline array: the index of 'item', or -1 when it | ||
144 | * is not cached. Requires ip_lock. */ | ||
145 | static int ocfs2_search_cache_array(struct ocfs2_caching_info *ci, | ||
146 | sector_t item) | ||
147 | { | ||
148 | int pos = 0; | ||
149 | |||
150 | while (pos < ci->ci_num_cached) { | ||
151 | if (ci->ci_cache.ci_array[pos] == item) | ||
152 | return pos; | ||
153 | pos++; | ||
154 | } | ||
155 | return -1; | ||
156 | } | ||
157 | |||
158 | /* Binary search of the rb-tree keyed on c_block: the matching item, or | ||
159 | * NULL when 'block' is not cached. Requires ip_lock. */ | ||
160 | static struct ocfs2_meta_cache_item * | ||
161 | ocfs2_search_cache_tree(struct ocfs2_caching_info *ci, | ||
162 | sector_t block) | ||
163 | { | ||
164 | struct rb_node *cur; | ||
165 | struct ocfs2_meta_cache_item *cand; | ||
166 | |||
167 | for (cur = ci->ci_cache.ci_tree.rb_node; cur != NULL; ) { | ||
168 | cand = rb_entry(cur, struct ocfs2_meta_cache_item, c_node); | ||
169 | |||
170 | if (block == cand->c_block) | ||
171 | return cand; | ||
172 | |||
173 | /* smaller keys hang off the left child, larger off the right */ | ||
174 | cur = (block < cand->c_block) ? cur->rb_left | ||
175 | : cur->rb_right; | ||
176 | } | ||
177 | |||
178 | return NULL; | ||
179 | } | ||
180 | |||
181 | /* Is bh's block number recorded in the inode's metadata cache? Searches | ||
182 | * the inline array or the tree, whichever is in use, under ip_lock. */ | ||
183 | static int ocfs2_buffer_cached(struct ocfs2_inode_info *oi, | ||
184 | struct buffer_head *bh) | ||
185 | { | ||
186 | int idx = -1; | ||
187 | struct ocfs2_meta_cache_item *hit = NULL; | ||
188 | struct ocfs2_caching_info *ci = &oi->ip_metadata_cache; | ||
189 | |||
190 | spin_lock(&oi->ip_lock); | ||
191 | mlog(0, "Inode %"MLFu64", query block %llu (inline = %u)\n", | ||
192 | oi->ip_blkno, (unsigned long long) bh->b_blocknr, | ||
193 | !!(oi->ip_flags & OCFS2_INODE_CACHE_INLINE)); | ||
194 | |||
195 | if (!(oi->ip_flags & OCFS2_INODE_CACHE_INLINE)) | ||
196 | hit = ocfs2_search_cache_tree(ci, bh->b_blocknr); | ||
197 | else | ||
198 | idx = ocfs2_search_cache_array(ci, bh->b_blocknr); | ||
199 | |||
200 | spin_unlock(&oi->ip_lock); | ||
201 | |||
202 | mlog(0, "index = %d, item = %p\n", idx, hit); | ||
203 | |||
204 | return !(idx == -1 && hit == NULL); | ||
205 | } | ||
206 | |||
207 | /* Decide whether the local copy of bh can be trusted without going | ||
208 | * to disk. Beware: a true return does *not* guarantee that the block | ||
209 | * is recorded in the inode's metadata cache (see the jbd case). */ | ||
210 | int ocfs2_buffer_uptodate(struct inode *inode, | ||
211 | struct buffer_head *bh) | ||
212 | { | ||
213 | /* A buffer not even marked uptodate locally cannot hold correct | ||
214 | * data, whether or not it is in our cache. */ | ||
215 | if (buffer_uptodate(bh)) { | ||
216 | /* OCFS2 does not allow multiple nodes to be changing the | ||
217 | * same block at the same time. */ | ||
218 | if (buffer_jbd(bh)) | ||
219 | return 1; | ||
220 | |||
221 | /* Locally uptodate: the cache says if that can be trusted. */ | ||
222 | return ocfs2_buffer_cached(OCFS2_I(inode), bh); | ||
223 | } | ||
224 | |||
225 | return 0; | ||
226 | } | ||
227 | |||
228 | /* Store 'block' in the first free slot of the inline array; a full | ||
229 | * array here is a bug. Requires ip_lock. */ | ||
230 | static void ocfs2_append_cache_array(struct ocfs2_caching_info *ci, | ||
231 | sector_t block) | ||
232 | { | ||
233 | BUG_ON(ci->ci_num_cached >= OCFS2_INODE_MAX_CACHE_ARRAY); | ||
234 | |||
235 | mlog(0, "block %llu takes position %u\n", (unsigned long long) block, | ||
236 | ci->ci_num_cached); | ||
237 | |||
238 | ci->ci_cache.ci_array[ci->ci_num_cached++] = block; | ||
239 | } | ||
240 | |||
241 | /* Link 'new' into the rb-tree at the spot given by its c_block and bump | ||
242 | * ci_num_cached. The caller must already know the block is *not* in the | ||
243 | * tree - hitting a duplicate is treated as a bug. Requires ip_lock. */ | ||
244 | static void __ocfs2_insert_cache_tree(struct ocfs2_caching_info *ci, | ||
245 | struct ocfs2_meta_cache_item *new) | ||
246 | { | ||
247 | sector_t key = new->c_block; | ||
248 | struct rb_node *parent = NULL; | ||
249 | struct rb_node **link = &ci->ci_cache.ci_tree.rb_node; | ||
250 | struct ocfs2_meta_cache_item *cur; | ||
251 | |||
252 | mlog(0, "Insert block %llu num = %u\n", (unsigned long long) key, | ||
253 | ci->ci_num_cached); | ||
254 | |||
255 | /* walk down to the empty child pointer where 'new' belongs */ | ||
256 | while (*link != NULL) { | ||
257 | parent = *link; | ||
258 | cur = rb_entry(parent, struct ocfs2_meta_cache_item, c_node); | ||
259 | |||
260 | if (key == cur->c_block) { | ||
261 | /* This should never happen! */ | ||
262 | mlog(ML_ERROR, "Duplicate block %llu cached!\n", | ||
263 | (unsigned long long) key); | ||
264 | BUG(); | ||
265 | } else if (key < cur->c_block) { | ||
266 | link = &parent->rb_left; | ||
267 | } else { | ||
268 | link = &parent->rb_right; | ||
269 | } | ||
270 | } | ||
271 | |||
272 | rb_link_node(&new->c_node, parent, link); | ||
273 | rb_insert_color(&new->c_node, &ci->ci_cache.ci_tree); | ||
274 | ci->ci_num_cached++; | ||
275 | } | ||
276 | /* True while the cache is still an inline array with a free slot. */ | ||
277 | static inline int ocfs2_insert_can_use_array(struct ocfs2_inode_info *oi, | ||
278 | struct ocfs2_caching_info *ci) | ||
279 | { | ||
280 | assert_spin_locked(&oi->ip_lock); | ||
281 | if (!(oi->ip_flags & OCFS2_INODE_CACHE_INLINE)) | ||
282 | return 0; | ||
283 | return ci->ci_num_cached < OCFS2_INODE_MAX_CACHE_ARRAY; | ||
284 | } | ||
285 | |||
286 | /* Convert a full inline array cache into an rb-tree, under ip_lock. | ||
287 | * 'tree' must hold exactly OCFS2_INODE_MAX_CACHE_ARRAY preallocated | ||
288 | * items; each pointer is NULLed once used so the caller won't free it. */ | ||
289 | static void ocfs2_expand_cache(struct ocfs2_inode_info *oi, | ||
290 | struct ocfs2_meta_cache_item **tree) | ||
291 | { | ||
292 | int i; | ||
293 | struct ocfs2_caching_info *ci = &oi->ip_metadata_cache; | ||
294 | |||
295 | mlog_bug_on_msg(ci->ci_num_cached != OCFS2_INODE_MAX_CACHE_ARRAY, | ||
296 | "Inode %"MLFu64", num cached = %u, should be %u\n", | ||
297 | oi->ip_blkno, ci->ci_num_cached, | ||
298 | OCFS2_INODE_MAX_CACHE_ARRAY); | ||
299 | mlog_bug_on_msg(!(oi->ip_flags & OCFS2_INODE_CACHE_INLINE), | ||
300 | "Inode %"MLFu64" not marked as inline anymore!\n", | ||
301 | oi->ip_blkno); | ||
302 | assert_spin_locked(&oi->ip_lock); | ||
303 | |||
304 | /* Copy the block numbers into the items *first*: once ci_tree is | ||
305 | * initialized the array is junk (presumably they share storage). */ | ||
306 | for(i = 0; i < OCFS2_INODE_MAX_CACHE_ARRAY; i++) | ||
307 | tree[i]->c_block = ci->ci_cache.ci_array[i]; | ||
308 | |||
309 | oi->ip_flags &= ~OCFS2_INODE_CACHE_INLINE; | ||
310 | ci->ci_cache.ci_tree = RB_ROOT; | ||
311 | /* restart the count; each __ocfs2_insert_cache_tree call bumps it */ | ||
312 | ci->ci_num_cached = 0; | ||
313 | |||
314 | for(i = 0; i < OCFS2_INODE_MAX_CACHE_ARRAY; i++) { | ||
315 | __ocfs2_insert_cache_tree(ci, tree[i]); | ||
316 | tree[i] = NULL; | ||
317 | } | ||
318 | |||
319 | mlog(0, "Expanded %"MLFu64" to a tree cache: flags 0x%x, num = %u\n", | ||
320 | oi->ip_blkno, oi->ip_flags, ci->ci_num_cached); | ||
321 | } | ||
322 | |||
323 | /* Slow path: allocates the item(s) with GFP_KERNEL before taking ip_lock, | ||
324 | * then inserts 'block'. See the comment above ocfs2_set_buffer_uptodate. */ | ||
325 | static void __ocfs2_set_buffer_uptodate(struct ocfs2_inode_info *oi, | ||
326 | sector_t block, | ||
327 | int expand_tree) | ||
328 | { | ||
329 | int i; | ||
330 | struct ocfs2_caching_info *ci = &oi->ip_metadata_cache; | ||
331 | struct ocfs2_meta_cache_item *new = NULL; | ||
332 | struct ocfs2_meta_cache_item *tree[OCFS2_INODE_MAX_CACHE_ARRAY] = | ||
333 | { NULL, }; | ||
334 | |||
335 | mlog(0, "Inode %"MLFu64", block %llu, expand = %d\n", | ||
336 | oi->ip_blkno, (unsigned long long) block, expand_tree); | ||
337 | |||
338 | new = kmem_cache_alloc(ocfs2_uptodate_cachep, GFP_KERNEL); | ||
339 | if (!new) { | ||
340 | mlog_errno(-ENOMEM); | ||
341 | return; | ||
342 | } | ||
343 | new->c_block = block; | ||
344 | |||
345 | if (expand_tree) { | ||
346 | /* Do *not* allocate an array here - the removal code | ||
347 | * has no way of tracking that. */ | ||
348 | for(i = 0; i < OCFS2_INODE_MAX_CACHE_ARRAY; i++) { | ||
349 | tree[i] = kmem_cache_alloc(ocfs2_uptodate_cachep, | ||
350 | GFP_KERNEL); | ||
351 | if (!tree[i]) { | ||
352 | mlog_errno(-ENOMEM); | ||
353 | goto out_free; | ||
354 | } | ||
355 | |||
356 | /* These are initialized in ocfs2_expand_cache! */ | ||
357 | } | ||
358 | } | ||
359 | |||
360 | spin_lock(&oi->ip_lock); | ||
361 | if (ocfs2_insert_can_use_array(oi, ci)) { | ||
362 | mlog(0, "Someone cleared the tree underneath us\n"); | ||
363 | /* Ok, items were removed from the cache in between | ||
364 | * locks. Detect this and revert back to the fast path */ | ||
365 | ocfs2_append_cache_array(ci, block); | ||
366 | spin_unlock(&oi->ip_lock); | ||
367 | goto out_free; | ||
368 | } | ||
369 | |||
370 | if (expand_tree) | ||
371 | ocfs2_expand_cache(oi, tree); | ||
372 | |||
373 | __ocfs2_insert_cache_tree(ci, new); | ||
374 | spin_unlock(&oi->ip_lock); | ||
375 | |||
376 | new = NULL; | ||
377 | out_free: | ||
378 | if (new) | ||
379 | kmem_cache_free(ocfs2_uptodate_cachep, new); | ||
380 | |||
381 | /* If these were used, then ocfs2_expand_cache re-set them to | ||
382 | * NULL for us, so anything still non-NULL here went unused. */ | ||
383 | if (tree[0]) { | ||
384 | for(i = 0; i < OCFS2_INODE_MAX_CACHE_ARRAY; i++) | ||
385 | if (tree[i]) | ||
386 | kmem_cache_free(ocfs2_uptodate_cachep, | ||
387 | tree[i]); | ||
388 | } | ||
389 | } | ||
390 | |||
391 | /* Item insertion is guarded by ip_io_sem, so the insertion path takes | ||
392 | * advantage of this by not rechecking for a duplicate insert during | ||
393 | * the slow case. Additionally, if the cache needs to be bumped up to | ||
394 | * a tree, the code will not recheck after acquiring the lock -- | ||
395 | * multiple paths cannot be expanding to a tree at the same time. | ||
396 | * | ||
397 | * The slow path takes into account that items can be removed | ||
398 | * (including the whole tree wiped and reset) when this process is out | ||
399 | * allocating memory. In those cases, it reverts back to the fast | ||
400 | * path. | ||
401 | * | ||
402 | * Note that this function may actually fail to insert the block if | ||
403 | * memory cannot be allocated. This is not fatal however (but may | ||
404 | * result in a performance penalty) */ | ||
405 | void ocfs2_set_buffer_uptodate(struct inode *inode, | ||
406 | struct buffer_head *bh) | ||
407 | { | ||
408 | int expand; | ||
409 | struct ocfs2_inode_info *oi = OCFS2_I(inode); | ||
410 | struct ocfs2_caching_info *ci = &oi->ip_metadata_cache; | ||
411 | |||
412 | /* The block may very well exist in our cache already, so avoid | ||
413 | * doing any more work in that case. */ | ||
414 | if (ocfs2_buffer_cached(oi, bh)) | ||
415 | return; | ||
416 | |||
417 | mlog(0, "Inode %"MLFu64", inserting block %llu\n", oi->ip_blkno, | ||
418 | (unsigned long long) bh->b_blocknr); | ||
419 | |||
420 | /* No need to recheck under spinlock - insertion is guarded by | ||
421 | * ip_io_sem */ | ||
422 | spin_lock(&oi->ip_lock); | ||
423 | if (ocfs2_insert_can_use_array(oi, ci)) { | ||
424 | /* Fast case - it's an array and there's a free | ||
425 | * spot. */ | ||
426 | ocfs2_append_cache_array(ci, bh->b_blocknr); | ||
427 | spin_unlock(&oi->ip_lock); | ||
428 | return; | ||
429 | } | ||
430 | |||
431 | expand = 0; | ||
432 | if (oi->ip_flags & OCFS2_INODE_CACHE_INLINE) { | ||
433 | /* Still inline, so the array is full: bump up to a tree. */ | ||
434 | expand = 1; | ||
435 | } | ||
436 | spin_unlock(&oi->ip_lock); | ||
437 | |||
438 | __ocfs2_set_buffer_uptodate(oi, bh->b_blocknr, expand); | ||
439 | } | ||
440 | |||
441 | /* Called against a newly allocated buffer: marks bh uptodate and records | ||
442 | * it in the inode's cache. Most likely nobody can read such metadata while | ||
443 | * it is still being allocated, but ip_io_sem is taken anyway. */ | ||
444 | void ocfs2_set_new_buffer_uptodate(struct inode *inode, | ||
445 | struct buffer_head *bh) | ||
446 | { | ||
447 | struct ocfs2_inode_info *oi = OCFS2_I(inode); | ||
448 | |||
449 | /* This should definitely *not* exist in our cache */ | ||
450 | BUG_ON(ocfs2_buffer_cached(oi, bh)); | ||
451 | |||
452 | set_buffer_uptodate(bh); | ||
453 | |||
454 | down(&oi->ip_io_sem); | ||
455 | ocfs2_set_buffer_uptodate(inode, bh); | ||
456 | up(&oi->ip_io_sem); | ||
457 | } | ||
458 | |||
459 | /* Drop the entry at 'index' from the inline array, sliding any later | ||
460 | * entries down one slot so the array stays dense. Requires ip_lock. */ | ||
461 | static void ocfs2_remove_metadata_array(struct ocfs2_caching_info *ci, | ||
462 | int index) | ||
463 | { | ||
464 | sector_t *array = ci->ci_cache.ci_array; | ||
465 | int bytes; | ||
466 | |||
467 | BUG_ON(index < 0 || index >= OCFS2_INODE_MAX_CACHE_ARRAY); | ||
468 | BUG_ON(index >= ci->ci_num_cached); | ||
469 | BUG_ON(!ci->ci_num_cached); | ||
470 | |||
471 | mlog(0, "remove index %d (num_cached = %u)\n", index, | ||
472 | ci->ci_num_cached); | ||
473 | ci->ci_num_cached--; | ||
474 | |||
475 | /* don't need to copy if the array is now empty, or if we | ||
476 | * removed at the tail */ | ||
477 | if (ci->ci_num_cached && index < ci->ci_num_cached) { | ||
478 | bytes = sizeof(sector_t) * (ci->ci_num_cached - index); | ||
479 | memmove(&array[index], &array[index + 1], bytes); | ||
480 | } | ||
481 | } | ||
482 | |||
483 | /* Unlink 'item' from the tree and drop the cached count; the item is | ||
484 | * not freed here. Requires ip_lock. */ | ||
485 | static void ocfs2_remove_metadata_tree(struct ocfs2_caching_info *ci, | ||
486 | struct ocfs2_meta_cache_item *item) | ||
487 | { | ||
488 | mlog(0, "remove block %llu from tree\n", | ||
489 | (unsigned long long) item->c_block); | ||
490 | ci->ci_num_cached--; | ||
491 | rb_erase(&item->c_node, &ci->ci_cache.ci_tree); | ||
492 | } | ||
493 | |||
494 | /* Forget bh's block when a chunk of metadata is removed from an inode. | ||
495 | * A tree that shrinks back under the inline limit stays a tree - we | ||
496 | * don't bother reverting it to an array. */ | ||
497 | void ocfs2_remove_from_cache(struct inode *inode, | ||
498 | struct buffer_head *bh) | ||
499 | { | ||
500 | int pos; | ||
501 | sector_t blkno = bh->b_blocknr; | ||
502 | struct ocfs2_meta_cache_item *gone = NULL; | ||
503 | struct ocfs2_inode_info *oi = OCFS2_I(inode); | ||
504 | struct ocfs2_caching_info *ci = &oi->ip_metadata_cache; | ||
505 | |||
506 | spin_lock(&oi->ip_lock); | ||
507 | mlog(0, "Inode %"MLFu64", remove %llu, items = %u, array = %u\n", | ||
508 | oi->ip_blkno, (unsigned long long) blkno, ci->ci_num_cached, | ||
509 | oi->ip_flags & OCFS2_INODE_CACHE_INLINE); | ||
510 | |||
511 | if (!(oi->ip_flags & OCFS2_INODE_CACHE_INLINE)) { | ||
512 | gone = ocfs2_search_cache_tree(ci, blkno); | ||
513 | if (gone) | ||
514 | ocfs2_remove_metadata_tree(ci, gone); | ||
515 | } else { | ||
516 | pos = ocfs2_search_cache_array(ci, blkno); | ||
517 | if (pos != -1) | ||
518 | ocfs2_remove_metadata_array(ci, pos); | ||
519 | } | ||
520 | spin_unlock(&oi->ip_lock); | ||
521 | /* an unlinked tree item is freed only once the spinlock is dropped */ | ||
522 | if (gone) | ||
523 | kmem_cache_free(ocfs2_uptodate_cachep, gone); | ||
524 | } | ||
525 | /* Create the slab cache for struct ocfs2_meta_cache_item; -ENOMEM on failure. */ | ||
526 | int __init init_ocfs2_uptodate_cache(void) | ||
527 | { | ||
528 | ocfs2_uptodate_cachep = kmem_cache_create("ocfs2_uptodate", | ||
529 | sizeof(struct ocfs2_meta_cache_item), | ||
530 | 0, SLAB_HWCACHE_ALIGN, NULL, NULL); | ||
531 | if (ocfs2_uptodate_cachep) { | ||
532 | mlog(0, "%u inlined cache items per inode.\n", | ||
533 | OCFS2_INODE_MAX_CACHE_ARRAY); | ||
534 | return 0; | ||
535 | } | ||
536 | |||
537 | return -ENOMEM; | ||
538 | } | ||
539 | /* Destroy the item slab cache, provided it was ever created. */ | ||
540 | void __exit exit_ocfs2_uptodate_cache(void) | ||
541 | { | ||
542 | if (!ocfs2_uptodate_cachep) | ||
543 | return; | ||
544 | kmem_cache_destroy(ocfs2_uptodate_cachep); | ||
545 | } | ||
diff --git a/fs/ocfs2/uptodate.h b/fs/ocfs2/uptodate.h new file mode 100644 index 000000000000..e5aacdf4eabf --- /dev/null +++ b/fs/ocfs2/uptodate.h | |||
@@ -0,0 +1,44 @@ | |||
1 | /* -*- mode: c; c-basic-offset: 8; -*- | ||
2 | * vim: noexpandtab sw=8 ts=8 sts=0: | ||
3 | * | ||
4 | * uptodate.h | ||
5 | * | ||
6 | * Cluster uptodate tracking | ||
7 | * | ||
8 | * Copyright (C) 2002, 2004, 2005 Oracle. All rights reserved. | ||
9 | * | ||
10 | * This program is free software; you can redistribute it and/or | ||
11 | * modify it under the terms of the GNU General Public | ||
12 | * License as published by the Free Software Foundation; either | ||
13 | * version 2 of the License, or (at your option) any later version. | ||
14 | * | ||
15 | * This program is distributed in the hope that it will be useful, | ||
16 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
17 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | ||
18 | * General Public License for more details. | ||
19 | * | ||
20 | * You should have received a copy of the GNU General Public | ||
21 | * License along with this program; if not, write to the | ||
22 | * Free Software Foundation, Inc., 59 Temple Place - Suite 330, | ||
23 | * Boston, MA 02111-1307, USA. | ||
24 | */ | ||
25 | |||
26 | #ifndef OCFS2_UPTODATE_H | ||
27 | #define OCFS2_UPTODATE_H | ||
28 | |||
29 | int __init init_ocfs2_uptodate_cache(void); | ||
30 | void __exit exit_ocfs2_uptodate_cache(void); | ||
31 | |||
32 | void ocfs2_metadata_cache_init(struct inode *inode); | ||
33 | void ocfs2_metadata_cache_purge(struct inode *inode); | ||
34 | |||
35 | int ocfs2_buffer_uptodate(struct inode *inode, | ||
36 | struct buffer_head *bh); | ||
37 | void ocfs2_set_buffer_uptodate(struct inode *inode, | ||
38 | struct buffer_head *bh); | ||
39 | void ocfs2_set_new_buffer_uptodate(struct inode *inode, | ||
40 | struct buffer_head *bh); | ||
41 | void ocfs2_remove_from_cache(struct inode *inode, | ||
42 | struct buffer_head *bh); | ||
43 | |||
44 | #endif /* OCFS2_UPTODATE_H */ | ||
diff --git a/fs/ocfs2/ver.c b/fs/ocfs2/ver.c new file mode 100644 index 000000000000..5405ce121c99 --- /dev/null +++ b/fs/ocfs2/ver.c | |||
@@ -0,0 +1,43 @@ | |||
1 | /* -*- mode: c; c-basic-offset: 8; -*- | ||
2 | * vim: noexpandtab sw=8 ts=8 sts=0: | ||
3 | * | ||
4 | * ver.c | ||
5 | * | ||
6 | * version string | ||
7 | * | ||
8 | * Copyright (C) 2002, 2005 Oracle. All rights reserved. | ||
9 | * | ||
10 | * This program is free software; you can redistribute it and/or | ||
11 | * modify it under the terms of the GNU General Public | ||
12 | * License as published by the Free Software Foundation; either | ||
13 | * version 2 of the License, or (at your option) any later version. | ||
14 | * | ||
15 | * This program is distributed in the hope that it will be useful, | ||
16 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
17 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | ||
18 | * General Public License for more details. | ||
19 | * | ||
20 | * You should have received a copy of the GNU General Public | ||
21 | * License along with this program; if not, write to the | ||
22 | * Free Software Foundation, Inc., 59 Temple Place - Suite 330, | ||
23 | * Boston, MA 02111-1307, USA. | ||
24 | */ | ||
25 | |||
26 | #include <linux/module.h> | ||
27 | #include <linux/string.h> | ||
28 | #include <linux/kernel.h> | ||
29 | |||
30 | #include "ver.h" | ||
31 | |||
32 | #define OCFS2_BUILD_VERSION "1.3.3" | ||
33 | |||
34 | #define VERSION_STR "OCFS2 " OCFS2_BUILD_VERSION | ||
35 | |||
36 | void ocfs2_print_version(void) | ||
37 | { | ||
38 | printk(KERN_INFO "%s\n", VERSION_STR); | ||
39 | } | ||
40 | |||
41 | MODULE_DESCRIPTION(VERSION_STR); | ||
42 | |||
43 | MODULE_VERSION(OCFS2_BUILD_VERSION); | ||
diff --git a/fs/ocfs2/ver.h b/fs/ocfs2/ver.h new file mode 100644 index 000000000000..d7395cb91d2f --- /dev/null +++ b/fs/ocfs2/ver.h | |||
@@ -0,0 +1,31 @@ | |||
1 | /* -*- mode: c; c-basic-offset: 8; -*- | ||
2 | * vim: noexpandtab sw=8 ts=8 sts=0: | ||
3 | * | ||
4 | * ver.h | ||
5 | * | ||
6 | * Function prototypes | ||
7 | * | ||
8 | * Copyright (C) 2002, 2004 Oracle. All rights reserved. | ||
9 | * | ||
10 | * This program is free software; you can redistribute it and/or | ||
11 | * modify it under the terms of the GNU General Public | ||
12 | * License as published by the Free Software Foundation; either | ||
13 | * version 2 of the License, or (at your option) any later version. | ||
14 | * | ||
15 | * This program is distributed in the hope that it will be useful, | ||
16 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
17 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | ||
18 | * General Public License for more details. | ||
19 | * | ||
20 | * You should have received a copy of the GNU General Public | ||
21 | * License along with this program; if not, write to the | ||
22 | * Free Software Foundation, Inc., 59 Temple Place - Suite 330, | ||
23 | * Boston, MA 02111-1307, USA. | ||
24 | */ | ||
25 | |||
26 | #ifndef OCFS2_VER_H | ||
27 | #define OCFS2_VER_H | ||
28 | |||
29 | void ocfs2_print_version(void); | ||
30 | |||
31 | #endif /* OCFS2_VER_H */ | ||
diff --git a/fs/ocfs2/vote.c b/fs/ocfs2/vote.c new file mode 100644 index 000000000000..021978e0576b --- /dev/null +++ b/fs/ocfs2/vote.c | |||
@@ -0,0 +1,1202 @@ | |||
1 | /* -*- mode: c; c-basic-offset: 8; -*- | ||
2 | * vim: noexpandtab sw=8 ts=8 sts=0: | ||
3 | * | ||
4 | * vote.c | ||
5 | * | ||
6 | * Inter-node vote messaging: delete, unlink, rename, mount and umount votes | ||
7 | * | ||
8 | * Copyright (C) 2003, 2004 Oracle. All rights reserved. | ||
9 | * | ||
10 | * This program is free software; you can redistribute it and/or | ||
11 | * modify it under the terms of the GNU General Public | ||
12 | * License as published by the Free Software Foundation; either | ||
13 | * version 2 of the License, or (at your option) any later version. | ||
14 | * | ||
15 | * This program is distributed in the hope that it will be useful, | ||
16 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
17 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | ||
18 | * General Public License for more details. | ||
19 | * | ||
20 | * You should have received a copy of the GNU General Public | ||
21 | * License along with this program; if not, write to the | ||
22 | * Free Software Foundation, Inc., 59 Temple Place - Suite 330, | ||
23 | * Boston, MA 02111-1307, USA. | ||
24 | */ | ||
25 | |||
26 | #include <linux/types.h> | ||
27 | #include <linux/slab.h> | ||
28 | #include <linux/highmem.h> | ||
29 | #include <linux/smp_lock.h> | ||
30 | #include <linux/kthread.h> | ||
31 | |||
32 | #include <cluster/heartbeat.h> | ||
33 | #include <cluster/nodemanager.h> | ||
34 | #include <cluster/tcp.h> | ||
35 | |||
36 | #include <dlm/dlmapi.h> | ||
37 | |||
38 | #define MLOG_MASK_PREFIX ML_VOTE | ||
39 | #include <cluster/masklog.h> | ||
40 | |||
41 | #include "ocfs2.h" | ||
42 | |||
43 | #include "alloc.h" | ||
44 | #include "dlmglue.h" | ||
45 | #include "extent_map.h" | ||
46 | #include "heartbeat.h" | ||
47 | #include "inode.h" | ||
48 | #include "journal.h" | ||
49 | #include "slot_map.h" | ||
50 | #include "vote.h" | ||
51 | |||
52 | #include "buffer_head_io.h" | ||
53 | |||
/* o2net message type used when a vote request is broadcast. */
#define OCFS2_MESSAGE_TYPE_VOTE     (0x1)
/* o2net message type used when a vote request is answered. */
#define OCFS2_MESSAGE_TYPE_RESPONSE (0x2)
/* Header common to vote requests and responses. All fields travel
 * big-endian and are converted with be32_to_cpu()/be64_to_cpu() on
 * receipt. */
struct ocfs2_msg_hdr
{
	__be32 h_response_id; /* used to lookup message handle on sending
			       * node. */
	__be32 h_request;     /* an enum ocfs2_vote_request value */
	__be64 h_blkno;       /* block number used to look up the inode
			       * the vote is about */
	__be32 h_generation;  /* checked against the inode's i_generation */
	__be32 h_node_num;    /* node sending this particular message. */
};
65 | |||
/* OCFS2_MAX_FILENAME_LEN is 255 characters, but we want to align this
 * for the network. */
#define OCFS2_VOTE_FILENAME_LEN 256
/* A vote request as it is sent between nodes. The v_unlink_* fields are
 * only read for unlink and rename votes. */
struct ocfs2_vote_msg
{
	struct ocfs2_msg_hdr v_hdr;
	union {
		__be32 v_generic1;
		__be32 v_orphaned_slot;	/* Used during delete votes */
		__be32 v_nlink;		/* Used during unlink votes */
	} md1;				/* Message type dependent 1 */
	__be32 v_unlink_namelen;	/* length of the name in
					 * v_unlink_dirent */
	__be64 v_unlink_parent;		/* block number of the directory
					 * holding the name */
	u8  v_unlink_dirent[OCFS2_VOTE_FILENAME_LEN];
};
81 | |||
/* Responses are given these values to maintain backwards
 * compatibility with older ocfs2 versions */
#define OCFS2_RESPONSE_OK		(0)	/* vote accepted */
#define OCFS2_RESPONSE_BUSY		(-16)	/* delete vote refused, e.g.
						 * the inode is still open */
#define OCFS2_RESPONSE_BAD_MSG		(-22)	/* request was not valid */

/* Reply to a vote request; fields are big-endian on the wire. */
struct ocfs2_response_msg
{
	struct ocfs2_msg_hdr r_hdr;
	__be32 r_response;	/* one of the OCFS2_RESPONSE_* values */
	__be32 r_orphaned_slot;	/* orphaned slot reported back to the
				 * requesting node */
};
94 | |||
/* One queued vote message waiting to be handled by the vote thread. */
struct ocfs2_vote_work {
	struct list_head w_list;	/* linked on osb->vote_list */
	struct ocfs2_vote_msg w_msg;	/* the message handed to
					 * ocfs2_process_vote() */
};
99 | |||
/* Request codes carried in ocfs2_msg_hdr.h_request. INVALID and LAST
 * are sentinels that bound the usable range. */
enum ocfs2_vote_request {
	OCFS2_VOTE_REQ_INVALID = 0,
	OCFS2_VOTE_REQ_DELETE,
	OCFS2_VOTE_REQ_UNLINK,
	OCFS2_VOTE_REQ_RENAME,
	OCFS2_VOTE_REQ_MOUNT,
	OCFS2_VOTE_REQ_UMOUNT,
	OCFS2_VOTE_REQ_LAST
};

/* Returns non-zero when request lies strictly between the INVALID and
 * LAST sentinels. */
static inline int ocfs2_is_valid_vote_request(int request)
{
	if (request <= OCFS2_VOTE_REQ_INVALID)
		return 0;

	return request < OCFS2_VOTE_REQ_LAST;
}
115 | |||
/* Signature of a per-response hook: receives the caller's private
 * pointer and a response message. */
typedef void (*ocfs2_net_response_callback)(void *priv,
					    struct ocfs2_response_msg *resp);
/* A callback plus the private data to pass to it; attached to a wait
 * context through n_callback. */
struct ocfs2_net_response_cb {
	ocfs2_net_response_callback	rc_cb;
	void				*rc_priv;
};
122 | |||
/* State kept by a node while it waits for the answers to one broadcast
 * vote. */
struct ocfs2_net_wait_ctxt {
	struct list_head        n_list;		/* on osb->net_response_list */
	u32                     n_response_id;	/* id sent in h_response_id */
	wait_queue_head_t       n_event;	/* woken once n_node_map is
						 * empty */
	struct ocfs2_node_map   n_node_map;	/* nodes we sent to and have
						 * not yet heard from */
	int                     n_response; /* an aggregate response. 0 if
					     * all nodes are go, < 0 on any
					     * negative response from any
					     * node or network error. */
	struct ocfs2_net_response_cb *n_callback; /* optional, may be NULL */
};
134 | |||
/* Handle a MOUNT vote: record node_num as mounted and drop any recovery
 * or umount state we still hold for it. */
static void ocfs2_process_mount_request(struct ocfs2_super *osb,
					unsigned int node_num)
{
	mlog(0, "MOUNT vote from node %u\n", node_num);
	/* The other node only sends us this message when he has an EX
	 * on the superblock, so our recovery threads (if having been
	 * launched) are waiting on it.*/
	ocfs2_recovery_map_clear(osb, node_num);
	ocfs2_node_map_set_bit(osb, &osb->mounted_map, node_num);

	/* We clear the umount map here because a node may have been
	 * previously mounted, safely unmounted but never stopped
	 * heartbeating - in which case we'd have a stale entry. */
	ocfs2_node_map_clear_bit(osb, &osb->umount_map, node_num);
}
150 | |||
/* Handle a UMOUNT vote: move node_num from the mounted map to the
 * umount map. */
static void ocfs2_process_umount_request(struct ocfs2_super *osb,
					 unsigned int node_num)
{
	mlog(0, "UMOUNT vote from node %u\n", node_num);
	ocfs2_node_map_clear_bit(osb, &osb->mounted_map, node_num);
	ocfs2_node_map_set_bit(osb, &osb->umount_map, node_num);
}
158 | |||
159 | void ocfs2_mark_inode_remotely_deleted(struct inode *inode) | ||
160 | { | ||
161 | struct ocfs2_inode_info *oi = OCFS2_I(inode); | ||
162 | |||
163 | assert_spin_locked(&oi->ip_lock); | ||
164 | /* We set the SKIP_DELETE flag on the inode so we don't try to | ||
165 | * delete it in delete_inode ourselves, thus avoiding | ||
166 | * unecessary lock pinging. If the other node failed to wipe | ||
167 | * the inode as a result of a crash, then recovery will pick | ||
168 | * up the slack. */ | ||
169 | oi->ip_flags |= OCFS2_INODE_DELETED|OCFS2_INODE_SKIP_DELETE; | ||
170 | } | ||
171 | |||
172 | static int ocfs2_process_delete_request(struct inode *inode, | ||
173 | int *orphaned_slot) | ||
174 | { | ||
175 | int response = OCFS2_RESPONSE_BUSY; | ||
176 | |||
177 | mlog(0, "DELETE vote on inode %lu, read lnk_cnt = %u, slot = %d\n", | ||
178 | inode->i_ino, inode->i_nlink, *orphaned_slot); | ||
179 | |||
180 | spin_lock(&OCFS2_I(inode)->ip_lock); | ||
181 | |||
182 | /* Whatever our vote response is, we want to make sure that | ||
183 | * the orphaned slot is recorded properly on this node *and* | ||
184 | * on the requesting node. Technically, if the requesting node | ||
185 | * did not know which slot the inode is orphaned in but we | ||
186 | * respond with BUSY he doesn't actually need the orphaned | ||
187 | * slot, but it doesn't hurt to do it here anyway. */ | ||
188 | if ((*orphaned_slot) != OCFS2_INVALID_SLOT) { | ||
189 | mlog_bug_on_msg(OCFS2_I(inode)->ip_orphaned_slot != | ||
190 | OCFS2_INVALID_SLOT && | ||
191 | OCFS2_I(inode)->ip_orphaned_slot != | ||
192 | (*orphaned_slot), | ||
193 | "Inode %"MLFu64": This node thinks it's " | ||
194 | "orphaned in slot %d, messaged it's in %d\n", | ||
195 | OCFS2_I(inode)->ip_blkno, | ||
196 | OCFS2_I(inode)->ip_orphaned_slot, | ||
197 | *orphaned_slot); | ||
198 | |||
199 | mlog(0, "Setting orphaned slot for inode %"MLFu64" to %d\n", | ||
200 | OCFS2_I(inode)->ip_blkno, *orphaned_slot); | ||
201 | |||
202 | OCFS2_I(inode)->ip_orphaned_slot = *orphaned_slot; | ||
203 | } else { | ||
204 | mlog(0, "Sending back orphaned slot %d for inode %"MLFu64"\n", | ||
205 | OCFS2_I(inode)->ip_orphaned_slot, | ||
206 | OCFS2_I(inode)->ip_blkno); | ||
207 | |||
208 | *orphaned_slot = OCFS2_I(inode)->ip_orphaned_slot; | ||
209 | } | ||
210 | |||
211 | /* vote no if the file is still open. */ | ||
212 | if (OCFS2_I(inode)->ip_open_count) { | ||
213 | mlog(0, "open count = %u\n", | ||
214 | OCFS2_I(inode)->ip_open_count); | ||
215 | spin_unlock(&OCFS2_I(inode)->ip_lock); | ||
216 | goto done; | ||
217 | } | ||
218 | spin_unlock(&OCFS2_I(inode)->ip_lock); | ||
219 | |||
220 | /* directories are a bit ugly... What if someone is sitting in | ||
221 | * it? We want to make sure the inode is removed completely as | ||
222 | * a result of the iput in process_vote. */ | ||
223 | if (S_ISDIR(inode->i_mode) && (atomic_read(&inode->i_count) != 1)) { | ||
224 | mlog(0, "i_count = %u\n", atomic_read(&inode->i_count)); | ||
225 | goto done; | ||
226 | } | ||
227 | |||
228 | if (filemap_fdatawrite(inode->i_mapping)) { | ||
229 | mlog(ML_ERROR, "Could not sync inode %"MLFu64" for delete!\n", | ||
230 | OCFS2_I(inode)->ip_blkno); | ||
231 | goto done; | ||
232 | } | ||
233 | sync_mapping_buffers(inode->i_mapping); | ||
234 | truncate_inode_pages(inode->i_mapping, 0); | ||
235 | ocfs2_extent_map_trunc(inode, 0); | ||
236 | |||
237 | spin_lock(&OCFS2_I(inode)->ip_lock); | ||
238 | /* double check open count - someone might have raced this | ||
239 | * thread into ocfs2_file_open while we were writing out | ||
240 | * data. If we're to allow a wipe of this inode now, we *must* | ||
241 | * hold the spinlock until we've marked it. */ | ||
242 | if (OCFS2_I(inode)->ip_open_count) { | ||
243 | mlog(0, "Raced to wipe! open count = %u\n", | ||
244 | OCFS2_I(inode)->ip_open_count); | ||
245 | spin_unlock(&OCFS2_I(inode)->ip_lock); | ||
246 | goto done; | ||
247 | } | ||
248 | |||
249 | /* Mark the inode as being wiped from disk. */ | ||
250 | ocfs2_mark_inode_remotely_deleted(inode); | ||
251 | spin_unlock(&OCFS2_I(inode)->ip_lock); | ||
252 | |||
253 | /* Not sure this is necessary anymore. */ | ||
254 | d_prune_aliases(inode); | ||
255 | |||
256 | /* If we get here, then we're voting 'yes', so commit the | ||
257 | * delete on our side. */ | ||
258 | response = OCFS2_RESPONSE_OK; | ||
259 | done: | ||
260 | return response; | ||
261 | } | ||
262 | |||
263 | static int ocfs2_match_dentry(struct dentry *dentry, | ||
264 | u64 parent_blkno, | ||
265 | unsigned int namelen, | ||
266 | const char *name) | ||
267 | { | ||
268 | struct inode *parent; | ||
269 | |||
270 | if (!dentry->d_parent) { | ||
271 | mlog(0, "Detached from parent.\n"); | ||
272 | return 0; | ||
273 | } | ||
274 | |||
275 | parent = dentry->d_parent->d_inode; | ||
276 | /* Negative parent dentry? */ | ||
277 | if (!parent) | ||
278 | return 0; | ||
279 | |||
280 | /* Name is in a different directory. */ | ||
281 | if (OCFS2_I(parent)->ip_blkno != parent_blkno) | ||
282 | return 0; | ||
283 | |||
284 | if (dentry->d_name.len != namelen) | ||
285 | return 0; | ||
286 | |||
287 | /* comparison above guarantees this is safe. */ | ||
288 | if (memcmp(dentry->d_name.name, name, namelen)) | ||
289 | return 0; | ||
290 | |||
291 | return 1; | ||
292 | } | ||
293 | |||
294 | static void ocfs2_process_dentry_request(struct inode *inode, | ||
295 | int rename, | ||
296 | unsigned int new_nlink, | ||
297 | u64 parent_blkno, | ||
298 | unsigned int namelen, | ||
299 | const char *name) | ||
300 | { | ||
301 | struct dentry *dentry = NULL; | ||
302 | struct list_head *p; | ||
303 | struct ocfs2_inode_info *oi = OCFS2_I(inode); | ||
304 | |||
305 | mlog(0, "parent %"MLFu64", namelen = %u, name = %.*s\n", parent_blkno, | ||
306 | namelen, namelen, name); | ||
307 | |||
308 | spin_lock(&dcache_lock); | ||
309 | |||
310 | /* Another node is removing this name from the system. It is | ||
311 | * up to us to find the corresponding dentry and if it exists, | ||
312 | * unhash it from the dcache. */ | ||
313 | list_for_each(p, &inode->i_dentry) { | ||
314 | dentry = list_entry(p, struct dentry, d_alias); | ||
315 | |||
316 | if (ocfs2_match_dentry(dentry, parent_blkno, namelen, name)) { | ||
317 | mlog(0, "dentry found: %.*s\n", | ||
318 | dentry->d_name.len, dentry->d_name.name); | ||
319 | |||
320 | dget_locked(dentry); | ||
321 | break; | ||
322 | } | ||
323 | |||
324 | dentry = NULL; | ||
325 | } | ||
326 | |||
327 | spin_unlock(&dcache_lock); | ||
328 | |||
329 | if (dentry) { | ||
330 | d_delete(dentry); | ||
331 | dput(dentry); | ||
332 | } | ||
333 | |||
334 | /* rename votes don't send link counts */ | ||
335 | if (!rename) { | ||
336 | mlog(0, "new_nlink = %u\n", new_nlink); | ||
337 | |||
338 | /* We don't have the proper locks here to directly | ||
339 | * change i_nlink and besides, the vote is sent | ||
340 | * *before* the operation so it may have failed on the | ||
341 | * other node. This passes a hint to ocfs2_drop_inode | ||
342 | * to force ocfs2_delete_inode, who will take the | ||
343 | * proper cluster locks to sort things out. */ | ||
344 | if (new_nlink == 0) { | ||
345 | spin_lock(&oi->ip_lock); | ||
346 | oi->ip_flags |= OCFS2_INODE_MAYBE_ORPHANED; | ||
347 | spin_unlock(&OCFS2_I(inode)->ip_lock); | ||
348 | } | ||
349 | } | ||
350 | } | ||
351 | |||
352 | static void ocfs2_process_vote(struct ocfs2_super *osb, | ||
353 | struct ocfs2_vote_msg *msg) | ||
354 | { | ||
355 | int net_status, vote_response; | ||
356 | int orphaned_slot = 0; | ||
357 | int rename = 0; | ||
358 | unsigned int node_num, generation, new_nlink, namelen; | ||
359 | u64 blkno, parent_blkno; | ||
360 | enum ocfs2_vote_request request; | ||
361 | struct inode *inode = NULL; | ||
362 | struct ocfs2_msg_hdr *hdr = &msg->v_hdr; | ||
363 | struct ocfs2_response_msg response; | ||
364 | |||
365 | /* decode the network mumbo jumbo into local variables. */ | ||
366 | request = be32_to_cpu(hdr->h_request); | ||
367 | blkno = be64_to_cpu(hdr->h_blkno); | ||
368 | generation = be32_to_cpu(hdr->h_generation); | ||
369 | node_num = be32_to_cpu(hdr->h_node_num); | ||
370 | if (request == OCFS2_VOTE_REQ_DELETE) | ||
371 | orphaned_slot = be32_to_cpu(msg->md1.v_orphaned_slot); | ||
372 | |||
373 | mlog(0, "processing vote: request = %u, blkno = %"MLFu64", " | ||
374 | "generation = %u, node_num = %u, priv1 = %u\n", request, | ||
375 | blkno, generation, node_num, be32_to_cpu(msg->md1.v_generic1)); | ||
376 | |||
377 | if (!ocfs2_is_valid_vote_request(request)) { | ||
378 | mlog(ML_ERROR, "Invalid vote request %d from node %u\n", | ||
379 | request, node_num); | ||
380 | vote_response = OCFS2_RESPONSE_BAD_MSG; | ||
381 | goto respond; | ||
382 | } | ||
383 | |||
384 | vote_response = OCFS2_RESPONSE_OK; | ||
385 | |||
386 | switch (request) { | ||
387 | case OCFS2_VOTE_REQ_UMOUNT: | ||
388 | ocfs2_process_umount_request(osb, node_num); | ||
389 | goto respond; | ||
390 | case OCFS2_VOTE_REQ_MOUNT: | ||
391 | ocfs2_process_mount_request(osb, node_num); | ||
392 | goto respond; | ||
393 | default: | ||
394 | /* avoids a gcc warning */ | ||
395 | break; | ||
396 | } | ||
397 | |||
398 | /* We cannot process the remaining message types before we're | ||
399 | * fully mounted. It's perfectly safe however to send a 'yes' | ||
400 | * response as we can't possibly have any of the state they're | ||
401 | * asking us to modify yet. */ | ||
402 | if (atomic_read(&osb->vol_state) == VOLUME_INIT) | ||
403 | goto respond; | ||
404 | |||
405 | /* If we get here, then the request is against an inode. */ | ||
406 | inode = ocfs2_ilookup_for_vote(osb, blkno, | ||
407 | request == OCFS2_VOTE_REQ_DELETE); | ||
408 | |||
409 | /* Not finding the inode is perfectly valid - it means we're | ||
410 | * not interested in what the other node is about to do to it | ||
411 | * so in those cases we automatically respond with an | ||
412 | * affirmative. Cluster locking ensures that we won't race | ||
413 | * interest in the inode with this vote request. */ | ||
414 | if (!inode) | ||
415 | goto respond; | ||
416 | |||
417 | /* Check generation values. It's possible for us to get a | ||
418 | * request against a stale inode. If so then we proceed as if | ||
419 | * we had not found an inode in the first place. */ | ||
420 | if (inode->i_generation != generation) { | ||
421 | mlog(0, "generation passed %u != inode generation = %u, " | ||
422 | "ip_flags = %x, ip_blkno = %"MLFu64", msg %"MLFu64", " | ||
423 | "i_count = %u, message type = %u\n", | ||
424 | generation, inode->i_generation, OCFS2_I(inode)->ip_flags, | ||
425 | OCFS2_I(inode)->ip_blkno, blkno, | ||
426 | atomic_read(&inode->i_count), request); | ||
427 | iput(inode); | ||
428 | inode = NULL; | ||
429 | goto respond; | ||
430 | } | ||
431 | |||
432 | switch (request) { | ||
433 | case OCFS2_VOTE_REQ_DELETE: | ||
434 | vote_response = ocfs2_process_delete_request(inode, | ||
435 | &orphaned_slot); | ||
436 | break; | ||
437 | case OCFS2_VOTE_REQ_RENAME: | ||
438 | rename = 1; | ||
439 | /* fall through */ | ||
440 | case OCFS2_VOTE_REQ_UNLINK: | ||
441 | parent_blkno = be64_to_cpu(msg->v_unlink_parent); | ||
442 | namelen = be32_to_cpu(msg->v_unlink_namelen); | ||
443 | /* new_nlink will be ignored in case of a rename vote */ | ||
444 | new_nlink = be32_to_cpu(msg->md1.v_nlink); | ||
445 | ocfs2_process_dentry_request(inode, rename, new_nlink, | ||
446 | parent_blkno, namelen, | ||
447 | msg->v_unlink_dirent); | ||
448 | break; | ||
449 | default: | ||
450 | mlog(ML_ERROR, "node %u, invalid request: %u\n", | ||
451 | node_num, request); | ||
452 | vote_response = OCFS2_RESPONSE_BAD_MSG; | ||
453 | } | ||
454 | |||
455 | respond: | ||
456 | /* Response struture is small so we just put it on the stack | ||
457 | * and stuff it inline. */ | ||
458 | memset(&response, 0, sizeof(struct ocfs2_response_msg)); | ||
459 | response.r_hdr.h_response_id = hdr->h_response_id; | ||
460 | response.r_hdr.h_blkno = hdr->h_blkno; | ||
461 | response.r_hdr.h_generation = hdr->h_generation; | ||
462 | response.r_hdr.h_node_num = cpu_to_be32(osb->node_num); | ||
463 | response.r_response = cpu_to_be32(vote_response); | ||
464 | response.r_orphaned_slot = cpu_to_be32(orphaned_slot); | ||
465 | |||
466 | net_status = o2net_send_message(OCFS2_MESSAGE_TYPE_RESPONSE, | ||
467 | osb->net_key, | ||
468 | &response, | ||
469 | sizeof(struct ocfs2_response_msg), | ||
470 | node_num, | ||
471 | NULL); | ||
472 | /* We still want to error print for ENOPROTOOPT here. The | ||
473 | * sending node shouldn't have unregistered his net handler | ||
474 | * without sending an unmount vote 1st */ | ||
475 | if (net_status < 0 | ||
476 | && net_status != -ETIMEDOUT | ||
477 | && net_status != -ENOTCONN) | ||
478 | mlog(ML_ERROR, "message to node %u fails with error %d!\n", | ||
479 | node_num, net_status); | ||
480 | |||
481 | if (inode) | ||
482 | iput(inode); | ||
483 | } | ||
484 | |||
485 | static void ocfs2_vote_thread_do_work(struct ocfs2_super *osb) | ||
486 | { | ||
487 | unsigned long processed; | ||
488 | struct ocfs2_lock_res *lockres; | ||
489 | struct ocfs2_vote_work *work; | ||
490 | |||
491 | mlog_entry_void(); | ||
492 | |||
493 | spin_lock(&osb->vote_task_lock); | ||
494 | /* grab this early so we know to try again if a state change and | ||
495 | * wake happens part-way through our work */ | ||
496 | osb->vote_work_sequence = osb->vote_wake_sequence; | ||
497 | |||
498 | processed = osb->blocked_lock_count; | ||
499 | while (processed) { | ||
500 | BUG_ON(list_empty(&osb->blocked_lock_list)); | ||
501 | |||
502 | lockres = list_entry(osb->blocked_lock_list.next, | ||
503 | struct ocfs2_lock_res, l_blocked_list); | ||
504 | list_del_init(&lockres->l_blocked_list); | ||
505 | osb->blocked_lock_count--; | ||
506 | spin_unlock(&osb->vote_task_lock); | ||
507 | |||
508 | BUG_ON(!processed); | ||
509 | processed--; | ||
510 | |||
511 | ocfs2_process_blocked_lock(osb, lockres); | ||
512 | |||
513 | spin_lock(&osb->vote_task_lock); | ||
514 | } | ||
515 | |||
516 | while (osb->vote_count) { | ||
517 | BUG_ON(list_empty(&osb->vote_list)); | ||
518 | work = list_entry(osb->vote_list.next, | ||
519 | struct ocfs2_vote_work, w_list); | ||
520 | list_del(&work->w_list); | ||
521 | osb->vote_count--; | ||
522 | spin_unlock(&osb->vote_task_lock); | ||
523 | |||
524 | ocfs2_process_vote(osb, &work->w_msg); | ||
525 | kfree(work); | ||
526 | |||
527 | spin_lock(&osb->vote_task_lock); | ||
528 | } | ||
529 | spin_unlock(&osb->vote_task_lock); | ||
530 | |||
531 | mlog_exit_void(); | ||
532 | } | ||
533 | |||
534 | static int ocfs2_vote_thread_lists_empty(struct ocfs2_super *osb) | ||
535 | { | ||
536 | int empty = 0; | ||
537 | |||
538 | spin_lock(&osb->vote_task_lock); | ||
539 | if (list_empty(&osb->blocked_lock_list) && | ||
540 | list_empty(&osb->vote_list)) | ||
541 | empty = 1; | ||
542 | |||
543 | spin_unlock(&osb->vote_task_lock); | ||
544 | return empty; | ||
545 | } | ||
546 | |||
547 | static int ocfs2_vote_thread_should_wake(struct ocfs2_super *osb) | ||
548 | { | ||
549 | int should_wake = 0; | ||
550 | |||
551 | spin_lock(&osb->vote_task_lock); | ||
552 | if (osb->vote_work_sequence != osb->vote_wake_sequence) | ||
553 | should_wake = 1; | ||
554 | spin_unlock(&osb->vote_task_lock); | ||
555 | |||
556 | return should_wake; | ||
557 | } | ||
558 | |||
559 | int ocfs2_vote_thread(void *arg) | ||
560 | { | ||
561 | int status = 0; | ||
562 | struct ocfs2_super *osb = arg; | ||
563 | |||
564 | /* only quit once we've been asked to stop and there is no more | ||
565 | * work available */ | ||
566 | while (!(kthread_should_stop() && | ||
567 | ocfs2_vote_thread_lists_empty(osb))) { | ||
568 | |||
569 | wait_event_interruptible(osb->vote_event, | ||
570 | ocfs2_vote_thread_should_wake(osb) || | ||
571 | kthread_should_stop()); | ||
572 | |||
573 | mlog(0, "vote_thread: awoken\n"); | ||
574 | |||
575 | ocfs2_vote_thread_do_work(osb); | ||
576 | } | ||
577 | |||
578 | osb->vote_task = NULL; | ||
579 | return status; | ||
580 | } | ||
581 | |||
582 | static struct ocfs2_net_wait_ctxt *ocfs2_new_net_wait_ctxt(unsigned int response_id) | ||
583 | { | ||
584 | struct ocfs2_net_wait_ctxt *w; | ||
585 | |||
586 | w = kcalloc(1, sizeof(*w), GFP_KERNEL); | ||
587 | if (!w) { | ||
588 | mlog_errno(-ENOMEM); | ||
589 | goto bail; | ||
590 | } | ||
591 | |||
592 | INIT_LIST_HEAD(&w->n_list); | ||
593 | init_waitqueue_head(&w->n_event); | ||
594 | ocfs2_node_map_init(&w->n_node_map); | ||
595 | w->n_response_id = response_id; | ||
596 | w->n_callback = NULL; | ||
597 | bail: | ||
598 | return w; | ||
599 | } | ||
600 | |||
601 | static unsigned int ocfs2_new_response_id(struct ocfs2_super *osb) | ||
602 | { | ||
603 | unsigned int ret; | ||
604 | |||
605 | spin_lock(&osb->net_response_lock); | ||
606 | ret = ++osb->net_response_ids; | ||
607 | spin_unlock(&osb->net_response_lock); | ||
608 | |||
609 | return ret; | ||
610 | } | ||
611 | |||
/* Unlink w from the list it is queued on, under net_response_lock.
 * The context itself is not freed here. */
static void ocfs2_dequeue_net_wait_ctxt(struct ocfs2_super *osb,
					struct ocfs2_net_wait_ctxt *w)
{
	spin_lock(&osb->net_response_lock);
	list_del(&w->n_list);
	spin_unlock(&osb->net_response_lock);
}
619 | |||
/* Append w to osb->net_response_list, under net_response_lock. */
static void ocfs2_queue_net_wait_ctxt(struct ocfs2_super *osb,
				      struct ocfs2_net_wait_ctxt *w)
{
	spin_lock(&osb->net_response_lock);
	list_add_tail(&w->n_list,
		      &osb->net_response_list);
	spin_unlock(&osb->net_response_lock);
}
628 | |||
629 | static void __ocfs2_mark_node_responded(struct ocfs2_super *osb, | ||
630 | struct ocfs2_net_wait_ctxt *w, | ||
631 | int node_num) | ||
632 | { | ||
633 | assert_spin_locked(&osb->net_response_lock); | ||
634 | |||
635 | ocfs2_node_map_clear_bit(osb, &w->n_node_map, node_num); | ||
636 | if (ocfs2_node_map_is_empty(osb, &w->n_node_map)) | ||
637 | wake_up(&w->n_event); | ||
638 | } | ||
639 | |||
640 | /* Intended to be called from the node down callback, we fake remove | ||
641 | * the node from all our response contexts */ | ||
642 | void ocfs2_remove_node_from_vote_queues(struct ocfs2_super *osb, | ||
643 | int node_num) | ||
644 | { | ||
645 | struct list_head *p; | ||
646 | struct ocfs2_net_wait_ctxt *w = NULL; | ||
647 | |||
648 | spin_lock(&osb->net_response_lock); | ||
649 | |||
650 | list_for_each(p, &osb->net_response_list) { | ||
651 | w = list_entry(p, struct ocfs2_net_wait_ctxt, n_list); | ||
652 | |||
653 | __ocfs2_mark_node_responded(osb, w, node_num); | ||
654 | } | ||
655 | |||
656 | spin_unlock(&osb->net_response_lock); | ||
657 | } | ||
658 | |||
659 | static int ocfs2_broadcast_vote(struct ocfs2_super *osb, | ||
660 | struct ocfs2_vote_msg *request, | ||
661 | unsigned int response_id, | ||
662 | int *response, | ||
663 | struct ocfs2_net_response_cb *callback) | ||
664 | { | ||
665 | int status, i, remote_err; | ||
666 | struct ocfs2_net_wait_ctxt *w = NULL; | ||
667 | int dequeued = 0; | ||
668 | |||
669 | mlog_entry_void(); | ||
670 | |||
671 | w = ocfs2_new_net_wait_ctxt(response_id); | ||
672 | if (!w) { | ||
673 | status = -ENOMEM; | ||
674 | mlog_errno(status); | ||
675 | goto bail; | ||
676 | } | ||
677 | w->n_callback = callback; | ||
678 | |||
679 | /* we're pretty much ready to go at this point, and this fills | ||
680 | * in n_response which we need anyway... */ | ||
681 | ocfs2_queue_net_wait_ctxt(osb, w); | ||
682 | |||
683 | i = ocfs2_node_map_iterate(osb, &osb->mounted_map, 0); | ||
684 | |||
685 | while (i != O2NM_INVALID_NODE_NUM) { | ||
686 | if (i != osb->node_num) { | ||
687 | mlog(0, "trying to send request to node %i\n", i); | ||
688 | ocfs2_node_map_set_bit(osb, &w->n_node_map, i); | ||
689 | |||
690 | remote_err = 0; | ||
691 | status = o2net_send_message(OCFS2_MESSAGE_TYPE_VOTE, | ||
692 | osb->net_key, | ||
693 | request, | ||
694 | sizeof(*request), | ||
695 | i, | ||
696 | &remote_err); | ||
697 | if (status == -ETIMEDOUT) { | ||
698 | mlog(0, "remote node %d timed out!\n", i); | ||
699 | status = -EAGAIN; | ||
700 | goto bail; | ||
701 | } | ||
702 | if (remote_err < 0) { | ||
703 | status = remote_err; | ||
704 | mlog(0, "remote error %d on node %d!\n", | ||
705 | remote_err, i); | ||
706 | mlog_errno(status); | ||
707 | goto bail; | ||
708 | } | ||
709 | if (status < 0) { | ||
710 | mlog_errno(status); | ||
711 | goto bail; | ||
712 | } | ||
713 | } | ||
714 | i++; | ||
715 | i = ocfs2_node_map_iterate(osb, &osb->mounted_map, i); | ||
716 | mlog(0, "next is %d, i am %d\n", i, osb->node_num); | ||
717 | } | ||
718 | mlog(0, "done sending, now waiting on responses...\n"); | ||
719 | |||
720 | wait_event(w->n_event, ocfs2_node_map_is_empty(osb, &w->n_node_map)); | ||
721 | |||
722 | ocfs2_dequeue_net_wait_ctxt(osb, w); | ||
723 | dequeued = 1; | ||
724 | |||
725 | *response = w->n_response; | ||
726 | status = 0; | ||
727 | bail: | ||
728 | if (w) { | ||
729 | if (!dequeued) | ||
730 | ocfs2_dequeue_net_wait_ctxt(osb, w); | ||
731 | kfree(w); | ||
732 | } | ||
733 | |||
734 | mlog_exit(status); | ||
735 | return status; | ||
736 | } | ||
737 | |||
738 | static struct ocfs2_vote_msg * ocfs2_new_vote_request(struct ocfs2_super *osb, | ||
739 | u64 blkno, | ||
740 | unsigned int generation, | ||
741 | enum ocfs2_vote_request type, | ||
742 | u32 priv) | ||
743 | { | ||
744 | struct ocfs2_vote_msg *request; | ||
745 | struct ocfs2_msg_hdr *hdr; | ||
746 | |||
747 | BUG_ON(!ocfs2_is_valid_vote_request(type)); | ||
748 | |||
749 | request = kcalloc(1, sizeof(*request), GFP_KERNEL); | ||
750 | if (!request) { | ||
751 | mlog_errno(-ENOMEM); | ||
752 | } else { | ||
753 | hdr = &request->v_hdr; | ||
754 | hdr->h_node_num = cpu_to_be32(osb->node_num); | ||
755 | hdr->h_request = cpu_to_be32(type); | ||
756 | hdr->h_blkno = cpu_to_be64(blkno); | ||
757 | hdr->h_generation = cpu_to_be32(generation); | ||
758 | |||
759 | request->md1.v_generic1 = cpu_to_be32(priv); | ||
760 | } | ||
761 | |||
762 | return request; | ||
763 | } | ||
764 | |||
765 | /* Complete the buildup of a new vote request and process the | ||
766 | * broadcast return value. */ | ||
767 | static int ocfs2_do_request_vote(struct ocfs2_super *osb, | ||
768 | struct ocfs2_vote_msg *request, | ||
769 | struct ocfs2_net_response_cb *callback) | ||
770 | { | ||
771 | int status, response; | ||
772 | unsigned int response_id; | ||
773 | struct ocfs2_msg_hdr *hdr; | ||
774 | |||
775 | response_id = ocfs2_new_response_id(osb); | ||
776 | |||
777 | hdr = &request->v_hdr; | ||
778 | hdr->h_response_id = cpu_to_be32(response_id); | ||
779 | |||
780 | status = ocfs2_broadcast_vote(osb, request, response_id, &response, | ||
781 | callback); | ||
782 | if (status < 0) { | ||
783 | mlog_errno(status); | ||
784 | goto bail; | ||
785 | } | ||
786 | |||
787 | status = response; | ||
788 | bail: | ||
789 | |||
790 | return status; | ||
791 | } | ||
792 | |||
793 | static int ocfs2_request_vote(struct inode *inode, | ||
794 | struct ocfs2_vote_msg *request, | ||
795 | struct ocfs2_net_response_cb *callback) | ||
796 | { | ||
797 | int status; | ||
798 | struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); | ||
799 | |||
800 | if (ocfs2_inode_is_new(inode)) | ||
801 | return 0; | ||
802 | |||
803 | status = -EAGAIN; | ||
804 | while (status == -EAGAIN) { | ||
805 | if (!(osb->s_mount_opt & OCFS2_MOUNT_NOINTR) && | ||
806 | signal_pending(current)) | ||
807 | return -ERESTARTSYS; | ||
808 | |||
809 | status = ocfs2_super_lock(osb, 0); | ||
810 | if (status < 0) { | ||
811 | mlog_errno(status); | ||
812 | break; | ||
813 | } | ||
814 | |||
815 | status = 0; | ||
816 | if (!ocfs2_node_map_is_only(osb, &osb->mounted_map, | ||
817 | osb->node_num)) | ||
818 | status = ocfs2_do_request_vote(osb, request, callback); | ||
819 | |||
820 | ocfs2_super_unlock(osb, 0); | ||
821 | } | ||
822 | return status; | ||
823 | } | ||
824 | |||
825 | static void ocfs2_delete_response_cb(void *priv, | ||
826 | struct ocfs2_response_msg *resp) | ||
827 | { | ||
828 | int orphaned_slot, node; | ||
829 | struct inode *inode = priv; | ||
830 | |||
831 | orphaned_slot = be32_to_cpu(resp->r_orphaned_slot); | ||
832 | node = be32_to_cpu(resp->r_hdr.h_node_num); | ||
833 | mlog(0, "node %d tells us that inode %"MLFu64" is orphaned in slot " | ||
834 | "%d\n", node, OCFS2_I(inode)->ip_blkno, orphaned_slot); | ||
835 | |||
836 | /* The other node may not actually know which slot the inode | ||
837 | * is orphaned in. */ | ||
838 | if (orphaned_slot == OCFS2_INVALID_SLOT) | ||
839 | return; | ||
840 | |||
841 | /* Ok, the responding node knows which slot this inode is | ||
842 | * orphaned in. We verify that the information is correct and | ||
843 | * then record this in the inode. ocfs2_delete_inode will use | ||
844 | * this information to determine which lock to take. */ | ||
845 | spin_lock(&OCFS2_I(inode)->ip_lock); | ||
846 | mlog_bug_on_msg(OCFS2_I(inode)->ip_orphaned_slot != orphaned_slot && | ||
847 | OCFS2_I(inode)->ip_orphaned_slot | ||
848 | != OCFS2_INVALID_SLOT, "Inode %"MLFu64": Node %d " | ||
849 | "says it's orphaned in slot %d, we think it's in %d\n", | ||
850 | OCFS2_I(inode)->ip_blkno, | ||
851 | be32_to_cpu(resp->r_hdr.h_node_num), | ||
852 | orphaned_slot, OCFS2_I(inode)->ip_orphaned_slot); | ||
853 | |||
854 | OCFS2_I(inode)->ip_orphaned_slot = orphaned_slot; | ||
855 | spin_unlock(&OCFS2_I(inode)->ip_lock); | ||
856 | } | ||
857 | |||
858 | int ocfs2_request_delete_vote(struct inode *inode) | ||
859 | { | ||
860 | int orphaned_slot, status; | ||
861 | struct ocfs2_net_response_cb delete_cb; | ||
862 | struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); | ||
863 | struct ocfs2_vote_msg *request; | ||
864 | |||
865 | spin_lock(&OCFS2_I(inode)->ip_lock); | ||
866 | orphaned_slot = OCFS2_I(inode)->ip_orphaned_slot; | ||
867 | spin_unlock(&OCFS2_I(inode)->ip_lock); | ||
868 | |||
869 | delete_cb.rc_cb = ocfs2_delete_response_cb; | ||
870 | delete_cb.rc_priv = inode; | ||
871 | |||
872 | mlog(0, "Inode %"MLFu64", we start thinking orphaned slot is %d\n", | ||
873 | OCFS2_I(inode)->ip_blkno, orphaned_slot); | ||
874 | |||
875 | status = -ENOMEM; | ||
876 | request = ocfs2_new_vote_request(osb, OCFS2_I(inode)->ip_blkno, | ||
877 | inode->i_generation, | ||
878 | OCFS2_VOTE_REQ_DELETE, orphaned_slot); | ||
879 | if (request) { | ||
880 | status = ocfs2_request_vote(inode, request, &delete_cb); | ||
881 | |||
882 | kfree(request); | ||
883 | } | ||
884 | |||
885 | return status; | ||
886 | } | ||
887 | |||
888 | static void ocfs2_setup_unlink_vote(struct ocfs2_vote_msg *request, | ||
889 | struct dentry *dentry) | ||
890 | { | ||
891 | struct inode *parent = dentry->d_parent->d_inode; | ||
892 | |||
893 | /* We need some values which will uniquely identify a dentry | ||
894 | * on the other nodes so that they can find it and run | ||
895 | * d_delete against it. Parent directory block and full name | ||
896 | * should suffice. */ | ||
897 | |||
898 | mlog(0, "unlink/rename request: parent: %"MLFu64" name: %.*s\n", | ||
899 | OCFS2_I(parent)->ip_blkno, dentry->d_name.len, | ||
900 | dentry->d_name.name); | ||
901 | |||
902 | request->v_unlink_parent = cpu_to_be64(OCFS2_I(parent)->ip_blkno); | ||
903 | request->v_unlink_namelen = cpu_to_be32(dentry->d_name.len); | ||
904 | memcpy(request->v_unlink_dirent, dentry->d_name.name, | ||
905 | dentry->d_name.len); | ||
906 | } | ||
907 | |||
908 | int ocfs2_request_unlink_vote(struct inode *inode, | ||
909 | struct dentry *dentry, | ||
910 | unsigned int nlink) | ||
911 | { | ||
912 | int status; | ||
913 | struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); | ||
914 | struct ocfs2_vote_msg *request; | ||
915 | |||
916 | if (dentry->d_name.len > OCFS2_VOTE_FILENAME_LEN) | ||
917 | return -ENAMETOOLONG; | ||
918 | |||
919 | status = -ENOMEM; | ||
920 | request = ocfs2_new_vote_request(osb, OCFS2_I(inode)->ip_blkno, | ||
921 | inode->i_generation, | ||
922 | OCFS2_VOTE_REQ_UNLINK, nlink); | ||
923 | if (request) { | ||
924 | ocfs2_setup_unlink_vote(request, dentry); | ||
925 | |||
926 | status = ocfs2_request_vote(inode, request, NULL); | ||
927 | |||
928 | kfree(request); | ||
929 | } | ||
930 | return status; | ||
931 | } | ||
932 | |||
933 | int ocfs2_request_rename_vote(struct inode *inode, | ||
934 | struct dentry *dentry) | ||
935 | { | ||
936 | int status; | ||
937 | struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); | ||
938 | struct ocfs2_vote_msg *request; | ||
939 | |||
940 | if (dentry->d_name.len > OCFS2_VOTE_FILENAME_LEN) | ||
941 | return -ENAMETOOLONG; | ||
942 | |||
943 | status = -ENOMEM; | ||
944 | request = ocfs2_new_vote_request(osb, OCFS2_I(inode)->ip_blkno, | ||
945 | inode->i_generation, | ||
946 | OCFS2_VOTE_REQ_RENAME, 0); | ||
947 | if (request) { | ||
948 | ocfs2_setup_unlink_vote(request, dentry); | ||
949 | |||
950 | status = ocfs2_request_vote(inode, request, NULL); | ||
951 | |||
952 | kfree(request); | ||
953 | } | ||
954 | return status; | ||
955 | } | ||
956 | |||
957 | int ocfs2_request_mount_vote(struct ocfs2_super *osb) | ||
958 | { | ||
959 | int status; | ||
960 | struct ocfs2_vote_msg *request = NULL; | ||
961 | |||
962 | request = ocfs2_new_vote_request(osb, 0ULL, 0, | ||
963 | OCFS2_VOTE_REQ_MOUNT, 0); | ||
964 | if (!request) { | ||
965 | status = -ENOMEM; | ||
966 | goto bail; | ||
967 | } | ||
968 | |||
969 | status = -EAGAIN; | ||
970 | while (status == -EAGAIN) { | ||
971 | if (!(osb->s_mount_opt & OCFS2_MOUNT_NOINTR) && | ||
972 | signal_pending(current)) { | ||
973 | status = -ERESTARTSYS; | ||
974 | goto bail; | ||
975 | } | ||
976 | |||
977 | if (ocfs2_node_map_is_only(osb, &osb->mounted_map, | ||
978 | osb->node_num)) { | ||
979 | status = 0; | ||
980 | goto bail; | ||
981 | } | ||
982 | |||
983 | status = ocfs2_do_request_vote(osb, request, NULL); | ||
984 | } | ||
985 | |||
986 | bail: | ||
987 | if (request) | ||
988 | kfree(request); | ||
989 | |||
990 | return status; | ||
991 | } | ||
992 | |||
993 | int ocfs2_request_umount_vote(struct ocfs2_super *osb) | ||
994 | { | ||
995 | int status; | ||
996 | struct ocfs2_vote_msg *request = NULL; | ||
997 | |||
998 | request = ocfs2_new_vote_request(osb, 0ULL, 0, | ||
999 | OCFS2_VOTE_REQ_UMOUNT, 0); | ||
1000 | if (!request) { | ||
1001 | status = -ENOMEM; | ||
1002 | goto bail; | ||
1003 | } | ||
1004 | |||
1005 | status = -EAGAIN; | ||
1006 | while (status == -EAGAIN) { | ||
1007 | /* Do not check signals on this vote... We really want | ||
1008 | * this one to go all the way through. */ | ||
1009 | |||
1010 | if (ocfs2_node_map_is_only(osb, &osb->mounted_map, | ||
1011 | osb->node_num)) { | ||
1012 | status = 0; | ||
1013 | goto bail; | ||
1014 | } | ||
1015 | |||
1016 | status = ocfs2_do_request_vote(osb, request, NULL); | ||
1017 | } | ||
1018 | |||
1019 | bail: | ||
1020 | if (request) | ||
1021 | kfree(request); | ||
1022 | |||
1023 | return status; | ||
1024 | } | ||
1025 | |||
1026 | /* TODO: This should eventually be a hash table! */ | ||
1027 | static struct ocfs2_net_wait_ctxt * __ocfs2_find_net_wait_ctxt(struct ocfs2_super *osb, | ||
1028 | u32 response_id) | ||
1029 | { | ||
1030 | struct list_head *p; | ||
1031 | struct ocfs2_net_wait_ctxt *w = NULL; | ||
1032 | |||
1033 | list_for_each(p, &osb->net_response_list) { | ||
1034 | w = list_entry(p, struct ocfs2_net_wait_ctxt, n_list); | ||
1035 | if (response_id == w->n_response_id) | ||
1036 | break; | ||
1037 | w = NULL; | ||
1038 | } | ||
1039 | |||
1040 | return w; | ||
1041 | } | ||
1042 | |||
1043 | /* Translate response codes into local node errno values */ | ||
1044 | static inline int ocfs2_translate_response(int response) | ||
1045 | { | ||
1046 | int ret; | ||
1047 | |||
1048 | switch (response) { | ||
1049 | case OCFS2_RESPONSE_OK: | ||
1050 | ret = 0; | ||
1051 | break; | ||
1052 | |||
1053 | case OCFS2_RESPONSE_BUSY: | ||
1054 | ret = -EBUSY; | ||
1055 | break; | ||
1056 | |||
1057 | default: | ||
1058 | ret = -EINVAL; | ||
1059 | } | ||
1060 | |||
1061 | return ret; | ||
1062 | } | ||
1063 | |||
1064 | static int ocfs2_handle_response_message(struct o2net_msg *msg, | ||
1065 | u32 len, | ||
1066 | void *data) | ||
1067 | { | ||
1068 | unsigned int response_id, node_num; | ||
1069 | int response_status; | ||
1070 | struct ocfs2_super *osb = data; | ||
1071 | struct ocfs2_response_msg *resp; | ||
1072 | struct ocfs2_net_wait_ctxt * w; | ||
1073 | struct ocfs2_net_response_cb *resp_cb; | ||
1074 | |||
1075 | resp = (struct ocfs2_response_msg *) msg->buf; | ||
1076 | |||
1077 | response_id = be32_to_cpu(resp->r_hdr.h_response_id); | ||
1078 | node_num = be32_to_cpu(resp->r_hdr.h_node_num); | ||
1079 | response_status = | ||
1080 | ocfs2_translate_response(be32_to_cpu(resp->r_response)); | ||
1081 | |||
1082 | mlog(0, "received response message:\n"); | ||
1083 | mlog(0, "h_response_id = %u\n", response_id); | ||
1084 | mlog(0, "h_request = %u\n", be32_to_cpu(resp->r_hdr.h_request)); | ||
1085 | mlog(0, "h_blkno = %"MLFu64"\n", be64_to_cpu(resp->r_hdr.h_blkno)); | ||
1086 | mlog(0, "h_generation = %u\n", be32_to_cpu(resp->r_hdr.h_generation)); | ||
1087 | mlog(0, "h_node_num = %u\n", node_num); | ||
1088 | mlog(0, "r_response = %d\n", response_status); | ||
1089 | |||
1090 | spin_lock(&osb->net_response_lock); | ||
1091 | w = __ocfs2_find_net_wait_ctxt(osb, response_id); | ||
1092 | if (!w) { | ||
1093 | mlog(0, "request not found!\n"); | ||
1094 | goto bail; | ||
1095 | } | ||
1096 | resp_cb = w->n_callback; | ||
1097 | |||
1098 | if (response_status && (!w->n_response)) { | ||
1099 | /* we only really need one negative response so don't | ||
1100 | * set it twice. */ | ||
1101 | w->n_response = response_status; | ||
1102 | } | ||
1103 | |||
1104 | if (resp_cb) { | ||
1105 | spin_unlock(&osb->net_response_lock); | ||
1106 | |||
1107 | resp_cb->rc_cb(resp_cb->rc_priv, resp); | ||
1108 | |||
1109 | spin_lock(&osb->net_response_lock); | ||
1110 | } | ||
1111 | |||
1112 | __ocfs2_mark_node_responded(osb, w, node_num); | ||
1113 | bail: | ||
1114 | spin_unlock(&osb->net_response_lock); | ||
1115 | |||
1116 | return 0; | ||
1117 | } | ||
1118 | |||
1119 | static int ocfs2_handle_vote_message(struct o2net_msg *msg, | ||
1120 | u32 len, | ||
1121 | void *data) | ||
1122 | { | ||
1123 | int status; | ||
1124 | struct ocfs2_super *osb = data; | ||
1125 | struct ocfs2_vote_work *work; | ||
1126 | |||
1127 | work = kmalloc(sizeof(struct ocfs2_vote_work), GFP_KERNEL); | ||
1128 | if (!work) { | ||
1129 | status = -ENOMEM; | ||
1130 | mlog_errno(status); | ||
1131 | goto bail; | ||
1132 | } | ||
1133 | |||
1134 | INIT_LIST_HEAD(&work->w_list); | ||
1135 | memcpy(&work->w_msg, msg->buf, sizeof(struct ocfs2_vote_msg)); | ||
1136 | |||
1137 | mlog(0, "scheduling vote request:\n"); | ||
1138 | mlog(0, "h_response_id = %u\n", | ||
1139 | be32_to_cpu(work->w_msg.v_hdr.h_response_id)); | ||
1140 | mlog(0, "h_request = %u\n", be32_to_cpu(work->w_msg.v_hdr.h_request)); | ||
1141 | mlog(0, "h_blkno = %"MLFu64"\n", | ||
1142 | be64_to_cpu(work->w_msg.v_hdr.h_blkno)); | ||
1143 | mlog(0, "h_generation = %u\n", | ||
1144 | be32_to_cpu(work->w_msg.v_hdr.h_generation)); | ||
1145 | mlog(0, "h_node_num = %u\n", | ||
1146 | be32_to_cpu(work->w_msg.v_hdr.h_node_num)); | ||
1147 | mlog(0, "v_generic1 = %u\n", be32_to_cpu(work->w_msg.md1.v_generic1)); | ||
1148 | |||
1149 | spin_lock(&osb->vote_task_lock); | ||
1150 | list_add_tail(&work->w_list, &osb->vote_list); | ||
1151 | osb->vote_count++; | ||
1152 | spin_unlock(&osb->vote_task_lock); | ||
1153 | |||
1154 | ocfs2_kick_vote_thread(osb); | ||
1155 | |||
1156 | status = 0; | ||
1157 | bail: | ||
1158 | return status; | ||
1159 | } | ||
1160 | |||
1161 | void ocfs2_unregister_net_handlers(struct ocfs2_super *osb) | ||
1162 | { | ||
1163 | if (!osb->net_key) | ||
1164 | return; | ||
1165 | |||
1166 | o2net_unregister_handler_list(&osb->osb_net_handlers); | ||
1167 | |||
1168 | if (!list_empty(&osb->net_response_list)) | ||
1169 | mlog(ML_ERROR, "net response list not empty!\n"); | ||
1170 | |||
1171 | osb->net_key = 0; | ||
1172 | } | ||
1173 | |||
1174 | int ocfs2_register_net_handlers(struct ocfs2_super *osb) | ||
1175 | { | ||
1176 | int status = 0; | ||
1177 | |||
1178 | status = o2net_register_handler(OCFS2_MESSAGE_TYPE_RESPONSE, | ||
1179 | osb->net_key, | ||
1180 | sizeof(struct ocfs2_response_msg), | ||
1181 | ocfs2_handle_response_message, | ||
1182 | osb, &osb->osb_net_handlers); | ||
1183 | if (status) { | ||
1184 | mlog_errno(status); | ||
1185 | goto bail; | ||
1186 | } | ||
1187 | |||
1188 | status = o2net_register_handler(OCFS2_MESSAGE_TYPE_VOTE, | ||
1189 | osb->net_key, | ||
1190 | sizeof(struct ocfs2_vote_msg), | ||
1191 | ocfs2_handle_vote_message, | ||
1192 | osb, &osb->osb_net_handlers); | ||
1193 | if (status) { | ||
1194 | mlog_errno(status); | ||
1195 | goto bail; | ||
1196 | } | ||
1197 | bail: | ||
1198 | if (status < 0) | ||
1199 | ocfs2_unregister_net_handlers(osb); | ||
1200 | |||
1201 | return status; | ||
1202 | } | ||
diff --git a/fs/ocfs2/vote.h b/fs/ocfs2/vote.h new file mode 100644 index 000000000000..9cce60703466 --- /dev/null +++ b/fs/ocfs2/vote.h | |||
@@ -0,0 +1,56 @@ | |||
1 | /* -*- mode: c; c-basic-offset: 8; -*- | ||
2 | * vim: noexpandtab sw=8 ts=8 sts=0: | ||
3 | * | ||
4 | * vote.h | ||
5 | * | ||
6 | * Declarations for the vote thread, vote requests and net handlers | ||
7 | * | ||
8 | * Copyright (C) 2002, 2004 Oracle. All rights reserved. | ||
9 | * | ||
10 | * This program is free software; you can redistribute it and/or | ||
11 | * modify it under the terms of the GNU General Public | ||
12 | * License as published by the Free Software Foundation; either | ||
13 | * version 2 of the License, or (at your option) any later version. | ||
14 | * | ||
15 | * This program is distributed in the hope that it will be useful, | ||
16 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
17 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | ||
18 | * General Public License for more details. | ||
19 | * | ||
20 | * You should have received a copy of the GNU General Public | ||
21 | * License along with this program; if not, write to the | ||
22 | * Free Software Foundation, Inc., 59 Temple Place - Suite 330, | ||
23 | * Boston, MA 02111-1307, USA. | ||
24 | */ | ||
25 | |||
26 | |||
27 | #ifndef VOTE_H | ||
28 | #define VOTE_H | ||
29 | |||
30 | int ocfs2_vote_thread(void *arg); | ||
31 | static inline void ocfs2_kick_vote_thread(struct ocfs2_super *osb) | ||
32 | { | ||
33 | spin_lock(&osb->vote_task_lock); | ||
34 | /* make sure the voting thread gets a swipe at whatever changes | ||
35 | * the caller may have made to the voting state */ | ||
36 | osb->vote_wake_sequence++; | ||
37 | spin_unlock(&osb->vote_task_lock); | ||
38 | wake_up(&osb->vote_event); | ||
39 | } | ||
40 | |||
41 | int ocfs2_request_delete_vote(struct inode *inode); | ||
42 | int ocfs2_request_unlink_vote(struct inode *inode, | ||
43 | struct dentry *dentry, | ||
44 | unsigned int nlink); | ||
45 | int ocfs2_request_rename_vote(struct inode *inode, | ||
46 | struct dentry *dentry); | ||
47 | int ocfs2_request_mount_vote(struct ocfs2_super *osb); | ||
48 | int ocfs2_request_umount_vote(struct ocfs2_super *osb); | ||
49 | int ocfs2_register_net_handlers(struct ocfs2_super *osb); | ||
50 | void ocfs2_unregister_net_handlers(struct ocfs2_super *osb); | ||
51 | |||
52 | void ocfs2_mark_inode_remotely_deleted(struct inode *inode); | ||
53 | |||
54 | void ocfs2_remove_node_from_vote_queues(struct ocfs2_super *osb, | ||
55 | int node_num); | ||
56 | #endif | ||