diff options
author | Mark Fasheh <mark.fasheh@oracle.com> | 2006-09-22 20:28:19 -0400 |
---|---|---|
committer | Mark Fasheh <mark.fasheh@oracle.com> | 2006-09-24 16:50:46 -0400 |
commit | 24c19ef40474c3930597f31ae233dc06319bd881 (patch) | |
tree | e05b1cf72435d25bf47e67b206aa376bbea33b7d /fs/ocfs2/inode.c | |
parent | f9e2d82e6395cfa0802446b54b63cc412089d82c (diff) |
ocfs2: Remove i_generation from inode lock names
OCFS2 puts inode meta data in the "lock value block" provided by the DLM.
Typically, i_generation is encoded in the lock name so that a deleted inode
on and a new one in the same block don't share the same lvb.
Unfortunately, that scheme means that the read in ocfs2_read_locked_inode()
is potentially thrown away as soon as the meta data lock is taken - we
cannot encode the lock name without first knowing i_generation, which
requires a disk read.
This patch encodes i_generation in the inode meta data lvb, and removes the
value from the inode meta data lock name. This way, the read can be covered
by a lock, and at the same time we can distinguish between an up to date and
a stale LVB.
This will help cold-cache stat(2) performance in particular.
Since this patch changes the protocol version, we take the opportunity to do
a minor re-organization of two of the LVB fields.
Signed-off-by: Mark Fasheh <mark.fasheh@oracle.com>
Diffstat (limited to 'fs/ocfs2/inode.c')
-rw-r--r-- | fs/ocfs2/inode.c | 146 |
1 files changed, 111 insertions, 35 deletions
diff --git a/fs/ocfs2/inode.c b/fs/ocfs2/inode.c index 66ca7a82b68a..69d3db569166 100644 --- a/fs/ocfs2/inode.c +++ b/fs/ocfs2/inode.c | |||
@@ -54,8 +54,6 @@ | |||
54 | 54 | ||
55 | #include "buffer_head_io.h" | 55 | #include "buffer_head_io.h" |
56 | 56 | ||
57 | #define OCFS2_FI_FLAG_NOWAIT 0x1 | ||
58 | #define OCFS2_FI_FLAG_DELETE 0x2 | ||
59 | struct ocfs2_find_inode_args | 57 | struct ocfs2_find_inode_args |
60 | { | 58 | { |
61 | u64 fi_blkno; | 59 | u64 fi_blkno; |
@@ -109,7 +107,7 @@ struct inode *ocfs2_ilookup_for_vote(struct ocfs2_super *osb, | |||
109 | return ilookup5(osb->sb, args.fi_ino, ocfs2_find_actor, &args); | 107 | return ilookup5(osb->sb, args.fi_ino, ocfs2_find_actor, &args); |
110 | } | 108 | } |
111 | 109 | ||
112 | struct inode *ocfs2_iget(struct ocfs2_super *osb, u64 blkno) | 110 | struct inode *ocfs2_iget(struct ocfs2_super *osb, u64 blkno, int flags) |
113 | { | 111 | { |
114 | struct inode *inode = NULL; | 112 | struct inode *inode = NULL; |
115 | struct super_block *sb = osb->sb; | 113 | struct super_block *sb = osb->sb; |
@@ -127,7 +125,7 @@ struct inode *ocfs2_iget(struct ocfs2_super *osb, u64 blkno) | |||
127 | } | 125 | } |
128 | 126 | ||
129 | args.fi_blkno = blkno; | 127 | args.fi_blkno = blkno; |
130 | args.fi_flags = 0; | 128 | args.fi_flags = flags; |
131 | args.fi_ino = ino_from_blkno(sb, blkno); | 129 | args.fi_ino = ino_from_blkno(sb, blkno); |
132 | 130 | ||
133 | inode = iget5_locked(sb, args.fi_ino, ocfs2_find_actor, | 131 | inode = iget5_locked(sb, args.fi_ino, ocfs2_find_actor, |
@@ -297,15 +295,11 @@ int ocfs2_populate_inode(struct inode *inode, struct ocfs2_dinode *fe, | |||
297 | OCFS2_I(inode)->ip_orphaned_slot = OCFS2_INVALID_SLOT; | 295 | OCFS2_I(inode)->ip_orphaned_slot = OCFS2_INVALID_SLOT; |
298 | OCFS2_I(inode)->ip_attr = le32_to_cpu(fe->i_attr); | 296 | OCFS2_I(inode)->ip_attr = le32_to_cpu(fe->i_attr); |
299 | 297 | ||
300 | if (create_ino) | ||
301 | inode->i_ino = ino_from_blkno(inode->i_sb, | ||
302 | le64_to_cpu(fe->i_blkno)); | ||
303 | |||
304 | mlog(0, "blkno = %llu, ino = %lu, create_ino = %s\n", | ||
305 | (unsigned long long)fe->i_blkno, inode->i_ino, create_ino ? "true" : "false"); | ||
306 | |||
307 | inode->i_nlink = le16_to_cpu(fe->i_links_count); | 298 | inode->i_nlink = le16_to_cpu(fe->i_links_count); |
308 | 299 | ||
300 | if (fe->i_flags & cpu_to_le32(OCFS2_SYSTEM_FL)) | ||
301 | OCFS2_I(inode)->ip_flags |= OCFS2_INODE_SYSTEM_FILE; | ||
302 | |||
309 | if (fe->i_flags & cpu_to_le32(OCFS2_LOCAL_ALLOC_FL)) { | 303 | if (fe->i_flags & cpu_to_le32(OCFS2_LOCAL_ALLOC_FL)) { |
310 | OCFS2_I(inode)->ip_flags |= OCFS2_INODE_BITMAP; | 304 | OCFS2_I(inode)->ip_flags |= OCFS2_INODE_BITMAP; |
311 | mlog(0, "local alloc inode: i_ino=%lu\n", inode->i_ino); | 305 | mlog(0, "local alloc inode: i_ino=%lu\n", inode->i_ino); |
@@ -343,12 +337,28 @@ int ocfs2_populate_inode(struct inode *inode, struct ocfs2_dinode *fe, | |||
343 | break; | 337 | break; |
344 | } | 338 | } |
345 | 339 | ||
340 | if (create_ino) { | ||
341 | inode->i_ino = ino_from_blkno(inode->i_sb, | ||
342 | le64_to_cpu(fe->i_blkno)); | ||
343 | |||
344 | /* | ||
345 | * If we ever want to create system files from kernel, | ||
346 | * the generation argument to | ||
347 | * ocfs2_inode_lock_res_init() will have to change. | ||
348 | */ | ||
349 | BUG_ON(fe->i_flags & cpu_to_le32(OCFS2_SYSTEM_FL)); | ||
350 | |||
351 | ocfs2_inode_lock_res_init(&OCFS2_I(inode)->ip_meta_lockres, | ||
352 | OCFS2_LOCK_TYPE_META, 0, inode); | ||
353 | } | ||
354 | |||
346 | ocfs2_inode_lock_res_init(&OCFS2_I(inode)->ip_rw_lockres, | 355 | ocfs2_inode_lock_res_init(&OCFS2_I(inode)->ip_rw_lockres, |
347 | OCFS2_LOCK_TYPE_RW, inode); | 356 | OCFS2_LOCK_TYPE_RW, inode->i_generation, |
348 | ocfs2_inode_lock_res_init(&OCFS2_I(inode)->ip_meta_lockres, | 357 | inode); |
349 | OCFS2_LOCK_TYPE_META, inode); | 358 | |
350 | ocfs2_inode_lock_res_init(&OCFS2_I(inode)->ip_data_lockres, | 359 | ocfs2_inode_lock_res_init(&OCFS2_I(inode)->ip_data_lockres, |
351 | OCFS2_LOCK_TYPE_DATA, inode); | 360 | OCFS2_LOCK_TYPE_DATA, inode->i_generation, |
361 | inode); | ||
352 | 362 | ||
353 | ocfs2_set_inode_flags(inode); | 363 | ocfs2_set_inode_flags(inode); |
354 | inode->i_flags |= S_NOATIME; | 364 | inode->i_flags |= S_NOATIME; |
@@ -366,15 +376,15 @@ static int ocfs2_read_locked_inode(struct inode *inode, | |||
366 | struct ocfs2_super *osb; | 376 | struct ocfs2_super *osb; |
367 | struct ocfs2_dinode *fe; | 377 | struct ocfs2_dinode *fe; |
368 | struct buffer_head *bh = NULL; | 378 | struct buffer_head *bh = NULL; |
369 | int status; | 379 | int status, can_lock; |
370 | int sysfile = 0; | 380 | u32 generation = 0; |
371 | 381 | ||
372 | mlog_entry("(0x%p, 0x%p)\n", inode, args); | 382 | mlog_entry("(0x%p, 0x%p)\n", inode, args); |
373 | 383 | ||
374 | status = -EINVAL; | 384 | status = -EINVAL; |
375 | if (inode == NULL || inode->i_sb == NULL) { | 385 | if (inode == NULL || inode->i_sb == NULL) { |
376 | mlog(ML_ERROR, "bad inode\n"); | 386 | mlog(ML_ERROR, "bad inode\n"); |
377 | goto bail; | 387 | return status; |
378 | } | 388 | } |
379 | sb = inode->i_sb; | 389 | sb = inode->i_sb; |
380 | osb = OCFS2_SB(sb); | 390 | osb = OCFS2_SB(sb); |
@@ -382,50 +392,110 @@ static int ocfs2_read_locked_inode(struct inode *inode, | |||
382 | if (!args) { | 392 | if (!args) { |
383 | mlog(ML_ERROR, "bad inode args\n"); | 393 | mlog(ML_ERROR, "bad inode args\n"); |
384 | make_bad_inode(inode); | 394 | make_bad_inode(inode); |
385 | goto bail; | 395 | return status; |
386 | } | 396 | } |
387 | 397 | ||
388 | /* Read the FE off disk. This is safe because the kernel only | 398 | /* |
389 | * does one read_inode2 for a new inode, and if it doesn't | 399 | * To improve performance of cold-cache inode stats, we take |
390 | * exist yet then nobody can be working on it! */ | 400 | * the cluster lock here if possible. |
391 | status = ocfs2_read_block(osb, args->fi_blkno, &bh, 0, NULL); | 401 | * |
402 | * Generally, OCFS2 never trusts the contents of an inode | ||
403 | * unless it's holding a cluster lock, so taking it here isn't | ||
404 | * a correctness issue as much as it is a performance | ||
405 | * improvement. | ||
406 | * | ||
407 | * There are three times when taking the lock is not a good idea: | ||
408 | * | ||
409 | * 1) During startup, before we have initialized the DLM. | ||
410 | * | ||
411 | * 2) If we are reading certain system files which never get | ||
412 | * cluster locks (local alloc, truncate log). | ||
413 | * | ||
414 | * 3) If the process doing the iget() is responsible for | ||
415 | * orphan dir recovery. We're holding the orphan dir lock and | ||
416 | * can get into a deadlock with another process on another | ||
417 | * node in ->delete_inode(). | ||
418 | * | ||
419 | * #1 and #2 can be simply solved by never taking the lock | ||
420 | * here for system files (which are the only type we read | ||
421 | * during mount). It's a heavier approach, but our main | ||
422 | * concern is user-accesible files anyway. | ||
423 | * | ||
424 | * #3 works itself out because we'll eventually take the | ||
425 | * cluster lock before trusting anything anyway. | ||
426 | */ | ||
427 | can_lock = !(args->fi_flags & OCFS2_FI_FLAG_SYSFILE) | ||
428 | && !(args->fi_flags & OCFS2_FI_FLAG_NOLOCK); | ||
429 | |||
430 | /* | ||
431 | * To maintain backwards compatibility with older versions of | ||
432 | * ocfs2-tools, we still store the generation value for system | ||
433 | * files. The only ones that actually matter to userspace are | ||
434 | * the journals, but it's easier and inexpensive to just flag | ||
435 | * all system files similarly. | ||
436 | */ | ||
437 | if (args->fi_flags & OCFS2_FI_FLAG_SYSFILE) | ||
438 | generation = osb->fs_generation; | ||
439 | |||
440 | ocfs2_inode_lock_res_init(&OCFS2_I(inode)->ip_meta_lockres, | ||
441 | OCFS2_LOCK_TYPE_META, | ||
442 | generation, inode); | ||
443 | |||
444 | if (can_lock) { | ||
445 | status = ocfs2_meta_lock(inode, NULL, NULL, 0); | ||
446 | if (status) { | ||
447 | make_bad_inode(inode); | ||
448 | mlog_errno(status); | ||
449 | return status; | ||
450 | } | ||
451 | } | ||
452 | |||
453 | status = ocfs2_read_block(osb, args->fi_blkno, &bh, 0, | ||
454 | can_lock ? inode : NULL); | ||
392 | if (status < 0) { | 455 | if (status < 0) { |
393 | mlog_errno(status); | 456 | mlog_errno(status); |
394 | make_bad_inode(inode); | ||
395 | goto bail; | 457 | goto bail; |
396 | } | 458 | } |
397 | 459 | ||
460 | status = -EINVAL; | ||
398 | fe = (struct ocfs2_dinode *) bh->b_data; | 461 | fe = (struct ocfs2_dinode *) bh->b_data; |
399 | if (!OCFS2_IS_VALID_DINODE(fe)) { | 462 | if (!OCFS2_IS_VALID_DINODE(fe)) { |
400 | mlog(ML_ERROR, "Invalid dinode #%llu: signature = %.*s\n", | 463 | mlog(ML_ERROR, "Invalid dinode #%llu: signature = %.*s\n", |
401 | (unsigned long long)fe->i_blkno, 7, fe->i_signature); | 464 | (unsigned long long)fe->i_blkno, 7, fe->i_signature); |
402 | make_bad_inode(inode); | ||
403 | goto bail; | 465 | goto bail; |
404 | } | 466 | } |
405 | 467 | ||
406 | if (fe->i_flags & cpu_to_le32(OCFS2_SYSTEM_FL)) | 468 | /* |
407 | sysfile = 1; | 469 | * This is a code bug. Right now the caller needs to |
470 | * understand whether it is asking for a system file inode or | ||
471 | * not so the proper lock names can be built. | ||
472 | */ | ||
473 | mlog_bug_on_msg(!!(fe->i_flags & cpu_to_le32(OCFS2_SYSTEM_FL)) != | ||
474 | !!(args->fi_flags & OCFS2_FI_FLAG_SYSFILE), | ||
475 | "Inode %llu: system file state is ambigous\n", | ||
476 | (unsigned long long)args->fi_blkno); | ||
408 | 477 | ||
409 | if (S_ISCHR(le16_to_cpu(fe->i_mode)) || | 478 | if (S_ISCHR(le16_to_cpu(fe->i_mode)) || |
410 | S_ISBLK(le16_to_cpu(fe->i_mode))) | 479 | S_ISBLK(le16_to_cpu(fe->i_mode))) |
411 | inode->i_rdev = huge_decode_dev(le64_to_cpu(fe->id1.dev1.i_rdev)); | 480 | inode->i_rdev = huge_decode_dev(le64_to_cpu(fe->id1.dev1.i_rdev)); |
412 | 481 | ||
413 | status = -EINVAL; | ||
414 | if (ocfs2_populate_inode(inode, fe, 0) < 0) { | 482 | if (ocfs2_populate_inode(inode, fe, 0) < 0) { |
415 | mlog(ML_ERROR, "populate failed! i_blkno=%llu, i_ino=%lu\n", | 483 | mlog(ML_ERROR, "populate failed! i_blkno=%llu, i_ino=%lu\n", |
416 | (unsigned long long)fe->i_blkno, inode->i_ino); | 484 | (unsigned long long)fe->i_blkno, inode->i_ino); |
417 | make_bad_inode(inode); | ||
418 | goto bail; | 485 | goto bail; |
419 | } | 486 | } |
420 | 487 | ||
421 | BUG_ON(args->fi_blkno != le64_to_cpu(fe->i_blkno)); | 488 | BUG_ON(args->fi_blkno != le64_to_cpu(fe->i_blkno)); |
422 | 489 | ||
423 | if (sysfile) | ||
424 | OCFS2_I(inode)->ip_flags |= OCFS2_INODE_SYSTEM_FILE; | ||
425 | |||
426 | status = 0; | 490 | status = 0; |
427 | 491 | ||
428 | bail: | 492 | bail: |
493 | if (can_lock) | ||
494 | ocfs2_meta_unlock(inode, 0); | ||
495 | |||
496 | if (status < 0) | ||
497 | make_bad_inode(inode); | ||
498 | |||
429 | if (args && bh) | 499 | if (args && bh) |
430 | brelse(bh); | 500 | brelse(bh); |
431 | 501 | ||
@@ -898,9 +968,15 @@ void ocfs2_delete_inode(struct inode *inode) | |||
898 | goto bail_unlock_inode; | 968 | goto bail_unlock_inode; |
899 | } | 969 | } |
900 | 970 | ||
901 | /* Mark the inode as successfully deleted. This is important | 971 | /* |
902 | * for ocfs2_clear_inode as it will check this flag and skip | 972 | * Mark the inode as successfully deleted. |
903 | * any checkpointing work */ | 973 | * |
974 | * This is important for ocfs2_clear_inode() as it will check | ||
975 | * this flag and skip any checkpointing work | ||
976 | * | ||
977 | * ocfs2_stuff_meta_lvb() also uses this flag to invalidate | ||
978 | * the LVB for other nodes. | ||
979 | */ | ||
904 | OCFS2_I(inode)->ip_flags |= OCFS2_INODE_DELETED; | 980 | OCFS2_I(inode)->ip_flags |= OCFS2_INODE_DELETED; |
905 | 981 | ||
906 | bail_unlock_inode: | 982 | bail_unlock_inode: |