aboutsummaryrefslogtreecommitdiffstats
path: root/fs/ocfs2/inode.c
diff options
context:
space:
mode:
authorMark Fasheh <mark.fasheh@oracle.com>2006-09-22 20:28:19 -0400
committerMark Fasheh <mark.fasheh@oracle.com>2006-09-24 16:50:46 -0400
commit24c19ef40474c3930597f31ae233dc06319bd881 (patch)
treee05b1cf72435d25bf47e67b206aa376bbea33b7d /fs/ocfs2/inode.c
parentf9e2d82e6395cfa0802446b54b63cc412089d82c (diff)
ocfs2: Remove i_generation from inode lock names
OCFS2 puts inode meta data in the "lock value block" (LVB) provided by the DLM. Typically, i_generation is encoded in the lock name so that a deleted inode and a new one in the same block don't share the same LVB. Unfortunately, that scheme means that the read in ocfs2_read_locked_inode() is potentially thrown away as soon as the meta data lock is taken - we cannot encode the lock name without first knowing i_generation, which requires a disk read. This patch encodes i_generation in the inode meta data LVB, and removes the value from the inode meta data lock name. This way, the read can be covered by a lock, and at the same time we can distinguish between an up-to-date and a stale LVB. This will help cold-cache stat(2) performance in particular. Since this patch changes the protocol version, we take the opportunity to do a minor re-organization of two of the LVB fields. Signed-off-by: Mark Fasheh <mark.fasheh@oracle.com>
Diffstat (limited to 'fs/ocfs2/inode.c')
-rw-r--r--fs/ocfs2/inode.c146
1 files changed, 111 insertions, 35 deletions
diff --git a/fs/ocfs2/inode.c b/fs/ocfs2/inode.c
index 66ca7a82b68a..69d3db569166 100644
--- a/fs/ocfs2/inode.c
+++ b/fs/ocfs2/inode.c
@@ -54,8 +54,6 @@
54 54
55#include "buffer_head_io.h" 55#include "buffer_head_io.h"
56 56
57#define OCFS2_FI_FLAG_NOWAIT 0x1
58#define OCFS2_FI_FLAG_DELETE 0x2
59struct ocfs2_find_inode_args 57struct ocfs2_find_inode_args
60{ 58{
61 u64 fi_blkno; 59 u64 fi_blkno;
@@ -109,7 +107,7 @@ struct inode *ocfs2_ilookup_for_vote(struct ocfs2_super *osb,
109 return ilookup5(osb->sb, args.fi_ino, ocfs2_find_actor, &args); 107 return ilookup5(osb->sb, args.fi_ino, ocfs2_find_actor, &args);
110} 108}
111 109
112struct inode *ocfs2_iget(struct ocfs2_super *osb, u64 blkno) 110struct inode *ocfs2_iget(struct ocfs2_super *osb, u64 blkno, int flags)
113{ 111{
114 struct inode *inode = NULL; 112 struct inode *inode = NULL;
115 struct super_block *sb = osb->sb; 113 struct super_block *sb = osb->sb;
@@ -127,7 +125,7 @@ struct inode *ocfs2_iget(struct ocfs2_super *osb, u64 blkno)
127 } 125 }
128 126
129 args.fi_blkno = blkno; 127 args.fi_blkno = blkno;
130 args.fi_flags = 0; 128 args.fi_flags = flags;
131 args.fi_ino = ino_from_blkno(sb, blkno); 129 args.fi_ino = ino_from_blkno(sb, blkno);
132 130
133 inode = iget5_locked(sb, args.fi_ino, ocfs2_find_actor, 131 inode = iget5_locked(sb, args.fi_ino, ocfs2_find_actor,
@@ -297,15 +295,11 @@ int ocfs2_populate_inode(struct inode *inode, struct ocfs2_dinode *fe,
297 OCFS2_I(inode)->ip_orphaned_slot = OCFS2_INVALID_SLOT; 295 OCFS2_I(inode)->ip_orphaned_slot = OCFS2_INVALID_SLOT;
298 OCFS2_I(inode)->ip_attr = le32_to_cpu(fe->i_attr); 296 OCFS2_I(inode)->ip_attr = le32_to_cpu(fe->i_attr);
299 297
300 if (create_ino)
301 inode->i_ino = ino_from_blkno(inode->i_sb,
302 le64_to_cpu(fe->i_blkno));
303
304 mlog(0, "blkno = %llu, ino = %lu, create_ino = %s\n",
305 (unsigned long long)fe->i_blkno, inode->i_ino, create_ino ? "true" : "false");
306
307 inode->i_nlink = le16_to_cpu(fe->i_links_count); 298 inode->i_nlink = le16_to_cpu(fe->i_links_count);
308 299
300 if (fe->i_flags & cpu_to_le32(OCFS2_SYSTEM_FL))
301 OCFS2_I(inode)->ip_flags |= OCFS2_INODE_SYSTEM_FILE;
302
309 if (fe->i_flags & cpu_to_le32(OCFS2_LOCAL_ALLOC_FL)) { 303 if (fe->i_flags & cpu_to_le32(OCFS2_LOCAL_ALLOC_FL)) {
310 OCFS2_I(inode)->ip_flags |= OCFS2_INODE_BITMAP; 304 OCFS2_I(inode)->ip_flags |= OCFS2_INODE_BITMAP;
311 mlog(0, "local alloc inode: i_ino=%lu\n", inode->i_ino); 305 mlog(0, "local alloc inode: i_ino=%lu\n", inode->i_ino);
@@ -343,12 +337,28 @@ int ocfs2_populate_inode(struct inode *inode, struct ocfs2_dinode *fe,
343 break; 337 break;
344 } 338 }
345 339
340 if (create_ino) {
341 inode->i_ino = ino_from_blkno(inode->i_sb,
342 le64_to_cpu(fe->i_blkno));
343
344 /*
345 * If we ever want to create system files from kernel,
346 * the generation argument to
347 * ocfs2_inode_lock_res_init() will have to change.
348 */
349 BUG_ON(fe->i_flags & cpu_to_le32(OCFS2_SYSTEM_FL));
350
351 ocfs2_inode_lock_res_init(&OCFS2_I(inode)->ip_meta_lockres,
352 OCFS2_LOCK_TYPE_META, 0, inode);
353 }
354
346 ocfs2_inode_lock_res_init(&OCFS2_I(inode)->ip_rw_lockres, 355 ocfs2_inode_lock_res_init(&OCFS2_I(inode)->ip_rw_lockres,
347 OCFS2_LOCK_TYPE_RW, inode); 356 OCFS2_LOCK_TYPE_RW, inode->i_generation,
348 ocfs2_inode_lock_res_init(&OCFS2_I(inode)->ip_meta_lockres, 357 inode);
349 OCFS2_LOCK_TYPE_META, inode); 358
350 ocfs2_inode_lock_res_init(&OCFS2_I(inode)->ip_data_lockres, 359 ocfs2_inode_lock_res_init(&OCFS2_I(inode)->ip_data_lockres,
351 OCFS2_LOCK_TYPE_DATA, inode); 360 OCFS2_LOCK_TYPE_DATA, inode->i_generation,
361 inode);
352 362
353 ocfs2_set_inode_flags(inode); 363 ocfs2_set_inode_flags(inode);
354 inode->i_flags |= S_NOATIME; 364 inode->i_flags |= S_NOATIME;
@@ -366,15 +376,15 @@ static int ocfs2_read_locked_inode(struct inode *inode,
366 struct ocfs2_super *osb; 376 struct ocfs2_super *osb;
367 struct ocfs2_dinode *fe; 377 struct ocfs2_dinode *fe;
368 struct buffer_head *bh = NULL; 378 struct buffer_head *bh = NULL;
369 int status; 379 int status, can_lock;
370 int sysfile = 0; 380 u32 generation = 0;
371 381
372 mlog_entry("(0x%p, 0x%p)\n", inode, args); 382 mlog_entry("(0x%p, 0x%p)\n", inode, args);
373 383
374 status = -EINVAL; 384 status = -EINVAL;
375 if (inode == NULL || inode->i_sb == NULL) { 385 if (inode == NULL || inode->i_sb == NULL) {
376 mlog(ML_ERROR, "bad inode\n"); 386 mlog(ML_ERROR, "bad inode\n");
377 goto bail; 387 return status;
378 } 388 }
379 sb = inode->i_sb; 389 sb = inode->i_sb;
380 osb = OCFS2_SB(sb); 390 osb = OCFS2_SB(sb);
@@ -382,50 +392,110 @@ static int ocfs2_read_locked_inode(struct inode *inode,
382 if (!args) { 392 if (!args) {
383 mlog(ML_ERROR, "bad inode args\n"); 393 mlog(ML_ERROR, "bad inode args\n");
384 make_bad_inode(inode); 394 make_bad_inode(inode);
385 goto bail; 395 return status;
386 } 396 }
387 397
388 /* Read the FE off disk. This is safe because the kernel only 398 /*
389 * does one read_inode2 for a new inode, and if it doesn't 399 * To improve performance of cold-cache inode stats, we take
390 * exist yet then nobody can be working on it! */ 400 * the cluster lock here if possible.
391 status = ocfs2_read_block(osb, args->fi_blkno, &bh, 0, NULL); 401 *
402 * Generally, OCFS2 never trusts the contents of an inode
403 * unless it's holding a cluster lock, so taking it here isn't
404 * a correctness issue as much as it is a performance
405 * improvement.
406 *
407 * There are three times when taking the lock is not a good idea:
408 *
409 * 1) During startup, before we have initialized the DLM.
410 *
411 * 2) If we are reading certain system files which never get
412 * cluster locks (local alloc, truncate log).
413 *
414 * 3) If the process doing the iget() is responsible for
415 * orphan dir recovery. We're holding the orphan dir lock and
416 * can get into a deadlock with another process on another
417 * node in ->delete_inode().
418 *
419 * #1 and #2 can be simply solved by never taking the lock
420 * here for system files (which are the only type we read
421 * during mount). It's a heavier approach, but our main
422 * concern is user-accessible files anyway.
423 *
424 * #3 works itself out because we'll eventually take the
425 * cluster lock before trusting anything anyway.
426 */
427 can_lock = !(args->fi_flags & OCFS2_FI_FLAG_SYSFILE)
428 && !(args->fi_flags & OCFS2_FI_FLAG_NOLOCK);
429
430 /*
431 * To maintain backwards compatibility with older versions of
432 * ocfs2-tools, we still store the generation value for system
433 * files. The only ones that actually matter to userspace are
434 * the journals, but it's easier and inexpensive to just flag
435 * all system files similarly.
436 */
437 if (args->fi_flags & OCFS2_FI_FLAG_SYSFILE)
438 generation = osb->fs_generation;
439
440 ocfs2_inode_lock_res_init(&OCFS2_I(inode)->ip_meta_lockres,
441 OCFS2_LOCK_TYPE_META,
442 generation, inode);
443
444 if (can_lock) {
445 status = ocfs2_meta_lock(inode, NULL, NULL, 0);
446 if (status) {
447 make_bad_inode(inode);
448 mlog_errno(status);
449 return status;
450 }
451 }
452
453 status = ocfs2_read_block(osb, args->fi_blkno, &bh, 0,
454 can_lock ? inode : NULL);
392 if (status < 0) { 455 if (status < 0) {
393 mlog_errno(status); 456 mlog_errno(status);
394 make_bad_inode(inode);
395 goto bail; 457 goto bail;
396 } 458 }
397 459
460 status = -EINVAL;
398 fe = (struct ocfs2_dinode *) bh->b_data; 461 fe = (struct ocfs2_dinode *) bh->b_data;
399 if (!OCFS2_IS_VALID_DINODE(fe)) { 462 if (!OCFS2_IS_VALID_DINODE(fe)) {
400 mlog(ML_ERROR, "Invalid dinode #%llu: signature = %.*s\n", 463 mlog(ML_ERROR, "Invalid dinode #%llu: signature = %.*s\n",
401 (unsigned long long)fe->i_blkno, 7, fe->i_signature); 464 (unsigned long long)fe->i_blkno, 7, fe->i_signature);
402 make_bad_inode(inode);
403 goto bail; 465 goto bail;
404 } 466 }
405 467
406 if (fe->i_flags & cpu_to_le32(OCFS2_SYSTEM_FL)) 468 /*
407 sysfile = 1; 469 * This is a code bug. Right now the caller needs to
470 * understand whether it is asking for a system file inode or
471 * not so the proper lock names can be built.
472 */
473 mlog_bug_on_msg(!!(fe->i_flags & cpu_to_le32(OCFS2_SYSTEM_FL)) !=
474 !!(args->fi_flags & OCFS2_FI_FLAG_SYSFILE),
475 "Inode %llu: system file state is ambigous\n",
476 (unsigned long long)args->fi_blkno);
408 477
409 if (S_ISCHR(le16_to_cpu(fe->i_mode)) || 478 if (S_ISCHR(le16_to_cpu(fe->i_mode)) ||
410 S_ISBLK(le16_to_cpu(fe->i_mode))) 479 S_ISBLK(le16_to_cpu(fe->i_mode)))
411 inode->i_rdev = huge_decode_dev(le64_to_cpu(fe->id1.dev1.i_rdev)); 480 inode->i_rdev = huge_decode_dev(le64_to_cpu(fe->id1.dev1.i_rdev));
412 481
413 status = -EINVAL;
414 if (ocfs2_populate_inode(inode, fe, 0) < 0) { 482 if (ocfs2_populate_inode(inode, fe, 0) < 0) {
415 mlog(ML_ERROR, "populate failed! i_blkno=%llu, i_ino=%lu\n", 483 mlog(ML_ERROR, "populate failed! i_blkno=%llu, i_ino=%lu\n",
416 (unsigned long long)fe->i_blkno, inode->i_ino); 484 (unsigned long long)fe->i_blkno, inode->i_ino);
417 make_bad_inode(inode);
418 goto bail; 485 goto bail;
419 } 486 }
420 487
421 BUG_ON(args->fi_blkno != le64_to_cpu(fe->i_blkno)); 488 BUG_ON(args->fi_blkno != le64_to_cpu(fe->i_blkno));
422 489
423 if (sysfile)
424 OCFS2_I(inode)->ip_flags |= OCFS2_INODE_SYSTEM_FILE;
425
426 status = 0; 490 status = 0;
427 491
428bail: 492bail:
493 if (can_lock)
494 ocfs2_meta_unlock(inode, 0);
495
496 if (status < 0)
497 make_bad_inode(inode);
498
429 if (args && bh) 499 if (args && bh)
430 brelse(bh); 500 brelse(bh);
431 501
@@ -898,9 +968,15 @@ void ocfs2_delete_inode(struct inode *inode)
898 goto bail_unlock_inode; 968 goto bail_unlock_inode;
899 } 969 }
900 970
901 /* Mark the inode as successfully deleted. This is important 971 /*
902 * for ocfs2_clear_inode as it will check this flag and skip 972 * Mark the inode as successfully deleted.
903 * any checkpointing work */ 973 *
974 * This is important for ocfs2_clear_inode() as it will check
975 * this flag and skip any checkpointing work
976 *
977 * ocfs2_stuff_meta_lvb() also uses this flag to invalidate
978 * the LVB for other nodes.
979 */
904 OCFS2_I(inode)->ip_flags |= OCFS2_INODE_DELETED; 980 OCFS2_I(inode)->ip_flags |= OCFS2_INODE_DELETED;
905 981
906bail_unlock_inode: 982bail_unlock_inode: