diff options
author | Mark Fasheh <mark.fasheh@oracle.com> | 2006-09-22 20:28:19 -0400 |
---|---|---|
committer | Mark Fasheh <mark.fasheh@oracle.com> | 2006-09-24 16:50:46 -0400 |
commit | 24c19ef40474c3930597f31ae233dc06319bd881 (patch) | |
tree | e05b1cf72435d25bf47e67b206aa376bbea33b7d | |
parent | f9e2d82e6395cfa0802446b54b63cc412089d82c (diff) |
ocfs2: Remove i_generation from inode lock names
OCFS2 puts inode meta data in the "lock value block" provided by the DLM.
Typically, i_generation is encoded in the lock name so that a deleted inode
and a new one in the same block don't share the same lvb.
Unfortunately, that scheme means that the read in ocfs2_read_locked_inode()
is potentially thrown away as soon as the meta data lock is taken - we
cannot encode the lock name without first knowing i_generation, which
requires a disk read.
This patch encodes i_generation in the inode meta data lvb, and removes the
value from the inode meta data lock name. This way, the read can be covered
by a lock, and at the same time we can distinguish between an up to date and
a stale LVB.
This will help cold-cache stat(2) performance in particular.
Since this patch changes the protocol version, we take the opportunity to do
a minor re-organization of two of the LVB fields.
Signed-off-by: Mark Fasheh <mark.fasheh@oracle.com>
-rw-r--r-- | fs/ocfs2/cluster/tcp_internal.h | 5 | ||||
-rw-r--r-- | fs/ocfs2/dlmglue.c | 42 | ||||
-rw-r--r-- | fs/ocfs2/dlmglue.h | 7 | ||||
-rw-r--r-- | fs/ocfs2/export.c | 4 | ||||
-rw-r--r-- | fs/ocfs2/inode.c | 146 | ||||
-rw-r--r-- | fs/ocfs2/inode.h | 8 | ||||
-rw-r--r-- | fs/ocfs2/journal.c | 3 | ||||
-rw-r--r-- | fs/ocfs2/namei.c | 2 | ||||
-rw-r--r-- | fs/ocfs2/super.c | 4 | ||||
-rw-r--r-- | fs/ocfs2/sysfile.c | 2 |
10 files changed, 170 insertions, 53 deletions
diff --git a/fs/ocfs2/cluster/tcp_internal.h b/fs/ocfs2/cluster/tcp_internal.h index da42b515cd1d..4b46aac7d243 100644 --- a/fs/ocfs2/cluster/tcp_internal.h +++ b/fs/ocfs2/cluster/tcp_internal.h | |||
@@ -44,6 +44,9 @@ | |||
44 | * locking semantics of the file system using the protocol. It should | 44 | * locking semantics of the file system using the protocol. It should |
45 | * be somewhere else, I'm sure, but right now it isn't. | 45 | * be somewhere else, I'm sure, but right now it isn't. |
46 | * | 46 | * |
47 | * New in version 4: | ||
48 | * - Remove i_generation from lock names for better stat performance. | ||
49 | * | ||
47 | * New in version 3: | 50 | * New in version 3: |
48 | * - Replace dentry votes with a cluster lock | 51 | * - Replace dentry votes with a cluster lock |
49 | * | 52 | * |
@@ -51,7 +54,7 @@ | |||
51 | * - full 64 bit i_size in the metadata lock lvbs | 54 | * - full 64 bit i_size in the metadata lock lvbs |
52 | * - introduction of "rw" lock and pushing meta/data locking down | 55 | * - introduction of "rw" lock and pushing meta/data locking down |
53 | */ | 56 | */ |
54 | #define O2NET_PROTOCOL_VERSION 3ULL | 57 | #define O2NET_PROTOCOL_VERSION 4ULL |
55 | struct o2net_handshake { | 58 | struct o2net_handshake { |
56 | __be64 protocol_version; | 59 | __be64 protocol_version; |
57 | __be64 connector_id; | 60 | __be64 connector_id; |
diff --git a/fs/ocfs2/dlmglue.c b/fs/ocfs2/dlmglue.c index 6cd84dffbbf4..ecb3cba22814 100644 --- a/fs/ocfs2/dlmglue.c +++ b/fs/ocfs2/dlmglue.c | |||
@@ -320,6 +320,7 @@ void ocfs2_lock_res_init_once(struct ocfs2_lock_res *res) | |||
320 | 320 | ||
321 | void ocfs2_inode_lock_res_init(struct ocfs2_lock_res *res, | 321 | void ocfs2_inode_lock_res_init(struct ocfs2_lock_res *res, |
322 | enum ocfs2_lock_type type, | 322 | enum ocfs2_lock_type type, |
323 | unsigned int generation, | ||
323 | struct inode *inode) | 324 | struct inode *inode) |
324 | { | 325 | { |
325 | struct ocfs2_lock_res_ops *ops; | 326 | struct ocfs2_lock_res_ops *ops; |
@@ -341,7 +342,7 @@ void ocfs2_inode_lock_res_init(struct ocfs2_lock_res *res, | |||
341 | }; | 342 | }; |
342 | 343 | ||
343 | ocfs2_build_lock_name(type, OCFS2_I(inode)->ip_blkno, | 344 | ocfs2_build_lock_name(type, OCFS2_I(inode)->ip_blkno, |
344 | inode->i_generation, res->l_name); | 345 | generation, res->l_name); |
345 | ocfs2_lock_res_init_common(OCFS2_SB(inode->i_sb), res, type, ops, inode); | 346 | ocfs2_lock_res_init_common(OCFS2_SB(inode->i_sb), res, type, ops, inode); |
346 | } | 347 | } |
347 | 348 | ||
@@ -1173,17 +1174,19 @@ static void ocfs2_cluster_unlock(struct ocfs2_super *osb, | |||
1173 | 1174 | ||
1174 | int ocfs2_create_new_lock(struct ocfs2_super *osb, | 1175 | int ocfs2_create_new_lock(struct ocfs2_super *osb, |
1175 | struct ocfs2_lock_res *lockres, | 1176 | struct ocfs2_lock_res *lockres, |
1176 | int ex) | 1177 | int ex, |
1178 | int local) | ||
1177 | { | 1179 | { |
1178 | int level = ex ? LKM_EXMODE : LKM_PRMODE; | 1180 | int level = ex ? LKM_EXMODE : LKM_PRMODE; |
1179 | unsigned long flags; | 1181 | unsigned long flags; |
1182 | int lkm_flags = local ? LKM_LOCAL : 0; | ||
1180 | 1183 | ||
1181 | spin_lock_irqsave(&lockres->l_lock, flags); | 1184 | spin_lock_irqsave(&lockres->l_lock, flags); |
1182 | BUG_ON(lockres->l_flags & OCFS2_LOCK_ATTACHED); | 1185 | BUG_ON(lockres->l_flags & OCFS2_LOCK_ATTACHED); |
1183 | lockres_or_flags(lockres, OCFS2_LOCK_LOCAL); | 1186 | lockres_or_flags(lockres, OCFS2_LOCK_LOCAL); |
1184 | spin_unlock_irqrestore(&lockres->l_lock, flags); | 1187 | spin_unlock_irqrestore(&lockres->l_lock, flags); |
1185 | 1188 | ||
1186 | return ocfs2_lock_create(osb, lockres, level, LKM_LOCAL); | 1189 | return ocfs2_lock_create(osb, lockres, level, lkm_flags); |
1187 | } | 1190 | } |
1188 | 1191 | ||
1189 | /* Grants us an EX lock on the data and metadata resources, skipping | 1192 | /* Grants us an EX lock on the data and metadata resources, skipping |
@@ -1212,19 +1215,23 @@ int ocfs2_create_new_inode_locks(struct inode *inode) | |||
1212 | * on a resource which has an invalid one -- we'll set it | 1215 | * on a resource which has an invalid one -- we'll set it |
1213 | * valid when we release the EX. */ | 1216 | * valid when we release the EX. */ |
1214 | 1217 | ||
1215 | ret = ocfs2_create_new_lock(osb, &OCFS2_I(inode)->ip_rw_lockres, 1); | 1218 | ret = ocfs2_create_new_lock(osb, &OCFS2_I(inode)->ip_rw_lockres, 1, 1); |
1216 | if (ret) { | 1219 | if (ret) { |
1217 | mlog_errno(ret); | 1220 | mlog_errno(ret); |
1218 | goto bail; | 1221 | goto bail; |
1219 | } | 1222 | } |
1220 | 1223 | ||
1221 | ret = ocfs2_create_new_lock(osb, &OCFS2_I(inode)->ip_meta_lockres, 1); | 1224 | /* |
1225 | * We don't want to use LKM_LOCAL on a meta data lock as they | ||
1226 | * don't use a generation in their lock names. | ||
1227 | */ | ||
1228 | ret = ocfs2_create_new_lock(osb, &OCFS2_I(inode)->ip_meta_lockres, 1, 0); | ||
1222 | if (ret) { | 1229 | if (ret) { |
1223 | mlog_errno(ret); | 1230 | mlog_errno(ret); |
1224 | goto bail; | 1231 | goto bail; |
1225 | } | 1232 | } |
1226 | 1233 | ||
1227 | ret = ocfs2_create_new_lock(osb, &OCFS2_I(inode)->ip_data_lockres, 1); | 1234 | ret = ocfs2_create_new_lock(osb, &OCFS2_I(inode)->ip_data_lockres, 1, 1); |
1228 | if (ret) { | 1235 | if (ret) { |
1229 | mlog_errno(ret); | 1236 | mlog_errno(ret); |
1230 | goto bail; | 1237 | goto bail; |
@@ -1413,6 +1420,16 @@ static void __ocfs2_stuff_meta_lvb(struct inode *inode) | |||
1413 | 1420 | ||
1414 | lvb = (struct ocfs2_meta_lvb *) lockres->l_lksb.lvb; | 1421 | lvb = (struct ocfs2_meta_lvb *) lockres->l_lksb.lvb; |
1415 | 1422 | ||
1423 | /* | ||
1424 | * Invalidate the LVB of a deleted inode - this way other | ||
1425 | * nodes are forced to go to disk and discover the new inode | ||
1426 | * status. | ||
1427 | */ | ||
1428 | if (oi->ip_flags & OCFS2_INODE_DELETED) { | ||
1429 | lvb->lvb_version = 0; | ||
1430 | goto out; | ||
1431 | } | ||
1432 | |||
1416 | lvb->lvb_version = OCFS2_LVB_VERSION; | 1433 | lvb->lvb_version = OCFS2_LVB_VERSION; |
1417 | lvb->lvb_isize = cpu_to_be64(i_size_read(inode)); | 1434 | lvb->lvb_isize = cpu_to_be64(i_size_read(inode)); |
1418 | lvb->lvb_iclusters = cpu_to_be32(oi->ip_clusters); | 1435 | lvb->lvb_iclusters = cpu_to_be32(oi->ip_clusters); |
@@ -1429,6 +1446,7 @@ static void __ocfs2_stuff_meta_lvb(struct inode *inode) | |||
1429 | lvb->lvb_iattr = cpu_to_be32(oi->ip_attr); | 1446 | lvb->lvb_iattr = cpu_to_be32(oi->ip_attr); |
1430 | lvb->lvb_igeneration = cpu_to_be32(inode->i_generation); | 1447 | lvb->lvb_igeneration = cpu_to_be32(inode->i_generation); |
1431 | 1448 | ||
1449 | out: | ||
1432 | mlog_meta_lvb(0, lockres); | 1450 | mlog_meta_lvb(0, lockres); |
1433 | 1451 | ||
1434 | mlog_exit_void(); | 1452 | mlog_exit_void(); |
@@ -1727,6 +1745,18 @@ int ocfs2_meta_lock_full(struct inode *inode, | |||
1727 | wait_event(osb->recovery_event, | 1745 | wait_event(osb->recovery_event, |
1728 | ocfs2_node_map_is_empty(osb, &osb->recovery_map)); | 1746 | ocfs2_node_map_is_empty(osb, &osb->recovery_map)); |
1729 | 1747 | ||
1748 | /* | ||
1749 | * We only see this flag if we're being called from | ||
1750 | * ocfs2_read_locked_inode(). It means we're locking an inode | ||
1751 | * which hasn't been populated yet, so clear the refresh flag | ||
1752 | * and let the caller handle it. | ||
1753 | */ | ||
1754 | if (inode->i_state & I_NEW) { | ||
1755 | status = 0; | ||
1756 | ocfs2_complete_lock_res_refresh(lockres, 0); | ||
1757 | goto bail; | ||
1758 | } | ||
1759 | |||
1730 | /* This is fun. The caller may want a bh back, or it may | 1760 | /* This is fun. The caller may want a bh back, or it may |
1731 | * not. ocfs2_meta_lock_update definitely wants one in, but | 1761 | * not. ocfs2_meta_lock_update definitely wants one in, but |
1732 | * may or may not read one, depending on what's in the | 1762 | * may or may not read one, depending on what's in the |
diff --git a/fs/ocfs2/dlmglue.h b/fs/ocfs2/dlmglue.h index 45a74f44b688..4a2769387229 100644 --- a/fs/ocfs2/dlmglue.h +++ b/fs/ocfs2/dlmglue.h | |||
@@ -32,9 +32,9 @@ | |||
32 | #define OCFS2_LVB_VERSION 4 | 32 | #define OCFS2_LVB_VERSION 4 |
33 | 33 | ||
34 | struct ocfs2_meta_lvb { | 34 | struct ocfs2_meta_lvb { |
35 | __be16 lvb_reserved0; | ||
36 | __u8 lvb_reserved1; | ||
37 | __u8 lvb_version; | 35 | __u8 lvb_version; |
36 | __u8 lvb_reserved0; | ||
37 | __be16 lvb_reserved1; | ||
38 | __be32 lvb_iclusters; | 38 | __be32 lvb_iclusters; |
39 | __be32 lvb_iuid; | 39 | __be32 lvb_iuid; |
40 | __be32 lvb_igid; | 40 | __be32 lvb_igid; |
@@ -62,13 +62,14 @@ void ocfs2_dlm_shutdown(struct ocfs2_super *osb); | |||
62 | void ocfs2_lock_res_init_once(struct ocfs2_lock_res *res); | 62 | void ocfs2_lock_res_init_once(struct ocfs2_lock_res *res); |
63 | void ocfs2_inode_lock_res_init(struct ocfs2_lock_res *res, | 63 | void ocfs2_inode_lock_res_init(struct ocfs2_lock_res *res, |
64 | enum ocfs2_lock_type type, | 64 | enum ocfs2_lock_type type, |
65 | unsigned int generation, | ||
65 | struct inode *inode); | 66 | struct inode *inode); |
66 | void ocfs2_dentry_lock_res_init(struct ocfs2_dentry_lock *dl, | 67 | void ocfs2_dentry_lock_res_init(struct ocfs2_dentry_lock *dl, |
67 | u64 parent, struct inode *inode); | 68 | u64 parent, struct inode *inode); |
68 | void ocfs2_lock_res_free(struct ocfs2_lock_res *res); | 69 | void ocfs2_lock_res_free(struct ocfs2_lock_res *res); |
69 | int ocfs2_create_new_inode_locks(struct inode *inode); | 70 | int ocfs2_create_new_inode_locks(struct inode *inode); |
70 | int ocfs2_create_new_lock(struct ocfs2_super *osb, | 71 | int ocfs2_create_new_lock(struct ocfs2_super *osb, |
71 | struct ocfs2_lock_res *lockres, int ex); | 72 | struct ocfs2_lock_res *lockres, int ex, int local); |
72 | int ocfs2_drop_inode_locks(struct inode *inode); | 73 | int ocfs2_drop_inode_locks(struct inode *inode); |
73 | int ocfs2_data_lock_full(struct inode *inode, | 74 | int ocfs2_data_lock_full(struct inode *inode, |
74 | int write, | 75 | int write, |
diff --git a/fs/ocfs2/export.c b/fs/ocfs2/export.c index ffcd79749e0d..fb91089a60a7 100644 --- a/fs/ocfs2/export.c +++ b/fs/ocfs2/export.c | |||
@@ -58,7 +58,7 @@ static struct dentry *ocfs2_get_dentry(struct super_block *sb, void *vobjp) | |||
58 | return ERR_PTR(-ESTALE); | 58 | return ERR_PTR(-ESTALE); |
59 | } | 59 | } |
60 | 60 | ||
61 | inode = ocfs2_iget(OCFS2_SB(sb), handle->ih_blkno); | 61 | inode = ocfs2_iget(OCFS2_SB(sb), handle->ih_blkno, 0); |
62 | 62 | ||
63 | if (IS_ERR(inode)) { | 63 | if (IS_ERR(inode)) { |
64 | mlog_errno(PTR_ERR(inode)); | 64 | mlog_errno(PTR_ERR(inode)); |
@@ -115,7 +115,7 @@ static struct dentry *ocfs2_get_parent(struct dentry *child) | |||
115 | goto bail_unlock; | 115 | goto bail_unlock; |
116 | } | 116 | } |
117 | 117 | ||
118 | inode = ocfs2_iget(OCFS2_SB(dir->i_sb), blkno); | 118 | inode = ocfs2_iget(OCFS2_SB(dir->i_sb), blkno, 0); |
119 | if (IS_ERR(inode)) { | 119 | if (IS_ERR(inode)) { |
120 | mlog(ML_ERROR, "Unable to create inode %llu\n", | 120 | mlog(ML_ERROR, "Unable to create inode %llu\n", |
121 | (unsigned long long)blkno); | 121 | (unsigned long long)blkno); |
diff --git a/fs/ocfs2/inode.c b/fs/ocfs2/inode.c index 66ca7a82b68a..69d3db569166 100644 --- a/fs/ocfs2/inode.c +++ b/fs/ocfs2/inode.c | |||
@@ -54,8 +54,6 @@ | |||
54 | 54 | ||
55 | #include "buffer_head_io.h" | 55 | #include "buffer_head_io.h" |
56 | 56 | ||
57 | #define OCFS2_FI_FLAG_NOWAIT 0x1 | ||
58 | #define OCFS2_FI_FLAG_DELETE 0x2 | ||
59 | struct ocfs2_find_inode_args | 57 | struct ocfs2_find_inode_args |
60 | { | 58 | { |
61 | u64 fi_blkno; | 59 | u64 fi_blkno; |
@@ -109,7 +107,7 @@ struct inode *ocfs2_ilookup_for_vote(struct ocfs2_super *osb, | |||
109 | return ilookup5(osb->sb, args.fi_ino, ocfs2_find_actor, &args); | 107 | return ilookup5(osb->sb, args.fi_ino, ocfs2_find_actor, &args); |
110 | } | 108 | } |
111 | 109 | ||
112 | struct inode *ocfs2_iget(struct ocfs2_super *osb, u64 blkno) | 110 | struct inode *ocfs2_iget(struct ocfs2_super *osb, u64 blkno, int flags) |
113 | { | 111 | { |
114 | struct inode *inode = NULL; | 112 | struct inode *inode = NULL; |
115 | struct super_block *sb = osb->sb; | 113 | struct super_block *sb = osb->sb; |
@@ -127,7 +125,7 @@ struct inode *ocfs2_iget(struct ocfs2_super *osb, u64 blkno) | |||
127 | } | 125 | } |
128 | 126 | ||
129 | args.fi_blkno = blkno; | 127 | args.fi_blkno = blkno; |
130 | args.fi_flags = 0; | 128 | args.fi_flags = flags; |
131 | args.fi_ino = ino_from_blkno(sb, blkno); | 129 | args.fi_ino = ino_from_blkno(sb, blkno); |
132 | 130 | ||
133 | inode = iget5_locked(sb, args.fi_ino, ocfs2_find_actor, | 131 | inode = iget5_locked(sb, args.fi_ino, ocfs2_find_actor, |
@@ -297,15 +295,11 @@ int ocfs2_populate_inode(struct inode *inode, struct ocfs2_dinode *fe, | |||
297 | OCFS2_I(inode)->ip_orphaned_slot = OCFS2_INVALID_SLOT; | 295 | OCFS2_I(inode)->ip_orphaned_slot = OCFS2_INVALID_SLOT; |
298 | OCFS2_I(inode)->ip_attr = le32_to_cpu(fe->i_attr); | 296 | OCFS2_I(inode)->ip_attr = le32_to_cpu(fe->i_attr); |
299 | 297 | ||
300 | if (create_ino) | ||
301 | inode->i_ino = ino_from_blkno(inode->i_sb, | ||
302 | le64_to_cpu(fe->i_blkno)); | ||
303 | |||
304 | mlog(0, "blkno = %llu, ino = %lu, create_ino = %s\n", | ||
305 | (unsigned long long)fe->i_blkno, inode->i_ino, create_ino ? "true" : "false"); | ||
306 | |||
307 | inode->i_nlink = le16_to_cpu(fe->i_links_count); | 298 | inode->i_nlink = le16_to_cpu(fe->i_links_count); |
308 | 299 | ||
300 | if (fe->i_flags & cpu_to_le32(OCFS2_SYSTEM_FL)) | ||
301 | OCFS2_I(inode)->ip_flags |= OCFS2_INODE_SYSTEM_FILE; | ||
302 | |||
309 | if (fe->i_flags & cpu_to_le32(OCFS2_LOCAL_ALLOC_FL)) { | 303 | if (fe->i_flags & cpu_to_le32(OCFS2_LOCAL_ALLOC_FL)) { |
310 | OCFS2_I(inode)->ip_flags |= OCFS2_INODE_BITMAP; | 304 | OCFS2_I(inode)->ip_flags |= OCFS2_INODE_BITMAP; |
311 | mlog(0, "local alloc inode: i_ino=%lu\n", inode->i_ino); | 305 | mlog(0, "local alloc inode: i_ino=%lu\n", inode->i_ino); |
@@ -343,12 +337,28 @@ int ocfs2_populate_inode(struct inode *inode, struct ocfs2_dinode *fe, | |||
343 | break; | 337 | break; |
344 | } | 338 | } |
345 | 339 | ||
340 | if (create_ino) { | ||
341 | inode->i_ino = ino_from_blkno(inode->i_sb, | ||
342 | le64_to_cpu(fe->i_blkno)); | ||
343 | |||
344 | /* | ||
345 | * If we ever want to create system files from kernel, | ||
346 | * the generation argument to | ||
347 | * ocfs2_inode_lock_res_init() will have to change. | ||
348 | */ | ||
349 | BUG_ON(fe->i_flags & cpu_to_le32(OCFS2_SYSTEM_FL)); | ||
350 | |||
351 | ocfs2_inode_lock_res_init(&OCFS2_I(inode)->ip_meta_lockres, | ||
352 | OCFS2_LOCK_TYPE_META, 0, inode); | ||
353 | } | ||
354 | |||
346 | ocfs2_inode_lock_res_init(&OCFS2_I(inode)->ip_rw_lockres, | 355 | ocfs2_inode_lock_res_init(&OCFS2_I(inode)->ip_rw_lockres, |
347 | OCFS2_LOCK_TYPE_RW, inode); | 356 | OCFS2_LOCK_TYPE_RW, inode->i_generation, |
348 | ocfs2_inode_lock_res_init(&OCFS2_I(inode)->ip_meta_lockres, | 357 | inode); |
349 | OCFS2_LOCK_TYPE_META, inode); | 358 | |
350 | ocfs2_inode_lock_res_init(&OCFS2_I(inode)->ip_data_lockres, | 359 | ocfs2_inode_lock_res_init(&OCFS2_I(inode)->ip_data_lockres, |
351 | OCFS2_LOCK_TYPE_DATA, inode); | 360 | OCFS2_LOCK_TYPE_DATA, inode->i_generation, |
361 | inode); | ||
352 | 362 | ||
353 | ocfs2_set_inode_flags(inode); | 363 | ocfs2_set_inode_flags(inode); |
354 | inode->i_flags |= S_NOATIME; | 364 | inode->i_flags |= S_NOATIME; |
@@ -366,15 +376,15 @@ static int ocfs2_read_locked_inode(struct inode *inode, | |||
366 | struct ocfs2_super *osb; | 376 | struct ocfs2_super *osb; |
367 | struct ocfs2_dinode *fe; | 377 | struct ocfs2_dinode *fe; |
368 | struct buffer_head *bh = NULL; | 378 | struct buffer_head *bh = NULL; |
369 | int status; | 379 | int status, can_lock; |
370 | int sysfile = 0; | 380 | u32 generation = 0; |
371 | 381 | ||
372 | mlog_entry("(0x%p, 0x%p)\n", inode, args); | 382 | mlog_entry("(0x%p, 0x%p)\n", inode, args); |
373 | 383 | ||
374 | status = -EINVAL; | 384 | status = -EINVAL; |
375 | if (inode == NULL || inode->i_sb == NULL) { | 385 | if (inode == NULL || inode->i_sb == NULL) { |
376 | mlog(ML_ERROR, "bad inode\n"); | 386 | mlog(ML_ERROR, "bad inode\n"); |
377 | goto bail; | 387 | return status; |
378 | } | 388 | } |
379 | sb = inode->i_sb; | 389 | sb = inode->i_sb; |
380 | osb = OCFS2_SB(sb); | 390 | osb = OCFS2_SB(sb); |
@@ -382,50 +392,110 @@ static int ocfs2_read_locked_inode(struct inode *inode, | |||
382 | if (!args) { | 392 | if (!args) { |
383 | mlog(ML_ERROR, "bad inode args\n"); | 393 | mlog(ML_ERROR, "bad inode args\n"); |
384 | make_bad_inode(inode); | 394 | make_bad_inode(inode); |
385 | goto bail; | 395 | return status; |
386 | } | 396 | } |
387 | 397 | ||
388 | /* Read the FE off disk. This is safe because the kernel only | 398 | /* |
389 | * does one read_inode2 for a new inode, and if it doesn't | 399 | * To improve performance of cold-cache inode stats, we take |
390 | * exist yet then nobody can be working on it! */ | 400 | * the cluster lock here if possible. |
391 | status = ocfs2_read_block(osb, args->fi_blkno, &bh, 0, NULL); | 401 | * |
402 | * Generally, OCFS2 never trusts the contents of an inode | ||
403 | * unless it's holding a cluster lock, so taking it here isn't | ||
404 | * a correctness issue as much as it is a performance | ||
405 | * improvement. | ||
406 | * | ||
407 | * There are three times when taking the lock is not a good idea: | ||
408 | * | ||
409 | * 1) During startup, before we have initialized the DLM. | ||
410 | * | ||
411 | * 2) If we are reading certain system files which never get | ||
412 | * cluster locks (local alloc, truncate log). | ||
413 | * | ||
414 | * 3) If the process doing the iget() is responsible for | ||
415 | * orphan dir recovery. We're holding the orphan dir lock and | ||
416 | * can get into a deadlock with another process on another | ||
417 | * node in ->delete_inode(). | ||
418 | * | ||
419 | * #1 and #2 can be simply solved by never taking the lock | ||
420 | * here for system files (which are the only type we read | ||
421 | * during mount). It's a heavier approach, but our main | ||
422 | * concern is user-accesible files anyway. | ||
423 | * | ||
424 | * #3 works itself out because we'll eventually take the | ||
425 | * cluster lock before trusting anything anyway. | ||
426 | */ | ||
427 | can_lock = !(args->fi_flags & OCFS2_FI_FLAG_SYSFILE) | ||
428 | && !(args->fi_flags & OCFS2_FI_FLAG_NOLOCK); | ||
429 | |||
430 | /* | ||
431 | * To maintain backwards compatibility with older versions of | ||
432 | * ocfs2-tools, we still store the generation value for system | ||
433 | * files. The only ones that actually matter to userspace are | ||
434 | * the journals, but it's easier and inexpensive to just flag | ||
435 | * all system files similarly. | ||
436 | */ | ||
437 | if (args->fi_flags & OCFS2_FI_FLAG_SYSFILE) | ||
438 | generation = osb->fs_generation; | ||
439 | |||
440 | ocfs2_inode_lock_res_init(&OCFS2_I(inode)->ip_meta_lockres, | ||
441 | OCFS2_LOCK_TYPE_META, | ||
442 | generation, inode); | ||
443 | |||
444 | if (can_lock) { | ||
445 | status = ocfs2_meta_lock(inode, NULL, NULL, 0); | ||
446 | if (status) { | ||
447 | make_bad_inode(inode); | ||
448 | mlog_errno(status); | ||
449 | return status; | ||
450 | } | ||
451 | } | ||
452 | |||
453 | status = ocfs2_read_block(osb, args->fi_blkno, &bh, 0, | ||
454 | can_lock ? inode : NULL); | ||
392 | if (status < 0) { | 455 | if (status < 0) { |
393 | mlog_errno(status); | 456 | mlog_errno(status); |
394 | make_bad_inode(inode); | ||
395 | goto bail; | 457 | goto bail; |
396 | } | 458 | } |
397 | 459 | ||
460 | status = -EINVAL; | ||
398 | fe = (struct ocfs2_dinode *) bh->b_data; | 461 | fe = (struct ocfs2_dinode *) bh->b_data; |
399 | if (!OCFS2_IS_VALID_DINODE(fe)) { | 462 | if (!OCFS2_IS_VALID_DINODE(fe)) { |
400 | mlog(ML_ERROR, "Invalid dinode #%llu: signature = %.*s\n", | 463 | mlog(ML_ERROR, "Invalid dinode #%llu: signature = %.*s\n", |
401 | (unsigned long long)fe->i_blkno, 7, fe->i_signature); | 464 | (unsigned long long)fe->i_blkno, 7, fe->i_signature); |
402 | make_bad_inode(inode); | ||
403 | goto bail; | 465 | goto bail; |
404 | } | 466 | } |
405 | 467 | ||
406 | if (fe->i_flags & cpu_to_le32(OCFS2_SYSTEM_FL)) | 468 | /* |
407 | sysfile = 1; | 469 | * This is a code bug. Right now the caller needs to |
470 | * understand whether it is asking for a system file inode or | ||
471 | * not so the proper lock names can be built. | ||
472 | */ | ||
473 | mlog_bug_on_msg(!!(fe->i_flags & cpu_to_le32(OCFS2_SYSTEM_FL)) != | ||
474 | !!(args->fi_flags & OCFS2_FI_FLAG_SYSFILE), | ||
475 | "Inode %llu: system file state is ambigous\n", | ||
476 | (unsigned long long)args->fi_blkno); | ||
408 | 477 | ||
409 | if (S_ISCHR(le16_to_cpu(fe->i_mode)) || | 478 | if (S_ISCHR(le16_to_cpu(fe->i_mode)) || |
410 | S_ISBLK(le16_to_cpu(fe->i_mode))) | 479 | S_ISBLK(le16_to_cpu(fe->i_mode))) |
411 | inode->i_rdev = huge_decode_dev(le64_to_cpu(fe->id1.dev1.i_rdev)); | 480 | inode->i_rdev = huge_decode_dev(le64_to_cpu(fe->id1.dev1.i_rdev)); |
412 | 481 | ||
413 | status = -EINVAL; | ||
414 | if (ocfs2_populate_inode(inode, fe, 0) < 0) { | 482 | if (ocfs2_populate_inode(inode, fe, 0) < 0) { |
415 | mlog(ML_ERROR, "populate failed! i_blkno=%llu, i_ino=%lu\n", | 483 | mlog(ML_ERROR, "populate failed! i_blkno=%llu, i_ino=%lu\n", |
416 | (unsigned long long)fe->i_blkno, inode->i_ino); | 484 | (unsigned long long)fe->i_blkno, inode->i_ino); |
417 | make_bad_inode(inode); | ||
418 | goto bail; | 485 | goto bail; |
419 | } | 486 | } |
420 | 487 | ||
421 | BUG_ON(args->fi_blkno != le64_to_cpu(fe->i_blkno)); | 488 | BUG_ON(args->fi_blkno != le64_to_cpu(fe->i_blkno)); |
422 | 489 | ||
423 | if (sysfile) | ||
424 | OCFS2_I(inode)->ip_flags |= OCFS2_INODE_SYSTEM_FILE; | ||
425 | |||
426 | status = 0; | 490 | status = 0; |
427 | 491 | ||
428 | bail: | 492 | bail: |
493 | if (can_lock) | ||
494 | ocfs2_meta_unlock(inode, 0); | ||
495 | |||
496 | if (status < 0) | ||
497 | make_bad_inode(inode); | ||
498 | |||
429 | if (args && bh) | 499 | if (args && bh) |
430 | brelse(bh); | 500 | brelse(bh); |
431 | 501 | ||
@@ -898,9 +968,15 @@ void ocfs2_delete_inode(struct inode *inode) | |||
898 | goto bail_unlock_inode; | 968 | goto bail_unlock_inode; |
899 | } | 969 | } |
900 | 970 | ||
901 | /* Mark the inode as successfully deleted. This is important | 971 | /* |
902 | * for ocfs2_clear_inode as it will check this flag and skip | 972 | * Mark the inode as successfully deleted. |
903 | * any checkpointing work */ | 973 | * |
974 | * This is important for ocfs2_clear_inode() as it will check | ||
975 | * this flag and skip any checkpointing work | ||
976 | * | ||
977 | * ocfs2_stuff_meta_lvb() also uses this flag to invalidate | ||
978 | * the LVB for other nodes. | ||
979 | */ | ||
904 | OCFS2_I(inode)->ip_flags |= OCFS2_INODE_DELETED; | 980 | OCFS2_I(inode)->ip_flags |= OCFS2_INODE_DELETED; |
905 | 981 | ||
906 | bail_unlock_inode: | 982 | bail_unlock_inode: |
diff --git a/fs/ocfs2/inode.h b/fs/ocfs2/inode.h index 4d1e53992566..9957810fdf85 100644 --- a/fs/ocfs2/inode.h +++ b/fs/ocfs2/inode.h | |||
@@ -122,7 +122,13 @@ struct buffer_head *ocfs2_bread(struct inode *inode, int block, | |||
122 | void ocfs2_clear_inode(struct inode *inode); | 122 | void ocfs2_clear_inode(struct inode *inode); |
123 | void ocfs2_delete_inode(struct inode *inode); | 123 | void ocfs2_delete_inode(struct inode *inode); |
124 | void ocfs2_drop_inode(struct inode *inode); | 124 | void ocfs2_drop_inode(struct inode *inode); |
125 | struct inode *ocfs2_iget(struct ocfs2_super *osb, u64 feoff); | 125 | |
126 | /* Flags for ocfs2_iget() */ | ||
127 | #define OCFS2_FI_FLAG_NOWAIT 0x1 | ||
128 | #define OCFS2_FI_FLAG_DELETE 0x2 | ||
129 | #define OCFS2_FI_FLAG_SYSFILE 0x4 | ||
130 | #define OCFS2_FI_FLAG_NOLOCK 0x8 | ||
131 | struct inode *ocfs2_iget(struct ocfs2_super *osb, u64 feoff, int flags); | ||
126 | struct inode *ocfs2_ilookup_for_vote(struct ocfs2_super *osb, | 132 | struct inode *ocfs2_ilookup_for_vote(struct ocfs2_super *osb, |
127 | u64 blkno, | 133 | u64 blkno, |
128 | int delete_vote); | 134 | int delete_vote); |
diff --git a/fs/ocfs2/journal.c b/fs/ocfs2/journal.c index f92bf1dd379a..fd9734def551 100644 --- a/fs/ocfs2/journal.c +++ b/fs/ocfs2/journal.c | |||
@@ -1493,7 +1493,8 @@ static int ocfs2_queue_orphans(struct ocfs2_super *osb, | |||
1493 | if (de->name_len == 2 && !strncmp("..", de->name, 2)) | 1493 | if (de->name_len == 2 && !strncmp("..", de->name, 2)) |
1494 | continue; | 1494 | continue; |
1495 | 1495 | ||
1496 | iter = ocfs2_iget(osb, le64_to_cpu(de->inode)); | 1496 | iter = ocfs2_iget(osb, le64_to_cpu(de->inode), |
1497 | OCFS2_FI_FLAG_NOLOCK); | ||
1497 | if (IS_ERR(iter)) | 1498 | if (IS_ERR(iter)) |
1498 | continue; | 1499 | continue; |
1499 | 1500 | ||
diff --git a/fs/ocfs2/namei.c b/fs/ocfs2/namei.c index 6fa978874c33..849c3b4bb94a 100644 --- a/fs/ocfs2/namei.c +++ b/fs/ocfs2/namei.c | |||
@@ -179,7 +179,7 @@ static struct dentry *ocfs2_lookup(struct inode *dir, struct dentry *dentry, | |||
179 | if (status < 0) | 179 | if (status < 0) |
180 | goto bail_add; | 180 | goto bail_add; |
181 | 181 | ||
182 | inode = ocfs2_iget(OCFS2_SB(dir->i_sb), blkno); | 182 | inode = ocfs2_iget(OCFS2_SB(dir->i_sb), blkno, 0); |
183 | if (IS_ERR(inode)) { | 183 | if (IS_ERR(inode)) { |
184 | mlog(ML_ERROR, "Unable to create inode %llu\n", | 184 | mlog(ML_ERROR, "Unable to create inode %llu\n", |
185 | (unsigned long long)blkno); | 185 | (unsigned long long)blkno); |
diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c index 33a6de6fc612..4c29cd7cc8e6 100644 --- a/fs/ocfs2/super.c +++ b/fs/ocfs2/super.c | |||
@@ -202,7 +202,7 @@ static int ocfs2_init_global_system_inodes(struct ocfs2_super *osb) | |||
202 | 202 | ||
203 | mlog_entry_void(); | 203 | mlog_entry_void(); |
204 | 204 | ||
205 | new = ocfs2_iget(osb, osb->root_blkno); | 205 | new = ocfs2_iget(osb, osb->root_blkno, OCFS2_FI_FLAG_SYSFILE); |
206 | if (IS_ERR(new)) { | 206 | if (IS_ERR(new)) { |
207 | status = PTR_ERR(new); | 207 | status = PTR_ERR(new); |
208 | mlog_errno(status); | 208 | mlog_errno(status); |
@@ -210,7 +210,7 @@ static int ocfs2_init_global_system_inodes(struct ocfs2_super *osb) | |||
210 | } | 210 | } |
211 | osb->root_inode = new; | 211 | osb->root_inode = new; |
212 | 212 | ||
213 | new = ocfs2_iget(osb, osb->system_dir_blkno); | 213 | new = ocfs2_iget(osb, osb->system_dir_blkno, OCFS2_FI_FLAG_SYSFILE); |
214 | if (IS_ERR(new)) { | 214 | if (IS_ERR(new)) { |
215 | status = PTR_ERR(new); | 215 | status = PTR_ERR(new); |
216 | mlog_errno(status); | 216 | mlog_errno(status); |
diff --git a/fs/ocfs2/sysfile.c b/fs/ocfs2/sysfile.c index 98435002ac44..5df6e35d09b1 100644 --- a/fs/ocfs2/sysfile.c +++ b/fs/ocfs2/sysfile.c | |||
@@ -115,7 +115,7 @@ static struct inode * _ocfs2_get_system_file_inode(struct ocfs2_super *osb, | |||
115 | goto bail; | 115 | goto bail; |
116 | } | 116 | } |
117 | 117 | ||
118 | inode = ocfs2_iget(osb, blkno); | 118 | inode = ocfs2_iget(osb, blkno, OCFS2_FI_FLAG_SYSFILE); |
119 | if (IS_ERR(inode)) { | 119 | if (IS_ERR(inode)) { |
120 | mlog_errno(PTR_ERR(inode)); | 120 | mlog_errno(PTR_ERR(inode)); |
121 | inode = NULL; | 121 | inode = NULL; |