diff options
-rw-r--r-- | include/linux/fs.h | 2 | ||||
-rw-r--r-- | include/linux/iversion.h | 208 |
2 files changed, 154 insertions, 56 deletions
diff --git a/include/linux/fs.h b/include/linux/fs.h index 76382c24e9d0..6804d075933e 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h | |||
@@ -639,7 +639,7 @@ struct inode { | |||
639 | struct hlist_head i_dentry; | 639 | struct hlist_head i_dentry; |
640 | struct rcu_head i_rcu; | 640 | struct rcu_head i_rcu; |
641 | }; | 641 | }; |
642 | u64 i_version; | 642 | atomic64_t i_version; |
643 | atomic_t i_count; | 643 | atomic_t i_count; |
644 | atomic_t i_dio_count; | 644 | atomic_t i_dio_count; |
645 | atomic_t i_writecount; | 645 | atomic_t i_writecount; |
diff --git a/include/linux/iversion.h b/include/linux/iversion.h index f268828f9f7e..858463fca249 100644 --- a/include/linux/iversion.h +++ b/include/linux/iversion.h | |||
@@ -5,6 +5,8 @@ | |||
5 | #include <linux/fs.h> | 5 | #include <linux/fs.h> |
6 | 6 | ||
7 | /* | 7 | /* |
8 | * The inode->i_version field: | ||
9 | * --------------------------- | ||
8 | * The change attribute (i_version) is mandated by NFSv4 and is mostly for | 10 | * The change attribute (i_version) is mandated by NFSv4 and is mostly for |
9 | * knfsd, but is also used for other purposes (e.g. IMA). The i_version must | 11 | * knfsd, but is also used for other purposes (e.g. IMA). The i_version must |
10 | * appear different to observers if there was a change to the inode's data or | 12 | * appear different to observers if there was a change to the inode's data or |
@@ -33,86 +35,171 @@ | |||
33 | * them. Also, i_version updates should never be delayed longer than it takes | 35 | * them. Also, i_version updates should never be delayed longer than it takes |
34 | * the original change to reach disk. | 36 | * the original change to reach disk. |
35 | * | 37 | * |
38 | * This implementation uses the low bit in the i_version field as a flag to | ||
39 | * track when the value has been queried. If it has not been queried since it | ||
40 | * was last incremented, we can skip the increment in most cases. | ||
41 | * | ||
42 | * In the event that we're updating the ctime, we will usually go ahead and | ||
43 | * bump the i_version anyway. Since that has to go to stable storage in some | ||
44 | * fashion, we might as well increment it as well. | ||
45 | * | ||
46 | * With this implementation, the value should always appear to observers to | ||
47 | * increase over time if the file has changed. It's recommended to use | ||
48 | * inode_cmp_iversion() helper to compare values. | ||
49 | * | ||
36 | * Note that some filesystems (e.g. NFS and AFS) just use the field to store | 50 | * Note that some filesystems (e.g. NFS and AFS) just use the field to store |
37 | * a server-provided value (for the most part). For that reason, those | 51 | * a server-provided value (for the most part). For that reason, those |
38 | * filesystems do not set SB_I_VERSION. These filesystems are considered to | 52 | * filesystems do not set SB_I_VERSION. These filesystems are considered to |
39 | * have a self-managed i_version. | 53 | * have a self-managed i_version. |
54 | * | ||
55 | * Persistently storing the i_version | ||
56 | * ---------------------------------- | ||
57 | * Queries of the i_version field are not gated on them hitting the backing | ||
58 | * store. It's always possible that the host could crash after allowing | ||
59 | * a query of the value but before it has made it to disk. | ||
60 | * | ||
61 | * To mitigate this problem, filesystems should always use | ||
62 | * inode_set_iversion_queried when loading an existing inode from disk. This | ||
63 | * ensures that the next attempted inode increment will result in the value | ||
64 | * changing. | ||
65 | * | ||
66 | * Storing the value to disk therefore does not count as a query, so those | ||
67 | * filesystems should use inode_peek_iversion to grab the value to be stored. | ||
68 | * There is no need to flag the value as having been queried in that case. | ||
40 | */ | 69 | */ |
41 | 70 | ||
71 | /* | ||
72 | * We borrow the lowest bit in the i_version to use as a flag to tell whether | ||
73 | * it has been queried since we last incremented it. If it has, then we must | ||
74 | * increment it on the next change. After that, we can clear the flag and | ||
75 | * avoid incrementing it again until it has again been queried. | ||
76 | */ | ||
77 | #define I_VERSION_QUERIED_SHIFT (1) | ||
78 | #define I_VERSION_QUERIED (1ULL << (I_VERSION_QUERIED_SHIFT - 1)) | ||
79 | #define I_VERSION_INCREMENT (1ULL << I_VERSION_QUERIED_SHIFT) | ||
80 | |||
42 | /** | 81 | /** |
43 | * inode_set_iversion_raw - set i_version to the specified raw value | 82 | * inode_set_iversion_raw - set i_version to the specified raw value |
44 | * @inode: inode to set | 83 | * @inode: inode to set |
45 | * @new: new i_version value to set | 84 | * @val: new i_version value to set |
46 | * | 85 | * |
47 | * Set @inode's i_version field to @new. This function is for use by | 86 | * Set @inode's i_version field to @val. This function is for use by |
48 | * filesystems that self-manage the i_version. | 87 | * filesystems that self-manage the i_version. |
49 | * | 88 | * |
50 | * For example, the NFS client stores its NFSv4 change attribute in this way, | 89 | * For example, the NFS client stores its NFSv4 change attribute in this way, |
51 | * and the AFS client stores the data_version from the server here. | 90 | * and the AFS client stores the data_version from the server here. |
52 | */ | 91 | */ |
53 | static inline void | 92 | static inline void |
54 | inode_set_iversion_raw(struct inode *inode, u64 new) | 93 | inode_set_iversion_raw(struct inode *inode, u64 val) |
94 | { | ||
95 | atomic64_set(&inode->i_version, val); | ||
96 | } | ||
97 | |||
98 | /** | ||
99 | * inode_peek_iversion_raw - grab a "raw" iversion value | ||
100 | * @inode: inode from which i_version should be read | ||
101 | * | ||
102 | * Grab a "raw" inode->i_version value and return it. The i_version is not | ||
103 | * flagged or converted in any way. This is mostly used to access a self-managed | ||
104 | * i_version. | ||
105 | * | ||
106 | * With those filesystems, we want to treat the i_version as an entirely | ||
107 | * opaque value. | ||
108 | */ | ||
109 | static inline u64 | ||
110 | inode_peek_iversion_raw(const struct inode *inode) | ||
55 | { | 111 | { |
56 | inode->i_version = new; | 112 | return atomic64_read(&inode->i_version); |
57 | } | 113 | } |
58 | 114 | ||
59 | /** | 115 | /** |
60 | * inode_set_iversion - set i_version to a particular value | 116 | * inode_set_iversion - set i_version to a particular value |
61 | * @inode: inode to set | 117 | * @inode: inode to set |
62 | * @new: new i_version value to set | 118 | * @val: new i_version value to set |
63 | * | 119 | * |
64 | * Set @inode's i_version field to @new. This function is for filesystems with | 120 | * Set @inode's i_version field to @val. This function is for filesystems with |
65 | * a kernel-managed i_version. | 121 | * a kernel-managed i_version, for initializing a newly-created inode from |
122 | * scratch. | ||
66 | * | 123 | * |
67 | * For now, this just does the same thing as the _raw variant. | 124 | * In this case, we do not set the QUERIED flag since we know that this value |
125 | * has never been queried. | ||
68 | */ | 126 | */ |
69 | static inline void | 127 | static inline void |
70 | inode_set_iversion(struct inode *inode, u64 new) | 128 | inode_set_iversion(struct inode *inode, u64 val) |
71 | { | 129 | { |
72 | inode_set_iversion_raw(inode, new); | 130 | inode_set_iversion_raw(inode, val << I_VERSION_QUERIED_SHIFT); |
73 | } | 131 | } |
74 | 132 | ||
75 | /** | 133 | /** |
76 | * inode_set_iversion_queried - set i_version to a particular value and set | 134 | * inode_set_iversion_queried - set i_version to a particular value as queried |
77 | * flag to indicate that it has been viewed | ||
78 | * @inode: inode to set | 135 | * @inode: inode to set |
79 | * @new: new i_version value to set | 136 | * @val: new i_version value to set |
80 | * | 137 | * |
81 | * When loading in an i_version value from a backing store, we typically don't | 138 | * Set @inode's i_version field to @val, and flag it for increment on the next |
82 | * know whether it was previously viewed before being stored or not. Thus, we | 139 | * change. |
83 | * must assume that it was, to ensure that any changes will result in the | ||
84 | * value changing. | ||
85 | * | 140 | * |
86 | * This function will set the inode's i_version, and possibly flag the value | 141 | * Filesystems that persistently store the i_version on disk should use this |
87 | * as if it has already been viewed at least once. | 142 | * when loading an existing inode from disk. |
88 | * | 143 | * |
89 | * For now, this just does what inode_set_iversion does. | 144 | * When loading in an i_version value from a backing store, we can't be certain |
145 | * that it wasn't previously viewed before being stored. Thus, we must assume | ||
146 | * that it was, to ensure that we don't end up handing out the same value for | ||
147 | * different versions of the same inode. | ||
90 | */ | 148 | */ |
91 | static inline void | 149 | static inline void |
92 | inode_set_iversion_queried(struct inode *inode, u64 new) | 150 | inode_set_iversion_queried(struct inode *inode, u64 val) |
93 | { | 151 | { |
94 | inode_set_iversion(inode, new); | 152 | inode_set_iversion_raw(inode, (val << I_VERSION_QUERIED_SHIFT) | |
153 | I_VERSION_QUERIED); | ||
95 | } | 154 | } |
96 | 155 | ||
97 | /** | 156 | /** |
98 | * inode_maybe_inc_iversion - increments i_version | 157 | * inode_maybe_inc_iversion - increments i_version |
99 | * @inode: inode with the i_version that should be updated | 158 | * @inode: inode with the i_version that should be updated |
100 | * @force: increment the counter even if it's not necessary | 159 | * @force: increment the counter even if it's not necessary? |
101 | * | 160 | * |
102 | * Every time the inode is modified, the i_version field must be seen to have | 161 | * Every time the inode is modified, the i_version field must be seen to have |
103 | * changed by any observer. | 162 | * changed by any observer. |
104 | * | 163 | * |
105 | * In this implementation, we always increment it after taking the i_lock to | 164 | * If "force" is set or the QUERIED flag is set, then ensure that we increment |
106 | * ensure that we don't race with other incrementors. | 165 | * the value, and clear the queried flag. |
107 | * | 166 | * |
108 | * Returns true if counter was bumped, and false if it wasn't. | 167 | * In the common case where neither is set, then we can return "false" without |
168 | * updating i_version. | ||
169 | * | ||
170 | * If this function returns false, and no other metadata has changed, then we | ||
171 | * can avoid logging the metadata. | ||
109 | */ | 172 | */ |
110 | static inline bool | 173 | static inline bool |
111 | inode_maybe_inc_iversion(struct inode *inode, bool force) | 174 | inode_maybe_inc_iversion(struct inode *inode, bool force) |
112 | { | 175 | { |
113 | atomic64_t *ivp = (atomic64_t *)&inode->i_version; | 176 | u64 cur, old, new; |
177 | |||
178 | /* | ||
179 | * The i_version field is not strictly ordered with any other inode | ||
180 | * information, but the legacy inode_inc_iversion code used a spinlock | ||
181 | * to serialize increments. | ||
182 | * | ||
183 | * Here, we add full memory barriers to ensure that any de-facto | ||
184 | * ordering with other info is preserved. | ||
185 | * | ||
186 | * This barrier pairs with the barrier in inode_query_iversion() | ||
187 | */ | ||
188 | smp_mb(); | ||
189 | cur = inode_peek_iversion_raw(inode); | ||
190 | for (;;) { | ||
191 | /* If flag is clear then we needn't do anything */ | ||
192 | if (!force && !(cur & I_VERSION_QUERIED)) | ||
193 | return false; | ||
114 | 194 | ||
115 | atomic64_inc(ivp); | 195 | /* Since lowest bit is flag, add 2 to avoid it */ |
196 | new = (cur & ~I_VERSION_QUERIED) + I_VERSION_INCREMENT; | ||
197 | |||
198 | old = atomic64_cmpxchg(&inode->i_version, cur, new); | ||
199 | if (likely(old == cur)) | ||
200 | break; | ||
201 | cur = old; | ||
202 | } | ||
116 | return true; | 203 | return true; |
117 | } | 204 | } |
118 | 205 | ||
@@ -135,31 +222,12 @@ inode_inc_iversion(struct inode *inode) | |||
135 | * @inode: inode to check | 222 | * @inode: inode to check |
136 | * | 223 | * |
137 | * Returns whether the inode->i_version counter needs incrementing on the next | 224 | * Returns whether the inode->i_version counter needs incrementing on the next |
138 | * change. | 225 | * change. Just fetch the value and check the QUERIED flag. |
139 | * | ||
140 | * For now, we assume that it always does. | ||
141 | */ | 226 | */ |
142 | static inline bool | 227 | static inline bool |
143 | inode_iversion_need_inc(struct inode *inode) | 228 | inode_iversion_need_inc(struct inode *inode) |
144 | { | 229 | { |
145 | return true; | 230 | return inode_peek_iversion_raw(inode) & I_VERSION_QUERIED; |
146 | } | ||
147 | |||
148 | /** | ||
149 | * inode_peek_iversion_raw - grab a "raw" iversion value | ||
150 | * @inode: inode from which i_version should be read | ||
151 | * | ||
152 | * Grab a "raw" inode->i_version value and return it. The i_version is not | ||
153 | * flagged or converted in any way. This is mostly used to access a self-managed | ||
154 | * i_version. | ||
155 | * | ||
156 | * With those filesystems, we want to treat the i_version as an entirely | ||
157 | * opaque value. | ||
158 | */ | ||
159 | static inline u64 | ||
160 | inode_peek_iversion_raw(const struct inode *inode) | ||
161 | { | ||
162 | return inode->i_version; | ||
163 | } | 231 | } |
164 | 232 | ||
165 | /** | 233 | /** |
@@ -176,7 +244,7 @@ inode_peek_iversion_raw(const struct inode *inode) | |||
176 | static inline void | 244 | static inline void |
177 | inode_inc_iversion_raw(struct inode *inode) | 245 | inode_inc_iversion_raw(struct inode *inode) |
178 | { | 246 | { |
179 | inode_inc_iversion(inode); | 247 | atomic64_inc(&inode->i_version); |
180 | } | 248 | } |
181 | 249 | ||
182 | /** | 250 | /** |
@@ -193,7 +261,7 @@ inode_inc_iversion_raw(struct inode *inode) | |||
193 | static inline u64 | 261 | static inline u64 |
194 | inode_peek_iversion(const struct inode *inode) | 262 | inode_peek_iversion(const struct inode *inode) |
195 | { | 263 | { |
196 | return inode_peek_iversion_raw(inode); | 264 | return inode_peek_iversion_raw(inode) >> I_VERSION_QUERIED_SHIFT; |
197 | } | 265 | } |
198 | 266 | ||
199 | /** | 267 | /** |
@@ -205,12 +273,35 @@ inode_peek_iversion(const struct inode *inode) | |||
205 | * that a later query of the i_version will result in a different value if | 273 | * that a later query of the i_version will result in a different value if |
206 | * anything has changed. | 274 | * anything has changed. |
207 | * | 275 | * |
208 | * This implementation just does a peek. | 276 | * In this implementation, we fetch the current value, set the QUERIED flag and |
277 | * then try to swap it into place with a cmpxchg, if it wasn't already set. If | ||
278 | * that fails, we try again with the newly fetched value from the cmpxchg. | ||
209 | */ | 279 | */ |
210 | static inline u64 | 280 | static inline u64 |
211 | inode_query_iversion(struct inode *inode) | 281 | inode_query_iversion(struct inode *inode) |
212 | { | 282 | { |
213 | return inode_peek_iversion(inode); | 283 | u64 cur, old, new; |
284 | |||
285 | cur = inode_peek_iversion_raw(inode); | ||
286 | for (;;) { | ||
287 | /* If flag is already set, then no need to swap */ | ||
288 | if (cur & I_VERSION_QUERIED) { | ||
289 | /* | ||
290 | * This barrier (and the implicit barrier in the | ||
291 | * cmpxchg below) pairs with the barrier in | ||
292 | * inode_maybe_inc_iversion(). | ||
293 | */ | ||
294 | smp_mb(); | ||
295 | break; | ||
296 | } | ||
297 | |||
298 | new = cur | I_VERSION_QUERIED; | ||
299 | old = atomic64_cmpxchg(&inode->i_version, cur, new); | ||
300 | if (likely(old == cur)) | ||
301 | break; | ||
302 | cur = old; | ||
303 | } | ||
304 | return cur >> I_VERSION_QUERIED_SHIFT; | ||
214 | } | 305 | } |
215 | 306 | ||
216 | /** | 307 | /** |
@@ -233,11 +324,18 @@ inode_cmp_iversion_raw(const struct inode *inode, u64 old) | |||
233 | * @old: old value to check against its i_version | 324 | * @old: old value to check against its i_version |
234 | * | 325 | * |
235 | * Compare an i_version counter with a previous one. Returns 0 if they are | 326 | * Compare an i_version counter with a previous one. Returns 0 if they are |
236 | * the same or non-zero if they are different. | 327 | * the same, a positive value if the one in the inode appears newer than @old, |
328 | * and a negative value if @old appears to be newer than the one in the | ||
329 | * inode. | ||
330 | * | ||
331 | * Note that we don't need to set the QUERIED flag in this case, as the value | ||
332 | * in the inode is not being recorded for later use. | ||
237 | */ | 333 | */ |
334 | |||
238 | static inline s64 | 335 | static inline s64 |
239 | inode_cmp_iversion(const struct inode *inode, u64 old) | 336 | inode_cmp_iversion(const struct inode *inode, u64 old) |
240 | { | 337 | { |
241 | return (s64)inode_peek_iversion(inode) - (s64)old; | 338 | return (s64)(inode_peek_iversion_raw(inode) & ~I_VERSION_QUERIED) - |
339 | (s64)(old << I_VERSION_QUERIED_SHIFT); | ||
242 | } | 340 | } |
243 | #endif | 341 | #endif |