summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
-rw-r--r--include/linux/fs.h2
-rw-r--r--include/linux/iversion.h208
2 files changed, 154 insertions, 56 deletions
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 76382c24e9d0..6804d075933e 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -639,7 +639,7 @@ struct inode {
639 struct hlist_head i_dentry; 639 struct hlist_head i_dentry;
640 struct rcu_head i_rcu; 640 struct rcu_head i_rcu;
641 }; 641 };
642 u64 i_version; 642 atomic64_t i_version;
643 atomic_t i_count; 643 atomic_t i_count;
644 atomic_t i_dio_count; 644 atomic_t i_dio_count;
645 atomic_t i_writecount; 645 atomic_t i_writecount;
diff --git a/include/linux/iversion.h b/include/linux/iversion.h
index f268828f9f7e..858463fca249 100644
--- a/include/linux/iversion.h
+++ b/include/linux/iversion.h
@@ -5,6 +5,8 @@
5#include <linux/fs.h> 5#include <linux/fs.h>
6 6
7/* 7/*
8 * The inode->i_version field:
9 * ---------------------------
8 * The change attribute (i_version) is mandated by NFSv4 and is mostly for 10 * The change attribute (i_version) is mandated by NFSv4 and is mostly for
9 * knfsd, but is also used for other purposes (e.g. IMA). The i_version must 11 * knfsd, but is also used for other purposes (e.g. IMA). The i_version must
10 * appear different to observers if there was a change to the inode's data or 12 * appear different to observers if there was a change to the inode's data or
@@ -33,86 +35,171 @@
33 * them. Also, i_version updates should never be delayed longer than it takes 35 * them. Also, i_version updates should never be delayed longer than it takes
34 * the original change to reach disk. 36 * the original change to reach disk.
35 * 37 *
38 * This implementation uses the low bit in the i_version field as a flag to
39 * track when the value has been queried. If it has not been queried since it
40 * was last incremented, we can skip the increment in most cases.
41 *
42 * In the event that we're updating the ctime, we will usually go ahead and
43 * bump the i_version anyway. Since that has to go to stable storage in some
44 * fashion, we might as well increment it as well.
45 *
46 * With this implementation, the value should always appear to observers to
47 * increase over time if the file has changed. It's recommended to use
48 * inode_cmp_iversion() helper to compare values.
49 *
36 * Note that some filesystems (e.g. NFS and AFS) just use the field to store 50 * Note that some filesystems (e.g. NFS and AFS) just use the field to store
37 * a server-provided value (for the most part). For that reason, those 51 * a server-provided value (for the most part). For that reason, those
38 * filesystems do not set SB_I_VERSION. These filesystems are considered to 52 * filesystems do not set SB_I_VERSION. These filesystems are considered to
39 * have a self-managed i_version. 53 * have a self-managed i_version.
54 *
55 * Persistently storing the i_version
56 * ----------------------------------
57 * Queries of the i_version field are not gated on them hitting the backing
58 * store. It's always possible that the host could crash after allowing
59 * a query of the value but before it has made it to disk.
60 *
61 * To mitigate this problem, filesystems should always use
62 * inode_set_iversion_queried when loading an existing inode from disk. This
63 * ensures that the next attempted inode increment will result in the value
64 * changing.
65 *
66 * Storing the value to disk therefore does not count as a query, so those
67 * filesystems should use inode_peek_iversion to grab the value to be stored.
68 * There is no need to flag the value as having been queried in that case.
40 */ 69 */
41 70
71/*
72 * We borrow the lowest bit in the i_version to use as a flag to tell whether
73 * it has been queried since we last incremented it. If it has, then we must
74 * increment it on the next change. After that, we can clear the flag and
75 * avoid incrementing it again until it has again been queried.
76 */
77#define I_VERSION_QUERIED_SHIFT (1)
78#define I_VERSION_QUERIED (1ULL << (I_VERSION_QUERIED_SHIFT - 1))
79#define I_VERSION_INCREMENT (1ULL << I_VERSION_QUERIED_SHIFT)
80
42/** 81/**
43 * inode_set_iversion_raw - set i_version to the specified raw value 82 * inode_set_iversion_raw - set i_version to the specified raw value
44 * @inode: inode to set 83 * @inode: inode to set
45 * @new: new i_version value to set 84 * @val: new i_version value to set
46 * 85 *
47 * Set @inode's i_version field to @new. This function is for use by 86 * Set @inode's i_version field to @val. This function is for use by
48 * filesystems that self-manage the i_version. 87 * filesystems that self-manage the i_version.
49 * 88 *
50 * For example, the NFS client stores its NFSv4 change attribute in this way, 89 * For example, the NFS client stores its NFSv4 change attribute in this way,
51 * and the AFS client stores the data_version from the server here. 90 * and the AFS client stores the data_version from the server here.
52 */ 91 */
53static inline void 92static inline void
54inode_set_iversion_raw(struct inode *inode, u64 new) 93inode_set_iversion_raw(struct inode *inode, u64 val)
94{
95 atomic64_set(&inode->i_version, val);
96}
97
98/**
99 * inode_peek_iversion_raw - grab a "raw" iversion value
100 * @inode: inode from which i_version should be read
101 *
102 * Grab a "raw" inode->i_version value and return it. The i_version is not
103 * flagged or converted in any way. This is mostly used to access a self-managed
104 * i_version.
105 *
106 * With those filesystems, we want to treat the i_version as an entirely
107 * opaque value.
108 */
109static inline u64
110inode_peek_iversion_raw(const struct inode *inode)
55{ 111{
56 inode->i_version = new; 112 return atomic64_read(&inode->i_version);
57} 113}
58 114
59/** 115/**
60 * inode_set_iversion - set i_version to a particular value 116 * inode_set_iversion - set i_version to a particular value
61 * @inode: inode to set 117 * @inode: inode to set
62 * @new: new i_version value to set 118 * @val: new i_version value to set
63 * 119 *
64 * Set @inode's i_version field to @new. This function is for filesystems with 120 * Set @inode's i_version field to @val. This function is for filesystems with
65 * a kernel-managed i_version. 121 * a kernel-managed i_version, for initializing a newly-created inode from
122 * scratch.
66 * 123 *
67 * For now, this just does the same thing as the _raw variant. 124 * In this case, we do not set the QUERIED flag since we know that this value
125 * has never been queried.
68 */ 126 */
69static inline void 127static inline void
70inode_set_iversion(struct inode *inode, u64 new) 128inode_set_iversion(struct inode *inode, u64 val)
71{ 129{
72 inode_set_iversion_raw(inode, new); 130 inode_set_iversion_raw(inode, val << I_VERSION_QUERIED_SHIFT);
73} 131}
74 132
75/** 133/**
76 * inode_set_iversion_queried - set i_version to a particular value and set 134 * inode_set_iversion_queried - set i_version to a particular value as queried
77 * flag to indicate that it has been viewed
78 * @inode: inode to set 135 * @inode: inode to set
79 * @new: new i_version value to set 136 * @val: new i_version value to set
80 * 137 *
81 * When loading in an i_version value from a backing store, we typically don't 138 * Set @inode's i_version field to @val, and flag it for increment on the next
82 * know whether it was previously viewed before being stored or not. Thus, we 139 * change.
83 * must assume that it was, to ensure that any changes will result in the
84 * value changing.
85 * 140 *
86 * This function will set the inode's i_version, and possibly flag the value 141 * Filesystems that persistently store the i_version on disk should use this
87 * as if it has already been viewed at least once. 142 * when loading an existing inode from disk.
88 * 143 *
89 * For now, this just does what inode_set_iversion does. 144 * When loading in an i_version value from a backing store, we can't be certain
145 * that it wasn't previously viewed before being stored. Thus, we must assume
146 * that it was, to ensure that we don't end up handing out the same value for
147 * different versions of the same inode.
90 */ 148 */
91static inline void 149static inline void
92inode_set_iversion_queried(struct inode *inode, u64 new) 150inode_set_iversion_queried(struct inode *inode, u64 val)
93{ 151{
94 inode_set_iversion(inode, new); 152 inode_set_iversion_raw(inode, (val << I_VERSION_QUERIED_SHIFT) |
153 I_VERSION_QUERIED);
95} 154}
96 155
97/** 156/**
98 * inode_maybe_inc_iversion - increments i_version 157 * inode_maybe_inc_iversion - increments i_version
99 * @inode: inode with the i_version that should be updated 158 * @inode: inode with the i_version that should be updated
100 * @force: increment the counter even if it's not necessary 159 * @force: increment the counter even if it's not necessary?
101 * 160 *
102 * Every time the inode is modified, the i_version field must be seen to have 161 * Every time the inode is modified, the i_version field must be seen to have
103 * changed by any observer. 162 * changed by any observer.
104 * 163 *
105 * In this implementation, we always increment it after taking the i_lock to 164 * If "force" is set or the QUERIED flag is set, then ensure that we increment
106 * ensure that we don't race with other incrementors. 165 * the value, and clear the queried flag.
107 * 166 *
108 * Returns true if counter was bumped, and false if it wasn't. 167 * In the common case where neither is set, then we can return "false" without
168 * updating i_version.
169 *
170 * If this function returns false, and no other metadata has changed, then we
171 * can avoid logging the metadata.
109 */ 172 */
110static inline bool 173static inline bool
111inode_maybe_inc_iversion(struct inode *inode, bool force) 174inode_maybe_inc_iversion(struct inode *inode, bool force)
112{ 175{
113 atomic64_t *ivp = (atomic64_t *)&inode->i_version; 176 u64 cur, old, new;
177
178 /*
179 * The i_version field is not strictly ordered with any other inode
180 * information, but the legacy inode_inc_iversion code used a spinlock
181 * to serialize increments.
182 *
183 * Here, we add full memory barriers to ensure that any de-facto
184 * ordering with other info is preserved.
185 *
186 * This barrier pairs with the barrier in inode_query_iversion()
187 */
188 smp_mb();
189 cur = inode_peek_iversion_raw(inode);
190 for (;;) {
191 /* If flag is clear then we needn't do anything */
192 if (!force && !(cur & I_VERSION_QUERIED))
193 return false;
114 194
115 atomic64_inc(ivp); 195 /* Since lowest bit is flag, add 2 to avoid it */
196 new = (cur & ~I_VERSION_QUERIED) + I_VERSION_INCREMENT;
197
198 old = atomic64_cmpxchg(&inode->i_version, cur, new);
199 if (likely(old == cur))
200 break;
201 cur = old;
202 }
116 return true; 203 return true;
117} 204}
118 205
@@ -135,31 +222,12 @@ inode_inc_iversion(struct inode *inode)
135 * @inode: inode to check 222 * @inode: inode to check
136 * 223 *
137 * Returns whether the inode->i_version counter needs incrementing on the next 224 * Returns whether the inode->i_version counter needs incrementing on the next
138 * change. 225 * change. Just fetch the value and check the QUERIED flag.
139 *
140 * For now, we assume that it always does.
141 */ 226 */
142static inline bool 227static inline bool
143inode_iversion_need_inc(struct inode *inode) 228inode_iversion_need_inc(struct inode *inode)
144{ 229{
145 return true; 230 return inode_peek_iversion_raw(inode) & I_VERSION_QUERIED;
146}
147
148/**
149 * inode_peek_iversion_raw - grab a "raw" iversion value
150 * @inode: inode from which i_version should be read
151 *
152 * Grab a "raw" inode->i_version value and return it. The i_version is not
153 * flagged or converted in any way. This is mostly used to access a self-managed
154 * i_version.
155 *
156 * With those filesystems, we want to treat the i_version as an entirely
157 * opaque value.
158 */
159static inline u64
160inode_peek_iversion_raw(const struct inode *inode)
161{
162 return inode->i_version;
163} 231}
164 232
165/** 233/**
@@ -176,7 +244,7 @@ inode_peek_iversion_raw(const struct inode *inode)
176static inline void 244static inline void
177inode_inc_iversion_raw(struct inode *inode) 245inode_inc_iversion_raw(struct inode *inode)
178{ 246{
179 inode_inc_iversion(inode); 247 atomic64_inc(&inode->i_version);
180} 248}
181 249
182/** 250/**
@@ -193,7 +261,7 @@ inode_inc_iversion_raw(struct inode *inode)
193static inline u64 261static inline u64
194inode_peek_iversion(const struct inode *inode) 262inode_peek_iversion(const struct inode *inode)
195{ 263{
196 return inode_peek_iversion_raw(inode); 264 return inode_peek_iversion_raw(inode) >> I_VERSION_QUERIED_SHIFT;
197} 265}
198 266
199/** 267/**
@@ -205,12 +273,35 @@ inode_peek_iversion(const struct inode *inode)
205 * that a later query of the i_version will result in a different value if 273 * that a later query of the i_version will result in a different value if
206 * anything has changed. 274 * anything has changed.
207 * 275 *
208 * This implementation just does a peek. 276 * In this implementation, we fetch the current value, set the QUERIED flag and
277 * then try to swap it into place with a cmpxchg, if it wasn't already set. If
278 * that fails, we try again with the newly fetched value from the cmpxchg.
209 */ 279 */
210static inline u64 280static inline u64
211inode_query_iversion(struct inode *inode) 281inode_query_iversion(struct inode *inode)
212{ 282{
213 return inode_peek_iversion(inode); 283 u64 cur, old, new;
284
285 cur = inode_peek_iversion_raw(inode);
286 for (;;) {
287 /* If flag is already set, then no need to swap */
288 if (cur & I_VERSION_QUERIED) {
289 /*
290 * This barrier (and the implicit barrier in the
291 * cmpxchg below) pairs with the barrier in
292 * inode_maybe_inc_iversion().
293 */
294 smp_mb();
295 break;
296 }
297
298 new = cur | I_VERSION_QUERIED;
299 old = atomic64_cmpxchg(&inode->i_version, cur, new);
300 if (likely(old == cur))
301 break;
302 cur = old;
303 }
304 return cur >> I_VERSION_QUERIED_SHIFT;
214} 305}
215 306
216/** 307/**
@@ -233,11 +324,18 @@ inode_cmp_iversion_raw(const struct inode *inode, u64 old)
233 * @old: old value to check against its i_version 324 * @old: old value to check against its i_version
234 * 325 *
235 * Compare an i_version counter with a previous one. Returns 0 if they are 326 * Compare an i_version counter with a previous one. Returns 0 if they are
236 * the same or non-zero if they are different. 327 * the same, a positive value if the one in the inode appears newer than @old,
328 * and a negative value if @old appears to be newer than the one in the
329 * inode.
330 *
331 * Note that we don't need to set the QUERIED flag in this case, as the value
332 * in the inode is not being recorded for later use.
237 */ 333 */
334
238static inline s64 335static inline s64
239inode_cmp_iversion(const struct inode *inode, u64 old) 336inode_cmp_iversion(const struct inode *inode, u64 old)
240{ 337{
241 return (s64)inode_peek_iversion(inode) - (s64)old; 338 return (s64)(inode_peek_iversion_raw(inode) & ~I_VERSION_QUERIED) -
339 (s64)(old << I_VERSION_QUERIED_SHIFT);
242} 340}
243#endif 341#endif