summary | refs | log | tree | commit | diff | stats
path: root/include
diff options
context:
space:
mode:
author Jeff Layton <jlayton@redhat.com> 2017-12-21 07:45:44 -0500
committer Jeff Layton <jlayton@redhat.com> 2018-01-29 06:42:21 -0500
commit f02a9ad1f15daf4378afeda025a53455f72645dd (patch)
tree ffbc9e45fb0c15a214699c0bbe38905a9ce80fcc /include
parent 3a8c7231d53641a21d794c7406044e19ad299a00 (diff)
fs: handle inode->i_version more efficiently
Since i_version is mostly treated as an opaque value, we can exploit that fact to avoid incrementing it when no one is watching. With that change, we can avoid incrementing the counter on writes, unless someone has queried for it since it was last incremented. If the a/c/mtime don't change, and the i_version hasn't changed, then there's no need to dirty the inode metadata on a write.

Convert the i_version counter to an atomic64_t, and use the lowest order bit to hold a flag that will tell whether anyone has queried the value since it was last incremented.

When we go to maybe increment it, we fetch the value and check the flag bit. If it's clear then we don't need to do anything if the update isn't being forced.

If we do need to update, then we increment the counter by 2, and clear the flag bit, and then use a CAS op to swap it into place. If that works, we return true. If it doesn't then do it again with the value that we fetch from the CAS operation.

On the query side, if the flag is already set, then we just shift the value down by 1 bit and return it. Otherwise, we set the flag in our on-stack value and again use cmpxchg to swap it into place if it hasn't changed. If it has, then we use the value from the cmpxchg as the new "old" value and try again.

This method allows us to avoid incrementing the counter on writes (and dirtying the metadata) under typical workloads. We only need to increment if it has been queried since it was last changed.

Signed-off-by: Jeff Layton <jlayton@redhat.com>
Reviewed-by: Jan Kara <jack@suse.cz>
Acked-by: Dave Chinner <dchinner@redhat.com>
Tested-by: Krzysztof Kozlowski <krzk@kernel.org>
Diffstat (limited to 'include')
-rw-r--r-- include/linux/fs.h 2
-rw-r--r-- include/linux/iversion.h 208
2 files changed, 154 insertions, 56 deletions
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 76382c24e9d0..6804d075933e 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -639,7 +639,7 @@ struct inode {
639 struct hlist_head i_dentry; 639 struct hlist_head i_dentry;
640 struct rcu_head i_rcu; 640 struct rcu_head i_rcu;
641 }; 641 };
642 u64 i_version; 642 atomic64_t i_version;
643 atomic_t i_count; 643 atomic_t i_count;
644 atomic_t i_dio_count; 644 atomic_t i_dio_count;
645 atomic_t i_writecount; 645 atomic_t i_writecount;
diff --git a/include/linux/iversion.h b/include/linux/iversion.h
index f268828f9f7e..858463fca249 100644
--- a/include/linux/iversion.h
+++ b/include/linux/iversion.h
@@ -5,6 +5,8 @@
5#include <linux/fs.h> 5#include <linux/fs.h>
6 6
7/* 7/*
8 * The inode->i_version field:
9 * ---------------------------
8 * The change attribute (i_version) is mandated by NFSv4 and is mostly for 10 * The change attribute (i_version) is mandated by NFSv4 and is mostly for
9 * knfsd, but is also used for other purposes (e.g. IMA). The i_version must 11 * knfsd, but is also used for other purposes (e.g. IMA). The i_version must
10 * appear different to observers if there was a change to the inode's data or 12 * appear different to observers if there was a change to the inode's data or
@@ -33,86 +35,171 @@
33 * them. Also, i_version updates should never be delayed longer than it takes 35 * them. Also, i_version updates should never be delayed longer than it takes
34 * the original change to reach disk. 36 * the original change to reach disk.
35 * 37 *
38 * This implementation uses the low bit in the i_version field as a flag to
39 * track when the value has been queried. If it has not been queried since it
40 * was last incremented, we can skip the increment in most cases.
41 *
42 * In the event that we're updating the ctime, we will usually go ahead and
43 * bump the i_version anyway. Since that has to go to stable storage in some
44 * fashion, we might as well increment it as well.
45 *
46 * With this implementation, the value should always appear to observers to
47 * increase over time if the file has changed. It's recommended to use
48 * inode_cmp_iversion() helper to compare values.
49 *
36 * Note that some filesystems (e.g. NFS and AFS) just use the field to store 50 * Note that some filesystems (e.g. NFS and AFS) just use the field to store
37 * a server-provided value (for the most part). For that reason, those 51 * a server-provided value (for the most part). For that reason, those
38 * filesystems do not set SB_I_VERSION. These filesystems are considered to 52 * filesystems do not set SB_I_VERSION. These filesystems are considered to
39 * have a self-managed i_version. 53 * have a self-managed i_version.
54 *
55 * Persistently storing the i_version
56 * ----------------------------------
57 * Queries of the i_version field are not gated on them hitting the backing
58 * store. It's always possible that the host could crash after allowing
59 * a query of the value but before it has made it to disk.
60 *
61 * To mitigate this problem, filesystems should always use
62 * inode_set_iversion_queried when loading an existing inode from disk. This
63 * ensures that the next attempted inode increment will result in the value
64 * changing.
65 *
66 * Storing the value to disk therefore does not count as a query, so those
67 * filesystems should use inode_peek_iversion to grab the value to be stored.
68 * There is no need to flag the value as having been queried in that case.
40 */ 69 */
41 70
71/*
72 * We borrow the lowest bit in the i_version to use as a flag to tell whether
73 * it has been queried since we last incremented it. If it has, then we must
74 * increment it on the next change. After that, we can clear the flag and
75 * avoid incrementing it again until it has again been queried.
76 */
77#define I_VERSION_QUERIED_SHIFT (1)
78#define I_VERSION_QUERIED (1ULL << (I_VERSION_QUERIED_SHIFT - 1))
79#define I_VERSION_INCREMENT (1ULL << I_VERSION_QUERIED_SHIFT)
80
42/** 81/**
43 * inode_set_iversion_raw - set i_version to the specified raw value 82 * inode_set_iversion_raw - set i_version to the specified raw value
44 * @inode: inode to set 83 * @inode: inode to set
45 * @new: new i_version value to set 84 * @val: new i_version value to set
46 * 85 *
47 * Set @inode's i_version field to @new. This function is for use by 86 * Set @inode's i_version field to @val. This function is for use by
48 * filesystems that self-manage the i_version. 87 * filesystems that self-manage the i_version.
49 * 88 *
50 * For example, the NFS client stores its NFSv4 change attribute in this way, 89 * For example, the NFS client stores its NFSv4 change attribute in this way,
51 * and the AFS client stores the data_version from the server here. 90 * and the AFS client stores the data_version from the server here.
52 */ 91 */
53static inline void 92static inline void
54inode_set_iversion_raw(struct inode *inode, u64 new) 93inode_set_iversion_raw(struct inode *inode, u64 val)
94{
95 atomic64_set(&inode->i_version, val);
96}
97
98/**
99 * inode_peek_iversion_raw - grab a "raw" iversion value
100 * @inode: inode from which i_version should be read
101 *
102 * Grab a "raw" inode->i_version value and return it. The i_version is not
103 * flagged or converted in any way. This is mostly used to access a self-managed
104 * i_version.
105 *
106 * With those filesystems, we want to treat the i_version as an entirely
107 * opaque value.
108 */
109static inline u64
110inode_peek_iversion_raw(const struct inode *inode)
55{ 111{
56 inode->i_version = new; 112 return atomic64_read(&inode->i_version);
57} 113}
58 114
59/** 115/**
60 * inode_set_iversion - set i_version to a particular value 116 * inode_set_iversion - set i_version to a particular value
61 * @inode: inode to set 117 * @inode: inode to set
62 * @new: new i_version value to set 118 * @val: new i_version value to set
63 * 119 *
64 * Set @inode's i_version field to @new. This function is for filesystems with 120 * Set @inode's i_version field to @val. This function is for filesystems with
65 * a kernel-managed i_version. 121 * a kernel-managed i_version, for initializing a newly-created inode from
122 * scratch.
66 * 123 *
67 * For now, this just does the same thing as the _raw variant. 124 * In this case, we do not set the QUERIED flag since we know that this value
125 * has never been queried.
68 */ 126 */
69static inline void 127static inline void
70inode_set_iversion(struct inode *inode, u64 new) 128inode_set_iversion(struct inode *inode, u64 val)
71{ 129{
72 inode_set_iversion_raw(inode, new); 130 inode_set_iversion_raw(inode, val << I_VERSION_QUERIED_SHIFT);
73} 131}
74 132
75/** 133/**
76 * inode_set_iversion_queried - set i_version to a particular value and set 134 * inode_set_iversion_queried - set i_version to a particular value as queried
77 * flag to indicate that it has been viewed
78 * @inode: inode to set 135 * @inode: inode to set
79 * @new: new i_version value to set 136 * @val: new i_version value to set
80 * 137 *
81 * When loading in an i_version value from a backing store, we typically don't 138 * Set @inode's i_version field to @val, and flag it for increment on the next
82 * know whether it was previously viewed before being stored or not. Thus, we 139 * change.
83 * must assume that it was, to ensure that any changes will result in the
84 * value changing.
85 * 140 *
86 * This function will set the inode's i_version, and possibly flag the value 141 * Filesystems that persistently store the i_version on disk should use this
87 * as if it has already been viewed at least once. 142 * when loading an existing inode from disk.
88 * 143 *
89 * For now, this just does what inode_set_iversion does. 144 * When loading in an i_version value from a backing store, we can't be certain
145 * that it wasn't previously viewed before being stored. Thus, we must assume
146 * that it was, to ensure that we don't end up handing out the same value for
147 * different versions of the same inode.
90 */ 148 */
91static inline void 149static inline void
92inode_set_iversion_queried(struct inode *inode, u64 new) 150inode_set_iversion_queried(struct inode *inode, u64 val)
93{ 151{
94 inode_set_iversion(inode, new); 152 inode_set_iversion_raw(inode, (val << I_VERSION_QUERIED_SHIFT) |
153 I_VERSION_QUERIED);
95} 154}
96 155
97/** 156/**
98 * inode_maybe_inc_iversion - increments i_version 157 * inode_maybe_inc_iversion - increments i_version
99 * @inode: inode with the i_version that should be updated 158 * @inode: inode with the i_version that should be updated
100 * @force: increment the counter even if it's not necessary 159 * @force: increment the counter even if it's not necessary?
101 * 160 *
102 * Every time the inode is modified, the i_version field must be seen to have 161 * Every time the inode is modified, the i_version field must be seen to have
103 * changed by any observer. 162 * changed by any observer.
104 * 163 *
105 * In this implementation, we always increment it after taking the i_lock to 164 * If "force" is set or the QUERIED flag is set, then ensure that we increment
106 * ensure that we don't race with other incrementors. 165 * the value, and clear the queried flag.
107 * 166 *
108 * Returns true if counter was bumped, and false if it wasn't. 167 * In the common case where neither is set, then we can return "false" without
168 * updating i_version.
169 *
170 * If this function returns false, and no other metadata has changed, then we
171 * can avoid logging the metadata.
109 */ 172 */
110static inline bool 173static inline bool
111inode_maybe_inc_iversion(struct inode *inode, bool force) 174inode_maybe_inc_iversion(struct inode *inode, bool force)
112{ 175{
113 atomic64_t *ivp = (atomic64_t *)&inode->i_version; 176 u64 cur, old, new;
177
178 /*
179 * The i_version field is not strictly ordered with any other inode
180 * information, but the legacy inode_inc_iversion code used a spinlock
181 * to serialize increments.
182 *
183 * Here, we add full memory barriers to ensure that any de-facto
184 * ordering with other info is preserved.
185 *
186 * This barrier pairs with the barrier in inode_query_iversion()
187 */
188 smp_mb();
189 cur = inode_peek_iversion_raw(inode);
190 for (;;) {
191 /* If flag is clear then we needn't do anything */
192 if (!force && !(cur & I_VERSION_QUERIED))
193 return false;
114 194
115 atomic64_inc(ivp); 195 /* Since lowest bit is flag, add 2 to avoid it */
196 new = (cur & ~I_VERSION_QUERIED) + I_VERSION_INCREMENT;
197
198 old = atomic64_cmpxchg(&inode->i_version, cur, new);
199 if (likely(old == cur))
200 break;
201 cur = old;
202 }
116 return true; 203 return true;
117} 204}
118 205
@@ -135,31 +222,12 @@ inode_inc_iversion(struct inode *inode)
135 * @inode: inode to check 222 * @inode: inode to check
136 * 223 *
137 * Returns whether the inode->i_version counter needs incrementing on the next 224 * Returns whether the inode->i_version counter needs incrementing on the next
138 * change. 225 * change. Just fetch the value and check the QUERIED flag.
139 *
140 * For now, we assume that it always does.
141 */ 226 */
142static inline bool 227static inline bool
143inode_iversion_need_inc(struct inode *inode) 228inode_iversion_need_inc(struct inode *inode)
144{ 229{
145 return true; 230 return inode_peek_iversion_raw(inode) & I_VERSION_QUERIED;
146}
147
148/**
149 * inode_peek_iversion_raw - grab a "raw" iversion value
150 * @inode: inode from which i_version should be read
151 *
152 * Grab a "raw" inode->i_version value and return it. The i_version is not
153 * flagged or converted in any way. This is mostly used to access a self-managed
154 * i_version.
155 *
156 * With those filesystems, we want to treat the i_version as an entirely
157 * opaque value.
158 */
159static inline u64
160inode_peek_iversion_raw(const struct inode *inode)
161{
162 return inode->i_version;
163} 231}
164 232
165/** 233/**
@@ -176,7 +244,7 @@ inode_peek_iversion_raw(const struct inode *inode)
176static inline void 244static inline void
177inode_inc_iversion_raw(struct inode *inode) 245inode_inc_iversion_raw(struct inode *inode)
178{ 246{
179 inode_inc_iversion(inode); 247 atomic64_inc(&inode->i_version);
180} 248}
181 249
182/** 250/**
@@ -193,7 +261,7 @@ inode_inc_iversion_raw(struct inode *inode)
193static inline u64 261static inline u64
194inode_peek_iversion(const struct inode *inode) 262inode_peek_iversion(const struct inode *inode)
195{ 263{
196 return inode_peek_iversion_raw(inode); 264 return inode_peek_iversion_raw(inode) >> I_VERSION_QUERIED_SHIFT;
197} 265}
198 266
199/** 267/**
@@ -205,12 +273,35 @@ inode_peek_iversion(const struct inode *inode)
205 * that a later query of the i_version will result in a different value if 273 * that a later query of the i_version will result in a different value if
206 * anything has changed. 274 * anything has changed.
207 * 275 *
208 * This implementation just does a peek. 276 * In this implementation, we fetch the current value, set the QUERIED flag and
277 * then try to swap it into place with a cmpxchg, if it wasn't already set. If
278 * that fails, we try again with the newly fetched value from the cmpxchg.
209 */ 279 */
210static inline u64 280static inline u64
211inode_query_iversion(struct inode *inode) 281inode_query_iversion(struct inode *inode)
212{ 282{
213 return inode_peek_iversion(inode); 283 u64 cur, old, new;
284
285 cur = inode_peek_iversion_raw(inode);
286 for (;;) {
287 /* If flag is already set, then no need to swap */
288 if (cur & I_VERSION_QUERIED) {
289 /*
290 * This barrier (and the implicit barrier in the
291 * cmpxchg below) pairs with the barrier in
292 * inode_maybe_inc_iversion().
293 */
294 smp_mb();
295 break;
296 }
297
298 new = cur | I_VERSION_QUERIED;
299 old = atomic64_cmpxchg(&inode->i_version, cur, new);
300 if (likely(old == cur))
301 break;
302 cur = old;
303 }
304 return cur >> I_VERSION_QUERIED_SHIFT;
214} 305}
215 306
216/** 307/**
@@ -233,11 +324,18 @@ inode_cmp_iversion_raw(const struct inode *inode, u64 old)
233 * @old: old value to check against its i_version 324 * @old: old value to check against its i_version
234 * 325 *
235 * Compare an i_version counter with a previous one. Returns 0 if they are 326 * Compare an i_version counter with a previous one. Returns 0 if they are
236 * the same or non-zero if they are different. 327 * the same, a positive value if the one in the inode appears newer than @old,
328 * and a negative value if @old appears to be newer than the one in the
329 * inode.
330 *
331 * Note that we don't need to set the QUERIED flag in this case, as the value
332 * in the inode is not being recorded for later use.
237 */ 333 */
334
238static inline s64 335static inline s64
239inode_cmp_iversion(const struct inode *inode, u64 old) 336inode_cmp_iversion(const struct inode *inode, u64 old)
240{ 337{
241 return (s64)inode_peek_iversion(inode) - (s64)old; 338 return (s64)(inode_peek_iversion_raw(inode) & ~I_VERSION_QUERIED) -
339 (s64)(old << I_VERSION_QUERIED_SHIFT);
242} 340}
243#endif 341#endif