aboutsummaryrefslogtreecommitdiffstats
path: root/Documentation/filesystems
diff options
context:
space:
mode:
Diffstat (limited to 'Documentation/filesystems')
-rw-r--r--Documentation/filesystems/Locking29
-rw-r--r--Documentation/filesystems/dentry-locking.txt174
-rw-r--r--Documentation/filesystems/ntfs.txt3
-rw-r--r--Documentation/filesystems/path-lookup.txt382
-rw-r--r--Documentation/filesystems/porting78
-rw-r--r--Documentation/filesystems/proc.txt31
-rw-r--r--Documentation/filesystems/vfs.txt74
7 files changed, 566 insertions, 205 deletions
diff --git a/Documentation/filesystems/Locking b/Documentation/filesystems/Locking
index 01d7cbdcd3bd..ef9349a4b5d1 100644
--- a/Documentation/filesystems/Locking
+++ b/Documentation/filesystems/Locking
@@ -9,22 +9,25 @@ be able to use diff(1).
9 9
10--------------------------- dentry_operations -------------------------- 10--------------------------- dentry_operations --------------------------
11prototypes: 11prototypes:
12 int (*d_revalidate)(struct dentry *, int); 12 int (*d_revalidate)(struct dentry *, struct nameidata *);
13 int (*d_hash) (struct dentry *, struct qstr *); 13 int (*d_hash)(const struct dentry *, const struct inode *,
14 int (*d_compare) (struct dentry *, struct qstr *, struct qstr *); 14 struct qstr *);
15 int (*d_compare)(const struct dentry *, const struct inode *,
16 const struct dentry *, const struct inode *,
17 unsigned int, const char *, const struct qstr *);
15 int (*d_delete)(struct dentry *); 18 int (*d_delete)(struct dentry *);
16 void (*d_release)(struct dentry *); 19 void (*d_release)(struct dentry *);
17 void (*d_iput)(struct dentry *, struct inode *); 20 void (*d_iput)(struct dentry *, struct inode *);
18 char *(*d_dname)((struct dentry *dentry, char *buffer, int buflen); 21 char *(*d_dname)((struct dentry *dentry, char *buffer, int buflen);
19 22
20locking rules: 23locking rules:
21 dcache_lock rename_lock ->d_lock may block 24 rename_lock ->d_lock may block rcu-walk
22d_revalidate: no no no yes 25d_revalidate: no no yes (ref-walk) maybe
23d_hash no no no yes 26d_hash no no no maybe
24d_compare: no yes no no 27d_compare: yes no no maybe
25d_delete: yes no yes no 28d_delete: no yes no no
26d_release: no no no yes 29d_release: no no yes no
27d_iput: no no no yes 30d_iput: no no yes no
28d_dname: no no no no 31d_dname: no no no no
29 32
30--------------------------- inode_operations --------------------------- 33--------------------------- inode_operations ---------------------------
@@ -44,8 +47,8 @@ ata *);
44 void * (*follow_link) (struct dentry *, struct nameidata *); 47 void * (*follow_link) (struct dentry *, struct nameidata *);
45 void (*put_link) (struct dentry *, struct nameidata *, void *); 48 void (*put_link) (struct dentry *, struct nameidata *, void *);
46 void (*truncate) (struct inode *); 49 void (*truncate) (struct inode *);
47 int (*permission) (struct inode *, int, struct nameidata *); 50 int (*permission) (struct inode *, int, unsigned int);
48 int (*check_acl)(struct inode *, int); 51 int (*check_acl)(struct inode *, int, unsigned int);
49 int (*setattr) (struct dentry *, struct iattr *); 52 int (*setattr) (struct dentry *, struct iattr *);
50 int (*getattr) (struct vfsmount *, struct dentry *, struct kstat *); 53 int (*getattr) (struct vfsmount *, struct dentry *, struct kstat *);
51 int (*setxattr) (struct dentry *, const char *,const void *,size_t,int); 54 int (*setxattr) (struct dentry *, const char *,const void *,size_t,int);
@@ -73,7 +76,7 @@ follow_link: no
73put_link: no 76put_link: no
74truncate: yes (see below) 77truncate: yes (see below)
75setattr: yes 78setattr: yes
76permission: no 79permission: no (may not block if called in rcu-walk mode)
77check_acl: no 80check_acl: no
78getattr: no 81getattr: no
79setxattr: yes 82setxattr: yes
diff --git a/Documentation/filesystems/dentry-locking.txt b/Documentation/filesystems/dentry-locking.txt
deleted file mode 100644
index 79334ed5daa7..000000000000
--- a/Documentation/filesystems/dentry-locking.txt
+++ /dev/null
@@ -1,174 +0,0 @@
1RCU-based dcache locking model
2==============================
3
4On many workloads, the most common operation on dcache is to look up a
5dentry, given a parent dentry and the name of the child. Typically,
6for every open(), stat() etc., the dentry corresponding to the
7pathname will be looked up by walking the tree starting with the first
8component of the pathname and using that dentry along with the next
9component to look up the next level and so on. Since it is a frequent
10operation for workloads like multiuser environments and web servers,
11it is important to optimize this path.
12
13Prior to 2.5.10, dcache_lock was acquired in d_lookup and thus in
14every component during path look-up. Since 2.5.10 onwards, fast-walk
15algorithm changed this by holding the dcache_lock at the beginning and
16walking as many cached path component dentries as possible. This
17significantly decreases the number of acquisition of
18dcache_lock. However it also increases the lock hold time
19significantly and affects performance in large SMP machines. Since
202.5.62 kernel, dcache has been using a new locking model that uses RCU
21to make dcache look-up lock-free.
22
23The current dcache locking model is not very different from the
24existing dcache locking model. Prior to 2.5.62 kernel, dcache_lock
25protected the hash chain, d_child, d_alias, d_lru lists as well as
26d_inode and several other things like mount look-up. RCU-based changes
27affect only the way the hash chain is protected. For everything else
28the dcache_lock must be taken for both traversing as well as
29updating. The hash chain updates too take the dcache_lock. The
30significant change is the way d_lookup traverses the hash chain, it
31doesn't acquire the dcache_lock for this and rely on RCU to ensure
32that the dentry has not been *freed*.
33
34
35Dcache locking details
36======================
37
38For many multi-user workloads, open() and stat() on files are very
39frequently occurring operations. Both involve walking of path names to
40find the dentry corresponding to the concerned file. In 2.4 kernel,
41dcache_lock was held during look-up of each path component. Contention
42and cache-line bouncing of this global lock caused significant
43scalability problems. With the introduction of RCU in Linux kernel,
44this was worked around by making the look-up of path components during
45path walking lock-free.
46
47
48Safe lock-free look-up of dcache hash table
49===========================================
50
51Dcache is a complex data structure with the hash table entries also
52linked together in other lists. In 2.4 kernel, dcache_lock protected
53all the lists. We applied RCU only on hash chain walking. The rest of
54the lists are still protected by dcache_lock. Some of the important
55changes are :
56
571. The deletion from hash chain is done using hlist_del_rcu() macro
58 which doesn't initialize next pointer of the deleted dentry and
59 this allows us to walk safely lock-free while a deletion is
60 happening.
61
622. Insertion of a dentry into the hash table is done using
63 hlist_add_head_rcu() which take care of ordering the writes - the
64 writes to the dentry must be visible before the dentry is
65 inserted. This works in conjunction with hlist_for_each_rcu(),
66 which has since been replaced by hlist_for_each_entry_rcu(), while
67 walking the hash chain. The only requirement is that all
68 initialization to the dentry must be done before
69 hlist_add_head_rcu() since we don't have dcache_lock protection
70 while traversing the hash chain. This isn't different from the
71 existing code.
72
733. The dentry looked up without holding dcache_lock by cannot be
74 returned for walking if it is unhashed. It then may have a NULL
75 d_inode or other bogosity since RCU doesn't protect the other
76 fields in the dentry. We therefore use a flag DCACHE_UNHASHED to
77 indicate unhashed dentries and use this in conjunction with a
78 per-dentry lock (d_lock). Once looked up without the dcache_lock,
79 we acquire the per-dentry lock (d_lock) and check if the dentry is
80 unhashed. If so, the look-up is failed. If not, the reference count
81 of the dentry is increased and the dentry is returned.
82
834. Once a dentry is looked up, it must be ensured during the path walk
84 for that component it doesn't go away. In pre-2.5.10 code, this was
85 done holding a reference to the dentry. dcache_rcu does the same.
86 In some sense, dcache_rcu path walking looks like the pre-2.5.10
87 version.
88
895. All dentry hash chain updates must take the dcache_lock as well as
90 the per-dentry lock in that order. dput() does this to ensure that
91 a dentry that has just been looked up in another CPU doesn't get
92 deleted before dget() can be done on it.
93
946. There are several ways to do reference counting of RCU protected
95 objects. One such example is in ipv4 route cache where deferred
96 freeing (using call_rcu()) is done as soon as the reference count
97 goes to zero. This cannot be done in the case of dentries because
98 tearing down of dentries require blocking (dentry_iput()) which
99 isn't supported from RCU callbacks. Instead, tearing down of
100 dentries happen synchronously in dput(), but actual freeing happens
101 later when RCU grace period is over. This allows safe lock-free
102 walking of the hash chains, but a matched dentry may have been
103 partially torn down. The checking of DCACHE_UNHASHED flag with
104 d_lock held detects such dentries and prevents them from being
105 returned from look-up.
106
107
108Maintaining POSIX rename semantics
109==================================
110
111Since look-up of dentries is lock-free, it can race against a
112concurrent rename operation. For example, during rename of file A to
113B, look-up of either A or B must succeed. So, if look-up of B happens
114after A has been removed from the hash chain but not added to the new
115hash chain, it may fail. Also, a comparison while the name is being
116written concurrently by a rename may result in false positive matches
117violating rename semantics. Issues related to race with rename are
118handled as described below :
119
1201. Look-up can be done in two ways - d_lookup() which is safe from
121 simultaneous renames and __d_lookup() which is not. If
122 __d_lookup() fails, it must be followed up by a d_lookup() to
123 correctly determine whether a dentry is in the hash table or
124 not. d_lookup() protects look-ups using a sequence lock
125 (rename_lock).
126
1272. The name associated with a dentry (d_name) may be changed if a
128 rename is allowed to happen simultaneously. To avoid memcmp() in
129 __d_lookup() go out of bounds due to a rename and false positive
130 comparison, the name comparison is done while holding the
131 per-dentry lock. This prevents concurrent renames during this
132 operation.
133
1343. Hash table walking during look-up may move to a different bucket as
135 the current dentry is moved to a different bucket due to rename.
136 But we use hlists in dcache hash table and they are
137 null-terminated. So, even if a dentry moves to a different bucket,
138 hash chain walk will terminate. [with a list_head list, it may not
139 since termination is when the list_head in the original bucket is
140 reached]. Since we redo the d_parent check and compare name while
141 holding d_lock, lock-free look-up will not race against d_move().
142
1434. There can be a theoretical race when a dentry keeps coming back to
144 original bucket due to double moves. Due to this look-up may
145 consider that it has never moved and can end up in a infinite loop.
146 But this is not any worse that theoretical livelocks we already
147 have in the kernel.
148
149
150Important guidelines for filesystem developers related to dcache_rcu
151====================================================================
152
1531. Existing dcache interfaces (pre-2.5.62) exported to filesystem
154 don't change. Only dcache internal implementation changes. However
155 filesystems *must not* delete from the dentry hash chains directly
156 using the list macros like allowed earlier. They must use dcache
157 APIs like d_drop() or __d_drop() depending on the situation.
158
1592. d_flags is now protected by a per-dentry lock (d_lock). All access
160 to d_flags must be protected by it.
161
1623. For a hashed dentry, checking of d_count needs to be protected by
163 d_lock.
164
165
166Papers and other documentation on dcache locking
167================================================
168
1691. Scaling dcache with RCU (http://linuxjournal.com/article.php?sid=7124).
170
1712. http://lse.sourceforge.net/locking/dcache/dcache.html
172
173
174
diff --git a/Documentation/filesystems/ntfs.txt b/Documentation/filesystems/ntfs.txt
index ac2a261c5f7d..6ef8cf3bc9a3 100644
--- a/Documentation/filesystems/ntfs.txt
+++ b/Documentation/filesystems/ntfs.txt
@@ -457,6 +457,9 @@ ChangeLog
457 457
458Note, a technical ChangeLog aimed at kernel hackers is in fs/ntfs/ChangeLog. 458Note, a technical ChangeLog aimed at kernel hackers is in fs/ntfs/ChangeLog.
459 459
4602.1.30:
461 - Fix writev() (it kept writing the first segment over and over again
462 instead of moving onto subsequent segments).
4602.1.29: 4632.1.29:
461 - Fix a deadlock when mounting read-write. 464 - Fix a deadlock when mounting read-write.
4622.1.28: 4652.1.28:
diff --git a/Documentation/filesystems/path-lookup.txt b/Documentation/filesystems/path-lookup.txt
new file mode 100644
index 000000000000..eb59c8b44be9
--- /dev/null
+++ b/Documentation/filesystems/path-lookup.txt
@@ -0,0 +1,382 @@
1Path walking and name lookup locking
2====================================
3
4Path resolution is the finding a dentry corresponding to a path name string, by
5performing a path walk. Typically, for every open(), stat() etc., the path name
6will be resolved. Paths are resolved by walking the namespace tree, starting
7with the first component of the pathname (eg. root or cwd) with a known dentry,
8then finding the child of that dentry, which is named the next component in the
9path string. Then repeating the lookup from the child dentry and finding its
10child with the next element, and so on.
11
12Since it is a frequent operation for workloads like multiuser environments and
13web servers, it is important to optimize this code.
14
15Path walking synchronisation history:
16Prior to 2.5.10, dcache_lock was acquired in d_lookup (dcache hash lookup) and
17thus in every component during path look-up. Since 2.5.10 onwards, fast-walk
18algorithm changed this by holding the dcache_lock at the beginning and walking
19as many cached path component dentries as possible. This significantly
20decreases the number of acquisition of dcache_lock. However it also increases
21the lock hold time significantly and affects performance in large SMP machines.
22Since 2.5.62 kernel, dcache has been using a new locking model that uses RCU to
23make dcache look-up lock-free.
24
25All the above algorithms required taking a lock and reference count on the
26dentry that was looked up, so that may be used as the basis for walking the
27next path element. This is inefficient and unscalable. It is inefficient
28because of the locks and atomic operations required for every dentry element
29slows things down. It is not scalable because many parallel applications that
30are path-walk intensive tend to do path lookups starting from a common dentry
31(usually, the root "/" or current working directory). So contention on these
32common path elements causes lock and cacheline queueing.
33
34Since 2.6.38, RCU is used to make a significant part of the entire path walk
35(including dcache look-up) completely "store-free" (so, no locks, atomics, or
36even stores into cachelines of common dentries). This is known as "rcu-walk"
37path walking.
38
39Path walking overview
40=====================
41
42A name string specifies a start (root directory, cwd, fd-relative) and a
43sequence of elements (directory entry names), which together refer to a path in
44the namespace. A path is represented as a (dentry, vfsmount) tuple. The name
45elements are sub-strings, seperated by '/'.
46
47Name lookups will want to find a particular path that a name string refers to
48(usually the final element, or parent of final element). This is done by taking
49the path given by the name's starting point (which we know in advance -- eg.
50current->fs->cwd or current->fs->root) as the first parent of the lookup. Then
51iteratively for each subsequent name element, look up the child of the current
52parent with the given name and if it is not the desired entry, make it the
53parent for the next lookup.
54
55A parent, of course, must be a directory, and we must have appropriate
56permissions on the parent inode to be able to walk into it.
57
58Turning the child into a parent for the next lookup requires more checks and
59procedures. Symlinks essentially substitute the symlink name for the target
60name in the name string, and require some recursive path walking. Mount points
61must be followed into (thus changing the vfsmount that subsequent path elements
62refer to), switching from the mount point path to the root of the particular
63mounted vfsmount. These behaviours are variously modified depending on the
64exact path walking flags.
65
66Path walking then must, broadly, do several particular things:
67- find the start point of the walk;
68- perform permissions and validity checks on inodes;
69- perform dcache hash name lookups on (parent, name element) tuples;
70- traverse mount points;
71- traverse symlinks;
72- lookup and create missing parts of the path on demand.
73
74Safe store-free look-up of dcache hash table
75============================================
76
77Dcache name lookup
78------------------
79In order to lookup a dcache (parent, name) tuple, we take a hash on the tuple
80and use that to select a bucket in the dcache-hash table. The list of entries
81in that bucket is then walked, and we do a full comparison of each entry
82against our (parent, name) tuple.
83
84The hash lists are RCU protected, so list walking is not serialised with
85concurrent updates (insertion, deletion from the hash). This is a standard RCU
86list application with the exception of renames, which will be covered below.
87
88Parent and name members of a dentry, as well as its membership in the dcache
89hash, and its inode are protected by the per-dentry d_lock spinlock. A
90reference is taken on the dentry (while the fields are verified under d_lock),
91and this stabilises its d_inode pointer and actual inode. This gives a stable
92point to perform the next step of our path walk against.
93
94These members are also protected by d_seq seqlock, although this offers
95read-only protection and no durability of results, so care must be taken when
96using d_seq for synchronisation (see seqcount based lookups, below).
97
98Renames
99-------
100Back to the rename case. In usual RCU protected lists, the only operations that
101will happen to an object is insertion, and then eventually removal from the
102list. The object will not be reused until an RCU grace period is complete.
103This ensures the RCU list traversal primitives can run over the object without
104problems (see RCU documentation for how this works).
105
106However when a dentry is renamed, its hash value can change, requiring it to be
107moved to a new hash list. Allocating and inserting a new alias would be
108expensive and also problematic for directory dentries. Latency would be far to
109high to wait for a grace period after removing the dentry and before inserting
110it in the new hash bucket. So what is done is to insert the dentry into the
111new list immediately.
112
113However, when the dentry's list pointers are updated to point to objects in the
114new list before waiting for a grace period, this can result in a concurrent RCU
115lookup of the old list veering off into the new (incorrect) list and missing
116the remaining dentries on the list.
117
118There is no fundamental problem with walking down the wrong list, because the
119dentry comparisons will never match. However it is fatal to miss a matching
120dentry. So a seqlock is used to detect when a rename has occurred, and so the
121lookup can be retried.
122
123 1 2 3
124 +---+ +---+ +---+
125hlist-->| N-+->| N-+->| N-+->
126head <--+-P |<-+-P |<-+-P |
127 +---+ +---+ +---+
128
129Rename of dentry 2 may require it deleted from the above list, and inserted
130into a new list. Deleting 2 gives the following list.
131
132 1 3
133 +---+ +---+ (don't worry, the longer pointers do not
134hlist-->| N-+-------->| N-+-> impose a measurable performance overhead
135head <--+-P |<--------+-P | on modern CPUs)
136 +---+ +---+
137 ^ 2 ^
138 | +---+ |
139 | | N-+----+
140 +----+-P |
141 +---+
142
143This is a standard RCU-list deletion, which leaves the deleted object's
144pointers intact, so a concurrent list walker that is currently looking at
145object 2 will correctly continue to object 3 when it is time to traverse the
146next object.
147
148However, when inserting object 2 onto a new list, we end up with this:
149
150 1 3
151 +---+ +---+
152hlist-->| N-+-------->| N-+->
153head <--+-P |<--------+-P |
154 +---+ +---+
155 2
156 +---+
157 | N-+---->
158 <----+-P |
159 +---+
160
161Because we didn't wait for a grace period, there may be a concurrent lookup
162still at 2. Now when it follows 2's 'next' pointer, it will walk off into
163another list without ever having checked object 3.
164
165A related, but distinctly different, issue is that of rename atomicity versus
166lookup operations. If a file is renamed from 'A' to 'B', a lookup must only
167find either 'A' or 'B'. So if a lookup of 'A' returns NULL, a subsequent lookup
168of 'B' must succeed (note the reverse is not true).
169
170Between deleting the dentry from the old hash list, and inserting it on the new
171hash list, a lookup may find neither 'A' nor 'B' matching the dentry. The same
172rename seqlock is also used to cover this race in much the same way, by
173retrying a negative lookup result if a rename was in progress.
174
175Seqcount based lookups
176----------------------
177In refcount based dcache lookups, d_lock is used to serialise access to
178the dentry, stabilising it while comparing its name and parent and then
179taking a reference count (the reference count then gives a stable place to
180start the next part of the path walk from).
181
182As explained above, we would like to do path walking without taking locks or
183reference counts on intermediate dentries along the path. To do this, a per
184dentry seqlock (d_seq) is used to take a "coherent snapshot" of what the dentry
185looks like (its name, parent, and inode). That snapshot is then used to start
186the next part of the path walk. When loading the coherent snapshot under d_seq,
187care must be taken to load the members up-front, and use those pointers rather
188than reloading from the dentry later on (otherwise we'd have interesting things
189like d_inode going NULL underneath us, if the name was unlinked).
190
191Also important is to avoid performing any destructive operations (pretty much:
192no non-atomic stores to shared data), and to recheck the seqcount when we are
193"done" with the operation. Retry or abort if the seqcount does not match.
194Avoiding destructive or changing operations means we can easily unwind from
195failure.
196
197What this means is that a caller, provided they are holding RCU lock to
198protect the dentry object from disappearing, can perform a seqcount based
199lookup which does not increment the refcount on the dentry or write to
200it in any way. This returned dentry can be used for subsequent operations,
201provided that d_seq is rechecked after that operation is complete.
202
203Inodes are also rcu freed, so the seqcount lookup dentry's inode may also be
204queried for permissions.
205
206With this two parts of the puzzle, we can do path lookups without taking
207locks or refcounts on dentry elements.
208
209RCU-walk path walking design
210============================
211
212Path walking code now has two distinct modes, ref-walk and rcu-walk. ref-walk
213is the traditional[*] way of performing dcache lookups using d_lock to
214serialise concurrent modifications to the dentry and take a reference count on
215it. ref-walk is simple and obvious, and may sleep, take locks, etc while path
216walking is operating on each dentry. rcu-walk uses seqcount based dentry
217lookups, and can perform lookup of intermediate elements without any stores to
218shared data in the dentry or inode. rcu-walk can not be applied to all cases,
219eg. if the filesystem must sleep or perform non trivial operations, rcu-walk
220must be switched to ref-walk mode.
221
222[*] RCU is still used for the dentry hash lookup in ref-walk, but not the full
223 path walk.
224
225Where ref-walk uses a stable, refcounted ``parent'' to walk the remaining
226path string, rcu-walk uses a d_seq protected snapshot. When looking up a
227child of this parent snapshot, we open d_seq critical section on the child
228before closing d_seq critical section on the parent. This gives an interlocking
229ladder of snapshots to walk down.
230
231
232 proc 101
233 /----------------\
234 / comm: "vi" \
235 / fs.root: dentry0 \
236 \ fs.cwd: dentry2 /
237 \ /
238 \----------------/
239
240So when vi wants to open("/home/npiggin/test.c", O_RDWR), then it will
241start from current->fs->root, which is a pinned dentry. Alternatively,
242"./test.c" would start from cwd; both names refer to the same path in
243the context of proc101.
244
245 dentry 0
246 +---------------------+ rcu-walk begins here, we note d_seq, check the
247 | name: "/" | inode's permission, and then look up the next
248 | inode: 10 | path element which is "home"...
249 | children:"home", ...|
250 +---------------------+
251 |
252 dentry 1 V
253 +---------------------+ ... which brings us here. We find dentry1 via
254 | name: "home" | hash lookup, then note d_seq and compare name
255 | inode: 678 | string and parent pointer. When we have a match,
256 | children:"npiggin" | we now recheck the d_seq of dentry0. Then we
257 +---------------------+ check inode and look up the next element.
258 |
259 dentry2 V
260 +---------------------+ Note: if dentry0 is now modified, lookup is
261 | name: "npiggin" | not necessarily invalid, so we need only keep a
262 | inode: 543 | parent for d_seq verification, and grandparents
263 | children:"a.c", ... | can be forgotten.
264 +---------------------+
265 |
266 dentry3 V
267 +---------------------+ At this point we have our destination dentry.
268 | name: "a.c" | We now take its d_lock, verify d_seq of this
269 | inode: 14221 | dentry. If that checks out, we can increment
270 | children:NULL | its refcount because we're holding d_lock.
271 +---------------------+
272
273Taking a refcount on a dentry from rcu-walk mode, by taking its d_lock,
274re-checking its d_seq, and then incrementing its refcount is called
275"dropping rcu" or dropping from rcu-walk into ref-walk mode.
276
277It is, in some sense, a bit of a house of cards. If the seqcount check of the
278parent snapshot fails, the house comes down, because we had closed the d_seq
279section on the grandparent, so we have nothing left to stand on. In that case,
280the path walk must be fully restarted (which we do in ref-walk mode, to avoid
281live locks). It is costly to have a full restart, but fortunately they are
282quite rare.
283
284When we reach a point where sleeping is required, or a filesystem callout
285requires ref-walk, then instead of restarting the walk, we attempt to drop rcu
286at the last known good dentry we have. Avoiding a full restart in ref-walk in
287these cases is fundamental for performance and scalability because blocking
288operations such as creates and unlinks are not uncommon.
289
290The detailed design for rcu-walk is like this:
291* LOOKUP_RCU is set in nd->flags, which distinguishes rcu-walk from ref-walk.
292* Take the RCU lock for the entire path walk, starting with the acquiring
293 of the starting path (eg. root/cwd/fd-path). So now dentry refcounts are
294 not required for dentry persistence.
295* synchronize_rcu is called when unregistering a filesystem, so we can
296 access d_ops and i_ops during rcu-walk.
297* Similarly take the vfsmount lock for the entire path walk. So now mnt
298 refcounts are not required for persistence. Also we are free to perform mount
299 lookups, and to assume dentry mount points and mount roots are stable up and
300 down the path.
301* Have a per-dentry seqlock to protect the dentry name, parent, and inode,
302 so we can load this tuple atomically, and also check whether any of its
303 members have changed.
304* Dentry lookups (based on parent, candidate string tuple) recheck the parent
305 sequence after the child is found in case anything changed in the parent
306 during the path walk.
307* inode is also RCU protected so we can load d_inode and use the inode for
308 limited things.
309* i_mode, i_uid, i_gid can be tested for exec permissions during path walk.
310* i_op can be loaded.
311* When the destination dentry is reached, drop rcu there (ie. take d_lock,
312 verify d_seq, increment refcount).
313* If seqlock verification fails anywhere along the path, do a full restart
314 of the path lookup in ref-walk mode. -ECHILD tends to be used (for want of
315 a better errno) to signal an rcu-walk failure.
316
317The cases where rcu-walk cannot continue are:
318* NULL dentry (ie. any uncached path element)
319* Following links
320
321It may be possible eventually to make following links rcu-walk aware.
322
323Uncached path elements will always require dropping to ref-walk mode, at the
324very least because i_mutex needs to be grabbed, and objects allocated.
325
326Final note:
327"store-free" path walking is not strictly store free. We take vfsmount lock
328and refcounts (both of which can be made per-cpu), and we also store to the
329stack (which is essentially CPU-local), and we also have to take locks and
330refcount on final dentry.
331
332The point is that shared data, where practically possible, is not locked
333or stored into. The result is massive improvements in performance and
334scalability of path resolution.
335
336
337Interesting statistics
338======================
339
340The following table gives rcu lookup statistics for a few simple workloads
341(2s12c24t Westmere, debian non-graphical system). Ungraceful are attempts to
342drop rcu that fail due to d_seq failure and requiring the entire path lookup
343again. Other cases are successful rcu-drops that are required before the final
344element, nodentry for missing dentry, revalidate for filesystem revalidate
345routine requiring rcu drop, permission for permission check requiring drop,
346and link for symlink traversal requiring drop.
347
348 rcu-lookups restart nodentry link revalidate permission
349bootup 47121 0 4624 1010 10283 7852
350dbench 25386793 0 6778659(26.7%) 55 549 1156
351kbuild 2696672 10 64442(2.3%) 108764(4.0%) 1 1590
352git diff 39605 0 28 2 0 106
353vfstest 24185492 4945 708725(2.9%) 1076136(4.4%) 0 2651
354
355What this shows is that failed rcu-walk lookups, ie. ones that are restarted
356entirely with ref-walk, are quite rare. Even the "vfstest" case which
357specifically has concurrent renames/mkdir/rmdir/ creat/unlink/etc to excercise
358such races is not showing a huge amount of restarts.
359
360Dropping from rcu-walk to ref-walk mean that we have encountered a dentry where
361the reference count needs to be taken for some reason. This is either because
362we have reached the target of the path walk, or because we have encountered a
363condition that can't be resolved in rcu-walk mode. Ideally, we drop rcu-walk
364only when we have reached the target dentry, so the other statistics show where
365this does not happen.
366
367Note that a graceful drop from rcu-walk mode due to something such as the
368dentry not existing (which can be common) is not necessarily a failure of
369rcu-walk scheme, because some elements of the path may have been walked in
370rcu-walk mode. The further we get from common path elements (such as cwd or
371root), the less contended the dentry is likely to be. The closer we are to
372common path elements, the more likely they will exist in dentry cache.
373
374
375Papers and other documentation on dcache locking
376================================================
377
3781. Scaling dcache with RCU (http://linuxjournal.com/article.php?sid=7124).
379
3802. http://lse.sourceforge.net/locking/dcache/dcache.html
381
382
diff --git a/Documentation/filesystems/porting b/Documentation/filesystems/porting
index b12c89538680..dfbcd1b00b0a 100644
--- a/Documentation/filesystems/porting
+++ b/Documentation/filesystems/porting
@@ -216,7 +216,6 @@ had ->revalidate()) add calls in ->follow_link()/->readlink().
216->d_parent changes are not protected by BKL anymore. Read access is safe 216->d_parent changes are not protected by BKL anymore. Read access is safe
217if at least one of the following is true: 217if at least one of the following is true:
218 * filesystem has no cross-directory rename() 218 * filesystem has no cross-directory rename()
219 * dcache_lock is held
220 * we know that parent had been locked (e.g. we are looking at 219 * we know that parent had been locked (e.g. we are looking at
221->d_parent of ->lookup() argument). 220->d_parent of ->lookup() argument).
222 * we are called from ->rename(). 221 * we are called from ->rename().
@@ -318,3 +317,80 @@ if it's zero is not *and* *never* *had* *been* enough. Final unlink() and iput(
318may happen while the inode is in the middle of ->write_inode(); e.g. if you blindly 317may happen while the inode is in the middle of ->write_inode(); e.g. if you blindly
319free the on-disk inode, you may end up doing that while ->write_inode() is writing 318free the on-disk inode, you may end up doing that while ->write_inode() is writing
320to it. 319to it.
320
321---
322[mandatory]
323
324 .d_delete() now only advises the dcache as to whether or not to cache
325unreferenced dentries, and is now only called when the dentry refcount goes to
3260. Even on 0 refcount transition, it must be able to tolerate being called 0,
3271, or more times (eg. constant, idempotent).
328
329---
330[mandatory]
331
332 .d_compare() calling convention and locking rules are significantly
333changed. Read updated documentation in Documentation/filesystems/vfs.txt (and
334look at examples of other filesystems) for guidance.
335
336---
337[mandatory]
338
339 .d_hash() calling convention and locking rules are significantly
340changed. Read updated documentation in Documentation/filesystems/vfs.txt (and
341look at examples of other filesystems) for guidance.
342
343---
344[mandatory]
345 dcache_lock is gone, replaced by fine grained locks. See fs/dcache.c
346for details of what locks to replace dcache_lock with in order to protect
347particular things. Most of the time, a filesystem only needs ->d_lock, which
348protects *all* the dcache state of a given dentry.
349
350--
351[mandatory]
352
353 Filesystems must RCU-free their inodes, if they can have been accessed
354via rcu-walk path walk (basically, if the file can have had a path name in the
355vfs namespace).
356
357 i_dentry and i_rcu share storage in a union, and the vfs expects
358i_dentry to be reinitialized before it is freed, so an:
359
360 INIT_LIST_HEAD(&inode->i_dentry);
361
362must be done in the RCU callback.
363
364--
365[recommended]
366 vfs now tries to do path walking in "rcu-walk mode", which avoids
367atomic operations and scalability hazards on dentries and inodes (see
368Documentation/filesystems/path-lookup.txt). d_hash and d_compare changes
369(above) are examples of the changes required to support this. For more complex
370filesystem callbacks, the vfs drops out of rcu-walk mode before the fs call, so
371no changes are required to the filesystem. However, this is costly and loses
372the benefits of rcu-walk mode. We will begin to add filesystem callbacks that
373are rcu-walk aware, shown below. Filesystems should take advantage of this
374where possible.
375
376--
377[mandatory]
378 d_revalidate is a callback that is made on every path element (if
379the filesystem provides it), which requires dropping out of rcu-walk mode. This
380may now be called in rcu-walk mode (nd->flags & LOOKUP_RCU). -ECHILD should be
381returned if the filesystem cannot handle rcu-walk. See
382Documentation/filesystems/vfs.txt for more details.
383
384 permission and check_acl are inode permission checks that are called
385on many or all directory inodes on the way down a path walk (to check for
386exec permission). These must now be rcu-walk aware (flags & IPERM_FLAG_RCU).
387See Documentation/filesystems/vfs.txt for more details.
388
389--
390[mandatory]
391 In ->fallocate() you must check the mode option passed in. If your
392filesystem does not support hole punching (deallocating space in the middle of a
393file) you must return -EOPNOTSUPP if FALLOC_FL_PUNCH_HOLE is set in mode.
394Currently you can only have FALLOC_FL_PUNCH_HOLE with FALLOC_FL_KEEP_SIZE set,
395so the i_size should not change when hole punching, even when puching the end of
396a file off.
diff --git a/Documentation/filesystems/proc.txt b/Documentation/filesystems/proc.txt
index e73df2722ff3..23cae6548d3a 100644
--- a/Documentation/filesystems/proc.txt
+++ b/Documentation/filesystems/proc.txt
@@ -375,6 +375,7 @@ Anonymous: 0 kB
375Swap: 0 kB 375Swap: 0 kB
376KernelPageSize: 4 kB 376KernelPageSize: 4 kB
377MMUPageSize: 4 kB 377MMUPageSize: 4 kB
378Locked: 374 kB
378 379
379The first of these lines shows the same information as is displayed for the 380The first of these lines shows the same information as is displayed for the
380mapping in /proc/PID/maps. The remaining lines show the size of the mapping 381mapping in /proc/PID/maps. The remaining lines show the size of the mapping
@@ -670,6 +671,8 @@ varies by architecture and compile options. The following is from a
670 671
671> cat /proc/meminfo 672> cat /proc/meminfo
672 673
674The "Locked" indicates whether the mapping is locked in memory or not.
675
673 676
674MemTotal: 16344972 kB 677MemTotal: 16344972 kB
675MemFree: 13634064 kB 678MemFree: 13634064 kB
@@ -1181,6 +1184,30 @@ Table 1-12: Files in /proc/fs/ext4/<devname>
1181 mb_groups details of multiblock allocator buddy cache of free blocks 1184 mb_groups details of multiblock allocator buddy cache of free blocks
1182.............................................................................. 1185..............................................................................
1183 1186
11872.0 /proc/consoles
1188------------------
1189Shows registered system console lines.
1190
1191To see which character device lines are currently used for the system console
1192/dev/console, you may simply look into the file /proc/consoles:
1193
1194 > cat /proc/consoles
1195 tty0 -WU (ECp) 4:7
1196 ttyS0 -W- (Ep) 4:64
1197
1198The columns are:
1199
1200 device name of the device
1201 operations R = can do read operations
1202 W = can do write operations
1203 U = can do unblank
1204 flags E = it is enabled
1205 C = it is prefered console
1206 B = it is primary boot console
1207 p = it is used for printk buffer
1208 b = it is not a TTY but a Braille device
1209 a = it is safe to use when cpu is offline
1210 major:minor major and minor number of the device separated by a colon
1184 1211
1185------------------------------------------------------------------------------ 1212------------------------------------------------------------------------------
1186Summary 1213Summary
@@ -1296,6 +1323,10 @@ scaled linearly with /proc/<pid>/oom_score_adj.
1296Writing to /proc/<pid>/oom_score_adj or /proc/<pid>/oom_adj will change the 1323Writing to /proc/<pid>/oom_score_adj or /proc/<pid>/oom_adj will change the
1297other with its scaled value. 1324other with its scaled value.
1298 1325
1326The value of /proc/<pid>/oom_score_adj may be reduced no lower than the last
1327value set by a CAP_SYS_RESOURCE process. To reduce the value any lower
1328requires CAP_SYS_RESOURCE.
1329
1299NOTICE: /proc/<pid>/oom_adj is deprecated and will be removed, please see 1330NOTICE: /proc/<pid>/oom_adj is deprecated and will be removed, please see
1300Documentation/feature-removal-schedule.txt. 1331Documentation/feature-removal-schedule.txt.
1301 1332
diff --git a/Documentation/filesystems/vfs.txt b/Documentation/filesystems/vfs.txt
index 20899e095e7e..cae6d27c9f5b 100644
--- a/Documentation/filesystems/vfs.txt
+++ b/Documentation/filesystems/vfs.txt
@@ -325,7 +325,8 @@ struct inode_operations {
325 void * (*follow_link) (struct dentry *, struct nameidata *); 325 void * (*follow_link) (struct dentry *, struct nameidata *);
326 void (*put_link) (struct dentry *, struct nameidata *, void *); 326 void (*put_link) (struct dentry *, struct nameidata *, void *);
327 void (*truncate) (struct inode *); 327 void (*truncate) (struct inode *);
328 int (*permission) (struct inode *, int, struct nameidata *); 328 int (*permission) (struct inode *, int, unsigned int);
329 int (*check_acl)(struct inode *, int, unsigned int);
329 int (*setattr) (struct dentry *, struct iattr *); 330 int (*setattr) (struct dentry *, struct iattr *);
330 int (*getattr) (struct vfsmount *mnt, struct dentry *, struct kstat *); 331 int (*getattr) (struct vfsmount *mnt, struct dentry *, struct kstat *);
331 int (*setxattr) (struct dentry *, const char *,const void *,size_t,int); 332 int (*setxattr) (struct dentry *, const char *,const void *,size_t,int);
@@ -414,6 +415,13 @@ otherwise noted.
414 permission: called by the VFS to check for access rights on a POSIX-like 415 permission: called by the VFS to check for access rights on a POSIX-like
415 filesystem. 416 filesystem.
416 417
418 May be called in rcu-walk mode (flags & IPERM_FLAG_RCU). If in rcu-walk
419 mode, the filesystem must check the permission without blocking or
420 storing to the inode.
421
422 If a situation is encountered that rcu-walk cannot handle, return
423 -ECHILD and it will be called again in ref-walk mode.
424
417 setattr: called by the VFS to set attributes for a file. This method 425 setattr: called by the VFS to set attributes for a file. This method
418 is called by chmod(2) and related system calls. 426 is called by chmod(2) and related system calls.
419 427
@@ -847,9 +855,12 @@ defined:
847 855
848struct dentry_operations { 856struct dentry_operations {
849 int (*d_revalidate)(struct dentry *, struct nameidata *); 857 int (*d_revalidate)(struct dentry *, struct nameidata *);
850 int (*d_hash) (struct dentry *, struct qstr *); 858 int (*d_hash)(const struct dentry *, const struct inode *,
851 int (*d_compare) (struct dentry *, struct qstr *, struct qstr *); 859 struct qstr *);
852 int (*d_delete)(struct dentry *); 860 int (*d_compare)(const struct dentry *, const struct inode *,
861 const struct dentry *, const struct inode *,
862 unsigned int, const char *, const struct qstr *);
863 int (*d_delete)(const struct dentry *);
853 void (*d_release)(struct dentry *); 864 void (*d_release)(struct dentry *);
854 void (*d_iput)(struct dentry *, struct inode *); 865 void (*d_iput)(struct dentry *, struct inode *);
855 char *(*d_dname)(struct dentry *, char *, int); 866 char *(*d_dname)(struct dentry *, char *, int);
@@ -860,13 +871,45 @@ struct dentry_operations {
860 dcache. Most filesystems leave this as NULL, because all their 871 dcache. Most filesystems leave this as NULL, because all their
861 dentries in the dcache are valid 872 dentries in the dcache are valid
862 873
863 d_hash: called when the VFS adds a dentry to the hash table 874 d_revalidate may be called in rcu-walk mode (nd->flags & LOOKUP_RCU).
875 If in rcu-walk mode, the filesystem must revalidate the dentry without
876 blocking or storing to the dentry, d_parent and d_inode should not be
877 used without care (because they can go NULL), instead nd->inode should
878 be used.
879
880 If a situation is encountered that rcu-walk cannot handle, return
881 -ECHILD and it will be called again in ref-walk mode.
882
883 d_hash: called when the VFS adds a dentry to the hash table. The first
884 dentry passed to d_hash is the parent directory that the name is
885 to be hashed into. The inode is the dentry's inode.
886
887 Same locking and synchronisation rules as d_compare regarding
888 what is safe to dereference etc.
889
890 d_compare: called to compare a dentry name with a given name. The first
891 dentry is the parent of the dentry to be compared, the second is
892 the parent's inode, then the dentry and inode (may be NULL) of the
893 child dentry. len and name string are properties of the dentry to be
894 compared. qstr is the name to compare it with.
895
896 Must be constant and idempotent, and should not take locks if
897 possible, and should not or store into the dentry or inodes.
898 Should not dereference pointers outside the dentry or inodes without
899 lots of care (eg. d_parent, d_inode, d_name should not be used).
900
901 However, our vfsmount is pinned, and RCU held, so the dentries and
902 inodes won't disappear, neither will our sb or filesystem module.
903 ->i_sb and ->d_sb may be used.
864 904
865 d_compare: called when a dentry should be compared with another 905 It is a tricky calling convention because it needs to be called under
906 "rcu-walk", ie. without any locks or references on things.
866 907
867 d_delete: called when the last reference to a dentry is 908 d_delete: called when the last reference to a dentry is dropped and the
868 deleted. This means no-one is using the dentry, however it is 909 dcache is deciding whether or not to cache it. Return 1 to delete
869 still valid and in the dcache 910 immediately, or 0 to cache the dentry. Default is NULL which means to
911 always cache a reachable dentry. d_delete must be constant and
912 idempotent.
870 913
871 d_release: called when a dentry is really deallocated 914 d_release: called when a dentry is really deallocated
872 915
@@ -910,14 +953,11 @@ manipulate dentries:
910 the usage count) 953 the usage count)
911 954
912 dput: close a handle for a dentry (decrements the usage count). If 955 dput: close a handle for a dentry (decrements the usage count). If
913 the usage count drops to 0, the "d_delete" method is called 956 the usage count drops to 0, and the dentry is still in its
914 and the dentry is placed on the unused list if the dentry is 957 parent's hash, the "d_delete" method is called to check whether
915 still in its parents hash list. Putting the dentry on the 958 it should be cached. If it should not be cached, or if the dentry
916 unused list just means that if the system needs some RAM, it 959 is not hashed, it is deleted. Otherwise cached dentries are put
917 goes through the unused list of dentries and deallocates them. 960 into an LRU list to be reclaimed on memory shortage.
918 If the dentry has already been unhashed and the usage count
919 drops to 0, in this case the dentry is deallocated after the
920 "d_delete" method is called
921 961
922 d_drop: this unhashes a dentry from its parents hash list. A 962 d_drop: this unhashes a dentry from its parents hash list. A
923 subsequent call to dput() will deallocate the dentry if its 963 subsequent call to dput() will deallocate the dentry if its