diff options
-rw-r--r-- | Documentation/filesystems/dentry-locking.txt | 172 | ||||
-rw-r--r-- | Documentation/filesystems/path-lookup.txt | 345 | ||||
-rw-r--r-- | fs/dcache.c | 203 | ||||
-rw-r--r-- | fs/filesystems.c | 3 | ||||
-rw-r--r-- | fs/namei.c | 743 | ||||
-rw-r--r-- | fs/proc/proc_sysctl.c | 4 | ||||
-rw-r--r-- | include/linux/dcache.h | 32 | ||||
-rw-r--r-- | include/linux/namei.h | 15 | ||||
-rw-r--r-- | include/linux/security.h | 8 | ||||
-rw-r--r-- | security/security.c | 9 |
10 files changed, 1194 insertions, 340 deletions
diff --git a/Documentation/filesystems/dentry-locking.txt b/Documentation/filesystems/dentry-locking.txt deleted file mode 100644 index 30b6a40f5650..000000000000 --- a/Documentation/filesystems/dentry-locking.txt +++ /dev/null | |||
@@ -1,172 +0,0 @@ | |||
1 | RCU-based dcache locking model | ||
2 | ============================== | ||
3 | |||
4 | On many workloads, the most common operation on dcache is to look up a | ||
5 | dentry, given a parent dentry and the name of the child. Typically, | ||
6 | for every open(), stat() etc., the dentry corresponding to the | ||
7 | pathname will be looked up by walking the tree starting with the first | ||
8 | component of the pathname and using that dentry along with the next | ||
9 | component to look up the next level and so on. Since it is a frequent | ||
10 | operation for workloads like multiuser environments and web servers, | ||
11 | it is important to optimize this path. | ||
12 | |||
13 | Prior to 2.5.10, dcache_lock was acquired in d_lookup and thus in | ||
14 | every component during path look-up. Since 2.5.10 onwards, fast-walk | ||
15 | algorithm changed this by holding the dcache_lock at the beginning and | ||
16 | walking as many cached path component dentries as possible. This | ||
17 | significantly decreases the number of acquisition of | ||
18 | dcache_lock. However it also increases the lock hold time | ||
19 | significantly and affects performance in large SMP machines. Since | ||
20 | 2.5.62 kernel, dcache has been using a new locking model that uses RCU | ||
21 | to make dcache look-up lock-free. | ||
22 | |||
23 | The current dcache locking model is not very different from the | ||
24 | existing dcache locking model. Prior to 2.5.62 kernel, dcache_lock | ||
25 | protected the hash chain, d_child, d_alias, d_lru lists as well as | ||
26 | d_inode and several other things like mount look-up. RCU-based changes | ||
27 | affect only the way the hash chain is protected. For everything else | ||
28 | the dcache_lock must be taken for both traversing as well as | ||
29 | updating. The hash chain updates too take the dcache_lock. The | ||
30 | significant change is the way d_lookup traverses the hash chain, it | ||
31 | doesn't acquire the dcache_lock for this and rely on RCU to ensure | ||
32 | that the dentry has not been *freed*. | ||
33 | |||
34 | dcache_lock no longer exists, dentry locking is explained in fs/dcache.c | ||
35 | |||
36 | Dcache locking details | ||
37 | ====================== | ||
38 | |||
39 | For many multi-user workloads, open() and stat() on files are very | ||
40 | frequently occurring operations. Both involve walking of path names to | ||
41 | find the dentry corresponding to the concerned file. In 2.4 kernel, | ||
42 | dcache_lock was held during look-up of each path component. Contention | ||
43 | and cache-line bouncing of this global lock caused significant | ||
44 | scalability problems. With the introduction of RCU in Linux kernel, | ||
45 | this was worked around by making the look-up of path components during | ||
46 | path walking lock-free. | ||
47 | |||
48 | |||
49 | Safe lock-free look-up of dcache hash table | ||
50 | =========================================== | ||
51 | |||
52 | Dcache is a complex data structure with the hash table entries also | ||
53 | linked together in other lists. In 2.4 kernel, dcache_lock protected | ||
54 | all the lists. RCU dentry hash walking works like this: | ||
55 | |||
56 | 1. The deletion from hash chain is done using hlist_del_rcu() macro | ||
57 | which doesn't initialize next pointer of the deleted dentry and | ||
58 | this allows us to walk safely lock-free while a deletion is | ||
59 | happening. This is a standard hlist_rcu iteration. | ||
60 | |||
61 | 2. Insertion of a dentry into the hash table is done using | ||
62 | hlist_add_head_rcu() which take care of ordering the writes - the | ||
63 | writes to the dentry must be visible before the dentry is | ||
64 | inserted. This works in conjunction with hlist_for_each_rcu(), | ||
65 | which has since been replaced by hlist_for_each_entry_rcu(), while | ||
66 | walking the hash chain. The only requirement is that all | ||
67 | initialization to the dentry must be done before | ||
68 | hlist_add_head_rcu() since we don't have lock protection | ||
69 | while traversing the hash chain. | ||
70 | |||
71 | 3. The dentry looked up without holding locks cannot be returned for | ||
72 | walking if it is unhashed. It then may have a NULL d_inode or other | ||
73 | bogosity since RCU doesn't protect the other fields in the dentry. We | ||
74 | therefore use a flag DCACHE_UNHASHED to indicate unhashed dentries | ||
75 | and use this in conjunction with a per-dentry lock (d_lock). Once | ||
76 | looked up without locks, we acquire the per-dentry lock (d_lock) and | ||
77 | check if the dentry is unhashed. If so, the look-up is failed. If not, | ||
78 | the reference count of the dentry is increased and the dentry is | ||
79 | returned. | ||
80 | |||
81 | 4. Once a dentry is looked up, it must be ensured during the path walk | ||
82 | for that component it doesn't go away. In pre-2.5.10 code, this was | ||
83 | done holding a reference to the dentry. dcache_rcu does the same. | ||
84 | In some sense, dcache_rcu path walking looks like the pre-2.5.10 | ||
85 | version. | ||
86 | |||
87 | 5. All dentry hash chain updates must take the per-dentry lock (see | ||
88 | fs/dcache.c). This excludes dput() to ensure that a dentry that has | ||
89 | been looked up concurrently does not get deleted before dget() can | ||
90 | take a ref. | ||
91 | |||
92 | 6. There are several ways to do reference counting of RCU protected | ||
93 | objects. One such example is in ipv4 route cache where deferred | ||
94 | freeing (using call_rcu()) is done as soon as the reference count | ||
95 | goes to zero. This cannot be done in the case of dentries because | ||
96 | tearing down of dentries require blocking (dentry_iput()) which | ||
97 | isn't supported from RCU callbacks. Instead, tearing down of | ||
98 | dentries happen synchronously in dput(), but actual freeing happens | ||
99 | later when RCU grace period is over. This allows safe lock-free | ||
100 | walking of the hash chains, but a matched dentry may have been | ||
101 | partially torn down. The checking of DCACHE_UNHASHED flag with | ||
102 | d_lock held detects such dentries and prevents them from being | ||
103 | returned from look-up. | ||
104 | |||
105 | |||
106 | Maintaining POSIX rename semantics | ||
107 | ================================== | ||
108 | |||
109 | Since look-up of dentries is lock-free, it can race against a | ||
110 | concurrent rename operation. For example, during rename of file A to | ||
111 | B, look-up of either A or B must succeed. So, if look-up of B happens | ||
112 | after A has been removed from the hash chain but not added to the new | ||
113 | hash chain, it may fail. Also, a comparison while the name is being | ||
114 | written concurrently by a rename may result in false positive matches | ||
115 | violating rename semantics. Issues related to race with rename are | ||
116 | handled as described below : | ||
117 | |||
118 | 1. Look-up can be done in two ways - d_lookup() which is safe from | ||
119 | simultaneous renames and __d_lookup() which is not. If | ||
120 | __d_lookup() fails, it must be followed up by a d_lookup() to | ||
121 | correctly determine whether a dentry is in the hash table or | ||
122 | not. d_lookup() protects look-ups using a sequence lock | ||
123 | (rename_lock). | ||
124 | |||
125 | 2. The name associated with a dentry (d_name) may be changed if a | ||
126 | rename is allowed to happen simultaneously. To avoid memcmp() in | ||
127 | __d_lookup() go out of bounds due to a rename and false positive | ||
128 | comparison, the name comparison is done while holding the | ||
129 | per-dentry lock. This prevents concurrent renames during this | ||
130 | operation. | ||
131 | |||
132 | 3. Hash table walking during look-up may move to a different bucket as | ||
133 | the current dentry is moved to a different bucket due to rename. | ||
134 | But we use hlists in dcache hash table and they are | ||
135 | null-terminated. So, even if a dentry moves to a different bucket, | ||
136 | hash chain walk will terminate. [with a list_head list, it may not | ||
137 | since termination is when the list_head in the original bucket is | ||
138 | reached]. Since we redo the d_parent check and compare name while | ||
139 | holding d_lock, lock-free look-up will not race against d_move(). | ||
140 | |||
141 | 4. There can be a theoretical race when a dentry keeps coming back to | ||
142 | original bucket due to double moves. Due to this look-up may | ||
143 | consider that it has never moved and can end up in a infinite loop. | ||
144 | But this is not any worse that theoretical livelocks we already | ||
145 | have in the kernel. | ||
146 | |||
147 | |||
148 | Important guidelines for filesystem developers related to dcache_rcu | ||
149 | ==================================================================== | ||
150 | |||
151 | 1. Existing dcache interfaces (pre-2.5.62) exported to filesystem | ||
152 | don't change. Only dcache internal implementation changes. However | ||
153 | filesystems *must not* delete from the dentry hash chains directly | ||
154 | using the list macros like allowed earlier. They must use dcache | ||
155 | APIs like d_drop() or __d_drop() depending on the situation. | ||
156 | |||
157 | 2. d_flags is now protected by a per-dentry lock (d_lock). All access | ||
158 | to d_flags must be protected by it. | ||
159 | |||
160 | 3. For a hashed dentry, checking of d_count needs to be protected by | ||
161 | d_lock. | ||
162 | |||
163 | |||
164 | Papers and other documentation on dcache locking | ||
165 | ================================================ | ||
166 | |||
167 | 1. Scaling dcache with RCU (http://linuxjournal.com/article.php?sid=7124). | ||
168 | |||
169 | 2. http://lse.sourceforge.net/locking/dcache/dcache.html | ||
170 | |||
171 | |||
172 | |||
diff --git a/Documentation/filesystems/path-lookup.txt b/Documentation/filesystems/path-lookup.txt new file mode 100644 index 000000000000..09b2878724a1 --- /dev/null +++ b/Documentation/filesystems/path-lookup.txt | |||
@@ -0,0 +1,345 @@ | |||
1 | Path walking and name lookup locking | ||
2 | ==================================== | ||
3 | |||
4 | Path resolution is the finding a dentry corresponding to a path name string, by | ||
5 | performing a path walk. Typically, for every open(), stat() etc., the path name | ||
6 | will be resolved. Paths are resolved by walking the namespace tree, starting | ||
7 | with the first component of the pathname (eg. root or cwd) with a known dentry, | ||
8 | then finding the child of that dentry, which is named the next component in the | ||
9 | path string. Then repeating the lookup from the child dentry and finding its | ||
10 | child with the next element, and so on. | ||
11 | |||
12 | Since it is a frequent operation for workloads like multiuser environments and | ||
13 | web servers, it is important to optimize this code. | ||
14 | |||
15 | Path walking synchronisation history: | ||
16 | Prior to 2.5.10, dcache_lock was acquired in d_lookup (dcache hash lookup) and | ||
17 | thus in every component during path look-up. Since 2.5.10 onwards, fast-walk | ||
18 | algorithm changed this by holding the dcache_lock at the beginning and walking | ||
19 | as many cached path component dentries as possible. This significantly | ||
20 | decreases the number of acquisition of dcache_lock. However it also increases | ||
21 | the lock hold time significantly and affects performance in large SMP machines. | ||
22 | Since 2.5.62 kernel, dcache has been using a new locking model that uses RCU to | ||
23 | make dcache look-up lock-free. | ||
24 | |||
25 | All the above algorithms required taking a lock and reference count on the | ||
26 | dentry that was looked up, so that may be used as the basis for walking the | ||
27 | next path element. This is inefficient and unscalable. It is inefficient | ||
28 | because of the locks and atomic operations required for every dentry element | ||
29 | slows things down. It is not scalable because many parallel applications that | ||
30 | are path-walk intensive tend to do path lookups starting from a common dentry | ||
31 | (usually, the root "/" or current working directory). So contention on these | ||
32 | common path elements causes lock and cacheline queueing. | ||
33 | |||
34 | Since 2.6.38, RCU is used to make a significant part of the entire path walk | ||
35 | (including dcache look-up) completely "store-free" (so, no locks, atomics, or | ||
36 | even stores into cachelines of common dentries). This is known as "rcu-walk" | ||
37 | path walking. | ||
38 | |||
39 | Path walking overview | ||
40 | ===================== | ||
41 | |||
42 | A name string specifies a start (root directory, cwd, fd-relative) and a | ||
43 | sequence of elements (directory entry names), which together refer to a path in | ||
44 | the namespace. A path is represented as a (dentry, vfsmount) tuple. The name | ||
45 | elements are sub-strings, seperated by '/'. | ||
46 | |||
47 | Name lookups will want to find a particular path that a name string refers to | ||
48 | (usually the final element, or parent of final element). This is done by taking | ||
49 | the path given by the name's starting point (which we know in advance -- eg. | ||
50 | current->fs->cwd or current->fs->root) as the first parent of the lookup. Then | ||
51 | iteratively for each subsequent name element, look up the child of the current | ||
52 | parent with the given name and if it is not the desired entry, make it the | ||
53 | parent for the next lookup. | ||
54 | |||
55 | A parent, of course, must be a directory, and we must have appropriate | ||
56 | permissions on the parent inode to be able to walk into it. | ||
57 | |||
58 | Turning the child into a parent for the next lookup requires more checks and | ||
59 | procedures. Symlinks essentially substitute the symlink name for the target | ||
60 | name in the name string, and require some recursive path walking. Mount points | ||
61 | must be followed into (thus changing the vfsmount that subsequent path elements | ||
62 | refer to), switching from the mount point path to the root of the particular | ||
63 | mounted vfsmount. These behaviours are variously modified depending on the | ||
64 | exact path walking flags. | ||
65 | |||
66 | Path walking then must, broadly, do several particular things: | ||
67 | - find the start point of the walk; | ||
68 | - perform permissions and validity checks on inodes; | ||
69 | - perform dcache hash name lookups on (parent, name element) tuples; | ||
70 | - traverse mount points; | ||
71 | - traverse symlinks; | ||
72 | - lookup and create missing parts of the path on demand. | ||
73 | |||
74 | Safe store-free look-up of dcache hash table | ||
75 | ============================================ | ||
76 | |||
77 | Dcache name lookup | ||
78 | ------------------ | ||
79 | In order to lookup a dcache (parent, name) tuple, we take a hash on the tuple | ||
80 | and use that to select a bucket in the dcache-hash table. The list of entries | ||
81 | in that bucket is then walked, and we do a full comparison of each entry | ||
82 | against our (parent, name) tuple. | ||
83 | |||
84 | The hash lists are RCU protected, so list walking is not serialised with | ||
85 | concurrent updates (insertion, deletion from the hash). This is a standard RCU | ||
86 | list application with the exception of renames, which will be covered below. | ||
87 | |||
88 | Parent and name members of a dentry, as well as its membership in the dcache | ||
89 | hash, and its inode are protected by the per-dentry d_lock spinlock. A | ||
90 | reference is taken on the dentry (while the fields are verified under d_lock), | ||
91 | and this stabilises its d_inode pointer and actual inode. This gives a stable | ||
92 | point to perform the next step of our path walk against. | ||
93 | |||
94 | These members are also protected by d_seq seqlock, although this offers | ||
95 | read-only protection and no durability of results, so care must be taken when | ||
96 | using d_seq for synchronisation (see seqcount based lookups, below). | ||
97 | |||
98 | Renames | ||
99 | ------- | ||
100 | Back to the rename case. In usual RCU protected lists, the only operations that | ||
101 | will happen to an object is insertion, and then eventually removal from the | ||
102 | list. The object will not be reused until an RCU grace period is complete. | ||
103 | This ensures the RCU list traversal primitives can run over the object without | ||
104 | problems (see RCU documentation for how this works). | ||
105 | |||
106 | However when a dentry is renamed, its hash value can change, requiring it to be | ||
107 | moved to a new hash list. Allocating and inserting a new alias would be | ||
108 | expensive and also problematic for directory dentries. Latency would be far to | ||
109 | high to wait for a grace period after removing the dentry and before inserting | ||
110 | it in the new hash bucket. So what is done is to insert the dentry into the | ||
111 | new list immediately. | ||
112 | |||
113 | However, when the dentry's list pointers are updated to point to objects in the | ||
114 | new list before waiting for a grace period, this can result in a concurrent RCU | ||
115 | lookup of the old list veering off into the new (incorrect) list and missing | ||
116 | the remaining dentries on the list. | ||
117 | |||
118 | There is no fundamental problem with walking down the wrong list, because the | ||
119 | dentry comparisons will never match. However it is fatal to miss a matching | ||
120 | dentry. So a seqlock is used to detect when a rename has occurred, and so the | ||
121 | lookup can be retried. | ||
122 | |||
123 | 1 2 3 | ||
124 | +---+ +---+ +---+ | ||
125 | hlist-->| N-+->| N-+->| N-+-> | ||
126 | head <--+-P |<-+-P |<-+-P | | ||
127 | +---+ +---+ +---+ | ||
128 | |||
129 | Rename of dentry 2 may require it deleted from the above list, and inserted | ||
130 | into a new list. Deleting 2 gives the following list. | ||
131 | |||
132 | 1 3 | ||
133 | +---+ +---+ (don't worry, the longer pointers do not | ||
134 | hlist-->| N-+-------->| N-+-> impose a measurable performance overhead | ||
135 | head <--+-P |<--------+-P | on modern CPUs) | ||
136 | +---+ +---+ | ||
137 | ^ 2 ^ | ||
138 | | +---+ | | ||
139 | | | N-+----+ | ||
140 | +----+-P | | ||
141 | +---+ | ||
142 | |||
143 | This is a standard RCU-list deletion, which leaves the deleted object's | ||
144 | pointers intact, so a concurrent list walker that is currently looking at | ||
145 | object 2 will correctly continue to object 3 when it is time to traverse the | ||
146 | next object. | ||
147 | |||
148 | However, when inserting object 2 onto a new list, we end up with this: | ||
149 | |||
150 | 1 3 | ||
151 | +---+ +---+ | ||
152 | hlist-->| N-+-------->| N-+-> | ||
153 | head <--+-P |<--------+-P | | ||
154 | +---+ +---+ | ||
155 | 2 | ||
156 | +---+ | ||
157 | | N-+----> | ||
158 | <----+-P | | ||
159 | +---+ | ||
160 | |||
161 | Because we didn't wait for a grace period, there may be a concurrent lookup | ||
162 | still at 2. Now when it follows 2's 'next' pointer, it will walk off into | ||
163 | another list without ever having checked object 3. | ||
164 | |||
165 | A related, but distinctly different, issue is that of rename atomicity versus | ||
166 | lookup operations. If a file is renamed from 'A' to 'B', a lookup must only | ||
167 | find either 'A' or 'B'. So if a lookup of 'A' returns NULL, a subsequent lookup | ||
168 | of 'B' must succeed (note the reverse is not true). | ||
169 | |||
170 | Between deleting the dentry from the old hash list, and inserting it on the new | ||
171 | hash list, a lookup may find neither 'A' nor 'B' matching the dentry. The same | ||
172 | rename seqlock is also used to cover this race in much the same way, by | ||
173 | retrying a negative lookup result if a rename was in progress. | ||
174 | |||
175 | Seqcount based lookups | ||
176 | ---------------------- | ||
177 | In refcount based dcache lookups, d_lock is used to serialise access to | ||
178 | the dentry, stabilising it while comparing its name and parent and then | ||
179 | taking a reference count (the reference count then gives a stable place to | ||
180 | start the next part of the path walk from). | ||
181 | |||
182 | As explained above, we would like to do path walking without taking locks or | ||
183 | reference counts on intermediate dentries along the path. To do this, a per | ||
184 | dentry seqlock (d_seq) is used to take a "coherent snapshot" of what the dentry | ||
185 | looks like (its name, parent, and inode). That snapshot is then used to start | ||
186 | the next part of the path walk. When loading the coherent snapshot under d_seq, | ||
187 | care must be taken to load the members up-front, and use those pointers rather | ||
188 | than reloading from the dentry later on (otherwise we'd have interesting things | ||
189 | like d_inode going NULL underneath us, if the name was unlinked). | ||
190 | |||
191 | Also important is to avoid performing any destructive operations (pretty much: | ||
192 | no non-atomic stores to shared data), and to recheck the seqcount when we are | ||
193 | "done" with the operation. Retry or abort if the seqcount does not match. | ||
194 | Avoiding destructive or changing operations means we can easily unwind from | ||
195 | failure. | ||
196 | |||
197 | What this means is that a caller, provided they are holding RCU lock to | ||
198 | protect the dentry object from disappearing, can perform a seqcount based | ||
199 | lookup which does not increment the refcount on the dentry or write to | ||
200 | it in any way. This returned dentry can be used for subsequent operations, | ||
201 | provided that d_seq is rechecked after that operation is complete. | ||
202 | |||
203 | Inodes are also rcu freed, so the seqcount lookup dentry's inode may also be | ||
204 | queried for permissions. | ||
205 | |||
206 | With this two parts of the puzzle, we can do path lookups without taking | ||
207 | locks or refcounts on dentry elements. | ||
208 | |||
209 | RCU-walk path walking design | ||
210 | ============================ | ||
211 | |||
212 | Path walking code now has two distinct modes, ref-walk and rcu-walk. ref-walk | ||
213 | is the traditional[*] way of performing dcache lookups using d_lock to | ||
214 | serialise concurrent modifications to the dentry and take a reference count on | ||
215 | it. ref-walk is simple and obvious, and may sleep, take locks, etc while path | ||
216 | walking is operating on each dentry. rcu-walk uses seqcount based dentry | ||
217 | lookups, and can perform lookup of intermediate elements without any stores to | ||
218 | shared data in the dentry or inode. rcu-walk can not be applied to all cases, | ||
219 | eg. if the filesystem must sleep or perform non trivial operations, rcu-walk | ||
220 | must be switched to ref-walk mode. | ||
221 | |||
222 | [*] RCU is still used for the dentry hash lookup in ref-walk, but not the full | ||
223 | path walk. | ||
224 | |||
225 | Where ref-walk uses a stable, refcounted ``parent'' to walk the remaining | ||
226 | path string, rcu-walk uses a d_seq protected snapshot. When looking up a | ||
227 | child of this parent snapshot, we open d_seq critical section on the child | ||
228 | before closing d_seq critical section on the parent. This gives an interlocking | ||
229 | ladder of snapshots to walk down. | ||
230 | |||
231 | |||
232 | proc 101 | ||
233 | /----------------\ | ||
234 | / comm: "vi" \ | ||
235 | / fs.root: dentry0 \ | ||
236 | \ fs.cwd: dentry2 / | ||
237 | \ / | ||
238 | \----------------/ | ||
239 | |||
240 | So when vi wants to open("/home/npiggin/test.c", O_RDWR), then it will | ||
241 | start from current->fs->root, which is a pinned dentry. Alternatively, | ||
242 | "./test.c" would start from cwd; both names refer to the same path in | ||
243 | the context of proc101. | ||
244 | |||
245 | dentry 0 | ||
246 | +---------------------+ rcu-walk begins here, we note d_seq, check the | ||
247 | | name: "/" | inode's permission, and then look up the next | ||
248 | | inode: 10 | path element which is "home"... | ||
249 | | children:"home", ...| | ||
250 | +---------------------+ | ||
251 | | | ||
252 | dentry 1 V | ||
253 | +---------------------+ ... which brings us here. We find dentry1 via | ||
254 | | name: "home" | hash lookup, then note d_seq and compare name | ||
255 | | inode: 678 | string and parent pointer. When we have a match, | ||
256 | | children:"npiggin" | we now recheck the d_seq of dentry0. Then we | ||
257 | +---------------------+ check inode and look up the next element. | ||
258 | | | ||
259 | dentry2 V | ||
260 | +---------------------+ Note: if dentry0 is now modified, lookup is | ||
261 | | name: "npiggin" | not necessarily invalid, so we need only keep a | ||
262 | | inode: 543 | parent for d_seq verification, and grandparents | ||
263 | | children:"a.c", ... | can be forgotten. | ||
264 | +---------------------+ | ||
265 | | | ||
266 | dentry3 V | ||
267 | +---------------------+ At this point we have our destination dentry. | ||
268 | | name: "a.c" | We now take its d_lock, verify d_seq of this | ||
269 | | inode: 14221 | dentry. If that checks out, we can increment | ||
270 | | children:NULL | its refcount because we're holding d_lock. | ||
271 | +---------------------+ | ||
272 | |||
273 | Taking a refcount on a dentry from rcu-walk mode, by taking its d_lock, | ||
274 | re-checking its d_seq, and then incrementing its refcount is called | ||
275 | "dropping rcu" or dropping from rcu-walk into ref-walk mode. | ||
276 | |||
277 | It is, in some sense, a bit of a house of cards. If the seqcount check of the | ||
278 | parent snapshot fails, the house comes down, because we had closed the d_seq | ||
279 | section on the grandparent, so we have nothing left to stand on. In that case, | ||
280 | the path walk must be fully restarted (which we do in ref-walk mode, to avoid | ||
281 | live locks). It is costly to have a full restart, but fortunately they are | ||
282 | quite rare. | ||
283 | |||
284 | When we reach a point where sleeping is required, or a filesystem callout | ||
285 | requires ref-walk, then instead of restarting the walk, we attempt to drop rcu | ||
286 | at the last known good dentry we have. Avoiding a full restart in ref-walk in | ||
287 | these cases is fundamental for performance and scalability because blocking | ||
288 | operations such as creates and unlinks are not uncommon. | ||
289 | |||
290 | The detailed design for rcu-walk is like this: | ||
291 | * LOOKUP_RCU is set in nd->flags, which distinguishes rcu-walk from ref-walk. | ||
292 | * Take the RCU lock for the entire path walk, starting with the acquiring | ||
293 | of the starting path (eg. root/cwd/fd-path). So now dentry refcounts are | ||
294 | not required for dentry persistence. | ||
295 | * synchronize_rcu is called when unregistering a filesystem, so we can | ||
296 | access d_ops and i_ops during rcu-walk. | ||
297 | * Similarly take the vfsmount lock for the entire path walk. So now mnt | ||
298 | refcounts are not required for persistence. Also we are free to perform mount | ||
299 | lookups, and to assume dentry mount points and mount roots are stable up and | ||
300 | down the path. | ||
301 | * Have a per-dentry seqlock to protect the dentry name, parent, and inode, | ||
302 | so we can load this tuple atomically, and also check whether any of its | ||
303 | members have changed. | ||
304 | * Dentry lookups (based on parent, candidate string tuple) recheck the parent | ||
305 | sequence after the child is found in case anything changed in the parent | ||
306 | during the path walk. | ||
307 | * inode is also RCU protected so we can load d_inode and use the inode for | ||
308 | limited things. | ||
309 | * i_mode, i_uid, i_gid can be tested for exec permissions during path walk. | ||
310 | * i_op can be loaded. | ||
311 | * When the destination dentry is reached, drop rcu there (ie. take d_lock, | ||
312 | verify d_seq, increment refcount). | ||
313 | * If seqlock verification fails anywhere along the path, do a full restart | ||
314 | of the path lookup in ref-walk mode. -ECHILD tends to be used (for want of | ||
315 | a better errno) to signal an rcu-walk failure. | ||
316 | |||
317 | The cases where rcu-walk cannot continue are: | ||
318 | * NULL dentry (ie. any uncached path element) | ||
319 | * parent with d_inode->i_op->permission or ACLs | ||
320 | * dentries with d_revalidate | ||
321 | * Following links | ||
322 | |||
323 | In future patches, permission checks and d_revalidate become rcu-walk aware. It | ||
324 | may be possible eventually to make following links rcu-walk aware. | ||
325 | |||
326 | Uncached path elements will always require dropping to ref-walk mode, at the | ||
327 | very least because i_mutex needs to be grabbed, and objects allocated. | ||
328 | |||
329 | Final note: | ||
330 | "store-free" path walking is not strictly store free. We take vfsmount lock | ||
331 | and refcounts (both of which can be made per-cpu), and we also store to the | ||
332 | stack (which is essentially CPU-local), and we also have to take locks and | ||
333 | refcount on final dentry. | ||
334 | |||
335 | The point is that shared data, where practically possible, is not locked | ||
336 | or stored into. The result is massive improvements in performance and | ||
337 | scalability of path resolution. | ||
338 | |||
339 | |||
340 | Papers and other documentation on dcache locking | ||
341 | ================================================ | ||
342 | |||
343 | 1. Scaling dcache with RCU (http://linuxjournal.com/article.php?sid=7124). | ||
344 | |||
345 | 2. http://lse.sourceforge.net/locking/dcache/dcache.html | ||
diff --git a/fs/dcache.c b/fs/dcache.c index dc0551c9755d..187fea040108 100644 --- a/fs/dcache.c +++ b/fs/dcache.c | |||
@@ -152,9 +152,23 @@ static void d_free(struct dentry *dentry) | |||
152 | call_rcu(&dentry->d_u.d_rcu, __d_free); | 152 | call_rcu(&dentry->d_u.d_rcu, __d_free); |
153 | } | 153 | } |
154 | 154 | ||
155 | /** | ||
156 | * dentry_rcuwalk_barrier - invalidate in-progress rcu-walk lookups | ||
157 | * After this call, in-progress rcu-walk path lookup will fail. This | ||
158 | * should be called after unhashing, and after changing d_inode (if | ||
159 | * the dentry has not already been unhashed). | ||
160 | */ | ||
161 | static inline void dentry_rcuwalk_barrier(struct dentry *dentry) | ||
162 | { | ||
163 | assert_spin_locked(&dentry->d_lock); | ||
164 | /* Go through a barrier */ | ||
165 | write_seqcount_barrier(&dentry->d_seq); | ||
166 | } | ||
167 | |||
155 | /* | 168 | /* |
156 | * Release the dentry's inode, using the filesystem | 169 | * Release the dentry's inode, using the filesystem |
157 | * d_iput() operation if defined. | 170 | * d_iput() operation if defined. Dentry has no refcount |
171 | * and is unhashed. | ||
158 | */ | 172 | */ |
159 | static void dentry_iput(struct dentry * dentry) | 173 | static void dentry_iput(struct dentry * dentry) |
160 | __releases(dentry->d_lock) | 174 | __releases(dentry->d_lock) |
@@ -179,6 +193,28 @@ static void dentry_iput(struct dentry * dentry) | |||
179 | } | 193 | } |
180 | 194 | ||
181 | /* | 195 | /* |
196 | * Release the dentry's inode, using the filesystem | ||
197 | * d_iput() operation if defined. dentry remains in-use. | ||
198 | */ | ||
199 | static void dentry_unlink_inode(struct dentry * dentry) | ||
200 | __releases(dentry->d_lock) | ||
201 | __releases(dcache_inode_lock) | ||
202 | { | ||
203 | struct inode *inode = dentry->d_inode; | ||
204 | dentry->d_inode = NULL; | ||
205 | list_del_init(&dentry->d_alias); | ||
206 | dentry_rcuwalk_barrier(dentry); | ||
207 | spin_unlock(&dentry->d_lock); | ||
208 | spin_unlock(&dcache_inode_lock); | ||
209 | if (!inode->i_nlink) | ||
210 | fsnotify_inoderemove(inode); | ||
211 | if (dentry->d_op && dentry->d_op->d_iput) | ||
212 | dentry->d_op->d_iput(dentry, inode); | ||
213 | else | ||
214 | iput(inode); | ||
215 | } | ||
216 | |||
217 | /* | ||
182 | * dentry_lru_(add|del|move_tail) must be called with d_lock held. | 218 | * dentry_lru_(add|del|move_tail) must be called with d_lock held. |
183 | */ | 219 | */ |
184 | static void dentry_lru_add(struct dentry *dentry) | 220 | static void dentry_lru_add(struct dentry *dentry) |
@@ -272,6 +308,7 @@ void __d_drop(struct dentry *dentry) | |||
272 | spin_lock(&dcache_hash_lock); | 308 | spin_lock(&dcache_hash_lock); |
273 | hlist_del_rcu(&dentry->d_hash); | 309 | hlist_del_rcu(&dentry->d_hash); |
274 | spin_unlock(&dcache_hash_lock); | 310 | spin_unlock(&dcache_hash_lock); |
311 | dentry_rcuwalk_barrier(dentry); | ||
275 | } | 312 | } |
276 | } | 313 | } |
277 | EXPORT_SYMBOL(__d_drop); | 314 | EXPORT_SYMBOL(__d_drop); |
@@ -309,6 +346,7 @@ relock: | |||
309 | spin_unlock(&dcache_inode_lock); | 346 | spin_unlock(&dcache_inode_lock); |
310 | goto relock; | 347 | goto relock; |
311 | } | 348 | } |
349 | |||
312 | if (ref) | 350 | if (ref) |
313 | dentry->d_count--; | 351 | dentry->d_count--; |
314 | /* if dentry was on the d_lru list delete it from there */ | 352 | /* if dentry was on the d_lru list delete it from there */ |
@@ -1221,6 +1259,7 @@ struct dentry *d_alloc(struct dentry * parent, const struct qstr *name) | |||
1221 | dentry->d_count = 1; | 1259 | dentry->d_count = 1; |
1222 | dentry->d_flags = DCACHE_UNHASHED; | 1260 | dentry->d_flags = DCACHE_UNHASHED; |
1223 | spin_lock_init(&dentry->d_lock); | 1261 | spin_lock_init(&dentry->d_lock); |
1262 | seqcount_init(&dentry->d_seq); | ||
1224 | dentry->d_inode = NULL; | 1263 | dentry->d_inode = NULL; |
1225 | dentry->d_parent = NULL; | 1264 | dentry->d_parent = NULL; |
1226 | dentry->d_sb = NULL; | 1265 | dentry->d_sb = NULL; |
@@ -1269,6 +1308,7 @@ static void __d_instantiate(struct dentry *dentry, struct inode *inode) | |||
1269 | if (inode) | 1308 | if (inode) |
1270 | list_add(&dentry->d_alias, &inode->i_dentry); | 1309 | list_add(&dentry->d_alias, &inode->i_dentry); |
1271 | dentry->d_inode = inode; | 1310 | dentry->d_inode = inode; |
1311 | dentry_rcuwalk_barrier(dentry); | ||
1272 | spin_unlock(&dentry->d_lock); | 1312 | spin_unlock(&dentry->d_lock); |
1273 | fsnotify_d_instantiate(dentry, inode); | 1313 | fsnotify_d_instantiate(dentry, inode); |
1274 | } | 1314 | } |
@@ -1611,6 +1651,111 @@ err_out: | |||
1611 | EXPORT_SYMBOL(d_add_ci); | 1651 | EXPORT_SYMBOL(d_add_ci); |
1612 | 1652 | ||
1613 | /** | 1653 | /** |
1654 | * __d_lookup_rcu - search for a dentry (racy, store-free) | ||
1655 | * @parent: parent dentry | ||
1656 | * @name: qstr of name we wish to find | ||
1657 | * @seq: returns d_seq value at the point where the dentry was found | ||
1658 | * @inode: returns dentry->d_inode when the inode was found valid. | ||
1659 | * Returns: dentry, or NULL | ||
1660 | * | ||
1661 | * __d_lookup_rcu is the dcache lookup function for rcu-walk name | ||
1662 | * resolution (store-free path walking) design described in | ||
1663 | * Documentation/filesystems/path-lookup.txt. | ||
1664 | * | ||
1665 | * This is not to be used outside core vfs. | ||
1666 | * | ||
1667 | * __d_lookup_rcu must only be used in rcu-walk mode, ie. with vfsmount lock | ||
1668 | * held, and rcu_read_lock held. The returned dentry must not be stored into | ||
1669 | * without taking d_lock and checking d_seq sequence count against @seq | ||
1670 | * returned here. | ||
1671 | * | ||
1672 | * A refcount may be taken on the found dentry with the __d_rcu_to_refcount | ||
1673 | * function. | ||
1674 | * | ||
1675 | * Alternatively, __d_lookup_rcu may be called again to look up the child of | ||
1676 | * the returned dentry, so long as its parent's seqlock is checked after the | ||
1677 | * child is looked up. Thus, an interlocking stepping of sequence lock checks | ||
1678 | * is formed, giving integrity down the path walk. | ||
1679 | */ | ||
1680 | struct dentry *__d_lookup_rcu(struct dentry *parent, struct qstr *name, | ||
1681 | unsigned *seq, struct inode **inode) | ||
1682 | { | ||
1683 | unsigned int len = name->len; | ||
1684 | unsigned int hash = name->hash; | ||
1685 | const unsigned char *str = name->name; | ||
1686 | struct hlist_head *head = d_hash(parent, hash); | ||
1687 | struct hlist_node *node; | ||
1688 | struct dentry *dentry; | ||
1689 | |||
1690 | /* | ||
1691 | * Note: There is significant duplication with __d_lookup_rcu which is | ||
1692 | * required to prevent single threaded performance regressions | ||
1693 | * especially on architectures where smp_rmb (in seqcounts) are costly. | ||
1694 | * Keep the two functions in sync. | ||
1695 | */ | ||
1696 | |||
1697 | /* | ||
1698 | * The hash list is protected using RCU. | ||
1699 | * | ||
1700 | * Carefully use d_seq when comparing a candidate dentry, to avoid | ||
1701 | * races with d_move(). | ||
1702 | * | ||
1703 | * It is possible that concurrent renames can mess up our list | ||
1704 | * walk here and result in missing our dentry, resulting in the | ||
1705 | * false-negative result. d_lookup() protects against concurrent | ||
1706 | * renames using rename_lock seqlock. | ||
1707 | * | ||
1708 | * See Documentation/vfs/dcache-locking.txt for more details. | ||
1709 | */ | ||
1710 | hlist_for_each_entry_rcu(dentry, node, head, d_hash) { | ||
1711 | struct inode *i; | ||
1712 | const char *tname; | ||
1713 | int tlen; | ||
1714 | |||
1715 | if (dentry->d_name.hash != hash) | ||
1716 | continue; | ||
1717 | |||
1718 | seqretry: | ||
1719 | *seq = read_seqcount_begin(&dentry->d_seq); | ||
1720 | if (dentry->d_parent != parent) | ||
1721 | continue; | ||
1722 | if (d_unhashed(dentry)) | ||
1723 | continue; | ||
1724 | tlen = dentry->d_name.len; | ||
1725 | tname = dentry->d_name.name; | ||
1726 | i = dentry->d_inode; | ||
1727 | /* | ||
1728 | * This seqcount check is required to ensure name and | ||
1729 | * len are loaded atomically, so as not to walk off the | ||
1730 | * edge of memory when walking. If we could load this | ||
1731 | * atomically some other way, we could drop this check. | ||
1732 | */ | ||
1733 | if (read_seqcount_retry(&dentry->d_seq, *seq)) | ||
1734 | goto seqretry; | ||
1735 | if (parent->d_op && parent->d_op->d_compare) { | ||
1736 | if (parent->d_op->d_compare(parent, *inode, | ||
1737 | dentry, i, | ||
1738 | tlen, tname, name)) | ||
1739 | continue; | ||
1740 | } else { | ||
1741 | if (tlen != len) | ||
1742 | continue; | ||
1743 | if (memcmp(tname, str, tlen)) | ||
1744 | continue; | ||
1745 | } | ||
1746 | /* | ||
1747 | * No extra seqcount check is required after the name | ||
1748 | * compare. The caller must perform a seqcount check in | ||
1749 | * order to do anything useful with the returned dentry | ||
1750 | * anyway. | ||
1751 | */ | ||
1752 | *inode = i; | ||
1753 | return dentry; | ||
1754 | } | ||
1755 | return NULL; | ||
1756 | } | ||
1757 | |||
1758 | /** | ||
1614 | * d_lookup - search for a dentry | 1759 | * d_lookup - search for a dentry |
1615 | * @parent: parent dentry | 1760 | * @parent: parent dentry |
1616 | * @name: qstr of name we wish to find | 1761 | * @name: qstr of name we wish to find |
@@ -1621,9 +1766,9 @@ EXPORT_SYMBOL(d_add_ci); | |||
1621 | * dentry is returned. The caller must use dput to free the entry when it has | 1766 | * dentry is returned. The caller must use dput to free the entry when it has |
1622 | * finished using it. %NULL is returned if the dentry does not exist. | 1767 | * finished using it. %NULL is returned if the dentry does not exist. |
1623 | */ | 1768 | */ |
1624 | struct dentry * d_lookup(struct dentry * parent, struct qstr * name) | 1769 | struct dentry *d_lookup(struct dentry *parent, struct qstr *name) |
1625 | { | 1770 | { |
1626 | struct dentry * dentry = NULL; | 1771 | struct dentry *dentry; |
1627 | unsigned seq; | 1772 | unsigned seq; |
1628 | 1773 | ||
1629 | do { | 1774 | do { |
@@ -1636,7 +1781,7 @@ struct dentry * d_lookup(struct dentry * parent, struct qstr * name) | |||
1636 | } | 1781 | } |
1637 | EXPORT_SYMBOL(d_lookup); | 1782 | EXPORT_SYMBOL(d_lookup); |
1638 | 1783 | ||
1639 | /* | 1784 | /** |
1640 | * __d_lookup - search for a dentry (racy) | 1785 | * __d_lookup - search for a dentry (racy) |
1641 | * @parent: parent dentry | 1786 | * @parent: parent dentry |
1642 | * @name: qstr of name we wish to find | 1787 | * @name: qstr of name we wish to find |
@@ -1651,17 +1796,24 @@ EXPORT_SYMBOL(d_lookup); | |||
1651 | * | 1796 | * |
1652 | * __d_lookup callers must be commented. | 1797 | * __d_lookup callers must be commented. |
1653 | */ | 1798 | */ |
1654 | struct dentry * __d_lookup(struct dentry * parent, struct qstr * name) | 1799 | struct dentry *__d_lookup(struct dentry *parent, struct qstr *name) |
1655 | { | 1800 | { |
1656 | unsigned int len = name->len; | 1801 | unsigned int len = name->len; |
1657 | unsigned int hash = name->hash; | 1802 | unsigned int hash = name->hash; |
1658 | const unsigned char *str = name->name; | 1803 | const unsigned char *str = name->name; |
1659 | struct hlist_head *head = d_hash(parent,hash); | 1804 | struct hlist_head *head = d_hash(parent,hash); |
1660 | struct dentry *found = NULL; | ||
1661 | struct hlist_node *node; | 1805 | struct hlist_node *node; |
1806 | struct dentry *found = NULL; | ||
1662 | struct dentry *dentry; | 1807 | struct dentry *dentry; |
1663 | 1808 | ||
1664 | /* | 1809 | /* |
1810 | * Note: There is significant duplication with __d_lookup_rcu which is | ||
1811 | * required to prevent single threaded performance regressions | ||
1812 | * especially on architectures where smp_rmb (in seqcounts) are costly. | ||
1813 | * Keep the two functions in sync. | ||
1814 | */ | ||
1815 | |||
1816 | /* | ||
1665 | * The hash list is protected using RCU. | 1817 | * The hash list is protected using RCU. |
1666 | * | 1818 | * |
1667 | * Take d_lock when comparing a candidate dentry, to avoid races | 1819 | * Take d_lock when comparing a candidate dentry, to avoid races |
@@ -1677,24 +1829,15 @@ struct dentry * __d_lookup(struct dentry * parent, struct qstr * name) | |||
1677 | rcu_read_lock(); | 1829 | rcu_read_lock(); |
1678 | 1830 | ||
1679 | hlist_for_each_entry_rcu(dentry, node, head, d_hash) { | 1831 | hlist_for_each_entry_rcu(dentry, node, head, d_hash) { |
1680 | struct qstr *qstr; | 1832 | const char *tname; |
1833 | int tlen; | ||
1681 | 1834 | ||
1682 | if (dentry->d_name.hash != hash) | 1835 | if (dentry->d_name.hash != hash) |
1683 | continue; | 1836 | continue; |
1684 | if (dentry->d_parent != parent) | ||
1685 | continue; | ||
1686 | 1837 | ||
1687 | spin_lock(&dentry->d_lock); | 1838 | spin_lock(&dentry->d_lock); |
1688 | |||
1689 | /* | ||
1690 | * Recheck the dentry after taking the lock - d_move may have | ||
1691 | * changed things. Don't bother checking the hash because | ||
1692 | * we're about to compare the whole name anyway. | ||
1693 | */ | ||
1694 | if (dentry->d_parent != parent) | 1839 | if (dentry->d_parent != parent) |
1695 | goto next; | 1840 | goto next; |
1696 | |||
1697 | /* non-existing due to RCU? */ | ||
1698 | if (d_unhashed(dentry)) | 1841 | if (d_unhashed(dentry)) |
1699 | goto next; | 1842 | goto next; |
1700 | 1843 | ||
@@ -1702,16 +1845,17 @@ struct dentry * __d_lookup(struct dentry * parent, struct qstr * name) | |||
1702 | * It is safe to compare names since d_move() cannot | 1845 | * It is safe to compare names since d_move() cannot |
1703 | * change the qstr (protected by d_lock). | 1846 | * change the qstr (protected by d_lock). |
1704 | */ | 1847 | */ |
1705 | qstr = &dentry->d_name; | 1848 | tlen = dentry->d_name.len; |
1849 | tname = dentry->d_name.name; | ||
1706 | if (parent->d_op && parent->d_op->d_compare) { | 1850 | if (parent->d_op && parent->d_op->d_compare) { |
1707 | if (parent->d_op->d_compare(parent, parent->d_inode, | 1851 | if (parent->d_op->d_compare(parent, parent->d_inode, |
1708 | dentry, dentry->d_inode, | 1852 | dentry, dentry->d_inode, |
1709 | qstr->len, qstr->name, name)) | 1853 | tlen, tname, name)) |
1710 | goto next; | 1854 | goto next; |
1711 | } else { | 1855 | } else { |
1712 | if (qstr->len != len) | 1856 | if (tlen != len) |
1713 | goto next; | 1857 | goto next; |
1714 | if (memcmp(qstr->name, str, len)) | 1858 | if (memcmp(tname, str, tlen)) |
1715 | goto next; | 1859 | goto next; |
1716 | } | 1860 | } |
1717 | 1861 | ||
@@ -1821,7 +1965,7 @@ again: | |||
1821 | goto again; | 1965 | goto again; |
1822 | } | 1966 | } |
1823 | dentry->d_flags &= ~DCACHE_CANT_MOUNT; | 1967 | dentry->d_flags &= ~DCACHE_CANT_MOUNT; |
1824 | dentry_iput(dentry); | 1968 | dentry_unlink_inode(dentry); |
1825 | fsnotify_nameremove(dentry, isdir); | 1969 | fsnotify_nameremove(dentry, isdir); |
1826 | return; | 1970 | return; |
1827 | } | 1971 | } |
@@ -1884,7 +2028,9 @@ void dentry_update_name_case(struct dentry *dentry, struct qstr *name) | |||
1884 | BUG_ON(dentry->d_name.len != name->len); /* d_lookup gives this */ | 2028 | BUG_ON(dentry->d_name.len != name->len); /* d_lookup gives this */ |
1885 | 2029 | ||
1886 | spin_lock(&dentry->d_lock); | 2030 | spin_lock(&dentry->d_lock); |
2031 | write_seqcount_begin(&dentry->d_seq); | ||
1887 | memcpy((unsigned char *)dentry->d_name.name, name->name, name->len); | 2032 | memcpy((unsigned char *)dentry->d_name.name, name->name, name->len); |
2033 | write_seqcount_end(&dentry->d_seq); | ||
1888 | spin_unlock(&dentry->d_lock); | 2034 | spin_unlock(&dentry->d_lock); |
1889 | } | 2035 | } |
1890 | EXPORT_SYMBOL(dentry_update_name_case); | 2036 | EXPORT_SYMBOL(dentry_update_name_case); |
@@ -1997,6 +2143,9 @@ void d_move(struct dentry * dentry, struct dentry * target) | |||
1997 | 2143 | ||
1998 | dentry_lock_for_move(dentry, target); | 2144 | dentry_lock_for_move(dentry, target); |
1999 | 2145 | ||
2146 | write_seqcount_begin(&dentry->d_seq); | ||
2147 | write_seqcount_begin(&target->d_seq); | ||
2148 | |||
2000 | /* Move the dentry to the target hash queue, if on different bucket */ | 2149 | /* Move the dentry to the target hash queue, if on different bucket */ |
2001 | spin_lock(&dcache_hash_lock); | 2150 | spin_lock(&dcache_hash_lock); |
2002 | if (!d_unhashed(dentry)) | 2151 | if (!d_unhashed(dentry)) |
@@ -2005,6 +2154,7 @@ void d_move(struct dentry * dentry, struct dentry * target) | |||
2005 | spin_unlock(&dcache_hash_lock); | 2154 | spin_unlock(&dcache_hash_lock); |
2006 | 2155 | ||
2007 | /* Unhash the target: dput() will then get rid of it */ | 2156 | /* Unhash the target: dput() will then get rid of it */ |
2157 | /* __d_drop does write_seqcount_barrier, but they're OK to nest. */ | ||
2008 | __d_drop(target); | 2158 | __d_drop(target); |
2009 | 2159 | ||
2010 | list_del(&dentry->d_u.d_child); | 2160 | list_del(&dentry->d_u.d_child); |
@@ -2028,6 +2178,9 @@ void d_move(struct dentry * dentry, struct dentry * target) | |||
2028 | 2178 | ||
2029 | list_add(&dentry->d_u.d_child, &dentry->d_parent->d_subdirs); | 2179 | list_add(&dentry->d_u.d_child, &dentry->d_parent->d_subdirs); |
2030 | 2180 | ||
2181 | write_seqcount_end(&target->d_seq); | ||
2182 | write_seqcount_end(&dentry->d_seq); | ||
2183 | |||
2031 | dentry_unlock_parents_for_move(dentry, target); | 2184 | dentry_unlock_parents_for_move(dentry, target); |
2032 | spin_unlock(&target->d_lock); | 2185 | spin_unlock(&target->d_lock); |
2033 | fsnotify_d_move(dentry); | 2186 | fsnotify_d_move(dentry); |
@@ -2110,6 +2263,9 @@ static void __d_materialise_dentry(struct dentry *dentry, struct dentry *anon) | |||
2110 | 2263 | ||
2111 | dentry_lock_for_move(anon, dentry); | 2264 | dentry_lock_for_move(anon, dentry); |
2112 | 2265 | ||
2266 | write_seqcount_begin(&dentry->d_seq); | ||
2267 | write_seqcount_begin(&anon->d_seq); | ||
2268 | |||
2113 | dparent = dentry->d_parent; | 2269 | dparent = dentry->d_parent; |
2114 | aparent = anon->d_parent; | 2270 | aparent = anon->d_parent; |
2115 | 2271 | ||
@@ -2130,6 +2286,9 @@ static void __d_materialise_dentry(struct dentry *dentry, struct dentry *anon) | |||
2130 | else | 2286 | else |
2131 | INIT_LIST_HEAD(&anon->d_u.d_child); | 2287 | INIT_LIST_HEAD(&anon->d_u.d_child); |
2132 | 2288 | ||
2289 | write_seqcount_end(&dentry->d_seq); | ||
2290 | write_seqcount_end(&anon->d_seq); | ||
2291 | |||
2133 | dentry_unlock_parents_for_move(anon, dentry); | 2292 | dentry_unlock_parents_for_move(anon, dentry); |
2134 | spin_unlock(&dentry->d_lock); | 2293 | spin_unlock(&dentry->d_lock); |
2135 | 2294 | ||
diff --git a/fs/filesystems.c b/fs/filesystems.c index 68ba492d8eef..751d6b255a12 100644 --- a/fs/filesystems.c +++ b/fs/filesystems.c | |||
@@ -115,6 +115,9 @@ int unregister_filesystem(struct file_system_type * fs) | |||
115 | tmp = &(*tmp)->next; | 115 | tmp = &(*tmp)->next; |
116 | } | 116 | } |
117 | write_unlock(&file_systems_lock); | 117 | write_unlock(&file_systems_lock); |
118 | |||
119 | synchronize_rcu(); | ||
120 | |||
118 | return -EINVAL; | 121 | return -EINVAL; |
119 | } | 122 | } |
120 | 123 | ||
diff --git a/fs/namei.c b/fs/namei.c index 5642bc2be418..8d3f15b3a541 100644 --- a/fs/namei.c +++ b/fs/namei.c | |||
@@ -169,8 +169,8 @@ EXPORT_SYMBOL(putname); | |||
169 | /* | 169 | /* |
170 | * This does basic POSIX ACL permission checking | 170 | * This does basic POSIX ACL permission checking |
171 | */ | 171 | */ |
172 | static int acl_permission_check(struct inode *inode, int mask, | 172 | static inline int __acl_permission_check(struct inode *inode, int mask, |
173 | int (*check_acl)(struct inode *inode, int mask)) | 173 | int (*check_acl)(struct inode *inode, int mask), int rcu) |
174 | { | 174 | { |
175 | umode_t mode = inode->i_mode; | 175 | umode_t mode = inode->i_mode; |
176 | 176 | ||
@@ -180,9 +180,13 @@ static int acl_permission_check(struct inode *inode, int mask, | |||
180 | mode >>= 6; | 180 | mode >>= 6; |
181 | else { | 181 | else { |
182 | if (IS_POSIXACL(inode) && (mode & S_IRWXG) && check_acl) { | 182 | if (IS_POSIXACL(inode) && (mode & S_IRWXG) && check_acl) { |
183 | int error = check_acl(inode, mask); | 183 | if (rcu) { |
184 | if (error != -EAGAIN) | 184 | return -ECHILD; |
185 | return error; | 185 | } else { |
186 | int error = check_acl(inode, mask); | ||
187 | if (error != -EAGAIN) | ||
188 | return error; | ||
189 | } | ||
186 | } | 190 | } |
187 | 191 | ||
188 | if (in_group_p(inode->i_gid)) | 192 | if (in_group_p(inode->i_gid)) |
@@ -197,6 +201,12 @@ static int acl_permission_check(struct inode *inode, int mask, | |||
197 | return -EACCES; | 201 | return -EACCES; |
198 | } | 202 | } |
199 | 203 | ||
204 | static inline int acl_permission_check(struct inode *inode, int mask, | ||
205 | int (*check_acl)(struct inode *inode, int mask)) | ||
206 | { | ||
207 | return __acl_permission_check(inode, mask, check_acl, 0); | ||
208 | } | ||
209 | |||
200 | /** | 210 | /** |
201 | * generic_permission - check for access rights on a Posix-like filesystem | 211 | * generic_permission - check for access rights on a Posix-like filesystem |
202 | * @inode: inode to check access rights for | 212 | * @inode: inode to check access rights for |
@@ -375,6 +385,173 @@ void path_put(struct path *path) | |||
375 | EXPORT_SYMBOL(path_put); | 385 | EXPORT_SYMBOL(path_put); |
376 | 386 | ||
377 | /** | 387 | /** |
388 | * nameidata_drop_rcu - drop this nameidata out of rcu-walk | ||
389 | * @nd: nameidata pathwalk data to drop | ||
390 | * @Returns: 0 on success, -ECHLID on failure | ||
391 | * | ||
392 | * Path walking has 2 modes, rcu-walk and ref-walk (see | ||
393 | * Documentation/filesystems/path-lookup.txt). __drop_rcu* functions attempt | ||
394 | * to drop out of rcu-walk mode and take normal reference counts on dentries | ||
395 | * and vfsmounts to transition to rcu-walk mode. __drop_rcu* functions take | ||
396 | * refcounts at the last known good point before rcu-walk got stuck, so | ||
397 | * ref-walk may continue from there. If this is not successful (eg. a seqcount | ||
398 | * has changed), then failure is returned and path walk restarts from the | ||
399 | * beginning in ref-walk mode. | ||
400 | * | ||
401 | * nameidata_drop_rcu attempts to drop the current nd->path and nd->root into | ||
402 | * ref-walk. Must be called from rcu-walk context. | ||
403 | */ | ||
404 | static int nameidata_drop_rcu(struct nameidata *nd) | ||
405 | { | ||
406 | struct fs_struct *fs = current->fs; | ||
407 | struct dentry *dentry = nd->path.dentry; | ||
408 | |||
409 | BUG_ON(!(nd->flags & LOOKUP_RCU)); | ||
410 | if (nd->root.mnt) { | ||
411 | spin_lock(&fs->lock); | ||
412 | if (nd->root.mnt != fs->root.mnt || | ||
413 | nd->root.dentry != fs->root.dentry) | ||
414 | goto err_root; | ||
415 | } | ||
416 | spin_lock(&dentry->d_lock); | ||
417 | if (!__d_rcu_to_refcount(dentry, nd->seq)) | ||
418 | goto err; | ||
419 | BUG_ON(nd->inode != dentry->d_inode); | ||
420 | spin_unlock(&dentry->d_lock); | ||
421 | if (nd->root.mnt) { | ||
422 | path_get(&nd->root); | ||
423 | spin_unlock(&fs->lock); | ||
424 | } | ||
425 | mntget(nd->path.mnt); | ||
426 | |||
427 | rcu_read_unlock(); | ||
428 | br_read_unlock(vfsmount_lock); | ||
429 | nd->flags &= ~LOOKUP_RCU; | ||
430 | return 0; | ||
431 | err: | ||
432 | spin_unlock(&dentry->d_lock); | ||
433 | err_root: | ||
434 | if (nd->root.mnt) | ||
435 | spin_unlock(&fs->lock); | ||
436 | return -ECHILD; | ||
437 | } | ||
438 | |||
439 | /* Try to drop out of rcu-walk mode if we were in it, otherwise do nothing. */ | ||
440 | static inline int nameidata_drop_rcu_maybe(struct nameidata *nd) | ||
441 | { | ||
442 | if (nd->flags & LOOKUP_RCU) | ||
443 | return nameidata_drop_rcu(nd); | ||
444 | return 0; | ||
445 | } | ||
446 | |||
447 | /** | ||
448 | * nameidata_dentry_drop_rcu - drop nameidata and dentry out of rcu-walk | ||
449 | * @nd: nameidata pathwalk data to drop | ||
450 | * @dentry: dentry to drop | ||
451 | * @Returns: 0 on success, -ECHLID on failure | ||
452 | * | ||
453 | * nameidata_dentry_drop_rcu attempts to drop the current nd->path and nd->root, | ||
454 | * and dentry into ref-walk. @dentry must be a path found by a do_lookup call on | ||
455 | * @nd. Must be called from rcu-walk context. | ||
456 | */ | ||
457 | static int nameidata_dentry_drop_rcu(struct nameidata *nd, struct dentry *dentry) | ||
458 | { | ||
459 | struct fs_struct *fs = current->fs; | ||
460 | struct dentry *parent = nd->path.dentry; | ||
461 | |||
462 | BUG_ON(!(nd->flags & LOOKUP_RCU)); | ||
463 | if (nd->root.mnt) { | ||
464 | spin_lock(&fs->lock); | ||
465 | if (nd->root.mnt != fs->root.mnt || | ||
466 | nd->root.dentry != fs->root.dentry) | ||
467 | goto err_root; | ||
468 | } | ||
469 | spin_lock(&parent->d_lock); | ||
470 | spin_lock_nested(&dentry->d_lock, DENTRY_D_LOCK_NESTED); | ||
471 | if (!__d_rcu_to_refcount(dentry, nd->seq)) | ||
472 | goto err; | ||
473 | /* | ||
474 | * If the sequence check on the child dentry passed, then the child has | ||
475 | * not been removed from its parent. This means the parent dentry must | ||
476 | * be valid and able to take a reference at this point. | ||
477 | */ | ||
478 | BUG_ON(!IS_ROOT(dentry) && dentry->d_parent != parent); | ||
479 | BUG_ON(!parent->d_count); | ||
480 | parent->d_count++; | ||
481 | spin_unlock(&dentry->d_lock); | ||
482 | spin_unlock(&parent->d_lock); | ||
483 | if (nd->root.mnt) { | ||
484 | path_get(&nd->root); | ||
485 | spin_unlock(&fs->lock); | ||
486 | } | ||
487 | mntget(nd->path.mnt); | ||
488 | |||
489 | rcu_read_unlock(); | ||
490 | br_read_unlock(vfsmount_lock); | ||
491 | nd->flags &= ~LOOKUP_RCU; | ||
492 | return 0; | ||
493 | err: | ||
494 | spin_unlock(&dentry->d_lock); | ||
495 | spin_unlock(&parent->d_lock); | ||
496 | err_root: | ||
497 | if (nd->root.mnt) | ||
498 | spin_unlock(&fs->lock); | ||
499 | return -ECHILD; | ||
500 | } | ||
501 | |||
502 | /* Try to drop out of rcu-walk mode if we were in it, otherwise do nothing. */ | ||
503 | static inline int nameidata_dentry_drop_rcu_maybe(struct nameidata *nd, struct dentry *dentry) | ||
504 | { | ||
505 | if (nd->flags & LOOKUP_RCU) | ||
506 | return nameidata_dentry_drop_rcu(nd, dentry); | ||
507 | return 0; | ||
508 | } | ||
509 | |||
510 | /** | ||
511 | * nameidata_drop_rcu_last - drop nameidata ending path walk out of rcu-walk | ||
512 | * @nd: nameidata pathwalk data to drop | ||
513 | * @Returns: 0 on success, -ECHLID on failure | ||
514 | * | ||
515 | * nameidata_drop_rcu_last attempts to drop the current nd->path into ref-walk. | ||
516 | * nd->path should be the final element of the lookup, so nd->root is discarded. | ||
517 | * Must be called from rcu-walk context. | ||
518 | */ | ||
519 | static int nameidata_drop_rcu_last(struct nameidata *nd) | ||
520 | { | ||
521 | struct dentry *dentry = nd->path.dentry; | ||
522 | |||
523 | BUG_ON(!(nd->flags & LOOKUP_RCU)); | ||
524 | nd->flags &= ~LOOKUP_RCU; | ||
525 | nd->root.mnt = NULL; | ||
526 | spin_lock(&dentry->d_lock); | ||
527 | if (!__d_rcu_to_refcount(dentry, nd->seq)) | ||
528 | goto err_unlock; | ||
529 | BUG_ON(nd->inode != dentry->d_inode); | ||
530 | spin_unlock(&dentry->d_lock); | ||
531 | |||
532 | mntget(nd->path.mnt); | ||
533 | |||
534 | rcu_read_unlock(); | ||
535 | br_read_unlock(vfsmount_lock); | ||
536 | |||
537 | return 0; | ||
538 | |||
539 | err_unlock: | ||
540 | spin_unlock(&dentry->d_lock); | ||
541 | rcu_read_unlock(); | ||
542 | br_read_unlock(vfsmount_lock); | ||
543 | return -ECHILD; | ||
544 | } | ||
545 | |||
546 | /* Try to drop out of rcu-walk mode if we were in it, otherwise do nothing. */ | ||
547 | static inline int nameidata_drop_rcu_last_maybe(struct nameidata *nd) | ||
548 | { | ||
549 | if (likely(nd->flags & LOOKUP_RCU)) | ||
550 | return nameidata_drop_rcu_last(nd); | ||
551 | return 0; | ||
552 | } | ||
553 | |||
554 | /** | ||
378 | * release_open_intent - free up open intent resources | 555 | * release_open_intent - free up open intent resources |
379 | * @nd: pointer to nameidata | 556 | * @nd: pointer to nameidata |
380 | */ | 557 | */ |
@@ -459,26 +636,40 @@ force_reval_path(struct path *path, struct nameidata *nd) | |||
459 | * short-cut DAC fails, then call ->permission() to do more | 636 | * short-cut DAC fails, then call ->permission() to do more |
460 | * complete permission check. | 637 | * complete permission check. |
461 | */ | 638 | */ |
462 | static int exec_permission(struct inode *inode) | 639 | static inline int __exec_permission(struct inode *inode, int rcu) |
463 | { | 640 | { |
464 | int ret; | 641 | int ret; |
465 | 642 | ||
466 | if (inode->i_op->permission) { | 643 | if (inode->i_op->permission) { |
644 | if (rcu) | ||
645 | return -ECHILD; | ||
467 | ret = inode->i_op->permission(inode, MAY_EXEC); | 646 | ret = inode->i_op->permission(inode, MAY_EXEC); |
468 | if (!ret) | 647 | if (!ret) |
469 | goto ok; | 648 | goto ok; |
470 | return ret; | 649 | return ret; |
471 | } | 650 | } |
472 | ret = acl_permission_check(inode, MAY_EXEC, inode->i_op->check_acl); | 651 | ret = __acl_permission_check(inode, MAY_EXEC, inode->i_op->check_acl, rcu); |
473 | if (!ret) | 652 | if (!ret) |
474 | goto ok; | 653 | goto ok; |
654 | if (rcu && ret == -ECHILD) | ||
655 | return ret; | ||
475 | 656 | ||
476 | if (capable(CAP_DAC_OVERRIDE) || capable(CAP_DAC_READ_SEARCH)) | 657 | if (capable(CAP_DAC_OVERRIDE) || capable(CAP_DAC_READ_SEARCH)) |
477 | goto ok; | 658 | goto ok; |
478 | 659 | ||
479 | return ret; | 660 | return ret; |
480 | ok: | 661 | ok: |
481 | return security_inode_permission(inode, MAY_EXEC); | 662 | return security_inode_exec_permission(inode, rcu); |
663 | } | ||
664 | |||
665 | static int exec_permission(struct inode *inode) | ||
666 | { | ||
667 | return __exec_permission(inode, 0); | ||
668 | } | ||
669 | |||
670 | static int exec_permission_rcu(struct inode *inode) | ||
671 | { | ||
672 | return __exec_permission(inode, 1); | ||
482 | } | 673 | } |
483 | 674 | ||
484 | static __always_inline void set_root(struct nameidata *nd) | 675 | static __always_inline void set_root(struct nameidata *nd) |
@@ -489,8 +680,20 @@ static __always_inline void set_root(struct nameidata *nd) | |||
489 | 680 | ||
490 | static int link_path_walk(const char *, struct nameidata *); | 681 | static int link_path_walk(const char *, struct nameidata *); |
491 | 682 | ||
683 | static __always_inline void set_root_rcu(struct nameidata *nd) | ||
684 | { | ||
685 | if (!nd->root.mnt) { | ||
686 | struct fs_struct *fs = current->fs; | ||
687 | spin_lock(&fs->lock); | ||
688 | nd->root = fs->root; | ||
689 | spin_unlock(&fs->lock); | ||
690 | } | ||
691 | } | ||
692 | |||
492 | static __always_inline int __vfs_follow_link(struct nameidata *nd, const char *link) | 693 | static __always_inline int __vfs_follow_link(struct nameidata *nd, const char *link) |
493 | { | 694 | { |
695 | int ret; | ||
696 | |||
494 | if (IS_ERR(link)) | 697 | if (IS_ERR(link)) |
495 | goto fail; | 698 | goto fail; |
496 | 699 | ||
@@ -500,8 +703,10 @@ static __always_inline int __vfs_follow_link(struct nameidata *nd, const char *l | |||
500 | nd->path = nd->root; | 703 | nd->path = nd->root; |
501 | path_get(&nd->root); | 704 | path_get(&nd->root); |
502 | } | 705 | } |
706 | nd->inode = nd->path.dentry->d_inode; | ||
503 | 707 | ||
504 | return link_path_walk(link, nd); | 708 | ret = link_path_walk(link, nd); |
709 | return ret; | ||
505 | fail: | 710 | fail: |
506 | path_put(&nd->path); | 711 | path_put(&nd->path); |
507 | return PTR_ERR(link); | 712 | return PTR_ERR(link); |
@@ -516,11 +721,12 @@ static void path_put_conditional(struct path *path, struct nameidata *nd) | |||
516 | 721 | ||
517 | static inline void path_to_nameidata(struct path *path, struct nameidata *nd) | 722 | static inline void path_to_nameidata(struct path *path, struct nameidata *nd) |
518 | { | 723 | { |
519 | dput(nd->path.dentry); | 724 | if (!(nd->flags & LOOKUP_RCU)) { |
520 | if (nd->path.mnt != path->mnt) { | 725 | dput(nd->path.dentry); |
521 | mntput(nd->path.mnt); | 726 | if (nd->path.mnt != path->mnt) |
522 | nd->path.mnt = path->mnt; | 727 | mntput(nd->path.mnt); |
523 | } | 728 | } |
729 | nd->path.mnt = path->mnt; | ||
524 | nd->path.dentry = path->dentry; | 730 | nd->path.dentry = path->dentry; |
525 | } | 731 | } |
526 | 732 | ||
@@ -535,9 +741,11 @@ __do_follow_link(struct path *path, struct nameidata *nd, void **p) | |||
535 | 741 | ||
536 | if (path->mnt != nd->path.mnt) { | 742 | if (path->mnt != nd->path.mnt) { |
537 | path_to_nameidata(path, nd); | 743 | path_to_nameidata(path, nd); |
744 | nd->inode = nd->path.dentry->d_inode; | ||
538 | dget(dentry); | 745 | dget(dentry); |
539 | } | 746 | } |
540 | mntget(path->mnt); | 747 | mntget(path->mnt); |
748 | |||
541 | nd->last_type = LAST_BIND; | 749 | nd->last_type = LAST_BIND; |
542 | *p = dentry->d_inode->i_op->follow_link(dentry, nd); | 750 | *p = dentry->d_inode->i_op->follow_link(dentry, nd); |
543 | error = PTR_ERR(*p); | 751 | error = PTR_ERR(*p); |
@@ -591,6 +799,20 @@ loop: | |||
591 | return err; | 799 | return err; |
592 | } | 800 | } |
593 | 801 | ||
802 | static int follow_up_rcu(struct path *path) | ||
803 | { | ||
804 | struct vfsmount *parent; | ||
805 | struct dentry *mountpoint; | ||
806 | |||
807 | parent = path->mnt->mnt_parent; | ||
808 | if (parent == path->mnt) | ||
809 | return 0; | ||
810 | mountpoint = path->mnt->mnt_mountpoint; | ||
811 | path->dentry = mountpoint; | ||
812 | path->mnt = parent; | ||
813 | return 1; | ||
814 | } | ||
815 | |||
594 | int follow_up(struct path *path) | 816 | int follow_up(struct path *path) |
595 | { | 817 | { |
596 | struct vfsmount *parent; | 818 | struct vfsmount *parent; |
@@ -615,6 +837,21 @@ int follow_up(struct path *path) | |||
615 | /* | 837 | /* |
616 | * serialization is taken care of in namespace.c | 838 | * serialization is taken care of in namespace.c |
617 | */ | 839 | */ |
840 | static void __follow_mount_rcu(struct nameidata *nd, struct path *path, | ||
841 | struct inode **inode) | ||
842 | { | ||
843 | while (d_mountpoint(path->dentry)) { | ||
844 | struct vfsmount *mounted; | ||
845 | mounted = __lookup_mnt(path->mnt, path->dentry, 1); | ||
846 | if (!mounted) | ||
847 | return; | ||
848 | path->mnt = mounted; | ||
849 | path->dentry = mounted->mnt_root; | ||
850 | nd->seq = read_seqcount_begin(&path->dentry->d_seq); | ||
851 | *inode = path->dentry->d_inode; | ||
852 | } | ||
853 | } | ||
854 | |||
618 | static int __follow_mount(struct path *path) | 855 | static int __follow_mount(struct path *path) |
619 | { | 856 | { |
620 | int res = 0; | 857 | int res = 0; |
@@ -660,7 +897,42 @@ int follow_down(struct path *path) | |||
660 | return 0; | 897 | return 0; |
661 | } | 898 | } |
662 | 899 | ||
663 | static __always_inline void follow_dotdot(struct nameidata *nd) | 900 | static int follow_dotdot_rcu(struct nameidata *nd) |
901 | { | ||
902 | struct inode *inode = nd->inode; | ||
903 | |||
904 | set_root_rcu(nd); | ||
905 | |||
906 | while(1) { | ||
907 | if (nd->path.dentry == nd->root.dentry && | ||
908 | nd->path.mnt == nd->root.mnt) { | ||
909 | break; | ||
910 | } | ||
911 | if (nd->path.dentry != nd->path.mnt->mnt_root) { | ||
912 | struct dentry *old = nd->path.dentry; | ||
913 | struct dentry *parent = old->d_parent; | ||
914 | unsigned seq; | ||
915 | |||
916 | seq = read_seqcount_begin(&parent->d_seq); | ||
917 | if (read_seqcount_retry(&old->d_seq, nd->seq)) | ||
918 | return -ECHILD; | ||
919 | inode = parent->d_inode; | ||
920 | nd->path.dentry = parent; | ||
921 | nd->seq = seq; | ||
922 | break; | ||
923 | } | ||
924 | if (!follow_up_rcu(&nd->path)) | ||
925 | break; | ||
926 | nd->seq = read_seqcount_begin(&nd->path.dentry->d_seq); | ||
927 | inode = nd->path.dentry->d_inode; | ||
928 | } | ||
929 | __follow_mount_rcu(nd, &nd->path, &inode); | ||
930 | nd->inode = inode; | ||
931 | |||
932 | return 0; | ||
933 | } | ||
934 | |||
935 | static void follow_dotdot(struct nameidata *nd) | ||
664 | { | 936 | { |
665 | set_root(nd); | 937 | set_root(nd); |
666 | 938 | ||
@@ -681,6 +953,7 @@ static __always_inline void follow_dotdot(struct nameidata *nd) | |||
681 | break; | 953 | break; |
682 | } | 954 | } |
683 | follow_mount(&nd->path); | 955 | follow_mount(&nd->path); |
956 | nd->inode = nd->path.dentry->d_inode; | ||
684 | } | 957 | } |
685 | 958 | ||
686 | /* | 959 | /* |
@@ -718,18 +991,17 @@ static struct dentry *d_alloc_and_lookup(struct dentry *parent, | |||
718 | * It _is_ time-critical. | 991 | * It _is_ time-critical. |
719 | */ | 992 | */ |
720 | static int do_lookup(struct nameidata *nd, struct qstr *name, | 993 | static int do_lookup(struct nameidata *nd, struct qstr *name, |
721 | struct path *path) | 994 | struct path *path, struct inode **inode) |
722 | { | 995 | { |
723 | struct vfsmount *mnt = nd->path.mnt; | 996 | struct vfsmount *mnt = nd->path.mnt; |
724 | struct dentry *dentry, *parent; | 997 | struct dentry *dentry, *parent = nd->path.dentry; |
725 | struct inode *dir; | 998 | struct inode *dir; |
726 | /* | 999 | /* |
727 | * See if the low-level filesystem might want | 1000 | * See if the low-level filesystem might want |
728 | * to use its own hash.. | 1001 | * to use its own hash.. |
729 | */ | 1002 | */ |
730 | if (nd->path.dentry->d_op && nd->path.dentry->d_op->d_hash) { | 1003 | if (parent->d_op && parent->d_op->d_hash) { |
731 | int err = nd->path.dentry->d_op->d_hash(nd->path.dentry, | 1004 | int err = parent->d_op->d_hash(parent, nd->inode, name); |
732 | nd->path.dentry->d_inode, name); | ||
733 | if (err < 0) | 1005 | if (err < 0) |
734 | return err; | 1006 | return err; |
735 | } | 1007 | } |
@@ -739,21 +1011,48 @@ static int do_lookup(struct nameidata *nd, struct qstr *name, | |||
739 | * of a false negative due to a concurrent rename, we're going to | 1011 | * of a false negative due to a concurrent rename, we're going to |
740 | * do the non-racy lookup, below. | 1012 | * do the non-racy lookup, below. |
741 | */ | 1013 | */ |
742 | dentry = __d_lookup(nd->path.dentry, name); | 1014 | if (nd->flags & LOOKUP_RCU) { |
743 | if (!dentry) | 1015 | unsigned seq; |
744 | goto need_lookup; | 1016 | |
1017 | *inode = nd->inode; | ||
1018 | dentry = __d_lookup_rcu(parent, name, &seq, inode); | ||
1019 | if (!dentry) { | ||
1020 | if (nameidata_drop_rcu(nd)) | ||
1021 | return -ECHILD; | ||
1022 | goto need_lookup; | ||
1023 | } | ||
1024 | /* Memory barrier in read_seqcount_begin of child is enough */ | ||
1025 | if (__read_seqcount_retry(&parent->d_seq, nd->seq)) | ||
1026 | return -ECHILD; | ||
1027 | |||
1028 | nd->seq = seq; | ||
1029 | if (dentry->d_op && dentry->d_op->d_revalidate) { | ||
1030 | /* We commonly drop rcu-walk here */ | ||
1031 | if (nameidata_dentry_drop_rcu(nd, dentry)) | ||
1032 | return -ECHILD; | ||
1033 | goto need_revalidate; | ||
1034 | } | ||
1035 | path->mnt = mnt; | ||
1036 | path->dentry = dentry; | ||
1037 | __follow_mount_rcu(nd, path, inode); | ||
1038 | } else { | ||
1039 | dentry = __d_lookup(parent, name); | ||
1040 | if (!dentry) | ||
1041 | goto need_lookup; | ||
745 | found: | 1042 | found: |
746 | if (dentry->d_op && dentry->d_op->d_revalidate) | 1043 | if (dentry->d_op && dentry->d_op->d_revalidate) |
747 | goto need_revalidate; | 1044 | goto need_revalidate; |
748 | done: | 1045 | done: |
749 | path->mnt = mnt; | 1046 | path->mnt = mnt; |
750 | path->dentry = dentry; | 1047 | path->dentry = dentry; |
751 | __follow_mount(path); | 1048 | __follow_mount(path); |
1049 | *inode = path->dentry->d_inode; | ||
1050 | } | ||
752 | return 0; | 1051 | return 0; |
753 | 1052 | ||
754 | need_lookup: | 1053 | need_lookup: |
755 | parent = nd->path.dentry; | ||
756 | dir = parent->d_inode; | 1054 | dir = parent->d_inode; |
1055 | BUG_ON(nd->inode != dir); | ||
757 | 1056 | ||
758 | mutex_lock(&dir->i_mutex); | 1057 | mutex_lock(&dir->i_mutex); |
759 | /* | 1058 | /* |
@@ -815,7 +1114,6 @@ static inline int follow_on_final(struct inode *inode, unsigned lookup_flags) | |||
815 | static int link_path_walk(const char *name, struct nameidata *nd) | 1114 | static int link_path_walk(const char *name, struct nameidata *nd) |
816 | { | 1115 | { |
817 | struct path next; | 1116 | struct path next; |
818 | struct inode *inode; | ||
819 | int err; | 1117 | int err; |
820 | unsigned int lookup_flags = nd->flags; | 1118 | unsigned int lookup_flags = nd->flags; |
821 | 1119 | ||
@@ -824,18 +1122,28 @@ static int link_path_walk(const char *name, struct nameidata *nd) | |||
824 | if (!*name) | 1122 | if (!*name) |
825 | goto return_reval; | 1123 | goto return_reval; |
826 | 1124 | ||
827 | inode = nd->path.dentry->d_inode; | ||
828 | if (nd->depth) | 1125 | if (nd->depth) |
829 | lookup_flags = LOOKUP_FOLLOW | (nd->flags & LOOKUP_CONTINUE); | 1126 | lookup_flags = LOOKUP_FOLLOW | (nd->flags & LOOKUP_CONTINUE); |
830 | 1127 | ||
831 | /* At this point we know we have a real path component. */ | 1128 | /* At this point we know we have a real path component. */ |
832 | for(;;) { | 1129 | for(;;) { |
1130 | struct inode *inode; | ||
833 | unsigned long hash; | 1131 | unsigned long hash; |
834 | struct qstr this; | 1132 | struct qstr this; |
835 | unsigned int c; | 1133 | unsigned int c; |
836 | 1134 | ||
837 | nd->flags |= LOOKUP_CONTINUE; | 1135 | nd->flags |= LOOKUP_CONTINUE; |
838 | err = exec_permission(inode); | 1136 | if (nd->flags & LOOKUP_RCU) { |
1137 | err = exec_permission_rcu(nd->inode); | ||
1138 | if (err == -ECHILD) { | ||
1139 | if (nameidata_drop_rcu(nd)) | ||
1140 | return -ECHILD; | ||
1141 | goto exec_again; | ||
1142 | } | ||
1143 | } else { | ||
1144 | exec_again: | ||
1145 | err = exec_permission(nd->inode); | ||
1146 | } | ||
839 | if (err) | 1147 | if (err) |
840 | break; | 1148 | break; |
841 | 1149 | ||
@@ -866,37 +1174,44 @@ static int link_path_walk(const char *name, struct nameidata *nd) | |||
866 | if (this.name[0] == '.') switch (this.len) { | 1174 | if (this.name[0] == '.') switch (this.len) { |
867 | default: | 1175 | default: |
868 | break; | 1176 | break; |
869 | case 2: | 1177 | case 2: |
870 | if (this.name[1] != '.') | 1178 | if (this.name[1] != '.') |
871 | break; | 1179 | break; |
872 | follow_dotdot(nd); | 1180 | if (nd->flags & LOOKUP_RCU) { |
873 | inode = nd->path.dentry->d_inode; | 1181 | if (follow_dotdot_rcu(nd)) |
1182 | return -ECHILD; | ||
1183 | } else | ||
1184 | follow_dotdot(nd); | ||
874 | /* fallthrough */ | 1185 | /* fallthrough */ |
875 | case 1: | 1186 | case 1: |
876 | continue; | 1187 | continue; |
877 | } | 1188 | } |
878 | /* This does the actual lookups.. */ | 1189 | /* This does the actual lookups.. */ |
879 | err = do_lookup(nd, &this, &next); | 1190 | err = do_lookup(nd, &this, &next, &inode); |
880 | if (err) | 1191 | if (err) |
881 | break; | 1192 | break; |
882 | |||
883 | err = -ENOENT; | 1193 | err = -ENOENT; |
884 | inode = next.dentry->d_inode; | ||
885 | if (!inode) | 1194 | if (!inode) |
886 | goto out_dput; | 1195 | goto out_dput; |
887 | 1196 | ||
888 | if (inode->i_op->follow_link) { | 1197 | if (inode->i_op->follow_link) { |
1198 | /* We commonly drop rcu-walk here */ | ||
1199 | if (nameidata_dentry_drop_rcu_maybe(nd, next.dentry)) | ||
1200 | return -ECHILD; | ||
1201 | BUG_ON(inode != next.dentry->d_inode); | ||
889 | err = do_follow_link(&next, nd); | 1202 | err = do_follow_link(&next, nd); |
890 | if (err) | 1203 | if (err) |
891 | goto return_err; | 1204 | goto return_err; |
1205 | nd->inode = nd->path.dentry->d_inode; | ||
892 | err = -ENOENT; | 1206 | err = -ENOENT; |
893 | inode = nd->path.dentry->d_inode; | 1207 | if (!nd->inode) |
894 | if (!inode) | ||
895 | break; | 1208 | break; |
896 | } else | 1209 | } else { |
897 | path_to_nameidata(&next, nd); | 1210 | path_to_nameidata(&next, nd); |
1211 | nd->inode = inode; | ||
1212 | } | ||
898 | err = -ENOTDIR; | 1213 | err = -ENOTDIR; |
899 | if (!inode->i_op->lookup) | 1214 | if (!nd->inode->i_op->lookup) |
900 | break; | 1215 | break; |
901 | continue; | 1216 | continue; |
902 | /* here ends the main loop */ | 1217 | /* here ends the main loop */ |
@@ -911,32 +1226,39 @@ last_component: | |||
911 | if (this.name[0] == '.') switch (this.len) { | 1226 | if (this.name[0] == '.') switch (this.len) { |
912 | default: | 1227 | default: |
913 | break; | 1228 | break; |
914 | case 2: | 1229 | case 2: |
915 | if (this.name[1] != '.') | 1230 | if (this.name[1] != '.') |
916 | break; | 1231 | break; |
917 | follow_dotdot(nd); | 1232 | if (nd->flags & LOOKUP_RCU) { |
918 | inode = nd->path.dentry->d_inode; | 1233 | if (follow_dotdot_rcu(nd)) |
1234 | return -ECHILD; | ||
1235 | } else | ||
1236 | follow_dotdot(nd); | ||
919 | /* fallthrough */ | 1237 | /* fallthrough */ |
920 | case 1: | 1238 | case 1: |
921 | goto return_reval; | 1239 | goto return_reval; |
922 | } | 1240 | } |
923 | err = do_lookup(nd, &this, &next); | 1241 | err = do_lookup(nd, &this, &next, &inode); |
924 | if (err) | 1242 | if (err) |
925 | break; | 1243 | break; |
926 | inode = next.dentry->d_inode; | ||
927 | if (follow_on_final(inode, lookup_flags)) { | 1244 | if (follow_on_final(inode, lookup_flags)) { |
1245 | if (nameidata_dentry_drop_rcu_maybe(nd, next.dentry)) | ||
1246 | return -ECHILD; | ||
1247 | BUG_ON(inode != next.dentry->d_inode); | ||
928 | err = do_follow_link(&next, nd); | 1248 | err = do_follow_link(&next, nd); |
929 | if (err) | 1249 | if (err) |
930 | goto return_err; | 1250 | goto return_err; |
931 | inode = nd->path.dentry->d_inode; | 1251 | nd->inode = nd->path.dentry->d_inode; |
932 | } else | 1252 | } else { |
933 | path_to_nameidata(&next, nd); | 1253 | path_to_nameidata(&next, nd); |
1254 | nd->inode = inode; | ||
1255 | } | ||
934 | err = -ENOENT; | 1256 | err = -ENOENT; |
935 | if (!inode) | 1257 | if (!nd->inode) |
936 | break; | 1258 | break; |
937 | if (lookup_flags & LOOKUP_DIRECTORY) { | 1259 | if (lookup_flags & LOOKUP_DIRECTORY) { |
938 | err = -ENOTDIR; | 1260 | err = -ENOTDIR; |
939 | if (!inode->i_op->lookup) | 1261 | if (!nd->inode->i_op->lookup) |
940 | break; | 1262 | break; |
941 | } | 1263 | } |
942 | goto return_base; | 1264 | goto return_base; |
@@ -958,6 +1280,8 @@ return_reval: | |||
958 | */ | 1280 | */ |
959 | if (nd->path.dentry && nd->path.dentry->d_sb && | 1281 | if (nd->path.dentry && nd->path.dentry->d_sb && |
960 | (nd->path.dentry->d_sb->s_type->fs_flags & FS_REVAL_DOT)) { | 1282 | (nd->path.dentry->d_sb->s_type->fs_flags & FS_REVAL_DOT)) { |
1283 | if (nameidata_drop_rcu_maybe(nd)) | ||
1284 | return -ECHILD; | ||
961 | err = -ESTALE; | 1285 | err = -ESTALE; |
962 | /* Note: we do not d_invalidate() */ | 1286 | /* Note: we do not d_invalidate() */ |
963 | if (!nd->path.dentry->d_op->d_revalidate( | 1287 | if (!nd->path.dentry->d_op->d_revalidate( |
@@ -965,16 +1289,34 @@ return_reval: | |||
965 | break; | 1289 | break; |
966 | } | 1290 | } |
967 | return_base: | 1291 | return_base: |
1292 | if (nameidata_drop_rcu_last_maybe(nd)) | ||
1293 | return -ECHILD; | ||
968 | return 0; | 1294 | return 0; |
969 | out_dput: | 1295 | out_dput: |
970 | path_put_conditional(&next, nd); | 1296 | if (!(nd->flags & LOOKUP_RCU)) |
1297 | path_put_conditional(&next, nd); | ||
971 | break; | 1298 | break; |
972 | } | 1299 | } |
973 | path_put(&nd->path); | 1300 | if (!(nd->flags & LOOKUP_RCU)) |
1301 | path_put(&nd->path); | ||
974 | return_err: | 1302 | return_err: |
975 | return err; | 1303 | return err; |
976 | } | 1304 | } |
977 | 1305 | ||
1306 | static inline int path_walk_rcu(const char *name, struct nameidata *nd) | ||
1307 | { | ||
1308 | current->total_link_count = 0; | ||
1309 | |||
1310 | return link_path_walk(name, nd); | ||
1311 | } | ||
1312 | |||
1313 | static inline int path_walk_simple(const char *name, struct nameidata *nd) | ||
1314 | { | ||
1315 | current->total_link_count = 0; | ||
1316 | |||
1317 | return link_path_walk(name, nd); | ||
1318 | } | ||
1319 | |||
978 | static int path_walk(const char *name, struct nameidata *nd) | 1320 | static int path_walk(const char *name, struct nameidata *nd) |
979 | { | 1321 | { |
980 | struct path save = nd->path; | 1322 | struct path save = nd->path; |
@@ -1000,6 +1342,88 @@ static int path_walk(const char *name, struct nameidata *nd) | |||
1000 | return result; | 1342 | return result; |
1001 | } | 1343 | } |
1002 | 1344 | ||
1345 | static void path_finish_rcu(struct nameidata *nd) | ||
1346 | { | ||
1347 | if (nd->flags & LOOKUP_RCU) { | ||
1348 | /* RCU dangling. Cancel it. */ | ||
1349 | nd->flags &= ~LOOKUP_RCU; | ||
1350 | nd->root.mnt = NULL; | ||
1351 | rcu_read_unlock(); | ||
1352 | br_read_unlock(vfsmount_lock); | ||
1353 | } | ||
1354 | if (nd->file) | ||
1355 | fput(nd->file); | ||
1356 | } | ||
1357 | |||
1358 | static int path_init_rcu(int dfd, const char *name, unsigned int flags, struct nameidata *nd) | ||
1359 | { | ||
1360 | int retval = 0; | ||
1361 | int fput_needed; | ||
1362 | struct file *file; | ||
1363 | |||
1364 | nd->last_type = LAST_ROOT; /* if there are only slashes... */ | ||
1365 | nd->flags = flags | LOOKUP_RCU; | ||
1366 | nd->depth = 0; | ||
1367 | nd->root.mnt = NULL; | ||
1368 | nd->file = NULL; | ||
1369 | |||
1370 | if (*name=='/') { | ||
1371 | struct fs_struct *fs = current->fs; | ||
1372 | |||
1373 | br_read_lock(vfsmount_lock); | ||
1374 | rcu_read_lock(); | ||
1375 | |||
1376 | spin_lock(&fs->lock); | ||
1377 | nd->root = fs->root; | ||
1378 | nd->path = nd->root; | ||
1379 | nd->seq = read_seqcount_begin(&nd->path.dentry->d_seq); | ||
1380 | spin_unlock(&fs->lock); | ||
1381 | |||
1382 | } else if (dfd == AT_FDCWD) { | ||
1383 | struct fs_struct *fs = current->fs; | ||
1384 | |||
1385 | br_read_lock(vfsmount_lock); | ||
1386 | rcu_read_lock(); | ||
1387 | |||
1388 | spin_lock(&fs->lock); | ||
1389 | nd->path = fs->pwd; | ||
1390 | nd->seq = read_seqcount_begin(&nd->path.dentry->d_seq); | ||
1391 | spin_unlock(&fs->lock); | ||
1392 | } else { | ||
1393 | struct dentry *dentry; | ||
1394 | |||
1395 | file = fget_light(dfd, &fput_needed); | ||
1396 | retval = -EBADF; | ||
1397 | if (!file) | ||
1398 | goto out_fail; | ||
1399 | |||
1400 | dentry = file->f_path.dentry; | ||
1401 | |||
1402 | retval = -ENOTDIR; | ||
1403 | if (!S_ISDIR(dentry->d_inode->i_mode)) | ||
1404 | goto fput_fail; | ||
1405 | |||
1406 | retval = file_permission(file, MAY_EXEC); | ||
1407 | if (retval) | ||
1408 | goto fput_fail; | ||
1409 | |||
1410 | nd->path = file->f_path; | ||
1411 | if (fput_needed) | ||
1412 | nd->file = file; | ||
1413 | |||
1414 | nd->seq = read_seqcount_begin(&nd->path.dentry->d_seq); | ||
1415 | br_read_lock(vfsmount_lock); | ||
1416 | rcu_read_lock(); | ||
1417 | } | ||
1418 | nd->inode = nd->path.dentry->d_inode; | ||
1419 | return 0; | ||
1420 | |||
1421 | fput_fail: | ||
1422 | fput_light(file, fput_needed); | ||
1423 | out_fail: | ||
1424 | return retval; | ||
1425 | } | ||
1426 | |||
1003 | static int path_init(int dfd, const char *name, unsigned int flags, struct nameidata *nd) | 1427 | static int path_init(int dfd, const char *name, unsigned int flags, struct nameidata *nd) |
1004 | { | 1428 | { |
1005 | int retval = 0; | 1429 | int retval = 0; |
@@ -1040,6 +1464,7 @@ static int path_init(int dfd, const char *name, unsigned int flags, struct namei | |||
1040 | 1464 | ||
1041 | fput_light(file, fput_needed); | 1465 | fput_light(file, fput_needed); |
1042 | } | 1466 | } |
1467 | nd->inode = nd->path.dentry->d_inode; | ||
1043 | return 0; | 1468 | return 0; |
1044 | 1469 | ||
1045 | fput_fail: | 1470 | fput_fail: |
@@ -1052,16 +1477,53 @@ out_fail: | |||
1052 | static int do_path_lookup(int dfd, const char *name, | 1477 | static int do_path_lookup(int dfd, const char *name, |
1053 | unsigned int flags, struct nameidata *nd) | 1478 | unsigned int flags, struct nameidata *nd) |
1054 | { | 1479 | { |
1055 | int retval = path_init(dfd, name, flags, nd); | 1480 | int retval; |
1056 | if (!retval) | 1481 | |
1057 | retval = path_walk(name, nd); | 1482 | /* |
1058 | if (unlikely(!retval && !audit_dummy_context() && nd->path.dentry && | 1483 | * Path walking is largely split up into 2 different synchronisation |
1059 | nd->path.dentry->d_inode)) | 1484 | * schemes, rcu-walk and ref-walk (explained in |
1060 | audit_inode(name, nd->path.dentry); | 1485 | * Documentation/filesystems/path-lookup.txt). These share much of the |
1486 | * path walk code, but some things particularly setup, cleanup, and | ||
1487 | * following mounts are sufficiently divergent that functions are | ||
1488 | * duplicated. Typically there is a function foo(), and its RCU | ||
1489 | * analogue, foo_rcu(). | ||
1490 | * | ||
1491 | * -ECHILD is the error number of choice (just to avoid clashes) that | ||
1492 | * is returned if some aspect of an rcu-walk fails. Such an error must | ||
1493 | * be handled by restarting a traditional ref-walk (which will always | ||
1494 | * be able to complete). | ||
1495 | */ | ||
1496 | retval = path_init_rcu(dfd, name, flags, nd); | ||
1497 | if (unlikely(retval)) | ||
1498 | return retval; | ||
1499 | retval = path_walk_rcu(name, nd); | ||
1500 | path_finish_rcu(nd); | ||
1061 | if (nd->root.mnt) { | 1501 | if (nd->root.mnt) { |
1062 | path_put(&nd->root); | 1502 | path_put(&nd->root); |
1063 | nd->root.mnt = NULL; | 1503 | nd->root.mnt = NULL; |
1064 | } | 1504 | } |
1505 | |||
1506 | if (unlikely(retval == -ECHILD || retval == -ESTALE)) { | ||
1507 | /* slower, locked walk */ | ||
1508 | if (retval == -ESTALE) | ||
1509 | flags |= LOOKUP_REVAL; | ||
1510 | retval = path_init(dfd, name, flags, nd); | ||
1511 | if (unlikely(retval)) | ||
1512 | return retval; | ||
1513 | retval = path_walk(name, nd); | ||
1514 | if (nd->root.mnt) { | ||
1515 | path_put(&nd->root); | ||
1516 | nd->root.mnt = NULL; | ||
1517 | } | ||
1518 | } | ||
1519 | |||
1520 | if (likely(!retval)) { | ||
1521 | if (unlikely(!audit_dummy_context())) { | ||
1522 | if (nd->path.dentry && nd->inode) | ||
1523 | audit_inode(name, nd->path.dentry); | ||
1524 | } | ||
1525 | } | ||
1526 | |||
1065 | return retval; | 1527 | return retval; |
1066 | } | 1528 | } |
1067 | 1529 | ||
@@ -1104,10 +1566,11 @@ int vfs_path_lookup(struct dentry *dentry, struct vfsmount *mnt, | |||
1104 | path_get(&nd->path); | 1566 | path_get(&nd->path); |
1105 | nd->root = nd->path; | 1567 | nd->root = nd->path; |
1106 | path_get(&nd->root); | 1568 | path_get(&nd->root); |
1569 | nd->inode = nd->path.dentry->d_inode; | ||
1107 | 1570 | ||
1108 | retval = path_walk(name, nd); | 1571 | retval = path_walk(name, nd); |
1109 | if (unlikely(!retval && !audit_dummy_context() && nd->path.dentry && | 1572 | if (unlikely(!retval && !audit_dummy_context() && nd->path.dentry && |
1110 | nd->path.dentry->d_inode)) | 1573 | nd->inode)) |
1111 | audit_inode(name, nd->path.dentry); | 1574 | audit_inode(name, nd->path.dentry); |
1112 | 1575 | ||
1113 | path_put(&nd->root); | 1576 | path_put(&nd->root); |
@@ -1488,6 +1951,7 @@ out_unlock: | |||
1488 | mutex_unlock(&dir->d_inode->i_mutex); | 1951 | mutex_unlock(&dir->d_inode->i_mutex); |
1489 | dput(nd->path.dentry); | 1952 | dput(nd->path.dentry); |
1490 | nd->path.dentry = path->dentry; | 1953 | nd->path.dentry = path->dentry; |
1954 | |||
1491 | if (error) | 1955 | if (error) |
1492 | return error; | 1956 | return error; |
1493 | /* Don't check for write permission, don't truncate */ | 1957 | /* Don't check for write permission, don't truncate */ |
@@ -1582,6 +2046,9 @@ exit: | |||
1582 | return ERR_PTR(error); | 2046 | return ERR_PTR(error); |
1583 | } | 2047 | } |
1584 | 2048 | ||
2049 | /* | ||
2050 | * Handle O_CREAT case for do_filp_open | ||
2051 | */ | ||
1585 | static struct file *do_last(struct nameidata *nd, struct path *path, | 2052 | static struct file *do_last(struct nameidata *nd, struct path *path, |
1586 | int open_flag, int acc_mode, | 2053 | int open_flag, int acc_mode, |
1587 | int mode, const char *pathname) | 2054 | int mode, const char *pathname) |
@@ -1603,42 +2070,16 @@ static struct file *do_last(struct nameidata *nd, struct path *path, | |||
1603 | } | 2070 | } |
1604 | /* fallthrough */ | 2071 | /* fallthrough */ |
1605 | case LAST_ROOT: | 2072 | case LAST_ROOT: |
1606 | if (open_flag & O_CREAT) | 2073 | goto exit; |
1607 | goto exit; | ||
1608 | /* fallthrough */ | ||
1609 | case LAST_BIND: | 2074 | case LAST_BIND: |
1610 | audit_inode(pathname, dir); | 2075 | audit_inode(pathname, dir); |
1611 | goto ok; | 2076 | goto ok; |
1612 | } | 2077 | } |
1613 | 2078 | ||
1614 | /* trailing slashes? */ | 2079 | /* trailing slashes? */ |
1615 | if (nd->last.name[nd->last.len]) { | 2080 | if (nd->last.name[nd->last.len]) |
1616 | if (open_flag & O_CREAT) | 2081 | goto exit; |
1617 | goto exit; | ||
1618 | nd->flags |= LOOKUP_DIRECTORY | LOOKUP_FOLLOW; | ||
1619 | } | ||
1620 | |||
1621 | /* just plain open? */ | ||
1622 | if (!(open_flag & O_CREAT)) { | ||
1623 | error = do_lookup(nd, &nd->last, path); | ||
1624 | if (error) | ||
1625 | goto exit; | ||
1626 | error = -ENOENT; | ||
1627 | if (!path->dentry->d_inode) | ||
1628 | goto exit_dput; | ||
1629 | if (path->dentry->d_inode->i_op->follow_link) | ||
1630 | return NULL; | ||
1631 | error = -ENOTDIR; | ||
1632 | if (nd->flags & LOOKUP_DIRECTORY) { | ||
1633 | if (!path->dentry->d_inode->i_op->lookup) | ||
1634 | goto exit_dput; | ||
1635 | } | ||
1636 | path_to_nameidata(path, nd); | ||
1637 | audit_inode(pathname, nd->path.dentry); | ||
1638 | goto ok; | ||
1639 | } | ||
1640 | 2082 | ||
1641 | /* OK, it's O_CREAT */ | ||
1642 | mutex_lock(&dir->d_inode->i_mutex); | 2083 | mutex_lock(&dir->d_inode->i_mutex); |
1643 | 2084 | ||
1644 | path->dentry = lookup_hash(nd); | 2085 | path->dentry = lookup_hash(nd); |
@@ -1709,8 +2150,9 @@ static struct file *do_last(struct nameidata *nd, struct path *path, | |||
1709 | return NULL; | 2150 | return NULL; |
1710 | 2151 | ||
1711 | path_to_nameidata(path, nd); | 2152 | path_to_nameidata(path, nd); |
2153 | nd->inode = path->dentry->d_inode; | ||
1712 | error = -EISDIR; | 2154 | error = -EISDIR; |
1713 | if (S_ISDIR(path->dentry->d_inode->i_mode)) | 2155 | if (S_ISDIR(nd->inode->i_mode)) |
1714 | goto exit; | 2156 | goto exit; |
1715 | ok: | 2157 | ok: |
1716 | filp = finish_open(nd, open_flag, acc_mode); | 2158 | filp = finish_open(nd, open_flag, acc_mode); |
@@ -1741,7 +2183,7 @@ struct file *do_filp_open(int dfd, const char *pathname, | |||
1741 | struct path path; | 2183 | struct path path; |
1742 | int count = 0; | 2184 | int count = 0; |
1743 | int flag = open_to_namei_flags(open_flag); | 2185 | int flag = open_to_namei_flags(open_flag); |
1744 | int force_reval = 0; | 2186 | int flags; |
1745 | 2187 | ||
1746 | if (!(open_flag & O_CREAT)) | 2188 | if (!(open_flag & O_CREAT)) |
1747 | mode = 0; | 2189 | mode = 0; |
@@ -1770,54 +2212,84 @@ struct file *do_filp_open(int dfd, const char *pathname, | |||
1770 | if (open_flag & O_APPEND) | 2212 | if (open_flag & O_APPEND) |
1771 | acc_mode |= MAY_APPEND; | 2213 | acc_mode |= MAY_APPEND; |
1772 | 2214 | ||
1773 | /* find the parent */ | 2215 | flags = LOOKUP_OPEN; |
1774 | reval: | 2216 | if (open_flag & O_CREAT) { |
1775 | error = path_init(dfd, pathname, LOOKUP_PARENT, &nd); | 2217 | flags |= LOOKUP_CREATE; |
2218 | if (open_flag & O_EXCL) | ||
2219 | flags |= LOOKUP_EXCL; | ||
2220 | } | ||
2221 | if (open_flag & O_DIRECTORY) | ||
2222 | flags |= LOOKUP_DIRECTORY; | ||
2223 | if (!(open_flag & O_NOFOLLOW)) | ||
2224 | flags |= LOOKUP_FOLLOW; | ||
2225 | |||
2226 | filp = get_empty_filp(); | ||
2227 | if (!filp) | ||
2228 | return ERR_PTR(-ENFILE); | ||
2229 | |||
2230 | filp->f_flags = open_flag; | ||
2231 | nd.intent.open.file = filp; | ||
2232 | nd.intent.open.flags = flag; | ||
2233 | nd.intent.open.create_mode = mode; | ||
2234 | |||
2235 | if (open_flag & O_CREAT) | ||
2236 | goto creat; | ||
2237 | |||
2238 | /* !O_CREAT, simple open */ | ||
2239 | error = do_path_lookup(dfd, pathname, flags, &nd); | ||
2240 | if (unlikely(error)) | ||
2241 | goto out_filp; | ||
2242 | error = -ELOOP; | ||
2243 | if (!(nd.flags & LOOKUP_FOLLOW)) { | ||
2244 | if (nd.inode->i_op->follow_link) | ||
2245 | goto out_path; | ||
2246 | } | ||
2247 | error = -ENOTDIR; | ||
2248 | if (nd.flags & LOOKUP_DIRECTORY) { | ||
2249 | if (!nd.inode->i_op->lookup) | ||
2250 | goto out_path; | ||
2251 | } | ||
2252 | audit_inode(pathname, nd.path.dentry); | ||
2253 | filp = finish_open(&nd, open_flag, acc_mode); | ||
2254 | return filp; | ||
2255 | |||
2256 | creat: | ||
2257 | /* OK, have to create the file. Find the parent. */ | ||
2258 | error = path_init_rcu(dfd, pathname, | ||
2259 | LOOKUP_PARENT | (flags & LOOKUP_REVAL), &nd); | ||
1776 | if (error) | 2260 | if (error) |
1777 | return ERR_PTR(error); | 2261 | goto out_filp; |
1778 | if (force_reval) | 2262 | error = path_walk_rcu(pathname, &nd); |
1779 | nd.flags |= LOOKUP_REVAL; | 2263 | path_finish_rcu(&nd); |
2264 | if (unlikely(error == -ECHILD || error == -ESTALE)) { | ||
2265 | /* slower, locked walk */ | ||
2266 | if (error == -ESTALE) { | ||
2267 | reval: | ||
2268 | flags |= LOOKUP_REVAL; | ||
2269 | } | ||
2270 | error = path_init(dfd, pathname, | ||
2271 | LOOKUP_PARENT | (flags & LOOKUP_REVAL), &nd); | ||
2272 | if (error) | ||
2273 | goto out_filp; | ||
1780 | 2274 | ||
1781 | current->total_link_count = 0; | 2275 | error = path_walk_simple(pathname, &nd); |
1782 | error = link_path_walk(pathname, &nd); | ||
1783 | if (error) { | ||
1784 | filp = ERR_PTR(error); | ||
1785 | goto out; | ||
1786 | } | 2276 | } |
1787 | if (unlikely(!audit_dummy_context()) && (open_flag & O_CREAT)) | 2277 | if (unlikely(error)) |
2278 | goto out_filp; | ||
2279 | if (unlikely(!audit_dummy_context())) | ||
1788 | audit_inode(pathname, nd.path.dentry); | 2280 | audit_inode(pathname, nd.path.dentry); |
1789 | 2281 | ||
1790 | /* | 2282 | /* |
1791 | * We have the parent and last component. | 2283 | * We have the parent and last component. |
1792 | */ | 2284 | */ |
1793 | 2285 | nd.flags = flags; | |
1794 | error = -ENFILE; | ||
1795 | filp = get_empty_filp(); | ||
1796 | if (filp == NULL) | ||
1797 | goto exit_parent; | ||
1798 | nd.intent.open.file = filp; | ||
1799 | filp->f_flags = open_flag; | ||
1800 | nd.intent.open.flags = flag; | ||
1801 | nd.intent.open.create_mode = mode; | ||
1802 | nd.flags &= ~LOOKUP_PARENT; | ||
1803 | nd.flags |= LOOKUP_OPEN; | ||
1804 | if (open_flag & O_CREAT) { | ||
1805 | nd.flags |= LOOKUP_CREATE; | ||
1806 | if (open_flag & O_EXCL) | ||
1807 | nd.flags |= LOOKUP_EXCL; | ||
1808 | } | ||
1809 | if (open_flag & O_DIRECTORY) | ||
1810 | nd.flags |= LOOKUP_DIRECTORY; | ||
1811 | if (!(open_flag & O_NOFOLLOW)) | ||
1812 | nd.flags |= LOOKUP_FOLLOW; | ||
1813 | filp = do_last(&nd, &path, open_flag, acc_mode, mode, pathname); | 2286 | filp = do_last(&nd, &path, open_flag, acc_mode, mode, pathname); |
1814 | while (unlikely(!filp)) { /* trailing symlink */ | 2287 | while (unlikely(!filp)) { /* trailing symlink */ |
1815 | struct path holder; | 2288 | struct path holder; |
1816 | struct inode *inode = path.dentry->d_inode; | ||
1817 | void *cookie; | 2289 | void *cookie; |
1818 | error = -ELOOP; | 2290 | error = -ELOOP; |
1819 | /* S_ISDIR part is a temporary automount kludge */ | 2291 | /* S_ISDIR part is a temporary automount kludge */ |
1820 | if (!(nd.flags & LOOKUP_FOLLOW) && !S_ISDIR(inode->i_mode)) | 2292 | if (!(nd.flags & LOOKUP_FOLLOW) && !S_ISDIR(nd.inode->i_mode)) |
1821 | goto exit_dput; | 2293 | goto exit_dput; |
1822 | if (count++ == 32) | 2294 | if (count++ == 32) |
1823 | goto exit_dput; | 2295 | goto exit_dput; |
@@ -1838,36 +2310,33 @@ reval: | |||
1838 | goto exit_dput; | 2310 | goto exit_dput; |
1839 | error = __do_follow_link(&path, &nd, &cookie); | 2311 | error = __do_follow_link(&path, &nd, &cookie); |
1840 | if (unlikely(error)) { | 2312 | if (unlikely(error)) { |
2313 | if (!IS_ERR(cookie) && nd.inode->i_op->put_link) | ||
2314 | nd.inode->i_op->put_link(path.dentry, &nd, cookie); | ||
1841 | /* nd.path had been dropped */ | 2315 | /* nd.path had been dropped */ |
1842 | if (!IS_ERR(cookie) && inode->i_op->put_link) | 2316 | nd.path = path; |
1843 | inode->i_op->put_link(path.dentry, &nd, cookie); | 2317 | goto out_path; |
1844 | path_put(&path); | ||
1845 | release_open_intent(&nd); | ||
1846 | filp = ERR_PTR(error); | ||
1847 | goto out; | ||
1848 | } | 2318 | } |
1849 | holder = path; | 2319 | holder = path; |
1850 | nd.flags &= ~LOOKUP_PARENT; | 2320 | nd.flags &= ~LOOKUP_PARENT; |
1851 | filp = do_last(&nd, &path, open_flag, acc_mode, mode, pathname); | 2321 | filp = do_last(&nd, &path, open_flag, acc_mode, mode, pathname); |
1852 | if (inode->i_op->put_link) | 2322 | if (nd.inode->i_op->put_link) |
1853 | inode->i_op->put_link(holder.dentry, &nd, cookie); | 2323 | nd.inode->i_op->put_link(holder.dentry, &nd, cookie); |
1854 | path_put(&holder); | 2324 | path_put(&holder); |
1855 | } | 2325 | } |
1856 | out: | 2326 | out: |
1857 | if (nd.root.mnt) | 2327 | if (nd.root.mnt) |
1858 | path_put(&nd.root); | 2328 | path_put(&nd.root); |
1859 | if (filp == ERR_PTR(-ESTALE) && !force_reval) { | 2329 | if (filp == ERR_PTR(-ESTALE) && !(flags & LOOKUP_REVAL)) |
1860 | force_reval = 1; | ||
1861 | goto reval; | 2330 | goto reval; |
1862 | } | ||
1863 | return filp; | 2331 | return filp; |
1864 | 2332 | ||
1865 | exit_dput: | 2333 | exit_dput: |
1866 | path_put_conditional(&path, &nd); | 2334 | path_put_conditional(&path, &nd); |
2335 | out_path: | ||
2336 | path_put(&nd.path); | ||
2337 | out_filp: | ||
1867 | if (!IS_ERR(nd.intent.open.file)) | 2338 | if (!IS_ERR(nd.intent.open.file)) |
1868 | release_open_intent(&nd); | 2339 | release_open_intent(&nd); |
1869 | exit_parent: | ||
1870 | path_put(&nd.path); | ||
1871 | filp = ERR_PTR(error); | 2340 | filp = ERR_PTR(error); |
1872 | goto out; | 2341 | goto out; |
1873 | } | 2342 | } |
diff --git a/fs/proc/proc_sysctl.c b/fs/proc/proc_sysctl.c index ae4b0fd9033f..998e3a715bcc 100644 --- a/fs/proc/proc_sysctl.c +++ b/fs/proc/proc_sysctl.c | |||
@@ -402,6 +402,10 @@ static int proc_sys_compare(const struct dentry *parent, | |||
402 | const struct dentry *dentry, const struct inode *inode, | 402 | const struct dentry *dentry, const struct inode *inode, |
403 | unsigned int len, const char *str, const struct qstr *name) | 403 | unsigned int len, const char *str, const struct qstr *name) |
404 | { | 404 | { |
405 | /* Although proc doesn't have negative dentries, rcu-walk means | ||
406 | * that inode here can be NULL */ | ||
407 | if (!inode) | ||
408 | return 0; | ||
405 | if (name->len != len) | 409 | if (name->len != len) |
406 | return 1; | 410 | return 1; |
407 | if (memcmp(name->name, str, len)) | 411 | if (memcmp(name->name, str, len)) |
diff --git a/include/linux/dcache.h b/include/linux/dcache.h index ca648685f0cc..c2e7390289cc 100644 --- a/include/linux/dcache.h +++ b/include/linux/dcache.h | |||
@@ -5,6 +5,7 @@ | |||
5 | #include <linux/list.h> | 5 | #include <linux/list.h> |
6 | #include <linux/rculist.h> | 6 | #include <linux/rculist.h> |
7 | #include <linux/spinlock.h> | 7 | #include <linux/spinlock.h> |
8 | #include <linux/seqlock.h> | ||
8 | #include <linux/cache.h> | 9 | #include <linux/cache.h> |
9 | #include <linux/rcupdate.h> | 10 | #include <linux/rcupdate.h> |
10 | 11 | ||
@@ -90,6 +91,7 @@ struct dentry { | |||
90 | unsigned int d_count; /* protected by d_lock */ | 91 | unsigned int d_count; /* protected by d_lock */ |
91 | unsigned int d_flags; /* protected by d_lock */ | 92 | unsigned int d_flags; /* protected by d_lock */ |
92 | spinlock_t d_lock; /* per dentry lock */ | 93 | spinlock_t d_lock; /* per dentry lock */ |
94 | seqcount_t d_seq; /* per dentry seqlock */ | ||
93 | int d_mounted; | 95 | int d_mounted; |
94 | struct inode *d_inode; /* Where the name belongs to - NULL is | 96 | struct inode *d_inode; /* Where the name belongs to - NULL is |
95 | * negative */ | 97 | * negative */ |
@@ -266,9 +268,33 @@ extern void d_move(struct dentry *, struct dentry *); | |||
266 | extern struct dentry *d_ancestor(struct dentry *, struct dentry *); | 268 | extern struct dentry *d_ancestor(struct dentry *, struct dentry *); |
267 | 269 | ||
268 | /* appendix may either be NULL or be used for transname suffixes */ | 270 | /* appendix may either be NULL or be used for transname suffixes */ |
269 | extern struct dentry * d_lookup(struct dentry *, struct qstr *); | 271 | extern struct dentry *d_lookup(struct dentry *, struct qstr *); |
270 | extern struct dentry * __d_lookup(struct dentry *, struct qstr *); | 272 | extern struct dentry *d_hash_and_lookup(struct dentry *, struct qstr *); |
271 | extern struct dentry * d_hash_and_lookup(struct dentry *, struct qstr *); | 273 | extern struct dentry *__d_lookup(struct dentry *, struct qstr *); |
274 | extern struct dentry *__d_lookup_rcu(struct dentry *parent, struct qstr *name, | ||
275 | unsigned *seq, struct inode **inode); | ||
276 | |||
277 | /** | ||
278 | * __d_rcu_to_refcount - take a refcount on dentry if sequence check is ok | ||
279 | * @dentry: dentry to take a ref on | ||
280 | * @seq: seqcount to verify against | ||
281 | * @Returns: 0 on failure, else 1. | ||
282 | * | ||
283 | * __d_rcu_to_refcount operates on a dentry,seq pair that was returned | ||
284 | * by __d_lookup_rcu, to get a reference on an rcu-walk dentry. | ||
285 | */ | ||
286 | static inline int __d_rcu_to_refcount(struct dentry *dentry, unsigned seq) | ||
287 | { | ||
288 | int ret = 0; | ||
289 | |||
290 | assert_spin_locked(&dentry->d_lock); | ||
291 | if (!read_seqcount_retry(&dentry->d_seq, seq)) { | ||
292 | ret = 1; | ||
293 | dentry->d_count++; | ||
294 | } | ||
295 | |||
296 | return ret; | ||
297 | } | ||
272 | 298 | ||
273 | /* validate "insecure" dentry pointer */ | 299 | /* validate "insecure" dentry pointer */ |
274 | extern int d_validate(struct dentry *, struct dentry *); | 300 | extern int d_validate(struct dentry *, struct dentry *); |
diff --git a/include/linux/namei.h b/include/linux/namei.h index aec730b53935..18d06add0a40 100644 --- a/include/linux/namei.h +++ b/include/linux/namei.h | |||
@@ -19,7 +19,10 @@ struct nameidata { | |||
19 | struct path path; | 19 | struct path path; |
20 | struct qstr last; | 20 | struct qstr last; |
21 | struct path root; | 21 | struct path root; |
22 | struct file *file; | ||
23 | struct inode *inode; /* path.dentry.d_inode */ | ||
22 | unsigned int flags; | 24 | unsigned int flags; |
25 | unsigned seq; | ||
23 | int last_type; | 26 | int last_type; |
24 | unsigned depth; | 27 | unsigned depth; |
25 | char *saved_names[MAX_NESTED_LINKS + 1]; | 28 | char *saved_names[MAX_NESTED_LINKS + 1]; |
@@ -43,11 +46,13 @@ enum {LAST_NORM, LAST_ROOT, LAST_DOT, LAST_DOTDOT, LAST_BIND}; | |||
43 | * - internal "there are more path components" flag | 46 | * - internal "there are more path components" flag |
44 | * - dentry cache is untrusted; force a real lookup | 47 | * - dentry cache is untrusted; force a real lookup |
45 | */ | 48 | */ |
46 | #define LOOKUP_FOLLOW 1 | 49 | #define LOOKUP_FOLLOW 0x0001 |
47 | #define LOOKUP_DIRECTORY 2 | 50 | #define LOOKUP_DIRECTORY 0x0002 |
48 | #define LOOKUP_CONTINUE 4 | 51 | #define LOOKUP_CONTINUE 0x0004 |
49 | #define LOOKUP_PARENT 16 | 52 | |
50 | #define LOOKUP_REVAL 64 | 53 | #define LOOKUP_PARENT 0x0010 |
54 | #define LOOKUP_REVAL 0x0020 | ||
55 | #define LOOKUP_RCU 0x0040 | ||
51 | /* | 56 | /* |
52 | * Intent data | 57 | * Intent data |
53 | */ | 58 | */ |
diff --git a/include/linux/security.h b/include/linux/security.h index fd4d55fb8845..ed95401970c7 100644 --- a/include/linux/security.h +++ b/include/linux/security.h | |||
@@ -457,7 +457,6 @@ static inline void security_free_mnt_opts(struct security_mnt_opts *opts) | |||
457 | * called when the actual read/write operations are performed. | 457 | * called when the actual read/write operations are performed. |
458 | * @inode contains the inode structure to check. | 458 | * @inode contains the inode structure to check. |
459 | * @mask contains the permission mask. | 459 | * @mask contains the permission mask. |
460 | * @nd contains the nameidata (may be NULL). | ||
461 | * Return 0 if permission is granted. | 460 | * Return 0 if permission is granted. |
462 | * @inode_setattr: | 461 | * @inode_setattr: |
463 | * Check permission before setting file attributes. Note that the kernel | 462 | * Check permission before setting file attributes. Note that the kernel |
@@ -1713,6 +1712,7 @@ int security_inode_rename(struct inode *old_dir, struct dentry *old_dentry, | |||
1713 | int security_inode_readlink(struct dentry *dentry); | 1712 | int security_inode_readlink(struct dentry *dentry); |
1714 | int security_inode_follow_link(struct dentry *dentry, struct nameidata *nd); | 1713 | int security_inode_follow_link(struct dentry *dentry, struct nameidata *nd); |
1715 | int security_inode_permission(struct inode *inode, int mask); | 1714 | int security_inode_permission(struct inode *inode, int mask); |
1715 | int security_inode_exec_permission(struct inode *inode, unsigned int flags); | ||
1716 | int security_inode_setattr(struct dentry *dentry, struct iattr *attr); | 1716 | int security_inode_setattr(struct dentry *dentry, struct iattr *attr); |
1717 | int security_inode_getattr(struct vfsmount *mnt, struct dentry *dentry); | 1717 | int security_inode_getattr(struct vfsmount *mnt, struct dentry *dentry); |
1718 | int security_inode_setxattr(struct dentry *dentry, const char *name, | 1718 | int security_inode_setxattr(struct dentry *dentry, const char *name, |
@@ -2102,6 +2102,12 @@ static inline int security_inode_permission(struct inode *inode, int mask) | |||
2102 | return 0; | 2102 | return 0; |
2103 | } | 2103 | } |
2104 | 2104 | ||
2105 | static inline int security_inode_exec_permission(struct inode *inode, | ||
2106 | unsigned int flags) | ||
2107 | { | ||
2108 | return 0; | ||
2109 | } | ||
2110 | |||
2105 | static inline int security_inode_setattr(struct dentry *dentry, | 2111 | static inline int security_inode_setattr(struct dentry *dentry, |
2106 | struct iattr *attr) | 2112 | struct iattr *attr) |
2107 | { | 2113 | { |
diff --git a/security/security.c b/security/security.c index 1b798d3df710..c645e263ca8d 100644 --- a/security/security.c +++ b/security/security.c | |||
@@ -513,6 +513,15 @@ int security_inode_permission(struct inode *inode, int mask) | |||
513 | return security_ops->inode_permission(inode, mask); | 513 | return security_ops->inode_permission(inode, mask); |
514 | } | 514 | } |
515 | 515 | ||
516 | int security_inode_exec_permission(struct inode *inode, unsigned int flags) | ||
517 | { | ||
518 | if (unlikely(IS_PRIVATE(inode))) | ||
519 | return 0; | ||
520 | if (flags) | ||
521 | return -ECHILD; | ||
522 | return security_ops->inode_permission(inode, MAY_EXEC); | ||
523 | } | ||
524 | |||
516 | int security_inode_setattr(struct dentry *dentry, struct iattr *attr) | 525 | int security_inode_setattr(struct dentry *dentry, struct iattr *attr) |
517 | { | 526 | { |
518 | if (unlikely(IS_PRIVATE(dentry->d_inode))) | 527 | if (unlikely(IS_PRIVATE(dentry->d_inode))) |