diff options
Diffstat (limited to 'Documentation/filesystems')
-rw-r--r-- | Documentation/filesystems/Locking | 29 | ||||
-rw-r--r-- | Documentation/filesystems/dentry-locking.txt | 174 | ||||
-rw-r--r-- | Documentation/filesystems/ntfs.txt | 3 | ||||
-rw-r--r-- | Documentation/filesystems/path-lookup.txt | 382 | ||||
-rw-r--r-- | Documentation/filesystems/porting | 78 | ||||
-rw-r--r-- | Documentation/filesystems/proc.txt | 31 | ||||
-rw-r--r-- | Documentation/filesystems/vfs.txt | 74 |
7 files changed, 566 insertions, 205 deletions
diff --git a/Documentation/filesystems/Locking b/Documentation/filesystems/Locking index 01d7cbdcd3bd..ef9349a4b5d1 100644 --- a/Documentation/filesystems/Locking +++ b/Documentation/filesystems/Locking | |||
@@ -9,22 +9,25 @@ be able to use diff(1). | |||
9 | 9 | ||
10 | --------------------------- dentry_operations -------------------------- | 10 | --------------------------- dentry_operations -------------------------- |
11 | prototypes: | 11 | prototypes: |
12 | int (*d_revalidate)(struct dentry *, int); | 12 | int (*d_revalidate)(struct dentry *, struct nameidata *); |
13 | int (*d_hash) (struct dentry *, struct qstr *); | 13 | int (*d_hash)(const struct dentry *, const struct inode *, |
14 | int (*d_compare) (struct dentry *, struct qstr *, struct qstr *); | 14 | struct qstr *); |
15 | int (*d_compare)(const struct dentry *, const struct inode *, | ||
16 | const struct dentry *, const struct inode *, | ||
17 | unsigned int, const char *, const struct qstr *); | ||
15 | int (*d_delete)(struct dentry *); | 18 | int (*d_delete)(struct dentry *); |
16 | void (*d_release)(struct dentry *); | 19 | void (*d_release)(struct dentry *); |
17 | void (*d_iput)(struct dentry *, struct inode *); | 20 | void (*d_iput)(struct dentry *, struct inode *); |
18 | char *(*d_dname)((struct dentry *dentry, char *buffer, int buflen); | 21 | char *(*d_dname)((struct dentry *dentry, char *buffer, int buflen); |
19 | 22 | ||
20 | locking rules: | 23 | locking rules: |
21 | dcache_lock rename_lock ->d_lock may block | 24 | rename_lock ->d_lock may block rcu-walk |
22 | d_revalidate: no no no yes | 25 | d_revalidate: no no yes (ref-walk) maybe |
23 | d_hash no no no yes | 26 | d_hash no no no maybe |
24 | d_compare: no yes no no | 27 | d_compare: yes no no maybe |
25 | d_delete: yes no yes no | 28 | d_delete: no yes no no |
26 | d_release: no no no yes | 29 | d_release: no no yes no |
27 | d_iput: no no no yes | 30 | d_iput: no no yes no |
28 | d_dname: no no no no | 31 | d_dname: no no no no |
29 | 32 | ||
30 | --------------------------- inode_operations --------------------------- | 33 | --------------------------- inode_operations --------------------------- |
@@ -44,8 +47,8 @@ ata *); | |||
44 | void * (*follow_link) (struct dentry *, struct nameidata *); | 47 | void * (*follow_link) (struct dentry *, struct nameidata *); |
45 | void (*put_link) (struct dentry *, struct nameidata *, void *); | 48 | void (*put_link) (struct dentry *, struct nameidata *, void *); |
46 | void (*truncate) (struct inode *); | 49 | void (*truncate) (struct inode *); |
47 | int (*permission) (struct inode *, int, struct nameidata *); | 50 | int (*permission) (struct inode *, int, unsigned int); |
48 | int (*check_acl)(struct inode *, int); | 51 | int (*check_acl)(struct inode *, int, unsigned int); |
49 | int (*setattr) (struct dentry *, struct iattr *); | 52 | int (*setattr) (struct dentry *, struct iattr *); |
50 | int (*getattr) (struct vfsmount *, struct dentry *, struct kstat *); | 53 | int (*getattr) (struct vfsmount *, struct dentry *, struct kstat *); |
51 | int (*setxattr) (struct dentry *, const char *,const void *,size_t,int); | 54 | int (*setxattr) (struct dentry *, const char *,const void *,size_t,int); |
@@ -73,7 +76,7 @@ follow_link: no | |||
73 | put_link: no | 76 | put_link: no |
74 | truncate: yes (see below) | 77 | truncate: yes (see below) |
75 | setattr: yes | 78 | setattr: yes |
76 | permission: no | 79 | permission: no (may not block if called in rcu-walk mode) |
77 | check_acl: no | 80 | check_acl: no |
78 | getattr: no | 81 | getattr: no |
79 | setxattr: yes | 82 | setxattr: yes |
diff --git a/Documentation/filesystems/dentry-locking.txt b/Documentation/filesystems/dentry-locking.txt deleted file mode 100644 index 79334ed5daa7..000000000000 --- a/Documentation/filesystems/dentry-locking.txt +++ /dev/null | |||
@@ -1,174 +0,0 @@ | |||
1 | RCU-based dcache locking model | ||
2 | ============================== | ||
3 | |||
4 | On many workloads, the most common operation on dcache is to look up a | ||
5 | dentry, given a parent dentry and the name of the child. Typically, | ||
6 | for every open(), stat() etc., the dentry corresponding to the | ||
7 | pathname will be looked up by walking the tree starting with the first | ||
8 | component of the pathname and using that dentry along with the next | ||
9 | component to look up the next level and so on. Since it is a frequent | ||
10 | operation for workloads like multiuser environments and web servers, | ||
11 | it is important to optimize this path. | ||
12 | |||
13 | Prior to 2.5.10, dcache_lock was acquired in d_lookup and thus in | ||
14 | every component during path look-up. Since 2.5.10 onwards, fast-walk | ||
15 | algorithm changed this by holding the dcache_lock at the beginning and | ||
16 | walking as many cached path component dentries as possible. This | ||
17 | significantly decreases the number of acquisition of | ||
18 | dcache_lock. However it also increases the lock hold time | ||
19 | significantly and affects performance in large SMP machines. Since | ||
20 | 2.5.62 kernel, dcache has been using a new locking model that uses RCU | ||
21 | to make dcache look-up lock-free. | ||
22 | |||
23 | The current dcache locking model is not very different from the | ||
24 | existing dcache locking model. Prior to 2.5.62 kernel, dcache_lock | ||
25 | protected the hash chain, d_child, d_alias, d_lru lists as well as | ||
26 | d_inode and several other things like mount look-up. RCU-based changes | ||
27 | affect only the way the hash chain is protected. For everything else | ||
28 | the dcache_lock must be taken for both traversing as well as | ||
29 | updating. The hash chain updates too take the dcache_lock. The | ||
30 | significant change is the way d_lookup traverses the hash chain, it | ||
31 | doesn't acquire the dcache_lock for this and rely on RCU to ensure | ||
32 | that the dentry has not been *freed*. | ||
33 | |||
34 | |||
35 | Dcache locking details | ||
36 | ====================== | ||
37 | |||
38 | For many multi-user workloads, open() and stat() on files are very | ||
39 | frequently occurring operations. Both involve walking of path names to | ||
40 | find the dentry corresponding to the concerned file. In 2.4 kernel, | ||
41 | dcache_lock was held during look-up of each path component. Contention | ||
42 | and cache-line bouncing of this global lock caused significant | ||
43 | scalability problems. With the introduction of RCU in Linux kernel, | ||
44 | this was worked around by making the look-up of path components during | ||
45 | path walking lock-free. | ||
46 | |||
47 | |||
48 | Safe lock-free look-up of dcache hash table | ||
49 | =========================================== | ||
50 | |||
51 | Dcache is a complex data structure with the hash table entries also | ||
52 | linked together in other lists. In 2.4 kernel, dcache_lock protected | ||
53 | all the lists. We applied RCU only on hash chain walking. The rest of | ||
54 | the lists are still protected by dcache_lock. Some of the important | ||
55 | changes are : | ||
56 | |||
57 | 1. The deletion from hash chain is done using hlist_del_rcu() macro | ||
58 | which doesn't initialize next pointer of the deleted dentry and | ||
59 | this allows us to walk safely lock-free while a deletion is | ||
60 | happening. | ||
61 | |||
62 | 2. Insertion of a dentry into the hash table is done using | ||
63 | hlist_add_head_rcu() which take care of ordering the writes - the | ||
64 | writes to the dentry must be visible before the dentry is | ||
65 | inserted. This works in conjunction with hlist_for_each_rcu(), | ||
66 | which has since been replaced by hlist_for_each_entry_rcu(), while | ||
67 | walking the hash chain. The only requirement is that all | ||
68 | initialization to the dentry must be done before | ||
69 | hlist_add_head_rcu() since we don't have dcache_lock protection | ||
70 | while traversing the hash chain. This isn't different from the | ||
71 | existing code. | ||
72 | |||
73 | 3. The dentry looked up without holding dcache_lock cannot be | ||
74 | returned for walking if it is unhashed. It then may have a NULL | ||
75 | d_inode or other bogosity since RCU doesn't protect the other | ||
76 | fields in the dentry. We therefore use a flag DCACHE_UNHASHED to | ||
77 | indicate unhashed dentries and use this in conjunction with a | ||
78 | per-dentry lock (d_lock). Once looked up without the dcache_lock, | ||
79 | we acquire the per-dentry lock (d_lock) and check if the dentry is | ||
80 | unhashed. If so, the look-up is failed. If not, the reference count | ||
81 | of the dentry is increased and the dentry is returned. | ||
82 | |||
83 | 4. Once a dentry is looked up, it must be ensured during the path walk | ||
84 | for that component it doesn't go away. In pre-2.5.10 code, this was | ||
85 | done holding a reference to the dentry. dcache_rcu does the same. | ||
86 | In some sense, dcache_rcu path walking looks like the pre-2.5.10 | ||
87 | version. | ||
88 | |||
89 | 5. All dentry hash chain updates must take the dcache_lock as well as | ||
90 | the per-dentry lock in that order. dput() does this to ensure that | ||
91 | a dentry that has just been looked up in another CPU doesn't get | ||
92 | deleted before dget() can be done on it. | ||
93 | |||
94 | 6. There are several ways to do reference counting of RCU protected | ||
95 | objects. One such example is in ipv4 route cache where deferred | ||
96 | freeing (using call_rcu()) is done as soon as the reference count | ||
97 | goes to zero. This cannot be done in the case of dentries because | ||
98 | tearing down of dentries require blocking (dentry_iput()) which | ||
99 | isn't supported from RCU callbacks. Instead, tearing down of | ||
100 | dentries happen synchronously in dput(), but actual freeing happens | ||
101 | later when RCU grace period is over. This allows safe lock-free | ||
102 | walking of the hash chains, but a matched dentry may have been | ||
103 | partially torn down. The checking of DCACHE_UNHASHED flag with | ||
104 | d_lock held detects such dentries and prevents them from being | ||
105 | returned from look-up. | ||
106 | |||
107 | |||
108 | Maintaining POSIX rename semantics | ||
109 | ================================== | ||
110 | |||
111 | Since look-up of dentries is lock-free, it can race against a | ||
112 | concurrent rename operation. For example, during rename of file A to | ||
113 | B, look-up of either A or B must succeed. So, if look-up of B happens | ||
114 | after A has been removed from the hash chain but not added to the new | ||
115 | hash chain, it may fail. Also, a comparison while the name is being | ||
116 | written concurrently by a rename may result in false positive matches | ||
117 | violating rename semantics. Issues related to race with rename are | ||
118 | handled as described below : | ||
119 | |||
120 | 1. Look-up can be done in two ways - d_lookup() which is safe from | ||
121 | simultaneous renames and __d_lookup() which is not. If | ||
122 | __d_lookup() fails, it must be followed up by a d_lookup() to | ||
123 | correctly determine whether a dentry is in the hash table or | ||
124 | not. d_lookup() protects look-ups using a sequence lock | ||
125 | (rename_lock). | ||
126 | |||
127 | 2. The name associated with a dentry (d_name) may be changed if a | ||
128 | rename is allowed to happen simultaneously. To avoid memcmp() in | ||
129 | __d_lookup() go out of bounds due to a rename and false positive | ||
130 | comparison, the name comparison is done while holding the | ||
131 | per-dentry lock. This prevents concurrent renames during this | ||
132 | operation. | ||
133 | |||
134 | 3. Hash table walking during look-up may move to a different bucket as | ||
135 | the current dentry is moved to a different bucket due to rename. | ||
136 | But we use hlists in dcache hash table and they are | ||
137 | null-terminated. So, even if a dentry moves to a different bucket, | ||
138 | hash chain walk will terminate. [with a list_head list, it may not | ||
139 | since termination is when the list_head in the original bucket is | ||
140 | reached]. Since we redo the d_parent check and compare name while | ||
141 | holding d_lock, lock-free look-up will not race against d_move(). | ||
142 | |||
143 | 4. There can be a theoretical race when a dentry keeps coming back to | ||
144 | original bucket due to double moves. Due to this look-up may | ||
145 | consider that it has never moved and can end up in an infinite loop. | ||
146 | But this is not any worse than theoretical livelocks we already | ||
147 | have in the kernel. | ||
148 | |||
149 | |||
150 | Important guidelines for filesystem developers related to dcache_rcu | ||
151 | ==================================================================== | ||
152 | |||
153 | 1. Existing dcache interfaces (pre-2.5.62) exported to filesystem | ||
154 | don't change. Only dcache internal implementation changes. However | ||
155 | filesystems *must not* delete from the dentry hash chains directly | ||
156 | using the list macros like allowed earlier. They must use dcache | ||
157 | APIs like d_drop() or __d_drop() depending on the situation. | ||
158 | |||
159 | 2. d_flags is now protected by a per-dentry lock (d_lock). All access | ||
160 | to d_flags must be protected by it. | ||
161 | |||
162 | 3. For a hashed dentry, checking of d_count needs to be protected by | ||
163 | d_lock. | ||
164 | |||
165 | |||
166 | Papers and other documentation on dcache locking | ||
167 | ================================================ | ||
168 | |||
169 | 1. Scaling dcache with RCU (http://linuxjournal.com/article.php?sid=7124). | ||
170 | |||
171 | 2. http://lse.sourceforge.net/locking/dcache/dcache.html | ||
172 | |||
173 | |||
174 | |||
diff --git a/Documentation/filesystems/ntfs.txt b/Documentation/filesystems/ntfs.txt index ac2a261c5f7d..6ef8cf3bc9a3 100644 --- a/Documentation/filesystems/ntfs.txt +++ b/Documentation/filesystems/ntfs.txt | |||
@@ -457,6 +457,9 @@ ChangeLog | |||
457 | 457 | ||
458 | Note, a technical ChangeLog aimed at kernel hackers is in fs/ntfs/ChangeLog. | 458 | Note, a technical ChangeLog aimed at kernel hackers is in fs/ntfs/ChangeLog. |
459 | 459 | ||
460 | 2.1.30: | ||
461 | - Fix writev() (it kept writing the first segment over and over again | ||
462 | instead of moving onto subsequent segments). | ||
460 | 2.1.29: | 463 | 2.1.29: |
461 | - Fix a deadlock when mounting read-write. | 464 | - Fix a deadlock when mounting read-write. |
462 | 2.1.28: | 465 | 2.1.28: |
diff --git a/Documentation/filesystems/path-lookup.txt b/Documentation/filesystems/path-lookup.txt new file mode 100644 index 000000000000..eb59c8b44be9 --- /dev/null +++ b/Documentation/filesystems/path-lookup.txt | |||
@@ -0,0 +1,382 @@ | |||
1 | Path walking and name lookup locking | ||
2 | ==================================== | ||
3 | |||
4 | Path resolution is the finding of a dentry corresponding to a path name string, by | ||
5 | performing a path walk. Typically, for every open(), stat() etc., the path name | ||
6 | will be resolved. Paths are resolved by walking the namespace tree, starting | ||
7 | with the first component of the pathname (eg. root or cwd) with a known dentry, | ||
8 | then finding the child of that dentry, which is named the next component in the | ||
9 | path string. Then repeating the lookup from the child dentry and finding its | ||
10 | child with the next element, and so on. | ||
11 | |||
12 | Since it is a frequent operation for workloads like multiuser environments and | ||
13 | web servers, it is important to optimize this code. | ||
14 | |||
15 | Path walking synchronisation history: | ||
16 | Prior to 2.5.10, dcache_lock was acquired in d_lookup (dcache hash lookup) and | ||
17 | thus in every component during path look-up. Since 2.5.10 onwards, fast-walk | ||
18 | algorithm changed this by holding the dcache_lock at the beginning and walking | ||
19 | as many cached path component dentries as possible. This significantly | ||
20 | decreases the number of acquisition of dcache_lock. However it also increases | ||
21 | the lock hold time significantly and affects performance in large SMP machines. | ||
22 | Since 2.5.62 kernel, dcache has been using a new locking model that uses RCU to | ||
23 | make dcache look-up lock-free. | ||
24 | |||
25 | All the above algorithms required taking a lock and reference count on the | ||
26 | dentry that was looked up, so that it may be used as the basis for walking the | ||
27 | next path element. This is inefficient and unscalable. It is inefficient | ||
28 | because the locks and atomic operations required for every dentry element | ||
29 | slow things down. It is not scalable because many parallel applications that | ||
30 | are path-walk intensive tend to do path lookups starting from a common dentry | ||
31 | (usually, the root "/" or current working directory). So contention on these | ||
32 | common path elements causes lock and cacheline queueing. | ||
33 | |||
34 | Since 2.6.38, RCU is used to make a significant part of the entire path walk | ||
35 | (including dcache look-up) completely "store-free" (so, no locks, atomics, or | ||
36 | even stores into cachelines of common dentries). This is known as "rcu-walk" | ||
37 | path walking. | ||
38 | |||
39 | Path walking overview | ||
40 | ===================== | ||
41 | |||
42 | A name string specifies a start (root directory, cwd, fd-relative) and a | ||
43 | sequence of elements (directory entry names), which together refer to a path in | ||
44 | the namespace. A path is represented as a (dentry, vfsmount) tuple. The name | ||
45 | elements are sub-strings, separated by '/'. | ||
46 | |||
47 | Name lookups will want to find a particular path that a name string refers to | ||
48 | (usually the final element, or parent of final element). This is done by taking | ||
49 | the path given by the name's starting point (which we know in advance -- eg. | ||
50 | current->fs->cwd or current->fs->root) as the first parent of the lookup. Then | ||
51 | iteratively for each subsequent name element, look up the child of the current | ||
52 | parent with the given name and if it is not the desired entry, make it the | ||
53 | parent for the next lookup. | ||
54 | |||
55 | A parent, of course, must be a directory, and we must have appropriate | ||
56 | permissions on the parent inode to be able to walk into it. | ||
57 | |||
58 | Turning the child into a parent for the next lookup requires more checks and | ||
59 | procedures. Symlinks essentially substitute the symlink name for the target | ||
60 | name in the name string, and require some recursive path walking. Mount points | ||
61 | must be followed into (thus changing the vfsmount that subsequent path elements | ||
62 | refer to), switching from the mount point path to the root of the particular | ||
63 | mounted vfsmount. These behaviours are variously modified depending on the | ||
64 | exact path walking flags. | ||
65 | |||
66 | Path walking then must, broadly, do several particular things: | ||
67 | - find the start point of the walk; | ||
68 | - perform permissions and validity checks on inodes; | ||
69 | - perform dcache hash name lookups on (parent, name element) tuples; | ||
70 | - traverse mount points; | ||
71 | - traverse symlinks; | ||
72 | - lookup and create missing parts of the path on demand. | ||
73 | |||
74 | Safe store-free look-up of dcache hash table | ||
75 | ============================================ | ||
76 | |||
77 | Dcache name lookup | ||
78 | ------------------ | ||
79 | In order to lookup a dcache (parent, name) tuple, we take a hash on the tuple | ||
80 | and use that to select a bucket in the dcache-hash table. The list of entries | ||
81 | in that bucket is then walked, and we do a full comparison of each entry | ||
82 | against our (parent, name) tuple. | ||
83 | |||
84 | The hash lists are RCU protected, so list walking is not serialised with | ||
85 | concurrent updates (insertion, deletion from the hash). This is a standard RCU | ||
86 | list application with the exception of renames, which will be covered below. | ||
87 | |||
88 | Parent and name members of a dentry, as well as its membership in the dcache | ||
89 | hash, and its inode are protected by the per-dentry d_lock spinlock. A | ||
90 | reference is taken on the dentry (while the fields are verified under d_lock), | ||
91 | and this stabilises its d_inode pointer and actual inode. This gives a stable | ||
92 | point to perform the next step of our path walk against. | ||
93 | |||
94 | These members are also protected by d_seq seqlock, although this offers | ||
95 | read-only protection and no durability of results, so care must be taken when | ||
96 | using d_seq for synchronisation (see seqcount based lookups, below). | ||
97 | |||
98 | Renames | ||
99 | ------- | ||
100 | Back to the rename case. In usual RCU protected lists, the only operations that | ||
101 | will happen to an object are insertion, and then eventually removal from the | ||
102 | list. The object will not be reused until an RCU grace period is complete. | ||
103 | This ensures the RCU list traversal primitives can run over the object without | ||
104 | problems (see RCU documentation for how this works). | ||
105 | |||
106 | However when a dentry is renamed, its hash value can change, requiring it to be | ||
107 | moved to a new hash list. Allocating and inserting a new alias would be | ||
108 | expensive and also problematic for directory dentries. Latency would be far too | ||
109 | high to wait for a grace period after removing the dentry and before inserting | ||
110 | it in the new hash bucket. So what is done is to insert the dentry into the | ||
111 | new list immediately. | ||
112 | |||
113 | However, when the dentry's list pointers are updated to point to objects in the | ||
114 | new list before waiting for a grace period, this can result in a concurrent RCU | ||
115 | lookup of the old list veering off into the new (incorrect) list and missing | ||
116 | the remaining dentries on the list. | ||
117 | |||
118 | There is no fundamental problem with walking down the wrong list, because the | ||
119 | dentry comparisons will never match. However it is fatal to miss a matching | ||
120 | dentry. So a seqlock is used to detect when a rename has occurred, and so the | ||
121 | lookup can be retried. | ||
122 | |||
123 | 1 2 3 | ||
124 | +---+ +---+ +---+ | ||
125 | hlist-->| N-+->| N-+->| N-+-> | ||
126 | head <--+-P |<-+-P |<-+-P | | ||
127 | +---+ +---+ +---+ | ||
128 | |||
129 | Rename of dentry 2 may require it to be deleted from the above list, and inserted | ||
130 | into a new list. Deleting 2 gives the following list. | ||
131 | |||
132 | 1 3 | ||
133 | +---+ +---+ (don't worry, the longer pointers do not | ||
134 | hlist-->| N-+-------->| N-+-> impose a measurable performance overhead | ||
135 | head <--+-P |<--------+-P | on modern CPUs) | ||
136 | +---+ +---+ | ||
137 | ^ 2 ^ | ||
138 | | +---+ | | ||
139 | | | N-+----+ | ||
140 | +----+-P | | ||
141 | +---+ | ||
142 | |||
143 | This is a standard RCU-list deletion, which leaves the deleted object's | ||
144 | pointers intact, so a concurrent list walker that is currently looking at | ||
145 | object 2 will correctly continue to object 3 when it is time to traverse the | ||
146 | next object. | ||
147 | |||
148 | However, when inserting object 2 onto a new list, we end up with this: | ||
149 | |||
150 | 1 3 | ||
151 | +---+ +---+ | ||
152 | hlist-->| N-+-------->| N-+-> | ||
153 | head <--+-P |<--------+-P | | ||
154 | +---+ +---+ | ||
155 | 2 | ||
156 | +---+ | ||
157 | | N-+----> | ||
158 | <----+-P | | ||
159 | +---+ | ||
160 | |||
161 | Because we didn't wait for a grace period, there may be a concurrent lookup | ||
162 | still at 2. Now when it follows 2's 'next' pointer, it will walk off into | ||
163 | another list without ever having checked object 3. | ||
164 | |||
165 | A related, but distinctly different, issue is that of rename atomicity versus | ||
166 | lookup operations. If a file is renamed from 'A' to 'B', a lookup must only | ||
167 | find either 'A' or 'B'. So if a lookup of 'A' returns NULL, a subsequent lookup | ||
168 | of 'B' must succeed (note the reverse is not true). | ||
169 | |||
170 | Between deleting the dentry from the old hash list, and inserting it on the new | ||
171 | hash list, a lookup may find neither 'A' nor 'B' matching the dentry. The same | ||
172 | rename seqlock is also used to cover this race in much the same way, by | ||
173 | retrying a negative lookup result if a rename was in progress. | ||
174 | |||
175 | Seqcount based lookups | ||
176 | ---------------------- | ||
177 | In refcount based dcache lookups, d_lock is used to serialise access to | ||
178 | the dentry, stabilising it while comparing its name and parent and then | ||
179 | taking a reference count (the reference count then gives a stable place to | ||
180 | start the next part of the path walk from). | ||
181 | |||
182 | As explained above, we would like to do path walking without taking locks or | ||
183 | reference counts on intermediate dentries along the path. To do this, a per | ||
184 | dentry seqlock (d_seq) is used to take a "coherent snapshot" of what the dentry | ||
185 | looks like (its name, parent, and inode). That snapshot is then used to start | ||
186 | the next part of the path walk. When loading the coherent snapshot under d_seq, | ||
187 | care must be taken to load the members up-front, and use those pointers rather | ||
188 | than reloading from the dentry later on (otherwise we'd have interesting things | ||
189 | like d_inode going NULL underneath us, if the name was unlinked). | ||
190 | |||
191 | Also important is to avoid performing any destructive operations (pretty much: | ||
192 | no non-atomic stores to shared data), and to recheck the seqcount when we are | ||
193 | "done" with the operation. Retry or abort if the seqcount does not match. | ||
194 | Avoiding destructive or changing operations means we can easily unwind from | ||
195 | failure. | ||
196 | |||
197 | What this means is that a caller, provided they are holding RCU lock to | ||
198 | protect the dentry object from disappearing, can perform a seqcount based | ||
199 | lookup which does not increment the refcount on the dentry or write to | ||
200 | it in any way. This returned dentry can be used for subsequent operations, | ||
201 | provided that d_seq is rechecked after that operation is complete. | ||
202 | |||
203 | Inodes are also rcu freed, so the seqcount lookup dentry's inode may also be | ||
204 | queried for permissions. | ||
205 | |||
206 | With these two parts of the puzzle, we can do path lookups without taking | ||
207 | locks or refcounts on dentry elements. | ||
208 | |||
209 | RCU-walk path walking design | ||
210 | ============================ | ||
211 | |||
212 | Path walking code now has two distinct modes, ref-walk and rcu-walk. ref-walk | ||
213 | is the traditional[*] way of performing dcache lookups using d_lock to | ||
214 | serialise concurrent modifications to the dentry and take a reference count on | ||
215 | it. ref-walk is simple and obvious, and may sleep, take locks, etc while path | ||
216 | walking is operating on each dentry. rcu-walk uses seqcount based dentry | ||
217 | lookups, and can perform lookup of intermediate elements without any stores to | ||
218 | shared data in the dentry or inode. rcu-walk can not be applied to all cases, | ||
219 | eg. if the filesystem must sleep or perform non trivial operations, rcu-walk | ||
220 | must be switched to ref-walk mode. | ||
221 | |||
222 | [*] RCU is still used for the dentry hash lookup in ref-walk, but not the full | ||
223 | path walk. | ||
224 | |||
225 | Where ref-walk uses a stable, refcounted ``parent'' to walk the remaining | ||
226 | path string, rcu-walk uses a d_seq protected snapshot. When looking up a | ||
227 | child of this parent snapshot, we open d_seq critical section on the child | ||
228 | before closing d_seq critical section on the parent. This gives an interlocking | ||
229 | ladder of snapshots to walk down. | ||
230 | |||
231 | |||
232 | proc 101 | ||
233 | /----------------\ | ||
234 | / comm: "vi" \ | ||
235 | / fs.root: dentry0 \ | ||
236 | \ fs.cwd: dentry2 / | ||
237 | \ / | ||
238 | \----------------/ | ||
239 | |||
240 | So when vi wants to open("/home/npiggin/test.c", O_RDWR), then it will | ||
241 | start from current->fs->root, which is a pinned dentry. Alternatively, | ||
242 | "./test.c" would start from cwd; both names refer to the same path in | ||
243 | the context of proc101. | ||
244 | |||
245 | dentry 0 | ||
246 | +---------------------+ rcu-walk begins here, we note d_seq, check the | ||
247 | | name: "/" | inode's permission, and then look up the next | ||
248 | | inode: 10 | path element which is "home"... | ||
249 | | children:"home", ...| | ||
250 | +---------------------+ | ||
251 | | | ||
252 | dentry 1 V | ||
253 | +---------------------+ ... which brings us here. We find dentry1 via | ||
254 | | name: "home" | hash lookup, then note d_seq and compare name | ||
255 | | inode: 678 | string and parent pointer. When we have a match, | ||
256 | | children:"npiggin" | we now recheck the d_seq of dentry0. Then we | ||
257 | +---------------------+ check inode and look up the next element. | ||
258 | | | ||
259 | dentry2 V | ||
260 | +---------------------+ Note: if dentry0 is now modified, lookup is | ||
261 | | name: "npiggin" | not necessarily invalid, so we need only keep a | ||
262 | | inode: 543 | parent for d_seq verification, and grandparents | ||
263 | | children:"a.c", ... | can be forgotten. | ||
264 | +---------------------+ | ||
265 | | | ||
266 | dentry3 V | ||
267 | +---------------------+ At this point we have our destination dentry. | ||
268 | | name: "a.c" | We now take its d_lock, verify d_seq of this | ||
269 | | inode: 14221 | dentry. If that checks out, we can increment | ||
270 | | children:NULL | its refcount because we're holding d_lock. | ||
271 | +---------------------+ | ||
272 | |||
273 | Taking a refcount on a dentry from rcu-walk mode, by taking its d_lock, | ||
274 | re-checking its d_seq, and then incrementing its refcount is called | ||
275 | "dropping rcu" or dropping from rcu-walk into ref-walk mode. | ||
276 | |||
277 | It is, in some sense, a bit of a house of cards. If the seqcount check of the | ||
278 | parent snapshot fails, the house comes down, because we had closed the d_seq | ||
279 | section on the grandparent, so we have nothing left to stand on. In that case, | ||
280 | the path walk must be fully restarted (which we do in ref-walk mode, to avoid | ||
281 | live locks). It is costly to have a full restart, but fortunately they are | ||
282 | quite rare. | ||
283 | |||
284 | When we reach a point where sleeping is required, or a filesystem callout | ||
285 | requires ref-walk, then instead of restarting the walk, we attempt to drop rcu | ||
286 | at the last known good dentry we have. Avoiding a full restart in ref-walk in | ||
287 | these cases is fundamental for performance and scalability because blocking | ||
288 | operations such as creates and unlinks are not uncommon. | ||
289 | |||
290 | The detailed design for rcu-walk is like this: | ||
291 | * LOOKUP_RCU is set in nd->flags, which distinguishes rcu-walk from ref-walk. | ||
292 | * Take the RCU lock for the entire path walk, starting with the acquiring | ||
293 | of the starting path (eg. root/cwd/fd-path). So now dentry refcounts are | ||
294 | not required for dentry persistence. | ||
295 | * synchronize_rcu is called when unregistering a filesystem, so we can | ||
296 | access d_ops and i_ops during rcu-walk. | ||
297 | * Similarly take the vfsmount lock for the entire path walk. So now mnt | ||
298 | refcounts are not required for persistence. Also we are free to perform mount | ||
299 | lookups, and to assume dentry mount points and mount roots are stable up and | ||
300 | down the path. | ||
301 | * Have a per-dentry seqlock to protect the dentry name, parent, and inode, | ||
302 | so we can load this tuple atomically, and also check whether any of its | ||
303 | members have changed. | ||
304 | * Dentry lookups (based on parent, candidate string tuple) recheck the parent | ||
305 | sequence after the child is found in case anything changed in the parent | ||
306 | during the path walk. | ||
307 | * inode is also RCU protected so we can load d_inode and use the inode for | ||
308 | limited things. | ||
309 | * i_mode, i_uid, i_gid can be tested for exec permissions during path walk. | ||
310 | * i_op can be loaded. | ||
311 | * When the destination dentry is reached, drop rcu there (ie. take d_lock, | ||
312 | verify d_seq, increment refcount). | ||
313 | * If seqlock verification fails anywhere along the path, do a full restart | ||
314 | of the path lookup in ref-walk mode. -ECHILD tends to be used (for want of | ||
315 | a better errno) to signal an rcu-walk failure. | ||
316 | |||
317 | The cases where rcu-walk cannot continue are: | ||
318 | * NULL dentry (ie. any uncached path element) | ||
319 | * Following links | ||
320 | |||
321 | It may be possible eventually to make following links rcu-walk aware. | ||
322 | |||
323 | Uncached path elements will always require dropping to ref-walk mode, at the | ||
324 | very least because i_mutex needs to be grabbed, and objects allocated. | ||
325 | |||
326 | Final note: | ||
327 | "store-free" path walking is not strictly store free. We take vfsmount lock | ||
328 | and refcounts (both of which can be made per-cpu), and we also store to the | ||
329 | stack (which is essentially CPU-local), and we also have to take locks and | ||
330 | refcount on final dentry. | ||
331 | |||
332 | The point is that shared data, where practically possible, is not locked | ||
333 | or stored into. The result is massive improvements in performance and | ||
334 | scalability of path resolution. | ||
335 | |||
336 | |||
337 | Interesting statistics | ||
338 | ====================== | ||
339 | |||
340 | The following table gives rcu lookup statistics for a few simple workloads | ||
341 | (2s12c24t Westmere, debian non-graphical system). Ungraceful are attempts to | ||
342 | drop rcu that fail due to d_seq failure and require the entire path lookup | ||
343 | again. Other cases are successful rcu-drops that are required before the final | ||
344 | element, nodentry for missing dentry, revalidate for filesystem revalidate | ||
345 | routine requiring rcu drop, permission for permission check requiring drop, | ||
346 | and link for symlink traversal requiring drop. | ||
347 | |||
348 | rcu-lookups restart nodentry link revalidate permission | ||
349 | bootup 47121 0 4624 1010 10283 7852 | ||
350 | dbench 25386793 0 6778659(26.7%) 55 549 1156 | ||
351 | kbuild 2696672 10 64442(2.3%) 108764(4.0%) 1 1590 | ||
352 | git diff 39605 0 28 2 0 106 | ||
353 | vfstest 24185492 4945 708725(2.9%) 1076136(4.4%) 0 2651 | ||
354 | |||
355 | What this shows is that failed rcu-walk lookups, ie. ones that are restarted | ||
356 | entirely with ref-walk, are quite rare. Even the "vfstest" case which | ||
357 | specifically has concurrent renames/mkdir/rmdir/creat/unlink/etc to exercise | ||
358 | such races is not showing a huge amount of restarts. | ||
359 | |||
360 | Dropping from rcu-walk to ref-walk means that we have encountered a dentry where | ||
361 | the reference count needs to be taken for some reason. This is either because | ||
362 | we have reached the target of the path walk, or because we have encountered a | ||
363 | condition that can't be resolved in rcu-walk mode. Ideally, we drop rcu-walk | ||
364 | only when we have reached the target dentry, so the other statistics show where | ||
365 | this does not happen. | ||
366 | |||
367 | Note that a graceful drop from rcu-walk mode due to something such as the | ||
368 | dentry not existing (which can be common) is not necessarily a failure of the | ||
369 | rcu-walk scheme, because some elements of the path may have been walked in | ||
370 | rcu-walk mode. The further we get from common path elements (such as cwd or | ||
371 | root), the less contended the dentry is likely to be. The closer we are to | ||
372 | common path elements, the more likely they will exist in dentry cache. | ||
373 | |||
374 | |||
375 | Papers and other documentation on dcache locking | ||
376 | ================================================ | ||
377 | |||
378 | 1. Scaling dcache with RCU (http://linuxjournal.com/article.php?sid=7124). | ||
379 | |||
380 | 2. http://lse.sourceforge.net/locking/dcache/dcache.html | ||
381 | |||
382 | |||
diff --git a/Documentation/filesystems/porting b/Documentation/filesystems/porting index b12c89538680..dfbcd1b00b0a 100644 --- a/Documentation/filesystems/porting +++ b/Documentation/filesystems/porting | |||
@@ -216,7 +216,6 @@ had ->revalidate()) add calls in ->follow_link()/->readlink(). | |||
216 | ->d_parent changes are not protected by BKL anymore. Read access is safe | 216 | ->d_parent changes are not protected by BKL anymore. Read access is safe |
217 | if at least one of the following is true: | 217 | if at least one of the following is true: |
218 | * filesystem has no cross-directory rename() | 218 | * filesystem has no cross-directory rename() |
219 | * dcache_lock is held | ||
220 | * we know that parent had been locked (e.g. we are looking at | 219 | * we know that parent had been locked (e.g. we are looking at |
221 | ->d_parent of ->lookup() argument). | 220 | ->d_parent of ->lookup() argument). |
222 | * we are called from ->rename(). | 221 | * we are called from ->rename(). |
@@ -318,3 +317,80 @@ if it's zero is not *and* *never* *had* *been* enough. Final unlink() and iput( | |||
318 | may happen while the inode is in the middle of ->write_inode(); e.g. if you blindly | 317 | may happen while the inode is in the middle of ->write_inode(); e.g. if you blindly |
319 | free the on-disk inode, you may end up doing that while ->write_inode() is writing | 318 | free the on-disk inode, you may end up doing that while ->write_inode() is writing |
320 | to it. | 319 | to it. |
320 | |||
321 | --- | ||
322 | [mandatory] | ||
323 | |||
324 | .d_delete() now only advises the dcache as to whether or not to cache | ||
325 | unreferenced dentries, and is now only called when the dentry refcount goes to | ||
326 | 0. Even on 0 refcount transition, it must be able to tolerate being called 0, | ||
327 | 1, or more times (eg. constant, idempotent). | ||
328 | |||
329 | --- | ||
330 | [mandatory] | ||
331 | |||
332 | .d_compare() calling convention and locking rules are significantly | ||
333 | changed. Read updated documentation in Documentation/filesystems/vfs.txt (and | ||
334 | look at examples of other filesystems) for guidance. | ||
335 | |||
336 | --- | ||
337 | [mandatory] | ||
338 | |||
339 | .d_hash() calling convention and locking rules are significantly | ||
340 | changed. Read updated documentation in Documentation/filesystems/vfs.txt (and | ||
341 | look at examples of other filesystems) for guidance. | ||
342 | |||
343 | --- | ||
344 | [mandatory] | ||
345 | dcache_lock is gone, replaced by fine grained locks. See fs/dcache.c | ||
346 | for details of what locks to replace dcache_lock with in order to protect | ||
347 | particular things. Most of the time, a filesystem only needs ->d_lock, which | ||
348 | protects *all* the dcache state of a given dentry. | ||
349 | |||
350 | -- | ||
351 | [mandatory] | ||
352 | |||
353 | Filesystems must RCU-free their inodes, if they can have been accessed | ||
354 | via rcu-walk path walk (basically, if the file can have had a path name in the | ||
355 | vfs namespace). | ||
356 | |||
357 | i_dentry and i_rcu share storage in a union, and the vfs expects | ||
358 | i_dentry to be reinitialized before it is freed, so an: | ||
359 | |||
360 | INIT_LIST_HEAD(&inode->i_dentry); | ||
361 | |||
362 | must be done in the RCU callback. | ||
363 | |||
364 | -- | ||
365 | [recommended] | ||
366 | vfs now tries to do path walking in "rcu-walk mode", which avoids | ||
367 | atomic operations and scalability hazards on dentries and inodes (see | ||
368 | Documentation/filesystems/path-lookup.txt). d_hash and d_compare changes | ||
369 | (above) are examples of the changes required to support this. For more complex | ||
370 | filesystem callbacks, the vfs drops out of rcu-walk mode before the fs call, so | ||
371 | no changes are required to the filesystem. However, this is costly and loses | ||
372 | the benefits of rcu-walk mode. We will begin to add filesystem callbacks that | ||
373 | are rcu-walk aware, shown below. Filesystems should take advantage of this | ||
374 | where possible. | ||
375 | |||
376 | -- | ||
377 | [mandatory] | ||
378 | d_revalidate is a callback that is made on every path element (if | ||
379 | the filesystem provides it), which requires dropping out of rcu-walk mode. This | ||
380 | may now be called in rcu-walk mode (nd->flags & LOOKUP_RCU). -ECHILD should be | ||
381 | returned if the filesystem cannot handle rcu-walk. See | ||
382 | Documentation/filesystems/vfs.txt for more details. | ||
383 | |||
384 | permission and check_acl are inode permission checks that are called | ||
385 | on many or all directory inodes on the way down a path walk (to check for | ||
386 | exec permission). These must now be rcu-walk aware (flags & IPERM_FLAG_RCU). | ||
387 | See Documentation/filesystems/vfs.txt for more details. | ||
388 | |||
389 | -- | ||
390 | [mandatory] | ||
391 | In ->fallocate() you must check the mode option passed in. If your | ||
392 | filesystem does not support hole punching (deallocating space in the middle of a | ||
393 | file) you must return -EOPNOTSUPP if FALLOC_FL_PUNCH_HOLE is set in mode. | ||
394 | Currently you can only have FALLOC_FL_PUNCH_HOLE with FALLOC_FL_KEEP_SIZE set, | ||
395 | so the i_size should not change when hole punching, even when punching the end of | ||
396 | a file off. | ||
diff --git a/Documentation/filesystems/proc.txt b/Documentation/filesystems/proc.txt index e73df2722ff3..23cae6548d3a 100644 --- a/Documentation/filesystems/proc.txt +++ b/Documentation/filesystems/proc.txt | |||
@@ -375,6 +375,7 @@ Anonymous: 0 kB | |||
375 | Swap: 0 kB | 375 | Swap: 0 kB |
376 | KernelPageSize: 4 kB | 376 | KernelPageSize: 4 kB |
377 | MMUPageSize: 4 kB | 377 | MMUPageSize: 4 kB |
378 | Locked: 374 kB | ||
378 | 379 | ||
379 | The first of these lines shows the same information as is displayed for the | 380 | The first of these lines shows the same information as is displayed for the |
380 | mapping in /proc/PID/maps. The remaining lines show the size of the mapping | 381 | mapping in /proc/PID/maps. The remaining lines show the size of the mapping |
@@ -670,6 +671,8 @@ varies by architecture and compile options. The following is from a | |||
670 | 671 | ||
671 | > cat /proc/meminfo | 672 | > cat /proc/meminfo |
672 | 673 | ||
674 | The "Locked" indicates whether the mapping is locked in memory or not. | ||
675 | |||
673 | 676 | ||
674 | MemTotal: 16344972 kB | 677 | MemTotal: 16344972 kB |
675 | MemFree: 13634064 kB | 678 | MemFree: 13634064 kB |
@@ -1181,6 +1184,30 @@ Table 1-12: Files in /proc/fs/ext4/<devname> | |||
1181 | mb_groups details of multiblock allocator buddy cache of free blocks | 1184 | mb_groups details of multiblock allocator buddy cache of free blocks |
1182 | .............................................................................. | 1185 | .............................................................................. |
1183 | 1186 | ||
1187 | 2.0 /proc/consoles | ||
1188 | ------------------ | ||
1189 | Shows registered system console lines. | ||
1190 | |||
1191 | To see which character device lines are currently used for the system console | ||
1192 | /dev/console, you may simply look into the file /proc/consoles: | ||
1193 | |||
1194 | > cat /proc/consoles | ||
1195 | tty0 -WU (ECp) 4:7 | ||
1196 | ttyS0 -W- (Ep) 4:64 | ||
1197 | |||
1198 | The columns are: | ||
1199 | |||
1200 | device name of the device | ||
1201 | operations R = can do read operations | ||
1202 | W = can do write operations | ||
1203 | U = can do unblank | ||
1204 | flags E = it is enabled | ||
1205 | C = it is the preferred console | ||
1206 | B = it is primary boot console | ||
1207 | p = it is used for printk buffer | ||
1208 | b = it is not a TTY but a Braille device | ||
1209 | a = it is safe to use when cpu is offline | ||
1210 | major:minor major and minor number of the device separated by a colon | ||
1184 | 1211 | ||
1185 | ------------------------------------------------------------------------------ | 1212 | ------------------------------------------------------------------------------ |
1186 | Summary | 1213 | Summary |
@@ -1296,6 +1323,10 @@ scaled linearly with /proc/<pid>/oom_score_adj. | |||
1296 | Writing to /proc/<pid>/oom_score_adj or /proc/<pid>/oom_adj will change the | 1323 | Writing to /proc/<pid>/oom_score_adj or /proc/<pid>/oom_adj will change the |
1297 | other with its scaled value. | 1324 | other with its scaled value. |
1298 | 1325 | ||
1326 | The value of /proc/<pid>/oom_score_adj may be reduced no lower than the last | ||
1327 | value set by a CAP_SYS_RESOURCE process. To reduce the value any lower | ||
1328 | requires CAP_SYS_RESOURCE. | ||
1329 | |||
1299 | NOTICE: /proc/<pid>/oom_adj is deprecated and will be removed, please see | 1330 | NOTICE: /proc/<pid>/oom_adj is deprecated and will be removed, please see |
1300 | Documentation/feature-removal-schedule.txt. | 1331 | Documentation/feature-removal-schedule.txt. |
1301 | 1332 | ||
diff --git a/Documentation/filesystems/vfs.txt b/Documentation/filesystems/vfs.txt index 20899e095e7e..cae6d27c9f5b 100644 --- a/Documentation/filesystems/vfs.txt +++ b/Documentation/filesystems/vfs.txt | |||
@@ -325,7 +325,8 @@ struct inode_operations { | |||
325 | void * (*follow_link) (struct dentry *, struct nameidata *); | 325 | void * (*follow_link) (struct dentry *, struct nameidata *); |
326 | void (*put_link) (struct dentry *, struct nameidata *, void *); | 326 | void (*put_link) (struct dentry *, struct nameidata *, void *); |
327 | void (*truncate) (struct inode *); | 327 | void (*truncate) (struct inode *); |
328 | int (*permission) (struct inode *, int, struct nameidata *); | 328 | int (*permission) (struct inode *, int, unsigned int); |
329 | int (*check_acl)(struct inode *, int, unsigned int); | ||
329 | int (*setattr) (struct dentry *, struct iattr *); | 330 | int (*setattr) (struct dentry *, struct iattr *); |
330 | int (*getattr) (struct vfsmount *mnt, struct dentry *, struct kstat *); | 331 | int (*getattr) (struct vfsmount *mnt, struct dentry *, struct kstat *); |
331 | int (*setxattr) (struct dentry *, const char *,const void *,size_t,int); | 332 | int (*setxattr) (struct dentry *, const char *,const void *,size_t,int); |
@@ -414,6 +415,13 @@ otherwise noted. | |||
414 | permission: called by the VFS to check for access rights on a POSIX-like | 415 | permission: called by the VFS to check for access rights on a POSIX-like |
415 | filesystem. | 416 | filesystem. |
416 | 417 | ||
418 | May be called in rcu-walk mode (flags & IPERM_FLAG_RCU). If in rcu-walk | ||
419 | mode, the filesystem must check the permission without blocking or | ||
420 | storing to the inode. | ||
421 | |||
422 | If a situation is encountered that rcu-walk cannot handle, return | ||
423 | -ECHILD and it will be called again in ref-walk mode. | ||
424 | |||
417 | setattr: called by the VFS to set attributes for a file. This method | 425 | setattr: called by the VFS to set attributes for a file. This method |
418 | is called by chmod(2) and related system calls. | 426 | is called by chmod(2) and related system calls. |
419 | 427 | ||
@@ -847,9 +855,12 @@ defined: | |||
847 | 855 | ||
848 | struct dentry_operations { | 856 | struct dentry_operations { |
849 | int (*d_revalidate)(struct dentry *, struct nameidata *); | 857 | int (*d_revalidate)(struct dentry *, struct nameidata *); |
850 | int (*d_hash) (struct dentry *, struct qstr *); | 858 | int (*d_hash)(const struct dentry *, const struct inode *, |
851 | int (*d_compare) (struct dentry *, struct qstr *, struct qstr *); | 859 | struct qstr *); |
852 | int (*d_delete)(struct dentry *); | 860 | int (*d_compare)(const struct dentry *, const struct inode *, |
861 | const struct dentry *, const struct inode *, | ||
862 | unsigned int, const char *, const struct qstr *); | ||
863 | int (*d_delete)(const struct dentry *); | ||
853 | void (*d_release)(struct dentry *); | 864 | void (*d_release)(struct dentry *); |
854 | void (*d_iput)(struct dentry *, struct inode *); | 865 | void (*d_iput)(struct dentry *, struct inode *); |
855 | char *(*d_dname)(struct dentry *, char *, int); | 866 | char *(*d_dname)(struct dentry *, char *, int); |
@@ -860,13 +871,45 @@ struct dentry_operations { | |||
860 | dcache. Most filesystems leave this as NULL, because all their | 871 | dcache. Most filesystems leave this as NULL, because all their |
861 | dentries in the dcache are valid | 872 | dentries in the dcache are valid |
862 | 873 | ||
863 | d_hash: called when the VFS adds a dentry to the hash table | 874 | d_revalidate may be called in rcu-walk mode (nd->flags & LOOKUP_RCU). |
875 | If in rcu-walk mode, the filesystem must revalidate the dentry without | ||
876 | blocking or storing to the dentry. d_parent and d_inode should not be | ||
877 | used without care (because they can go NULL); instead nd->inode should | ||
878 | be used. | ||
879 | |||
880 | If a situation is encountered that rcu-walk cannot handle, return | ||
881 | -ECHILD and it will be called again in ref-walk mode. | ||
882 | |||
883 | d_hash: called when the VFS adds a dentry to the hash table. The first | ||
884 | dentry passed to d_hash is the parent directory that the name is | ||
885 | to be hashed into. The inode is the dentry's inode. | ||
886 | |||
887 | Same locking and synchronisation rules as d_compare regarding | ||
888 | what is safe to dereference etc. | ||
889 | |||
890 | d_compare: called to compare a dentry name with a given name. The first | ||
891 | dentry is the parent of the dentry to be compared, the second is | ||
892 | the parent's inode, then the dentry and inode (may be NULL) of the | ||
893 | child dentry. len and name string are properties of the dentry to be | ||
894 | compared. qstr is the name to compare it with. | ||
895 | |||
896 | Must be constant and idempotent, and should not take locks if | ||
897 | possible, and should not store into the dentry or inodes. | ||
898 | Should not dereference pointers outside the dentry or inodes without | ||
899 | lots of care (eg. d_parent, d_inode, d_name should not be used). | ||
900 | |||
901 | However, our vfsmount is pinned, and RCU held, so the dentries and | ||
902 | inodes won't disappear, neither will our sb or filesystem module. | ||
903 | ->i_sb and ->d_sb may be used. | ||
864 | 904 | ||
865 | d_compare: called when a dentry should be compared with another | 905 | It is a tricky calling convention because it needs to be called under |
906 | "rcu-walk", ie. without any locks or references on things. | ||
866 | 907 | ||
867 | d_delete: called when the last reference to a dentry is | 908 | d_delete: called when the last reference to a dentry is dropped and the |
868 | deleted. This means no-one is using the dentry, however it is | 909 | dcache is deciding whether or not to cache it. Return 1 to delete |
869 | still valid and in the dcache | 910 | immediately, or 0 to cache the dentry. Default is NULL which means to |
911 | always cache a reachable dentry. d_delete must be constant and | ||
912 | idempotent. | ||
870 | 913 | ||
871 | d_release: called when a dentry is really deallocated | 914 | d_release: called when a dentry is really deallocated |
872 | 915 | ||
@@ -910,14 +953,11 @@ manipulate dentries: | |||
910 | the usage count) | 953 | the usage count) |
911 | 954 | ||
912 | dput: close a handle for a dentry (decrements the usage count). If | 955 | dput: close a handle for a dentry (decrements the usage count). If |
913 | the usage count drops to 0, the "d_delete" method is called | 956 | the usage count drops to 0, and the dentry is still in its |
914 | and the dentry is placed on the unused list if the dentry is | 957 | parent's hash, the "d_delete" method is called to check whether |
915 | still in its parents hash list. Putting the dentry on the | 958 | it should be cached. If it should not be cached, or if the dentry |
916 | unused list just means that if the system needs some RAM, it | 959 | is not hashed, it is deleted. Otherwise cached dentries are put |
917 | goes through the unused list of dentries and deallocates them. | 960 | into an LRU list to be reclaimed on memory shortage. |
918 | If the dentry has already been unhashed and the usage count | ||
919 | drops to 0, in this case the dentry is deallocated after the | ||
920 | "d_delete" method is called | ||
921 | 961 | ||
922 | d_drop: this unhashes a dentry from its parents hash list. A | 962 | d_drop: this unhashes a dentry from its parents hash list. A |
923 | subsequent call to dput() will deallocate the dentry if its | 963 | subsequent call to dput() will deallocate the dentry if its |