Diffstat (limited to 'fs/notify')
 fs/notify/Kconfig                    |    1 
 fs/notify/Makefile                   |    4 
 fs/notify/dnotify/dnotify.c          |  213 
 fs/notify/fanotify/Kconfig           |   26 
 fs/notify/fanotify/Makefile          |    1 
 fs/notify/fanotify/fanotify.c        |  209 
 fs/notify/fanotify/fanotify_user.c   |  787 
 fs/notify/fsnotify.c                 |  205 
 fs/notify/fsnotify.h                 |   27 
 fs/notify/group.c                    |  182 
 fs/notify/inode_mark.c               |  337 
 fs/notify/inotify/Kconfig            |   15 
 fs/notify/inotify/Makefile           |    1 
 fs/notify/inotify/inotify.c          |  873 
 fs/notify/inotify/inotify.h          |    7 
 fs/notify/inotify/inotify_fsnotify.c |  151 
 fs/notify/inotify/inotify_user.c     |  369 
 fs/notify/mark.c                     |  371 
 fs/notify/notification.c             |  209 
 fs/notify/vfsmount_mark.c            |  187 
 20 files changed, 2452 insertions(+), 1723 deletions(-)
diff --git a/fs/notify/Kconfig b/fs/notify/Kconfig
index dffbb0911d02..22c629eedd82 100644
--- a/fs/notify/Kconfig
+++ b/fs/notify/Kconfig
@@ -3,3 +3,4 @@ config FSNOTIFY
 
 source "fs/notify/dnotify/Kconfig"
 source "fs/notify/inotify/Kconfig"
+source "fs/notify/fanotify/Kconfig"
diff --git a/fs/notify/Makefile b/fs/notify/Makefile
index 0922cc826c46..ae5f33a6d868 100644
--- a/fs/notify/Makefile
+++ b/fs/notify/Makefile
@@ -1,4 +1,6 @@
-obj-$(CONFIG_FSNOTIFY) += fsnotify.o notification.o group.o inode_mark.o
+obj-$(CONFIG_FSNOTIFY) += fsnotify.o notification.o group.o inode_mark.o \
+			   mark.o vfsmount_mark.o
 
 obj-y += dnotify/
 obj-y += inotify/
+obj-y += fanotify/
diff --git a/fs/notify/dnotify/dnotify.c b/fs/notify/dnotify/dnotify.c
index 7e54e52964dd..3344bdd5506e 100644
--- a/fs/notify/dnotify/dnotify.c
+++ b/fs/notify/dnotify/dnotify.c
@@ -29,17 +29,17 @@
 int dir_notify_enable __read_mostly = 1;
 
 static struct kmem_cache *dnotify_struct_cache __read_mostly;
-static struct kmem_cache *dnotify_mark_entry_cache __read_mostly;
+static struct kmem_cache *dnotify_mark_cache __read_mostly;
 static struct fsnotify_group *dnotify_group __read_mostly;
 static DEFINE_MUTEX(dnotify_mark_mutex);
 
 /*
- * dnotify will attach one of these to each inode (i_fsnotify_mark_entries) which
+ * dnotify will attach one of these to each inode (i_fsnotify_marks) which
  * is being watched by dnotify. If multiple userspace applications are watching
  * the same directory with dnotify their information is chained in dn
  */
-struct dnotify_mark_entry {
-	struct fsnotify_mark_entry fsn_entry;
+struct dnotify_mark {
+	struct fsnotify_mark fsn_mark;
 	struct dnotify_struct *dn;
 };
 
@@ -51,27 +51,27 @@ struct dnotify_mark_entry {
  * it calls the fsnotify function so it can update the set of all events relevant
  * to this inode.
  */
-static void dnotify_recalc_inode_mask(struct fsnotify_mark_entry *entry)
+static void dnotify_recalc_inode_mask(struct fsnotify_mark *fsn_mark)
 {
 	__u32 new_mask, old_mask;
 	struct dnotify_struct *dn;
-	struct dnotify_mark_entry *dnentry = container_of(entry,
-							  struct dnotify_mark_entry,
-							  fsn_entry);
+	struct dnotify_mark *dn_mark = container_of(fsn_mark,
+						    struct dnotify_mark,
+						    fsn_mark);
 
-	assert_spin_locked(&entry->lock);
+	assert_spin_locked(&fsn_mark->lock);
 
-	old_mask = entry->mask;
+	old_mask = fsn_mark->mask;
 	new_mask = 0;
-	for (dn = dnentry->dn; dn != NULL; dn = dn->dn_next)
+	for (dn = dn_mark->dn; dn != NULL; dn = dn->dn_next)
 		new_mask |= (dn->dn_mask & ~FS_DN_MULTISHOT);
-	entry->mask = new_mask;
+	fsnotify_set_mark_mask_locked(fsn_mark, new_mask);
 
 	if (old_mask == new_mask)
 		return;
 
-	if (entry->inode)
-		fsnotify_recalc_inode_mask(entry->inode);
+	if (fsn_mark->i.inode)
+		fsnotify_recalc_inode_mask(fsn_mark->i.inode);
 }
 
 /*
@@ -83,29 +83,25 @@ static void dnotify_recalc_inode_mask(struct fsnotify_mark_entry *entry)
  * events.
  */
 static int dnotify_handle_event(struct fsnotify_group *group,
+				struct fsnotify_mark *inode_mark,
+				struct fsnotify_mark *vfsmount_mark,
 				struct fsnotify_event *event)
 {
-	struct fsnotify_mark_entry *entry = NULL;
-	struct dnotify_mark_entry *dnentry;
+	struct dnotify_mark *dn_mark;
 	struct inode *to_tell;
 	struct dnotify_struct *dn;
 	struct dnotify_struct **prev;
 	struct fown_struct *fown;
 	__u32 test_mask = event->mask & ~FS_EVENT_ON_CHILD;
 
-	to_tell = event->to_tell;
+	BUG_ON(vfsmount_mark);
 
-	spin_lock(&to_tell->i_lock);
-	entry = fsnotify_find_mark_entry(group, to_tell);
-	spin_unlock(&to_tell->i_lock);
+	to_tell = event->to_tell;
 
-	/* unlikely since we alreay passed dnotify_should_send_event() */
-	if (unlikely(!entry))
-		return 0;
-	dnentry = container_of(entry, struct dnotify_mark_entry, fsn_entry);
+	dn_mark = container_of(inode_mark, struct dnotify_mark, fsn_mark);
 
-	spin_lock(&entry->lock);
-	prev = &dnentry->dn;
+	spin_lock(&inode_mark->lock);
+	prev = &dn_mark->dn;
 	while ((dn = *prev) != NULL) {
 		if ((dn->dn_mask & test_mask) == 0) {
 			prev = &dn->dn_next;
@@ -118,12 +114,11 @@ static int dnotify_handle_event(struct fsnotify_group *group,
 		else {
 			*prev = dn->dn_next;
 			kmem_cache_free(dnotify_struct_cache, dn);
-			dnotify_recalc_inode_mask(entry);
+			dnotify_recalc_inode_mask(inode_mark);
 		}
 	}
 
-	spin_unlock(&entry->lock);
-	fsnotify_put_mark(entry);
+	spin_unlock(&inode_mark->lock);
 
 	return 0;
 }
@@ -133,44 +128,27 @@ static int dnotify_handle_event(struct fsnotify_group *group,
  * userspace notification for that pair.
  */
 static bool dnotify_should_send_event(struct fsnotify_group *group,
-				      struct inode *inode, __u32 mask)
+				      struct inode *inode,
+				      struct fsnotify_mark *inode_mark,
+				      struct fsnotify_mark *vfsmount_mark,
+				      __u32 mask, void *data, int data_type)
 {
-	struct fsnotify_mark_entry *entry;
-	bool send;
-
-	/* !dir_notify_enable should never get here, don't waste time checking
-	if (!dir_notify_enable)
-		return 0; */
-
 	/* not a dir, dnotify doesn't care */
 	if (!S_ISDIR(inode->i_mode))
 		return false;
 
-	spin_lock(&inode->i_lock);
-	entry = fsnotify_find_mark_entry(group, inode);
-	spin_unlock(&inode->i_lock);
-
-	/* no mark means no dnotify watch */
-	if (!entry)
-		return false;
-
-	mask = (mask & ~FS_EVENT_ON_CHILD);
-	send = (mask & entry->mask);
-
-	fsnotify_put_mark(entry); /* matches fsnotify_find_mark_entry */
-
-	return send;
+	return true;
 }
 
-static void dnotify_free_mark(struct fsnotify_mark_entry *entry)
+static void dnotify_free_mark(struct fsnotify_mark *fsn_mark)
 {
-	struct dnotify_mark_entry *dnentry = container_of(entry,
-							  struct dnotify_mark_entry,
-							  fsn_entry);
+	struct dnotify_mark *dn_mark = container_of(fsn_mark,
+						    struct dnotify_mark,
+						    fsn_mark);
 
-	BUG_ON(dnentry->dn);
+	BUG_ON(dn_mark->dn);
 
-	kmem_cache_free(dnotify_mark_entry_cache, dnentry);
+	kmem_cache_free(dnotify_mark_cache, dn_mark);
 }
 
 static struct fsnotify_ops dnotify_fsnotify_ops = {
@@ -183,15 +161,15 @@ static struct fsnotify_ops dnotify_fsnotify_ops = {
 
 /*
  * Called every time a file is closed. Looks first for a dnotify mark on the
- * inode. If one is found run all of the ->dn entries attached to that
+ * inode. If one is found run all of the ->dn structures attached to that
  * mark for one relevant to this process closing the file and remove that
  * dnotify_struct. If that was the last dnotify_struct also remove the
- * fsnotify_mark_entry.
+ * fsnotify_mark.
  */
 void dnotify_flush(struct file *filp, fl_owner_t id)
 {
-	struct fsnotify_mark_entry *entry;
-	struct dnotify_mark_entry *dnentry;
+	struct fsnotify_mark *fsn_mark;
+	struct dnotify_mark *dn_mark;
 	struct dnotify_struct *dn;
 	struct dnotify_struct **prev;
 	struct inode *inode;
@@ -200,38 +178,34 @@ void dnotify_flush(struct file *filp, fl_owner_t id)
 	if (!S_ISDIR(inode->i_mode))
 		return;
 
-	spin_lock(&inode->i_lock);
-	entry = fsnotify_find_mark_entry(dnotify_group, inode);
-	spin_unlock(&inode->i_lock);
-	if (!entry)
+	fsn_mark = fsnotify_find_inode_mark(dnotify_group, inode);
+	if (!fsn_mark)
 		return;
-	dnentry = container_of(entry, struct dnotify_mark_entry, fsn_entry);
+	dn_mark = container_of(fsn_mark, struct dnotify_mark, fsn_mark);
 
 	mutex_lock(&dnotify_mark_mutex);
 
-	spin_lock(&entry->lock);
-	prev = &dnentry->dn;
+	spin_lock(&fsn_mark->lock);
+	prev = &dn_mark->dn;
 	while ((dn = *prev) != NULL) {
 		if ((dn->dn_owner == id) && (dn->dn_filp == filp)) {
 			*prev = dn->dn_next;
 			kmem_cache_free(dnotify_struct_cache, dn);
-			dnotify_recalc_inode_mask(entry);
+			dnotify_recalc_inode_mask(fsn_mark);
 			break;
 		}
 		prev = &dn->dn_next;
 	}
 
-	spin_unlock(&entry->lock);
+	spin_unlock(&fsn_mark->lock);
 
 	/* nothing else could have found us thanks to the dnotify_mark_mutex */
-	if (dnentry->dn == NULL)
-		fsnotify_destroy_mark_by_entry(entry);
-
-	fsnotify_recalc_group_mask(dnotify_group);
+	if (dn_mark->dn == NULL)
+		fsnotify_destroy_mark(fsn_mark);
 
 	mutex_unlock(&dnotify_mark_mutex);
 
-	fsnotify_put_mark(entry);
+	fsnotify_put_mark(fsn_mark);
 }
 
 /* this conversion is done only at watch creation */
@@ -259,16 +233,16 @@ static __u32 convert_arg(unsigned long arg)
 
 /*
  * If multiple processes watch the same inode with dnotify there is only one
- * dnotify mark in inode->i_fsnotify_mark_entries but we chain a dnotify_struct
+ * dnotify mark in inode->i_fsnotify_marks but we chain a dnotify_struct
  * onto that mark. This function either attaches the new dnotify_struct onto
  * that list, or it |= the mask onto an existing dnofiy_struct.
  */
-static int attach_dn(struct dnotify_struct *dn, struct dnotify_mark_entry *dnentry,
+static int attach_dn(struct dnotify_struct *dn, struct dnotify_mark *dn_mark,
 		     fl_owner_t id, int fd, struct file *filp, __u32 mask)
 {
 	struct dnotify_struct *odn;
 
-	odn = dnentry->dn;
+	odn = dn_mark->dn;
 	while (odn != NULL) {
 		/* adding more events to existing dnofiy_struct? */
 		if ((odn->dn_owner == id) && (odn->dn_filp == filp)) {
@@ -283,8 +257,8 @@ static int attach_dn(struct dnotify_struct *dn, struct dnotify_mark_entry *dnent
 	dn->dn_fd = fd;
 	dn->dn_filp = filp;
 	dn->dn_owner = id;
-	dn->dn_next = dnentry->dn;
-	dnentry->dn = dn;
+	dn->dn_next = dn_mark->dn;
+	dn_mark->dn = dn;
 
 	return 0;
 }
@@ -296,8 +270,8 @@ static int attach_dn(struct dnotify_struct *dn, struct dnotify_mark_entry *dnent
  */
 int fcntl_dirnotify(int fd, struct file *filp, unsigned long arg)
 {
-	struct dnotify_mark_entry *new_dnentry, *dnentry;
-	struct fsnotify_mark_entry *new_entry, *entry;
+	struct dnotify_mark *new_dn_mark, *dn_mark;
+	struct fsnotify_mark *new_fsn_mark, *fsn_mark;
 	struct dnotify_struct *dn;
 	struct inode *inode;
 	fl_owner_t id = current->files;
@@ -306,7 +280,7 @@ int fcntl_dirnotify(int fd, struct file *filp, unsigned long arg)
 	__u32 mask;
 
 	/* we use these to tell if we need to kfree */
-	new_entry = NULL;
+	new_fsn_mark = NULL;
 	dn = NULL;
 
 	if (!dir_notify_enable) {
@@ -336,8 +310,8 @@ int fcntl_dirnotify(int fd, struct file *filp, unsigned long arg)
 	}
 
 	/* new fsnotify mark, we expect most fcntl calls to add a new mark */
-	new_dnentry = kmem_cache_alloc(dnotify_mark_entry_cache, GFP_KERNEL);
-	if (!new_dnentry) {
+	new_dn_mark = kmem_cache_alloc(dnotify_mark_cache, GFP_KERNEL);
+	if (!new_dn_mark) {
 		error = -ENOMEM;
 		goto out_err;
 	}
@@ -345,29 +319,27 @@ int fcntl_dirnotify(int fd, struct file *filp, unsigned long arg)
 	/* convert the userspace DN_* "arg" to the internal FS_* defines in fsnotify */
 	mask = convert_arg(arg);
 
-	/* set up the new_entry and new_dnentry */
-	new_entry = &new_dnentry->fsn_entry;
-	fsnotify_init_mark(new_entry, dnotify_free_mark);
-	new_entry->mask = mask;
-	new_dnentry->dn = NULL;
+	/* set up the new_fsn_mark and new_dn_mark */
+	new_fsn_mark = &new_dn_mark->fsn_mark;
+	fsnotify_init_mark(new_fsn_mark, dnotify_free_mark);
+	new_fsn_mark->mask = mask;
+	new_dn_mark->dn = NULL;
 
 	/* this is needed to prevent the fcntl/close race described below */
 	mutex_lock(&dnotify_mark_mutex);
 
-	/* add the new_entry or find an old one. */
-	spin_lock(&inode->i_lock);
-	entry = fsnotify_find_mark_entry(dnotify_group, inode);
-	spin_unlock(&inode->i_lock);
-	if (entry) {
-		dnentry = container_of(entry, struct dnotify_mark_entry, fsn_entry);
-		spin_lock(&entry->lock);
+	/* add the new_fsn_mark or find an old one. */
+	fsn_mark = fsnotify_find_inode_mark(dnotify_group, inode);
+	if (fsn_mark) {
+		dn_mark = container_of(fsn_mark, struct dnotify_mark, fsn_mark);
+		spin_lock(&fsn_mark->lock);
 	} else {
-		fsnotify_add_mark(new_entry, dnotify_group, inode);
-		spin_lock(&new_entry->lock);
-		entry = new_entry;
-		dnentry = new_dnentry;
-		/* we used new_entry, so don't free it */
-		new_entry = NULL;
+		fsnotify_add_mark(new_fsn_mark, dnotify_group, inode, NULL, 0);
+		spin_lock(&new_fsn_mark->lock);
+		fsn_mark = new_fsn_mark;
+		dn_mark = new_dn_mark;
+		/* we used new_fsn_mark, so don't free it */
+		new_fsn_mark = NULL;
 	}
 
 	rcu_read_lock();
@@ -376,17 +348,17 @@ int fcntl_dirnotify(int fd, struct file *filp, unsigned long arg)
 
 	/* if (f != filp) means that we lost a race and another task/thread
 	 * actually closed the fd we are still playing with before we grabbed
-	 * the dnotify_mark_mutex and entry->lock. Since closing the fd is the
-	 * only time we clean up the mark entries we need to get our mark off
+	 * the dnotify_mark_mutex and fsn_mark->lock. Since closing the fd is the
+	 * only time we clean up the marks we need to get our mark off
 	 * the list. */
 	if (f != filp) {
 		/* if we added ourselves, shoot ourselves, it's possible that
-		 * the flush actually did shoot this entry. That's fine too
+		 * the flush actually did shoot this fsn_mark. That's fine too
 		 * since multiple calls to destroy_mark is perfectly safe, if
-		 * we found a dnentry already attached to the inode, just sod
+		 * we found a dn_mark already attached to the inode, just sod
 		 * off silently as the flush at close time dealt with it.
 		 */
-		if (dnentry == new_dnentry)
+		if (dn_mark == new_dn_mark)
 			destroy = 1;
 		goto out;
 	}
@@ -394,13 +366,13 @@ int fcntl_dirnotify(int fd, struct file *filp, unsigned long arg)
 	error = __f_setown(filp, task_pid(current), PIDTYPE_PID, 0);
 	if (error) {
 		/* if we added, we must shoot */
-		if (dnentry == new_dnentry)
+		if (dn_mark == new_dn_mark)
 			destroy = 1;
 		goto out;
 	}
 
-	error = attach_dn(dn, dnentry, id, fd, filp, mask);
-	/* !error means that we attached the dn to the dnentry, so don't free it */
+	error = attach_dn(dn, dn_mark, id, fd, filp, mask);
+	/* !error means that we attached the dn to the dn_mark, so don't free it */
 	if (!error)
 		dn = NULL;
 	/* -EEXIST means that we didn't add this new dn and used an old one.
@@ -408,20 +380,18 @@ int fcntl_dirnotify(int fd, struct file *filp, unsigned long arg)
 	else if (error == -EEXIST)
 		error = 0;
 
-	dnotify_recalc_inode_mask(entry);
+	dnotify_recalc_inode_mask(fsn_mark);
 out:
-	spin_unlock(&entry->lock);
+	spin_unlock(&fsn_mark->lock);
 
 	if (destroy)
-		fsnotify_destroy_mark_by_entry(entry);
-
-	fsnotify_recalc_group_mask(dnotify_group);
+		fsnotify_destroy_mark(fsn_mark);
 
 	mutex_unlock(&dnotify_mark_mutex);
-	fsnotify_put_mark(entry);
+	fsnotify_put_mark(fsn_mark);
 out_err:
-	if (new_entry)
-		fsnotify_put_mark(new_entry);
+	if (new_fsn_mark)
+		fsnotify_put_mark(new_fsn_mark);
 	if (dn)
 		kmem_cache_free(dnotify_struct_cache, dn);
 	return error;
@@ -430,10 +400,9 @@ out_err:
 static int __init dnotify_init(void)
 {
 	dnotify_struct_cache = KMEM_CACHE(dnotify_struct, SLAB_PANIC);
-	dnotify_mark_entry_cache = KMEM_CACHE(dnotify_mark_entry, SLAB_PANIC);
+	dnotify_mark_cache = KMEM_CACHE(dnotify_mark, SLAB_PANIC);
 
-	dnotify_group = fsnotify_obtain_group(DNOTIFY_GROUP_NUM,
-					      0, &dnotify_fsnotify_ops);
+	dnotify_group = fsnotify_alloc_group(&dnotify_fsnotify_ops);
 	if (IS_ERR(dnotify_group))
 		panic("unable to allocate fsnotify group for dnotify\n");
 	return 0;
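
For context, fcntl_dirnotify() above is the kernel side of the userspace F_NOTIFY fcntl. A minimal consumer looks like the hypothetical sketch below (not part of this patch): the F_NOTIFY call is what lands in fcntl_dirnotify(), and the DN_* bits are what convert_arg() translates into FS_* flags.

/* Hypothetical dnotify consumer sketch; F_NOTIFY, F_SETSIG and the
 * DN_* flags are the long-standing glibc/kernel interface. */
#define _GNU_SOURCE
#include <fcntl.h>
#include <signal.h>
#include <stdio.h>
#include <unistd.h>

static volatile sig_atomic_t got_event;

static void on_dnotify(int sig)
{
	(void)sig;
	got_event = 1;
}

int main(void)
{
	int dirfd = open(".", O_RDONLY | O_DIRECTORY);

	if (dirfd < 0)
		return 1;
	signal(SIGRTMIN, on_dnotify);
	/* deliver a real-time signal instead of the default SIGIO */
	fcntl(dirfd, F_SETSIG, SIGRTMIN);
	/* DN_MULTISHOT keeps the watch armed after the first event */
	fcntl(dirfd, F_NOTIFY, DN_CREATE | DN_MODIFY | DN_MULTISHOT);

	while (!got_event)
		pause();
	printf("directory changed\n");
	close(dirfd);
	return 0;
}
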
diff --git a/fs/notify/fanotify/Kconfig b/fs/notify/fanotify/Kconfig
new file mode 100644
index 000000000000..3ac36b7bf6b9
--- /dev/null
+++ b/fs/notify/fanotify/Kconfig
@@ -0,0 +1,26 @@
+config FANOTIFY
+	bool "Filesystem wide access notification"
+	select FSNOTIFY
+	select ANON_INODES
+	default n
+	---help---
+	   Say Y here to enable fanotify support.  fanotify is a file access
+	   notification system which differs from inotify in that it sends
+	   an open file descriptor to the userspace listener along with
+	   the event.
+
+	   If unsure, say Y.
+
+config FANOTIFY_ACCESS_PERMISSIONS
+	bool "fanotify permissions checking"
+	depends on FANOTIFY
+	depends on SECURITY
+	default n
+	---help---
+	   Say Y here if you want fanotify listeners to be able to make permissions
+	   decisions concerning filesystem events.  This is used by some fanotify
+	   listeners which need to scan files before allowing the system access to
+	   use those files.  This is used by some anti-malware vendors and by some
+	   hierarchical storage management systems.
+
+	   If unsure, say N.
diff --git a/fs/notify/fanotify/Makefile b/fs/notify/fanotify/Makefile
new file mode 100644
index 000000000000..0999213e7e6e
--- /dev/null
+++ b/fs/notify/fanotify/Makefile
@@ -0,0 +1 @@
+obj-$(CONFIG_FANOTIFY) += fanotify.o fanotify_user.o
diff --git a/fs/notify/fanotify/fanotify.c b/fs/notify/fanotify/fanotify.c
new file mode 100644
index 000000000000..85366c78cc37
--- /dev/null
+++ b/fs/notify/fanotify/fanotify.c
@@ -0,0 +1,209 @@
+#include <linux/fanotify.h>
+#include <linux/fdtable.h>
+#include <linux/fsnotify_backend.h>
+#include <linux/init.h>
+#include <linux/jiffies.h>
+#include <linux/kernel.h> /* UINT_MAX */
+#include <linux/mount.h>
+#include <linux/sched.h>
+#include <linux/types.h>
+#include <linux/wait.h>
+
+static bool should_merge(struct fsnotify_event *old, struct fsnotify_event *new)
+{
+	pr_debug("%s: old=%p new=%p\n", __func__, old, new);
+
+	if (old->to_tell == new->to_tell &&
+	    old->data_type == new->data_type &&
+	    old->tgid == new->tgid) {
+		switch (old->data_type) {
+		case (FSNOTIFY_EVENT_PATH):
+			if ((old->path.mnt == new->path.mnt) &&
+			    (old->path.dentry == new->path.dentry))
+				return true;
+		case (FSNOTIFY_EVENT_NONE):
+			return true;
+		default:
+			BUG();
+		};
+	}
+	return false;
+}
+
+/* and the list better be locked by something too! */
+static struct fsnotify_event *fanotify_merge(struct list_head *list,
+					     struct fsnotify_event *event)
+{
+	struct fsnotify_event_holder *test_holder;
+	struct fsnotify_event *test_event = NULL;
+	struct fsnotify_event *new_event;
+
+	pr_debug("%s: list=%p event=%p\n", __func__, list, event);
+
+
+	list_for_each_entry_reverse(test_holder, list, event_list) {
+		if (should_merge(test_holder->event, event)) {
+			test_event = test_holder->event;
+			break;
+		}
+	}
+
+	if (!test_event)
+		return NULL;
+
+	fsnotify_get_event(test_event);
+
+	/* if they are exactly the same we are done */
+	if (test_event->mask == event->mask)
+		return test_event;
+
+	/*
+	 * if the refcnt == 2 this is the only queue
+	 * for this event and so we can update the mask
+	 * in place.
+	 */
+	if (atomic_read(&test_event->refcnt) == 2) {
+		test_event->mask |= event->mask;
+		return test_event;
+	}
+
+	new_event = fsnotify_clone_event(test_event);
+
+	/* done with test_event */
+	fsnotify_put_event(test_event);
+
+	/* couldn't allocate memory, merge was not possible */
+	if (unlikely(!new_event))
+		return ERR_PTR(-ENOMEM);
+
+	/* build new event and replace it on the list */
+	new_event->mask = (test_event->mask | event->mask);
+	fsnotify_replace_event(test_holder, new_event);
+
+	/* we hold a reference on new_event from clone_event */
+	return new_event;
+}
+
+#ifdef CONFIG_FANOTIFY_ACCESS_PERMISSIONS
+static int fanotify_get_response_from_access(struct fsnotify_group *group,
+					     struct fsnotify_event *event)
+{
+	int ret;
+
+	pr_debug("%s: group=%p event=%p\n", __func__, group, event);
+
+	wait_event(group->fanotify_data.access_waitq, event->response);
+
+	/* userspace responded, convert to something usable */
+	spin_lock(&event->lock);
+	switch (event->response) {
+	case FAN_ALLOW:
+		ret = 0;
+		break;
+	case FAN_DENY:
+	default:
+		ret = -EPERM;
+	}
+	event->response = 0;
+	spin_unlock(&event->lock);
+
+	pr_debug("%s: group=%p event=%p about to return ret=%d\n", __func__,
+		 group, event, ret);
+
+	return ret;
+}
+#endif
+
+static int fanotify_handle_event(struct fsnotify_group *group,
+				 struct fsnotify_mark *inode_mark,
+				 struct fsnotify_mark *fanotify_mark,
+				 struct fsnotify_event *event)
+{
+	int ret = 0;
+	struct fsnotify_event *notify_event = NULL;
+
+	BUILD_BUG_ON(FAN_ACCESS != FS_ACCESS);
+	BUILD_BUG_ON(FAN_MODIFY != FS_MODIFY);
+	BUILD_BUG_ON(FAN_CLOSE_NOWRITE != FS_CLOSE_NOWRITE);
+	BUILD_BUG_ON(FAN_CLOSE_WRITE != FS_CLOSE_WRITE);
+	BUILD_BUG_ON(FAN_OPEN != FS_OPEN);
+	BUILD_BUG_ON(FAN_EVENT_ON_CHILD != FS_EVENT_ON_CHILD);
+	BUILD_BUG_ON(FAN_Q_OVERFLOW != FS_Q_OVERFLOW);
+	BUILD_BUG_ON(FAN_OPEN_PERM != FS_OPEN_PERM);
+	BUILD_BUG_ON(FAN_ACCESS_PERM != FS_ACCESS_PERM);
+
+	pr_debug("%s: group=%p event=%p\n", __func__, group, event);
+
+	notify_event = fsnotify_add_notify_event(group, event, NULL, fanotify_merge);
+	if (IS_ERR(notify_event))
+		return PTR_ERR(notify_event);
+
+#ifdef CONFIG_FANOTIFY_ACCESS_PERMISSIONS
+	if (event->mask & FAN_ALL_PERM_EVENTS) {
+		/* if we merged we need to wait on the new event */
+		if (notify_event)
+			event = notify_event;
+		ret = fanotify_get_response_from_access(group, event);
+	}
+#endif
+
+	if (notify_event)
+		fsnotify_put_event(notify_event);
+
+	return ret;
+}
+
+static bool fanotify_should_send_event(struct fsnotify_group *group,
+				       struct inode *to_tell,
+				       struct fsnotify_mark *inode_mark,
+				       struct fsnotify_mark *vfsmnt_mark,
+				       __u32 event_mask, void *data, int data_type)
+{
+	__u32 marks_mask, marks_ignored_mask;
+
+	pr_debug("%s: group=%p to_tell=%p inode_mark=%p vfsmnt_mark=%p "
+		 "mask=%x data=%p data_type=%d\n", __func__, group, to_tell,
+		 inode_mark, vfsmnt_mark, event_mask, data, data_type);
+
+	/* sorry, fanotify only gives a damn about files and dirs */
+	if (!S_ISREG(to_tell->i_mode) &&
+	    !S_ISDIR(to_tell->i_mode))
+		return false;
+
+	/* if we don't have enough info to send an event to userspace say no */
+	if (data_type != FSNOTIFY_EVENT_PATH)
+		return false;
+
+	if (inode_mark && vfsmnt_mark) {
+		marks_mask = (vfsmnt_mark->mask | inode_mark->mask);
+		marks_ignored_mask = (vfsmnt_mark->ignored_mask | inode_mark->ignored_mask);
+	} else if (inode_mark) {
+		/*
+		 * if the event is for a child and this inode doesn't care about
+		 * events on the child, don't send it!
+		 */
+		if ((event_mask & FS_EVENT_ON_CHILD) &&
+		    !(inode_mark->mask & FS_EVENT_ON_CHILD))
+			return false;
+		marks_mask = inode_mark->mask;
+		marks_ignored_mask = inode_mark->ignored_mask;
+	} else if (vfsmnt_mark) {
+		marks_mask = vfsmnt_mark->mask;
+		marks_ignored_mask = vfsmnt_mark->ignored_mask;
+	} else {
+		BUG();
+	}
+
+	if (event_mask & marks_mask & ~marks_ignored_mask)
+		return true;
+
+	return false;
+}
+
+const struct fsnotify_ops fanotify_fsnotify_ops = {
+	.handle_event = fanotify_handle_event,
+	.should_send_event = fanotify_should_send_event,
+	.free_group_priv = NULL,
+	.free_event_priv = NULL,
+	.freeing_mark = NULL,
+};
diff --git a/fs/notify/fanotify/fanotify_user.c b/fs/notify/fanotify/fanotify_user.c
new file mode 100644
index 000000000000..5ed8e58d7bfc
--- /dev/null
+++ b/fs/notify/fanotify/fanotify_user.c
@@ -0,0 +1,787 @@
+#include <linux/fanotify.h>
+#include <linux/fcntl.h>
+#include <linux/file.h>
+#include <linux/fs.h>
+#include <linux/anon_inodes.h>
+#include <linux/fsnotify_backend.h>
+#include <linux/init.h>
+#include <linux/mount.h>
+#include <linux/namei.h>
+#include <linux/poll.h>
+#include <linux/security.h>
+#include <linux/syscalls.h>
+#include <linux/slab.h>
+#include <linux/types.h>
+#include <linux/uaccess.h>
+
+#include <asm/ioctls.h>
+
+extern const struct fsnotify_ops fanotify_fsnotify_ops;
+
+static struct kmem_cache *fanotify_mark_cache __read_mostly;
+static struct kmem_cache *fanotify_response_event_cache __read_mostly;
+
+struct fanotify_response_event {
+	struct list_head list;
+	__s32 fd;
+	struct fsnotify_event *event;
+};
+
+/*
+ * Get an fsnotify notification event if one exists and is small
+ * enough to fit in "count". Return an error pointer if the count
+ * is not large enough.
+ *
+ * Called with the group->notification_mutex held.
+ */
+static struct fsnotify_event *get_one_event(struct fsnotify_group *group,
+					    size_t count)
+{
+	BUG_ON(!mutex_is_locked(&group->notification_mutex));
+
+	pr_debug("%s: group=%p count=%zd\n", __func__, group, count);
+
+	if (fsnotify_notify_queue_is_empty(group))
+		return NULL;
+
+	if (FAN_EVENT_METADATA_LEN > count)
+		return ERR_PTR(-EINVAL);
+
+	/* held the notification_mutex the whole time, so this is the
+	 * same event we peeked above */
+	return fsnotify_remove_notify_event(group);
+}
+
+static int create_fd(struct fsnotify_group *group, struct fsnotify_event *event)
+{
+	int client_fd;
+	struct dentry *dentry;
+	struct vfsmount *mnt;
+	struct file *new_file;
+
+	pr_debug("%s: group=%p event=%p\n", __func__, group, event);
+
+	client_fd = get_unused_fd();
+	if (client_fd < 0)
+		return client_fd;
+
+	if (event->data_type != FSNOTIFY_EVENT_PATH) {
+		WARN_ON(1);
+		put_unused_fd(client_fd);
+		return -EINVAL;
+	}
+
+	/*
+	 * we need a new file handle for the userspace program so it can read even if it was
+	 * originally opened O_WRONLY.
+	 */
+	dentry = dget(event->path.dentry);
+	mnt = mntget(event->path.mnt);
+	/* it's possible this event was an overflow event.  in that case dentry and mnt
+	 * are NULL;  That's fine, just don't call dentry open */
+	if (dentry && mnt)
+		new_file = dentry_open(dentry, mnt,
+				       group->fanotify_data.f_flags | FMODE_NONOTIFY,
+				       current_cred());
+	else
+		new_file = ERR_PTR(-EOVERFLOW);
+	if (IS_ERR(new_file)) {
+		/*
+		 * we still send an event even if we can't open the file.  this
+		 * can happen when say tasks are gone and we try to open their
+		 * /proc files or we try to open a WRONLY file like in sysfs
+		 * we just send the errno to userspace since there isn't much
+		 * else we can do.
+		 */
+		put_unused_fd(client_fd);
+		client_fd = PTR_ERR(new_file);
+	} else {
+		fd_install(client_fd, new_file);
+	}
+
+	return client_fd;
+}
+
+static ssize_t fill_event_metadata(struct fsnotify_group *group,
+				   struct fanotify_event_metadata *metadata,
+				   struct fsnotify_event *event)
+{
+	pr_debug("%s: group=%p metadata=%p event=%p\n", __func__,
+		 group, metadata, event);
+
+	metadata->event_len = FAN_EVENT_METADATA_LEN;
+	metadata->vers = FANOTIFY_METADATA_VERSION;
+	metadata->mask = event->mask & FAN_ALL_OUTGOING_EVENTS;
+	metadata->pid = pid_vnr(event->tgid);
+	metadata->fd = create_fd(group, event);
+
+	return metadata->fd;
+}
+
+#ifdef CONFIG_FANOTIFY_ACCESS_PERMISSIONS
+static struct fanotify_response_event *dequeue_re(struct fsnotify_group *group,
+						  __s32 fd)
+{
+	struct fanotify_response_event *re, *return_re = NULL;
+
+	mutex_lock(&group->fanotify_data.access_mutex);
+	list_for_each_entry(re, &group->fanotify_data.access_list, list) {
+		if (re->fd != fd)
+			continue;
+
+		list_del_init(&re->list);
+		return_re = re;
+		break;
+	}
+	mutex_unlock(&group->fanotify_data.access_mutex);
+
+	pr_debug("%s: found return_re=%p\n", __func__, return_re);
+
+	return return_re;
+}
+
+static int process_access_response(struct fsnotify_group *group,
+				   struct fanotify_response *response_struct)
+{
+	struct fanotify_response_event *re;
+	__s32 fd = response_struct->fd;
+	__u32 response = response_struct->response;
+
+	pr_debug("%s: group=%p fd=%d response=%d\n", __func__, group,
+		 fd, response);
+	/*
+	 * make sure the response is valid, if invalid we do nothing and either
+	 * userspace can send a valid response or we will clean it up after the
+	 * timeout
+	 */
+	switch (response) {
+	case FAN_ALLOW:
+	case FAN_DENY:
+		break;
+	default:
+		return -EINVAL;
+	}
+
+	if (fd < 0)
+		return -EINVAL;
+
+	re = dequeue_re(group, fd);
+	if (!re)
+		return -ENOENT;
+
+	re->event->response = response;
+
+	wake_up(&group->fanotify_data.access_waitq);
+
+	kmem_cache_free(fanotify_response_event_cache, re);
+
+	return 0;
+}
+
+static int prepare_for_access_response(struct fsnotify_group *group,
+				       struct fsnotify_event *event,
+				       __s32 fd)
+{
+	struct fanotify_response_event *re;
+
+	if (!(event->mask & FAN_ALL_PERM_EVENTS))
+		return 0;
+
+	re = kmem_cache_alloc(fanotify_response_event_cache, GFP_KERNEL);
+	if (!re)
+		return -ENOMEM;
+
+	re->event = event;
+	re->fd = fd;
+
+	mutex_lock(&group->fanotify_data.access_mutex);
+
+	if (group->fanotify_data.bypass_perm) {
+		mutex_unlock(&group->fanotify_data.access_mutex);
+		kmem_cache_free(fanotify_response_event_cache, re);
+		event->response = FAN_ALLOW;
+		return 0;
+	}
+
+	list_add_tail(&re->list, &group->fanotify_data.access_list);
+	mutex_unlock(&group->fanotify_data.access_mutex);
+
+	return 0;
+}
+
+static void remove_access_response(struct fsnotify_group *group,
+				   struct fsnotify_event *event,
+				   __s32 fd)
+{
+	struct fanotify_response_event *re;
+
+	if (!(event->mask & FAN_ALL_PERM_EVENTS))
+		return;
+
+	re = dequeue_re(group, fd);
+	if (!re)
+		return;
+
+	BUG_ON(re->event != event);
+
+	kmem_cache_free(fanotify_response_event_cache, re);
+
+	return;
+}
+#else
+static int prepare_for_access_response(struct fsnotify_group *group,
+				       struct fsnotify_event *event,
+				       __s32 fd)
+{
+	return 0;
+}
+
+static void remove_access_response(struct fsnotify_group *group,
+				   struct fsnotify_event *event,
+				   __s32 fd)
+{
+	return;
+}
+#endif
+
+static ssize_t copy_event_to_user(struct fsnotify_group *group,
+				  struct fsnotify_event *event,
+				  char __user *buf)
+{
+	struct fanotify_event_metadata fanotify_event_metadata;
+	int fd, ret;
+
+	pr_debug("%s: group=%p event=%p\n", __func__, group, event);
+
+	fd = fill_event_metadata(group, &fanotify_event_metadata, event);
+	if (fd < 0)
+		return fd;
+
+	ret = prepare_for_access_response(group, event, fd);
+	if (ret)
+		goto out_close_fd;
+
+	ret = -EFAULT;
+	if (copy_to_user(buf, &fanotify_event_metadata, FAN_EVENT_METADATA_LEN))
+		goto out_kill_access_response;
+
+	return FAN_EVENT_METADATA_LEN;
+
+out_kill_access_response:
+	remove_access_response(group, event, fd);
+out_close_fd:
+	sys_close(fd);
+	return ret;
+}
+
+/* fanotify userspace file descriptor functions */
+static unsigned int fanotify_poll(struct file *file, poll_table *wait)
+{
+	struct fsnotify_group *group = file->private_data;
+	int ret = 0;
+
+	poll_wait(file, &group->notification_waitq, wait);
+	mutex_lock(&group->notification_mutex);
+	if (!fsnotify_notify_queue_is_empty(group))
+		ret = POLLIN | POLLRDNORM;
+	mutex_unlock(&group->notification_mutex);
+
+	return ret;
+}
+
+static ssize_t fanotify_read(struct file *file, char __user *buf,
+			     size_t count, loff_t *pos)
+{
+	struct fsnotify_group *group;
+	struct fsnotify_event *kevent;
+	char __user *start;
+	int ret;
+	DEFINE_WAIT(wait);
+
+	start = buf;
+	group = file->private_data;
+
+	pr_debug("%s: group=%p\n", __func__, group);
+
+	while (1) {
+		prepare_to_wait(&group->notification_waitq, &wait, TASK_INTERRUPTIBLE);
+
+		mutex_lock(&group->notification_mutex);
+		kevent = get_one_event(group, count);
+		mutex_unlock(&group->notification_mutex);
+
+		if (kevent) {
+			ret = PTR_ERR(kevent);
+			if (IS_ERR(kevent))
+				break;
+			ret = copy_event_to_user(group, kevent, buf);
+			fsnotify_put_event(kevent);
+			if (ret < 0)
+				break;
+			buf += ret;
+			count -= ret;
+			continue;
+		}
+
+		ret = -EAGAIN;
+		if (file->f_flags & O_NONBLOCK)
+			break;
+		ret = -EINTR;
+		if (signal_pending(current))
+			break;
+
+		if (start != buf)
+			break;
+
+		schedule();
+	}
+
+	finish_wait(&group->notification_waitq, &wait);
+	if (start != buf && ret != -EFAULT)
+		ret = buf - start;
+	return ret;
+}
+
+static ssize_t fanotify_write(struct file *file, const char __user *buf, size_t count, loff_t *pos)
+{
+#ifdef CONFIG_FANOTIFY_ACCESS_PERMISSIONS
+	struct fanotify_response response = { .fd = -1, .response = -1 };
+	struct fsnotify_group *group;
+	int ret;
+
+	group = file->private_data;
+
+	if (count > sizeof(response))
+		count = sizeof(response);
+
+	pr_debug("%s: group=%p count=%zu\n", __func__, group, count);
+
+	if (copy_from_user(&response, buf, count))
+		return -EFAULT;
+
+	ret = process_access_response(group, &response);
+	if (ret < 0)
+		count = ret;
+
+	return count;
+#else
+	return -EINVAL;
+#endif
+}
+
+static int fanotify_release(struct inode *ignored, struct file *file)
+{
+	struct fsnotify_group *group = file->private_data;
+	struct fanotify_response_event *re, *lre;
+
+	pr_debug("%s: file=%p group=%p\n", __func__, file, group);
+
+#ifdef CONFIG_FANOTIFY_ACCESS_PERMISSIONS
+	mutex_lock(&group->fanotify_data.access_mutex);
+
+	group->fanotify_data.bypass_perm = true;
+
+	list_for_each_entry_safe(re, lre, &group->fanotify_data.access_list, list) {
+		pr_debug("%s: found group=%p re=%p event=%p\n", __func__, group,
+			 re, re->event);
+
+		list_del_init(&re->list);
+		re->event->response = FAN_ALLOW;
+
+		kmem_cache_free(fanotify_response_event_cache, re);
+	}
+	mutex_unlock(&group->fanotify_data.access_mutex);
+
+	wake_up(&group->fanotify_data.access_waitq);
+#endif
+	/* matches the fanotify_init->fsnotify_alloc_group */
+	fsnotify_put_group(group);
+
+	return 0;
+}
+
+static long fanotify_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
+{
+	struct fsnotify_group *group;
+	struct fsnotify_event_holder *holder;
+	void __user *p;
+	int ret = -ENOTTY;
+	size_t send_len = 0;
+
+	group = file->private_data;
+
+	p = (void __user *) arg;
+
+	switch (cmd) {
+	case FIONREAD:
+		mutex_lock(&group->notification_mutex);
+		list_for_each_entry(holder, &group->notification_list, event_list)
+			send_len += FAN_EVENT_METADATA_LEN;
+		mutex_unlock(&group->notification_mutex);
+		ret = put_user(send_len, (int __user *) p);
+		break;
+	}
+
+	return ret;
+}
+
+static const struct file_operations fanotify_fops = {
+	.poll = fanotify_poll,
+	.read = fanotify_read,
+	.write = fanotify_write,
+	.fasync = NULL,
+	.release = fanotify_release,
+	.unlocked_ioctl = fanotify_ioctl,
+	.compat_ioctl = fanotify_ioctl,
+};
+
+static void fanotify_free_mark(struct fsnotify_mark *fsn_mark)
+{
+	kmem_cache_free(fanotify_mark_cache, fsn_mark);
+}
+
+static int fanotify_find_path(int dfd, const char __user *filename,
+			      struct path *path, unsigned int flags)
+{
+	int ret;
+
+	pr_debug("%s: dfd=%d filename=%p flags=%x\n", __func__,
+		 dfd, filename, flags);
+
+	if (filename == NULL) {
+		struct file *file;
+		int fput_needed;
+
+		ret = -EBADF;
+		file = fget_light(dfd, &fput_needed);
+		if (!file)
+			goto out;
+
+		ret = -ENOTDIR;
+		if ((flags & FAN_MARK_ONLYDIR) &&
+		    !(S_ISDIR(file->f_path.dentry->d_inode->i_mode))) {
+			fput_light(file, fput_needed);
+			goto out;
+		}
+
+		*path = file->f_path;
+		path_get(path);
+		fput_light(file, fput_needed);
+	} else {
+		unsigned int lookup_flags = 0;
+
+		if (!(flags & FAN_MARK_DONT_FOLLOW))
+			lookup_flags |= LOOKUP_FOLLOW;
+		if (flags & FAN_MARK_ONLYDIR)
+			lookup_flags |= LOOKUP_DIRECTORY;
+
+		ret = user_path_at(dfd, filename, lookup_flags, path);
+		if (ret)
+			goto out;
+	}
+
+	/* you can only watch an inode if you have read permissions on it */
+	ret = inode_permission(path->dentry->d_inode, MAY_READ);
+	if (ret)
+		path_put(path);
+out:
+	return ret;
+}
+
+static __u32 fanotify_mark_remove_from_mask(struct fsnotify_mark *fsn_mark,
+					    __u32 mask,
+					    unsigned int flags)
+{
+	__u32 oldmask;
+
+	spin_lock(&fsn_mark->lock);
+	if (!(flags & FAN_MARK_IGNORED_MASK)) {
+		oldmask = fsn_mark->mask;
+		fsnotify_set_mark_mask_locked(fsn_mark, (oldmask & ~mask));
+	} else {
+		oldmask = fsn_mark->ignored_mask;
+		fsnotify_set_mark_ignored_mask_locked(fsn_mark, (oldmask & ~mask));
+	}
+	spin_unlock(&fsn_mark->lock);
+
+	if (!(oldmask & ~mask))
+		fsnotify_destroy_mark(fsn_mark);
+
+	return mask & oldmask;
+}
+
+static int fanotify_remove_vfsmount_mark(struct fsnotify_group *group,
+					 struct vfsmount *mnt, __u32 mask,
+					 unsigned int flags)
+{
+	struct fsnotify_mark *fsn_mark = NULL;
+	__u32 removed;
+
+	fsn_mark = fsnotify_find_vfsmount_mark(group, mnt);
+	if (!fsn_mark)
+		return -ENOENT;
+
+	removed = fanotify_mark_remove_from_mask(fsn_mark, mask, flags);
+	fsnotify_put_mark(fsn_mark);
+	if (removed & mnt->mnt_fsnotify_mask)
+		fsnotify_recalc_vfsmount_mask(mnt);
+
+	return 0;
+}
+
+static int fanotify_remove_inode_mark(struct fsnotify_group *group,
+				      struct inode *inode, __u32 mask,
+				      unsigned int flags)
+{
+	struct fsnotify_mark *fsn_mark = NULL;
+	__u32 removed;
+
+	fsn_mark = fsnotify_find_inode_mark(group, inode);
+	if (!fsn_mark)
+		return -ENOENT;
+
+	removed = fanotify_mark_remove_from_mask(fsn_mark, mask, flags);
+	/* matches the fsnotify_find_inode_mark() */
+	fsnotify_put_mark(fsn_mark);
+	if (removed & inode->i_fsnotify_mask)
+		fsnotify_recalc_inode_mask(inode);
+
+	return 0;
+}
+
+static __u32 fanotify_mark_add_to_mask(struct fsnotify_mark *fsn_mark,
+				       __u32 mask,
+				       unsigned int flags)
+{
+	__u32 oldmask;
+
+	spin_lock(&fsn_mark->lock);
+	if (!(flags & FAN_MARK_IGNORED_MASK)) {
+		oldmask = fsn_mark->mask;
+		fsnotify_set_mark_mask_locked(fsn_mark, (oldmask | mask));
+	} else {
+		oldmask = fsn_mark->ignored_mask;
+		fsnotify_set_mark_ignored_mask_locked(fsn_mark, (oldmask | mask));
+		if (flags & FAN_MARK_IGNORED_SURV_MODIFY)
+			fsn_mark->flags |= FSNOTIFY_MARK_FLAG_IGNORED_SURV_MODIFY;
+	}
+	spin_unlock(&fsn_mark->lock);
+
+	return mask & ~oldmask;
+}
+
+static int fanotify_add_vfsmount_mark(struct fsnotify_group *group,
+				      struct vfsmount *mnt, __u32 mask,
+				      unsigned int flags)
+{
+	struct fsnotify_mark *fsn_mark;
+	__u32 added;
+
+	fsn_mark = fsnotify_find_vfsmount_mark(group, mnt);
+	if (!fsn_mark) {
+		int ret;
+
+		fsn_mark = kmem_cache_alloc(fanotify_mark_cache, GFP_KERNEL);
+		if (!fsn_mark)
+			return -ENOMEM;
+
+		fsnotify_init_mark(fsn_mark, fanotify_free_mark);
+		ret = fsnotify_add_mark(fsn_mark, group, NULL, mnt, 0);
+		if (ret) {
+			fanotify_free_mark(fsn_mark);
+			return ret;
+		}
+	}
+	added = fanotify_mark_add_to_mask(fsn_mark, mask, flags);
+	fsnotify_put_mark(fsn_mark);
+	if (added & ~mnt->mnt_fsnotify_mask)
+		fsnotify_recalc_vfsmount_mask(mnt);
+
+	return 0;
+}
+
+static int fanotify_add_inode_mark(struct fsnotify_group *group,
+				   struct inode *inode, __u32 mask,
+				   unsigned int flags)
+{
+	struct fsnotify_mark *fsn_mark;
+	__u32 added;
+
+	pr_debug("%s: group=%p inode=%p\n", __func__, group, inode);
+
+	fsn_mark = fsnotify_find_inode_mark(group, inode);
+	if (!fsn_mark) {
+		int ret;
+
+		fsn_mark = kmem_cache_alloc(fanotify_mark_cache, GFP_KERNEL);
+		if (!fsn_mark)
+			return -ENOMEM;
+
+		fsnotify_init_mark(fsn_mark, fanotify_free_mark);
+		ret = fsnotify_add_mark(fsn_mark, group, inode, NULL, 0);
+		if (ret) {
+			fanotify_free_mark(fsn_mark);
+			return ret;
+		}
+	}
+	added = fanotify_mark_add_to_mask(fsn_mark, mask, flags);
+	fsnotify_put_mark(fsn_mark);
+	if (added & ~inode->i_fsnotify_mask)
+		fsnotify_recalc_inode_mask(inode);
+	return 0;
+}
+
+/* fanotify syscalls */
+SYSCALL_DEFINE2(fanotify_init, unsigned int, flags, unsigned int, event_f_flags)
+{
+	struct fsnotify_group *group;
+	int f_flags, fd;
+
+	pr_debug("%s: flags=%d event_f_flags=%d\n",
+		 __func__, flags, event_f_flags);
+
+	if (!capable(CAP_SYS_ADMIN))
+		return -EPERM;
+
+	if (flags & ~FAN_ALL_INIT_FLAGS)
+		return -EINVAL;
+
+	f_flags = O_RDWR | FMODE_NONOTIFY;
+	if (flags & FAN_CLOEXEC)
+		f_flags |= O_CLOEXEC;
+	if (flags & FAN_NONBLOCK)
+		f_flags |= O_NONBLOCK;
+
+	/* fsnotify_alloc_group takes a ref.  Dropped in fanotify_release */
+	group = fsnotify_alloc_group(&fanotify_fsnotify_ops);
+	if (IS_ERR(group))
+		return PTR_ERR(group);
+
+	group->fanotify_data.f_flags = event_f_flags;
+#ifdef CONFIG_FANOTIFY_ACCESS_PERMISSIONS
+	mutex_init(&group->fanotify_data.access_mutex);
+	init_waitqueue_head(&group->fanotify_data.access_waitq);
+	INIT_LIST_HEAD(&group->fanotify_data.access_list);
+#endif
+
+	fd = anon_inode_getfd("[fanotify]", &fanotify_fops, group, f_flags);
+	if (fd < 0)
+		goto out_put_group;
+
+	return fd;
+
+out_put_group:
+	fsnotify_put_group(group);
+	return fd;
+}
+
+SYSCALL_DEFINE(fanotify_mark)(int fanotify_fd, unsigned int flags,
+			      __u64 mask, int dfd,
+			      const char __user * pathname)
+{
+	struct inode *inode = NULL;
+	struct vfsmount *mnt = NULL;
+	struct fsnotify_group *group;
+	struct file *filp;
+	struct path path;
+	int ret, fput_needed;
+
+	pr_debug("%s: fanotify_fd=%d flags=%x dfd=%d pathname=%p mask=%llx\n",
+		 __func__, fanotify_fd, flags, dfd, pathname, mask);
+
+	/* we only use the lower 32 bits as of right now. */
+	if (mask & ((__u64)0xffffffff << 32))
+		return -EINVAL;
+
+	if (flags & ~FAN_ALL_MARK_FLAGS)
+		return -EINVAL;
+	switch (flags & (FAN_MARK_ADD | FAN_MARK_REMOVE | FAN_MARK_FLUSH)) {
+	case FAN_MARK_ADD:
+	case FAN_MARK_REMOVE:
+	case FAN_MARK_FLUSH:
+		break;
+	default:
+		return -EINVAL;
+	}
+#ifdef CONFIG_FANOTIFY_ACCESS_PERMISSIONS
+	if (mask & ~(FAN_ALL_EVENTS | FAN_ALL_PERM_EVENTS | FAN_EVENT_ON_CHILD))
+#else
+	if (mask & ~(FAN_ALL_EVENTS | FAN_EVENT_ON_CHILD))
+#endif
+		return -EINVAL;
+
+	filp = fget_light(fanotify_fd, &fput_needed);
+	if (unlikely(!filp))
+		return -EBADF;
+
+	/* verify that this is indeed an fanotify instance */
+	ret = -EINVAL;
+	if (unlikely(filp->f_op != &fanotify_fops))
+		goto fput_and_out;
+
+	ret = fanotify_find_path(dfd, pathname, &path, flags);
+	if (ret)
+		goto fput_and_out;
+
+	/* inode held in place by reference to path; group by fget on fd */
+	if (!(flags & FAN_MARK_MOUNT))
+		inode = path.dentry->d_inode;
+	else
+		mnt = path.mnt;
+	group = filp->private_data;
+
+	/* create/update an inode mark */
+	switch (flags & (FAN_MARK_ADD | FAN_MARK_REMOVE | FAN_MARK_FLUSH)) {
+	case FAN_MARK_ADD:
+		if (flags & FAN_MARK_MOUNT)
+			ret = fanotify_add_vfsmount_mark(group, mnt, mask, flags);
+		else
+			ret = fanotify_add_inode_mark(group, inode, mask, flags);
+		break;
+	case FAN_MARK_REMOVE:
+		if (flags & FAN_MARK_MOUNT)
+			ret = fanotify_remove_vfsmount_mark(group, mnt, mask, flags);
+		else
+			ret = fanotify_remove_inode_mark(group, inode, mask, flags);
+		break;
+	case FAN_MARK_FLUSH:
+		if (flags & FAN_MARK_MOUNT)
+			fsnotify_clear_vfsmount_marks_by_group(group);
+		else
+			fsnotify_clear_inode_marks_by_group(group);
+		break;
+	default:
+		ret = -EINVAL;
+	}
+
+	path_put(&path);
+fput_and_out:
+	fput_light(filp, fput_needed);
+	return ret;
+}
+
+#ifdef CONFIG_HAVE_SYSCALL_WRAPPERS
+asmlinkage long SyS_fanotify_mark(long fanotify_fd, long flags, __u64 mask,
+				  long dfd, long pathname)
+{
+	return SYSC_fanotify_mark((int) fanotify_fd, (unsigned int) flags,
+				  mask, (int) dfd,
+				  (const char __user *) pathname);
+}
+SYSCALL_ALIAS(sys_fanotify_mark, SyS_fanotify_mark);
+#endif
+
+/*
+ * fanotify_user_setup - Our initialization function.  Note that we cannot return
+ * error because we have compiled-in VFS hooks.  So an (unlikely) failure here
+ * must result in panic().
+ */
+static int __init fanotify_user_setup(void)
+{
+	fanotify_mark_cache = KMEM_CACHE(fsnotify_mark, SLAB_PANIC);
+	fanotify_response_event_cache = KMEM_CACHE(fanotify_response_event,
+						   SLAB_PANIC);
+
+	return 0;
+}
+device_initcall(fanotify_user_setup);
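
For context, the two syscalls added above combine from userspace roughly as follows. This is a sketch under assumptions, not part of the patch: it uses the glibc fanotify_init()/fanotify_mark() wrappers and the FAN_EVENT_OK()/FAN_EVENT_NEXT() iterator macros from <linux/fanotify.h>, which may postdate this commit; on a kernel of this vintage one would invoke syscall(2) directly instead.

/* Minimal fanotify listener sketch (assumes the glibc wrappers exist). */
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>
#include <sys/fanotify.h>

int main(void)
{
	char buf[4096];
	ssize_t len;
	int fd;

	/* one notification group; event fds are opened O_RDONLY */
	fd = fanotify_init(FAN_CLOEXEC, O_RDONLY);
	if (fd < 0) {
		perror("fanotify_init");	/* needs CAP_SYS_ADMIN */
		return 1;
	}

	/* ask for open and close-after-write events on /tmp itself;
	 * add FAN_MARK_MOUNT to cover a whole mount instead */
	if (fanotify_mark(fd, FAN_MARK_ADD, FAN_OPEN | FAN_CLOSE_WRITE,
			  AT_FDCWD, "/tmp") < 0) {
		perror("fanotify_mark");
		return 1;
	}

	while ((len = read(fd, buf, sizeof(buf))) > 0) {
		struct fanotify_event_metadata *md = (void *)buf;

		while (FAN_EVENT_OK(md, len)) {
			printf("mask=0x%llx pid=%d fd=%d\n",
			       (unsigned long long)md->mask,
			       (int)md->pid, md->fd);
			if (md->fd >= 0)
				close(md->fd);	/* listener owns the event fd */
			md = FAN_EVENT_NEXT(md, len);
		}
	}
	return 0;
}
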
diff --git a/fs/notify/fsnotify.c b/fs/notify/fsnotify.c
index fcc2f064af83..36802420d69a 100644
--- a/fs/notify/fsnotify.c
+++ b/fs/notify/fsnotify.c
@@ -21,6 +21,7 @@
 #include <linux/gfp.h>
 #include <linux/init.h>
 #include <linux/module.h>
+#include <linux/mount.h>
 #include <linux/srcu.h>
 
 #include <linux/fsnotify_backend.h>
@@ -35,6 +36,11 @@ void __fsnotify_inode_delete(struct inode *inode)
 }
 EXPORT_SYMBOL_GPL(__fsnotify_inode_delete);
 
+void __fsnotify_vfsmount_delete(struct vfsmount *mnt)
+{
+	fsnotify_clear_marks_by_mount(mnt);
+}
+
 /*
  * Given an inode, first check if we care what happens to our children. Inotify
  * and dnotify both tell their parents about events. If we care about any event
@@ -78,13 +84,16 @@ void __fsnotify_update_child_dentry_flags(struct inode *inode)
 }
 
 /* Notify this dentry's parent about a child's events. */
-void __fsnotify_parent(struct dentry *dentry, __u32 mask)
+void __fsnotify_parent(struct path *path, struct dentry *dentry, __u32 mask)
 {
 	struct dentry *parent;
 	struct inode *p_inode;
 	bool send = false;
 	bool should_update_children = false;
 
+	if (!dentry)
+		dentry = path->dentry;
+
 	if (!(dentry->d_flags & DCACHE_FSNOTIFY_PARENT_WATCHED))
 		return;
 
@@ -115,8 +124,12 @@ void __fsnotify_parent(struct dentry *dentry, __u32 mask)
 	 * specifies these are events which came from a child. */
 	mask |= FS_EVENT_ON_CHILD;
 
-	fsnotify(p_inode, mask, dentry->d_inode, FSNOTIFY_EVENT_INODE,
-		 dentry->d_name.name, 0);
+	if (path)
+		fsnotify(p_inode, mask, path, FSNOTIFY_EVENT_PATH,
+			 dentry->d_name.name, 0);
+	else
+		fsnotify(p_inode, mask, dentry->d_inode, FSNOTIFY_EVENT_INODE,
+			 dentry->d_name.name, 0);
 	dput(parent);
 }
 
@@ -127,63 +140,185 @@ void __fsnotify_parent(struct dentry *dentry, __u32 mask)
 }
 EXPORT_SYMBOL_GPL(__fsnotify_parent);
 
+static int send_to_group(struct inode *to_tell, struct vfsmount *mnt,
+			 struct fsnotify_mark *inode_mark,
+			 struct fsnotify_mark *vfsmount_mark,
+			 __u32 mask, void *data,
+			 int data_is, u32 cookie,
+			 const unsigned char *file_name,
+			 struct fsnotify_event **event)
+{
+	struct fsnotify_group *group = NULL;
+	__u32 inode_test_mask = 0;
+	__u32 vfsmount_test_mask = 0;
+
+	if (unlikely(!inode_mark && !vfsmount_mark)) {
+		BUG();
+		return 0;
+	}
+
+	/* clear ignored on inode modification */
+	if (mask & FS_MODIFY) {
+		if (inode_mark &&
+		    !(inode_mark->flags & FSNOTIFY_MARK_FLAG_IGNORED_SURV_MODIFY))
+			inode_mark->ignored_mask = 0;
+		if (vfsmount_mark &&
+		    !(vfsmount_mark->flags & FSNOTIFY_MARK_FLAG_IGNORED_SURV_MODIFY))
+			vfsmount_mark->ignored_mask = 0;
+	}
+
+	/* does the inode mark tell us to do something? */
+	if (inode_mark) {
+		group = inode_mark->group;
+		inode_test_mask = (mask & ~FS_EVENT_ON_CHILD);
+		inode_test_mask &= inode_mark->mask;
+		inode_test_mask &= ~inode_mark->ignored_mask;
+	}
+
+	/* does the vfsmount_mark tell us to do something? */
+	if (vfsmount_mark) {
+		vfsmount_test_mask = (mask & ~FS_EVENT_ON_CHILD);
+		group = vfsmount_mark->group;
+		vfsmount_test_mask &= vfsmount_mark->mask;
+		vfsmount_test_mask &= ~vfsmount_mark->ignored_mask;
+		if (inode_mark)
+			vfsmount_test_mask &= ~inode_mark->ignored_mask;
+	}
+
+	pr_debug("%s: group=%p to_tell=%p mnt=%p mask=%x inode_mark=%p"
+		 " inode_test_mask=%x vfsmount_mark=%p vfsmount_test_mask=%x"
+		 " data=%p data_is=%d cookie=%d event=%p\n",
+		 __func__, group, to_tell, mnt, mask, inode_mark,
+		 inode_test_mask, vfsmount_mark, vfsmount_test_mask, data,
+		 data_is, cookie, *event);
+
+	if (!inode_test_mask && !vfsmount_test_mask)
+		return 0;
+
+	if (group->ops->should_send_event(group, to_tell, inode_mark,
+					  vfsmount_mark, mask, data,
+					  data_is) == false)
+		return 0;
+
+	if (!*event) {
+		*event = fsnotify_create_event(to_tell, mask, data,
+					       data_is, file_name,
+					       cookie, GFP_KERNEL);
+		if (!*event)
+			return -ENOMEM;
+	}
+	return group->ops->handle_event(group, inode_mark, vfsmount_mark, *event);
+}
+
 /*
  * This is the main call to fsnotify. The VFS calls into hook specific functions
  * in linux/fsnotify.h. Those functions then in turn call here. Here will call
  * out to all of the registered fsnotify_group. Those groups can then use the
  * notification event in whatever means they feel necessary.
  */
-void fsnotify(struct inode *to_tell, __u32 mask, void *data, int data_is, const char *file_name, u32 cookie)
+int fsnotify(struct inode *to_tell, __u32 mask, void *data, int data_is,
+	     const unsigned char *file_name, u32 cookie)
 {
-	struct fsnotify_group *group;
+	struct hlist_node *inode_node = NULL, *vfsmount_node = NULL;
+	struct fsnotify_mark *inode_mark = NULL, *vfsmount_mark = NULL;
+	struct fsnotify_group *inode_group, *vfsmount_group;
 	struct fsnotify_event *event = NULL;
-	int idx;
+	struct vfsmount *mnt;
+	int idx, ret = 0;
 	/* global tests shouldn't care about events on child only the specific event */
 	__u32 test_mask = (mask & ~FS_EVENT_ON_CHILD);
 
-	if (list_empty(&fsnotify_groups))
-		return;
+	if (data_is == FSNOTIFY_EVENT_PATH)
+		mnt = ((struct path *)data)->mnt;
+	else
+		mnt = NULL;
 
-	if (!(test_mask & fsnotify_mask))
-		return;
-
-	if (!(test_mask & to_tell->i_fsnotify_mask))
-		return;
 	/*
-	 * SRCU!! the groups list is very very much read only and the path is
-	 * very hot. The VAST majority of events are not going to need to do
-	 * anything other than walk the list so it's crazy to pre-allocate.
+	 * if this is a modify event we may need to clear the ignored masks
+	 * otherwise return if neither the inode nor the vfsmount care about
+	 * this type of event.
 	 */
-	idx = srcu_read_lock(&fsnotify_grp_srcu);
-	list_for_each_entry_rcu(group, &fsnotify_groups, group_list) {
-		if (test_mask & group->mask) {
-			if (!group->ops->should_send_event(group, to_tell, mask))
-				continue;
-			if (!event) {
-				event = fsnotify_create_event(to_tell, mask, data,
-							      data_is, file_name, cookie,
-							      GFP_KERNEL);
-				/* shit, we OOM'd and now we can't tell, maybe
-				 * someday someone else will want to do something
-				 * here */
-				if (!event)
-					break;
-			}
-			group->ops->handle_event(group, event);
-		}
+	if (!(mask & FS_MODIFY) &&
+	    !(test_mask & to_tell->i_fsnotify_mask) &&
+	    !(mnt && test_mask & mnt->mnt_fsnotify_mask))
+		return 0;
+
+	idx = srcu_read_lock(&fsnotify_mark_srcu);
+
+	if ((mask & FS_MODIFY) ||
+	    (test_mask & to_tell->i_fsnotify_mask))
+		inode_node = srcu_dereference(to_tell->i_fsnotify_marks.first,
+					      &fsnotify_mark_srcu);
+
+	if (mnt && ((mask & FS_MODIFY) ||
+		    (test_mask & mnt->mnt_fsnotify_mask))) {
+		vfsmount_node = srcu_dereference(mnt->mnt_fsnotify_marks.first,
+						 &fsnotify_mark_srcu);
+		inode_node = srcu_dereference(to_tell->i_fsnotify_marks.first,
+					      &fsnotify_mark_srcu);
+	}
+
+	while (inode_node || vfsmount_node) {
+		inode_group = vfsmount_group = NULL;
+
+		if (inode_node) {
+			inode_mark = hlist_entry(srcu_dereference(inode_node, &fsnotify_mark_srcu),
+						 struct fsnotify_mark, i.i_list);
+			inode_group = inode_mark->group;
+		}
+
+		if (vfsmount_node) {
+			vfsmount_mark = hlist_entry(srcu_dereference(vfsmount_node, &fsnotify_mark_srcu),
+						    struct fsnotify_mark, m.m_list);
+			vfsmount_group = vfsmount_mark->group;
+		}
+
+		if (inode_group > vfsmount_group) {
+			/* handle inode */
+			send_to_group(to_tell, NULL, inode_mark, NULL, mask, data,
+				      data_is, cookie, file_name, &event);
+			/* we didn't use the vfsmount_mark */
+			vfsmount_group = NULL;
+		} else if (vfsmount_group > inode_group) {
+			send_to_group(to_tell, mnt, NULL, vfsmount_mark, mask, data,
+				      data_is, cookie, file_name, &event);
+			inode_group = NULL;
+		} else {
+			send_to_group(to_tell, mnt, inode_mark, vfsmount_mark,
+				      mask, data, data_is, cookie, file_name,
+				      &event);
+		}
+
+		if (inode_group)
+			inode_node = srcu_dereference(inode_node->next,
+						      &fsnotify_mark_srcu);
+		if (vfsmount_group)
+			vfsmount_node = srcu_dereference(vfsmount_node->next,
+							 &fsnotify_mark_srcu);
 	}
-	srcu_read_unlock(&fsnotify_grp_srcu, idx);
+
+	srcu_read_unlock(&fsnotify_mark_srcu, idx);
 	/*
 	 * fsnotify_create_event() took a reference so the event can't be cleaned
 	 * up while we are still trying to add it to lists, drop that one.
180 if (event) 305 if (event)
181 fsnotify_put_event(event); 306 fsnotify_put_event(event);
307
308 return ret;
182} 309}
183EXPORT_SYMBOL_GPL(fsnotify); 310EXPORT_SYMBOL_GPL(fsnotify);
184 311
185static __init int fsnotify_init(void) 312static __init int fsnotify_init(void)
186{ 313{
187 return init_srcu_struct(&fsnotify_grp_srcu); 314 int ret;
315
316 BUG_ON(hweight32(ALL_FSNOTIFY_EVENTS) != 23);
317
318 ret = init_srcu_struct(&fsnotify_mark_srcu);
319 if (ret)
320 panic("initializing fsnotify_mark_srcu");
321
322 return 0;
188} 323}
189subsys_initcall(fsnotify_init); 324core_initcall(fsnotify_init);
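The while loop in the new fsnotify() above is effectively a sorted-list merge: both mark lists are kept ordered by the address of the owning group, so a group holding both an inode mark and a vfsmount mark gets exactly one combined send_to_group() call per event. A minimal user-space sketch of that merge, with a numeric id standing in for the group's address and simplified stand-in types (not the kernel structures):

#include <stdio.h>

struct group { long id; };                      /* stands in for fsnotify_group */
struct mark  { struct group *group; struct mark *next; };

static void send_to_group(struct mark *imark, struct mark *vmark)
{
	struct group *g = imark ? imark->group : vmark->group;
	printf("group %ld: inode_mark=%s vfsmount_mark=%s\n",
	       g->id, imark ? "yes" : "no", vmark ? "yes" : "no");
}

/* both lists must be sorted in descending key order, as the kernel keeps them */
static void merge_walk(struct mark *inode_node, struct mark *vfsmount_node)
{
	while (inode_node || vfsmount_node) {
		long ikey = inode_node ? inode_node->group->id : -1;
		long vkey = vfsmount_node ? vfsmount_node->group->id : -1;

		if (ikey > vkey) {              /* inode mark only */
			send_to_group(inode_node, NULL);
			vkey = -1;              /* vfsmount side not consumed */
		} else if (vkey > ikey) {       /* vfsmount mark only */
			send_to_group(NULL, vfsmount_node);
			ikey = -1;              /* inode side not consumed */
		} else {                        /* same group: one combined call */
			send_to_group(inode_node, vfsmount_node);
		}
		if (ikey != -1)
			inode_node = inode_node->next;
		if (vkey != -1)
			vfsmount_node = vfsmount_node->next;
	}
}

int main(void)
{
	struct group g1 = { 1 }, g2 = { 2 };
	struct mark i1 = { &g1, NULL }, i2 = { &g2, &i1 };  /* inode list: g2, g1 */
	struct mark v2 = { &g2, NULL };                     /* vfsmount list: g2 */

	merge_walk(&i2, &v2);   /* g2: one combined call; g1: inode-only call */
	return 0;
}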
diff --git a/fs/notify/fsnotify.h b/fs/notify/fsnotify.h
index 4dc240824b2d..85e7d2b431d9 100644
--- a/fs/notify/fsnotify.h
+++ b/fs/notify/fsnotify.h
@@ -6,21 +6,34 @@
6#include <linux/srcu.h> 6#include <linux/srcu.h>
7#include <linux/types.h> 7#include <linux/types.h>
8 8
9/* protects reads of fsnotify_groups */
10extern struct srcu_struct fsnotify_grp_srcu;
11/* all groups which receive fsnotify events */
12extern struct list_head fsnotify_groups;
13/* all bitwise OR of all event types (FS_*) for all fsnotify_groups */
14extern __u32 fsnotify_mask;
15
16/* destroy all events sitting in this groups notification queue */ 9/* destroy all events sitting in this groups notification queue */
17extern void fsnotify_flush_notify(struct fsnotify_group *group); 10extern void fsnotify_flush_notify(struct fsnotify_group *group);
18 11
12/* protects reads of inode and vfsmount marks list */
13extern struct srcu_struct fsnotify_mark_srcu;
14
15extern void fsnotify_set_inode_mark_mask_locked(struct fsnotify_mark *fsn_mark,
16 __u32 mask);
17/* add a mark to an inode */
18extern int fsnotify_add_inode_mark(struct fsnotify_mark *mark,
19 struct fsnotify_group *group, struct inode *inode,
20 int allow_dups);
21/* add a mark to a vfsmount */
22extern int fsnotify_add_vfsmount_mark(struct fsnotify_mark *mark,
23 struct fsnotify_group *group, struct vfsmount *mnt,
24 int allow_dups);
25
19/* final kfree of a group */ 26/* final kfree of a group */
20extern void fsnotify_final_destroy_group(struct fsnotify_group *group); 27extern void fsnotify_final_destroy_group(struct fsnotify_group *group);
21 28
29/* vfsmount specific destruction of a mark */
30extern void fsnotify_destroy_vfsmount_mark(struct fsnotify_mark *mark);
31/* inode specific destruction of a mark */
32extern void fsnotify_destroy_inode_mark(struct fsnotify_mark *mark);
22/* run the list of all marks associated with inode and flag them to be freed */ 33/* run the list of all marks associated with inode and flag them to be freed */
23extern void fsnotify_clear_marks_by_inode(struct inode *inode); 34extern void fsnotify_clear_marks_by_inode(struct inode *inode);
35/* run the list of all marks associated with vfsmount and flag them to be freed */
36extern void fsnotify_clear_marks_by_mount(struct vfsmount *mnt);
24/* 37/*
25 * update the dentry->d_flags of all of inode's children to indicate if inode cares 38 * update the dentry->d_flags of all of inode's children to indicate if inode cares
26 * about events that happen to its children. 39 * about events that happen to its children.
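These declarations are the internal half of the reworked mark API. A rough sketch of how a backend would attach a mark to an inode, assuming the generic fsnotify_add_mark() wrapper from the mark rework elsewhere in this series (it takes mark->lock and group->mark_lock before dispatching to fsnotify_add_inode_mark() as declared above); my_free_mark() and my_watch_inode() are hypothetical names:

#include <linux/fsnotify_backend.h>
#include <linux/slab.h>

static void my_free_mark(struct fsnotify_mark *mark)
{
	kfree(mark);
}

static int my_watch_inode(struct fsnotify_group *group, struct inode *inode)
{
	struct fsnotify_mark *mark;
	int ret;

	mark = kzalloc(sizeof(*mark), GFP_KERNEL);
	if (!mark)
		return -ENOMEM;

	fsnotify_init_mark(mark, my_free_mark);    /* refcnt starts at 1 */
	mark->mask = FS_MODIFY | FS_CLOSE_WRITE;

	/* inode variant: mnt == NULL, no duplicate marks per group */
	ret = fsnotify_add_mark(mark, group, inode, NULL, 0);

	/* on success the i_list/g_list hold their own reference; drop ours */
	fsnotify_put_mark(mark);
	return ret;
}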
diff --git a/fs/notify/group.c b/fs/notify/group.c
index 0e1677144bc5..d309f38449cb 100644
--- a/fs/notify/group.c
+++ b/fs/notify/group.c
@@ -28,64 +28,6 @@
28 28
29#include <asm/atomic.h> 29#include <asm/atomic.h>
30 30
31/* protects writes to fsnotify_groups and fsnotify_mask */
32static DEFINE_MUTEX(fsnotify_grp_mutex);
33/* protects reads while running the fsnotify_groups list */
34struct srcu_struct fsnotify_grp_srcu;
35/* all groups registered to receive filesystem notifications */
36LIST_HEAD(fsnotify_groups);
37/* bitwise OR of all events (FS_*) interesting to some group on this system */
38__u32 fsnotify_mask;
39
40/*
41 * When a new group registers or changes it's set of interesting events
42 * this function updates the fsnotify_mask to contain all interesting events
43 */
44void fsnotify_recalc_global_mask(void)
45{
46 struct fsnotify_group *group;
47 __u32 mask = 0;
48 int idx;
49
50 idx = srcu_read_lock(&fsnotify_grp_srcu);
51 list_for_each_entry_rcu(group, &fsnotify_groups, group_list)
52 mask |= group->mask;
53 srcu_read_unlock(&fsnotify_grp_srcu, idx);
54 fsnotify_mask = mask;
55}
56
57/*
58 * Update the group->mask by running all of the marks associated with this
59 * group and finding the bitwise | of all of the mark->mask. If we change
60 * the group->mask we need to update the global mask of events interesting
61 * to the system.
62 */
63void fsnotify_recalc_group_mask(struct fsnotify_group *group)
64{
65 __u32 mask = 0;
66 __u32 old_mask = group->mask;
67 struct fsnotify_mark_entry *entry;
68
69 spin_lock(&group->mark_lock);
70 list_for_each_entry(entry, &group->mark_entries, g_list)
71 mask |= entry->mask;
72 spin_unlock(&group->mark_lock);
73
74 group->mask = mask;
75
76 if (old_mask != mask)
77 fsnotify_recalc_global_mask();
78}
79
80/*
81 * Take a reference to a group so things found under the fsnotify_grp_mutex
82 * can't get freed under us
83 */
84static void fsnotify_get_group(struct fsnotify_group *group)
85{
86 atomic_inc(&group->refcnt);
87}
88
89/* 31/*
90 * Final freeing of a group 32 * Final freeing of a group
91 */ 33 */
@@ -110,145 +52,53 @@ void fsnotify_final_destroy_group(struct fsnotify_group *group)
110 */ 52 */
111static void fsnotify_destroy_group(struct fsnotify_group *group) 53static void fsnotify_destroy_group(struct fsnotify_group *group)
112{ 54{
113 /* clear all inode mark entries for this group */ 55 /* clear all inode marks for this group */
114 fsnotify_clear_marks_by_group(group); 56 fsnotify_clear_marks_by_group(group);
115 57
58 synchronize_srcu(&fsnotify_mark_srcu);
59
116 /* past the point of no return, matches the initial value of 1 */ 60 /* past the point of no return, matches the initial value of 1 */
117 if (atomic_dec_and_test(&group->num_marks)) 61 if (atomic_dec_and_test(&group->num_marks))
118 fsnotify_final_destroy_group(group); 62 fsnotify_final_destroy_group(group);
119} 63}
120 64
121/* 65/*
122 * Remove this group from the global list of groups that will get events
123 * this can be done even if there are still references and things still using
124 * this group. This just stops the group from getting new events.
125 */
126static void __fsnotify_evict_group(struct fsnotify_group *group)
127{
128 BUG_ON(!mutex_is_locked(&fsnotify_grp_mutex));
129
130 if (group->on_group_list)
131 list_del_rcu(&group->group_list);
132 group->on_group_list = 0;
133}
134
135/*
136 * Called when a group is no longer interested in getting events. This can be
137 * used if a group is misbehaving or if for some reason a group should no longer
138 * get any filesystem events.
139 */
140void fsnotify_evict_group(struct fsnotify_group *group)
141{
142 mutex_lock(&fsnotify_grp_mutex);
143 __fsnotify_evict_group(group);
144 mutex_unlock(&fsnotify_grp_mutex);
145}
146
147/*
148 * Drop a reference to a group. Free it if it's through. 66 * Drop a reference to a group. Free it if it's through.
149 */ 67 */
150void fsnotify_put_group(struct fsnotify_group *group) 68void fsnotify_put_group(struct fsnotify_group *group)
151{ 69{
152 if (!atomic_dec_and_mutex_lock(&group->refcnt, &fsnotify_grp_mutex)) 70 if (atomic_dec_and_test(&group->refcnt))
153 return; 71 fsnotify_destroy_group(group);
154
155 /*
156 * OK, now we know that there's no other users *and* we hold mutex,
157 * so no new references will appear
158 */
159 __fsnotify_evict_group(group);
160
161 /*
162 * now it's off the list, so the only thing we might care about is
163 * srcu access....
164 */
165 mutex_unlock(&fsnotify_grp_mutex);
166 synchronize_srcu(&fsnotify_grp_srcu);
167
168 /* and now it is really dead. _Nothing_ could be seeing it */
169 fsnotify_recalc_global_mask();
170 fsnotify_destroy_group(group);
171}
172
173/*
174 * Simply run the fsnotify_groups list and find a group which matches
175 * the given parameters. If a group is found we take a reference to that
176 * group.
177 */
178static struct fsnotify_group *fsnotify_find_group(unsigned int group_num, __u32 mask,
179 const struct fsnotify_ops *ops)
180{
181 struct fsnotify_group *group_iter;
182 struct fsnotify_group *group = NULL;
183
184 BUG_ON(!mutex_is_locked(&fsnotify_grp_mutex));
185
186 list_for_each_entry_rcu(group_iter, &fsnotify_groups, group_list) {
187 if (group_iter->group_num == group_num) {
188 if ((group_iter->mask == mask) &&
189 (group_iter->ops == ops)) {
190 fsnotify_get_group(group_iter);
191 group = group_iter;
192 } else
193 group = ERR_PTR(-EEXIST);
194 }
195 }
196 return group;
197} 72}
198 73
199/* 74/*
200 * Either finds an existing group which matches the group_num, mask, and ops or 75 * Create a new fsnotify_group and hold a reference for the group returned.
201 * creates a new group and adds it to the global group list. In either case we
202 * take a reference for the group returned.
203 */ 76 */
204struct fsnotify_group *fsnotify_obtain_group(unsigned int group_num, __u32 mask, 77struct fsnotify_group *fsnotify_alloc_group(const struct fsnotify_ops *ops)
205 const struct fsnotify_ops *ops)
206{ 78{
207 struct fsnotify_group *group, *tgroup; 79 struct fsnotify_group *group;
208 80
209 /* very low use, simpler locking if we just always alloc */ 81 group = kzalloc(sizeof(struct fsnotify_group), GFP_KERNEL);
210 group = kmalloc(sizeof(struct fsnotify_group), GFP_KERNEL);
211 if (!group) 82 if (!group)
212 return ERR_PTR(-ENOMEM); 83 return ERR_PTR(-ENOMEM);
213 84
 85 /* set to 0 when there are no external references to this group */
214 atomic_set(&group->refcnt, 1); 86 atomic_set(&group->refcnt, 1);
215 87 /*
216 group->on_group_list = 0; 88 * hits 0 when there are no external references AND no marks for
217 group->group_num = group_num; 89 * this group
218 group->mask = mask; 90 */
91 atomic_set(&group->num_marks, 1);
219 92
220 mutex_init(&group->notification_mutex); 93 mutex_init(&group->notification_mutex);
221 INIT_LIST_HEAD(&group->notification_list); 94 INIT_LIST_HEAD(&group->notification_list);
222 init_waitqueue_head(&group->notification_waitq); 95 init_waitqueue_head(&group->notification_waitq);
223 group->q_len = 0;
224 group->max_events = UINT_MAX; 96 group->max_events = UINT_MAX;
225 97
226 spin_lock_init(&group->mark_lock); 98 spin_lock_init(&group->mark_lock);
227 atomic_set(&group->num_marks, 0); 99 INIT_LIST_HEAD(&group->marks_list);
228 INIT_LIST_HEAD(&group->mark_entries);
229 100
230 group->ops = ops; 101 group->ops = ops;
231 102
232 mutex_lock(&fsnotify_grp_mutex);
233 tgroup = fsnotify_find_group(group_num, mask, ops);
234 if (tgroup) {
235 /* group already exists */
236 mutex_unlock(&fsnotify_grp_mutex);
237 /* destroy the new one we made */
238 fsnotify_put_group(group);
239 return tgroup;
240 }
241
242 /* group not found, add a new one */
243 list_add_rcu(&group->group_list, &fsnotify_groups);
244 group->on_group_list = 1;
245 /* being on the fsnotify_groups list holds one num_marks */
246 atomic_inc(&group->num_marks);
247
248 mutex_unlock(&fsnotify_grp_mutex);
249
250 if (mask)
251 fsnotify_recalc_global_mask();
252
253 return group; 103 return group;
254} 104}
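With the global group list gone, a group's lifetime reduces to the two counters initialized above. A minimal sketch of the resulting backend-side usage, assuming a placeholder my_fsnotify_ops:

#include <linux/err.h>
#include <linux/fsnotify_backend.h>

static const struct fsnotify_ops my_fsnotify_ops;  /* placeholder backend ops */

static int my_backend_init(void)
{
	struct fsnotify_group *group;

	/* refcnt = 1 (our reference), num_marks = 1 (the group itself) */
	group = fsnotify_alloc_group(&my_fsnotify_ops);
	if (IS_ERR(group))
		return PTR_ERR(group);

	/* ... add marks, queue and consume events ... */

	/*
	 * Dropping the last reference clears the group's marks, waits
	 * out SRCU readers, and frees the group once num_marks falls to
	 * zero -- there is no separate evict step any more.
	 */
	fsnotify_put_group(group);
	return 0;
}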
diff --git a/fs/notify/inode_mark.c b/fs/notify/inode_mark.c
index 0399bcbe09c8..33297c005060 100644
--- a/fs/notify/inode_mark.c
+++ b/fs/notify/inode_mark.c
@@ -16,72 +16,6 @@
16 * the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA. 16 * the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
17 */ 17 */
18 18
19/*
20 * fsnotify inode mark locking/lifetime/and refcnting
21 *
22 * REFCNT:
23 * The mark->refcnt tells how many "things" in the kernel currently are
24 * referencing this object. The object typically will live inside the kernel
25 * with a refcnt of 2, one for each list it is on (i_list, g_list). Any task
 26 * which can find this object holding the appropriate locks, can take a reference
 27 * and the object itself is guaranteed to survive until the reference is dropped.
28 *
29 * LOCKING:
30 * There are 3 spinlocks involved with fsnotify inode marks and they MUST
31 * be taken in order as follows:
32 *
33 * entry->lock
34 * group->mark_lock
35 * inode->i_lock
36 *
37 * entry->lock protects 2 things, entry->group and entry->inode. You must hold
38 * that lock to dereference either of these things (they could be NULL even with
39 * the lock)
40 *
41 * group->mark_lock protects the mark_entries list anchored inside a given group
42 * and each entry is hooked via the g_list. It also sorta protects the
43 * free_g_list, which when used is anchored by a private list on the stack of the
44 * task which held the group->mark_lock.
45 *
46 * inode->i_lock protects the i_fsnotify_mark_entries list anchored inside a
47 * given inode and each entry is hooked via the i_list. (and sorta the
48 * free_i_list)
49 *
50 *
51 * LIFETIME:
52 * Inode marks survive between when they are added to an inode and when their
53 * refcnt==0.
54 *
55 * The inode mark can be cleared for a number of different reasons including:
56 * - The inode is unlinked for the last time. (fsnotify_inode_remove)
57 * - The inode is being evicted from cache. (fsnotify_inode_delete)
58 * - The fs the inode is on is unmounted. (fsnotify_inode_delete/fsnotify_unmount_inodes)
59 * - Something explicitly requests that it be removed. (fsnotify_destroy_mark_by_entry)
60 * - The fsnotify_group associated with the mark is going away and all such marks
61 * need to be cleaned up. (fsnotify_clear_marks_by_group)
62 *
63 * Worst case we are given an inode and need to clean up all the marks on that
64 * inode. We take i_lock and walk the i_fsnotify_mark_entries safely. For each
65 * mark on the list we take a reference (so the mark can't disappear under us).
 66 * We remove that mark from the inode's list of marks and we add this mark to a
67 * private list anchored on the stack using i_free_list; At this point we no
68 * longer fear anything finding the mark using the inode's list of marks.
69 *
70 * We can safely and locklessly run the private list on the stack of everything
71 * we just unattached from the original inode. For each mark on the private list
 72 * we grab the mark->lock and can thus dereference mark->group and mark->inode. If
73 * we see the group and inode are not NULL we take those locks. Now holding all
74 * 3 locks we can completely remove the mark from other tasks finding it in the
75 * future. Remember, 10 things might already be referencing this mark, but they
76 * better be holding a ref. We drop our reference we took before we unhooked it
77 * from the inode. When the ref hits 0 we can free the mark.
78 *
79 * Very similarly for freeing by group, except we use free_g_list.
80 *
81 * This has the very interesting property of being able to run concurrently with
82 * any (or all) other directions.
83 */
84
85#include <linux/fs.h> 19#include <linux/fs.h>
86#include <linux/init.h> 20#include <linux/init.h>
87#include <linux/kernel.h> 21#include <linux/kernel.h>
@@ -95,30 +29,19 @@
95#include <linux/fsnotify_backend.h> 29#include <linux/fsnotify_backend.h>
96#include "fsnotify.h" 30#include "fsnotify.h"
97 31
98void fsnotify_get_mark(struct fsnotify_mark_entry *entry)
99{
100 atomic_inc(&entry->refcnt);
101}
102
103void fsnotify_put_mark(struct fsnotify_mark_entry *entry)
104{
105 if (atomic_dec_and_test(&entry->refcnt))
106 entry->free_mark(entry);
107}
108
109/* 32/*
110 * Recalculate the mask of events relevant to a given inode locked. 33 * Recalculate the mask of events relevant to a given inode locked.
111 */ 34 */
112static void fsnotify_recalc_inode_mask_locked(struct inode *inode) 35static void fsnotify_recalc_inode_mask_locked(struct inode *inode)
113{ 36{
114 struct fsnotify_mark_entry *entry; 37 struct fsnotify_mark *mark;
115 struct hlist_node *pos; 38 struct hlist_node *pos;
116 __u32 new_mask = 0; 39 __u32 new_mask = 0;
117 40
118 assert_spin_locked(&inode->i_lock); 41 assert_spin_locked(&inode->i_lock);
119 42
120 hlist_for_each_entry(entry, pos, &inode->i_fsnotify_mark_entries, i_list) 43 hlist_for_each_entry(mark, pos, &inode->i_fsnotify_marks, i.i_list)
121 new_mask |= entry->mask; 44 new_mask |= mark->mask;
122 inode->i_fsnotify_mask = new_mask; 45 inode->i_fsnotify_mask = new_mask;
123} 46}
124 47
@@ -135,107 +58,26 @@ void fsnotify_recalc_inode_mask(struct inode *inode)
135 __fsnotify_update_child_dentry_flags(inode); 58 __fsnotify_update_child_dentry_flags(inode);
136} 59}
137 60
138/* 61void fsnotify_destroy_inode_mark(struct fsnotify_mark *mark)
139 * Any time a mark is getting freed we end up here.
140 * The caller had better be holding a reference to this mark so we don't actually
141 * do the final put under the entry->lock
142 */
143void fsnotify_destroy_mark_by_entry(struct fsnotify_mark_entry *entry)
144{ 62{
145 struct fsnotify_group *group; 63 struct inode *inode = mark->i.inode;
146 struct inode *inode;
147 64
148 spin_lock(&entry->lock); 65 assert_spin_locked(&mark->lock);
66 assert_spin_locked(&mark->group->mark_lock);
149 67
150 group = entry->group;
151 inode = entry->inode;
152
153 BUG_ON(group && !inode);
154 BUG_ON(!group && inode);
155
156 /* if !group something else already marked this to die */
157 if (!group) {
158 spin_unlock(&entry->lock);
159 return;
160 }
161
162 /* 1 from caller and 1 for being on i_list/g_list */
163 BUG_ON(atomic_read(&entry->refcnt) < 2);
164
165 spin_lock(&group->mark_lock);
166 spin_lock(&inode->i_lock); 68 spin_lock(&inode->i_lock);
167 69
168 hlist_del_init(&entry->i_list); 70 hlist_del_init_rcu(&mark->i.i_list);
169 entry->inode = NULL; 71 mark->i.inode = NULL;
170
171 list_del_init(&entry->g_list);
172 entry->group = NULL;
173
174 fsnotify_put_mark(entry); /* for i_list and g_list */
175 72
176 /* 73 /*
177 * this mark is now off the inode->i_fsnotify_mark_entries list and we 74 * this mark is now off the inode->i_fsnotify_marks list and we
178 * hold the inode->i_lock, so this is the perfect time to update the 75 * hold the inode->i_lock, so this is the perfect time to update the
179 * inode->i_fsnotify_mask 76 * inode->i_fsnotify_mask
180 */ 77 */
181 fsnotify_recalc_inode_mask_locked(inode); 78 fsnotify_recalc_inode_mask_locked(inode);
182 79
183 spin_unlock(&inode->i_lock); 80 spin_unlock(&inode->i_lock);
184 spin_unlock(&group->mark_lock);
185 spin_unlock(&entry->lock);
186
187 /*
188 * Some groups like to know that marks are being freed. This is a
189 * callback to the group function to let it know that this entry
190 * is being freed.
191 */
192 if (group->ops->freeing_mark)
193 group->ops->freeing_mark(entry, group);
194
195 /*
196 * __fsnotify_update_child_dentry_flags(inode);
197 *
198 * I really want to call that, but we can't, we have no idea if the inode
199 * still exists the second we drop the entry->lock.
200 *
 201 * The next time an event arrives at this inode from one of its children
 202 * __fsnotify_parent will see that the inode doesn't care about its
203 * children and will update all of these flags then. So really this
204 * is just a lazy update (and could be a perf win...)
205 */
206
207
208 iput(inode);
209
210 /*
211 * it's possible that this group tried to destroy itself, but this
212 * this mark was simultaneously being freed by inode. If that's the
213 * case, we finish freeing the group here.
214 */
215 if (unlikely(atomic_dec_and_test(&group->num_marks)))
216 fsnotify_final_destroy_group(group);
217}
218
219/*
220 * Given a group, destroy all of the marks associated with that group.
221 */
222void fsnotify_clear_marks_by_group(struct fsnotify_group *group)
223{
224 struct fsnotify_mark_entry *lentry, *entry;
225 LIST_HEAD(free_list);
226
227 spin_lock(&group->mark_lock);
228 list_for_each_entry_safe(entry, lentry, &group->mark_entries, g_list) {
229 list_add(&entry->free_g_list, &free_list);
230 list_del_init(&entry->g_list);
231 fsnotify_get_mark(entry);
232 }
233 spin_unlock(&group->mark_lock);
234
235 list_for_each_entry_safe(entry, lentry, &free_list, free_g_list) {
236 fsnotify_destroy_mark_by_entry(entry);
237 fsnotify_put_mark(entry);
238 }
239} 81}
240 82
241/* 83/*
@@ -243,112 +85,145 @@ void fsnotify_clear_marks_by_group(struct fsnotify_group *group)
243 */ 85 */
244void fsnotify_clear_marks_by_inode(struct inode *inode) 86void fsnotify_clear_marks_by_inode(struct inode *inode)
245{ 87{
246 struct fsnotify_mark_entry *entry, *lentry; 88 struct fsnotify_mark *mark, *lmark;
247 struct hlist_node *pos, *n; 89 struct hlist_node *pos, *n;
248 LIST_HEAD(free_list); 90 LIST_HEAD(free_list);
249 91
250 spin_lock(&inode->i_lock); 92 spin_lock(&inode->i_lock);
251 hlist_for_each_entry_safe(entry, pos, n, &inode->i_fsnotify_mark_entries, i_list) { 93 hlist_for_each_entry_safe(mark, pos, n, &inode->i_fsnotify_marks, i.i_list) {
252 list_add(&entry->free_i_list, &free_list); 94 list_add(&mark->i.free_i_list, &free_list);
253 hlist_del_init(&entry->i_list); 95 hlist_del_init_rcu(&mark->i.i_list);
254 fsnotify_get_mark(entry); 96 fsnotify_get_mark(mark);
255 } 97 }
256 spin_unlock(&inode->i_lock); 98 spin_unlock(&inode->i_lock);
257 99
258 list_for_each_entry_safe(entry, lentry, &free_list, free_i_list) { 100 list_for_each_entry_safe(mark, lmark, &free_list, i.free_i_list) {
259 fsnotify_destroy_mark_by_entry(entry); 101 fsnotify_destroy_mark(mark);
260 fsnotify_put_mark(entry); 102 fsnotify_put_mark(mark);
261 } 103 }
262} 104}
263 105
264/* 106/*
107 * Given a group clear all of the inode marks associated with that group.
108 */
109void fsnotify_clear_inode_marks_by_group(struct fsnotify_group *group)
110{
111 fsnotify_clear_marks_by_group_flags(group, FSNOTIFY_MARK_FLAG_INODE);
112}
113
114/*
265 * given a group and inode, find the mark associated with that combination. 115 * given a group and inode, find the mark associated with that combination.
266 * if found take a reference to that mark and return it, else return NULL 116 * if found take a reference to that mark and return it, else return NULL
267 */ 117 */
268struct fsnotify_mark_entry *fsnotify_find_mark_entry(struct fsnotify_group *group, 118struct fsnotify_mark *fsnotify_find_inode_mark_locked(struct fsnotify_group *group,
269 struct inode *inode) 119 struct inode *inode)
270{ 120{
271 struct fsnotify_mark_entry *entry; 121 struct fsnotify_mark *mark;
272 struct hlist_node *pos; 122 struct hlist_node *pos;
273 123
274 assert_spin_locked(&inode->i_lock); 124 assert_spin_locked(&inode->i_lock);
275 125
276 hlist_for_each_entry(entry, pos, &inode->i_fsnotify_mark_entries, i_list) { 126 hlist_for_each_entry(mark, pos, &inode->i_fsnotify_marks, i.i_list) {
277 if (entry->group == group) { 127 if (mark->group == group) {
278 fsnotify_get_mark(entry); 128 fsnotify_get_mark(mark);
279 return entry; 129 return mark;
280 } 130 }
281 } 131 }
282 return NULL; 132 return NULL;
283} 133}
284 134
285/* 135/*
286 * Nothing fancy, just initialize lists and locks and counters. 136 * given a group and inode, find the mark associated with that combination.
137 * if found take a reference to that mark and return it, else return NULL
287 */ 138 */
288void fsnotify_init_mark(struct fsnotify_mark_entry *entry, 139struct fsnotify_mark *fsnotify_find_inode_mark(struct fsnotify_group *group,
289 void (*free_mark)(struct fsnotify_mark_entry *entry)) 140 struct inode *inode)
141{
142 struct fsnotify_mark *mark;
143
144 spin_lock(&inode->i_lock);
145 mark = fsnotify_find_inode_mark_locked(group, inode);
146 spin_unlock(&inode->i_lock);
290 147
148 return mark;
149}
150
151/*
152 * If we are setting a mark mask on an inode mark we should pin the inode
153 * in memory.
154 */
155void fsnotify_set_inode_mark_mask_locked(struct fsnotify_mark *mark,
156 __u32 mask)
291{ 157{
292 spin_lock_init(&entry->lock); 158 struct inode *inode;
293 atomic_set(&entry->refcnt, 1); 159
294 INIT_HLIST_NODE(&entry->i_list); 160 assert_spin_locked(&mark->lock);
295 entry->group = NULL; 161
296 entry->mask = 0; 162 if (mask &&
297 entry->inode = NULL; 163 mark->i.inode &&
298 entry->free_mark = free_mark; 164 !(mark->flags & FSNOTIFY_MARK_FLAG_OBJECT_PINNED)) {
165 mark->flags |= FSNOTIFY_MARK_FLAG_OBJECT_PINNED;
166 inode = igrab(mark->i.inode);
167 /*
168 * we shouldn't be able to get here if the inode wasn't
 169 * already safely held in memory. But BUG() in case it
170 * ever is wrong.
171 */
172 BUG_ON(!inode);
173 }
299} 174}
300 175
301/* 176/*
302 * Attach an initialized mark entry to a given group and inode. 177 * Attach an initialized mark to a given inode.
303 * These marks may be used for the fsnotify backend to determine which 178 * These marks may be used for the fsnotify backend to determine which
304 * event types should be delivered to which group and for which inodes. 179 * event types should be delivered to which group and for which inodes. These
180 * marks are ordered according to the group's location in memory.
305 */ 181 */
306int fsnotify_add_mark(struct fsnotify_mark_entry *entry, 182int fsnotify_add_inode_mark(struct fsnotify_mark *mark,
307 struct fsnotify_group *group, struct inode *inode) 183 struct fsnotify_group *group, struct inode *inode,
184 int allow_dups)
308{ 185{
309 struct fsnotify_mark_entry *lentry; 186 struct fsnotify_mark *lmark;
187 struct hlist_node *node, *last = NULL;
310 int ret = 0; 188 int ret = 0;
311 189
312 inode = igrab(inode); 190 mark->flags |= FSNOTIFY_MARK_FLAG_INODE;
313 if (unlikely(!inode)) 191
314 return -EINVAL; 192 assert_spin_locked(&mark->lock);
193 assert_spin_locked(&group->mark_lock);
315 194
316 /*
317 * LOCKING ORDER!!!!
318 * entry->lock
319 * group->mark_lock
320 * inode->i_lock
321 */
322 spin_lock(&entry->lock);
323 spin_lock(&group->mark_lock);
324 spin_lock(&inode->i_lock); 195 spin_lock(&inode->i_lock);
325 196
326 lentry = fsnotify_find_mark_entry(group, inode); 197 mark->i.inode = inode;
327 if (!lentry) {
328 entry->group = group;
329 entry->inode = inode;
330 198
331 hlist_add_head(&entry->i_list, &inode->i_fsnotify_mark_entries); 199 /* is mark the first mark? */
332 list_add(&entry->g_list, &group->mark_entries); 200 if (hlist_empty(&inode->i_fsnotify_marks)) {
201 hlist_add_head_rcu(&mark->i.i_list, &inode->i_fsnotify_marks);
202 goto out;
203 }
333 204
334 fsnotify_get_mark(entry); /* for i_list and g_list */ 205 /* should mark be in the middle of the current list? */
206 hlist_for_each_entry(lmark, node, &inode->i_fsnotify_marks, i.i_list) {
207 last = node;
208
209 if ((lmark->group == group) && !allow_dups) {
210 ret = -EEXIST;
211 goto out;
212 }
335 213
336 atomic_inc(&group->num_marks); 214 if (mark->group < lmark->group)
215 continue;
337 216
338 fsnotify_recalc_inode_mask_locked(inode); 217 hlist_add_before_rcu(&mark->i.i_list, &lmark->i.i_list);
218 goto out;
339 } 219 }
340 220
221 BUG_ON(last == NULL);
222 /* mark should be the last entry. last is the current last entry */
223 hlist_add_after_rcu(last, &mark->i.i_list);
224out:
225 fsnotify_recalc_inode_mask_locked(inode);
341 spin_unlock(&inode->i_lock); 226 spin_unlock(&inode->i_lock);
342 spin_unlock(&group->mark_lock);
343 spin_unlock(&entry->lock);
344
345 if (lentry) {
346 ret = -EEXIST;
347 iput(inode);
348 fsnotify_put_mark(lentry);
349 } else {
350 __fsnotify_update_child_dentry_flags(inode);
351 }
352 227
353 return ret; 228 return ret;
354} 229}
@@ -369,11 +244,11 @@ void fsnotify_unmount_inodes(struct list_head *list)
369 struct inode *need_iput_tmp; 244 struct inode *need_iput_tmp;
370 245
371 /* 246 /*
372 * We cannot __iget() an inode in state I_CLEAR, I_FREEING, 247 * We cannot __iget() an inode in state I_FREEING,
373 * I_WILL_FREE, or I_NEW which is fine because by that point 248 * I_WILL_FREE, or I_NEW which is fine because by that point
374 * the inode cannot have any associated watches. 249 * the inode cannot have any associated watches.
375 */ 250 */
376 if (inode->i_state & (I_CLEAR|I_FREEING|I_WILL_FREE|I_NEW)) 251 if (inode->i_state & (I_FREEING|I_WILL_FREE|I_NEW))
377 continue; 252 continue;
378 253
379 /* 254 /*
@@ -397,7 +272,7 @@ void fsnotify_unmount_inodes(struct list_head *list)
397 /* In case the dropping of a reference would nuke next_i. */ 272 /* In case the dropping of a reference would nuke next_i. */
398 if ((&next_i->i_sb_list != list) && 273 if ((&next_i->i_sb_list != list) &&
399 atomic_read(&next_i->i_count) && 274 atomic_read(&next_i->i_count) &&
400 !(next_i->i_state & (I_CLEAR | I_FREEING | I_WILL_FREE))) { 275 !(next_i->i_state & (I_FREEING | I_WILL_FREE))) {
401 __iget(next_i); 276 __iget(next_i);
402 need_iput = next_i; 277 need_iput = next_i;
403 } 278 }
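The insertion rule in fsnotify_add_inode_mark() above keeps each list sorted in descending order of the owning group's address, which is what makes the single-pass merge in fsnotify() possible. An illustrative plain-C sketch of that rule, with a numeric key standing in for the group pointer:

/* simplified stand-in for a mark on an object's list */
struct mark { long group_key; struct mark *next; };

/* returns 0 on success, -1 (-EEXIST in the kernel) on a disallowed duplicate */
static int insert_mark(struct mark **head, struct mark *mark, int allow_dups)
{
	struct mark **pos;

	for (pos = head; *pos; pos = &(*pos)->next) {
		if ((*pos)->group_key == mark->group_key && !allow_dups)
			return -1;
		if (mark->group_key < (*pos)->group_key)
			continue;       /* keep walking toward smaller keys */
		break;                  /* insert before the first smaller-or-equal key */
	}
	mark->next = *pos;              /* also covers the empty-list and tail cases */
	*pos = mark;
	return 0;
}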
diff --git a/fs/notify/inotify/Kconfig b/fs/notify/inotify/Kconfig
index b3a159b21cfd..b981fc0c8379 100644
--- a/fs/notify/inotify/Kconfig
+++ b/fs/notify/inotify/Kconfig
@@ -1,18 +1,3 @@
1config INOTIFY
2 bool "Inotify file change notification support"
3 default n
4 ---help---
5 Say Y here to enable legacy in kernel inotify support. Inotify is a
6 file change notification system. It is a replacement for dnotify.
7 This option only provides the legacy inotify in kernel API. There
8 are no in tree kernel users of this interface since it is deprecated.
9 You only need this if you are loading an out of tree kernel module
10 that uses inotify.
11
12 For more information, see <file:Documentation/filesystems/inotify.txt>
13
14 If unsure, say N.
15
16config INOTIFY_USER 1config INOTIFY_USER
17 bool "Inotify support for userspace" 2 bool "Inotify support for userspace"
18 select ANON_INODES 3 select ANON_INODES
diff --git a/fs/notify/inotify/Makefile b/fs/notify/inotify/Makefile
index 943828171362..a380dabe09de 100644
--- a/fs/notify/inotify/Makefile
+++ b/fs/notify/inotify/Makefile
@@ -1,2 +1 @@
1obj-$(CONFIG_INOTIFY) += inotify.o
2obj-$(CONFIG_INOTIFY_USER) += inotify_fsnotify.o inotify_user.o obj-$(CONFIG_INOTIFY_USER) += inotify_fsnotify.o inotify_user.o
diff --git a/fs/notify/inotify/inotify.c b/fs/notify/inotify/inotify.c
deleted file mode 100644
index 27b75ebc7460..000000000000
--- a/fs/notify/inotify/inotify.c
+++ /dev/null
@@ -1,873 +0,0 @@
1/*
2 * fs/inotify.c - inode-based file event notifications
3 *
4 * Authors:
5 * John McCutchan <ttb@tentacle.dhs.org>
6 * Robert Love <rml@novell.com>
7 *
8 * Kernel API added by: Amy Griffis <amy.griffis@hp.com>
9 *
10 * Copyright (C) 2005 John McCutchan
11 * Copyright 2006 Hewlett-Packard Development Company, L.P.
12 *
13 * This program is free software; you can redistribute it and/or modify it
14 * under the terms of the GNU General Public License as published by the
15 * Free Software Foundation; either version 2, or (at your option) any
16 * later version.
17 *
18 * This program is distributed in the hope that it will be useful, but
19 * WITHOUT ANY WARRANTY; without even the implied warranty of
20 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
21 * General Public License for more details.
22 */
23
24#include <linux/module.h>
25#include <linux/kernel.h>
26#include <linux/spinlock.h>
27#include <linux/idr.h>
28#include <linux/slab.h>
29#include <linux/fs.h>
30#include <linux/sched.h>
31#include <linux/init.h>
32#include <linux/list.h>
33#include <linux/writeback.h>
34#include <linux/inotify.h>
35#include <linux/fsnotify_backend.h>
36
37static atomic_t inotify_cookie;
38
39/*
40 * Lock ordering:
41 *
42 * dentry->d_lock (used to keep d_move() away from dentry->d_parent)
43 * iprune_mutex (synchronize shrink_icache_memory())
44 * inode_lock (protects the super_block->s_inodes list)
45 * inode->inotify_mutex (protects inode->inotify_watches and watches->i_list)
46 * inotify_handle->mutex (protects inotify_handle and watches->h_list)
47 *
 48 * The inode->inotify_mutex and inotify_handle->mutex are held during execution
49 * of a caller's event handler. Thus, the caller must not hold any locks
50 * taken in their event handler while calling any of the published inotify
51 * interfaces.
52 */
53
54/*
55 * Lifetimes of the three main data structures--inotify_handle, inode, and
56 * inotify_watch--are managed by reference count.
57 *
58 * inotify_handle: Lifetime is from inotify_init() to inotify_destroy().
59 * Additional references can bump the count via get_inotify_handle() and drop
60 * the count via put_inotify_handle().
61 *
62 * inotify_watch: for inotify's purposes, lifetime is from inotify_add_watch()
63 * to remove_watch_no_event(). Additional references can bump the count via
64 * get_inotify_watch() and drop the count via put_inotify_watch(). The caller
 65 * is responsible for the final put after receiving IN_IGNORED, or when using
66 * IN_ONESHOT after receiving the first event. Inotify does the final put if
67 * inotify_destroy() is called.
68 *
69 * inode: Pinned so long as the inode is associated with a watch, from
70 * inotify_add_watch() to the final put_inotify_watch().
71 */
72
73/*
74 * struct inotify_handle - represents an inotify instance
75 *
76 * This structure is protected by the mutex 'mutex'.
77 */
78struct inotify_handle {
79 struct idr idr; /* idr mapping wd -> watch */
80 struct mutex mutex; /* protects this bad boy */
81 struct list_head watches; /* list of watches */
82 atomic_t count; /* reference count */
83 u32 last_wd; /* the last wd allocated */
84 const struct inotify_operations *in_ops; /* inotify caller operations */
85};
86
87static inline void get_inotify_handle(struct inotify_handle *ih)
88{
89 atomic_inc(&ih->count);
90}
91
92static inline void put_inotify_handle(struct inotify_handle *ih)
93{
94 if (atomic_dec_and_test(&ih->count)) {
95 idr_destroy(&ih->idr);
96 kfree(ih);
97 }
98}
99
100/**
101 * get_inotify_watch - grab a reference to an inotify_watch
102 * @watch: watch to grab
103 */
104void get_inotify_watch(struct inotify_watch *watch)
105{
106 atomic_inc(&watch->count);
107}
108EXPORT_SYMBOL_GPL(get_inotify_watch);
109
110int pin_inotify_watch(struct inotify_watch *watch)
111{
112 struct super_block *sb = watch->inode->i_sb;
113 if (atomic_inc_not_zero(&sb->s_active)) {
114 atomic_inc(&watch->count);
115 return 1;
116 }
117 return 0;
118}
119
120/**
121 * put_inotify_watch - decrements the ref count on a given watch. cleans up
122 * watch references if the count reaches zero. inotify_watch is freed by
123 * inotify callers via the destroy_watch() op.
124 * @watch: watch to release
125 */
126void put_inotify_watch(struct inotify_watch *watch)
127{
128 if (atomic_dec_and_test(&watch->count)) {
129 struct inotify_handle *ih = watch->ih;
130
131 iput(watch->inode);
132 ih->in_ops->destroy_watch(watch);
133 put_inotify_handle(ih);
134 }
135}
136EXPORT_SYMBOL_GPL(put_inotify_watch);
137
138void unpin_inotify_watch(struct inotify_watch *watch)
139{
140 struct super_block *sb = watch->inode->i_sb;
141 put_inotify_watch(watch);
142 deactivate_super(sb);
143}
144
145/*
146 * inotify_handle_get_wd - returns the next WD for use by the given handle
147 *
148 * Callers must hold ih->mutex. This function can sleep.
149 */
150static int inotify_handle_get_wd(struct inotify_handle *ih,
151 struct inotify_watch *watch)
152{
153 int ret;
154
155 do {
156 if (unlikely(!idr_pre_get(&ih->idr, GFP_NOFS)))
157 return -ENOSPC;
158 ret = idr_get_new_above(&ih->idr, watch, ih->last_wd+1, &watch->wd);
159 } while (ret == -EAGAIN);
160
161 if (likely(!ret))
162 ih->last_wd = watch->wd;
163
164 return ret;
165}
166
167/*
168 * inotify_inode_watched - returns nonzero if there are watches on this inode
169 * and zero otherwise. We call this lockless, we do not care if we race.
170 */
171static inline int inotify_inode_watched(struct inode *inode)
172{
173 return !list_empty(&inode->inotify_watches);
174}
175
176/*
 177 * Get child dentry flag into sync with parent inode.
 178 * Flag should always be clear for negative dentries.
179 */
180static void set_dentry_child_flags(struct inode *inode, int watched)
181{
182 struct dentry *alias;
183
184 spin_lock(&dcache_lock);
185 list_for_each_entry(alias, &inode->i_dentry, d_alias) {
186 struct dentry *child;
187
188 list_for_each_entry(child, &alias->d_subdirs, d_u.d_child) {
189 if (!child->d_inode)
190 continue;
191
192 spin_lock(&child->d_lock);
193 if (watched)
194 child->d_flags |= DCACHE_INOTIFY_PARENT_WATCHED;
195 else
196 child->d_flags &=~DCACHE_INOTIFY_PARENT_WATCHED;
197 spin_unlock(&child->d_lock);
198 }
199 }
200 spin_unlock(&dcache_lock);
201}
202
203/*
204 * inotify_find_handle - find the watch associated with the given inode and
205 * handle
206 *
207 * Callers must hold inode->inotify_mutex.
208 */
209static struct inotify_watch *inode_find_handle(struct inode *inode,
210 struct inotify_handle *ih)
211{
212 struct inotify_watch *watch;
213
214 list_for_each_entry(watch, &inode->inotify_watches, i_list) {
215 if (watch->ih == ih)
216 return watch;
217 }
218
219 return NULL;
220}
221
222/*
223 * remove_watch_no_event - remove watch without the IN_IGNORED event.
224 *
225 * Callers must hold both inode->inotify_mutex and ih->mutex.
226 */
227static void remove_watch_no_event(struct inotify_watch *watch,
228 struct inotify_handle *ih)
229{
230 list_del(&watch->i_list);
231 list_del(&watch->h_list);
232
233 if (!inotify_inode_watched(watch->inode))
234 set_dentry_child_flags(watch->inode, 0);
235
236 idr_remove(&ih->idr, watch->wd);
237}
238
239/**
240 * inotify_remove_watch_locked - Remove a watch from both the handle and the
241 * inode. Sends the IN_IGNORED event signifying that the inode is no longer
242 * watched. May be invoked from a caller's event handler.
243 * @ih: inotify handle associated with watch
244 * @watch: watch to remove
245 *
246 * Callers must hold both inode->inotify_mutex and ih->mutex.
247 */
248void inotify_remove_watch_locked(struct inotify_handle *ih,
249 struct inotify_watch *watch)
250{
251 remove_watch_no_event(watch, ih);
252 ih->in_ops->handle_event(watch, watch->wd, IN_IGNORED, 0, NULL, NULL);
253}
254EXPORT_SYMBOL_GPL(inotify_remove_watch_locked);
255
256/* Kernel API for producing events */
257
258/*
259 * inotify_d_instantiate - instantiate dcache entry for inode
260 */
261void inotify_d_instantiate(struct dentry *entry, struct inode *inode)
262{
263 struct dentry *parent;
264
265 if (!inode)
266 return;
267
268 spin_lock(&entry->d_lock);
269 parent = entry->d_parent;
270 if (parent->d_inode && inotify_inode_watched(parent->d_inode))
271 entry->d_flags |= DCACHE_INOTIFY_PARENT_WATCHED;
272 spin_unlock(&entry->d_lock);
273}
274
275/*
276 * inotify_d_move - dcache entry has been moved
277 */
278void inotify_d_move(struct dentry *entry)
279{
280 struct dentry *parent;
281
282 parent = entry->d_parent;
283 if (inotify_inode_watched(parent->d_inode))
284 entry->d_flags |= DCACHE_INOTIFY_PARENT_WATCHED;
285 else
286 entry->d_flags &= ~DCACHE_INOTIFY_PARENT_WATCHED;
287}
288
289/**
290 * inotify_inode_queue_event - queue an event to all watches on this inode
291 * @inode: inode event is originating from
292 * @mask: event mask describing this event
293 * @cookie: cookie for synchronization, or zero
294 * @name: filename, if any
295 * @n_inode: inode associated with name
296 */
297void inotify_inode_queue_event(struct inode *inode, u32 mask, u32 cookie,
298 const char *name, struct inode *n_inode)
299{
300 struct inotify_watch *watch, *next;
301
302 if (!inotify_inode_watched(inode))
303 return;
304
305 mutex_lock(&inode->inotify_mutex);
306 list_for_each_entry_safe(watch, next, &inode->inotify_watches, i_list) {
307 u32 watch_mask = watch->mask;
308 if (watch_mask & mask) {
309 struct inotify_handle *ih= watch->ih;
310 mutex_lock(&ih->mutex);
311 if (watch_mask & IN_ONESHOT)
312 remove_watch_no_event(watch, ih);
313 ih->in_ops->handle_event(watch, watch->wd, mask, cookie,
314 name, n_inode);
315 mutex_unlock(&ih->mutex);
316 }
317 }
318 mutex_unlock(&inode->inotify_mutex);
319}
320EXPORT_SYMBOL_GPL(inotify_inode_queue_event);
321
322/**
323 * inotify_dentry_parent_queue_event - queue an event to a dentry's parent
324 * @dentry: the dentry in question, we queue against this dentry's parent
325 * @mask: event mask describing this event
326 * @cookie: cookie for synchronization, or zero
327 * @name: filename, if any
328 */
329void inotify_dentry_parent_queue_event(struct dentry *dentry, u32 mask,
330 u32 cookie, const char *name)
331{
332 struct dentry *parent;
333 struct inode *inode;
334
335 if (!(dentry->d_flags & DCACHE_INOTIFY_PARENT_WATCHED))
336 return;
337
338 spin_lock(&dentry->d_lock);
339 parent = dentry->d_parent;
340 inode = parent->d_inode;
341
342 if (inotify_inode_watched(inode)) {
343 dget(parent);
344 spin_unlock(&dentry->d_lock);
345 inotify_inode_queue_event(inode, mask, cookie, name,
346 dentry->d_inode);
347 dput(parent);
348 } else
349 spin_unlock(&dentry->d_lock);
350}
351EXPORT_SYMBOL_GPL(inotify_dentry_parent_queue_event);
352
353/**
354 * inotify_get_cookie - return a unique cookie for use in synchronizing events.
355 */
356u32 inotify_get_cookie(void)
357{
358 return atomic_inc_return(&inotify_cookie);
359}
360EXPORT_SYMBOL_GPL(inotify_get_cookie);
361
362/**
363 * inotify_unmount_inodes - an sb is unmounting. handle any watched inodes.
364 * @list: list of inodes being unmounted (sb->s_inodes)
365 *
366 * Called with inode_lock held, protecting the unmounting super block's list
367 * of inodes, and with iprune_mutex held, keeping shrink_icache_memory() at bay.
368 * We temporarily drop inode_lock, however, and CAN block.
369 */
370void inotify_unmount_inodes(struct list_head *list)
371{
372 struct inode *inode, *next_i, *need_iput = NULL;
373
374 list_for_each_entry_safe(inode, next_i, list, i_sb_list) {
375 struct inotify_watch *watch, *next_w;
376 struct inode *need_iput_tmp;
377 struct list_head *watches;
378
379 /*
380 * We cannot __iget() an inode in state I_CLEAR, I_FREEING,
381 * I_WILL_FREE, or I_NEW which is fine because by that point
382 * the inode cannot have any associated watches.
383 */
384 if (inode->i_state & (I_CLEAR|I_FREEING|I_WILL_FREE|I_NEW))
385 continue;
386
387 /*
388 * If i_count is zero, the inode cannot have any watches and
389 * doing an __iget/iput with MS_ACTIVE clear would actually
390 * evict all inodes with zero i_count from icache which is
391 * unnecessarily violent and may in fact be illegal to do.
392 */
393 if (!atomic_read(&inode->i_count))
394 continue;
395
396 need_iput_tmp = need_iput;
397 need_iput = NULL;
398 /* In case inotify_remove_watch_locked() drops a reference. */
399 if (inode != need_iput_tmp)
400 __iget(inode);
401 else
402 need_iput_tmp = NULL;
403 /* In case the dropping of a reference would nuke next_i. */
404 if ((&next_i->i_sb_list != list) &&
405 atomic_read(&next_i->i_count) &&
406 !(next_i->i_state & (I_CLEAR | I_FREEING |
407 I_WILL_FREE))) {
408 __iget(next_i);
409 need_iput = next_i;
410 }
411
412 /*
413 * We can safely drop inode_lock here because we hold
414 * references on both inode and next_i. Also no new inodes
415 * will be added since the umount has begun. Finally,
416 * iprune_mutex keeps shrink_icache_memory() away.
417 */
418 spin_unlock(&inode_lock);
419
420 if (need_iput_tmp)
421 iput(need_iput_tmp);
422
423 /* for each watch, send IN_UNMOUNT and then remove it */
424 mutex_lock(&inode->inotify_mutex);
425 watches = &inode->inotify_watches;
426 list_for_each_entry_safe(watch, next_w, watches, i_list) {
427 struct inotify_handle *ih= watch->ih;
428 get_inotify_watch(watch);
429 mutex_lock(&ih->mutex);
430 ih->in_ops->handle_event(watch, watch->wd, IN_UNMOUNT, 0,
431 NULL, NULL);
432 inotify_remove_watch_locked(ih, watch);
433 mutex_unlock(&ih->mutex);
434 put_inotify_watch(watch);
435 }
436 mutex_unlock(&inode->inotify_mutex);
437 iput(inode);
438
439 spin_lock(&inode_lock);
440 }
441}
442EXPORT_SYMBOL_GPL(inotify_unmount_inodes);
443
444/**
445 * inotify_inode_is_dead - an inode has been deleted, cleanup any watches
446 * @inode: inode that is about to be removed
447 */
448void inotify_inode_is_dead(struct inode *inode)
449{
450 struct inotify_watch *watch, *next;
451
452 mutex_lock(&inode->inotify_mutex);
453 list_for_each_entry_safe(watch, next, &inode->inotify_watches, i_list) {
454 struct inotify_handle *ih = watch->ih;
455 mutex_lock(&ih->mutex);
456 inotify_remove_watch_locked(ih, watch);
457 mutex_unlock(&ih->mutex);
458 }
459 mutex_unlock(&inode->inotify_mutex);
460}
461EXPORT_SYMBOL_GPL(inotify_inode_is_dead);
462
463/* Kernel Consumer API */
464
465/**
466 * inotify_init - allocate and initialize an inotify instance
467 * @ops: caller's inotify operations
468 */
469struct inotify_handle *inotify_init(const struct inotify_operations *ops)
470{
471 struct inotify_handle *ih;
472
473 ih = kmalloc(sizeof(struct inotify_handle), GFP_KERNEL);
474 if (unlikely(!ih))
475 return ERR_PTR(-ENOMEM);
476
477 idr_init(&ih->idr);
478 INIT_LIST_HEAD(&ih->watches);
479 mutex_init(&ih->mutex);
480 ih->last_wd = 0;
481 ih->in_ops = ops;
482 atomic_set(&ih->count, 0);
483 get_inotify_handle(ih);
484
485 return ih;
486}
487EXPORT_SYMBOL_GPL(inotify_init);
488
489/**
490 * inotify_init_watch - initialize an inotify watch
491 * @watch: watch to initialize
492 */
493void inotify_init_watch(struct inotify_watch *watch)
494{
495 INIT_LIST_HEAD(&watch->h_list);
496 INIT_LIST_HEAD(&watch->i_list);
497 atomic_set(&watch->count, 0);
498 get_inotify_watch(watch); /* initial get */
499}
500EXPORT_SYMBOL_GPL(inotify_init_watch);
501
502/*
503 * Watch removals suck violently. To kick the watch out we need (in this
504 * order) inode->inotify_mutex and ih->mutex. That's fine if we have
505 * a hold on inode; however, for all other cases we need to make damn sure
506 * we don't race with umount. We can *NOT* just grab a reference to a
507 * watch - inotify_unmount_inodes() will happily sail past it and we'll end
 508 * up with a reference to an inode potentially outliving its superblock. Ideally
509 * we just want to grab an active reference to superblock if we can; that
510 * will make sure we won't go into inotify_umount_inodes() until we are
511 * done. Cleanup is just deactivate_super(). However, that leaves a messy
512 * case - what if we *are* racing with umount() and active references to
513 * superblock can't be acquired anymore? We can bump ->s_count, grab
514 * ->s_umount, which will wait until the superblock is shut down and the
515 * watch in question is pining for fjords.
516 *
517 * And yes, this is far beyond mere "not very pretty"; so's the entire
518 * concept of inotify to start with.
519 */
520
521/**
522 * pin_to_kill - pin the watch down for removal
523 * @ih: inotify handle
524 * @watch: watch to kill
525 *
526 * Called with ih->mutex held, drops it. Possible return values:
527 * 0 - nothing to do, it has died
528 * 1 - remove it, drop the reference and deactivate_super()
529 */
530static int pin_to_kill(struct inotify_handle *ih, struct inotify_watch *watch)
531{
532 struct super_block *sb = watch->inode->i_sb;
533
534 if (atomic_inc_not_zero(&sb->s_active)) {
535 get_inotify_watch(watch);
536 mutex_unlock(&ih->mutex);
537 return 1; /* the best outcome */
538 }
539 spin_lock(&sb_lock);
540 sb->s_count++;
541 spin_unlock(&sb_lock);
542 mutex_unlock(&ih->mutex); /* can't grab ->s_umount under it */
543 down_read(&sb->s_umount);
544 /* fs is already shut down; the watch is dead */
545 drop_super(sb);
546 return 0;
547}
548
549static void unpin_and_kill(struct inotify_watch *watch)
550{
551 struct super_block *sb = watch->inode->i_sb;
552 put_inotify_watch(watch);
553 deactivate_super(sb);
554}
555
556/**
557 * inotify_destroy - clean up and destroy an inotify instance
558 * @ih: inotify handle
559 */
560void inotify_destroy(struct inotify_handle *ih)
561{
562 /*
563 * Destroy all of the watches for this handle. Unfortunately, not very
564 * pretty. We cannot do a simple iteration over the list, because we
565 * do not know the inode until we iterate to the watch. But we need to
566 * hold inode->inotify_mutex before ih->mutex. The following works.
567 *
568 * AV: it had to become even uglier to start working ;-/
569 */
570 while (1) {
571 struct inotify_watch *watch;
572 struct list_head *watches;
573 struct super_block *sb;
574 struct inode *inode;
575
576 mutex_lock(&ih->mutex);
577 watches = &ih->watches;
578 if (list_empty(watches)) {
579 mutex_unlock(&ih->mutex);
580 break;
581 }
582 watch = list_first_entry(watches, struct inotify_watch, h_list);
583 sb = watch->inode->i_sb;
584 if (!pin_to_kill(ih, watch))
585 continue;
586
587 inode = watch->inode;
588 mutex_lock(&inode->inotify_mutex);
589 mutex_lock(&ih->mutex);
590
591 /* make sure we didn't race with another list removal */
592 if (likely(idr_find(&ih->idr, watch->wd))) {
593 remove_watch_no_event(watch, ih);
594 put_inotify_watch(watch);
595 }
596
597 mutex_unlock(&ih->mutex);
598 mutex_unlock(&inode->inotify_mutex);
599 unpin_and_kill(watch);
600 }
601
602 /* free this handle: the put matching the get in inotify_init() */
603 put_inotify_handle(ih);
604}
605EXPORT_SYMBOL_GPL(inotify_destroy);
606
607/**
608 * inotify_find_watch - find an existing watch for an (ih,inode) pair
609 * @ih: inotify handle
610 * @inode: inode to watch
611 * @watchp: pointer to existing inotify_watch
612 *
613 * Caller must pin given inode (via nameidata).
614 */
615s32 inotify_find_watch(struct inotify_handle *ih, struct inode *inode,
616 struct inotify_watch **watchp)
617{
618 struct inotify_watch *old;
619 int ret = -ENOENT;
620
621 mutex_lock(&inode->inotify_mutex);
622 mutex_lock(&ih->mutex);
623
624 old = inode_find_handle(inode, ih);
625 if (unlikely(old)) {
626 get_inotify_watch(old); /* caller must put watch */
627 *watchp = old;
628 ret = old->wd;
629 }
630
631 mutex_unlock(&ih->mutex);
632 mutex_unlock(&inode->inotify_mutex);
633
634 return ret;
635}
636EXPORT_SYMBOL_GPL(inotify_find_watch);
637
638/**
639 * inotify_find_update_watch - find and update the mask of an existing watch
640 * @ih: inotify handle
641 * @inode: inode's watch to update
642 * @mask: mask of events to watch
643 *
644 * Caller must pin given inode (via nameidata).
645 */
646s32 inotify_find_update_watch(struct inotify_handle *ih, struct inode *inode,
647 u32 mask)
648{
649 struct inotify_watch *old;
650 int mask_add = 0;
651 int ret;
652
653 if (mask & IN_MASK_ADD)
654 mask_add = 1;
655
656 /* don't allow invalid bits: we don't want flags set */
657 mask &= IN_ALL_EVENTS | IN_ONESHOT;
658 if (unlikely(!mask))
659 return -EINVAL;
660
661 mutex_lock(&inode->inotify_mutex);
662 mutex_lock(&ih->mutex);
663
664 /*
665 * Handle the case of re-adding a watch on an (inode,ih) pair that we
666 * are already watching. We just update the mask and return its wd.
667 */
668 old = inode_find_handle(inode, ih);
669 if (unlikely(!old)) {
670 ret = -ENOENT;
671 goto out;
672 }
673
674 if (mask_add)
675 old->mask |= mask;
676 else
677 old->mask = mask;
678 ret = old->wd;
679out:
680 mutex_unlock(&ih->mutex);
681 mutex_unlock(&inode->inotify_mutex);
682 return ret;
683}
684EXPORT_SYMBOL_GPL(inotify_find_update_watch);
685
686/**
687 * inotify_add_watch - add a watch to an inotify instance
688 * @ih: inotify handle
689 * @watch: caller allocated watch structure
690 * @inode: inode to watch
691 * @mask: mask of events to watch
692 *
693 * Caller must pin given inode (via nameidata).
694 * Caller must ensure it only calls inotify_add_watch() once per watch.
695 * Calls inotify_handle_get_wd() so may sleep.
696 */
697s32 inotify_add_watch(struct inotify_handle *ih, struct inotify_watch *watch,
698 struct inode *inode, u32 mask)
699{
700 int ret = 0;
701 int newly_watched;
702
703 /* don't allow invalid bits: we don't want flags set */
704 mask &= IN_ALL_EVENTS | IN_ONESHOT;
705 if (unlikely(!mask))
706 return -EINVAL;
707 watch->mask = mask;
708
709 mutex_lock(&inode->inotify_mutex);
710 mutex_lock(&ih->mutex);
711
712 /* Initialize a new watch */
713 ret = inotify_handle_get_wd(ih, watch);
714 if (unlikely(ret))
715 goto out;
716 ret = watch->wd;
717
718 /* save a reference to handle and bump the count to make it official */
719 get_inotify_handle(ih);
720 watch->ih = ih;
721
722 /*
723 * Save a reference to the inode and bump the ref count to make it
724 * official. We hold a reference to nameidata, which makes this safe.
725 */
726 watch->inode = igrab(inode);
727
728 /* Add the watch to the handle's and the inode's list */
729 newly_watched = !inotify_inode_watched(inode);
730 list_add(&watch->h_list, &ih->watches);
731 list_add(&watch->i_list, &inode->inotify_watches);
732 /*
733 * Set child flags _after_ adding the watch, so there is no race
 734 * window where newly instantiated children could miss their parent's
735 * watched flag.
736 */
737 if (newly_watched)
738 set_dentry_child_flags(inode, 1);
739
740out:
741 mutex_unlock(&ih->mutex);
742 mutex_unlock(&inode->inotify_mutex);
743 return ret;
744}
745EXPORT_SYMBOL_GPL(inotify_add_watch);
746
747/**
748 * inotify_clone_watch - put the watch next to existing one
749 * @old: already installed watch
750 * @new: new watch
751 *
752 * Caller must hold the inotify_mutex of inode we are dealing with;
753 * it is expected to remove the old watch before unlocking the inode.
754 */
755s32 inotify_clone_watch(struct inotify_watch *old, struct inotify_watch *new)
756{
757 struct inotify_handle *ih = old->ih;
758 int ret = 0;
759
760 new->mask = old->mask;
761 new->ih = ih;
762
763 mutex_lock(&ih->mutex);
764
765 /* Initialize a new watch */
766 ret = inotify_handle_get_wd(ih, new);
767 if (unlikely(ret))
768 goto out;
769 ret = new->wd;
770
771 get_inotify_handle(ih);
772
773 new->inode = igrab(old->inode);
774
775 list_add(&new->h_list, &ih->watches);
776 list_add(&new->i_list, &old->inode->inotify_watches);
777out:
778 mutex_unlock(&ih->mutex);
779 return ret;
780}
781
782void inotify_evict_watch(struct inotify_watch *watch)
783{
784 get_inotify_watch(watch);
785 mutex_lock(&watch->ih->mutex);
786 inotify_remove_watch_locked(watch->ih, watch);
787 mutex_unlock(&watch->ih->mutex);
788}
789
790/**
791 * inotify_rm_wd - remove a watch from an inotify instance
792 * @ih: inotify handle
793 * @wd: watch descriptor to remove
794 *
795 * Can sleep.
796 */
797int inotify_rm_wd(struct inotify_handle *ih, u32 wd)
798{
799 struct inotify_watch *watch;
800 struct super_block *sb;
801 struct inode *inode;
802
803 mutex_lock(&ih->mutex);
804 watch = idr_find(&ih->idr, wd);
805 if (unlikely(!watch)) {
806 mutex_unlock(&ih->mutex);
807 return -EINVAL;
808 }
809 sb = watch->inode->i_sb;
810 if (!pin_to_kill(ih, watch))
811 return 0;
812
813 inode = watch->inode;
814
815 mutex_lock(&inode->inotify_mutex);
816 mutex_lock(&ih->mutex);
817
818 /* make sure that we did not race */
819 if (likely(idr_find(&ih->idr, wd) == watch))
820 inotify_remove_watch_locked(ih, watch);
821
822 mutex_unlock(&ih->mutex);
823 mutex_unlock(&inode->inotify_mutex);
824 unpin_and_kill(watch);
825
826 return 0;
827}
828EXPORT_SYMBOL_GPL(inotify_rm_wd);
829
830/**
831 * inotify_rm_watch - remove a watch from an inotify instance
832 * @ih: inotify handle
833 * @watch: watch to remove
834 *
835 * Can sleep.
836 */
837int inotify_rm_watch(struct inotify_handle *ih,
838 struct inotify_watch *watch)
839{
840 return inotify_rm_wd(ih, watch->wd);
841}
842EXPORT_SYMBOL_GPL(inotify_rm_watch);
843
844/*
845 * inotify_setup - core initialization function
846 */
847static int __init inotify_setup(void)
848{
849 BUILD_BUG_ON(IN_ACCESS != FS_ACCESS);
850 BUILD_BUG_ON(IN_MODIFY != FS_MODIFY);
851 BUILD_BUG_ON(IN_ATTRIB != FS_ATTRIB);
852 BUILD_BUG_ON(IN_CLOSE_WRITE != FS_CLOSE_WRITE);
853 BUILD_BUG_ON(IN_CLOSE_NOWRITE != FS_CLOSE_NOWRITE);
854 BUILD_BUG_ON(IN_OPEN != FS_OPEN);
855 BUILD_BUG_ON(IN_MOVED_FROM != FS_MOVED_FROM);
856 BUILD_BUG_ON(IN_MOVED_TO != FS_MOVED_TO);
857 BUILD_BUG_ON(IN_CREATE != FS_CREATE);
858 BUILD_BUG_ON(IN_DELETE != FS_DELETE);
859 BUILD_BUG_ON(IN_DELETE_SELF != FS_DELETE_SELF);
860 BUILD_BUG_ON(IN_MOVE_SELF != FS_MOVE_SELF);
861 BUILD_BUG_ON(IN_Q_OVERFLOW != FS_Q_OVERFLOW);
862
863 BUILD_BUG_ON(IN_UNMOUNT != FS_UNMOUNT);
864 BUILD_BUG_ON(IN_ISDIR != FS_IN_ISDIR);
865 BUILD_BUG_ON(IN_IGNORED != FS_IN_IGNORED);
866 BUILD_BUG_ON(IN_ONESHOT != FS_IN_ONESHOT);
867
868 atomic_set(&inotify_cookie, 0);
869
870 return 0;
871}
872
873module_init(inotify_setup);
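
The BUILD_BUG_ON() block above pins the userspace IN_* constants to the in-kernel FS_* constants at compile time so masks can be passed through unchanged. A sketch of the classic trick behind such a macro (the kernel's real definition lives in its headers and has varied across releases):

/* a true condition yields a negative array size, which fails the
 * build; a false condition compiles away to nothing */
#define MY_BUILD_BUG_ON(cond) ((void)sizeof(char[1 - 2 * !!(cond)]))
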
diff --git a/fs/notify/inotify/inotify.h b/fs/notify/inotify/inotify.h
index f234f3a4c8ca..b6642e4de4bf 100644
--- a/fs/notify/inotify/inotify.h
+++ b/fs/notify/inotify/inotify.h
@@ -9,13 +9,12 @@ struct inotify_event_private_data {
9 int wd; 9 int wd;
10}; 10};
11 11
12struct inotify_inode_mark_entry { 12struct inotify_inode_mark {
13 /* fsnotify_mark_entry MUST be the first thing */ 13 struct fsnotify_mark fsn_mark;
14 struct fsnotify_mark_entry fsn_entry;
15 int wd; 14 int wd;
16}; 15};
17 16
18extern void inotify_ignored_and_remove_idr(struct fsnotify_mark_entry *entry, 17extern void inotify_ignored_and_remove_idr(struct fsnotify_mark *fsn_mark,
19 struct fsnotify_group *group); 18 struct fsnotify_group *group);
20extern void inotify_free_event_priv(struct fsnotify_event_private_data *event_priv); 19extern void inotify_free_event_priv(struct fsnotify_event_private_data *event_priv);
21 20
diff --git a/fs/notify/inotify/inotify_fsnotify.c b/fs/notify/inotify/inotify_fsnotify.c
index e27960cd76ab..a91b69a6a291 100644
--- a/fs/notify/inotify/inotify_fsnotify.c
+++ b/fs/notify/inotify/inotify_fsnotify.c
@@ -22,6 +22,7 @@
22 * General Public License for more details. 22 * General Public License for more details.
23 */ 23 */
24 24
25#include <linux/dcache.h> /* d_unlinked */
25#include <linux/fs.h> /* struct inode */ 26#include <linux/fs.h> /* struct inode */
26#include <linux/fsnotify_backend.h> 27#include <linux/fsnotify_backend.h>
27#include <linux/inotify.h> 28#include <linux/inotify.h>
@@ -32,26 +33,84 @@
32 33
33#include "inotify.h" 34#include "inotify.h"
34 35
35static int inotify_handle_event(struct fsnotify_group *group, struct fsnotify_event *event) 36/*
37 * Check if 2 events contain the same information. We do not compare private data
38 * but at this moment that isn't a problem for any known fsnotify listeners.
39 */
40static bool event_compare(struct fsnotify_event *old, struct fsnotify_event *new)
41{
42 if ((old->mask == new->mask) &&
43 (old->to_tell == new->to_tell) &&
44 (old->data_type == new->data_type) &&
45 (old->name_len == new->name_len)) {
46 switch (old->data_type) {
47 case (FSNOTIFY_EVENT_INODE):
48 /* remember, after old was put on the wait_q we aren't
49 * allowed to look at the inode any more; the only thing
50 * left to check is whether the file_name is the same */
51 if (!old->name_len ||
52 !strcmp(old->file_name, new->file_name))
53 return true;
54 break;
55 case (FSNOTIFY_EVENT_PATH):
56 if ((old->path.mnt == new->path.mnt) &&
57 (old->path.dentry == new->path.dentry))
58 return true;
59 break;
60 case (FSNOTIFY_EVENT_NONE):
61 if (old->mask & FS_Q_OVERFLOW)
62 return true;
63 else if (old->mask & FS_IN_IGNORED)
64 return false;
65 return true;
66 };
67 }
68 return false;
69}
70
71static struct fsnotify_event *inotify_merge(struct list_head *list,
72 struct fsnotify_event *event)
36{ 73{
37 struct fsnotify_mark_entry *entry; 74 struct fsnotify_event_holder *last_holder;
38 struct inotify_inode_mark_entry *ientry; 75 struct fsnotify_event *last_event;
76
77 /* and the list better be locked by something too */
78 spin_lock(&event->lock);
79
80 last_holder = list_entry(list->prev, struct fsnotify_event_holder, event_list);
81 last_event = last_holder->event;
82 if (event_compare(last_event, event))
83 fsnotify_get_event(last_event);
84 else
85 last_event = NULL;
86
87 spin_unlock(&event->lock);
88
89 return last_event;
90}
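
inotify_merge() only ever compares the new event against the tail of the notification list, which is exactly the coalescing userspace observes: two identical back-to-back events are delivered as one. A hedged userspace demonstration (assumes /tmp/watched already exists; error handling trimmed):

#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>
#include <sys/inotify.h>

int main(void)
{
	char buf[4096];
	int fd = inotify_init();
	int wd = inotify_add_watch(fd, "/tmp/watched", IN_MODIFY);
	int tfd = open("/tmp/watched", O_WRONLY);

	write(tfd, "a", 1);	/* queues one IN_MODIFY */
	write(tfd, "b", 1);	/* identical to the queue tail: merged */

	ssize_t len = read(fd, buf, sizeof(buf));
	/* typically one struct inotify_event for wd, not two */
	printf("read %zd bytes for wd %d\n", len, wd);
	return 0;
}
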
91
92static int inotify_handle_event(struct fsnotify_group *group,
93 struct fsnotify_mark *inode_mark,
94 struct fsnotify_mark *vfsmount_mark,
95 struct fsnotify_event *event)
96{
97 struct inotify_inode_mark *i_mark;
39 struct inode *to_tell; 98 struct inode *to_tell;
40 struct inotify_event_private_data *event_priv; 99 struct inotify_event_private_data *event_priv;
41 struct fsnotify_event_private_data *fsn_event_priv; 100 struct fsnotify_event_private_data *fsn_event_priv;
42 int wd, ret; 101 struct fsnotify_event *added_event;
102 int wd, ret = 0;
103
104 BUG_ON(vfsmount_mark);
105
106 pr_debug("%s: group=%p event=%p to_tell=%p mask=%x\n", __func__, group,
107 event, event->to_tell, event->mask);
43 108
44 to_tell = event->to_tell; 109 to_tell = event->to_tell;
45 110
46 spin_lock(&to_tell->i_lock); 111 i_mark = container_of(inode_mark, struct inotify_inode_mark,
47 entry = fsnotify_find_mark_entry(group, to_tell); 112 fsn_mark);
48 spin_unlock(&to_tell->i_lock); 113 wd = i_mark->wd;
49 /* race with watch removal? We already passes should_send */
50 if (unlikely(!entry))
51 return 0;
52 ientry = container_of(entry, struct inotify_inode_mark_entry,
53 fsn_entry);
54 wd = ientry->wd;
55 114
56 event_priv = kmem_cache_alloc(event_priv_cachep, GFP_KERNEL); 115 event_priv = kmem_cache_alloc(event_priv_cachep, GFP_KERNEL);
57 if (unlikely(!event_priv)) 116 if (unlikely(!event_priv))
@@ -62,48 +121,40 @@ static int inotify_handle_event(struct fsnotify_group *group, struct fsnotify_ev
62 fsn_event_priv->group = group; 121 fsn_event_priv->group = group;
63 event_priv->wd = wd; 122 event_priv->wd = wd;
64 123
65 ret = fsnotify_add_notify_event(group, event, fsn_event_priv); 124 added_event = fsnotify_add_notify_event(group, event, fsn_event_priv, inotify_merge);
66 if (ret) { 125 if (added_event) {
67 inotify_free_event_priv(fsn_event_priv); 126 inotify_free_event_priv(fsn_event_priv);
68 /* EEXIST says we tail matched, EOVERFLOW isn't something 127 if (!IS_ERR(added_event))
69 * to report up the stack. */ 128 fsnotify_put_event(added_event);
70 if ((ret == -EEXIST) || 129 else
71 (ret == -EOVERFLOW)) 130 ret = PTR_ERR(added_event);
72 ret = 0;
73 } 131 }
74 132
75 /* 133 if (inode_mark->mask & IN_ONESHOT)
76 * If we hold the entry until after the event is on the queue 134 fsnotify_destroy_mark(inode_mark);
77 * IN_IGNORED won't be able to pass this event in the queue
78 */
79 fsnotify_put_mark(entry);
80 135
81 return ret; 136 return ret;
82} 137}
83 138
84static void inotify_freeing_mark(struct fsnotify_mark_entry *entry, struct fsnotify_group *group) 139static void inotify_freeing_mark(struct fsnotify_mark *fsn_mark, struct fsnotify_group *group)
85{ 140{
86 inotify_ignored_and_remove_idr(entry, group); 141 inotify_ignored_and_remove_idr(fsn_mark, group);
87} 142}
88 143
89static bool inotify_should_send_event(struct fsnotify_group *group, struct inode *inode, __u32 mask) 144static bool inotify_should_send_event(struct fsnotify_group *group, struct inode *inode,
145 struct fsnotify_mark *inode_mark,
146 struct fsnotify_mark *vfsmount_mark,
147 __u32 mask, void *data, int data_type)
90{ 148{
91 struct fsnotify_mark_entry *entry; 149 if ((inode_mark->mask & FS_EXCL_UNLINK) &&
92 bool send; 150 (data_type == FSNOTIFY_EVENT_PATH)) {
93 151 struct path *path = data;
94 spin_lock(&inode->i_lock);
95 entry = fsnotify_find_mark_entry(group, inode);
96 spin_unlock(&inode->i_lock);
97 if (!entry)
98 return false;
99 152
100 mask = (mask & ~FS_EVENT_ON_CHILD); 153 if (d_unlinked(path->dentry))
101 send = (entry->mask & mask); 154 return false;
102 155 }
103 /* find took a reference */
104 fsnotify_put_mark(entry);
105 156
106 return send; 157 return true;
107} 158}
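
The d_unlinked() check above implements the new IN_EXCL_UNLINK flag: a watch carrying it stops reporting events generated through paths that have already been unlinked. A small userspace sketch of the intended behavior (assumes a kernel and headers new enough to define IN_EXCL_UNLINK):

#include <fcntl.h>
#include <unistd.h>
#include <sys/inotify.h>

int main(void)
{
	int fd = inotify_init();
	inotify_add_watch(fd, "/tmp", IN_MODIFY | IN_EXCL_UNLINK);

	int tfd = open("/tmp/victim", O_CREAT | O_WRONLY, 0600);
	unlink("/tmp/victim");
	write(tfd, "x", 1);	/* d_unlinked() is true: event suppressed */
	close(tfd);
	close(fd);
	return 0;
}
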
108 159
109/* 160/*
@@ -115,18 +166,18 @@ static bool inotify_should_send_event(struct fsnotify_group *group, struct inode
115 */ 166 */
116static int idr_callback(int id, void *p, void *data) 167static int idr_callback(int id, void *p, void *data)
117{ 168{
118 struct fsnotify_mark_entry *entry; 169 struct fsnotify_mark *fsn_mark;
119 struct inotify_inode_mark_entry *ientry; 170 struct inotify_inode_mark *i_mark;
120 static bool warned = false; 171 static bool warned = false;
121 172
122 if (warned) 173 if (warned)
123 return 0; 174 return 0;
124 175
125 warned = true; 176 warned = true;
126 entry = p; 177 fsn_mark = p;
127 ientry = container_of(entry, struct inotify_inode_mark_entry, fsn_entry); 178 i_mark = container_of(fsn_mark, struct inotify_inode_mark, fsn_mark);
128 179
129 WARN(1, "inotify closing but id=%d for entry=%p in group=%p still in " 180 WARN(1, "inotify closing but id=%d for fsn_mark=%p in group=%p still in "
130 "idr. Probably leaking memory\n", id, p, data); 181 "idr. Probably leaking memory\n", id, p, data);
131 182
132 /* 183 /*
@@ -135,9 +186,9 @@ static int idr_callback(int id, void *p, void *data)
135 * out why we got here and the panic is no worse than the original 186 * out why we got here and the panic is no worse than the original
136 * BUG() that was here. 187 * BUG() that was here.
137 */ 188 */
138 if (entry) 189 if (fsn_mark)
139 printk(KERN_WARNING "entry->group=%p inode=%p wd=%d\n", 190 printk(KERN_WARNING "fsn_mark->group=%p inode=%p wd=%d\n",
140 entry->group, entry->inode, ientry->wd); 191 fsn_mark->group, fsn_mark->i.inode, i_mark->wd);
141 return 0; 192 return 0;
142} 193}
143 194
diff --git a/fs/notify/inotify/inotify_user.c b/fs/notify/inotify/inotify_user.c
index e46ca685b9be..bf7f6d776c31 100644
--- a/fs/notify/inotify/inotify_user.c
+++ b/fs/notify/inotify/inotify_user.c
@@ -46,17 +46,11 @@
46/* these are configurable via /proc/sys/fs/inotify/ */ 46/* these are configurable via /proc/sys/fs/inotify/ */
47static int inotify_max_user_instances __read_mostly; 47static int inotify_max_user_instances __read_mostly;
48static int inotify_max_queued_events __read_mostly; 48static int inotify_max_queued_events __read_mostly;
49int inotify_max_user_watches __read_mostly; 49static int inotify_max_user_watches __read_mostly;
50 50
51static struct kmem_cache *inotify_inode_mark_cachep __read_mostly; 51static struct kmem_cache *inotify_inode_mark_cachep __read_mostly;
52struct kmem_cache *event_priv_cachep __read_mostly; 52struct kmem_cache *event_priv_cachep __read_mostly;
53 53
54/*
55 * When inotify registers a new group it increments this and uses that
56 * value as an offset to set the fsnotify group "name" and priority.
57 */
58static atomic_t inotify_grp_num;
59
60#ifdef CONFIG_SYSCTL 54#ifdef CONFIG_SYSCTL
61 55
62#include <linux/sysctl.h> 56#include <linux/sysctl.h>
@@ -96,11 +90,14 @@ static inline __u32 inotify_arg_to_mask(u32 arg)
96{ 90{
97 __u32 mask; 91 __u32 mask;
98 92
99 /* everything should accept their own ignored and cares about children */ 93 /*
100 mask = (FS_IN_IGNORED | FS_EVENT_ON_CHILD); 94 * everything should accept its own ignored event, care about children,
95 * and receive events when the inode is unmounted
96 */
97 mask = (FS_IN_IGNORED | FS_EVENT_ON_CHILD | FS_UNMOUNT);
101 98
102 /* mask off the flags used to open the fd */ 99 /* mask off the flags used to open the fd */
103 mask |= (arg & (IN_ALL_EVENTS | IN_ONESHOT)); 100 mask |= (arg & (IN_ALL_EVENTS | IN_ONESHOT | IN_EXCL_UNLINK));
104 101
105 return mask; 102 return mask;
106} 103}
@@ -144,6 +141,8 @@ static struct fsnotify_event *get_one_event(struct fsnotify_group *group,
144 141
145 event = fsnotify_peek_notify_event(group); 142 event = fsnotify_peek_notify_event(group);
146 143
144 pr_debug("%s: group=%p event=%p\n", __func__, group, event);
145
147 if (event->name_len) 146 if (event->name_len)
148 event_size += roundup(event->name_len + 1, event_size); 147 event_size += roundup(event->name_len + 1, event_size);
149 148
@@ -173,6 +172,8 @@ static ssize_t copy_event_to_user(struct fsnotify_group *group,
173 size_t event_size = sizeof(struct inotify_event); 172 size_t event_size = sizeof(struct inotify_event);
174 size_t name_len = 0; 173 size_t name_len = 0;
175 174
175 pr_debug("%s: group=%p event=%p\n", __func__, group, event);
176
176 /* we get the inotify watch descriptor from the event private data */ 177 /* we get the inotify watch descriptor from the event private data */
177 spin_lock(&event->lock); 178 spin_lock(&event->lock);
178 fsn_priv = fsnotify_remove_priv_from_event(group, event); 179 fsn_priv = fsnotify_remove_priv_from_event(group, event);
@@ -245,6 +246,8 @@ static ssize_t inotify_read(struct file *file, char __user *buf,
245 kevent = get_one_event(group, count); 246 kevent = get_one_event(group, count);
246 mutex_unlock(&group->notification_mutex); 247 mutex_unlock(&group->notification_mutex);
247 248
249 pr_debug("%s: group=%p kevent=%p\n", __func__, group, kevent);
250
248 if (kevent) { 251 if (kevent) {
249 ret = PTR_ERR(kevent); 252 ret = PTR_ERR(kevent);
250 if (IS_ERR(kevent)) 253 if (IS_ERR(kevent))
@@ -289,6 +292,8 @@ static int inotify_release(struct inode *ignored, struct file *file)
289 struct fsnotify_group *group = file->private_data; 292 struct fsnotify_group *group = file->private_data;
290 struct user_struct *user = group->inotify_data.user; 293 struct user_struct *user = group->inotify_data.user;
291 294
295 pr_debug("%s: group=%p\n", __func__, group);
296
292 fsnotify_clear_marks_by_group(group); 297 fsnotify_clear_marks_by_group(group);
293 298
294 /* free this group, matching get was inotify_init->fsnotify_obtain_group */ 299 /* free this group, matching get was inotify_init->fsnotify_obtain_group */
@@ -312,6 +317,8 @@ static long inotify_ioctl(struct file *file, unsigned int cmd,
312 group = file->private_data; 317 group = file->private_data;
313 p = (void __user *) arg; 318 p = (void __user *) arg;
314 319
320 pr_debug("%s: group=%p cmd=%u\n", __func__, group, cmd);
321
315 switch (cmd) { 322 switch (cmd) {
316 case FIONREAD: 323 case FIONREAD:
317 mutex_lock(&group->notification_mutex); 324 mutex_lock(&group->notification_mutex);
@@ -357,59 +364,159 @@ static int inotify_find_inode(const char __user *dirname, struct path *path, uns
357 return error; 364 return error;
358} 365}
359 366
367static int inotify_add_to_idr(struct idr *idr, spinlock_t *idr_lock,
368 int *last_wd,
369 struct inotify_inode_mark *i_mark)
370{
371 int ret;
372
373 do {
374 if (unlikely(!idr_pre_get(idr, GFP_KERNEL)))
375 return -ENOMEM;
376
377 spin_lock(idr_lock);
378 ret = idr_get_new_above(idr, i_mark, *last_wd + 1,
379 &i_mark->wd);
380 /* we added the mark to the idr, take a reference */
381 if (!ret) {
382 *last_wd = i_mark->wd;
383 fsnotify_get_mark(&i_mark->fsn_mark);
384 }
385 spin_unlock(idr_lock);
386 } while (ret == -EAGAIN);
387
388 return ret;
389}
390
391static struct inotify_inode_mark *inotify_idr_find_locked(struct fsnotify_group *group,
392 int wd)
393{
394 struct idr *idr = &group->inotify_data.idr;
395 spinlock_t *idr_lock = &group->inotify_data.idr_lock;
396 struct inotify_inode_mark *i_mark;
397
398 assert_spin_locked(idr_lock);
399
400 i_mark = idr_find(idr, wd);
401 if (i_mark) {
402 struct fsnotify_mark *fsn_mark = &i_mark->fsn_mark;
403
404 fsnotify_get_mark(fsn_mark);
405 /* One ref for being in the idr, one ref we just took */
406 BUG_ON(atomic_read(&fsn_mark->refcnt) < 2);
407 }
408
409 return i_mark;
410}
411
412static struct inotify_inode_mark *inotify_idr_find(struct fsnotify_group *group,
413 int wd)
414{
415 struct inotify_inode_mark *i_mark;
416 spinlock_t *idr_lock = &group->inotify_data.idr_lock;
417
418 spin_lock(idr_lock);
419 i_mark = inotify_idr_find_locked(group, wd);
420 spin_unlock(idr_lock);
421
422 return i_mark;
423}
424
425static void do_inotify_remove_from_idr(struct fsnotify_group *group,
426 struct inotify_inode_mark *i_mark)
427{
428 struct idr *idr = &group->inotify_data.idr;
429 spinlock_t *idr_lock = &group->inotify_data.idr_lock;
430 int wd = i_mark->wd;
431
432 assert_spin_locked(idr_lock);
433
434 idr_remove(idr, wd);
435
436 /* removed from the idr, drop that ref */
437 fsnotify_put_mark(&i_mark->fsn_mark);
438}
439
360/* 440/*
361 * Remove the mark from the idr (if present) and drop the reference 441 * Remove the mark from the idr (if present) and drop the reference
362 * on the mark because it was in the idr. 442 * on the mark because it was in the idr.
363 */ 443 */
364static void inotify_remove_from_idr(struct fsnotify_group *group, 444static void inotify_remove_from_idr(struct fsnotify_group *group,
365 struct inotify_inode_mark_entry *ientry) 445 struct inotify_inode_mark *i_mark)
366{ 446{
367 struct idr *idr; 447 spinlock_t *idr_lock = &group->inotify_data.idr_lock;
368 struct fsnotify_mark_entry *entry; 448 struct inotify_inode_mark *found_i_mark = NULL;
369 struct inotify_inode_mark_entry *found_ientry;
370 int wd; 449 int wd;
371 450
372 spin_lock(&group->inotify_data.idr_lock); 451 spin_lock(idr_lock);
373 idr = &group->inotify_data.idr; 452 wd = i_mark->wd;
374 wd = ientry->wd;
375 453
376 if (wd == -1) 454 /*
455 * does this i_mark think it is in the idr? we shouldn't get called
456 * if it wasn't....
457 */
458 if (wd == -1) {
459 WARN_ONCE(1, "%s: i_mark=%p i_mark->wd=%d i_mark->group=%p"
460 " i_mark->inode=%p\n", __func__, i_mark, i_mark->wd,
461 i_mark->fsn_mark.group, i_mark->fsn_mark.i.inode);
377 goto out; 462 goto out;
463 }
378 464
379 entry = idr_find(&group->inotify_data.idr, wd); 465 /* Let's look in the idr to see if we find it */
380 if (unlikely(!entry)) 466 found_i_mark = inotify_idr_find_locked(group, wd);
467 if (unlikely(!found_i_mark)) {
468 WARN_ONCE(1, "%s: i_mark=%p i_mark->wd=%d i_mark->group=%p"
469 " i_mark->inode=%p\n", __func__, i_mark, i_mark->wd,
470 i_mark->fsn_mark.group, i_mark->fsn_mark.i.inode);
381 goto out; 471 goto out;
472 }
382 473
383 found_ientry = container_of(entry, struct inotify_inode_mark_entry, fsn_entry); 474 /*
384 if (unlikely(found_ientry != ientry)) { 475 * We found a mark in the idr at the right wd, but it's
385 /* We found an entry in the idr with the right wd, but it's 476 * not the mark we were told to remove. eparis seriously
386 * not the entry we were told to remove. eparis seriously 477 * fucked up somewhere.
387 * fucked up somewhere. */ 478 */
388 WARN_ON(1); 479 if (unlikely(found_i_mark != i_mark)) {
389 ientry->wd = -1; 480 WARN_ONCE(1, "%s: i_mark=%p i_mark->wd=%d i_mark->group=%p "
481 "mark->inode=%p found_i_mark=%p found_i_mark->wd=%d "
482 "found_i_mark->group=%p found_i_mark->inode=%p\n",
483 __func__, i_mark, i_mark->wd, i_mark->fsn_mark.group,
484 i_mark->fsn_mark.i.inode, found_i_mark, found_i_mark->wd,
485 found_i_mark->fsn_mark.group,
486 found_i_mark->fsn_mark.i.inode);
390 goto out; 487 goto out;
391 } 488 }
392 489
393 /* One ref for being in the idr, one ref held by the caller */ 490 /*
394 BUG_ON(atomic_read(&entry->refcnt) < 2); 491 * One ref for being in the idr
395 492 * one ref held by the caller trying to kill us
396 idr_remove(idr, wd); 493 * one ref grabbed by inotify_idr_find
397 ientry->wd = -1; 494 */
495 if (unlikely(atomic_read(&i_mark->fsn_mark.refcnt) < 3)) {
496 printk(KERN_ERR "%s: i_mark=%p i_mark->wd=%d i_mark->group=%p"
497 " i_mark->inode=%p\n", __func__, i_mark, i_mark->wd,
498 i_mark->fsn_mark.group, i_mark->fsn_mark.i.inode);
499 /* we can't really recover with bad ref counting.. */
500 BUG();
501 }
398 502
399 /* removed from the idr, drop that ref */ 503 do_inotify_remove_from_idr(group, i_mark);
400 fsnotify_put_mark(entry);
401out: 504out:
402 spin_unlock(&group->inotify_data.idr_lock); 505 /* match the ref taken by inotify_idr_find_locked() */
506 if (found_i_mark)
507 fsnotify_put_mark(&found_i_mark->fsn_mark);
508 i_mark->wd = -1;
509 spin_unlock(idr_lock);
403} 510}
404 511
405/* 512/*
406 * Send IN_IGNORED for this wd, remove this wd from the idr. 513 * Send IN_IGNORED for this wd, remove this wd from the idr.
407 */ 514 */
408void inotify_ignored_and_remove_idr(struct fsnotify_mark_entry *entry, 515void inotify_ignored_and_remove_idr(struct fsnotify_mark *fsn_mark,
409 struct fsnotify_group *group) 516 struct fsnotify_group *group)
410{ 517{
411 struct inotify_inode_mark_entry *ientry; 518 struct inotify_inode_mark *i_mark;
412 struct fsnotify_event *ignored_event; 519 struct fsnotify_event *ignored_event, *notify_event;
413 struct inotify_event_private_data *event_priv; 520 struct inotify_event_private_data *event_priv;
414 struct fsnotify_event_private_data *fsn_event_priv; 521 struct fsnotify_event_private_data *fsn_event_priv;
415 int ret; 522 int ret;
@@ -420,7 +527,7 @@ void inotify_ignored_and_remove_idr(struct fsnotify_mark_entry *entry,
420 if (!ignored_event) 527 if (!ignored_event)
421 return; 528 return;
422 529
423 ientry = container_of(entry, struct inotify_inode_mark_entry, fsn_entry); 530 i_mark = container_of(fsn_mark, struct inotify_inode_mark, fsn_mark);
424 531
425 event_priv = kmem_cache_alloc(event_priv_cachep, GFP_NOFS); 532 event_priv = kmem_cache_alloc(event_priv_cachep, GFP_NOFS);
426 if (unlikely(!event_priv)) 533 if (unlikely(!event_priv))
@@ -429,37 +536,44 @@ void inotify_ignored_and_remove_idr(struct fsnotify_mark_entry *entry,
429 fsn_event_priv = &event_priv->fsnotify_event_priv_data; 536 fsn_event_priv = &event_priv->fsnotify_event_priv_data;
430 537
431 fsn_event_priv->group = group; 538 fsn_event_priv->group = group;
432 event_priv->wd = ientry->wd; 539 event_priv->wd = i_mark->wd;
433 540
434 ret = fsnotify_add_notify_event(group, ignored_event, fsn_event_priv); 541 notify_event = fsnotify_add_notify_event(group, ignored_event, fsn_event_priv, NULL);
435 if (ret) 542 if (notify_event) {
543 if (IS_ERR(notify_event))
544 ret = PTR_ERR(notify_event);
545 else
546 fsnotify_put_event(notify_event);
436 inotify_free_event_priv(fsn_event_priv); 547 inotify_free_event_priv(fsn_event_priv);
548 }
437 549
438skip_send_ignore: 550skip_send_ignore:
439 551
440 /* matches the reference taken when the event was created */ 552 /* matches the reference taken when the event was created */
441 fsnotify_put_event(ignored_event); 553 fsnotify_put_event(ignored_event);
442 554
443 /* remove this entry from the idr */ 555 /* remove this mark from the idr */
444 inotify_remove_from_idr(group, ientry); 556 inotify_remove_from_idr(group, i_mark);
445 557
446 atomic_dec(&group->inotify_data.user->inotify_watches); 558 atomic_dec(&group->inotify_data.user->inotify_watches);
447} 559}
448 560
449/* ding dong the mark is dead */ 561/* ding dong the mark is dead */
450static void inotify_free_mark(struct fsnotify_mark_entry *entry) 562static void inotify_free_mark(struct fsnotify_mark *fsn_mark)
451{ 563{
452 struct inotify_inode_mark_entry *ientry = (struct inotify_inode_mark_entry *)entry; 564 struct inotify_inode_mark *i_mark;
565
566 i_mark = container_of(fsn_mark, struct inotify_inode_mark, fsn_mark);
453 567
454 kmem_cache_free(inotify_inode_mark_cachep, ientry); 568 kmem_cache_free(inotify_inode_mark_cachep, i_mark);
455} 569}
456 570
457static int inotify_update_existing_watch(struct fsnotify_group *group, 571static int inotify_update_existing_watch(struct fsnotify_group *group,
458 struct inode *inode, 572 struct inode *inode,
459 u32 arg) 573 u32 arg)
460{ 574{
461 struct fsnotify_mark_entry *entry; 575 struct fsnotify_mark *fsn_mark;
462 struct inotify_inode_mark_entry *ientry; 576 struct inotify_inode_mark *i_mark;
463 __u32 old_mask, new_mask; 577 __u32 old_mask, new_mask;
464 __u32 mask; 578 __u32 mask;
465 int add = (arg & IN_MASK_ADD); 579 int add = (arg & IN_MASK_ADD);
@@ -467,52 +581,43 @@ static int inotify_update_existing_watch(struct fsnotify_group *group,
467 581
468 /* don't allow invalid bits: we don't want flags set */ 582 /* don't allow invalid bits: we don't want flags set */
469 mask = inotify_arg_to_mask(arg); 583 mask = inotify_arg_to_mask(arg);
470 if (unlikely(!mask)) 584 if (unlikely(!(mask & IN_ALL_EVENTS)))
471 return -EINVAL; 585 return -EINVAL;
472 586
473 spin_lock(&inode->i_lock); 587 fsn_mark = fsnotify_find_inode_mark(group, inode);
474 entry = fsnotify_find_mark_entry(group, inode); 588 if (!fsn_mark)
475 spin_unlock(&inode->i_lock);
476 if (!entry)
477 return -ENOENT; 589 return -ENOENT;
478 590
479 ientry = container_of(entry, struct inotify_inode_mark_entry, fsn_entry); 591 i_mark = container_of(fsn_mark, struct inotify_inode_mark, fsn_mark);
480 592
481 spin_lock(&entry->lock); 593 spin_lock(&fsn_mark->lock);
482 594
483 old_mask = entry->mask; 595 old_mask = fsn_mark->mask;
484 if (add) { 596 if (add)
485 entry->mask |= mask; 597 fsnotify_set_mark_mask_locked(fsn_mark, (fsn_mark->mask | mask));
486 new_mask = entry->mask; 598 else
487 } else { 599 fsnotify_set_mark_mask_locked(fsn_mark, mask);
488 entry->mask = mask; 600 new_mask = fsn_mark->mask;
489 new_mask = entry->mask;
490 }
491 601
492 spin_unlock(&entry->lock); 602 spin_unlock(&fsn_mark->lock);
493 603
494 if (old_mask != new_mask) { 604 if (old_mask != new_mask) {
495 /* more bits in old than in new? */ 605 /* more bits in old than in new? */
496 int dropped = (old_mask & ~new_mask); 606 int dropped = (old_mask & ~new_mask);
497 /* more bits in this entry than the inode's mask? */ 607 /* more bits in this fsn_mark than the inode's mask? */
498 int do_inode = (new_mask & ~inode->i_fsnotify_mask); 608 int do_inode = (new_mask & ~inode->i_fsnotify_mask);
499 /* more bits in this entry than the group? */
500 int do_group = (new_mask & ~group->mask);
501 609
502 /* update the inode with this new entry */ 610 /* update the inode with this new fsn_mark */
503 if (dropped || do_inode) 611 if (dropped || do_inode)
504 fsnotify_recalc_inode_mask(inode); 612 fsnotify_recalc_inode_mask(inode);
505 613
506 /* update the group mask with the new mask */
507 if (dropped || do_group)
508 fsnotify_recalc_group_mask(group);
509 } 614 }
510 615
511 /* return the wd */ 616 /* return the wd */
512 ret = ientry->wd; 617 ret = i_mark->wd;
513 618
514 /* match the get from fsnotify_find_mark_entry() */ 619 /* match the get from fsnotify_find_mark() */
515 fsnotify_put_mark(entry); 620 fsnotify_put_mark(fsn_mark);
516 621
517 return ret; 622 return ret;
518} 623}
@@ -521,73 +626,51 @@ static int inotify_new_watch(struct fsnotify_group *group,
521 struct inode *inode, 626 struct inode *inode,
522 u32 arg) 627 u32 arg)
523{ 628{
524 struct inotify_inode_mark_entry *tmp_ientry; 629 struct inotify_inode_mark *tmp_i_mark;
525 __u32 mask; 630 __u32 mask;
526 int ret; 631 int ret;
632 struct idr *idr = &group->inotify_data.idr;
633 spinlock_t *idr_lock = &group->inotify_data.idr_lock;
527 634
528 /* don't allow invalid bits: we don't want flags set */ 635 /* don't allow invalid bits: we don't want flags set */
529 mask = inotify_arg_to_mask(arg); 636 mask = inotify_arg_to_mask(arg);
530 if (unlikely(!mask)) 637 if (unlikely(!(mask & IN_ALL_EVENTS)))
531 return -EINVAL; 638 return -EINVAL;
532 639
533 tmp_ientry = kmem_cache_alloc(inotify_inode_mark_cachep, GFP_KERNEL); 640 tmp_i_mark = kmem_cache_alloc(inotify_inode_mark_cachep, GFP_KERNEL);
534 if (unlikely(!tmp_ientry)) 641 if (unlikely(!tmp_i_mark))
535 return -ENOMEM; 642 return -ENOMEM;
536 643
537 fsnotify_init_mark(&tmp_ientry->fsn_entry, inotify_free_mark); 644 fsnotify_init_mark(&tmp_i_mark->fsn_mark, inotify_free_mark);
538 tmp_ientry->fsn_entry.mask = mask; 645 tmp_i_mark->fsn_mark.mask = mask;
539 tmp_ientry->wd = -1; 646 tmp_i_mark->wd = -1;
540 647
541 ret = -ENOSPC; 648 ret = -ENOSPC;
542 if (atomic_read(&group->inotify_data.user->inotify_watches) >= inotify_max_user_watches) 649 if (atomic_read(&group->inotify_data.user->inotify_watches) >= inotify_max_user_watches)
543 goto out_err; 650 goto out_err;
544retry:
545 ret = -ENOMEM;
546 if (unlikely(!idr_pre_get(&group->inotify_data.idr, GFP_KERNEL)))
547 goto out_err;
548 651
549 /* we are putting the mark on the idr, take a reference */ 652 ret = inotify_add_to_idr(idr, idr_lock, &group->inotify_data.last_wd,
550 fsnotify_get_mark(&tmp_ientry->fsn_entry); 653 tmp_i_mark);
551 654 if (ret)
552 spin_lock(&group->inotify_data.idr_lock);
553 ret = idr_get_new_above(&group->inotify_data.idr, &tmp_ientry->fsn_entry,
554 group->inotify_data.last_wd+1,
555 &tmp_ientry->wd);
556 spin_unlock(&group->inotify_data.idr_lock);
557 if (ret) {
558 /* we didn't get on the idr, drop the idr reference */
559 fsnotify_put_mark(&tmp_ientry->fsn_entry);
560
561 /* idr was out of memory allocate and try again */
562 if (ret == -EAGAIN)
563 goto retry;
564 goto out_err; 655 goto out_err;
565 }
566 656
567 /* we are on the idr, now get on the inode */ 657 /* we are on the idr, now get on the inode */
568 ret = fsnotify_add_mark(&tmp_ientry->fsn_entry, group, inode); 658 ret = fsnotify_add_mark(&tmp_i_mark->fsn_mark, group, inode, NULL, 0);
569 if (ret) { 659 if (ret) {
570 /* we failed to get on the inode, get off the idr */ 660 /* we failed to get on the inode, get off the idr */
571 inotify_remove_from_idr(group, tmp_ientry); 661 inotify_remove_from_idr(group, tmp_i_mark);
572 goto out_err; 662 goto out_err;
573 } 663 }
574 664
575 /* update the idr hint, who cares about races, it's just a hint */
576 group->inotify_data.last_wd = tmp_ientry->wd;
577
578 /* increment the number of watches the user has */ 665 /* increment the number of watches the user has */
579 atomic_inc(&group->inotify_data.user->inotify_watches); 666 atomic_inc(&group->inotify_data.user->inotify_watches);
580 667
581 /* return the watch descriptor for this new entry */ 668 /* return the watch descriptor for this new mark */
582 ret = tmp_ientry->wd; 669 ret = tmp_i_mark->wd;
583
584 /* if this mark added a new event update the group mask */
585 if (mask & ~group->mask)
586 fsnotify_recalc_group_mask(group);
587 670
588out_err: 671out_err:
589 /* match the ref from fsnotify_init_markentry() */ 672 /* match the ref from fsnotify_init_mark() */
590 fsnotify_put_mark(&tmp_ientry->fsn_entry); 673 fsnotify_put_mark(&tmp_i_mark->fsn_mark);
591 674
592 return ret; 675 return ret;
593} 676}
@@ -616,11 +699,8 @@ retry:
616static struct fsnotify_group *inotify_new_group(struct user_struct *user, unsigned int max_events) 699static struct fsnotify_group *inotify_new_group(struct user_struct *user, unsigned int max_events)
617{ 700{
618 struct fsnotify_group *group; 701 struct fsnotify_group *group;
619 unsigned int grp_num;
620 702
621 /* fsnotify_obtain_group took a reference to group, we put this when we kill the file in the end */ 703 group = fsnotify_alloc_group(&inotify_fsnotify_ops);
622 grp_num = (INOTIFY_GROUP_NUM - atomic_inc_return(&inotify_grp_num));
623 group = fsnotify_obtain_group(grp_num, 0, &inotify_fsnotify_ops);
624 if (IS_ERR(group)) 704 if (IS_ERR(group))
625 return group; 705 return group;
626 706
@@ -726,7 +806,7 @@ fput_and_out:
726SYSCALL_DEFINE2(inotify_rm_watch, int, fd, __s32, wd) 806SYSCALL_DEFINE2(inotify_rm_watch, int, fd, __s32, wd)
727{ 807{
728 struct fsnotify_group *group; 808 struct fsnotify_group *group;
729 struct fsnotify_mark_entry *entry; 809 struct inotify_inode_mark *i_mark;
730 struct file *filp; 810 struct file *filp;
731 int ret = 0, fput_needed; 811 int ret = 0, fput_needed;
732 812
@@ -735,25 +815,23 @@ SYSCALL_DEFINE2(inotify_rm_watch, int, fd, __s32, wd)
735 return -EBADF; 815 return -EBADF;
736 816
737 /* verify that this is indeed an inotify instance */ 817 /* verify that this is indeed an inotify instance */
738 if (unlikely(filp->f_op != &inotify_fops)) { 818 ret = -EINVAL;
739 ret = -EINVAL; 819 if (unlikely(filp->f_op != &inotify_fops))
740 goto out; 820 goto out;
741 }
742 821
743 group = filp->private_data; 822 group = filp->private_data;
744 823
745 spin_lock(&group->inotify_data.idr_lock); 824 ret = -EINVAL;
746 entry = idr_find(&group->inotify_data.idr, wd); 825 i_mark = inotify_idr_find(group, wd);
747 if (unlikely(!entry)) { 826 if (unlikely(!i_mark))
748 spin_unlock(&group->inotify_data.idr_lock);
749 ret = -EINVAL;
750 goto out; 827 goto out;
751 }
752 fsnotify_get_mark(entry);
753 spin_unlock(&group->inotify_data.idr_lock);
754 828
755 fsnotify_destroy_mark_by_entry(entry); 829 ret = 0;
756 fsnotify_put_mark(entry); 830
831 fsnotify_destroy_mark(&i_mark->fsn_mark);
832
833 /* match ref taken by inotify_idr_find */
834 fsnotify_put_mark(&i_mark->fsn_mark);
757 835
758out: 836out:
759 fput_light(filp, fput_needed); 837 fput_light(filp, fput_needed);
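
From userspace the teardown above is visible as an IN_IGNORED event: inotify_ignored_and_remove_idr() queues one for the wd before it disappears, whether the watch was removed explicitly or by the kernel. A minimal sketch:

#include <stdio.h>
#include <unistd.h>
#include <sys/inotify.h>

int main(void)
{
	char buf[4096];
	int fd = inotify_init();
	int wd = inotify_add_watch(fd, "/tmp", IN_CREATE);

	inotify_rm_watch(fd, wd);

	/* the only queued event should be IN_IGNORED for wd */
	ssize_t len = read(fd, buf, sizeof(buf));
	if (len >= (ssize_t)sizeof(struct inotify_event)) {
		struct inotify_event *ev = (struct inotify_event *)buf;
		printf("wd=%d ignored=%d\n", ev->wd,
		       !!(ev->mask & IN_IGNORED));
	}
	return 0;
}
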
@@ -767,7 +845,28 @@ out:
767 */ 845 */
768static int __init inotify_user_setup(void) 846static int __init inotify_user_setup(void)
769{ 847{
770 inotify_inode_mark_cachep = KMEM_CACHE(inotify_inode_mark_entry, SLAB_PANIC); 848 BUILD_BUG_ON(IN_ACCESS != FS_ACCESS);
849 BUILD_BUG_ON(IN_MODIFY != FS_MODIFY);
850 BUILD_BUG_ON(IN_ATTRIB != FS_ATTRIB);
851 BUILD_BUG_ON(IN_CLOSE_WRITE != FS_CLOSE_WRITE);
852 BUILD_BUG_ON(IN_CLOSE_NOWRITE != FS_CLOSE_NOWRITE);
853 BUILD_BUG_ON(IN_OPEN != FS_OPEN);
854 BUILD_BUG_ON(IN_MOVED_FROM != FS_MOVED_FROM);
855 BUILD_BUG_ON(IN_MOVED_TO != FS_MOVED_TO);
856 BUILD_BUG_ON(IN_CREATE != FS_CREATE);
857 BUILD_BUG_ON(IN_DELETE != FS_DELETE);
858 BUILD_BUG_ON(IN_DELETE_SELF != FS_DELETE_SELF);
859 BUILD_BUG_ON(IN_MOVE_SELF != FS_MOVE_SELF);
860 BUILD_BUG_ON(IN_UNMOUNT != FS_UNMOUNT);
861 BUILD_BUG_ON(IN_Q_OVERFLOW != FS_Q_OVERFLOW);
862 BUILD_BUG_ON(IN_IGNORED != FS_IN_IGNORED);
863 BUILD_BUG_ON(IN_EXCL_UNLINK != FS_EXCL_UNLINK);
864 BUILD_BUG_ON(IN_ISDIR != FS_IN_ISDIR);
865 BUILD_BUG_ON(IN_ONESHOT != FS_IN_ONESHOT);
866
867 BUG_ON(hweight32(ALL_INOTIFY_BITS) != 21);
868
869 inotify_inode_mark_cachep = KMEM_CACHE(inotify_inode_mark, SLAB_PANIC);
771 event_priv_cachep = KMEM_CACHE(inotify_event_private_data, SLAB_PANIC); 870 event_priv_cachep = KMEM_CACHE(inotify_event_private_data, SLAB_PANIC);
772 871
773 inotify_max_queued_events = 16384; 872 inotify_max_queued_events = 16384;
diff --git a/fs/notify/mark.c b/fs/notify/mark.c
new file mode 100644
index 000000000000..325185e514bb
--- /dev/null
+++ b/fs/notify/mark.c
@@ -0,0 +1,371 @@
1/*
2 * Copyright (C) 2008 Red Hat, Inc., Eric Paris <eparis@redhat.com>
3 *
4 * This program is free software; you can redistribute it and/or modify
5 * it under the terms of the GNU General Public License as published by
6 * the Free Software Foundation; either version 2, or (at your option)
7 * any later version.
8 *
9 * This program is distributed in the hope that it will be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public License
15 * along with this program; see the file COPYING. If not, write to
16 * the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
17 */
18
19/*
20 * fsnotify inode mark locking/lifetime/and refcnting
21 *
22 * REFCNT:
23 * The mark->refcnt tells how many "things" in the kernel currently are
24 * referencing this object. The object typically will live inside the kernel
25 * with a refcnt of 2, one for each list it is on (i_list, g_list). Any task
26 * which can find this object, holding the appropriate locks, can take a reference
27 * and the object itself is guaranteed to survive until the reference is dropped.
28 *
29 * LOCKING:
30 * There are 3 spinlocks involved with fsnotify inode marks and they MUST
31 * be taken in order as follows:
32 *
33 * mark->lock
34 * group->mark_lock
35 * inode->i_lock
36 *
37 * mark->lock protects 2 things, mark->group and mark->inode. You must hold
38 * that lock to dereference either of these things (they could be NULL even with
39 * the lock)
40 *
41 * group->mark_lock protects the marks_list anchored inside a given group
42 * and each mark is hooked via the g_list. It also sorta protects the
43 * free_g_list, which when used is anchored by a private list on the stack of the
44 * task which held the group->mark_lock.
45 *
46 * inode->i_lock protects the i_fsnotify_marks list anchored inside a
47 * given inode and each mark is hooked via the i_list. (and sorta the
48 * free_i_list)
49 *
50 *
51 * LIFETIME:
52 * Inode marks survive between when they are added to an inode and when their
53 * refcnt==0.
54 *
55 * The inode mark can be cleared for a number of different reasons including:
56 * - The inode is unlinked for the last time. (fsnotify_inode_remove)
57 * - The inode is being evicted from cache. (fsnotify_inode_delete)
58 * - The fs the inode is on is unmounted. (fsnotify_inode_delete/fsnotify_unmount_inodes)
59 * - Something explicitly requests that it be removed. (fsnotify_destroy_mark)
60 * - The fsnotify_group associated with the mark is going away and all such marks
61 * need to be cleaned up. (fsnotify_clear_marks_by_group)
62 *
63 * Worst case we are given an inode and need to clean up all the marks on that
64 * inode. We take i_lock and walk the i_fsnotify_marks safely. For each
65 * mark on the list we take a reference (so the mark can't disappear under us).
66 * We remove that mark from the inode's list of marks and we add this mark to a
67 * private list anchored on the stack using i_free_list. At this point we no
68 * longer fear anything finding the mark using the inode's list of marks.
69 *
70 * We can safely and locklessly run the private list on the stack of everything
71 * we just unattached from the original inode. For each mark on the private list
72 * we grab the mark->lock and can thus dereference mark->group and mark->inode. If
73 * we see the group and inode are not NULL we take those locks. Now holding all
74 * 3 locks we can completely remove the mark from other tasks finding it in the
75 * future. Remember, 10 things might already be referencing this mark, but they
76 * better be holding a ref. We drop the reference we took before we unhooked it
77 * from the inode. When the ref hits 0 we can free the mark.
78 *
79 * Very similarly for freeing by group, except we use free_g_list.
80 *
81 * This has the very interesting property of being able to run concurrently with
82 * any (or all) other directions.
83 */
84
85#include <linux/fs.h>
86#include <linux/init.h>
87#include <linux/kernel.h>
88#include <linux/kthread.h>
89#include <linux/module.h>
90#include <linux/mutex.h>
91#include <linux/slab.h>
92#include <linux/spinlock.h>
93#include <linux/srcu.h>
94#include <linux/writeback.h> /* for inode_lock */
95
96#include <asm/atomic.h>
97
98#include <linux/fsnotify_backend.h>
99#include "fsnotify.h"
100
101struct srcu_struct fsnotify_mark_srcu;
102static DEFINE_SPINLOCK(destroy_lock);
103static LIST_HEAD(destroy_list);
104static DECLARE_WAIT_QUEUE_HEAD(destroy_waitq);
105
106void fsnotify_get_mark(struct fsnotify_mark *mark)
107{
108 atomic_inc(&mark->refcnt);
109}
110
111void fsnotify_put_mark(struct fsnotify_mark *mark)
112{
113 if (atomic_dec_and_test(&mark->refcnt))
114 mark->free_mark(mark);
115}
116
117/*
118 * Any time a mark is getting freed we end up here.
119 * The caller had better be holding a reference to this mark so we don't actually
120 * do the final put under the mark->lock
121 */
122void fsnotify_destroy_mark(struct fsnotify_mark *mark)
123{
124 struct fsnotify_group *group;
125 struct inode *inode = NULL;
126
127 spin_lock(&mark->lock);
128
129 group = mark->group;
130
131 /* something else already called this function on this mark */
132 if (!(mark->flags & FSNOTIFY_MARK_FLAG_ALIVE)) {
133 spin_unlock(&mark->lock);
134 return;
135 }
136
137 mark->flags &= ~FSNOTIFY_MARK_FLAG_ALIVE;
138
139 /* 1 from caller and 1 for being on i_list/g_list */
140 BUG_ON(atomic_read(&mark->refcnt) < 2);
141
142 spin_lock(&group->mark_lock);
143
144 if (mark->flags & FSNOTIFY_MARK_FLAG_INODE) {
145 inode = mark->i.inode;
146 fsnotify_destroy_inode_mark(mark);
147 } else if (mark->flags & FSNOTIFY_MARK_FLAG_VFSMOUNT)
148 fsnotify_destroy_vfsmount_mark(mark);
149 else
150 BUG();
151
152 list_del_init(&mark->g_list);
153
154 spin_unlock(&group->mark_lock);
155 spin_unlock(&mark->lock);
156
157 spin_lock(&destroy_lock);
158 list_add(&mark->destroy_list, &destroy_list);
159 spin_unlock(&destroy_lock);
160 wake_up(&destroy_waitq);
161
162 /*
163 * Some groups like to know that marks are being freed. This is a
164 * callback to the group function to let it know that this mark
165 * is being freed.
166 */
167 if (group->ops->freeing_mark)
168 group->ops->freeing_mark(mark, group);
169
170 /*
171 * __fsnotify_update_child_dentry_flags(inode);
172 *
173 * I really want to call that, but we can't, we have no idea if the inode
174 * still exists the second we drop the mark->lock.
175 *
176 * The next time an event arrives at this inode from one of its children
177 * __fsnotify_parent will see that the inode doesn't care about its
178 * children and will update all of these flags then. So really this
179 * is just a lazy update (and could be a perf win...)
180 */
181
182 if (inode && (mark->flags & FSNOTIFY_MARK_FLAG_OBJECT_PINNED))
183 iput(inode);
184
185 /*
186 * it's possible that this group tried to destroy itself, but this
187 * mark was simultaneously being freed by the inode. If that's the
188 * case, we finish freeing the group here.
189 */
190 if (unlikely(atomic_dec_and_test(&group->num_marks)))
191 fsnotify_final_destroy_group(group);
192}
193
194void fsnotify_set_mark_mask_locked(struct fsnotify_mark *mark, __u32 mask)
195{
196 assert_spin_locked(&mark->lock);
197
198 mark->mask = mask;
199
200 if (mark->flags & FSNOTIFY_MARK_FLAG_INODE)
201 fsnotify_set_inode_mark_mask_locked(mark, mask);
202}
203
204void fsnotify_set_mark_ignored_mask_locked(struct fsnotify_mark *mark, __u32 mask)
205{
206 assert_spin_locked(&mark->lock);
207
208 mark->ignored_mask = mask;
209}
210
211/*
212 * Attach an initialized mark to a given group and fs object.
213 * These marks may be used by the fsnotify backend to determine which
214 * event types should be delivered to which group.
215 */
216int fsnotify_add_mark(struct fsnotify_mark *mark,
217 struct fsnotify_group *group, struct inode *inode,
218 struct vfsmount *mnt, int allow_dups)
219{
220 int ret = 0;
221
222 BUG_ON(inode && mnt);
223 BUG_ON(!inode && !mnt);
224
225 /*
226 * LOCKING ORDER!!!!
227 * mark->lock
228 * group->mark_lock
229 * inode->i_lock
230 */
231 spin_lock(&mark->lock);
232 spin_lock(&group->mark_lock);
233
234 mark->flags |= FSNOTIFY_MARK_FLAG_ALIVE;
235
236 mark->group = group;
237 list_add(&mark->g_list, &group->marks_list);
238 atomic_inc(&group->num_marks);
239 fsnotify_get_mark(mark); /* for i_list and g_list */
240
241 if (inode) {
242 ret = fsnotify_add_inode_mark(mark, group, inode, allow_dups);
243 if (ret)
244 goto err;
245 } else if (mnt) {
246 ret = fsnotify_add_vfsmount_mark(mark, group, mnt, allow_dups);
247 if (ret)
248 goto err;
249 } else {
250 BUG();
251 }
252
253 spin_unlock(&group->mark_lock);
254
255 /* this will pin the object if appropriate */
256 fsnotify_set_mark_mask_locked(mark, mark->mask);
257
258 spin_unlock(&mark->lock);
259
260 if (inode)
261 __fsnotify_update_child_dentry_flags(inode);
262
263 return ret;
264err:
265 mark->flags &= ~FSNOTIFY_MARK_FLAG_ALIVE;
266 list_del_init(&mark->g_list);
267 mark->group = NULL;
268 atomic_dec(&group->num_marks);
269
270 spin_unlock(&group->mark_lock);
271 spin_unlock(&mark->lock);
272
273 spin_lock(&destroy_lock);
274 list_add(&mark->destroy_list, &destroy_list);
275 spin_unlock(&destroy_lock);
276 wake_up(&destroy_waitq);
277
278 return ret;
279}
280
281/*
282 * clear any marks in a group in which mark->flags & flags is true
283 */
284void fsnotify_clear_marks_by_group_flags(struct fsnotify_group *group,
285 unsigned int flags)
286{
287 struct fsnotify_mark *lmark, *mark;
288 LIST_HEAD(free_list);
289
290 spin_lock(&group->mark_lock);
291 list_for_each_entry_safe(mark, lmark, &group->marks_list, g_list) {
292 if (mark->flags & flags) {
293 list_add(&mark->free_g_list, &free_list);
294 list_del_init(&mark->g_list);
295 fsnotify_get_mark(mark);
296 }
297 }
298 spin_unlock(&group->mark_lock);
299
300 list_for_each_entry_safe(mark, lmark, &free_list, free_g_list) {
301 fsnotify_destroy_mark(mark);
302 fsnotify_put_mark(mark);
303 }
304}
305
306/*
307 * Given a group, destroy all of the marks associated with that group.
308 */
309void fsnotify_clear_marks_by_group(struct fsnotify_group *group)
310{
311 fsnotify_clear_marks_by_group_flags(group, (unsigned int)-1);
312}
313
314void fsnotify_duplicate_mark(struct fsnotify_mark *new, struct fsnotify_mark *old)
315{
316 assert_spin_locked(&old->lock);
317 new->i.inode = old->i.inode;
318 new->m.mnt = old->m.mnt;
319 new->group = old->group;
320 new->mask = old->mask;
321 new->free_mark = old->free_mark;
322}
323
324/*
325 * Nothing fancy, just initialize lists and locks and counters.
326 */
327void fsnotify_init_mark(struct fsnotify_mark *mark,
328 void (*free_mark)(struct fsnotify_mark *mark))
329{
330 memset(mark, 0, sizeof(*mark));
331 spin_lock_init(&mark->lock);
332 atomic_set(&mark->refcnt, 1);
333 mark->free_mark = free_mark;
334}
335
336static int fsnotify_mark_destroy(void *ignored)
337{
338 struct fsnotify_mark *mark, *next;
339 LIST_HEAD(private_destroy_list);
340
341 for (;;) {
342 spin_lock(&destroy_lock);
343 /* exchange the list head */
344 list_replace_init(&destroy_list, &private_destroy_list);
345 spin_unlock(&destroy_lock);
346
347 synchronize_srcu(&fsnotify_mark_srcu);
348
349 list_for_each_entry_safe(mark, next, &private_destroy_list, destroy_list) {
350 list_del_init(&mark->destroy_list);
351 fsnotify_put_mark(mark);
352 }
353
354 wait_event_interruptible(destroy_waitq, !list_empty(&destroy_list));
355 }
356
357 return 0;
358}
359
360static int __init fsnotify_mark_init(void)
361{
362 struct task_struct *thread;
363
364 thread = kthread_run(fsnotify_mark_destroy, NULL,
365 "fsnotify_mark");
366 if (IS_ERR(thread))
367 panic("unable to start fsnotify mark destruction thread.");
368
369 return 0;
370}
371device_initcall(fsnotify_mark_init);
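
The destroy thread above decouples mark teardown from SRCU grace periods: fsnotify_destroy_mark() only queues the dead mark and wakes the thread, which waits out any readers before dropping the final reference. A userspace analog of the same pattern, with a pthread standing in for the kthread and a crude sleep standing in for synchronize_srcu() (all names here are illustrative):

#include <pthread.h>
#include <stdlib.h>
#include <unistd.h>

struct dead_obj { struct dead_obj *next; };

static struct dead_obj *destroy_list;
static pthread_mutex_t destroy_lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t destroy_waitq = PTHREAD_COND_INITIALIZER;

void defer_free(struct dead_obj *obj)	/* like list_add + wake_up() */
{
	pthread_mutex_lock(&destroy_lock);
	obj->next = destroy_list;
	destroy_list = obj;
	pthread_cond_signal(&destroy_waitq);
	pthread_mutex_unlock(&destroy_lock);
}

void *reaper(void *arg)			/* like fsnotify_mark_destroy() */
{
	for (;;) {
		pthread_mutex_lock(&destroy_lock);
		while (!destroy_list)
			pthread_cond_wait(&destroy_waitq, &destroy_lock);
		struct dead_obj *batch = destroy_list;	/* swap the list head */
		destroy_list = NULL;
		pthread_mutex_unlock(&destroy_lock);

		sleep(1);	/* crude stand-in for synchronize_srcu() */

		while (batch) {
			struct dead_obj *next = batch->next;
			free(batch);
			batch = next;
		}
	}
	return arg;
}
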
diff --git a/fs/notify/notification.c b/fs/notify/notification.c
index b8bf53b4c108..f39260f8f865 100644
--- a/fs/notify/notification.c
+++ b/fs/notify/notification.c
@@ -56,7 +56,7 @@ static struct kmem_cache *fsnotify_event_holder_cachep;
56 * it is needed. Its refcnt is set to 1 at kernel init time and will never 56 * it is needed. Its refcnt is set to 1 at kernel init time and will never
57 * get set to 0 so it will never get 'freed' 57 * get set to 0 so it will never get 'freed'
58 */ 58 */
59static struct fsnotify_event q_overflow_event; 59static struct fsnotify_event *q_overflow_event;
60static atomic_t fsnotify_sync_cookie = ATOMIC_INIT(0); 60static atomic_t fsnotify_sync_cookie = ATOMIC_INIT(0);
61 61
62/** 62/**
@@ -87,12 +87,15 @@ void fsnotify_put_event(struct fsnotify_event *event)
87 return; 87 return;
88 88
89 if (atomic_dec_and_test(&event->refcnt)) { 89 if (atomic_dec_and_test(&event->refcnt)) {
90 pr_debug("%s: event=%p\n", __func__, event);
91
90 if (event->data_type == FSNOTIFY_EVENT_PATH) 92 if (event->data_type == FSNOTIFY_EVENT_PATH)
91 path_put(&event->path); 93 path_put(&event->path);
92 94
93 BUG_ON(!list_empty(&event->private_data_list)); 95 BUG_ON(!list_empty(&event->private_data_list));
94 96
95 kfree(event->file_name); 97 kfree(event->file_name);
98 put_pid(event->tgid);
96 kmem_cache_free(fsnotify_event_cachep, event); 99 kmem_cache_free(fsnotify_event_cachep, event);
97 } 100 }
98} 101}
@@ -104,7 +107,8 @@ struct fsnotify_event_holder *fsnotify_alloc_event_holder(void)
104 107
105void fsnotify_destroy_event_holder(struct fsnotify_event_holder *holder) 108void fsnotify_destroy_event_holder(struct fsnotify_event_holder *holder)
106{ 109{
107 kmem_cache_free(fsnotify_event_holder_cachep, holder); 110 if (holder)
111 kmem_cache_free(fsnotify_event_holder_cachep, holder);
108} 112}
109 113
110/* 114/*
@@ -129,53 +133,20 @@ struct fsnotify_event_private_data *fsnotify_remove_priv_from_event(struct fsnot
129} 133}
130 134
131/* 135/*
132 * Check if 2 events contain the same information. We do not compare private data
133 * but at this moment that isn't a problem for any know fsnotify listeners.
134 */
135static bool event_compare(struct fsnotify_event *old, struct fsnotify_event *new)
136{
137 if ((old->mask == new->mask) &&
138 (old->to_tell == new->to_tell) &&
139 (old->data_type == new->data_type) &&
140 (old->name_len == new->name_len)) {
141 switch (old->data_type) {
142 case (FSNOTIFY_EVENT_INODE):
143 /* remember, after old was put on the wait_q we aren't
144 * allowed to look at the inode any more, only thing
145 * left to check was if the file_name is the same */
146 if (!old->name_len ||
147 !strcmp(old->file_name, new->file_name))
148 return true;
149 break;
150 case (FSNOTIFY_EVENT_PATH):
151 if ((old->path.mnt == new->path.mnt) &&
152 (old->path.dentry == new->path.dentry))
153 return true;
154 break;
155 case (FSNOTIFY_EVENT_NONE):
156 if (old->mask & FS_Q_OVERFLOW)
157 return true;
158 else if (old->mask & FS_IN_IGNORED)
159 return false;
160 return false;
161 };
162 }
163 return false;
164}
165
166/*
167 * Add an event to the group notification queue. The group can later pull this 136 * Add an event to the group notification queue. The group can later pull this
168 * event off the queue to deal with. If the event is successfully added to the 137 * event off the queue to deal with. If the event is successfully added to the
169 * group's notification queue, a reference is taken on event. 138 * group's notification queue, a reference is taken on event.
170 */ 139 */
171int fsnotify_add_notify_event(struct fsnotify_group *group, struct fsnotify_event *event, 140struct fsnotify_event *fsnotify_add_notify_event(struct fsnotify_group *group, struct fsnotify_event *event,
172 struct fsnotify_event_private_data *priv) 141 struct fsnotify_event_private_data *priv,
142 struct fsnotify_event *(*merge)(struct list_head *,
143 struct fsnotify_event *))
173{ 144{
145 struct fsnotify_event *return_event = NULL;
174 struct fsnotify_event_holder *holder = NULL; 146 struct fsnotify_event_holder *holder = NULL;
175 struct list_head *list = &group->notification_list; 147 struct list_head *list = &group->notification_list;
176 struct fsnotify_event_holder *last_holder; 148
177 struct fsnotify_event *last_event; 149 pr_debug("%s: group=%p event=%p priv=%p\n", __func__, group, event, priv);
178 int ret = 0;
179 150
180 /* 151 /*
181 * There is one fsnotify_event_holder embedded inside each fsnotify_event. 152 * There is one fsnotify_event_holder embedded inside each fsnotify_event.
@@ -189,18 +160,40 @@ int fsnotify_add_notify_event(struct fsnotify_group *group, struct fsnotify_even
189alloc_holder: 160alloc_holder:
190 holder = fsnotify_alloc_event_holder(); 161 holder = fsnotify_alloc_event_holder();
191 if (!holder) 162 if (!holder)
192 return -ENOMEM; 163 return ERR_PTR(-ENOMEM);
193 } 164 }
194 165
195 mutex_lock(&group->notification_mutex); 166 mutex_lock(&group->notification_mutex);
196 167
197 if (group->q_len >= group->max_events) { 168 if (group->q_len >= group->max_events) {
198 event = &q_overflow_event; 169 event = q_overflow_event;
199 ret = -EOVERFLOW; 170
171 /*
172 * we need to return the overflow event
173 * which means we need a ref
174 */
175 fsnotify_get_event(event);
176 return_event = event;
177
200 /* sorry, no private data on the overflow event */ 178 /* sorry, no private data on the overflow event */
201 priv = NULL; 179 priv = NULL;
202 } 180 }
203 181
182 if (!list_empty(list) && merge) {
183 struct fsnotify_event *tmp;
184
185 tmp = merge(list, event);
186 if (tmp) {
187 mutex_unlock(&group->notification_mutex);
188
189 if (return_event)
190 fsnotify_put_event(return_event);
191 if (holder != &event->holder)
192 fsnotify_destroy_event_holder(holder);
193 return tmp;
194 }
195 }
196
204 spin_lock(&event->lock); 197 spin_lock(&event->lock);
205 198
206 if (list_empty(&event->holder.event_list)) { 199 if (list_empty(&event->holder.event_list)) {
@@ -212,19 +205,13 @@ alloc_holder:
212 * event holder was used, go back and get a new one */ 205 * event holder was used, go back and get a new one */
213 spin_unlock(&event->lock); 206 spin_unlock(&event->lock);
214 mutex_unlock(&group->notification_mutex); 207 mutex_unlock(&group->notification_mutex);
215 goto alloc_holder;
216 }
217 208
218 if (!list_empty(list)) { 209 if (return_event) {
219 last_holder = list_entry(list->prev, struct fsnotify_event_holder, event_list); 210 fsnotify_put_event(return_event);
220 last_event = last_holder->event; 211 return_event = NULL;
221 if (event_compare(last_event, event)) {
222 spin_unlock(&event->lock);
223 mutex_unlock(&group->notification_mutex);
224 if (holder != &event->holder)
225 fsnotify_destroy_event_holder(holder);
226 return -EEXIST;
227 } 212 }
213
214 goto alloc_holder;
228 } 215 }
229 216
230 group->q_len++; 217 group->q_len++;
@@ -238,7 +225,7 @@ alloc_holder:
238 mutex_unlock(&group->notification_mutex); 225 mutex_unlock(&group->notification_mutex);
239 226
240 wake_up(&group->notification_waitq); 227 wake_up(&group->notification_waitq);
241 return ret; 228 return return_event;
242} 229}
243 230
244/* 231/*
@@ -253,6 +240,8 @@ struct fsnotify_event *fsnotify_remove_notify_event(struct fsnotify_group *group
253 240
254 BUG_ON(!mutex_is_locked(&group->notification_mutex)); 241 BUG_ON(!mutex_is_locked(&group->notification_mutex));
255 242
243 pr_debug("%s: group=%p\n", __func__, group);
244
256 holder = list_first_entry(&group->notification_list, struct fsnotify_event_holder, event_list); 245 holder = list_first_entry(&group->notification_list, struct fsnotify_event_holder, event_list);
257 246
258 event = holder->event; 247 event = holder->event;
@@ -314,25 +303,82 @@ void fsnotify_flush_notify(struct fsnotify_group *group)
314 303
315static void initialize_event(struct fsnotify_event *event) 304static void initialize_event(struct fsnotify_event *event)
316{ 305{
317 event->holder.event = NULL;
318 INIT_LIST_HEAD(&event->holder.event_list); 306 INIT_LIST_HEAD(&event->holder.event_list);
319 atomic_set(&event->refcnt, 1); 307 atomic_set(&event->refcnt, 1);
320 308
321 spin_lock_init(&event->lock); 309 spin_lock_init(&event->lock);
322 310
323 event->path.dentry = NULL;
324 event->path.mnt = NULL;
325 event->inode = NULL;
326 event->data_type = FSNOTIFY_EVENT_NONE;
327
328 INIT_LIST_HEAD(&event->private_data_list); 311 INIT_LIST_HEAD(&event->private_data_list);
312}
313
314/*
315 * Caller damn well better be holding whatever mutex is protecting the
316 * old_holder->event_list and the new_event must be a clean event which
317 * cannot be found anywhere else in the kernel.
318 */
319int fsnotify_replace_event(struct fsnotify_event_holder *old_holder,
320 struct fsnotify_event *new_event)
321{
322 struct fsnotify_event *old_event = old_holder->event;
323 struct fsnotify_event_holder *new_holder = &new_event->holder;
329 324
330 event->to_tell = NULL; 325 enum event_spinlock_class {
326 SPINLOCK_OLD,
327 SPINLOCK_NEW,
328 };
331 329
332 event->file_name = NULL; 330 pr_debug("%s: old_event=%p new_event=%p\n", __func__, old_event, new_event);
333 event->name_len = 0;
334 331
335 event->sync_cookie = 0; 332 /*
333 * if the new_event's embedded holder is in use someone
334 * screwed up and didn't give us a clean new event.
335 */
336 BUG_ON(!list_empty(&new_holder->event_list));
337
338 spin_lock_nested(&old_event->lock, SPINLOCK_OLD);
339 spin_lock_nested(&new_event->lock, SPINLOCK_NEW);
340
341 new_holder->event = new_event;
342 list_replace_init(&old_holder->event_list, &new_holder->event_list);
343
344 spin_unlock(&new_event->lock);
345 spin_unlock(&old_event->lock);
346
347 /* old_holder == &old_event->holder means the event was referenced through its embedded holder */
348 if (old_holder != &old_event->holder)
349 fsnotify_destroy_event_holder(old_holder);
350
351 fsnotify_get_event(new_event); /* on the list take reference */
352 fsnotify_put_event(old_event); /* off the list, drop reference */
353
354 return 0;
355}
356
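
Both event locks taken in fsnotify_replace_event() belong to the same lockdep class (they are all initialized by the same spin_lock_init() call in initialize_event()), so taking them with plain spin_lock() would trip lockdep's recursive-locking check even though no deadlock is possible. The SPINLOCK_OLD/SPINLOCK_NEW subclasses make the intent explicit; condensed, the pattern is:

/* two locks of one lockdep class: annotate with distinct subclasses */
spin_lock_nested(&old_event->lock, SPINLOCK_OLD);	/* subclass 0 */
spin_lock_nested(&new_event->lock, SPINLOCK_NEW);	/* subclass 1 */
/* ... move list membership from old to new ... */
spin_unlock(&new_event->lock);
spin_unlock(&old_event->lock);
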
357struct fsnotify_event *fsnotify_clone_event(struct fsnotify_event *old_event)
358{
359 struct fsnotify_event *event;
360
361 event = kmem_cache_alloc(fsnotify_event_cachep, GFP_KERNEL);
362 if (!event)
363 return NULL;
364
365 pr_debug("%s: old_event=%p new_event=%p\n", __func__, old_event, event);
366
367 memcpy(event, old_event, sizeof(*event));
368 initialize_event(event);
369
370 if (event->name_len) {
371 event->file_name = kstrdup(old_event->file_name, GFP_KERNEL);
372 if (!event->file_name) {
373 kmem_cache_free(fsnotify_event_cachep, event);
374 return NULL;
375 }
376 }
377 event->tgid = get_pid(old_event->tgid);
378 if (event->data_type == FSNOTIFY_EVENT_PATH)
379 path_get(&event->path);
380
381 return event;
336} 382}
337 383
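
fsnotify_clone_event() memcpy()s the whole event and then re-takes every reference the copy now shares: the file name is duplicated, the tgid pid gets an extra reference, and a path-type event takes another path reference, so the clone can be modified and freed independently of the original. A hedged usage sketch (the FS_IN_IGNORED tweak and the error value are illustrative, not from this patch):

struct fsnotify_event *copy;

copy = fsnotify_clone_event(event);
if (!copy)
	return -ENOMEM;
copy->mask |= FS_IN_IGNORED;	/* safe: the copy owns its own refs */
/* ... queue the copy, or swap it in with fsnotify_replace_event() ... */
fsnotify_put_event(copy);	/* drop the clone's initial reference */
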
338/* 384/*
@@ -348,15 +394,18 @@ static void initialize_event(struct fsnotify_event *event)
348 * @name the filename, if available 394 * @name the filename, if available
349 */ 395 */
350struct fsnotify_event *fsnotify_create_event(struct inode *to_tell, __u32 mask, void *data, 396struct fsnotify_event *fsnotify_create_event(struct inode *to_tell, __u32 mask, void *data,
351 int data_type, const char *name, u32 cookie, 397 int data_type, const unsigned char *name,
352 gfp_t gfp) 398 u32 cookie, gfp_t gfp)
353{ 399{
354 struct fsnotify_event *event; 400 struct fsnotify_event *event;
355 401
356 event = kmem_cache_alloc(fsnotify_event_cachep, gfp); 402 event = kmem_cache_zalloc(fsnotify_event_cachep, gfp);
357 if (!event) 403 if (!event)
358 return NULL; 404 return NULL;
359 405
406 pr_debug("%s: event=%p to_tell=%p mask=%x data=%p data_type=%d\n",
407 __func__, event, to_tell, mask, data, data_type);
408
360 initialize_event(event); 409 initialize_event(event);
361 410
362 if (name) { 411 if (name) {
@@ -368,30 +417,21 @@ struct fsnotify_event *fsnotify_create_event(struct inode *to_tell, __u32 mask,
368 event->name_len = strlen(event->file_name); 417 event->name_len = strlen(event->file_name);
369 } 418 }
370 419
420 event->tgid = get_pid(task_tgid(current));
371 event->sync_cookie = cookie; 421 event->sync_cookie = cookie;
372 event->to_tell = to_tell; 422 event->to_tell = to_tell;
423 event->data_type = data_type;
373 424
374 switch (data_type) { 425 switch (data_type) {
375 case FSNOTIFY_EVENT_FILE: {
376 struct file *file = data;
377 struct path *path = &file->f_path;
378 event->path.dentry = path->dentry;
379 event->path.mnt = path->mnt;
380 path_get(&event->path);
381 event->data_type = FSNOTIFY_EVENT_PATH;
382 break;
383 }
384 case FSNOTIFY_EVENT_PATH: { 426 case FSNOTIFY_EVENT_PATH: {
385 struct path *path = data; 427 struct path *path = data;
386 event->path.dentry = path->dentry; 428 event->path.dentry = path->dentry;
387 event->path.mnt = path->mnt; 429 event->path.mnt = path->mnt;
388 path_get(&event->path); 430 path_get(&event->path);
389 event->data_type = FSNOTIFY_EVENT_PATH;
390 break; 431 break;
391 } 432 }
392 case FSNOTIFY_EVENT_INODE: 433 case FSNOTIFY_EVENT_INODE:
393 event->inode = data; 434 event->inode = data;
394 event->data_type = FSNOTIFY_EVENT_INODE;
395 break; 435 break;
396 case FSNOTIFY_EVENT_NONE: 436 case FSNOTIFY_EVENT_NONE:
397 event->inode = NULL; 437 event->inode = NULL;
@@ -412,8 +452,11 @@ __init int fsnotify_notification_init(void)
412 fsnotify_event_cachep = KMEM_CACHE(fsnotify_event, SLAB_PANIC); 452 fsnotify_event_cachep = KMEM_CACHE(fsnotify_event, SLAB_PANIC);
413 fsnotify_event_holder_cachep = KMEM_CACHE(fsnotify_event_holder, SLAB_PANIC); 453 fsnotify_event_holder_cachep = KMEM_CACHE(fsnotify_event_holder, SLAB_PANIC);
414 454
415 initialize_event(&q_overflow_event); 455 q_overflow_event = fsnotify_create_event(NULL, FS_Q_OVERFLOW, NULL,
416 q_overflow_event.mask = FS_Q_OVERFLOW; 456 FSNOTIFY_EVENT_NONE, NULL, 0,
457 GFP_KERNEL);
458 if (!q_overflow_event)
459 panic("unable to allocate fsnotify q_overflow_event\n");
417 460
418 return 0; 461 return 0;
419} 462}
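
With kmem_cache_zalloc() every field of a new event now starts out zero/NULL, which is what lets the q_overflow_event above be built through the ordinary constructor instead of the removed hand-rolled initialization. A minimal caller sketch, assuming only the signature visible in this hunk:

struct fsnotify_event *event;

event = fsnotify_create_event(inode, FS_MODIFY, NULL, FSNOTIFY_EVENT_NONE,
			      NULL, 0, GFP_KERNEL);
if (!event)
	return -ENOMEM;
/* ... deliver the event to interested groups ... */
fsnotify_put_event(event);	/* drop the creation reference */
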
diff --git a/fs/notify/vfsmount_mark.c b/fs/notify/vfsmount_mark.c
new file mode 100644
index 000000000000..56772b578fbd
--- /dev/null
+++ b/fs/notify/vfsmount_mark.c
@@ -0,0 +1,187 @@
1/*
2 * Copyright (C) 2008 Red Hat, Inc., Eric Paris <eparis@redhat.com>
3 *
4 * This program is free software; you can redistribute it and/or modify
5 * it under the terms of the GNU General Public License as published by
6 * the Free Software Foundation; either version 2, or (at your option)
7 * any later version.
8 *
9 * This program is distributed in the hope that it will be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public License
15 * along with this program; see the file COPYING. If not, write to
16 * the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
17 */
18
19#include <linux/fs.h>
20#include <linux/init.h>
21#include <linux/kernel.h>
22#include <linux/module.h>
23#include <linux/mount.h>
24#include <linux/mutex.h>
25#include <linux/spinlock.h>
26#include <linux/writeback.h> /* for inode_lock */
27
28#include <asm/atomic.h>
29
30#include <linux/fsnotify_backend.h>
31#include "fsnotify.h"
32
33void fsnotify_clear_marks_by_mount(struct vfsmount *mnt)
34{
35 struct fsnotify_mark *mark, *lmark;
36 struct hlist_node *pos, *n;
37 LIST_HEAD(free_list);
38
39 spin_lock(&mnt->mnt_root->d_lock);
40 hlist_for_each_entry_safe(mark, pos, n, &mnt->mnt_fsnotify_marks, m.m_list) {
41 list_add(&mark->m.free_m_list, &free_list);
42 hlist_del_init_rcu(&mark->m.m_list);
43 fsnotify_get_mark(mark);
44 }
45 spin_unlock(&mnt->mnt_root->d_lock);
46
47 list_for_each_entry_safe(mark, lmark, &free_list, m.free_m_list) {
48 fsnotify_destroy_mark(mark);
49 fsnotify_put_mark(mark);
50 }
51}
52
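
fsnotify_clear_marks_by_mount() above uses a common two-phase teardown: each mark is unhooked and pinned while the spinlock is held, and the heavier fsnotify_destroy_mark()/fsnotify_put_mark() work happens only after the lock is dropped, since destruction may take other locks. The shape of the pattern, with hypothetical names (obj, get_ref, teardown, put_ref) standing in for the fsnotify specifics:

LIST_HEAD(free_list);

spin_lock(&lock);
hlist_for_each_entry_safe(obj, pos, n, &head, node) {
	hlist_del_init_rcu(&obj->node);		/* unhook under the lock */
	list_add(&obj->free_node, &free_list);
	get_ref(obj);				/* keep obj alive past unlock */
}
spin_unlock(&lock);

list_for_each_entry_safe(obj, tmp, &free_list, free_node) {
	teardown(obj);				/* may block; lock is dropped */
	put_ref(obj);
}
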
53void fsnotify_clear_vfsmount_marks_by_group(struct fsnotify_group *group)
54{
55 fsnotify_clear_marks_by_group_flags(group, FSNOTIFY_MARK_FLAG_VFSMOUNT);
56}
57
58/*
59 * Recalculate the mask of events relevant to a given vfsmount; caller holds mnt->mnt_root->d_lock.
60 */
61static void fsnotify_recalc_vfsmount_mask_locked(struct vfsmount *mnt)
62{
63 struct fsnotify_mark *mark;
64 struct hlist_node *pos;
65 __u32 new_mask = 0;
66
67 assert_spin_locked(&mnt->mnt_root->d_lock);
68
69 hlist_for_each_entry(mark, pos, &mnt->mnt_fsnotify_marks, m.m_list)
70 new_mask |= mark->mask;
71 mnt->mnt_fsnotify_mask = new_mask;
72}
73
74/*
75 * Recalculate mnt->mnt_fsnotify_mask, the union of all FS_* event types
76 * that any notification group wants to receive for this mount point.
77 */
78void fsnotify_recalc_vfsmount_mask(struct vfsmount *mnt)
79{
80 spin_lock(&mnt->mnt_root->d_lock);
81 fsnotify_recalc_vfsmount_mask_locked(mnt);
82 spin_unlock(&mnt->mnt_root->d_lock);
83}
84
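
mnt_fsnotify_mask caches the union of every mark's mask so the delivery path can skip a mount with a single AND instead of walking its mark list. An illustrative fast-path check (not code from this patch):

if (!(mnt->mnt_fsnotify_mask & FS_MODIFY))
	return;	/* no mark on this mount wants modify events */
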
85void fsnotify_destroy_vfsmount_mark(struct fsnotify_mark *mark)
86{
87 struct vfsmount *mnt = mark->m.mnt;
88
89 assert_spin_locked(&mark->lock);
90 assert_spin_locked(&mark->group->mark_lock);
91
92 spin_lock(&mnt->mnt_root->d_lock);
93
94 hlist_del_init_rcu(&mark->m.m_list);
95 mark->m.mnt = NULL;
96
97 fsnotify_recalc_vfsmount_mask_locked(mnt);
98
99 spin_unlock(&mnt->mnt_root->d_lock);
100}
101
102static struct fsnotify_mark *fsnotify_find_vfsmount_mark_locked(struct fsnotify_group *group,
103 struct vfsmount *mnt)
104{
105 struct fsnotify_mark *mark;
106 struct hlist_node *pos;
107
108 assert_spin_locked(&mnt->mnt_root->d_lock);
109
110 hlist_for_each_entry(mark, pos, &mnt->mnt_fsnotify_marks, m.m_list) {
111 if (mark->group == group) {
112 fsnotify_get_mark(mark);
113 return mark;
114 }
115 }
116 return NULL;
117}
118
119/*
120 * Given a group and vfsmount, find the mark associated with that combination.
121 * If found, take a reference to that mark and return it; otherwise return NULL.
122 */
123struct fsnotify_mark *fsnotify_find_vfsmount_mark(struct fsnotify_group *group,
124 struct vfsmount *mnt)
125{
126 struct fsnotify_mark *mark;
127
128 spin_lock(&mnt->mnt_root->d_lock);
129 mark = fsnotify_find_vfsmount_mark_locked(group, mnt);
130 spin_unlock(&mnt->mnt_root->d_lock);
131
132 return mark;
133}
134
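
The returned mark carries a reference taken by the locked helper, so every successful lookup must be paired with fsnotify_put_mark(). A usage sketch under that contract:

struct fsnotify_mark *mark;

mark = fsnotify_find_vfsmount_mark(group, mnt);
if (mark) {
	/* ... inspect or update the mark ... */
	fsnotify_put_mark(mark);	/* drop the lookup reference */
}
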
135/*
136 * Attach an initialized mark to a given group and vfsmount.
137 * These marks are used by the fsnotify backend to determine which
138 * event types should be delivered to which groups.
139 */
140int fsnotify_add_vfsmount_mark(struct fsnotify_mark *mark,
141 struct fsnotify_group *group, struct vfsmount *mnt,
142 int allow_dups)
143{
144 struct fsnotify_mark *lmark;
145 struct hlist_node *node, *last = NULL;
146 int ret = 0;
147
148 mark->flags |= FSNOTIFY_MARK_FLAG_VFSMOUNT;
149
150 assert_spin_locked(&mark->lock);
151 assert_spin_locked(&group->mark_lock);
152
153 spin_lock(&mnt->mnt_root->d_lock);
154
155 mark->m.mnt = mnt;
156
157 /* is mark the first mark? */
158 if (hlist_empty(&mnt->mnt_fsnotify_marks)) {
159 hlist_add_head_rcu(&mark->m.m_list, &mnt->mnt_fsnotify_marks);
160 goto out;
161 }
162
163 /* should mark be in the middle of the current list? */
164 hlist_for_each_entry(lmark, node, &mnt->mnt_fsnotify_marks, m.m_list) {
165 last = node;
166
167 if ((lmark->group == group) && !allow_dups) {
168 ret = -EEXIST;
169 goto out;
170 }
171
172 if (mark->group < lmark->group)
173 continue;
174
175 hlist_add_before_rcu(&mark->m.m_list, &lmark->m.m_list);
176 goto out;
177 }
178
179 BUG_ON(last == NULL);
180 /* mark belongs at the end; last points at the current final entry */
181 hlist_add_after_rcu(last, &mark->m.m_list);
182out:
183 fsnotify_recalc_vfsmount_mask_locked(mnt);
184 spin_unlock(&mnt->mnt_root->d_lock);
185
186 return ret;
187}
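
The insertion loop above keeps mnt_fsnotify_marks sorted in descending order of group pointer, so all marks belonging to one group sit adjacent and the duplicate check can fire during the same walk. A hypothetical debug helper stating that invariant (must be called with mnt->mnt_root->d_lock held; not part of this patch):

static bool vfsmount_marks_sorted(struct vfsmount *mnt)
{
	struct fsnotify_mark *mark;
	struct hlist_node *pos;
	struct fsnotify_group *prev = NULL;

	hlist_for_each_entry(mark, pos, &mnt->mnt_fsnotify_marks, m.m_list) {
		if (prev && mark->group > prev)
			return false;	/* descending order violated */
		prev = mark->group;
	}
	return true;
}
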