author		Linus Torvalds <torvalds@linux-foundation.org>	2010-08-10 14:39:13 -0400
committer	Linus Torvalds <torvalds@linux-foundation.org>	2010-08-10 14:39:13 -0400
commit		8c8946f509a494769a8c602b5ed189df01917d39 (patch)
tree		dfd96bd6ca5ea6803c6d77f65ba37e04f78b2d3b
parent		5f248c9c251c60af3403902b26e08de43964ea0b (diff)
parent		1968f5eed54ce47bde488fd9a450912e4a2d7138 (diff)
Merge branch 'for-linus' of git://git.infradead.org/users/eparis/notify
* 'for-linus' of git://git.infradead.org/users/eparis/notify: (132 commits)
  fanotify: use both marks when possible
  fsnotify: pass both the vfsmount mark and inode mark
  fsnotify: walk the inode and vfsmount lists simultaneously
  fsnotify: rework ignored mark flushing
  fsnotify: remove global fsnotify groups lists
  fsnotify: remove group->mask
  fsnotify: remove the global masks
  fsnotify: cleanup should_send_event
  fanotify: use the mark in handler functions
  audit: use the mark in handler functions
  dnotify: use the mark in handler functions
  inotify: use the mark in handler functions
  fsnotify: send fsnotify_mark to groups in event handling functions
  fsnotify: Exchange list heads instead of moving elements
  fsnotify: srcu to protect read side of inode and vfsmount locks
  fsnotify: use an explicit flag to indicate fsnotify_destroy_mark has been called
  fsnotify: use _rcu functions for mark list traversal
  fsnotify: place marks on object in order of group memory address
  vfs/fsnotify: fsnotify_close can delay the final work in fput
  fsnotify: store struct file not struct path
  ...

Fix up trivial delete/modify conflict in fs/notify/inotify/inotify.c.
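The new userspace surface from this series is the fanotify_init()/fanotify_mark() syscall pair wired up in the arch diffs below. As a hedged illustration only (not part of this merge): a minimal listener, assuming headers that export the new __NR_fanotify_* numbers and <linux/fanotify.h>, calling syscall() directly since no libc wrapper existed yet, and using the 64-bit calling convention (the 32-bit mask split is shown later).

#include <fcntl.h>		/* O_RDONLY, AT_FDCWD */
#include <stdint.h>
#include <stdio.h>
#include <sys/syscall.h>	/* __NR_fanotify_init / __NR_fanotify_mark */
#include <unistd.h>
#include <linux/fanotify.h>	/* FAN_* constants, struct fanotify_event_metadata */

int main(void)
{
	/* one notification group fd; events are read() from it */
	int fan_fd = syscall(__NR_fanotify_init, FAN_CLOEXEC, O_RDONLY);
	struct fanotify_event_metadata ev;

	if (fan_fd < 0)
		return 1;

	/* watch opens and closes on the mount containing the current dir */
	if (syscall(__NR_fanotify_mark, fan_fd, FAN_MARK_ADD | FAN_MARK_MOUNT,
		    (uint64_t)(FAN_OPEN | FAN_CLOSE), AT_FDCWD, ".") < 0)
		return 1;

	while (read(fan_fd, &ev, sizeof(ev)) == sizeof(ev)) {
		printf("mask=0x%llx pid=%d fd=%d\n",
		       (unsigned long long)ev.mask, (int)ev.pid, (int)ev.fd);
		if (ev.fd >= 0)
			close(ev.fd);	/* each event carries an open fd */
	}
	return 0;
}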
-rw-r--r--	Documentation/feature-removal-schedule.txt	|    8
-rw-r--r--	arch/x86/ia32/ia32entry.S			|    2
-rw-r--r--	arch/x86/ia32/sys_ia32.c			|    9
-rw-r--r--	arch/x86/include/asm/sys_ia32.h			|    3
-rw-r--r--	arch/x86/include/asm/unistd_32.h		|    4
-rw-r--r--	arch/x86/include/asm/unistd_64.h		|    4
-rw-r--r--	arch/x86/kernel/syscall_table_32.S		|    2
-rw-r--r--	fs/compat.c					|    5
-rw-r--r--	fs/exec.c					|    4
-rw-r--r--	fs/file_table.c					|    9
-rw-r--r--	fs/inode.c					|    8
-rw-r--r--	fs/namei.c					|    2
-rw-r--r--	fs/namespace.c					|    5
-rw-r--r--	fs/nfsd/vfs.c					|    4
-rw-r--r--	fs/notify/Kconfig				|    1
-rw-r--r--	fs/notify/Makefile				|    4
-rw-r--r--	fs/notify/dnotify/dnotify.c			|  213
-rw-r--r--	fs/notify/fanotify/Kconfig			|   26
-rw-r--r--	fs/notify/fanotify/Makefile			|    1
-rw-r--r--	fs/notify/fanotify/fanotify.c			|  212
-rw-r--r--	fs/notify/fanotify/fanotify_user.c		|  760
-rw-r--r--	fs/notify/fsnotify.c				|  201
-rw-r--r--	fs/notify/fsnotify.h				|   27
-rw-r--r--	fs/notify/group.c				|  182
-rw-r--r--	fs/notify/inode_mark.c				|  331
-rw-r--r--	fs/notify/inotify/Kconfig			|   15
-rw-r--r--	fs/notify/inotify/Makefile			|    1
-rw-r--r--	fs/notify/inotify/inotify.c			|  872
-rw-r--r--	fs/notify/inotify/inotify.h			|    7
-rw-r--r--	fs/notify/inotify/inotify_fsnotify.c		|  151
-rw-r--r--	fs/notify/inotify/inotify_user.c		|  369
-rw-r--r--	fs/notify/mark.c				|  371
-rw-r--r--	fs/notify/notification.c			|  236
-rw-r--r--	fs/notify/vfsmount_mark.c			|  187
-rw-r--r--	fs/open.c					|    3
-rw-r--r--	fs/read_write.c					|    8
-rw-r--r--	include/asm-generic/fcntl.h			|    8
-rw-r--r--	include/linux/Kbuild				|    1
-rw-r--r--	include/linux/dnotify.h				|    1
-rw-r--r--	include/linux/fanotify.h			|  105
-rw-r--r--	include/linux/fs.h				|   16
-rw-r--r--	include/linux/fsnotify.h			|  161
-rw-r--r--	include/linux/fsnotify_backend.h		|  211
-rw-r--r--	include/linux/inotify.h				|  185
-rw-r--r--	include/linux/mount.h				|    6
-rw-r--r--	include/linux/security.h			|    1
-rw-r--r--	include/linux/syscalls.h			|    4
-rw-r--r--	init/Kconfig					|   10
-rw-r--r--	kernel/Makefile					|    5
-rw-r--r--	kernel/audit.c					|    1
-rw-r--r--	kernel/audit.h					|   26
-rw-r--r--	kernel/audit_tree.c				|  237
-rw-r--r--	kernel/audit_watch.c				|  274
-rw-r--r--	kernel/auditfilter.c				|   39
-rw-r--r--	kernel/auditsc.c				|   10
-rw-r--r--	kernel/sys_ni.c					|    4
-rw-r--r--	kernel/sysctl.c					|    7
-rw-r--r--	security/security.c				|   16
58 files changed, 3197 insertions(+), 2378 deletions(-)
diff --git a/Documentation/feature-removal-schedule.txt b/Documentation/feature-removal-schedule.txt
index 56cee4727b1a..b16cbe4152ea 100644
--- a/Documentation/feature-removal-schedule.txt
+++ b/Documentation/feature-removal-schedule.txt
@@ -360,14 +360,6 @@ When: 2.6.33
 Why:	Should be implemented in userspace, policy daemon.
 Who:	Johannes Berg <johannes@sipsolutions.net>
 
----------------------------
-
-What:	CONFIG_INOTIFY
-When:	2.6.33
-Why:	last user (audit) will be converted to the newer more generic
-	and more easily maintained fsnotify subsystem
-Who:	Eric Paris <eparis@redhat.com>
-
 ----------------------------
 
 What:	sound-slot/service-* module aliases and related clutters in
diff --git a/arch/x86/ia32/ia32entry.S b/arch/x86/ia32/ia32entry.S
index e790bc1fbfa3..17cf65c94804 100644
--- a/arch/x86/ia32/ia32entry.S
+++ b/arch/x86/ia32/ia32entry.S
@@ -842,4 +842,6 @@ ia32_sys_call_table:
 	.quad compat_sys_rt_tgsigqueueinfo	/* 335 */
 	.quad sys_perf_event_open
 	.quad compat_sys_recvmmsg
+	.quad sys_fanotify_init
+	.quad sys32_fanotify_mark
 ia32_syscall_end:
diff --git a/arch/x86/ia32/sys_ia32.c b/arch/x86/ia32/sys_ia32.c
index 626be156d88d..3d093311d5e2 100644
--- a/arch/x86/ia32/sys_ia32.c
+++ b/arch/x86/ia32/sys_ia32.c
@@ -546,3 +546,12 @@ asmlinkage long sys32_fallocate(int fd, int mode, unsigned offset_lo,
 	return sys_fallocate(fd, mode, ((u64)offset_hi << 32) | offset_lo,
 			     ((u64)len_hi << 32) | len_lo);
 }
+
+asmlinkage long sys32_fanotify_mark(int fanotify_fd, unsigned int flags,
+				    u32 mask_lo, u32 mask_hi,
+				    int fd, const char __user *pathname)
+{
+	return sys_fanotify_mark(fanotify_fd, flags,
+				 ((u64)mask_hi << 32) | mask_lo,
+				 fd, pathname);
+}
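The wrapper above exists because a 32-bit task passes the 64-bit event mask in two 32-bit register arguments, low half first, which sys32_fanotify_mark() reassembles. A hedged sketch of the matching 32-bit userspace call (fanotify_mark_compat() is a hypothetical helper, not part of this patch):

#include <stdint.h>
#include <sys/syscall.h>	/* assumes __NR_fanotify_mark (339 on x86-32, per the diff below) */
#include <unistd.h>

/* hypothetical 32-bit wrapper: split the u64 mask exactly as
 * sys32_fanotify_mark() above puts it back together */
static long fanotify_mark_compat(int fanotify_fd, unsigned int flags,
				 uint64_t mask, int dfd, const char *pathname)
{
	return syscall(__NR_fanotify_mark, fanotify_fd, flags,
		       (uint32_t)mask,		/* mask_lo */
		       (uint32_t)(mask >> 32),	/* mask_hi */
		       dfd, pathname);
}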
diff --git a/arch/x86/include/asm/sys_ia32.h b/arch/x86/include/asm/sys_ia32.h
index 3ad421784ae7..cf4e2e381cba 100644
--- a/arch/x86/include/asm/sys_ia32.h
+++ b/arch/x86/include/asm/sys_ia32.h
@@ -80,4 +80,7 @@ asmlinkage long sys32_rt_sigreturn(struct pt_regs *);
 
 /* ia32/ipc32.c */
 asmlinkage long sys32_ipc(u32, int, int, int, compat_uptr_t, u32);
+
+asmlinkage long sys32_fanotify_mark(int, unsigned int, u32, u32, int,
+				    const char __user *);
 #endif /* _ASM_X86_SYS_IA32_H */
diff --git a/arch/x86/include/asm/unistd_32.h b/arch/x86/include/asm/unistd_32.h
index beb9b5f8f8a4..80b799cd74f7 100644
--- a/arch/x86/include/asm/unistd_32.h
+++ b/arch/x86/include/asm/unistd_32.h
@@ -343,10 +343,12 @@
 #define __NR_rt_tgsigqueueinfo	335
 #define __NR_perf_event_open	336
 #define __NR_recvmmsg		337
+#define __NR_fanotify_init	338
+#define __NR_fanotify_mark	339
 
 #ifdef __KERNEL__
 
-#define NR_syscalls 338
+#define NR_syscalls 340
 
 #define __ARCH_WANT_IPC_PARSE_VERSION
 #define __ARCH_WANT_OLD_READDIR
diff --git a/arch/x86/include/asm/unistd_64.h b/arch/x86/include/asm/unistd_64.h
index ff4307b0e81e..5b7b1d585616 100644
--- a/arch/x86/include/asm/unistd_64.h
+++ b/arch/x86/include/asm/unistd_64.h
@@ -663,6 +663,10 @@ __SYSCALL(__NR_rt_tgsigqueueinfo, sys_rt_tgsigqueueinfo)
 __SYSCALL(__NR_perf_event_open, sys_perf_event_open)
 #define __NR_recvmmsg				299
 __SYSCALL(__NR_recvmmsg, sys_recvmmsg)
+#define __NR_fanotify_init			300
+__SYSCALL(__NR_fanotify_init, sys_fanotify_init)
+#define __NR_fanotify_mark			301
+__SYSCALL(__NR_fanotify_mark, sys_fanotify_mark)
 
 #ifndef __NO_STUBS
 #define __ARCH_WANT_OLD_READDIR
diff --git a/arch/x86/kernel/syscall_table_32.S b/arch/x86/kernel/syscall_table_32.S
index 8b3729341216..07ad5eb7cc5c 100644
--- a/arch/x86/kernel/syscall_table_32.S
+++ b/arch/x86/kernel/syscall_table_32.S
@@ -337,3 +337,5 @@ ENTRY(sys_call_table)
 	.long sys_rt_tgsigqueueinfo	/* 335 */
 	.long sys_perf_event_open
 	.long sys_recvmmsg
+	.long sys_fanotify_init
+	.long sys_fanotify_mark
diff --git a/fs/compat.c b/fs/compat.c
index 3e57e8162a39..e6d5d70cf3cf 100644
--- a/fs/compat.c
+++ b/fs/compat.c
@@ -1193,11 +1193,10 @@ out:
 	if (iov != iovstack)
 		kfree(iov);
 	if ((ret + (type == READ)) > 0) {
-		struct dentry *dentry = file->f_path.dentry;
 		if (type == READ)
-			fsnotify_access(dentry);
+			fsnotify_access(file);
 		else
-			fsnotify_modify(dentry);
+			fsnotify_modify(file);
 	}
 	return ret;
 }
diff --git a/fs/exec.c b/fs/exec.c
index dab85ecad686..7761837e4500 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -128,7 +128,7 @@ SYSCALL_DEFINE1(uselib, const char __user *, library)
 	if (file->f_path.mnt->mnt_flags & MNT_NOEXEC)
 		goto exit;
 
-	fsnotify_open(file->f_path.dentry);
+	fsnotify_open(file);
 
 	error = -ENOEXEC;
 	if(file->f_op) {
@@ -683,7 +683,7 @@ struct file *open_exec(const char *name)
 	if (file->f_path.mnt->mnt_flags & MNT_NOEXEC)
 		goto exit;
 
-	fsnotify_open(file->f_path.dentry);
+	fsnotify_open(file);
 
 	err = deny_write_access(file);
 	if (err)
diff --git a/fs/file_table.c b/fs/file_table.c
index 5c7d10ead4ad..b8a0bb63cbd7 100644
--- a/fs/file_table.c
+++ b/fs/file_table.c
@@ -230,6 +230,15 @@ static void __fput(struct file *file)
 	might_sleep();
 
 	fsnotify_close(file);
+
+	/*
+	 * fsnotify_create_event may have taken one or more references on this
+	 * file.  If it did so it left one reference for us to drop to make sure
+	 * its calls to fput could not prematurely destroy the file.
+	 */
+	if (atomic_long_read(&file->f_count))
+		return fput(file);
+
 	/*
 	 * The function eventpoll_release() should be the first called
 	 * in the file cleanup chain.
diff --git a/fs/inode.c b/fs/inode.c
index 2575244640a8..86464332e590 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -20,7 +20,6 @@
 #include <linux/pagemap.h>
 #include <linux/cdev.h>
 #include <linux/bootmem.h>
-#include <linux/inotify.h>
 #include <linux/fsnotify.h>
 #include <linux/mount.h>
 #include <linux/async.h>
@@ -264,12 +263,8 @@ void inode_init_once(struct inode *inode)
 	INIT_RAW_PRIO_TREE_ROOT(&inode->i_data.i_mmap);
 	INIT_LIST_HEAD(&inode->i_data.i_mmap_nonlinear);
 	i_size_ordered_init(inode);
-#ifdef CONFIG_INOTIFY
-	INIT_LIST_HEAD(&inode->inotify_watches);
-	mutex_init(&inode->inotify_mutex);
-#endif
 #ifdef CONFIG_FSNOTIFY
-	INIT_HLIST_HEAD(&inode->i_fsnotify_mark_entries);
+	INIT_HLIST_HEAD(&inode->i_fsnotify_marks);
 #endif
 }
 EXPORT_SYMBOL(inode_init_once);
@@ -413,7 +408,6 @@ int invalidate_inodes(struct super_block *sb)
 
 	down_write(&iprune_sem);
 	spin_lock(&inode_lock);
-	inotify_unmount_inodes(&sb->s_inodes);
 	fsnotify_unmount_inodes(&sb->s_inodes);
 	busy = invalidate_list(&sb->s_inodes, &throw_away);
 	spin_unlock(&inode_lock);
diff --git a/fs/namei.c b/fs/namei.c
index 42d2d28fb827..13ff4abdbdca 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -2633,7 +2633,7 @@ int vfs_rename(struct inode *old_dir, struct dentry *old_dentry,
 {
 	int error;
 	int is_dir = S_ISDIR(old_dentry->d_inode->i_mode);
-	const char *old_name;
+	const unsigned char *old_name;
 
 	if (old_dentry->d_inode == new_dentry->d_inode)
 		return 0;
diff --git a/fs/namespace.c b/fs/namespace.c
index 32dcd24bbc9a..66c4f7e781cb 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -29,6 +29,7 @@
 #include <linux/log2.h>
 #include <linux/idr.h>
 #include <linux/fs_struct.h>
+#include <linux/fsnotify.h>
 #include <asm/uaccess.h>
 #include <asm/unistd.h>
 #include "pnode.h"
@@ -150,6 +151,9 @@ struct vfsmount *alloc_vfsmnt(const char *name)
 	INIT_LIST_HEAD(&mnt->mnt_share);
 	INIT_LIST_HEAD(&mnt->mnt_slave_list);
 	INIT_LIST_HEAD(&mnt->mnt_slave);
+#ifdef CONFIG_FSNOTIFY
+	INIT_HLIST_HEAD(&mnt->mnt_fsnotify_marks);
+#endif
 #ifdef CONFIG_SMP
 	mnt->mnt_writers = alloc_percpu(int);
 	if (!mnt->mnt_writers)
@@ -610,6 +614,7 @@ static inline void __mntput(struct vfsmount *mnt)
 	 * provides barriers, so count_mnt_writers() below is safe.  AV
 	 */
 	WARN_ON(count_mnt_writers(mnt));
+	fsnotify_vfsmount_delete(mnt);
 	dput(mnt->mnt_root);
 	free_vfsmnt(mnt);
 	deactivate_super(sb);
diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c
index 8812f6b93969..96360a83cb91 100644
--- a/fs/nfsd/vfs.c
+++ b/fs/nfsd/vfs.c
@@ -934,7 +934,7 @@ nfsd_vfs_read(struct svc_rqst *rqstp, struct svc_fh *fhp, struct file *file,
 		nfsdstats.io_read += host_err;
 		*count = host_err;
 		err = 0;
-		fsnotify_access(file->f_path.dentry);
+		fsnotify_access(file);
 	} else 
 		err = nfserrno(host_err);
 out:
@@ -1045,7 +1045,7 @@ nfsd_vfs_write(struct svc_rqst *rqstp, struct svc_fh *fhp, struct file *file,
 		goto out_nfserr;
 	*cnt = host_err;
 	nfsdstats.io_write += host_err;
-	fsnotify_modify(file->f_path.dentry);
+	fsnotify_modify(file);
 
 	/* clear setuid/setgid flag after write */
 	if (inode->i_mode & (S_ISUID | S_ISGID))
diff --git a/fs/notify/Kconfig b/fs/notify/Kconfig
index dffbb0911d02..22c629eedd82 100644
--- a/fs/notify/Kconfig
+++ b/fs/notify/Kconfig
@@ -3,3 +3,4 @@ config FSNOTIFY
 
 source "fs/notify/dnotify/Kconfig"
 source "fs/notify/inotify/Kconfig"
+source "fs/notify/fanotify/Kconfig"
diff --git a/fs/notify/Makefile b/fs/notify/Makefile
index 0922cc826c46..ae5f33a6d868 100644
--- a/fs/notify/Makefile
+++ b/fs/notify/Makefile
@@ -1,4 +1,6 @@
-obj-$(CONFIG_FSNOTIFY)	+= fsnotify.o notification.o group.o inode_mark.o
+obj-$(CONFIG_FSNOTIFY)	+= fsnotify.o notification.o group.o inode_mark.o \
+			   mark.o vfsmount_mark.o
 
 obj-y			+= dnotify/
 obj-y			+= inotify/
+obj-y			+= fanotify/
diff --git a/fs/notify/dnotify/dnotify.c b/fs/notify/dnotify/dnotify.c
index 7e54e52964dd..3344bdd5506e 100644
--- a/fs/notify/dnotify/dnotify.c
+++ b/fs/notify/dnotify/dnotify.c
@@ -29,17 +29,17 @@
 int dir_notify_enable __read_mostly = 1;
 
 static struct kmem_cache *dnotify_struct_cache __read_mostly;
-static struct kmem_cache *dnotify_mark_entry_cache __read_mostly;
+static struct kmem_cache *dnotify_mark_cache __read_mostly;
 static struct fsnotify_group *dnotify_group __read_mostly;
 static DEFINE_MUTEX(dnotify_mark_mutex);
 
 /*
- * dnotify will attach one of these to each inode (i_fsnotify_mark_entries) which
+ * dnotify will attach one of these to each inode (i_fsnotify_marks) which
  * is being watched by dnotify.  If multiple userspace applications are watching
  * the same directory with dnotify their information is chained in dn
  */
-struct dnotify_mark_entry {
-	struct fsnotify_mark_entry fsn_entry;
+struct dnotify_mark {
+	struct fsnotify_mark fsn_mark;
 	struct dnotify_struct *dn;
 };
 
@@ -51,27 +51,27 @@ struct dnotify_mark_entry {
  * it calls the fsnotify function so it can update the set of all events relevant
  * to this inode.
  */
-static void dnotify_recalc_inode_mask(struct fsnotify_mark_entry *entry)
+static void dnotify_recalc_inode_mask(struct fsnotify_mark *fsn_mark)
 {
 	__u32 new_mask, old_mask;
 	struct dnotify_struct *dn;
-	struct dnotify_mark_entry *dnentry = container_of(entry,
-						struct dnotify_mark_entry,
-						fsn_entry);
+	struct dnotify_mark *dn_mark = container_of(fsn_mark,
+						struct dnotify_mark,
+						fsn_mark);
 
-	assert_spin_locked(&entry->lock);
+	assert_spin_locked(&fsn_mark->lock);
 
-	old_mask = entry->mask;
+	old_mask = fsn_mark->mask;
 	new_mask = 0;
-	for (dn = dnentry->dn; dn != NULL; dn = dn->dn_next)
+	for (dn = dn_mark->dn; dn != NULL; dn = dn->dn_next)
 		new_mask |= (dn->dn_mask & ~FS_DN_MULTISHOT);
-	entry->mask = new_mask;
+	fsnotify_set_mark_mask_locked(fsn_mark, new_mask);
 
 	if (old_mask == new_mask)
 		return;
 
-	if (entry->inode)
-		fsnotify_recalc_inode_mask(entry->inode);
+	if (fsn_mark->i.inode)
+		fsnotify_recalc_inode_mask(fsn_mark->i.inode);
 }
 
 /*
@@ -83,29 +83,25 @@ static void dnotify_recalc_inode_mask(struct fsnotify_mark_entry *entry)
  * events.
  */
 static int dnotify_handle_event(struct fsnotify_group *group,
+				struct fsnotify_mark *inode_mark,
+				struct fsnotify_mark *vfsmount_mark,
 				struct fsnotify_event *event)
 {
-	struct fsnotify_mark_entry *entry = NULL;
-	struct dnotify_mark_entry *dnentry;
+	struct dnotify_mark *dn_mark;
 	struct inode *to_tell;
 	struct dnotify_struct *dn;
 	struct dnotify_struct **prev;
 	struct fown_struct *fown;
 	__u32 test_mask = event->mask & ~FS_EVENT_ON_CHILD;
 
-	to_tell = event->to_tell;
+	BUG_ON(vfsmount_mark);
 
-	spin_lock(&to_tell->i_lock);
-	entry = fsnotify_find_mark_entry(group, to_tell);
-	spin_unlock(&to_tell->i_lock);
+	to_tell = event->to_tell;
 
-	/* unlikely since we alreay passed dnotify_should_send_event() */
-	if (unlikely(!entry))
-		return 0;
-	dnentry = container_of(entry, struct dnotify_mark_entry, fsn_entry);
+	dn_mark = container_of(inode_mark, struct dnotify_mark, fsn_mark);
 
-	spin_lock(&entry->lock);
-	prev = &dnentry->dn;
+	spin_lock(&inode_mark->lock);
+	prev = &dn_mark->dn;
 	while ((dn = *prev) != NULL) {
 		if ((dn->dn_mask & test_mask) == 0) {
 			prev = &dn->dn_next;
@@ -118,12 +114,11 @@ static int dnotify_handle_event(struct fsnotify_group *group,
 		else {
 			*prev = dn->dn_next;
 			kmem_cache_free(dnotify_struct_cache, dn);
-			dnotify_recalc_inode_mask(entry);
+			dnotify_recalc_inode_mask(inode_mark);
 		}
 	}
 
-	spin_unlock(&entry->lock);
-	fsnotify_put_mark(entry);
+	spin_unlock(&inode_mark->lock);
 
 	return 0;
 }
@@ -133,44 +128,27 @@ static int dnotify_handle_event(struct fsnotify_group *group,
  * userspace notification for that pair.
  */
 static bool dnotify_should_send_event(struct fsnotify_group *group,
-				      struct inode *inode, __u32 mask)
+				      struct inode *inode,
+				      struct fsnotify_mark *inode_mark,
+				      struct fsnotify_mark *vfsmount_mark,
+				      __u32 mask, void *data, int data_type)
 {
-	struct fsnotify_mark_entry *entry;
-	bool send;
-
-	/* !dir_notify_enable should never get here, don't waste time checking
-	if (!dir_notify_enable)
-		return 0; */
-
 	/* not a dir, dnotify doesn't care */
 	if (!S_ISDIR(inode->i_mode))
 		return false;
 
-	spin_lock(&inode->i_lock);
-	entry = fsnotify_find_mark_entry(group, inode);
-	spin_unlock(&inode->i_lock);
-
-	/* no mark means no dnotify watch */
-	if (!entry)
-		return false;
-
-	mask = (mask & ~FS_EVENT_ON_CHILD);
-	send = (mask & entry->mask);
-
-	fsnotify_put_mark(entry);	/* matches fsnotify_find_mark_entry */
-
-	return send;
+	return true;
 }
 
-static void dnotify_free_mark(struct fsnotify_mark_entry *entry)
+static void dnotify_free_mark(struct fsnotify_mark *fsn_mark)
 {
-	struct dnotify_mark_entry *dnentry = container_of(entry,
-							  struct dnotify_mark_entry,
-							  fsn_entry);
+	struct dnotify_mark *dn_mark = container_of(fsn_mark,
+						    struct dnotify_mark,
+						    fsn_mark);
 
-	BUG_ON(dnentry->dn);
+	BUG_ON(dn_mark->dn);
 
-	kmem_cache_free(dnotify_mark_entry_cache, dnentry);
+	kmem_cache_free(dnotify_mark_cache, dn_mark);
 }
 
 static struct fsnotify_ops dnotify_fsnotify_ops = {
@@ -183,15 +161,15 @@ static struct fsnotify_ops dnotify_fsnotify_ops = {
 
 /*
  * Called every time a file is closed.  Looks first for a dnotify mark on the
- * inode.  If one is found run all of the ->dn entries attached to that
+ * inode.  If one is found run all of the ->dn structures attached to that
  * mark for one relevant to this process closing the file and remove that
  * dnotify_struct.  If that was the last dnotify_struct also remove the
- * fsnotify_mark_entry.
+ * fsnotify_mark.
  */
 void dnotify_flush(struct file *filp, fl_owner_t id)
 {
-	struct fsnotify_mark_entry *entry;
-	struct dnotify_mark_entry *dnentry;
+	struct fsnotify_mark *fsn_mark;
+	struct dnotify_mark *dn_mark;
 	struct dnotify_struct *dn;
 	struct dnotify_struct **prev;
 	struct inode *inode;
@@ -200,38 +178,34 @@ void dnotify_flush(struct file *filp, fl_owner_t id)
 	if (!S_ISDIR(inode->i_mode))
 		return;
 
-	spin_lock(&inode->i_lock);
-	entry = fsnotify_find_mark_entry(dnotify_group, inode);
-	spin_unlock(&inode->i_lock);
-	if (!entry)
+	fsn_mark = fsnotify_find_inode_mark(dnotify_group, inode);
+	if (!fsn_mark)
 		return;
-	dnentry = container_of(entry, struct dnotify_mark_entry, fsn_entry);
+	dn_mark = container_of(fsn_mark, struct dnotify_mark, fsn_mark);
 
 	mutex_lock(&dnotify_mark_mutex);
 
-	spin_lock(&entry->lock);
-	prev = &dnentry->dn;
+	spin_lock(&fsn_mark->lock);
+	prev = &dn_mark->dn;
 	while ((dn = *prev) != NULL) {
 		if ((dn->dn_owner == id) && (dn->dn_filp == filp)) {
 			*prev = dn->dn_next;
 			kmem_cache_free(dnotify_struct_cache, dn);
-			dnotify_recalc_inode_mask(entry);
+			dnotify_recalc_inode_mask(fsn_mark);
 			break;
 		}
 		prev = &dn->dn_next;
 	}
 
-	spin_unlock(&entry->lock);
+	spin_unlock(&fsn_mark->lock);
 
 	/* nothing else could have found us thanks to the dnotify_mark_mutex */
-	if (dnentry->dn == NULL)
-		fsnotify_destroy_mark_by_entry(entry);
-
-	fsnotify_recalc_group_mask(dnotify_group);
+	if (dn_mark->dn == NULL)
+		fsnotify_destroy_mark(fsn_mark);
 
 	mutex_unlock(&dnotify_mark_mutex);
 
-	fsnotify_put_mark(entry);
+	fsnotify_put_mark(fsn_mark);
 }
 
 /* this conversion is done only at watch creation */
@@ -259,16 +233,16 @@ static __u32 convert_arg(unsigned long arg)
 
 /*
  * If multiple processes watch the same inode with dnotify there is only one
- * dnotify mark in inode->i_fsnotify_mark_entries but we chain a dnotify_struct
+ * dnotify mark in inode->i_fsnotify_marks but we chain a dnotify_struct
  * onto that mark.  This function either attaches the new dnotify_struct onto
  * that list, or it |= the mask onto an existing dnofiy_struct.
  */
-static int attach_dn(struct dnotify_struct *dn, struct dnotify_mark_entry *dnentry,
+static int attach_dn(struct dnotify_struct *dn, struct dnotify_mark *dn_mark,
 		     fl_owner_t id, int fd, struct file *filp, __u32 mask)
 {
 	struct dnotify_struct *odn;
 
-	odn = dnentry->dn;
+	odn = dn_mark->dn;
 	while (odn != NULL) {
 		/* adding more events to existing dnofiy_struct? */
 		if ((odn->dn_owner == id) && (odn->dn_filp == filp)) {
@@ -283,8 +257,8 @@ static int attach_dn(struct dnotify_struct *dn, struct dnotify_mark_entry *dnent
 	dn->dn_fd = fd;
 	dn->dn_filp = filp;
 	dn->dn_owner = id;
-	dn->dn_next = dnentry->dn;
-	dnentry->dn = dn;
+	dn->dn_next = dn_mark->dn;
+	dn_mark->dn = dn;
 
 	return 0;
 }
@@ -296,8 +270,8 @@ static int attach_dn(struct dnotify_struct *dn, struct dnotify_mark_entry *dnent
  */
 int fcntl_dirnotify(int fd, struct file *filp, unsigned long arg)
 {
-	struct dnotify_mark_entry *new_dnentry, *dnentry;
-	struct fsnotify_mark_entry *new_entry, *entry;
+	struct dnotify_mark *new_dn_mark, *dn_mark;
+	struct fsnotify_mark *new_fsn_mark, *fsn_mark;
 	struct dnotify_struct *dn;
 	struct inode *inode;
 	fl_owner_t id = current->files;
@@ -306,7 +280,7 @@ int fcntl_dirnotify(int fd, struct file *filp, unsigned long arg)
 	__u32 mask;
 
 	/* we use these to tell if we need to kfree */
-	new_entry = NULL;
+	new_fsn_mark = NULL;
 	dn = NULL;
 
 	if (!dir_notify_enable) {
@@ -336,8 +310,8 @@ int fcntl_dirnotify(int fd, struct file *filp, unsigned long arg)
 	}
 
 	/* new fsnotify mark, we expect most fcntl calls to add a new mark */
-	new_dnentry = kmem_cache_alloc(dnotify_mark_entry_cache, GFP_KERNEL);
-	if (!new_dnentry) {
+	new_dn_mark = kmem_cache_alloc(dnotify_mark_cache, GFP_KERNEL);
+	if (!new_dn_mark) {
 		error = -ENOMEM;
 		goto out_err;
 	}
@@ -345,29 +319,27 @@ int fcntl_dirnotify(int fd, struct file *filp, unsigned long arg)
 	/* convert the userspace DN_* "arg" to the internal FS_* defines in fsnotify */
 	mask = convert_arg(arg);
 
-	/* set up the new_entry and new_dnentry */
-	new_entry = &new_dnentry->fsn_entry;
-	fsnotify_init_mark(new_entry, dnotify_free_mark);
-	new_entry->mask = mask;
-	new_dnentry->dn = NULL;
+	/* set up the new_fsn_mark and new_dn_mark */
+	new_fsn_mark = &new_dn_mark->fsn_mark;
+	fsnotify_init_mark(new_fsn_mark, dnotify_free_mark);
+	new_fsn_mark->mask = mask;
+	new_dn_mark->dn = NULL;
 
 	/* this is needed to prevent the fcntl/close race described below */
 	mutex_lock(&dnotify_mark_mutex);
 
-	/* add the new_entry or find an old one. */
-	spin_lock(&inode->i_lock);
-	entry = fsnotify_find_mark_entry(dnotify_group, inode);
-	spin_unlock(&inode->i_lock);
-	if (entry) {
-		dnentry = container_of(entry, struct dnotify_mark_entry, fsn_entry);
-		spin_lock(&entry->lock);
+	/* add the new_fsn_mark or find an old one. */
+	fsn_mark = fsnotify_find_inode_mark(dnotify_group, inode);
+	if (fsn_mark) {
+		dn_mark = container_of(fsn_mark, struct dnotify_mark, fsn_mark);
+		spin_lock(&fsn_mark->lock);
 	} else {
-		fsnotify_add_mark(new_entry, dnotify_group, inode);
-		spin_lock(&new_entry->lock);
-		entry = new_entry;
-		dnentry = new_dnentry;
-		/* we used new_entry, so don't free it */
-		new_entry = NULL;
+		fsnotify_add_mark(new_fsn_mark, dnotify_group, inode, NULL, 0);
+		spin_lock(&new_fsn_mark->lock);
+		fsn_mark = new_fsn_mark;
+		dn_mark = new_dn_mark;
+		/* we used new_fsn_mark, so don't free it */
+		new_fsn_mark = NULL;
 	}
 
 	rcu_read_lock();
@@ -376,17 +348,17 @@ int fcntl_dirnotify(int fd, struct file *filp, unsigned long arg)
 
 	/* if (f != filp) means that we lost a race and another task/thread
 	 * actually closed the fd we are still playing with before we grabbed
-	 * the dnotify_mark_mutex and entry->lock.  Since closing the fd is the
-	 * only time we clean up the mark entries we need to get our mark off
+	 * the dnotify_mark_mutex and fsn_mark->lock.  Since closing the fd is the
+	 * only time we clean up the marks we need to get our mark off
 	 * the list. */
 	if (f != filp) {
 		/* if we added ourselves, shoot ourselves, it's possible that
-		 * the flush actually did shoot this entry.  That's fine too
+		 * the flush actually did shoot this fsn_mark.  That's fine too
 		 * since multiple calls to destroy_mark is perfectly safe, if
-		 * we found a dnentry already attached to the inode, just sod
+		 * we found a dn_mark already attached to the inode, just sod
 		 * off silently as the flush at close time dealt with it.
 		 */
-		if (dnentry == new_dnentry)
+		if (dn_mark == new_dn_mark)
 			destroy = 1;
 		goto out;
 	}
@@ -394,13 +366,13 @@ int fcntl_dirnotify(int fd, struct file *filp, unsigned long arg)
 	error = __f_setown(filp, task_pid(current), PIDTYPE_PID, 0);
 	if (error) {
 		/* if we added, we must shoot */
-		if (dnentry == new_dnentry)
+		if (dn_mark == new_dn_mark)
 			destroy = 1;
 		goto out;
 	}
 
-	error = attach_dn(dn, dnentry, id, fd, filp, mask);
-	/* !error means that we attached the dn to the dnentry, so don't free it */
+	error = attach_dn(dn, dn_mark, id, fd, filp, mask);
+	/* !error means that we attached the dn to the dn_mark, so don't free it */
 	if (!error)
 		dn = NULL;
 	/* -EEXIST means that we didn't add this new dn and used an old one.
@@ -408,20 +380,18 @@ int fcntl_dirnotify(int fd, struct file *filp, unsigned long arg)
 	else if (error == -EEXIST)
 		error = 0;
 
-	dnotify_recalc_inode_mask(entry);
+	dnotify_recalc_inode_mask(fsn_mark);
 out:
-	spin_unlock(&entry->lock);
+	spin_unlock(&fsn_mark->lock);
 
 	if (destroy)
-		fsnotify_destroy_mark_by_entry(entry);
-
-	fsnotify_recalc_group_mask(dnotify_group);
+		fsnotify_destroy_mark(fsn_mark);
 
 	mutex_unlock(&dnotify_mark_mutex);
-	fsnotify_put_mark(entry);
+	fsnotify_put_mark(fsn_mark);
 out_err:
-	if (new_entry)
-		fsnotify_put_mark(new_entry);
+	if (new_fsn_mark)
+		fsnotify_put_mark(new_fsn_mark);
 	if (dn)
 		kmem_cache_free(dnotify_struct_cache, dn);
 	return error;
@@ -430,10 +400,9 @@ out_err:
 static int __init dnotify_init(void)
 {
 	dnotify_struct_cache = KMEM_CACHE(dnotify_struct, SLAB_PANIC);
-	dnotify_mark_entry_cache = KMEM_CACHE(dnotify_mark_entry, SLAB_PANIC);
+	dnotify_mark_cache = KMEM_CACHE(dnotify_mark, SLAB_PANIC);
 
-	dnotify_group = fsnotify_obtain_group(DNOTIFY_GROUP_NUM,
-					      0, &dnotify_fsnotify_ops);
+	dnotify_group = fsnotify_alloc_group(&dnotify_fsnotify_ops);
 	if (IS_ERR(dnotify_group))
 		panic("unable to allocate fsnotify group for dnotify\n");
 	return 0;
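For reference, fcntl_dirnotify() above is reached via fcntl(F_NOTIFY); a hedged sketch of the long-standing userspace usage it serves (illustrative, not from this patch):

#define _GNU_SOURCE
#include <fcntl.h>
#include <signal.h>
#include <stdio.h>
#include <unistd.h>

static volatile sig_atomic_t notified;

static void handler(int sig)
{
	(void)sig;
	notified = 1;
}

int main(void)
{
	int dir_fd = open(".", O_RDONLY);

	if (dir_fd < 0)
		return 1;
	signal(SIGRTMIN, handler);
	/* deliver a queued realtime signal instead of plain SIGIO */
	fcntl(dir_fd, F_SETSIG, SIGRTMIN);
	/* DN_MULTISHOT keeps the watch armed after the first event */
	fcntl(dir_fd, F_NOTIFY, DN_CREATE | DN_MODIFY | DN_MULTISHOT);
	while (!notified)
		pause();
	puts("directory changed");
	return 0;
}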
diff --git a/fs/notify/fanotify/Kconfig b/fs/notify/fanotify/Kconfig
new file mode 100644
index 000000000000..3ac36b7bf6b9
--- /dev/null
+++ b/fs/notify/fanotify/Kconfig
@@ -0,0 +1,26 @@
+config FANOTIFY
+	bool "Filesystem wide access notification"
+	select FSNOTIFY
+	select ANON_INODES
+	default n
+	---help---
+	   Say Y here to enable fanotify support.  fanotify is a file access
+	   notification system which differs from inotify in that it sends
+	   an open file descriptor to the userspace listener along with
+	   the event.
+
+	   If unsure, say Y.
+
+config FANOTIFY_ACCESS_PERMISSIONS
+	bool "fanotify permissions checking"
+	depends on FANOTIFY
+	depends on SECURITY
+	default n
+	---help---
+	   Say Y here if you want fanotify listeners to be able to make permissions
+	   decisions concerning filesystem events.  This is used by some fanotify
+	   listeners which need to scan files before allowing the system access to
+	   use those files.  This is used by some anti-malware vendors and by some
+	   hierarchical storage management systems.
+
+	   If unsure, say N.
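As a hedged illustration of the listener this option enables (not part of this patch): the program reads an event, then writes back a struct fanotify_response { fd, response } pair, which process_access_response() in fanotify_user.c below consumes; the FAN_OPEN_PERM mark setup and the raw read()/write() protocol are assumptions drawn from the rest of this series.

#include <unistd.h>
#include <linux/fanotify.h>

/* assumes fan_fd was set up with fanotify_init() and a FAN_OPEN_PERM mark */
static void serve_permission_events(int fan_fd)
{
	struct fanotify_event_metadata ev;
	struct fanotify_response resp;

	while (read(fan_fd, &ev, sizeof(ev)) == sizeof(ev)) {
		if (ev.mask & FAN_OPEN_PERM) {
			resp.fd = ev.fd;
			resp.response = FAN_ALLOW;	/* or FAN_DENY after scanning */
			write(fan_fd, &resp, sizeof(resp));
		}
		if (ev.fd >= 0)
			close(ev.fd);
	}
}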
diff --git a/fs/notify/fanotify/Makefile b/fs/notify/fanotify/Makefile
new file mode 100644
index 000000000000..0999213e7e6e
--- /dev/null
+++ b/fs/notify/fanotify/Makefile
@@ -0,0 +1 @@
+obj-$(CONFIG_FANOTIFY)	+= fanotify.o fanotify_user.o
diff --git a/fs/notify/fanotify/fanotify.c b/fs/notify/fanotify/fanotify.c
new file mode 100644
index 000000000000..eb8f73c9c131
--- /dev/null
+++ b/fs/notify/fanotify/fanotify.c
@@ -0,0 +1,212 @@
+#include <linux/fanotify.h>
+#include <linux/fdtable.h>
+#include <linux/fsnotify_backend.h>
+#include <linux/init.h>
+#include <linux/jiffies.h>
+#include <linux/kernel.h> /* UINT_MAX */
+#include <linux/mount.h>
+#include <linux/sched.h>
+#include <linux/types.h>
+#include <linux/wait.h>
+
+static bool should_merge(struct fsnotify_event *old, struct fsnotify_event *new)
+{
+	pr_debug("%s: old=%p new=%p\n", __func__, old, new);
+
+	if (old->to_tell == new->to_tell &&
+	    old->data_type == new->data_type &&
+	    old->tgid == new->tgid) {
+		switch (old->data_type) {
+		case (FSNOTIFY_EVENT_FILE):
+			if ((old->file->f_path.mnt == new->file->f_path.mnt) &&
+			    (old->file->f_path.dentry == new->file->f_path.dentry))
+				return true;
+		case (FSNOTIFY_EVENT_NONE):
+			return true;
+		default:
+			BUG();
+		};
+	}
+	return false;
+}
+
+/* and the list better be locked by something too! */
+static struct fsnotify_event *fanotify_merge(struct list_head *list,
+					     struct fsnotify_event *event)
+{
+	struct fsnotify_event_holder *test_holder;
+	struct fsnotify_event *test_event = NULL;
+	struct fsnotify_event *new_event;
+
+	pr_debug("%s: list=%p event=%p\n", __func__, list, event);
+
+
+	list_for_each_entry_reverse(test_holder, list, event_list) {
+		if (should_merge(test_holder->event, event)) {
+			test_event = test_holder->event;
+			break;
+		}
+	}
+
+	if (!test_event)
+		return NULL;
+
+	fsnotify_get_event(test_event);
+
+	/* if they are exactly the same we are done */
+	if (test_event->mask == event->mask)
+		return test_event;
+
+	/*
+	 * if the refcnt == 2 this is the only queue
+	 * for this event and so we can update the mask
+	 * in place.
+	 */
+	if (atomic_read(&test_event->refcnt) == 2) {
+		test_event->mask |= event->mask;
+		return test_event;
+	}
+
+	new_event = fsnotify_clone_event(test_event);
+
+	/* done with test_event */
+	fsnotify_put_event(test_event);
+
+	/* couldn't allocate memory, merge was not possible */
+	if (unlikely(!new_event))
+		return ERR_PTR(-ENOMEM);
+
+	/* build new event and replace it on the list */
+	new_event->mask = (test_event->mask | event->mask);
+	fsnotify_replace_event(test_holder, new_event);
+
+	/* we hold a reference on new_event from clone_event */
+	return new_event;
+}
+
+#ifdef CONFIG_FANOTIFY_ACCESS_PERMISSIONS
+static int fanotify_get_response_from_access(struct fsnotify_group *group,
+					     struct fsnotify_event *event)
+{
+	int ret;
+
+	pr_debug("%s: group=%p event=%p\n", __func__, group, event);
+
+	wait_event(group->fanotify_data.access_waitq, event->response);
+
+	/* userspace responded, convert to something usable */
+	spin_lock(&event->lock);
+	switch (event->response) {
+	case FAN_ALLOW:
+		ret = 0;
+		break;
+	case FAN_DENY:
+	default:
+		ret = -EPERM;
+	}
+	event->response = 0;
+	spin_unlock(&event->lock);
+
+	pr_debug("%s: group=%p event=%p about to return ret=%d\n", __func__,
+		 group, event, ret);
+
+	return ret;
+}
+#endif
+
+static int fanotify_handle_event(struct fsnotify_group *group,
+				 struct fsnotify_mark *inode_mark,
+				 struct fsnotify_mark *fanotify_mark,
+				 struct fsnotify_event *event)
+{
+	int ret = 0;
+	struct fsnotify_event *notify_event = NULL;
+
+	BUILD_BUG_ON(FAN_ACCESS != FS_ACCESS);
+	BUILD_BUG_ON(FAN_MODIFY != FS_MODIFY);
+	BUILD_BUG_ON(FAN_CLOSE_NOWRITE != FS_CLOSE_NOWRITE);
+	BUILD_BUG_ON(FAN_CLOSE_WRITE != FS_CLOSE_WRITE);
+	BUILD_BUG_ON(FAN_OPEN != FS_OPEN);
+	BUILD_BUG_ON(FAN_EVENT_ON_CHILD != FS_EVENT_ON_CHILD);
+	BUILD_BUG_ON(FAN_Q_OVERFLOW != FS_Q_OVERFLOW);
+	BUILD_BUG_ON(FAN_OPEN_PERM != FS_OPEN_PERM);
+	BUILD_BUG_ON(FAN_ACCESS_PERM != FS_ACCESS_PERM);
+
+	pr_debug("%s: group=%p event=%p\n", __func__, group, event);
+
+	notify_event = fsnotify_add_notify_event(group, event, NULL, fanotify_merge);
+	if (IS_ERR(notify_event))
+		return PTR_ERR(notify_event);
+
+#ifdef CONFIG_FANOTIFY_ACCESS_PERMISSIONS
+	if (event->mask & FAN_ALL_PERM_EVENTS) {
+		/* if we merged we need to wait on the new event */
+		if (notify_event)
+			event = notify_event;
+		ret = fanotify_get_response_from_access(group, event);
+	}
+#endif
+
+	if (notify_event)
+		fsnotify_put_event(notify_event);
+
+	return ret;
+}
+
+static bool fanotify_should_send_event(struct fsnotify_group *group,
+				       struct inode *to_tell,
+				       struct fsnotify_mark *inode_mark,
+				       struct fsnotify_mark *vfsmnt_mark,
+				       __u32 event_mask, void *data, int data_type)
+{
+	__u32 marks_mask, marks_ignored_mask;
+
+	pr_debug("%s: group=%p to_tell=%p inode_mark=%p vfsmnt_mark=%p "
+		 "mask=%x data=%p data_type=%d\n", __func__, group, to_tell,
+		 inode_mark, vfsmnt_mark, event_mask, data, data_type);
+
+	pr_debug("%s: group=%p vfsmount_mark=%p inode_mark=%p mask=%x\n",
+		 __func__, group, vfsmnt_mark, inode_mark, event_mask);
+
+	/* sorry, fanotify only gives a damn about files and dirs */
+	if (!S_ISREG(to_tell->i_mode) &&
+	    !S_ISDIR(to_tell->i_mode))
+		return false;
+
+	/* if we don't have enough info to send an event to userspace say no */
+	if (data_type != FSNOTIFY_EVENT_FILE)
+		return false;
+
+	if (inode_mark && vfsmnt_mark) {
+		marks_mask = (vfsmnt_mark->mask | inode_mark->mask);
+		marks_ignored_mask = (vfsmnt_mark->ignored_mask | inode_mark->ignored_mask);
+	} else if (inode_mark) {
+		/*
+		 * if the event is for a child and this inode doesn't care about
+		 * events on the child, don't send it!
+		 */
+		if ((event_mask & FS_EVENT_ON_CHILD) &&
+		    !(inode_mark->mask & FS_EVENT_ON_CHILD))
+			return false;
+		marks_mask = inode_mark->mask;
+		marks_ignored_mask = inode_mark->ignored_mask;
+	} else if (vfsmnt_mark) {
+		marks_mask = vfsmnt_mark->mask;
+		marks_ignored_mask = vfsmnt_mark->ignored_mask;
+	} else {
+		BUG();
+	}
+
+	if (event_mask & marks_mask & ~marks_ignored_mask)
+		return true;
+
+	return false;
+}
+
+const struct fsnotify_ops fanotify_fsnotify_ops = {
+	.handle_event = fanotify_handle_event,
+	.should_send_event = fanotify_should_send_event,
+	.free_group_priv = NULL,
+	.free_event_priv = NULL,
+	.freeing_mark = NULL,
+};
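The mark-combining logic in fanotify_should_send_event() above reduces to one expression; a standalone restatement for clarity (illustrative only, not kernel code):

#include <stdbool.h>
#include <stdint.h>

/* an event is delivered iff some bit of it survives the union of the
 * marks' masks minus the union of their ignored masks */
static bool fanotify_would_send(uint32_t event_mask,
				uint32_t inode_mask, uint32_t inode_ignored,
				uint32_t mnt_mask, uint32_t mnt_ignored)
{
	uint32_t marks_mask = inode_mask | mnt_mask;
	uint32_t marks_ignored_mask = inode_ignored | mnt_ignored;

	return (event_mask & marks_mask & ~marks_ignored_mask) != 0;
}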
diff --git a/fs/notify/fanotify/fanotify_user.c b/fs/notify/fanotify/fanotify_user.c
new file mode 100644
index 000000000000..25a3b4dfcf61
--- /dev/null
+++ b/fs/notify/fanotify/fanotify_user.c
@@ -0,0 +1,760 @@
+#include <linux/fanotify.h>
+#include <linux/fcntl.h>
+#include <linux/file.h>
+#include <linux/fs.h>
+#include <linux/anon_inodes.h>
+#include <linux/fsnotify_backend.h>
+#include <linux/init.h>
+#include <linux/mount.h>
+#include <linux/namei.h>
+#include <linux/poll.h>
+#include <linux/security.h>
+#include <linux/syscalls.h>
+#include <linux/slab.h>
+#include <linux/types.h>
+#include <linux/uaccess.h>
+
+#include <asm/ioctls.h>
+
+extern const struct fsnotify_ops fanotify_fsnotify_ops;
+
+static struct kmem_cache *fanotify_mark_cache __read_mostly;
+static struct kmem_cache *fanotify_response_event_cache __read_mostly;
+
+struct fanotify_response_event {
+	struct list_head list;
+	__s32 fd;
+	struct fsnotify_event *event;
+};
+
+/*
+ * Get an fsnotify notification event if one exists and is small
+ * enough to fit in "count". Return an error pointer if the count
+ * is not large enough.
+ *
+ * Called with the group->notification_mutex held.
+ */
+static struct fsnotify_event *get_one_event(struct fsnotify_group *group,
+					    size_t count)
+{
+	BUG_ON(!mutex_is_locked(&group->notification_mutex));
+
+	pr_debug("%s: group=%p count=%zd\n", __func__, group, count);
+
+	if (fsnotify_notify_queue_is_empty(group))
+		return NULL;
+
+	if (FAN_EVENT_METADATA_LEN > count)
+		return ERR_PTR(-EINVAL);
+
+	/* held the notification_mutex the whole time, so this is the
+	 * same event we peeked above */
+	return fsnotify_remove_notify_event(group);
+}
+
+static int create_fd(struct fsnotify_group *group, struct fsnotify_event *event)
+{
+	int client_fd;
+	struct dentry *dentry;
+	struct vfsmount *mnt;
+	struct file *new_file;
+
+	pr_debug("%s: group=%p event=%p\n", __func__, group, event);
+
+	client_fd = get_unused_fd();
+	if (client_fd < 0)
+		return client_fd;
+
+	if (event->data_type != FSNOTIFY_EVENT_FILE) {
+		WARN_ON(1);
+		put_unused_fd(client_fd);
+		return -EINVAL;
+	}
+
+	/*
+	 * we need a new file handle for the userspace program so it can read even if it was
+	 * originally opened O_WRONLY.
+	 */
+	dentry = dget(event->file->f_path.dentry);
+	mnt = mntget(event->file->f_path.mnt);
+	/* it's possible this event was an overflow event.  in that case dentry and mnt
+	 * are NULL;  That's fine, just don't call dentry open */
+	if (dentry && mnt)
+		new_file = dentry_open(dentry, mnt,
+				       group->fanotify_data.f_flags | FMODE_NONOTIFY,
+				       current_cred());
+	else
+		new_file = ERR_PTR(-EOVERFLOW);
+	if (IS_ERR(new_file)) {
+		/*
+		 * we still send an event even if we can't open the file.  this
+		 * can happen when say tasks are gone and we try to open their
+		 * /proc files or we try to open a WRONLY file like in sysfs
+		 * we just send the errno to userspace since there isn't much
+		 * else we can do.
+		 */
+		put_unused_fd(client_fd);
+		client_fd = PTR_ERR(new_file);
+	} else {
+		fd_install(client_fd, new_file);
+	}
+
+	return client_fd;
+}
+
+static ssize_t fill_event_metadata(struct fsnotify_group *group,
+				   struct fanotify_event_metadata *metadata,
+				   struct fsnotify_event *event)
+{
+	pr_debug("%s: group=%p metadata=%p event=%p\n", __func__,
+		 group, metadata, event);
+
+	metadata->event_len = FAN_EVENT_METADATA_LEN;
+	metadata->vers = FANOTIFY_METADATA_VERSION;
+	metadata->mask = event->mask & FAN_ALL_OUTGOING_EVENTS;
+	metadata->pid = pid_vnr(event->tgid);
+	metadata->fd = create_fd(group, event);
+
+	return metadata->fd;
+}
+
+#ifdef CONFIG_FANOTIFY_ACCESS_PERMISSIONS
+static struct fanotify_response_event *dequeue_re(struct fsnotify_group *group,
+						  __s32 fd)
+{
+	struct fanotify_response_event *re, *return_re = NULL;
+
+	mutex_lock(&group->fanotify_data.access_mutex);
+	list_for_each_entry(re, &group->fanotify_data.access_list, list) {
+		if (re->fd != fd)
+			continue;
+
+		list_del_init(&re->list);
+		return_re = re;
+		break;
+	}
+	mutex_unlock(&group->fanotify_data.access_mutex);
+
+	pr_debug("%s: found return_re=%p\n", __func__, return_re);
+
+	return return_re;
+}
+
+static int process_access_response(struct fsnotify_group *group,
+				   struct fanotify_response *response_struct)
+{
+	struct fanotify_response_event *re;
+	__s32 fd = response_struct->fd;
+	__u32 response = response_struct->response;
+
+	pr_debug("%s: group=%p fd=%d response=%d\n", __func__, group,
+		 fd, response);
+	/*
+	 * make sure the response is valid, if invalid we do nothing and either
+	 * userspace can send a valid response or we will clean it up after the
+	 * timeout
+	 */
+	switch (response) {
+	case FAN_ALLOW:
+	case FAN_DENY:
+		break;
+	default:
+		return -EINVAL;
+	}
+
+	if (fd < 0)
+		return -EINVAL;
+
+	re = dequeue_re(group, fd);
+	if (!re)
+		return -ENOENT;
+
+	re->event->response = response;
+
+	wake_up(&group->fanotify_data.access_waitq);
+
+	kmem_cache_free(fanotify_response_event_cache, re);
+
+	return 0;
+}
+
+static int prepare_for_access_response(struct fsnotify_group *group,
+				       struct fsnotify_event *event,
+				       __s32 fd)
+{
+	struct fanotify_response_event *re;
+
+	if (!(event->mask & FAN_ALL_PERM_EVENTS))
+		return 0;
+
+	re = kmem_cache_alloc(fanotify_response_event_cache, GFP_KERNEL);
+	if (!re)
+		return -ENOMEM;
+
+	re->event = event;
+	re->fd = fd;
+
+	mutex_lock(&group->fanotify_data.access_mutex);
+	list_add_tail(&re->list, &group->fanotify_data.access_list);
+	mutex_unlock(&group->fanotify_data.access_mutex);
+
+	return 0;
+}
+
+static void remove_access_response(struct fsnotify_group *group,
+				   struct fsnotify_event *event,
+				   __s32 fd)
+{
+	struct fanotify_response_event *re;
+
+	if (!(event->mask & FAN_ALL_PERM_EVENTS))
+		return;
+
+	re = dequeue_re(group, fd);
+	if (!re)
+		return;
+
+	BUG_ON(re->event != event);
+
+	kmem_cache_free(fanotify_response_event_cache, re);
+
+	return;
+}
+#else
+static int prepare_for_access_response(struct fsnotify_group *group,
+				       struct fsnotify_event *event,
+				       __s32 fd)
+{
+	return 0;
+}
+
+static void remove_access_response(struct fsnotify_group *group,
+				   struct fsnotify_event *event,
+				   __s32 fd)
+{
+	return;
+}
+#endif
+
+static ssize_t copy_event_to_user(struct fsnotify_group *group,
+				  struct fsnotify_event *event,
+				  char __user *buf)
+{
+	struct fanotify_event_metadata fanotify_event_metadata;
+	int fd, ret;
+
+	pr_debug("%s: group=%p event=%p\n", __func__, group, event);
+
+	fd = fill_event_metadata(group, &fanotify_event_metadata, event);
+	if (fd < 0)
+		return fd;
+
+	ret = prepare_for_access_response(group, event, fd);
+	if (ret)
+		goto out_close_fd;
+
+	ret = -EFAULT;
+	if (copy_to_user(buf, &fanotify_event_metadata, FAN_EVENT_METADATA_LEN))
+		goto out_kill_access_response;
+
+	return FAN_EVENT_METADATA_LEN;
+
+out_kill_access_response:
+	remove_access_response(group, event, fd);
+out_close_fd:
+	sys_close(fd);
+	return ret;
+}
+
+/* fanotify userspace file descriptor functions */
+static unsigned int fanotify_poll(struct file *file, poll_table *wait)
+{
+	struct fsnotify_group *group = file->private_data;
+	int ret = 0;
+
+	poll_wait(file, &group->notification_waitq, wait);
+	mutex_lock(&group->notification_mutex);
+	if (!fsnotify_notify_queue_is_empty(group))
+		ret = POLLIN | POLLRDNORM;
+	mutex_unlock(&group->notification_mutex);
+
+	return ret;
+}
+
+static ssize_t fanotify_read(struct file *file, char __user *buf,
+			     size_t count, loff_t *pos)
+{
+	struct fsnotify_group *group;
+	struct fsnotify_event *kevent;
+	char __user *start;
+	int ret;
+	DEFINE_WAIT(wait);
+
+	start = buf;
+	group = file->private_data;
+
+	pr_debug("%s: group=%p\n", __func__, group);
+
+	while (1) {
+		prepare_to_wait(&group->notification_waitq, &wait, TASK_INTERRUPTIBLE);
+
+		mutex_lock(&group->notification_mutex);
+		kevent = get_one_event(group, count);
+		mutex_unlock(&group->notification_mutex);
+
+		if (kevent) {
+			ret = PTR_ERR(kevent);
+			if (IS_ERR(kevent))
+				break;
+			ret = copy_event_to_user(group, kevent, buf);
+			fsnotify_put_event(kevent);
+			if (ret < 0)
+				break;
+			buf += ret;
+			count -= ret;
+			continue;
+		}
+
+		ret = -EAGAIN;
+		if (file->f_flags & O_NONBLOCK)
+			break;
+		ret = -EINTR;
+		if (signal_pending(current))
+			break;
+
+		if (start != buf)
+			break;
+
+		schedule();
+	}
+
+	finish_wait(&group->notification_waitq, &wait);
+	if (start != buf && ret != -EFAULT)
+		ret = buf - start;
+	return ret;
+}
+
+static ssize_t fanotify_write(struct file *file, const char __user *buf, size_t count, loff_t *pos)
+{
+#ifdef CONFIG_FANOTIFY_ACCESS_PERMISSIONS
+	struct fanotify_response response = { .fd = -1, .response = -1 };
+	struct fsnotify_group *group;
+	int ret;
+
+	group = file->private_data;
+
+	if (count > sizeof(response))
+		count = sizeof(response);
+
+	pr_debug("%s: group=%p count=%zu\n", __func__, group, count);
+
+	if (copy_from_user(&response, buf, count))
+		return -EFAULT;
+
+	ret = process_access_response(group, &response);
+	if (ret < 0)
+		count = ret;
+
+	return count;
+#else
+	return -EINVAL;
+#endif
+}
+
+static int fanotify_release(struct inode *ignored, struct file *file)
+{
+	struct fsnotify_group *group = file->private_data;
+
+	pr_debug("%s: file=%p group=%p\n", __func__, file, group);
+
+	/* matches the fanotify_init->fsnotify_alloc_group */
+	fsnotify_put_group(group);
+
+	return 0;
+}
+
+static long fanotify_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
+{
+	struct fsnotify_group *group;
+	struct fsnotify_event_holder *holder;
+	void __user *p;
+	int ret = -ENOTTY;
+	size_t send_len = 0;
+
+	group = file->private_data;
+
+	p = (void __user *) arg;
+
+	switch (cmd) {
+	case FIONREAD:
+		mutex_lock(&group->notification_mutex);
+		list_for_each_entry(holder, &group->notification_list, event_list)
+			send_len += FAN_EVENT_METADATA_LEN;
+		mutex_unlock(&group->notification_mutex);
+		ret = put_user(send_len, (int __user *) p);
+		break;
+	}
+
+	return ret;
+}
+
+static const struct file_operations fanotify_fops = {
+	.poll		= fanotify_poll,
+	.read		= fanotify_read,
+	.write		= fanotify_write,
+	.fasync		= NULL,
+	.release	= fanotify_release,
+	.unlocked_ioctl	= fanotify_ioctl,
+	.compat_ioctl	= fanotify_ioctl,
+};
410
411static void fanotify_free_mark(struct fsnotify_mark *fsn_mark)
412{
413 kmem_cache_free(fanotify_mark_cache, fsn_mark);
414}
415
416static int fanotify_find_path(int dfd, const char __user *filename,
417 struct path *path, unsigned int flags)
418{
419 int ret;
420
421 pr_debug("%s: dfd=%d filename=%p flags=%x\n", __func__,
422 dfd, filename, flags);
423
424 if (filename == NULL) {
425 struct file *file;
426 int fput_needed;
427
428 ret = -EBADF;
429 file = fget_light(dfd, &fput_needed);
430 if (!file)
431 goto out;
432
433 ret = -ENOTDIR;
434 if ((flags & FAN_MARK_ONLYDIR) &&
435 !(S_ISDIR(file->f_path.dentry->d_inode->i_mode))) {
436 fput_light(file, fput_needed);
437 goto out;
438 }
439
440 *path = file->f_path;
441 path_get(path);
442 fput_light(file, fput_needed);
443 } else {
444 unsigned int lookup_flags = 0;
445
446 if (!(flags & FAN_MARK_DONT_FOLLOW))
447 lookup_flags |= LOOKUP_FOLLOW;
448 if (flags & FAN_MARK_ONLYDIR)
449 lookup_flags |= LOOKUP_DIRECTORY;
450
451 ret = user_path_at(dfd, filename, lookup_flags, path);
452 if (ret)
453 goto out;
454 }
455
456 /* you can only watch an inode if you have read permissions on it */
457 ret = inode_permission(path->dentry->d_inode, MAY_READ);
458 if (ret)
459 path_put(path);
460out:
461 return ret;
462}
463
464static __u32 fanotify_mark_remove_from_mask(struct fsnotify_mark *fsn_mark,
465 __u32 mask,
466 unsigned int flags)
467{
468 __u32 oldmask;
469
470 spin_lock(&fsn_mark->lock);
471 if (!(flags & FAN_MARK_IGNORED_MASK)) {
472 oldmask = fsn_mark->mask;
473 fsnotify_set_mark_mask_locked(fsn_mark, (oldmask & ~mask));
474 } else {
475 oldmask = fsn_mark->ignored_mask;
476 fsnotify_set_mark_ignored_mask_locked(fsn_mark, (oldmask & ~mask));
477 }
478 spin_unlock(&fsn_mark->lock);
479
480 if (!(oldmask & ~mask))
481 fsnotify_destroy_mark(fsn_mark);
482
483 return mask & oldmask;
484}
485
486static int fanotify_remove_vfsmount_mark(struct fsnotify_group *group,
487 struct vfsmount *mnt, __u32 mask,
488 unsigned int flags)
489{
490 struct fsnotify_mark *fsn_mark = NULL;
491 __u32 removed;
492
493 fsn_mark = fsnotify_find_vfsmount_mark(group, mnt);
494 if (!fsn_mark)
495 return -ENOENT;
496
497 removed = fanotify_mark_remove_from_mask(fsn_mark, mask, flags);
498 fsnotify_put_mark(fsn_mark);
499 if (removed & mnt->mnt_fsnotify_mask)
500 fsnotify_recalc_vfsmount_mask(mnt);
501
502 return 0;
503}
504
505static int fanotify_remove_inode_mark(struct fsnotify_group *group,
506 struct inode *inode, __u32 mask,
507 unsigned int flags)
508{
509 struct fsnotify_mark *fsn_mark = NULL;
510 __u32 removed;
511
512 fsn_mark = fsnotify_find_inode_mark(group, inode);
513 if (!fsn_mark)
514 return -ENOENT;
515
516 removed = fanotify_mark_remove_from_mask(fsn_mark, mask, flags);
517 /* matches the fsnotify_find_inode_mark() */
518 fsnotify_put_mark(fsn_mark);
519 if (removed & inode->i_fsnotify_mask)
520 fsnotify_recalc_inode_mask(inode);
521
522 return 0;
523}
524
525static __u32 fanotify_mark_add_to_mask(struct fsnotify_mark *fsn_mark,
526 __u32 mask,
527 unsigned int flags)
528{
529 __u32 oldmask;
530
531 spin_lock(&fsn_mark->lock);
532 if (!(flags & FAN_MARK_IGNORED_MASK)) {
533 oldmask = fsn_mark->mask;
534 fsnotify_set_mark_mask_locked(fsn_mark, (oldmask | mask));
535 } else {
536 oldmask = fsn_mark->ignored_mask;
537 fsnotify_set_mark_ignored_mask_locked(fsn_mark, (oldmask | mask));
538 if (flags & FAN_MARK_IGNORED_SURV_MODIFY)
539 fsn_mark->flags |= FSNOTIFY_MARK_FLAG_IGNORED_SURV_MODIFY;
540 }
541 spin_unlock(&fsn_mark->lock);
542
543 return mask & ~oldmask;
544}
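/*
 * Worked example (illustrative): if oldmask = FAN_OPEN and the caller adds
 * mask = FAN_OPEN | FAN_MODIFY, the mark's mask becomes
 * FAN_OPEN | FAN_MODIFY and the function returns FAN_MODIFY (only the
 * newly set bits), which the add_mark callers below use to decide whether
 * the object's aggregate mask needs recalculating.
 */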
545
546static int fanotify_add_vfsmount_mark(struct fsnotify_group *group,
547 struct vfsmount *mnt, __u32 mask,
548 unsigned int flags)
549{
550 struct fsnotify_mark *fsn_mark;
551 __u32 added;
552
553 fsn_mark = fsnotify_find_vfsmount_mark(group, mnt);
554 if (!fsn_mark) {
555 int ret;
556
557 fsn_mark = kmem_cache_alloc(fanotify_mark_cache, GFP_KERNEL);
558 if (!fsn_mark)
559 return -ENOMEM;
560
561 fsnotify_init_mark(fsn_mark, fanotify_free_mark);
562 ret = fsnotify_add_mark(fsn_mark, group, NULL, mnt, 0);
563 if (ret) {
564 fanotify_free_mark(fsn_mark);
565 return ret;
566 }
567 }
568 added = fanotify_mark_add_to_mask(fsn_mark, mask, flags);
569 fsnotify_put_mark(fsn_mark);
570 if (added & ~mnt->mnt_fsnotify_mask)
571 fsnotify_recalc_vfsmount_mask(mnt);
572
573 return 0;
574}
575
576static int fanotify_add_inode_mark(struct fsnotify_group *group,
577 struct inode *inode, __u32 mask,
578 unsigned int flags)
579{
580 struct fsnotify_mark *fsn_mark;
581 __u32 added;
582
583 pr_debug("%s: group=%p inode=%p\n", __func__, group, inode);
584
585 fsn_mark = fsnotify_find_inode_mark(group, inode);
586 if (!fsn_mark) {
587 int ret;
588
589 fsn_mark = kmem_cache_alloc(fanotify_mark_cache, GFP_KERNEL);
590 if (!fsn_mark)
591 return -ENOMEM;
592
593 fsnotify_init_mark(fsn_mark, fanotify_free_mark);
594 ret = fsnotify_add_mark(fsn_mark, group, inode, NULL, 0);
595 if (ret) {
596 fanotify_free_mark(fsn_mark);
597 return ret;
598 }
599 }
600 added = fanotify_mark_add_to_mask(fsn_mark, mask, flags);
601 fsnotify_put_mark(fsn_mark);
602 if (added & ~inode->i_fsnotify_mask)
603 fsnotify_recalc_inode_mask(inode);
604 return 0;
605}
606
607/* fanotify syscalls */
608SYSCALL_DEFINE2(fanotify_init, unsigned int, flags, unsigned int, event_f_flags)
609{
610 struct fsnotify_group *group;
611 int f_flags, fd;
612
613 pr_debug("%s: flags=%d event_f_flags=%d\n",
614 __func__, flags, event_f_flags);
615
616 if (!capable(CAP_SYS_ADMIN))
617 return -EACCES;
618
619 if (flags & ~FAN_ALL_INIT_FLAGS)
620 return -EINVAL;
621
622 f_flags = O_RDWR | FMODE_NONOTIFY;
623 if (flags & FAN_CLOEXEC)
624 f_flags |= O_CLOEXEC;
625 if (flags & FAN_NONBLOCK)
626 f_flags |= O_NONBLOCK;
627
628 /* fsnotify_alloc_group takes a ref. Dropped in fanotify_release */
629 group = fsnotify_alloc_group(&fanotify_fsnotify_ops);
630 if (IS_ERR(group))
631 return PTR_ERR(group);
632
633 group->fanotify_data.f_flags = event_f_flags;
634#ifdef CONFIG_FANOTIFY_ACCESS_PERMISSIONS
635 mutex_init(&group->fanotify_data.access_mutex);
636 init_waitqueue_head(&group->fanotify_data.access_waitq);
637 INIT_LIST_HEAD(&group->fanotify_data.access_list);
638#endif
639
640 fd = anon_inode_getfd("[fanotify]", &fanotify_fops, group, f_flags);
641 if (fd < 0)
642 goto out_put_group;
643
644 return fd;
645
646out_put_group:
647 fsnotify_put_group(group);
648 return fd;
649}
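/*
 * Hedged userspace sketch (not part of this file): no glibc wrapper exists
 * yet, so callers invoke the raw syscall using the __NR_fanotify_init
 * number added by this series:
 *
 *	int fd = syscall(__NR_fanotify_init, FAN_CLOEXEC | FAN_NONBLOCK,
 *			 O_RDONLY);
 *	if (fd < 0)
 *		perror("fanotify_init");
 */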
650
651SYSCALL_DEFINE(fanotify_mark)(int fanotify_fd, unsigned int flags,
652 __u64 mask, int dfd,
653 const char __user * pathname)
654{
655 struct inode *inode = NULL;
656 struct vfsmount *mnt = NULL;
657 struct fsnotify_group *group;
658 struct file *filp;
659 struct path path;
660 int ret, fput_needed;
661
662 pr_debug("%s: fanotify_fd=%d flags=%x dfd=%d pathname=%p mask=%llx\n",
663 __func__, fanotify_fd, flags, dfd, pathname, mask);
664
665 /* we only use the lower 32 bits as of right now. */
666 if (mask & ((__u64)0xffffffff << 32))
667 return -EINVAL;
668
669 if (flags & ~FAN_ALL_MARK_FLAGS)
670 return -EINVAL;
671 switch (flags & (FAN_MARK_ADD | FAN_MARK_REMOVE | FAN_MARK_FLUSH)) {
672 case FAN_MARK_ADD:
673 case FAN_MARK_REMOVE:
674 case FAN_MARK_FLUSH:
675 break;
676 default:
677 return -EINVAL;
678 }
679#ifdef CONFIG_FANOTIFY_ACCESS_PERMISSIONS
680 if (mask & ~(FAN_ALL_EVENTS | FAN_ALL_PERM_EVENTS | FAN_EVENT_ON_CHILD))
681#else
682 if (mask & ~(FAN_ALL_EVENTS | FAN_EVENT_ON_CHILD))
683#endif
684 return -EINVAL;
685
686 filp = fget_light(fanotify_fd, &fput_needed);
687 if (unlikely(!filp))
688 return -EBADF;
689
690 /* verify that this is indeed an fanotify instance */
691 ret = -EINVAL;
692 if (unlikely(filp->f_op != &fanotify_fops))
693 goto fput_and_out;
694
695 ret = fanotify_find_path(dfd, pathname, &path, flags);
696 if (ret)
697 goto fput_and_out;
698
699 /* inode held in place by reference to path; group by fget on fd */
700 if (!(flags & FAN_MARK_MOUNT))
701 inode = path.dentry->d_inode;
702 else
703 mnt = path.mnt;
704 group = filp->private_data;
705
706	/* create/update/remove an inode or vfsmount mark */
707 switch (flags & (FAN_MARK_ADD | FAN_MARK_REMOVE | FAN_MARK_FLUSH)) {
708 case FAN_MARK_ADD:
709 if (flags & FAN_MARK_MOUNT)
710 ret = fanotify_add_vfsmount_mark(group, mnt, mask, flags);
711 else
712 ret = fanotify_add_inode_mark(group, inode, mask, flags);
713 break;
714 case FAN_MARK_REMOVE:
715 if (flags & FAN_MARK_MOUNT)
716 ret = fanotify_remove_vfsmount_mark(group, mnt, mask, flags);
717 else
718 ret = fanotify_remove_inode_mark(group, inode, mask, flags);
719 break;
720 case FAN_MARK_FLUSH:
721 if (flags & FAN_MARK_MOUNT)
722 fsnotify_clear_vfsmount_marks_by_group(group);
723 else
724 fsnotify_clear_inode_marks_by_group(group);
725 break;
726 default:
727 ret = -EINVAL;
728 }
729
730 path_put(&path);
731fput_and_out:
732 fput_light(filp, fput_needed);
733 return ret;
734}
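/*
 * Hedged userspace sketch continuing the fanotify_init example above
 * (x86-64 calling convention assumed for the raw 64-bit mask): add a
 * mount mark watching opens under the mount containing "/home".
 * FAN_MARK_ADD | FAN_MARK_MOUNT routes through
 * fanotify_add_vfsmount_mark() above:
 *
 *	int ret = syscall(__NR_fanotify_mark, fd,
 *			  FAN_MARK_ADD | FAN_MARK_MOUNT,
 *			  (__u64)FAN_OPEN, AT_FDCWD, "/home");
 */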
735
736#ifdef CONFIG_HAVE_SYSCALL_WRAPPERS
737asmlinkage long SyS_fanotify_mark(long fanotify_fd, long flags, __u64 mask,
738 long dfd, long pathname)
739{
740 return SYSC_fanotify_mark((int) fanotify_fd, (unsigned int) flags,
741 mask, (int) dfd,
742 (const char __user *) pathname);
743}
744SYSCALL_ALIAS(sys_fanotify_mark, SyS_fanotify_mark);
745#endif
746
747/*
748 * fanotify_user_setup - Our initialization function. Note that we cannot return
749 * error because we have compiled-in VFS hooks. So an (unlikely) failure here
750 * must result in panic().
751 */
752static int __init fanotify_user_setup(void)
753{
754 fanotify_mark_cache = KMEM_CACHE(fsnotify_mark, SLAB_PANIC);
755 fanotify_response_event_cache = KMEM_CACHE(fanotify_response_event,
756 SLAB_PANIC);
757
758 return 0;
759}
760device_initcall(fanotify_user_setup);
diff --git a/fs/notify/fsnotify.c b/fs/notify/fsnotify.c
index fcc2f064af83..4d2a82c1ceb1 100644
--- a/fs/notify/fsnotify.c
+++ b/fs/notify/fsnotify.c
@@ -21,6 +21,7 @@
21#include <linux/gfp.h> 21#include <linux/gfp.h>
22#include <linux/init.h> 22#include <linux/init.h>
23#include <linux/module.h> 23#include <linux/module.h>
24#include <linux/mount.h>
24#include <linux/srcu.h> 25#include <linux/srcu.h>
25 26
26#include <linux/fsnotify_backend.h> 27#include <linux/fsnotify_backend.h>
@@ -35,6 +36,11 @@ void __fsnotify_inode_delete(struct inode *inode)
35} 36}
36EXPORT_SYMBOL_GPL(__fsnotify_inode_delete); 37EXPORT_SYMBOL_GPL(__fsnotify_inode_delete);
37 38
39void __fsnotify_vfsmount_delete(struct vfsmount *mnt)
40{
41 fsnotify_clear_marks_by_mount(mnt);
42}
43
38/* 44/*
39 * Given an inode, first check if we care what happens to our children. Inotify 45 * Given an inode, first check if we care what happens to our children. Inotify
40 * and dnotify both tell their parents about events. If we care about any event 46 * and dnotify both tell their parents about events. If we care about any event
@@ -78,13 +84,16 @@ void __fsnotify_update_child_dentry_flags(struct inode *inode)
78} 84}
79 85
80/* Notify this dentry's parent about a child's events. */ 86/* Notify this dentry's parent about a child's events. */
81void __fsnotify_parent(struct dentry *dentry, __u32 mask) 87void __fsnotify_parent(struct file *file, struct dentry *dentry, __u32 mask)
82{ 88{
83 struct dentry *parent; 89 struct dentry *parent;
84 struct inode *p_inode; 90 struct inode *p_inode;
85 bool send = false; 91 bool send = false;
86 bool should_update_children = false; 92 bool should_update_children = false;
87 93
94 if (!dentry)
95 dentry = file->f_path.dentry;
96
88 if (!(dentry->d_flags & DCACHE_FSNOTIFY_PARENT_WATCHED)) 97 if (!(dentry->d_flags & DCACHE_FSNOTIFY_PARENT_WATCHED))
89 return; 98 return;
90 99
@@ -115,8 +124,12 @@ void __fsnotify_parent(struct dentry *dentry, __u32 mask)
115 * specifies these are events which came from a child. */ 124 * specifies these are events which came from a child. */
116 mask |= FS_EVENT_ON_CHILD; 125 mask |= FS_EVENT_ON_CHILD;
117 126
118 fsnotify(p_inode, mask, dentry->d_inode, FSNOTIFY_EVENT_INODE, 127 if (file)
119 dentry->d_name.name, 0); 128 fsnotify(p_inode, mask, file, FSNOTIFY_EVENT_FILE,
129 dentry->d_name.name, 0);
130 else
131 fsnotify(p_inode, mask, dentry->d_inode, FSNOTIFY_EVENT_INODE,
132 dentry->d_name.name, 0);
120 dput(parent); 133 dput(parent);
121 } 134 }
122 135
@@ -127,63 +140,181 @@ void __fsnotify_parent(struct dentry *dentry, __u32 mask)
127} 140}
128EXPORT_SYMBOL_GPL(__fsnotify_parent); 141EXPORT_SYMBOL_GPL(__fsnotify_parent);
129 142
143static int send_to_group(struct inode *to_tell, struct vfsmount *mnt,
144 struct fsnotify_mark *inode_mark,
145 struct fsnotify_mark *vfsmount_mark,
146 __u32 mask, void *data,
147 int data_is, u32 cookie,
148 const unsigned char *file_name,
149 struct fsnotify_event **event)
150{
151 struct fsnotify_group *group = inode_mark->group;
152 __u32 inode_test_mask = (mask & ~FS_EVENT_ON_CHILD);
153 __u32 vfsmount_test_mask = (mask & ~FS_EVENT_ON_CHILD);
154
155 pr_debug("%s: group=%p to_tell=%p mnt=%p mark=%p mask=%x data=%p"
156 " data_is=%d cookie=%d event=%p\n", __func__, group, to_tell,
157 mnt, inode_mark, mask, data, data_is, cookie, *event);
158
159 /* clear ignored on inode modification */
160 if (mask & FS_MODIFY) {
161 if (inode_mark &&
162 !(inode_mark->flags & FSNOTIFY_MARK_FLAG_IGNORED_SURV_MODIFY))
163 inode_mark->ignored_mask = 0;
164 if (vfsmount_mark &&
165 !(vfsmount_mark->flags & FSNOTIFY_MARK_FLAG_IGNORED_SURV_MODIFY))
166 vfsmount_mark->ignored_mask = 0;
167 }
168
169 /* does the inode mark tell us to do something? */
170 if (inode_mark) {
171 inode_test_mask &= inode_mark->mask;
172 inode_test_mask &= ~inode_mark->ignored_mask;
173 }
174
175 /* does the vfsmount_mark tell us to do something? */
176 if (vfsmount_mark) {
177 vfsmount_test_mask &= vfsmount_mark->mask;
178 vfsmount_test_mask &= ~vfsmount_mark->ignored_mask;
179 if (inode_mark)
180 vfsmount_test_mask &= ~inode_mark->ignored_mask;
181 }
182
183 if (!inode_test_mask && !vfsmount_test_mask)
184 return 0;
185
186 if (group->ops->should_send_event(group, to_tell, inode_mark,
187 vfsmount_mark, mask, data,
188 data_is) == false)
189 return 0;
190
191 if (!*event) {
192 *event = fsnotify_create_event(to_tell, mask, data,
193 data_is, file_name,
194 cookie, GFP_KERNEL);
195 if (!*event)
196 return -ENOMEM;
197 }
198 return group->ops->handle_event(group, inode_mark, vfsmount_mark, *event);
199}
200
130/* 201/*
131 * This is the main call to fsnotify. The VFS calls into hook specific functions 202 * This is the main call to fsnotify. The VFS calls into hook specific functions
132 * in linux/fsnotify.h. Those functions then in turn call here. Here will call 203 * in linux/fsnotify.h. Those functions then in turn call here. Here will call
133 * out to all of the registered fsnotify_group. Those groups can then use the 204 * out to all of the registered fsnotify_group. Those groups can then use the
134 * notification event in whatever means they feel necessary. 205 * notification event in whatever means they feel necessary.
135 */ 206 */
136void fsnotify(struct inode *to_tell, __u32 mask, void *data, int data_is, const char *file_name, u32 cookie) 207int fsnotify(struct inode *to_tell, __u32 mask, void *data, int data_is,
208 const unsigned char *file_name, u32 cookie)
137{ 209{
138 struct fsnotify_group *group; 210 struct hlist_node *inode_node, *vfsmount_node;
211 struct fsnotify_mark *inode_mark = NULL, *vfsmount_mark = NULL;
212 struct fsnotify_group *inode_group, *vfsmount_group;
139 struct fsnotify_event *event = NULL; 213 struct fsnotify_event *event = NULL;
140 int idx; 214 struct vfsmount *mnt;
215 int idx, ret = 0;
216 bool used_inode = false, used_vfsmount = false;
141 /* global tests shouldn't care about events on child only the specific event */ 217 /* global tests shouldn't care about events on child only the specific event */
142 __u32 test_mask = (mask & ~FS_EVENT_ON_CHILD); 218 __u32 test_mask = (mask & ~FS_EVENT_ON_CHILD);
143 219
144 if (list_empty(&fsnotify_groups)) 220 if (data_is == FSNOTIFY_EVENT_FILE)
145 return; 221 mnt = ((struct file *)data)->f_path.mnt;
222 else
223 mnt = NULL;
146 224
147 if (!(test_mask & fsnotify_mask))
148 return;
149
150 if (!(test_mask & to_tell->i_fsnotify_mask))
151 return;
152 /* 225 /*
153 * SRCU!! the groups list is very very much read only and the path is 226 * if this is a modify event we may need to clear the ignored masks
154 * very hot. The VAST majority of events are not going to need to do 227 * otherwise return if neither the inode nor the vfsmount care about
155 * anything other than walk the list so it's crazy to pre-allocate. 228 * this type of event.
156 */ 229 */
157 idx = srcu_read_lock(&fsnotify_grp_srcu); 230 if (!(mask & FS_MODIFY) &&
158 list_for_each_entry_rcu(group, &fsnotify_groups, group_list) { 231 !(test_mask & to_tell->i_fsnotify_mask) &&
159 if (test_mask & group->mask) { 232 !(mnt && test_mask & mnt->mnt_fsnotify_mask))
160 if (!group->ops->should_send_event(group, to_tell, mask)) 233 return 0;
161 continue; 234
162 if (!event) { 235 idx = srcu_read_lock(&fsnotify_mark_srcu);
163 event = fsnotify_create_event(to_tell, mask, data, 236
164 data_is, file_name, cookie, 237 if ((mask & FS_MODIFY) ||
165 GFP_KERNEL); 238 (test_mask & to_tell->i_fsnotify_mask))
166 /* shit, we OOM'd and now we can't tell, maybe 239 inode_node = srcu_dereference(to_tell->i_fsnotify_marks.first,
167 * someday someone else will want to do something 240 &fsnotify_mark_srcu);
168 * here */ 241 else
169 if (!event) 242 inode_node = NULL;
170 break; 243
171 } 244 if (mnt) {
172 group->ops->handle_event(group, event); 245 if ((mask & FS_MODIFY) ||
246 (test_mask & mnt->mnt_fsnotify_mask))
247 vfsmount_node = srcu_dereference(mnt->mnt_fsnotify_marks.first,
248 &fsnotify_mark_srcu);
249 else
250 vfsmount_node = NULL;
251 } else {
252 mnt = NULL;
253 vfsmount_node = NULL;
254 }
255
256 while (inode_node || vfsmount_node) {
257 if (inode_node) {
258 inode_mark = hlist_entry(srcu_dereference(inode_node, &fsnotify_mark_srcu),
259 struct fsnotify_mark, i.i_list);
260 inode_group = inode_mark->group;
261 } else
262 inode_group = (void *)-1;
263
264 if (vfsmount_node) {
265 vfsmount_mark = hlist_entry(srcu_dereference(vfsmount_node, &fsnotify_mark_srcu),
266 struct fsnotify_mark, m.m_list);
267 vfsmount_group = vfsmount_mark->group;
268 } else
269 vfsmount_group = (void *)-1;
270
271 if (inode_group < vfsmount_group) {
272 /* handle inode */
273 send_to_group(to_tell, NULL, inode_mark, NULL, mask, data,
274 data_is, cookie, file_name, &event);
275 used_inode = true;
276 } else if (vfsmount_group < inode_group) {
277 send_to_group(to_tell, mnt, NULL, vfsmount_mark, mask, data,
278 data_is, cookie, file_name, &event);
279 used_vfsmount = true;
280 } else {
281 send_to_group(to_tell, mnt, inode_mark, vfsmount_mark,
282 mask, data, data_is, cookie, file_name,
283 &event);
284 used_vfsmount = true;
285 used_inode = true;
173 } 286 }
287
288 if (used_inode)
289 inode_node = srcu_dereference(inode_node->next,
290 &fsnotify_mark_srcu);
291 if (used_vfsmount)
292 vfsmount_node = srcu_dereference(vfsmount_node->next,
293 &fsnotify_mark_srcu);
174 } 294 }
175 srcu_read_unlock(&fsnotify_grp_srcu, idx); 295
296 srcu_read_unlock(&fsnotify_mark_srcu, idx);
176 /* 297 /*
177 * fsnotify_create_event() took a reference so the event can't be cleaned 298 * fsnotify_create_event() took a reference so the event can't be cleaned
178 * up while we are still trying to add it to lists, drop that one. 299 * up while we are still trying to add it to lists, drop that one.
179 */ 300 */
180 if (event) 301 if (event)
181 fsnotify_put_event(event); 302 fsnotify_put_event(event);
303
304 return ret;
182} 305}
183EXPORT_SYMBOL_GPL(fsnotify); 306EXPORT_SYMBOL_GPL(fsnotify);
184 307
185static __init int fsnotify_init(void) 308static __init int fsnotify_init(void)
186{ 309{
187 return init_srcu_struct(&fsnotify_grp_srcu); 310 int ret;
311
312 BUG_ON(hweight32(ALL_FSNOTIFY_EVENTS) != 23);
313
314 ret = init_srcu_struct(&fsnotify_mark_srcu);
315 if (ret)
316 panic("initializing fsnotify_mark_srcu");
317
318 return 0;
188} 319}
189subsys_initcall(fsnotify_init); 320core_initcall(fsnotify_init);
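The simultaneous walk in fsnotify() above depends on both mark lists being kept sorted by the address of the owning group, so a two-pointer merge visits each group at most once per event and can hand a group its inode mark, its vfsmount mark, or both in a single call. A minimal standalone sketch of that merge (illustrative only; NULL-terminated arrays stand in for the SRCU-protected hlists, and the (void *)-1 sentinel mirrors the kernel code):

	#include <stdio.h>

	static int groups[3];	/* &groups[0] < &groups[1] < &groups[2] */

	int main(void)
	{
		/* per-object mark lists, each sorted by group address */
		void *inode_marks[]    = { &groups[0], &groups[1], NULL };
		void *vfsmount_marks[] = { &groups[1], &groups[2], NULL };
		void **i = inode_marks, **m = vfsmount_marks;

		while (*i || *m) {
			void *ig = *i ? *i : (void *)-1;
			void *mg = *m ? *m : (void *)-1;

			if (ig < mg) {		/* inode mark only */
				printf("group %p: inode mark\n", ig);
				i++;
			} else if (mg < ig) {	/* vfsmount mark only */
				printf("group %p: vfsmount mark\n", mg);
				m++;
			} else {		/* same group: both marks */
				printf("group %p: both marks\n", ig);
				i++;
				m++;
			}
		}
		return 0;
	}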
diff --git a/fs/notify/fsnotify.h b/fs/notify/fsnotify.h
index 4dc240824b2d..85e7d2b431d9 100644
--- a/fs/notify/fsnotify.h
+++ b/fs/notify/fsnotify.h
@@ -6,21 +6,34 @@
6#include <linux/srcu.h> 6#include <linux/srcu.h>
7#include <linux/types.h> 7#include <linux/types.h>
8 8
9/* protects reads of fsnotify_groups */
10extern struct srcu_struct fsnotify_grp_srcu;
11/* all groups which receive fsnotify events */
12extern struct list_head fsnotify_groups;
13/* all bitwise OR of all event types (FS_*) for all fsnotify_groups */
14extern __u32 fsnotify_mask;
15
16/* destroy all events sitting in this groups notification queue */ 9/* destroy all events sitting in this groups notification queue */
17extern void fsnotify_flush_notify(struct fsnotify_group *group); 10extern void fsnotify_flush_notify(struct fsnotify_group *group);
18 11
12/* protects reads of inode and vfsmount marks list */
13extern struct srcu_struct fsnotify_mark_srcu;
14
15extern void fsnotify_set_inode_mark_mask_locked(struct fsnotify_mark *fsn_mark,
16 __u32 mask);
17/* add a mark to an inode */
18extern int fsnotify_add_inode_mark(struct fsnotify_mark *mark,
19 struct fsnotify_group *group, struct inode *inode,
20 int allow_dups);
21/* add a mark to a vfsmount */
22extern int fsnotify_add_vfsmount_mark(struct fsnotify_mark *mark,
23 struct fsnotify_group *group, struct vfsmount *mnt,
24 int allow_dups);
25
19/* final kfree of a group */ 26/* final kfree of a group */
20extern void fsnotify_final_destroy_group(struct fsnotify_group *group); 27extern void fsnotify_final_destroy_group(struct fsnotify_group *group);
21 28
29/* vfsmount specific destruction of a mark */
30extern void fsnotify_destroy_vfsmount_mark(struct fsnotify_mark *mark);
31/* inode specific destruction of a mark */
32extern void fsnotify_destroy_inode_mark(struct fsnotify_mark *mark);
22/* run the list of all marks associated with inode and flag them to be freed */ 33/* run the list of all marks associated with inode and flag them to be freed */
23extern void fsnotify_clear_marks_by_inode(struct inode *inode); 34extern void fsnotify_clear_marks_by_inode(struct inode *inode);
35/* run the list of all marks associated with vfsmount and flag them to be freed */
36extern void fsnotify_clear_marks_by_mount(struct vfsmount *mnt);
24/* 37/*
25 * update the dentry->d_flags of all of inode's children to indicate if inode cares 38 * update the dentry->d_flags of all of inode's children to indicate if inode cares
26 * about events that happen to its children. 39 * about events that happen to its children.
diff --git a/fs/notify/group.c b/fs/notify/group.c
index 0e1677144bc5..d309f38449cb 100644
--- a/fs/notify/group.c
+++ b/fs/notify/group.c
@@ -28,64 +28,6 @@
28 28
29#include <asm/atomic.h> 29#include <asm/atomic.h>
30 30
31/* protects writes to fsnotify_groups and fsnotify_mask */
32static DEFINE_MUTEX(fsnotify_grp_mutex);
33/* protects reads while running the fsnotify_groups list */
34struct srcu_struct fsnotify_grp_srcu;
35/* all groups registered to receive filesystem notifications */
36LIST_HEAD(fsnotify_groups);
37/* bitwise OR of all events (FS_*) interesting to some group on this system */
38__u32 fsnotify_mask;
39
40/*
41 * When a new group registers or changes its set of interesting events
42 * this function updates the fsnotify_mask to contain all interesting events
43 */
44void fsnotify_recalc_global_mask(void)
45{
46 struct fsnotify_group *group;
47 __u32 mask = 0;
48 int idx;
49
50 idx = srcu_read_lock(&fsnotify_grp_srcu);
51 list_for_each_entry_rcu(group, &fsnotify_groups, group_list)
52 mask |= group->mask;
53 srcu_read_unlock(&fsnotify_grp_srcu, idx);
54 fsnotify_mask = mask;
55}
56
57/*
58 * Update the group->mask by running all of the marks associated with this
59 * group and finding the bitwise | of all of the mark->mask. If we change
60 * the group->mask we need to update the global mask of events interesting
61 * to the system.
62 */
63void fsnotify_recalc_group_mask(struct fsnotify_group *group)
64{
65 __u32 mask = 0;
66 __u32 old_mask = group->mask;
67 struct fsnotify_mark_entry *entry;
68
69 spin_lock(&group->mark_lock);
70 list_for_each_entry(entry, &group->mark_entries, g_list)
71 mask |= entry->mask;
72 spin_unlock(&group->mark_lock);
73
74 group->mask = mask;
75
76 if (old_mask != mask)
77 fsnotify_recalc_global_mask();
78}
79
80/*
81 * Take a reference to a group so things found under the fsnotify_grp_mutex
82 * can't get freed under us
83 */
84static void fsnotify_get_group(struct fsnotify_group *group)
85{
86 atomic_inc(&group->refcnt);
87}
88
89/* 31/*
90 * Final freeing of a group 32 * Final freeing of a group
91 */ 33 */
@@ -110,145 +52,53 @@ void fsnotify_final_destroy_group(struct fsnotify_group *group)
110 */ 52 */
111static void fsnotify_destroy_group(struct fsnotify_group *group) 53static void fsnotify_destroy_group(struct fsnotify_group *group)
112{ 54{
113 /* clear all inode mark entries for this group */ 55 /* clear all inode marks for this group */
114 fsnotify_clear_marks_by_group(group); 56 fsnotify_clear_marks_by_group(group);
115 57
58 synchronize_srcu(&fsnotify_mark_srcu);
59
116 /* past the point of no return, matches the initial value of 1 */ 60 /* past the point of no return, matches the initial value of 1 */
117 if (atomic_dec_and_test(&group->num_marks)) 61 if (atomic_dec_and_test(&group->num_marks))
118 fsnotify_final_destroy_group(group); 62 fsnotify_final_destroy_group(group);
119} 63}
120 64
121/* 65/*
122 * Remove this group from the global list of groups that will get events
123 * this can be done even if there are still references and things still using
124 * this group. This just stops the group from getting new events.
125 */
126static void __fsnotify_evict_group(struct fsnotify_group *group)
127{
128 BUG_ON(!mutex_is_locked(&fsnotify_grp_mutex));
129
130 if (group->on_group_list)
131 list_del_rcu(&group->group_list);
132 group->on_group_list = 0;
133}
134
135/*
136 * Called when a group is no longer interested in getting events. This can be
137 * used if a group is misbehaving or if for some reason a group should no longer
138 * get any filesystem events.
139 */
140void fsnotify_evict_group(struct fsnotify_group *group)
141{
142 mutex_lock(&fsnotify_grp_mutex);
143 __fsnotify_evict_group(group);
144 mutex_unlock(&fsnotify_grp_mutex);
145}
146
147/*
148 * Drop a reference to a group. Free it if it's through. 66 * Drop a reference to a group. Free it if it's through.
149 */ 67 */
150void fsnotify_put_group(struct fsnotify_group *group) 68void fsnotify_put_group(struct fsnotify_group *group)
151{ 69{
152 if (!atomic_dec_and_mutex_lock(&group->refcnt, &fsnotify_grp_mutex)) 70 if (atomic_dec_and_test(&group->refcnt))
153 return; 71 fsnotify_destroy_group(group);
154
155 /*
156 * OK, now we know that there's no other users *and* we hold mutex,
157 * so no new references will appear
158 */
159 __fsnotify_evict_group(group);
160
161 /*
162 * now it's off the list, so the only thing we might care about is
163 * srcu access....
164 */
165 mutex_unlock(&fsnotify_grp_mutex);
166 synchronize_srcu(&fsnotify_grp_srcu);
167
168 /* and now it is really dead. _Nothing_ could be seeing it */
169 fsnotify_recalc_global_mask();
170 fsnotify_destroy_group(group);
171}
172
173/*
174 * Simply run the fsnotify_groups list and find a group which matches
175 * the given parameters. If a group is found we take a reference to that
176 * group.
177 */
178static struct fsnotify_group *fsnotify_find_group(unsigned int group_num, __u32 mask,
179 const struct fsnotify_ops *ops)
180{
181 struct fsnotify_group *group_iter;
182 struct fsnotify_group *group = NULL;
183
184 BUG_ON(!mutex_is_locked(&fsnotify_grp_mutex));
185
186 list_for_each_entry_rcu(group_iter, &fsnotify_groups, group_list) {
187 if (group_iter->group_num == group_num) {
188 if ((group_iter->mask == mask) &&
189 (group_iter->ops == ops)) {
190 fsnotify_get_group(group_iter);
191 group = group_iter;
192 } else
193 group = ERR_PTR(-EEXIST);
194 }
195 }
196 return group;
197} 72}
198 73
199/* 74/*
200 * Either finds an existing group which matches the group_num, mask, and ops or 75 * Create a new fsnotify_group and hold a reference for the group returned.
201 * creates a new group and adds it to the global group list. In either case we
202 * take a reference for the group returned.
203 */ 76 */
204struct fsnotify_group *fsnotify_obtain_group(unsigned int group_num, __u32 mask, 77struct fsnotify_group *fsnotify_alloc_group(const struct fsnotify_ops *ops)
205 const struct fsnotify_ops *ops)
206{ 78{
207 struct fsnotify_group *group, *tgroup; 79 struct fsnotify_group *group;
208 80
209 /* very low use, simpler locking if we just always alloc */ 81 group = kzalloc(sizeof(struct fsnotify_group), GFP_KERNEL);
210 group = kmalloc(sizeof(struct fsnotify_group), GFP_KERNEL);
211 if (!group) 82 if (!group)
212 return ERR_PTR(-ENOMEM); 83 return ERR_PTR(-ENOMEM);
213 84
85	/* set to 0 when there are no external references to this group */
214 atomic_set(&group->refcnt, 1); 86 atomic_set(&group->refcnt, 1);
215 87 /*
216 group->on_group_list = 0; 88 * hits 0 when there are no external references AND no marks for
217 group->group_num = group_num; 89 * this group
218 group->mask = mask; 90 */
91 atomic_set(&group->num_marks, 1);
219 92
220 mutex_init(&group->notification_mutex); 93 mutex_init(&group->notification_mutex);
221 INIT_LIST_HEAD(&group->notification_list); 94 INIT_LIST_HEAD(&group->notification_list);
222 init_waitqueue_head(&group->notification_waitq); 95 init_waitqueue_head(&group->notification_waitq);
223 group->q_len = 0;
224 group->max_events = UINT_MAX; 96 group->max_events = UINT_MAX;
225 97
226 spin_lock_init(&group->mark_lock); 98 spin_lock_init(&group->mark_lock);
227 atomic_set(&group->num_marks, 0); 99 INIT_LIST_HEAD(&group->marks_list);
228 INIT_LIST_HEAD(&group->mark_entries);
229 100
230 group->ops = ops; 101 group->ops = ops;
231 102
232 mutex_lock(&fsnotify_grp_mutex);
233 tgroup = fsnotify_find_group(group_num, mask, ops);
234 if (tgroup) {
235 /* group already exists */
236 mutex_unlock(&fsnotify_grp_mutex);
237 /* destroy the new one we made */
238 fsnotify_put_group(group);
239 return tgroup;
240 }
241
242 /* group not found, add a new one */
243 list_add_rcu(&group->group_list, &fsnotify_groups);
244 group->on_group_list = 1;
245 /* being on the fsnotify_groups list holds one num_marks */
246 atomic_inc(&group->num_marks);
247
248 mutex_unlock(&fsnotify_grp_mutex);
249
250 if (mask)
251 fsnotify_recalc_global_mask();
252
253 return group; 103 return group;
254} 104}
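With the global group list and fsnotify_obtain_group() gone, a backend's group lifecycle reduces to allocate and put, exactly as fanotify_init() and fanotify_release() above use it. A rough in-kernel sketch (my_fsnotify_ops and my_backend_init are placeholders for a backend-supplied struct fsnotify_ops and its setup path):

	static int my_backend_init(void)
	{
		struct fsnotify_group *group;

		group = fsnotify_alloc_group(&my_fsnotify_ops);	/* refcnt == 1 */
		if (IS_ERR(group))
			return PTR_ERR(group);

		/* ... add marks; events arrive via my_fsnotify_ops callbacks ... */

		fsnotify_put_group(group);	/* last ref: marks cleared, group freed */
		return 0;
	}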
diff --git a/fs/notify/inode_mark.c b/fs/notify/inode_mark.c
index 152b83ec005d..33297c005060 100644
--- a/fs/notify/inode_mark.c
+++ b/fs/notify/inode_mark.c
@@ -16,72 +16,6 @@
16 * the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA. 16 * the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
17 */ 17 */
18 18
19/*
20 * fsnotify inode mark locking/lifetime/and refcnting
21 *
22 * REFCNT:
23 * The mark->refcnt tells how many "things" in the kernel currently are
24 * referencing this object. The object typically will live inside the kernel
25 * with a refcnt of 2, one for each list it is on (i_list, g_list). Any task
26 * which can find this object holding the appropriate locks, can take a reference
27 * and the object itself is guaranteed to survive until the reference is dropped.
28 *
29 * LOCKING:
30 * There are 3 spinlocks involved with fsnotify inode marks and they MUST
31 * be taken in order as follows:
32 *
33 * entry->lock
34 * group->mark_lock
35 * inode->i_lock
36 *
37 * entry->lock protects 2 things, entry->group and entry->inode. You must hold
38 * that lock to dereference either of these things (they could be NULL even with
39 * the lock)
40 *
41 * group->mark_lock protects the mark_entries list anchored inside a given group
42 * and each entry is hooked via the g_list. It also sorta protects the
43 * free_g_list, which when used is anchored by a private list on the stack of the
44 * task which held the group->mark_lock.
45 *
46 * inode->i_lock protects the i_fsnotify_mark_entries list anchored inside a
47 * given inode and each entry is hooked via the i_list. (and sorta the
48 * free_i_list)
49 *
50 *
51 * LIFETIME:
52 * Inode marks survive between when they are added to an inode and when their
53 * refcnt==0.
54 *
55 * The inode mark can be cleared for a number of different reasons including:
56 * - The inode is unlinked for the last time. (fsnotify_inode_remove)
57 * - The inode is being evicted from cache. (fsnotify_inode_delete)
58 * - The fs the inode is on is unmounted. (fsnotify_inode_delete/fsnotify_unmount_inodes)
59 * - Something explicitly requests that it be removed. (fsnotify_destroy_mark_by_entry)
60 * - The fsnotify_group associated with the mark is going away and all such marks
61 * need to be cleaned up. (fsnotify_clear_marks_by_group)
62 *
63 * Worst case we are given an inode and need to clean up all the marks on that
64 * inode. We take i_lock and walk the i_fsnotify_mark_entries safely. For each
65 * mark on the list we take a reference (so the mark can't disappear under us).
66 * We remove that mark from the inode's list of marks and we add this mark to a
67 * private list anchored on the stack using i_free_list; at this point we no
68 * longer fear anything finding the mark using the inode's list of marks.
69 *
70 * We can safely and locklessly run the private list on the stack of everything
71 * we just unattached from the original inode. For each mark on the private list
72 * we grab the mark->lock and can thus dereference mark->group and mark->inode. If
73 * we see the group and inode are not NULL we take those locks. Now holding all
74 * 3 locks we can completely remove the mark from other tasks finding it in the
75 * future. Remember, 10 things might already be referencing this mark, but they
76 * better be holding a ref. We drop the reference we took before we unhooked it
77 * from the inode. When the ref hits 0 we can free the mark.
78 *
79 * Very similarly for freeing by group, except we use free_g_list.
80 *
81 * This has the very interesting property of being able to run concurrently with
82 * any (or all) other directions.
83 */
84
85#include <linux/fs.h> 19#include <linux/fs.h>
86#include <linux/init.h> 20#include <linux/init.h>
87#include <linux/kernel.h> 21#include <linux/kernel.h>
@@ -95,30 +29,19 @@
95#include <linux/fsnotify_backend.h> 29#include <linux/fsnotify_backend.h>
96#include "fsnotify.h" 30#include "fsnotify.h"
97 31
98void fsnotify_get_mark(struct fsnotify_mark_entry *entry)
99{
100 atomic_inc(&entry->refcnt);
101}
102
103void fsnotify_put_mark(struct fsnotify_mark_entry *entry)
104{
105 if (atomic_dec_and_test(&entry->refcnt))
106 entry->free_mark(entry);
107}
108
109/* 32/*
110 * Recalculate the mask of events relevant to a given inode locked. 33 * Recalculate the mask of events relevant to a given inode locked.
111 */ 34 */
112static void fsnotify_recalc_inode_mask_locked(struct inode *inode) 35static void fsnotify_recalc_inode_mask_locked(struct inode *inode)
113{ 36{
114 struct fsnotify_mark_entry *entry; 37 struct fsnotify_mark *mark;
115 struct hlist_node *pos; 38 struct hlist_node *pos;
116 __u32 new_mask = 0; 39 __u32 new_mask = 0;
117 40
118 assert_spin_locked(&inode->i_lock); 41 assert_spin_locked(&inode->i_lock);
119 42
120 hlist_for_each_entry(entry, pos, &inode->i_fsnotify_mark_entries, i_list) 43 hlist_for_each_entry(mark, pos, &inode->i_fsnotify_marks, i.i_list)
121 new_mask |= entry->mask; 44 new_mask |= mark->mask;
122 inode->i_fsnotify_mask = new_mask; 45 inode->i_fsnotify_mask = new_mask;
123} 46}
124 47
@@ -135,107 +58,26 @@ void fsnotify_recalc_inode_mask(struct inode *inode)
135 __fsnotify_update_child_dentry_flags(inode); 58 __fsnotify_update_child_dentry_flags(inode);
136} 59}
137 60
138/* 61void fsnotify_destroy_inode_mark(struct fsnotify_mark *mark)
139 * Any time a mark is getting freed we end up here.
140 * The caller had better be holding a reference to this mark so we don't actually
141 * do the final put under the entry->lock
142 */
143void fsnotify_destroy_mark_by_entry(struct fsnotify_mark_entry *entry)
144{ 62{
145 struct fsnotify_group *group; 63 struct inode *inode = mark->i.inode;
146 struct inode *inode;
147 64
148 spin_lock(&entry->lock); 65 assert_spin_locked(&mark->lock);
66 assert_spin_locked(&mark->group->mark_lock);
149 67
150 group = entry->group;
151 inode = entry->inode;
152
153 BUG_ON(group && !inode);
154 BUG_ON(!group && inode);
155
156 /* if !group something else already marked this to die */
157 if (!group) {
158 spin_unlock(&entry->lock);
159 return;
160 }
161
162 /* 1 from caller and 1 for being on i_list/g_list */
163 BUG_ON(atomic_read(&entry->refcnt) < 2);
164
165 spin_lock(&group->mark_lock);
166 spin_lock(&inode->i_lock); 68 spin_lock(&inode->i_lock);
167 69
168 hlist_del_init(&entry->i_list); 70 hlist_del_init_rcu(&mark->i.i_list);
169 entry->inode = NULL; 71 mark->i.inode = NULL;
170
171 list_del_init(&entry->g_list);
172 entry->group = NULL;
173
174 fsnotify_put_mark(entry); /* for i_list and g_list */
175 72
176 /* 73 /*
177 * this mark is now off the inode->i_fsnotify_mark_entries list and we 74 * this mark is now off the inode->i_fsnotify_marks list and we
178 * hold the inode->i_lock, so this is the perfect time to update the 75 * hold the inode->i_lock, so this is the perfect time to update the
179 * inode->i_fsnotify_mask 76 * inode->i_fsnotify_mask
180 */ 77 */
181 fsnotify_recalc_inode_mask_locked(inode); 78 fsnotify_recalc_inode_mask_locked(inode);
182 79
183 spin_unlock(&inode->i_lock); 80 spin_unlock(&inode->i_lock);
184 spin_unlock(&group->mark_lock);
185 spin_unlock(&entry->lock);
186
187 /*
188 * Some groups like to know that marks are being freed. This is a
189 * callback to the group function to let it know that this entry
190 * is being freed.
191 */
192 if (group->ops->freeing_mark)
193 group->ops->freeing_mark(entry, group);
194
195 /*
196 * __fsnotify_update_child_dentry_flags(inode);
197 *
198 * I really want to call that, but we can't, we have no idea if the inode
199 * still exists the second we drop the entry->lock.
200 *
201 * The next time an event arrives at this inode from one of its children
202 * __fsnotify_parent will see that the inode doesn't care about its
203 * children and will update all of these flags then. So really this
204 * is just a lazy update (and could be a perf win...)
205 */
206
207
208 iput(inode);
209
210 /*
211 * it's possible that this group tried to destroy itself, but this
212 * mark was simultaneously being freed by the inode. If that's the
213 * case, we finish freeing the group here.
214 */
215 if (unlikely(atomic_dec_and_test(&group->num_marks)))
216 fsnotify_final_destroy_group(group);
217}
218
219/*
220 * Given a group, destroy all of the marks associated with that group.
221 */
222void fsnotify_clear_marks_by_group(struct fsnotify_group *group)
223{
224 struct fsnotify_mark_entry *lentry, *entry;
225 LIST_HEAD(free_list);
226
227 spin_lock(&group->mark_lock);
228 list_for_each_entry_safe(entry, lentry, &group->mark_entries, g_list) {
229 list_add(&entry->free_g_list, &free_list);
230 list_del_init(&entry->g_list);
231 fsnotify_get_mark(entry);
232 }
233 spin_unlock(&group->mark_lock);
234
235 list_for_each_entry_safe(entry, lentry, &free_list, free_g_list) {
236 fsnotify_destroy_mark_by_entry(entry);
237 fsnotify_put_mark(entry);
238 }
239} 81}
240 82
241/* 83/*
@@ -243,112 +85,145 @@ void fsnotify_clear_marks_by_group(struct fsnotify_group *group)
243 */ 85 */
244void fsnotify_clear_marks_by_inode(struct inode *inode) 86void fsnotify_clear_marks_by_inode(struct inode *inode)
245{ 87{
246 struct fsnotify_mark_entry *entry, *lentry; 88 struct fsnotify_mark *mark, *lmark;
247 struct hlist_node *pos, *n; 89 struct hlist_node *pos, *n;
248 LIST_HEAD(free_list); 90 LIST_HEAD(free_list);
249 91
250 spin_lock(&inode->i_lock); 92 spin_lock(&inode->i_lock);
251 hlist_for_each_entry_safe(entry, pos, n, &inode->i_fsnotify_mark_entries, i_list) { 93 hlist_for_each_entry_safe(mark, pos, n, &inode->i_fsnotify_marks, i.i_list) {
252 list_add(&entry->free_i_list, &free_list); 94 list_add(&mark->i.free_i_list, &free_list);
253 hlist_del_init(&entry->i_list); 95 hlist_del_init_rcu(&mark->i.i_list);
254 fsnotify_get_mark(entry); 96 fsnotify_get_mark(mark);
255 } 97 }
256 spin_unlock(&inode->i_lock); 98 spin_unlock(&inode->i_lock);
257 99
258 list_for_each_entry_safe(entry, lentry, &free_list, free_i_list) { 100 list_for_each_entry_safe(mark, lmark, &free_list, i.free_i_list) {
259 fsnotify_destroy_mark_by_entry(entry); 101 fsnotify_destroy_mark(mark);
260 fsnotify_put_mark(entry); 102 fsnotify_put_mark(mark);
261 } 103 }
262} 104}
263 105
264/* 106/*
107 * Given a group clear all of the inode marks associated with that group.
108 */
109void fsnotify_clear_inode_marks_by_group(struct fsnotify_group *group)
110{
111 fsnotify_clear_marks_by_group_flags(group, FSNOTIFY_MARK_FLAG_INODE);
112}
113
114/*
265 * given a group and inode, find the mark associated with that combination. 115 * given a group and inode, find the mark associated with that combination.
266 * if found take a reference to that mark and return it, else return NULL 116 * if found take a reference to that mark and return it, else return NULL
267 */ 117 */
268struct fsnotify_mark_entry *fsnotify_find_mark_entry(struct fsnotify_group *group, 118struct fsnotify_mark *fsnotify_find_inode_mark_locked(struct fsnotify_group *group,
269 struct inode *inode) 119 struct inode *inode)
270{ 120{
271 struct fsnotify_mark_entry *entry; 121 struct fsnotify_mark *mark;
272 struct hlist_node *pos; 122 struct hlist_node *pos;
273 123
274 assert_spin_locked(&inode->i_lock); 124 assert_spin_locked(&inode->i_lock);
275 125
276 hlist_for_each_entry(entry, pos, &inode->i_fsnotify_mark_entries, i_list) { 126 hlist_for_each_entry(mark, pos, &inode->i_fsnotify_marks, i.i_list) {
277 if (entry->group == group) { 127 if (mark->group == group) {
278 fsnotify_get_mark(entry); 128 fsnotify_get_mark(mark);
279 return entry; 129 return mark;
280 } 130 }
281 } 131 }
282 return NULL; 132 return NULL;
283} 133}
284 134
285/* 135/*
286 * Nothing fancy, just initialize lists and locks and counters. 136 * given a group and inode, find the mark associated with that combination.
137 * if found take a reference to that mark and return it, else return NULL
287 */ 138 */
288void fsnotify_init_mark(struct fsnotify_mark_entry *entry, 139struct fsnotify_mark *fsnotify_find_inode_mark(struct fsnotify_group *group,
289 void (*free_mark)(struct fsnotify_mark_entry *entry)) 140 struct inode *inode)
141{
142 struct fsnotify_mark *mark;
143
144 spin_lock(&inode->i_lock);
145 mark = fsnotify_find_inode_mark_locked(group, inode);
146 spin_unlock(&inode->i_lock);
290 147
148 return mark;
149}
150
151/*
152 * If we are setting a mark mask on an inode mark we should pin the inode
153 * in memory.
154 */
155void fsnotify_set_inode_mark_mask_locked(struct fsnotify_mark *mark,
156 __u32 mask)
291{ 157{
292 spin_lock_init(&entry->lock); 158 struct inode *inode;
293 atomic_set(&entry->refcnt, 1); 159
294 INIT_HLIST_NODE(&entry->i_list); 160 assert_spin_locked(&mark->lock);
295 entry->group = NULL; 161
296 entry->mask = 0; 162 if (mask &&
297 entry->inode = NULL; 163 mark->i.inode &&
298 entry->free_mark = free_mark; 164 !(mark->flags & FSNOTIFY_MARK_FLAG_OBJECT_PINNED)) {
165 mark->flags |= FSNOTIFY_MARK_FLAG_OBJECT_PINNED;
166 inode = igrab(mark->i.inode);
167 /*
168 * we shouldn't be able to get here if the inode wasn't
169 * already safely held in memory, but BUG() in case that
170 * ever turns out to be wrong.
171 */
172 BUG_ON(!inode);
173 }
299} 174}
300 175
301/* 176/*
302 * Attach an initialized mark entry to a given group and inode. 177 * Attach an initialized mark to a given inode.
303 * These marks may be used for the fsnotify backend to determine which 178 * These marks may be used for the fsnotify backend to determine which
304 * event types should be delivered to which group and for which inodes. 179 * event types should be delivered to which group and for which inodes. These
180 * marks are ordered according to the group's location in memory.
305 */ 181 */
306int fsnotify_add_mark(struct fsnotify_mark_entry *entry, 182int fsnotify_add_inode_mark(struct fsnotify_mark *mark,
307 struct fsnotify_group *group, struct inode *inode) 183 struct fsnotify_group *group, struct inode *inode,
184 int allow_dups)
308{ 185{
309 struct fsnotify_mark_entry *lentry; 186 struct fsnotify_mark *lmark;
187 struct hlist_node *node, *last = NULL;
310 int ret = 0; 188 int ret = 0;
311 189
312 inode = igrab(inode); 190 mark->flags |= FSNOTIFY_MARK_FLAG_INODE;
313 if (unlikely(!inode)) 191
314 return -EINVAL; 192 assert_spin_locked(&mark->lock);
193 assert_spin_locked(&group->mark_lock);
315 194
316 /*
317 * LOCKING ORDER!!!!
318 * entry->lock
319 * group->mark_lock
320 * inode->i_lock
321 */
322 spin_lock(&entry->lock);
323 spin_lock(&group->mark_lock);
324 spin_lock(&inode->i_lock); 195 spin_lock(&inode->i_lock);
325 196
326 lentry = fsnotify_find_mark_entry(group, inode); 197 mark->i.inode = inode;
327 if (!lentry) {
328 entry->group = group;
329 entry->inode = inode;
330 198
331 hlist_add_head(&entry->i_list, &inode->i_fsnotify_mark_entries); 199 /* is mark the first mark? */
332 list_add(&entry->g_list, &group->mark_entries); 200 if (hlist_empty(&inode->i_fsnotify_marks)) {
201 hlist_add_head_rcu(&mark->i.i_list, &inode->i_fsnotify_marks);
202 goto out;
203 }
333 204
334 fsnotify_get_mark(entry); /* for i_list and g_list */ 205 /* should mark be in the middle of the current list? */
206 hlist_for_each_entry(lmark, node, &inode->i_fsnotify_marks, i.i_list) {
207 last = node;
208
209 if ((lmark->group == group) && !allow_dups) {
210 ret = -EEXIST;
211 goto out;
212 }
335 213
336 atomic_inc(&group->num_marks); 214 if (mark->group < lmark->group)
215 continue;
337 216
338 fsnotify_recalc_inode_mask_locked(inode); 217 hlist_add_before_rcu(&mark->i.i_list, &lmark->i.i_list);
218 goto out;
339 } 219 }
340 220
221 BUG_ON(last == NULL);
222 /* mark should be the last entry. last is the current last entry */
223 hlist_add_after_rcu(last, &mark->i.i_list);
224out:
225 fsnotify_recalc_inode_mask_locked(inode);
341 spin_unlock(&inode->i_lock); 226 spin_unlock(&inode->i_lock);
342 spin_unlock(&group->mark_lock);
343 spin_unlock(&entry->lock);
344
345 if (lentry) {
346 ret = -EEXIST;
347 iput(inode);
348 fsnotify_put_mark(lentry);
349 } else {
350 __fsnotify_update_child_dentry_flags(inode);
351 }
352 227
353 return ret; 228 return ret;
354} 229}
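The insertion loop in fsnotify_add_inode_mark() above maintains the invariant that every object's mark list is sorted by ascending group address, which is what lets fsnotify() merge the inode and vfsmount walks without backtracking. Illustrative sketch of the invariant (addresses are hypothetical):

	/*
	 * After adding marks from groups at 0x...300, 0x...100 and 0x...200,
	 * in that order, inode->i_fsnotify_marks always reads:
	 *
	 *	mark(group 0x...100) -> mark(group 0x...200) -> mark(group 0x...300)
	 */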
diff --git a/fs/notify/inotify/Kconfig b/fs/notify/inotify/Kconfig
index b3a159b21cfd..b981fc0c8379 100644
--- a/fs/notify/inotify/Kconfig
+++ b/fs/notify/inotify/Kconfig
@@ -1,18 +1,3 @@
1config INOTIFY
2 bool "Inotify file change notification support"
3 default n
4 ---help---
5 Say Y here to enable legacy in kernel inotify support. Inotify is a
6 file change notification system. It is a replacement for dnotify.
7 This option only provides the legacy inotify in kernel API. There
8 are no in tree kernel users of this interface since it is deprecated.
9 You only need this if you are loading an out of tree kernel module
10 that uses inotify.
11
12 For more information, see <file:Documentation/filesystems/inotify.txt>
13
14 If unsure, say N.
15
16config INOTIFY_USER 1config INOTIFY_USER
17 bool "Inotify support for userspace" 2 bool "Inotify support for userspace"
18 select ANON_INODES 3 select ANON_INODES
diff --git a/fs/notify/inotify/Makefile b/fs/notify/inotify/Makefile
index 943828171362..a380dabe09de 100644
--- a/fs/notify/inotify/Makefile
+++ b/fs/notify/inotify/Makefile
@@ -1,2 +1 @@
1obj-$(CONFIG_INOTIFY) += inotify.o
2obj-$(CONFIG_INOTIFY_USER) += inotify_fsnotify.o inotify_user.o obj-$(CONFIG_INOTIFY_USER) += inotify_fsnotify.o inotify_user.o
diff --git a/fs/notify/inotify/inotify.c b/fs/notify/inotify/inotify.c
deleted file mode 100644
index cf6b0429a257..000000000000
--- a/fs/notify/inotify/inotify.c
+++ /dev/null
@@ -1,872 +0,0 @@
1/*
2 * fs/inotify.c - inode-based file event notifications
3 *
4 * Authors:
5 * John McCutchan <ttb@tentacle.dhs.org>
6 * Robert Love <rml@novell.com>
7 *
8 * Kernel API added by: Amy Griffis <amy.griffis@hp.com>
9 *
10 * Copyright (C) 2005 John McCutchan
11 * Copyright 2006 Hewlett-Packard Development Company, L.P.
12 *
13 * This program is free software; you can redistribute it and/or modify it
14 * under the terms of the GNU General Public License as published by the
15 * Free Software Foundation; either version 2, or (at your option) any
16 * later version.
17 *
18 * This program is distributed in the hope that it will be useful, but
19 * WITHOUT ANY WARRANTY; without even the implied warranty of
20 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
21 * General Public License for more details.
22 */
23
24#include <linux/module.h>
25#include <linux/kernel.h>
26#include <linux/spinlock.h>
27#include <linux/idr.h>
28#include <linux/slab.h>
29#include <linux/fs.h>
30#include <linux/sched.h>
31#include <linux/init.h>
32#include <linux/list.h>
33#include <linux/writeback.h>
34#include <linux/inotify.h>
35#include <linux/fsnotify_backend.h>
36
37static atomic_t inotify_cookie;
38
39/*
40 * Lock ordering:
41 *
42 * dentry->d_lock (used to keep d_move() away from dentry->d_parent)
43 * iprune_mutex (synchronize shrink_icache_memory())
44 * inode_lock (protects the super_block->s_inodes list)
45 * inode->inotify_mutex (protects inode->inotify_watches and watches->i_list)
46 * inotify_handle->mutex (protects inotify_handle and watches->h_list)
47 *
48 * The inode->inotify_mutex and inotify_handle->mutex are held during execution
49 * of a caller's event handler. Thus, the caller must not hold any locks
50 * taken in their event handler while calling any of the published inotify
51 * interfaces.
52 */
53
54/*
55 * Lifetimes of the three main data structures--inotify_handle, inode, and
56 * inotify_watch--are managed by reference count.
57 *
58 * inotify_handle: Lifetime is from inotify_init() to inotify_destroy().
59 * Additional references can bump the count via get_inotify_handle() and drop
60 * the count via put_inotify_handle().
61 *
62 * inotify_watch: for inotify's purposes, lifetime is from inotify_add_watch()
63 * to remove_watch_no_event(). Additional references can bump the count via
64 * get_inotify_watch() and drop the count via put_inotify_watch(). The caller
65 * is responsible for the final put after receiving IN_IGNORED, or when using
66 * IN_ONESHOT after receiving the first event. Inotify does the final put if
67 * inotify_destroy() is called.
68 *
69 * inode: Pinned so long as the inode is associated with a watch, from
70 * inotify_add_watch() to the final put_inotify_watch().
71 */
72
73/*
74 * struct inotify_handle - represents an inotify instance
75 *
76 * This structure is protected by the mutex 'mutex'.
77 */
78struct inotify_handle {
79 struct idr idr; /* idr mapping wd -> watch */
80 struct mutex mutex; /* protects this bad boy */
81 struct list_head watches; /* list of watches */
82 atomic_t count; /* reference count */
83 u32 last_wd; /* the last wd allocated */
84 const struct inotify_operations *in_ops; /* inotify caller operations */
85};
86
87static inline void get_inotify_handle(struct inotify_handle *ih)
88{
89 atomic_inc(&ih->count);
90}
91
92static inline void put_inotify_handle(struct inotify_handle *ih)
93{
94 if (atomic_dec_and_test(&ih->count)) {
95 idr_destroy(&ih->idr);
96 kfree(ih);
97 }
98}
99
100/**
101 * get_inotify_watch - grab a reference to an inotify_watch
102 * @watch: watch to grab
103 */
104void get_inotify_watch(struct inotify_watch *watch)
105{
106 atomic_inc(&watch->count);
107}
108EXPORT_SYMBOL_GPL(get_inotify_watch);
109
110int pin_inotify_watch(struct inotify_watch *watch)
111{
112 struct super_block *sb = watch->inode->i_sb;
113 if (atomic_inc_not_zero(&sb->s_active)) {
114 atomic_inc(&watch->count);
115 return 1;
116 }
117 return 0;
118}
119
120/**
121 * put_inotify_watch - decrements the ref count on a given watch. cleans up
122 * watch references if the count reaches zero. inotify_watch is freed by
123 * inotify callers via the destroy_watch() op.
124 * @watch: watch to release
125 */
126void put_inotify_watch(struct inotify_watch *watch)
127{
128 if (atomic_dec_and_test(&watch->count)) {
129 struct inotify_handle *ih = watch->ih;
130
131 iput(watch->inode);
132 ih->in_ops->destroy_watch(watch);
133 put_inotify_handle(ih);
134 }
135}
136EXPORT_SYMBOL_GPL(put_inotify_watch);
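/*
 * Hedged sketch of a legacy consumer of this kernel API, reconstructed
 * from the ih->in_ops call sites in this file; the handle_event signature
 * is inferred, my_handle_event is hypothetical, and the watch-registration
 * side sits outside this hunk. Per the lifetime rules above, the caller
 * does the final put once IN_IGNORED arrives:
 *
 *	static void my_handle_event(struct inotify_watch *watch, u32 wd,
 *				    u32 mask, u32 cookie, const char *name,
 *				    struct inode *n_inode)
 *	{
 *		if (mask & IN_IGNORED)
 *			put_inotify_watch(watch);
 *	}
 */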
137
138void unpin_inotify_watch(struct inotify_watch *watch)
139{
140 struct super_block *sb = watch->inode->i_sb;
141 put_inotify_watch(watch);
142 deactivate_super(sb);
143}
144
145/*
146 * inotify_handle_get_wd - returns the next WD for use by the given handle
147 *
148 * Callers must hold ih->mutex. This function can sleep.
149 */
150static int inotify_handle_get_wd(struct inotify_handle *ih,
151 struct inotify_watch *watch)
152{
153 int ret;
154
155 do {
156 if (unlikely(!idr_pre_get(&ih->idr, GFP_NOFS)))
157 return -ENOSPC;
158 ret = idr_get_new_above(&ih->idr, watch, ih->last_wd+1, &watch->wd);
159 } while (ret == -EAGAIN);
160
161 if (likely(!ret))
162 ih->last_wd = watch->wd;
163
164 return ret;
165}
166
167/*
168 * inotify_inode_watched - returns nonzero if there are watches on this inode
169 * and zero otherwise. We call this lockless, we do not care if we race.
170 */
171static inline int inotify_inode_watched(struct inode *inode)
172{
173 return !list_empty(&inode->inotify_watches);
174}
175
176/*
177 * Get child dentry flag into synch with parent inode.
178 * Flag should always be clear for negative dentries.
179 */
180static void set_dentry_child_flags(struct inode *inode, int watched)
181{
182 struct dentry *alias;
183
184 spin_lock(&dcache_lock);
185 list_for_each_entry(alias, &inode->i_dentry, d_alias) {
186 struct dentry *child;
187
188 list_for_each_entry(child, &alias->d_subdirs, d_u.d_child) {
189 if (!child->d_inode)
190 continue;
191
192 spin_lock(&child->d_lock);
193 if (watched)
194 child->d_flags |= DCACHE_INOTIFY_PARENT_WATCHED;
195 else
196 child->d_flags &=~DCACHE_INOTIFY_PARENT_WATCHED;
197 spin_unlock(&child->d_lock);
198 }
199 }
200 spin_unlock(&dcache_lock);
201}
202
203/*
204 * inode_find_handle - find the watch associated with the given inode and
205 * handle
206 *
207 * Callers must hold inode->inotify_mutex.
208 */
209static struct inotify_watch *inode_find_handle(struct inode *inode,
210 struct inotify_handle *ih)
211{
212 struct inotify_watch *watch;
213
214 list_for_each_entry(watch, &inode->inotify_watches, i_list) {
215 if (watch->ih == ih)
216 return watch;
217 }
218
219 return NULL;
220}
221
222/*
223 * remove_watch_no_event - remove watch without the IN_IGNORED event.
224 *
225 * Callers must hold both inode->inotify_mutex and ih->mutex.
226 */
227static void remove_watch_no_event(struct inotify_watch *watch,
228 struct inotify_handle *ih)
229{
230 list_del(&watch->i_list);
231 list_del(&watch->h_list);
232
233 if (!inotify_inode_watched(watch->inode))
234 set_dentry_child_flags(watch->inode, 0);
235
236 idr_remove(&ih->idr, watch->wd);
237}
238
239/**
240 * inotify_remove_watch_locked - Remove a watch from both the handle and the
241 * inode. Sends the IN_IGNORED event signifying that the inode is no longer
242 * watched. May be invoked from a caller's event handler.
243 * @ih: inotify handle associated with watch
244 * @watch: watch to remove
245 *
246 * Callers must hold both inode->inotify_mutex and ih->mutex.
247 */
248void inotify_remove_watch_locked(struct inotify_handle *ih,
249 struct inotify_watch *watch)
250{
251 remove_watch_no_event(watch, ih);
252 ih->in_ops->handle_event(watch, watch->wd, IN_IGNORED, 0, NULL, NULL);
253}
254EXPORT_SYMBOL_GPL(inotify_remove_watch_locked);
255
256/* Kernel API for producing events */
257
258/*
259 * inotify_d_instantiate - instantiate dcache entry for inode
260 */
261void inotify_d_instantiate(struct dentry *entry, struct inode *inode)
262{
263 struct dentry *parent;
264
265 if (!inode)
266 return;
267
268 spin_lock(&entry->d_lock);
269 parent = entry->d_parent;
270 if (parent->d_inode && inotify_inode_watched(parent->d_inode))
271 entry->d_flags |= DCACHE_INOTIFY_PARENT_WATCHED;
272 spin_unlock(&entry->d_lock);
273}
274
275/*
276 * inotify_d_move - dcache entry has been moved
277 */
278void inotify_d_move(struct dentry *entry)
279{
280 struct dentry *parent;
281
282 parent = entry->d_parent;
283 if (inotify_inode_watched(parent->d_inode))
284 entry->d_flags |= DCACHE_INOTIFY_PARENT_WATCHED;
285 else
286 entry->d_flags &= ~DCACHE_INOTIFY_PARENT_WATCHED;
287}
288
289/**
290 * inotify_inode_queue_event - queue an event to all watches on this inode
291 * @inode: inode event is originating from
292 * @mask: event mask describing this event
293 * @cookie: cookie for synchronization, or zero
294 * @name: filename, if any
295 * @n_inode: inode associated with name
296 */
297void inotify_inode_queue_event(struct inode *inode, u32 mask, u32 cookie,
298 const char *name, struct inode *n_inode)
299{
300 struct inotify_watch *watch, *next;
301
302 if (!inotify_inode_watched(inode))
303 return;
304
305 mutex_lock(&inode->inotify_mutex);
306 list_for_each_entry_safe(watch, next, &inode->inotify_watches, i_list) {
307 u32 watch_mask = watch->mask;
308 if (watch_mask & mask) {
309			struct inotify_handle *ih = watch->ih;
310 mutex_lock(&ih->mutex);
311 if (watch_mask & IN_ONESHOT)
312 remove_watch_no_event(watch, ih);
313 ih->in_ops->handle_event(watch, watch->wd, mask, cookie,
314 name, n_inode);
315 mutex_unlock(&ih->mutex);
316 }
317 }
318 mutex_unlock(&inode->inotify_mutex);
319}
320EXPORT_SYMBOL_GPL(inotify_inode_queue_event);
321
322/**
323 * inotify_dentry_parent_queue_event - queue an event to a dentry's parent
324 * @dentry: the dentry in question, we queue against this dentry's parent
325 * @mask: event mask describing this event
326 * @cookie: cookie for synchronization, or zero
327 * @name: filename, if any
328 */
329void inotify_dentry_parent_queue_event(struct dentry *dentry, u32 mask,
330 u32 cookie, const char *name)
331{
332 struct dentry *parent;
333 struct inode *inode;
334
335 if (!(dentry->d_flags & DCACHE_INOTIFY_PARENT_WATCHED))
336 return;
337
338 spin_lock(&dentry->d_lock);
339 parent = dentry->d_parent;
340 inode = parent->d_inode;
341
342 if (inotify_inode_watched(inode)) {
343 dget(parent);
344 spin_unlock(&dentry->d_lock);
345 inotify_inode_queue_event(inode, mask, cookie, name,
346 dentry->d_inode);
347 dput(parent);
348 } else
349 spin_unlock(&dentry->d_lock);
350}
351EXPORT_SYMBOL_GPL(inotify_dentry_parent_queue_event);
352
353/**
354 * inotify_get_cookie - return a unique cookie for use in synchronizing events.
355 */
356u32 inotify_get_cookie(void)
357{
358 return atomic_inc_return(&inotify_cookie);
359}
360EXPORT_SYMBOL_GPL(inotify_get_cookie);
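
The cookie exists so that the two halves of a rename can be paired up by consumers. A hedged sketch of the producer-side pattern (modelled on the fsnotify_move() helper; the variable names here are illustrative, not part of this file):

	/* Sketch: tying IN_MOVED_FROM/IN_MOVED_TO together for one rename */
	u32 cookie = inotify_get_cookie();

	inotify_inode_queue_event(old_dir, IN_MOVED_FROM, cookie,
				  old_name, source);
	inotify_inode_queue_event(new_dir, IN_MOVED_TO, cookie,
				  new_name, source);

Any consumer that sees both events can match them up on the shared cookie.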
361
362/**
363 * inotify_unmount_inodes - an sb is unmounting. handle any watched inodes.
364 * @list: list of inodes being unmounted (sb->s_inodes)
365 *
366 * Called with inode_lock held, protecting the unmounting super block's list
367 * of inodes, and with iprune_mutex held, keeping shrink_icache_memory() at bay.
368 * We temporarily drop inode_lock, however, and CAN block.
369 */
370void inotify_unmount_inodes(struct list_head *list)
371{
372 struct inode *inode, *next_i, *need_iput = NULL;
373
374 list_for_each_entry_safe(inode, next_i, list, i_sb_list) {
375 struct inotify_watch *watch, *next_w;
376 struct inode *need_iput_tmp;
377 struct list_head *watches;
378
379 /*
380 * We cannot __iget() an inode in state I_FREEING,
381 * I_WILL_FREE, or I_NEW which is fine because by that point
382 * the inode cannot have any associated watches.
383 */
384 if (inode->i_state & (I_FREEING|I_WILL_FREE|I_NEW))
385 continue;
386
387 /*
388 * If i_count is zero, the inode cannot have any watches and
389 * doing an __iget/iput with MS_ACTIVE clear would actually
390 * evict all inodes with zero i_count from icache which is
391 * unnecessarily violent and may in fact be illegal to do.
392 */
393 if (!atomic_read(&inode->i_count))
394 continue;
395
396 need_iput_tmp = need_iput;
397 need_iput = NULL;
398 /* In case inotify_remove_watch_locked() drops a reference. */
399 if (inode != need_iput_tmp)
400 __iget(inode);
401 else
402 need_iput_tmp = NULL;
403 /* In case the dropping of a reference would nuke next_i. */
404 if ((&next_i->i_sb_list != list) &&
405 atomic_read(&next_i->i_count) &&
406 !(next_i->i_state & (I_FREEING|I_WILL_FREE))) {
407 __iget(next_i);
408 need_iput = next_i;
409 }
410
411 /*
412 * We can safely drop inode_lock here because we hold
413 * references on both inode and next_i. Also no new inodes
414 * will be added since the umount has begun. Finally,
415 * iprune_mutex keeps shrink_icache_memory() away.
416 */
417 spin_unlock(&inode_lock);
418
419 if (need_iput_tmp)
420 iput(need_iput_tmp);
421
422 /* for each watch, send IN_UNMOUNT and then remove it */
423 mutex_lock(&inode->inotify_mutex);
424 watches = &inode->inotify_watches;
425 list_for_each_entry_safe(watch, next_w, watches, i_list) {
 426			struct inotify_handle *ih = watch->ih;
427 get_inotify_watch(watch);
428 mutex_lock(&ih->mutex);
429 ih->in_ops->handle_event(watch, watch->wd, IN_UNMOUNT, 0,
430 NULL, NULL);
431 inotify_remove_watch_locked(ih, watch);
432 mutex_unlock(&ih->mutex);
433 put_inotify_watch(watch);
434 }
435 mutex_unlock(&inode->inotify_mutex);
436 iput(inode);
437
438 spin_lock(&inode_lock);
439 }
440}
441EXPORT_SYMBOL_GPL(inotify_unmount_inodes);
442
443/**
444 * inotify_inode_is_dead - an inode has been deleted, cleanup any watches
445 * @inode: inode that is about to be removed
446 */
447void inotify_inode_is_dead(struct inode *inode)
448{
449 struct inotify_watch *watch, *next;
450
451 mutex_lock(&inode->inotify_mutex);
452 list_for_each_entry_safe(watch, next, &inode->inotify_watches, i_list) {
453 struct inotify_handle *ih = watch->ih;
454 mutex_lock(&ih->mutex);
455 inotify_remove_watch_locked(ih, watch);
456 mutex_unlock(&ih->mutex);
457 }
458 mutex_unlock(&inode->inotify_mutex);
459}
460EXPORT_SYMBOL_GPL(inotify_inode_is_dead);
461
462/* Kernel Consumer API */
463
464/**
465 * inotify_init - allocate and initialize an inotify instance
466 * @ops: caller's inotify operations
467 */
468struct inotify_handle *inotify_init(const struct inotify_operations *ops)
469{
470 struct inotify_handle *ih;
471
472 ih = kmalloc(sizeof(struct inotify_handle), GFP_KERNEL);
473 if (unlikely(!ih))
474 return ERR_PTR(-ENOMEM);
475
476 idr_init(&ih->idr);
477 INIT_LIST_HEAD(&ih->watches);
478 mutex_init(&ih->mutex);
479 ih->last_wd = 0;
480 ih->in_ops = ops;
481 atomic_set(&ih->count, 0);
482 get_inotify_handle(ih);
483
484 return ih;
485}
486EXPORT_SYMBOL_GPL(inotify_init);
487
488/**
489 * inotify_init_watch - initialize an inotify watch
490 * @watch: watch to initialize
491 */
492void inotify_init_watch(struct inotify_watch *watch)
493{
494 INIT_LIST_HEAD(&watch->h_list);
495 INIT_LIST_HEAD(&watch->i_list);
496 atomic_set(&watch->count, 0);
497 get_inotify_watch(watch); /* initial get */
498}
499EXPORT_SYMBOL_GPL(inotify_init_watch);
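
Putting the consumer API together, a minimal in-kernel user looks roughly like the sketch below. This is illustrative only: the my_* names are invented here, and it assumes struct inotify_operations carries the handle_event/destroy_watch callbacks that this file invokes through ih->in_ops.

	static void my_handle_event(struct inotify_watch *watch, u32 wd,
				    u32 mask, u32 cookie, const char *name,
				    struct inode *inode)
	{
		/* consume the event; called with ih->mutex held */
	}

	static void my_destroy_watch(struct inotify_watch *watch)
	{
		/* final reference dropped: free the structure embedding watch */
	}

	static const struct inotify_operations my_ops = {
		.handle_event	= my_handle_event,
		.destroy_watch	= my_destroy_watch,
	};

	/* once, at setup: */
	struct inotify_handle *ih = inotify_init(&my_ops);

	/* per object being watched (caller must pin the inode): */
	inotify_init_watch(&my_watch->watch);
	s32 wd = inotify_add_watch(ih, &my_watch->watch, inode,
				   IN_MODIFY | IN_DELETE_SELF);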
500
501/*
502 * Watch removals suck violently. To kick the watch out we need (in this
503 * order) inode->inotify_mutex and ih->mutex. That's fine if we have
504 * a hold on inode; however, for all other cases we need to make damn sure
505 * we don't race with umount. We can *NOT* just grab a reference to a
 506 * watch - inotify_unmount_inodes() will happily sail past it and we'll end
 507 * up with a reference to an inode potentially outliving its superblock. Ideally
 508 * we just want to grab an active reference to the superblock if we can; that
 509 * will make sure we won't go into inotify_unmount_inodes() until we are
510 * done. Cleanup is just deactivate_super(). However, that leaves a messy
511 * case - what if we *are* racing with umount() and active references to
512 * superblock can't be acquired anymore? We can bump ->s_count, grab
513 * ->s_umount, which will wait until the superblock is shut down and the
514 * watch in question is pining for fjords.
515 *
516 * And yes, this is far beyond mere "not very pretty"; so's the entire
517 * concept of inotify to start with.
518 */
519
520/**
521 * pin_to_kill - pin the watch down for removal
522 * @ih: inotify handle
523 * @watch: watch to kill
524 *
525 * Called with ih->mutex held, drops it. Possible return values:
526 * 0 - nothing to do, it has died
527 * 1 - remove it, drop the reference and deactivate_super()
528 */
529static int pin_to_kill(struct inotify_handle *ih, struct inotify_watch *watch)
530{
531 struct super_block *sb = watch->inode->i_sb;
532
533 if (atomic_inc_not_zero(&sb->s_active)) {
534 get_inotify_watch(watch);
535 mutex_unlock(&ih->mutex);
536 return 1; /* the best outcome */
537 }
538 spin_lock(&sb_lock);
539 sb->s_count++;
540 spin_unlock(&sb_lock);
541 mutex_unlock(&ih->mutex); /* can't grab ->s_umount under it */
542 down_read(&sb->s_umount);
543 /* fs is already shut down; the watch is dead */
544 drop_super(sb);
545 return 0;
546}
547
548static void unpin_and_kill(struct inotify_watch *watch)
549{
550 struct super_block *sb = watch->inode->i_sb;
551 put_inotify_watch(watch);
552 deactivate_super(sb);
553}
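
Every removal path brackets its work with this pair. The canonical shape, condensed from inotify_rm_wd() further down (a sketch, not a complete function):

	mutex_lock(&ih->mutex);
	watch = idr_find(&ih->idr, wd);
	/* error handling elided */
	if (!pin_to_kill(ih, watch))	/* drops ih->mutex either way */
		return 0;		/* fs already shut down; watch is dead */

	inode = watch->inode;		/* safe: superblock is pinned now */
	mutex_lock(&inode->inotify_mutex);
	mutex_lock(&ih->mutex);
	if (idr_find(&ih->idr, wd) == watch)	/* did we race with removal? */
		inotify_remove_watch_locked(ih, watch);
	mutex_unlock(&ih->mutex);
	mutex_unlock(&inode->inotify_mutex);
	unpin_and_kill(watch);		/* put the watch, deactivate_super() */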
554
555/**
556 * inotify_destroy - clean up and destroy an inotify instance
557 * @ih: inotify handle
558 */
559void inotify_destroy(struct inotify_handle *ih)
560{
561 /*
562 * Destroy all of the watches for this handle. Unfortunately, not very
563 * pretty. We cannot do a simple iteration over the list, because we
564 * do not know the inode until we iterate to the watch. But we need to
565 * hold inode->inotify_mutex before ih->mutex. The following works.
566 *
567 * AV: it had to become even uglier to start working ;-/
568 */
569 while (1) {
570 struct inotify_watch *watch;
571 struct list_head *watches;
572 struct super_block *sb;
573 struct inode *inode;
574
575 mutex_lock(&ih->mutex);
576 watches = &ih->watches;
577 if (list_empty(watches)) {
578 mutex_unlock(&ih->mutex);
579 break;
580 }
581 watch = list_first_entry(watches, struct inotify_watch, h_list);
582 sb = watch->inode->i_sb;
583 if (!pin_to_kill(ih, watch))
584 continue;
585
586 inode = watch->inode;
587 mutex_lock(&inode->inotify_mutex);
588 mutex_lock(&ih->mutex);
589
590 /* make sure we didn't race with another list removal */
591 if (likely(idr_find(&ih->idr, watch->wd))) {
592 remove_watch_no_event(watch, ih);
593 put_inotify_watch(watch);
594 }
595
596 mutex_unlock(&ih->mutex);
597 mutex_unlock(&inode->inotify_mutex);
598 unpin_and_kill(watch);
599 }
600
601 /* free this handle: the put matching the get in inotify_init() */
602 put_inotify_handle(ih);
603}
604EXPORT_SYMBOL_GPL(inotify_destroy);
605
606/**
607 * inotify_find_watch - find an existing watch for an (ih,inode) pair
608 * @ih: inotify handle
609 * @inode: inode to watch
610 * @watchp: pointer to existing inotify_watch
611 *
612 * Caller must pin given inode (via nameidata).
613 */
614s32 inotify_find_watch(struct inotify_handle *ih, struct inode *inode,
615 struct inotify_watch **watchp)
616{
617 struct inotify_watch *old;
618 int ret = -ENOENT;
619
620 mutex_lock(&inode->inotify_mutex);
621 mutex_lock(&ih->mutex);
622
623 old = inode_find_handle(inode, ih);
624 if (unlikely(old)) {
625 get_inotify_watch(old); /* caller must put watch */
626 *watchp = old;
627 ret = old->wd;
628 }
629
630 mutex_unlock(&ih->mutex);
631 mutex_unlock(&inode->inotify_mutex);
632
633 return ret;
634}
635EXPORT_SYMBOL_GPL(inotify_find_watch);
636
637/**
638 * inotify_find_update_watch - find and update the mask of an existing watch
639 * @ih: inotify handle
640 * @inode: inode's watch to update
641 * @mask: mask of events to watch
642 *
643 * Caller must pin given inode (via nameidata).
644 */
645s32 inotify_find_update_watch(struct inotify_handle *ih, struct inode *inode,
646 u32 mask)
647{
648 struct inotify_watch *old;
649 int mask_add = 0;
650 int ret;
651
652 if (mask & IN_MASK_ADD)
653 mask_add = 1;
654
655 /* don't allow invalid bits: we don't want flags set */
656 mask &= IN_ALL_EVENTS | IN_ONESHOT;
657 if (unlikely(!mask))
658 return -EINVAL;
659
660 mutex_lock(&inode->inotify_mutex);
661 mutex_lock(&ih->mutex);
662
663 /*
664 * Handle the case of re-adding a watch on an (inode,ih) pair that we
665 * are already watching. We just update the mask and return its wd.
666 */
667 old = inode_find_handle(inode, ih);
668 if (unlikely(!old)) {
669 ret = -ENOENT;
670 goto out;
671 }
672
673 if (mask_add)
674 old->mask |= mask;
675 else
676 old->mask = mask;
677 ret = old->wd;
678out:
679 mutex_unlock(&ih->mutex);
680 mutex_unlock(&inode->inotify_mutex);
681 return ret;
682}
683EXPORT_SYMBOL_GPL(inotify_find_update_watch);
684
685/**
686 * inotify_add_watch - add a watch to an inotify instance
687 * @ih: inotify handle
688 * @watch: caller allocated watch structure
689 * @inode: inode to watch
690 * @mask: mask of events to watch
691 *
692 * Caller must pin given inode (via nameidata).
693 * Caller must ensure it only calls inotify_add_watch() once per watch.
694 * Calls inotify_handle_get_wd() so may sleep.
695 */
696s32 inotify_add_watch(struct inotify_handle *ih, struct inotify_watch *watch,
697 struct inode *inode, u32 mask)
698{
699 int ret = 0;
700 int newly_watched;
701
702 /* don't allow invalid bits: we don't want flags set */
703 mask &= IN_ALL_EVENTS | IN_ONESHOT;
704 if (unlikely(!mask))
705 return -EINVAL;
706 watch->mask = mask;
707
708 mutex_lock(&inode->inotify_mutex);
709 mutex_lock(&ih->mutex);
710
711 /* Initialize a new watch */
712 ret = inotify_handle_get_wd(ih, watch);
713 if (unlikely(ret))
714 goto out;
715 ret = watch->wd;
716
717 /* save a reference to handle and bump the count to make it official */
718 get_inotify_handle(ih);
719 watch->ih = ih;
720
721 /*
722 * Save a reference to the inode and bump the ref count to make it
723 * official. We hold a reference to nameidata, which makes this safe.
724 */
725 watch->inode = igrab(inode);
726
727 /* Add the watch to the handle's and the inode's list */
728 newly_watched = !inotify_inode_watched(inode);
729 list_add(&watch->h_list, &ih->watches);
730 list_add(&watch->i_list, &inode->inotify_watches);
731 /*
732 * Set child flags _after_ adding the watch, so there is no race
 732	 * window where newly instantiated children could miss their parent's
734 * watched flag.
735 */
736 if (newly_watched)
737 set_dentry_child_flags(inode, 1);
738
739out:
740 mutex_unlock(&ih->mutex);
741 mutex_unlock(&inode->inotify_mutex);
742 return ret;
743}
744EXPORT_SYMBOL_GPL(inotify_add_watch);
745
746/**
747 * inotify_clone_watch - put the watch next to existing one
748 * @old: already installed watch
749 * @new: new watch
750 *
751 * Caller must hold the inotify_mutex of inode we are dealing with;
752 * it is expected to remove the old watch before unlocking the inode.
753 */
754s32 inotify_clone_watch(struct inotify_watch *old, struct inotify_watch *new)
755{
756 struct inotify_handle *ih = old->ih;
757 int ret = 0;
758
759 new->mask = old->mask;
760 new->ih = ih;
761
762 mutex_lock(&ih->mutex);
763
764 /* Initialize a new watch */
765 ret = inotify_handle_get_wd(ih, new);
766 if (unlikely(ret))
767 goto out;
768 ret = new->wd;
769
770 get_inotify_handle(ih);
771
772 new->inode = igrab(old->inode);
773
774 list_add(&new->h_list, &ih->watches);
775 list_add(&new->i_list, &old->inode->inotify_watches);
776out:
777 mutex_unlock(&ih->mutex);
778 return ret;
779}
780
781void inotify_evict_watch(struct inotify_watch *watch)
782{
783 get_inotify_watch(watch);
784 mutex_lock(&watch->ih->mutex);
785 inotify_remove_watch_locked(watch->ih, watch);
786 mutex_unlock(&watch->ih->mutex);
787}
788
789/**
790 * inotify_rm_wd - remove a watch from an inotify instance
791 * @ih: inotify handle
792 * @wd: watch descriptor to remove
793 *
794 * Can sleep.
795 */
796int inotify_rm_wd(struct inotify_handle *ih, u32 wd)
797{
798 struct inotify_watch *watch;
799 struct super_block *sb;
800 struct inode *inode;
801
802 mutex_lock(&ih->mutex);
803 watch = idr_find(&ih->idr, wd);
804 if (unlikely(!watch)) {
805 mutex_unlock(&ih->mutex);
806 return -EINVAL;
807 }
808 sb = watch->inode->i_sb;
809 if (!pin_to_kill(ih, watch))
810 return 0;
811
812 inode = watch->inode;
813
814 mutex_lock(&inode->inotify_mutex);
815 mutex_lock(&ih->mutex);
816
817 /* make sure that we did not race */
818 if (likely(idr_find(&ih->idr, wd) == watch))
819 inotify_remove_watch_locked(ih, watch);
820
821 mutex_unlock(&ih->mutex);
822 mutex_unlock(&inode->inotify_mutex);
823 unpin_and_kill(watch);
824
825 return 0;
826}
827EXPORT_SYMBOL_GPL(inotify_rm_wd);
828
829/**
830 * inotify_rm_watch - remove a watch from an inotify instance
831 * @ih: inotify handle
832 * @watch: watch to remove
833 *
834 * Can sleep.
835 */
836int inotify_rm_watch(struct inotify_handle *ih,
837 struct inotify_watch *watch)
838{
839 return inotify_rm_wd(ih, watch->wd);
840}
841EXPORT_SYMBOL_GPL(inotify_rm_watch);
842
843/*
844 * inotify_setup - core initialization function
845 */
846static int __init inotify_setup(void)
847{
848 BUILD_BUG_ON(IN_ACCESS != FS_ACCESS);
849 BUILD_BUG_ON(IN_MODIFY != FS_MODIFY);
850 BUILD_BUG_ON(IN_ATTRIB != FS_ATTRIB);
851 BUILD_BUG_ON(IN_CLOSE_WRITE != FS_CLOSE_WRITE);
852 BUILD_BUG_ON(IN_CLOSE_NOWRITE != FS_CLOSE_NOWRITE);
853 BUILD_BUG_ON(IN_OPEN != FS_OPEN);
854 BUILD_BUG_ON(IN_MOVED_FROM != FS_MOVED_FROM);
855 BUILD_BUG_ON(IN_MOVED_TO != FS_MOVED_TO);
856 BUILD_BUG_ON(IN_CREATE != FS_CREATE);
857 BUILD_BUG_ON(IN_DELETE != FS_DELETE);
858 BUILD_BUG_ON(IN_DELETE_SELF != FS_DELETE_SELF);
859 BUILD_BUG_ON(IN_MOVE_SELF != FS_MOVE_SELF);
860 BUILD_BUG_ON(IN_Q_OVERFLOW != FS_Q_OVERFLOW);
861
862 BUILD_BUG_ON(IN_UNMOUNT != FS_UNMOUNT);
863 BUILD_BUG_ON(IN_ISDIR != FS_IN_ISDIR);
864 BUILD_BUG_ON(IN_IGNORED != FS_IN_IGNORED);
865 BUILD_BUG_ON(IN_ONESHOT != FS_IN_ONESHOT);
866
867 atomic_set(&inotify_cookie, 0);
868
869 return 0;
870}
871
872module_init(inotify_setup);
diff --git a/fs/notify/inotify/inotify.h b/fs/notify/inotify/inotify.h
index f234f3a4c8ca..b6642e4de4bf 100644
--- a/fs/notify/inotify/inotify.h
+++ b/fs/notify/inotify/inotify.h
@@ -9,13 +9,12 @@ struct inotify_event_private_data {
9 int wd; 9 int wd;
10}; 10};
11 11
12struct inotify_inode_mark_entry { 12struct inotify_inode_mark {
13 /* fsnotify_mark_entry MUST be the first thing */ 13 struct fsnotify_mark fsn_mark;
14 struct fsnotify_mark_entry fsn_entry;
15 int wd; 14 int wd;
16}; 15};
17 16
18extern void inotify_ignored_and_remove_idr(struct fsnotify_mark_entry *entry, 17extern void inotify_ignored_and_remove_idr(struct fsnotify_mark *fsn_mark,
19 struct fsnotify_group *group); 18 struct fsnotify_group *group);
20extern void inotify_free_event_priv(struct fsnotify_event_private_data *event_priv); 19extern void inotify_free_event_priv(struct fsnotify_event_private_data *event_priv);
21 20
diff --git a/fs/notify/inotify/inotify_fsnotify.c b/fs/notify/inotify/inotify_fsnotify.c
index e27960cd76ab..5e73eeb2c697 100644
--- a/fs/notify/inotify/inotify_fsnotify.c
+++ b/fs/notify/inotify/inotify_fsnotify.c
@@ -22,6 +22,7 @@
22 * General Public License for more details. 22 * General Public License for more details.
23 */ 23 */
24 24
25#include <linux/dcache.h> /* d_unlinked */
25#include <linux/fs.h> /* struct inode */ 26#include <linux/fs.h> /* struct inode */
26#include <linux/fsnotify_backend.h> 27#include <linux/fsnotify_backend.h>
27#include <linux/inotify.h> 28#include <linux/inotify.h>
@@ -32,26 +33,84 @@
32 33
33#include "inotify.h" 34#include "inotify.h"
34 35
35static int inotify_handle_event(struct fsnotify_group *group, struct fsnotify_event *event) 36/*
37 * Check if 2 events contain the same information. We do not compare private data
 38 * but at this moment that isn't a problem for any known fsnotify listeners.
39 */
40static bool event_compare(struct fsnotify_event *old, struct fsnotify_event *new)
41{
42 if ((old->mask == new->mask) &&
43 (old->to_tell == new->to_tell) &&
44 (old->data_type == new->data_type) &&
45 (old->name_len == new->name_len)) {
46 switch (old->data_type) {
47 case (FSNOTIFY_EVENT_INODE):
48 /* remember, after old was put on the wait_q we aren't
 49			 * allowed to look at the inode any more; the only thing
 50			 * left to check is whether the file_name is the same */
51 if (!old->name_len ||
52 !strcmp(old->file_name, new->file_name))
53 return true;
54 break;
55 case (FSNOTIFY_EVENT_FILE):
56 if ((old->file->f_path.mnt == new->file->f_path.mnt) &&
57 (old->file->f_path.dentry == new->file->f_path.dentry))
58 return true;
59 break;
60 case (FSNOTIFY_EVENT_NONE):
61 if (old->mask & FS_Q_OVERFLOW)
62 return true;
63 else if (old->mask & FS_IN_IGNORED)
64 return false;
65 return true;
66 };
67 }
68 return false;
69}
70
71static struct fsnotify_event *inotify_merge(struct list_head *list,
72 struct fsnotify_event *event)
36{ 73{
37 struct fsnotify_mark_entry *entry; 74 struct fsnotify_event_holder *last_holder;
38 struct inotify_inode_mark_entry *ientry; 75 struct fsnotify_event *last_event;
76
77 /* and the list better be locked by something too */
78 spin_lock(&event->lock);
79
80 last_holder = list_entry(list->prev, struct fsnotify_event_holder, event_list);
81 last_event = last_holder->event;
82 if (event_compare(last_event, event))
83 fsnotify_get_event(last_event);
84 else
85 last_event = NULL;
86
87 spin_unlock(&event->lock);
88
89 return last_event;
90}
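
The visible effect is tail coalescing: if the event at the back of the queue compares equal, inotify_merge() returns it and no new entry is queued. A hedged userspace illustration of the consequence (watch_fd and "file" are assumed):

	/* assumes watch_fd is an inotify fd watching "file" for IN_MODIFY */
	int fd = open("file", O_WRONLY);
	write(fd, "x", 1);
	write(fd, "x", 1);	/* identical event merges with the queued one */
	/* reading watch_fd now may yield a single IN_MODIFY, not two */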
91
92static int inotify_handle_event(struct fsnotify_group *group,
93 struct fsnotify_mark *inode_mark,
94 struct fsnotify_mark *vfsmount_mark,
95 struct fsnotify_event *event)
96{
97 struct inotify_inode_mark *i_mark;
39 struct inode *to_tell; 98 struct inode *to_tell;
40 struct inotify_event_private_data *event_priv; 99 struct inotify_event_private_data *event_priv;
41 struct fsnotify_event_private_data *fsn_event_priv; 100 struct fsnotify_event_private_data *fsn_event_priv;
42 int wd, ret; 101 struct fsnotify_event *added_event;
102 int wd, ret = 0;
103
104 BUG_ON(vfsmount_mark);
105
106 pr_debug("%s: group=%p event=%p to_tell=%p mask=%x\n", __func__, group,
107 event, event->to_tell, event->mask);
43 108
44 to_tell = event->to_tell; 109 to_tell = event->to_tell;
45 110
46 spin_lock(&to_tell->i_lock); 111 i_mark = container_of(inode_mark, struct inotify_inode_mark,
47 entry = fsnotify_find_mark_entry(group, to_tell); 112 fsn_mark);
48 spin_unlock(&to_tell->i_lock); 113 wd = i_mark->wd;
 49	/* race with watch removal? We already passed should_send */
50 if (unlikely(!entry))
51 return 0;
52 ientry = container_of(entry, struct inotify_inode_mark_entry,
53 fsn_entry);
54 wd = ientry->wd;
55 114
56 event_priv = kmem_cache_alloc(event_priv_cachep, GFP_KERNEL); 115 event_priv = kmem_cache_alloc(event_priv_cachep, GFP_KERNEL);
57 if (unlikely(!event_priv)) 116 if (unlikely(!event_priv))
@@ -62,48 +121,40 @@ static int inotify_handle_event(struct fsnotify_group *group, struct fsnotify_ev
62 fsn_event_priv->group = group; 121 fsn_event_priv->group = group;
63 event_priv->wd = wd; 122 event_priv->wd = wd;
64 123
65 ret = fsnotify_add_notify_event(group, event, fsn_event_priv); 124 added_event = fsnotify_add_notify_event(group, event, fsn_event_priv, inotify_merge);
66 if (ret) { 125 if (added_event) {
67 inotify_free_event_priv(fsn_event_priv); 126 inotify_free_event_priv(fsn_event_priv);
68 /* EEXIST says we tail matched, EOVERFLOW isn't something 127 if (!IS_ERR(added_event))
69 * to report up the stack. */ 128 fsnotify_put_event(added_event);
70 if ((ret == -EEXIST) || 129 else
71 (ret == -EOVERFLOW)) 130 ret = PTR_ERR(added_event);
72 ret = 0;
73 } 131 }
74 132
75 /* 133 if (inode_mark->mask & IN_ONESHOT)
76 * If we hold the entry until after the event is on the queue 134 fsnotify_destroy_mark(inode_mark);
77 * IN_IGNORED won't be able to pass this event in the queue
78 */
79 fsnotify_put_mark(entry);
80 135
81 return ret; 136 return ret;
82} 137}
83 138
84static void inotify_freeing_mark(struct fsnotify_mark_entry *entry, struct fsnotify_group *group) 139static void inotify_freeing_mark(struct fsnotify_mark *fsn_mark, struct fsnotify_group *group)
85{ 140{
86 inotify_ignored_and_remove_idr(entry, group); 141 inotify_ignored_and_remove_idr(fsn_mark, group);
87} 142}
88 143
89static bool inotify_should_send_event(struct fsnotify_group *group, struct inode *inode, __u32 mask) 144static bool inotify_should_send_event(struct fsnotify_group *group, struct inode *inode,
145 struct fsnotify_mark *inode_mark,
146 struct fsnotify_mark *vfsmount_mark,
147 __u32 mask, void *data, int data_type)
90{ 148{
91 struct fsnotify_mark_entry *entry; 149 if ((inode_mark->mask & FS_EXCL_UNLINK) &&
92 bool send; 150 (data_type == FSNOTIFY_EVENT_FILE)) {
93 151 struct file *file = data;
94 spin_lock(&inode->i_lock);
95 entry = fsnotify_find_mark_entry(group, inode);
96 spin_unlock(&inode->i_lock);
97 if (!entry)
98 return false;
99 152
100 mask = (mask & ~FS_EVENT_ON_CHILD); 153 if (d_unlinked(file->f_path.dentry))
101 send = (entry->mask & mask); 154 return false;
102 155 }
103 /* find took a reference */
104 fsnotify_put_mark(entry);
105 156
106 return send; 157 return true;
107} 158}
108 159
109/* 160/*
@@ -115,18 +166,18 @@ static bool inotify_should_send_event(struct fsnotify_group *group, struct inode
115 */ 166 */
116static int idr_callback(int id, void *p, void *data) 167static int idr_callback(int id, void *p, void *data)
117{ 168{
118 struct fsnotify_mark_entry *entry; 169 struct fsnotify_mark *fsn_mark;
119 struct inotify_inode_mark_entry *ientry; 170 struct inotify_inode_mark *i_mark;
120 static bool warned = false; 171 static bool warned = false;
121 172
122 if (warned) 173 if (warned)
123 return 0; 174 return 0;
124 175
125 warned = true; 176 warned = true;
126 entry = p; 177 fsn_mark = p;
127 ientry = container_of(entry, struct inotify_inode_mark_entry, fsn_entry); 178 i_mark = container_of(fsn_mark, struct inotify_inode_mark, fsn_mark);
128 179
129 WARN(1, "inotify closing but id=%d for entry=%p in group=%p still in " 180 WARN(1, "inotify closing but id=%d for fsn_mark=%p in group=%p still in "
130 "idr. Probably leaking memory\n", id, p, data); 181 "idr. Probably leaking memory\n", id, p, data);
131 182
132 /* 183 /*
@@ -135,9 +186,9 @@ static int idr_callback(int id, void *p, void *data)
135 * out why we got here and the panic is no worse than the original 186 * out why we got here and the panic is no worse than the original
136 * BUG() that was here. 187 * BUG() that was here.
137 */ 188 */
138 if (entry) 189 if (fsn_mark)
139 printk(KERN_WARNING "entry->group=%p inode=%p wd=%d\n", 190 printk(KERN_WARNING "fsn_mark->group=%p inode=%p wd=%d\n",
140 entry->group, entry->inode, ientry->wd); 191 fsn_mark->group, fsn_mark->i.inode, i_mark->wd);
141 return 0; 192 return 0;
142} 193}
143 194
diff --git a/fs/notify/inotify/inotify_user.c b/fs/notify/inotify/inotify_user.c
index e46ca685b9be..bf7f6d776c31 100644
--- a/fs/notify/inotify/inotify_user.c
+++ b/fs/notify/inotify/inotify_user.c
@@ -46,17 +46,11 @@
46/* these are configurable via /proc/sys/fs/inotify/ */ 46/* these are configurable via /proc/sys/fs/inotify/ */
47static int inotify_max_user_instances __read_mostly; 47static int inotify_max_user_instances __read_mostly;
48static int inotify_max_queued_events __read_mostly; 48static int inotify_max_queued_events __read_mostly;
49int inotify_max_user_watches __read_mostly; 49static int inotify_max_user_watches __read_mostly;
50 50
51static struct kmem_cache *inotify_inode_mark_cachep __read_mostly; 51static struct kmem_cache *inotify_inode_mark_cachep __read_mostly;
52struct kmem_cache *event_priv_cachep __read_mostly; 52struct kmem_cache *event_priv_cachep __read_mostly;
53 53
54/*
55 * When inotify registers a new group it increments this and uses that
56 * value as an offset to set the fsnotify group "name" and priority.
57 */
58static atomic_t inotify_grp_num;
59
60#ifdef CONFIG_SYSCTL 54#ifdef CONFIG_SYSCTL
61 55
62#include <linux/sysctl.h> 56#include <linux/sysctl.h>
@@ -96,11 +90,14 @@ static inline __u32 inotify_arg_to_mask(u32 arg)
96{ 90{
97 __u32 mask; 91 __u32 mask;
98 92
99 /* everything should accept their own ignored and cares about children */ 93 /*
100 mask = (FS_IN_IGNORED | FS_EVENT_ON_CHILD); 94 * everything should accept their own ignored, cares about children,
95 * and should receive events when the inode is unmounted
96 */
97 mask = (FS_IN_IGNORED | FS_EVENT_ON_CHILD | FS_UNMOUNT);
101 98
102 /* mask off the flags used to open the fd */ 99 /* mask off the flags used to open the fd */
103 mask |= (arg & (IN_ALL_EVENTS | IN_ONESHOT)); 100 mask |= (arg & (IN_ALL_EVENTS | IN_ONESHOT | IN_EXCL_UNLINK));
104 101
105 return mask; 102 return mask;
106} 103}
@@ -144,6 +141,8 @@ static struct fsnotify_event *get_one_event(struct fsnotify_group *group,
144 141
145 event = fsnotify_peek_notify_event(group); 142 event = fsnotify_peek_notify_event(group);
146 143
144 pr_debug("%s: group=%p event=%p\n", __func__, group, event);
145
147 if (event->name_len) 146 if (event->name_len)
148 event_size += roundup(event->name_len + 1, event_size); 147 event_size += roundup(event->name_len + 1, event_size);
149 148
@@ -173,6 +172,8 @@ static ssize_t copy_event_to_user(struct fsnotify_group *group,
173 size_t event_size = sizeof(struct inotify_event); 172 size_t event_size = sizeof(struct inotify_event);
174 size_t name_len = 0; 173 size_t name_len = 0;
175 174
175 pr_debug("%s: group=%p event=%p\n", __func__, group, event);
176
176 /* we get the inotify watch descriptor from the event private data */ 177 /* we get the inotify watch descriptor from the event private data */
177 spin_lock(&event->lock); 178 spin_lock(&event->lock);
178 fsn_priv = fsnotify_remove_priv_from_event(group, event); 179 fsn_priv = fsnotify_remove_priv_from_event(group, event);
@@ -245,6 +246,8 @@ static ssize_t inotify_read(struct file *file, char __user *buf,
245 kevent = get_one_event(group, count); 246 kevent = get_one_event(group, count);
246 mutex_unlock(&group->notification_mutex); 247 mutex_unlock(&group->notification_mutex);
247 248
249 pr_debug("%s: group=%p kevent=%p\n", __func__, group, kevent);
250
248 if (kevent) { 251 if (kevent) {
249 ret = PTR_ERR(kevent); 252 ret = PTR_ERR(kevent);
250 if (IS_ERR(kevent)) 253 if (IS_ERR(kevent))
@@ -289,6 +292,8 @@ static int inotify_release(struct inode *ignored, struct file *file)
289 struct fsnotify_group *group = file->private_data; 292 struct fsnotify_group *group = file->private_data;
290 struct user_struct *user = group->inotify_data.user; 293 struct user_struct *user = group->inotify_data.user;
291 294
295 pr_debug("%s: group=%p\n", __func__, group);
296
292 fsnotify_clear_marks_by_group(group); 297 fsnotify_clear_marks_by_group(group);
293 298
294 /* free this group, matching get was inotify_init->fsnotify_obtain_group */ 299 /* free this group, matching get was inotify_init->fsnotify_obtain_group */
@@ -312,6 +317,8 @@ static long inotify_ioctl(struct file *file, unsigned int cmd,
312 group = file->private_data; 317 group = file->private_data;
313 p = (void __user *) arg; 318 p = (void __user *) arg;
314 319
320 pr_debug("%s: group=%p cmd=%u\n", __func__, group, cmd);
321
315 switch (cmd) { 322 switch (cmd) {
316 case FIONREAD: 323 case FIONREAD:
317 mutex_lock(&group->notification_mutex); 324 mutex_lock(&group->notification_mutex);
@@ -357,59 +364,159 @@ static int inotify_find_inode(const char __user *dirname, struct path *path, uns
357 return error; 364 return error;
358} 365}
359 366
367static int inotify_add_to_idr(struct idr *idr, spinlock_t *idr_lock,
368 int *last_wd,
369 struct inotify_inode_mark *i_mark)
370{
371 int ret;
372
373 do {
374 if (unlikely(!idr_pre_get(idr, GFP_KERNEL)))
375 return -ENOMEM;
376
377 spin_lock(idr_lock);
378 ret = idr_get_new_above(idr, i_mark, *last_wd + 1,
379 &i_mark->wd);
380 /* we added the mark to the idr, take a reference */
381 if (!ret) {
382 *last_wd = i_mark->wd;
383 fsnotify_get_mark(&i_mark->fsn_mark);
384 }
385 spin_unlock(idr_lock);
386 } while (ret == -EAGAIN);
387
388 return ret;
389}
390
391static struct inotify_inode_mark *inotify_idr_find_locked(struct fsnotify_group *group,
392 int wd)
393{
394 struct idr *idr = &group->inotify_data.idr;
395 spinlock_t *idr_lock = &group->inotify_data.idr_lock;
396 struct inotify_inode_mark *i_mark;
397
398 assert_spin_locked(idr_lock);
399
400 i_mark = idr_find(idr, wd);
401 if (i_mark) {
402 struct fsnotify_mark *fsn_mark = &i_mark->fsn_mark;
403
404 fsnotify_get_mark(fsn_mark);
405 /* One ref for being in the idr, one ref we just took */
406 BUG_ON(atomic_read(&fsn_mark->refcnt) < 2);
407 }
408
409 return i_mark;
410}
411
412static struct inotify_inode_mark *inotify_idr_find(struct fsnotify_group *group,
413 int wd)
414{
415 struct inotify_inode_mark *i_mark;
416 spinlock_t *idr_lock = &group->inotify_data.idr_lock;
417
418 spin_lock(idr_lock);
419 i_mark = inotify_idr_find_locked(group, wd);
420 spin_unlock(idr_lock);
421
422 return i_mark;
423}
424
425static void do_inotify_remove_from_idr(struct fsnotify_group *group,
426 struct inotify_inode_mark *i_mark)
427{
428 struct idr *idr = &group->inotify_data.idr;
429 spinlock_t *idr_lock = &group->inotify_data.idr_lock;
430 int wd = i_mark->wd;
431
432 assert_spin_locked(idr_lock);
433
434 idr_remove(idr, wd);
435
436 /* removed from the idr, drop that ref */
437 fsnotify_put_mark(&i_mark->fsn_mark);
438}
439
360/* 440/*
361 * Remove the mark from the idr (if present) and drop the reference 441 * Remove the mark from the idr (if present) and drop the reference
362 * on the mark because it was in the idr. 442 * on the mark because it was in the idr.
363 */ 443 */
364static void inotify_remove_from_idr(struct fsnotify_group *group, 444static void inotify_remove_from_idr(struct fsnotify_group *group,
365 struct inotify_inode_mark_entry *ientry) 445 struct inotify_inode_mark *i_mark)
366{ 446{
367 struct idr *idr; 447 spinlock_t *idr_lock = &group->inotify_data.idr_lock;
368 struct fsnotify_mark_entry *entry; 448 struct inotify_inode_mark *found_i_mark = NULL;
369 struct inotify_inode_mark_entry *found_ientry;
370 int wd; 449 int wd;
371 450
372 spin_lock(&group->inotify_data.idr_lock); 451 spin_lock(idr_lock);
373 idr = &group->inotify_data.idr; 452 wd = i_mark->wd;
374 wd = ientry->wd;
375 453
376 if (wd == -1) 454 /*
455 * does this i_mark think it is in the idr? we shouldn't get called
456 * if it wasn't....
457 */
458 if (wd == -1) {
459 WARN_ONCE(1, "%s: i_mark=%p i_mark->wd=%d i_mark->group=%p"
460 " i_mark->inode=%p\n", __func__, i_mark, i_mark->wd,
461 i_mark->fsn_mark.group, i_mark->fsn_mark.i.inode);
377 goto out; 462 goto out;
463 }
378 464
379 entry = idr_find(&group->inotify_data.idr, wd); 465 /* Lets look in the idr to see if we find it */
380 if (unlikely(!entry)) 466 found_i_mark = inotify_idr_find_locked(group, wd);
467 if (unlikely(!found_i_mark)) {
468 WARN_ONCE(1, "%s: i_mark=%p i_mark->wd=%d i_mark->group=%p"
469 " i_mark->inode=%p\n", __func__, i_mark, i_mark->wd,
470 i_mark->fsn_mark.group, i_mark->fsn_mark.i.inode);
381 goto out; 471 goto out;
472 }
382 473
383 found_ientry = container_of(entry, struct inotify_inode_mark_entry, fsn_entry); 474 /*
384 if (unlikely(found_ientry != ientry)) { 475 * We found an mark in the idr at the right wd, but it's
385 /* We found an entry in the idr with the right wd, but it's 476 * not the mark we were told to remove. eparis seriously
386 * not the entry we were told to remove. eparis seriously 477 * fucked up somewhere.
387 * fucked up somewhere. */ 478 */
388 WARN_ON(1); 479 if (unlikely(found_i_mark != i_mark)) {
389 ientry->wd = -1; 480 WARN_ONCE(1, "%s: i_mark=%p i_mark->wd=%d i_mark->group=%p "
481 "mark->inode=%p found_i_mark=%p found_i_mark->wd=%d "
482 "found_i_mark->group=%p found_i_mark->inode=%p\n",
483 __func__, i_mark, i_mark->wd, i_mark->fsn_mark.group,
484 i_mark->fsn_mark.i.inode, found_i_mark, found_i_mark->wd,
485 found_i_mark->fsn_mark.group,
486 found_i_mark->fsn_mark.i.inode);
390 goto out; 487 goto out;
391 } 488 }
392 489
393 /* One ref for being in the idr, one ref held by the caller */ 490 /*
394 BUG_ON(atomic_read(&entry->refcnt) < 2); 491 * One ref for being in the idr
395 492 * one ref held by the caller trying to kill us
396 idr_remove(idr, wd); 493 * one ref grabbed by inotify_idr_find
397 ientry->wd = -1; 494 */
495 if (unlikely(atomic_read(&i_mark->fsn_mark.refcnt) < 3)) {
496 printk(KERN_ERR "%s: i_mark=%p i_mark->wd=%d i_mark->group=%p"
497 " i_mark->inode=%p\n", __func__, i_mark, i_mark->wd,
498 i_mark->fsn_mark.group, i_mark->fsn_mark.i.inode);
499 /* we can't really recover with bad ref cnting.. */
500 BUG();
501 }
398 502
399 /* removed from the idr, drop that ref */ 503 do_inotify_remove_from_idr(group, i_mark);
400 fsnotify_put_mark(entry);
401out: 504out:
402 spin_unlock(&group->inotify_data.idr_lock); 505 /* match the ref taken by inotify_idr_find_locked() */
506 if (found_i_mark)
507 fsnotify_put_mark(&found_i_mark->fsn_mark);
508 i_mark->wd = -1;
509 spin_unlock(idr_lock);
403} 510}
404 511
405/* 512/*
406 * Send IN_IGNORED for this wd, remove this wd from the idr. 513 * Send IN_IGNORED for this wd, remove this wd from the idr.
407 */ 514 */
408void inotify_ignored_and_remove_idr(struct fsnotify_mark_entry *entry, 515void inotify_ignored_and_remove_idr(struct fsnotify_mark *fsn_mark,
409 struct fsnotify_group *group) 516 struct fsnotify_group *group)
410{ 517{
411 struct inotify_inode_mark_entry *ientry; 518 struct inotify_inode_mark *i_mark;
412 struct fsnotify_event *ignored_event; 519 struct fsnotify_event *ignored_event, *notify_event;
413 struct inotify_event_private_data *event_priv; 520 struct inotify_event_private_data *event_priv;
414 struct fsnotify_event_private_data *fsn_event_priv; 521 struct fsnotify_event_private_data *fsn_event_priv;
415 int ret; 522 int ret;
@@ -420,7 +527,7 @@ void inotify_ignored_and_remove_idr(struct fsnotify_mark_entry *entry,
420 if (!ignored_event) 527 if (!ignored_event)
421 return; 528 return;
422 529
423 ientry = container_of(entry, struct inotify_inode_mark_entry, fsn_entry); 530 i_mark = container_of(fsn_mark, struct inotify_inode_mark, fsn_mark);
424 531
425 event_priv = kmem_cache_alloc(event_priv_cachep, GFP_NOFS); 532 event_priv = kmem_cache_alloc(event_priv_cachep, GFP_NOFS);
426 if (unlikely(!event_priv)) 533 if (unlikely(!event_priv))
@@ -429,37 +536,44 @@ void inotify_ignored_and_remove_idr(struct fsnotify_mark_entry *entry,
429 fsn_event_priv = &event_priv->fsnotify_event_priv_data; 536 fsn_event_priv = &event_priv->fsnotify_event_priv_data;
430 537
431 fsn_event_priv->group = group; 538 fsn_event_priv->group = group;
432 event_priv->wd = ientry->wd; 539 event_priv->wd = i_mark->wd;
433 540
434 ret = fsnotify_add_notify_event(group, ignored_event, fsn_event_priv); 541 notify_event = fsnotify_add_notify_event(group, ignored_event, fsn_event_priv, NULL);
435 if (ret) 542 if (notify_event) {
543 if (IS_ERR(notify_event))
544 ret = PTR_ERR(notify_event);
545 else
546 fsnotify_put_event(notify_event);
436 inotify_free_event_priv(fsn_event_priv); 547 inotify_free_event_priv(fsn_event_priv);
548 }
437 549
438skip_send_ignore: 550skip_send_ignore:
439 551
440 /* matches the reference taken when the event was created */ 552 /* matches the reference taken when the event was created */
441 fsnotify_put_event(ignored_event); 553 fsnotify_put_event(ignored_event);
442 554
443 /* remove this entry from the idr */ 555 /* remove this mark from the idr */
444 inotify_remove_from_idr(group, ientry); 556 inotify_remove_from_idr(group, i_mark);
445 557
446 atomic_dec(&group->inotify_data.user->inotify_watches); 558 atomic_dec(&group->inotify_data.user->inotify_watches);
447} 559}
448 560
449/* ding dong the mark is dead */ 561/* ding dong the mark is dead */
450static void inotify_free_mark(struct fsnotify_mark_entry *entry) 562static void inotify_free_mark(struct fsnotify_mark *fsn_mark)
451{ 563{
452 struct inotify_inode_mark_entry *ientry = (struct inotify_inode_mark_entry *)entry; 564 struct inotify_inode_mark *i_mark;
565
566 i_mark = container_of(fsn_mark, struct inotify_inode_mark, fsn_mark);
453 567
454 kmem_cache_free(inotify_inode_mark_cachep, ientry); 568 kmem_cache_free(inotify_inode_mark_cachep, i_mark);
455} 569}
456 570
457static int inotify_update_existing_watch(struct fsnotify_group *group, 571static int inotify_update_existing_watch(struct fsnotify_group *group,
458 struct inode *inode, 572 struct inode *inode,
459 u32 arg) 573 u32 arg)
460{ 574{
461 struct fsnotify_mark_entry *entry; 575 struct fsnotify_mark *fsn_mark;
462 struct inotify_inode_mark_entry *ientry; 576 struct inotify_inode_mark *i_mark;
463 __u32 old_mask, new_mask; 577 __u32 old_mask, new_mask;
464 __u32 mask; 578 __u32 mask;
465 int add = (arg & IN_MASK_ADD); 579 int add = (arg & IN_MASK_ADD);
@@ -467,52 +581,43 @@ static int inotify_update_existing_watch(struct fsnotify_group *group,
467 581
468 /* don't allow invalid bits: we don't want flags set */ 582 /* don't allow invalid bits: we don't want flags set */
469 mask = inotify_arg_to_mask(arg); 583 mask = inotify_arg_to_mask(arg);
470 if (unlikely(!mask)) 584 if (unlikely(!(mask & IN_ALL_EVENTS)))
471 return -EINVAL; 585 return -EINVAL;
472 586
473 spin_lock(&inode->i_lock); 587 fsn_mark = fsnotify_find_inode_mark(group, inode);
474 entry = fsnotify_find_mark_entry(group, inode); 588 if (!fsn_mark)
475 spin_unlock(&inode->i_lock);
476 if (!entry)
477 return -ENOENT; 589 return -ENOENT;
478 590
479 ientry = container_of(entry, struct inotify_inode_mark_entry, fsn_entry); 591 i_mark = container_of(fsn_mark, struct inotify_inode_mark, fsn_mark);
480 592
481 spin_lock(&entry->lock); 593 spin_lock(&fsn_mark->lock);
482 594
483 old_mask = entry->mask; 595 old_mask = fsn_mark->mask;
484 if (add) { 596 if (add)
485 entry->mask |= mask; 597 fsnotify_set_mark_mask_locked(fsn_mark, (fsn_mark->mask | mask));
486 new_mask = entry->mask; 598 else
487 } else { 599 fsnotify_set_mark_mask_locked(fsn_mark, mask);
488 entry->mask = mask; 600 new_mask = fsn_mark->mask;
489 new_mask = entry->mask;
490 }
491 601
492 spin_unlock(&entry->lock); 602 spin_unlock(&fsn_mark->lock);
493 603
494 if (old_mask != new_mask) { 604 if (old_mask != new_mask) {
495 /* more bits in old than in new? */ 605 /* more bits in old than in new? */
496 int dropped = (old_mask & ~new_mask); 606 int dropped = (old_mask & ~new_mask);
497 /* more bits in this entry than the inode's mask? */ 607 /* more bits in this fsn_mark than the inode's mask? */
498 int do_inode = (new_mask & ~inode->i_fsnotify_mask); 608 int do_inode = (new_mask & ~inode->i_fsnotify_mask);
499 /* more bits in this entry than the group? */
500 int do_group = (new_mask & ~group->mask);
501 609
502 /* update the inode with this new entry */ 610 /* update the inode with this new fsn_mark */
503 if (dropped || do_inode) 611 if (dropped || do_inode)
504 fsnotify_recalc_inode_mask(inode); 612 fsnotify_recalc_inode_mask(inode);
505 613
506 /* update the group mask with the new mask */
507 if (dropped || do_group)
508 fsnotify_recalc_group_mask(group);
509 } 614 }
510 615
511 /* return the wd */ 616 /* return the wd */
512 ret = ientry->wd; 617 ret = i_mark->wd;
513 618
514 /* match the get from fsnotify_find_mark_entry() */ 619 /* match the get from fsnotify_find_mark() */
515 fsnotify_put_mark(entry); 620 fsnotify_put_mark(fsn_mark);
516 621
517 return ret; 622 return ret;
518} 623}
@@ -521,73 +626,51 @@ static int inotify_new_watch(struct fsnotify_group *group,
521 struct inode *inode, 626 struct inode *inode,
522 u32 arg) 627 u32 arg)
523{ 628{
524 struct inotify_inode_mark_entry *tmp_ientry; 629 struct inotify_inode_mark *tmp_i_mark;
525 __u32 mask; 630 __u32 mask;
526 int ret; 631 int ret;
632 struct idr *idr = &group->inotify_data.idr;
633 spinlock_t *idr_lock = &group->inotify_data.idr_lock;
527 634
528 /* don't allow invalid bits: we don't want flags set */ 635 /* don't allow invalid bits: we don't want flags set */
529 mask = inotify_arg_to_mask(arg); 636 mask = inotify_arg_to_mask(arg);
530 if (unlikely(!mask)) 637 if (unlikely(!(mask & IN_ALL_EVENTS)))
531 return -EINVAL; 638 return -EINVAL;
532 639
533 tmp_ientry = kmem_cache_alloc(inotify_inode_mark_cachep, GFP_KERNEL); 640 tmp_i_mark = kmem_cache_alloc(inotify_inode_mark_cachep, GFP_KERNEL);
534 if (unlikely(!tmp_ientry)) 641 if (unlikely(!tmp_i_mark))
535 return -ENOMEM; 642 return -ENOMEM;
536 643
537 fsnotify_init_mark(&tmp_ientry->fsn_entry, inotify_free_mark); 644 fsnotify_init_mark(&tmp_i_mark->fsn_mark, inotify_free_mark);
538 tmp_ientry->fsn_entry.mask = mask; 645 tmp_i_mark->fsn_mark.mask = mask;
539 tmp_ientry->wd = -1; 646 tmp_i_mark->wd = -1;
540 647
541 ret = -ENOSPC; 648 ret = -ENOSPC;
542 if (atomic_read(&group->inotify_data.user->inotify_watches) >= inotify_max_user_watches) 649 if (atomic_read(&group->inotify_data.user->inotify_watches) >= inotify_max_user_watches)
543 goto out_err; 650 goto out_err;
544retry:
545 ret = -ENOMEM;
546 if (unlikely(!idr_pre_get(&group->inotify_data.idr, GFP_KERNEL)))
547 goto out_err;
548 651
549 /* we are putting the mark on the idr, take a reference */ 652 ret = inotify_add_to_idr(idr, idr_lock, &group->inotify_data.last_wd,
550 fsnotify_get_mark(&tmp_ientry->fsn_entry); 653 tmp_i_mark);
551 654 if (ret)
552 spin_lock(&group->inotify_data.idr_lock);
553 ret = idr_get_new_above(&group->inotify_data.idr, &tmp_ientry->fsn_entry,
554 group->inotify_data.last_wd+1,
555 &tmp_ientry->wd);
556 spin_unlock(&group->inotify_data.idr_lock);
557 if (ret) {
558 /* we didn't get on the idr, drop the idr reference */
559 fsnotify_put_mark(&tmp_ientry->fsn_entry);
560
561 /* idr was out of memory allocate and try again */
562 if (ret == -EAGAIN)
563 goto retry;
564 goto out_err; 655 goto out_err;
565 }
566 656
567 /* we are on the idr, now get on the inode */ 657 /* we are on the idr, now get on the inode */
568 ret = fsnotify_add_mark(&tmp_ientry->fsn_entry, group, inode); 658 ret = fsnotify_add_mark(&tmp_i_mark->fsn_mark, group, inode, NULL, 0);
569 if (ret) { 659 if (ret) {
570 /* we failed to get on the inode, get off the idr */ 660 /* we failed to get on the inode, get off the idr */
571 inotify_remove_from_idr(group, tmp_ientry); 661 inotify_remove_from_idr(group, tmp_i_mark);
572 goto out_err; 662 goto out_err;
573 } 663 }
574 664
575 /* update the idr hint, who cares about races, it's just a hint */
576 group->inotify_data.last_wd = tmp_ientry->wd;
577
578 /* increment the number of watches the user has */ 665 /* increment the number of watches the user has */
579 atomic_inc(&group->inotify_data.user->inotify_watches); 666 atomic_inc(&group->inotify_data.user->inotify_watches);
580 667
581 /* return the watch descriptor for this new entry */ 668 /* return the watch descriptor for this new mark */
582 ret = tmp_ientry->wd; 669 ret = tmp_i_mark->wd;
583
584 /* if this mark added a new event update the group mask */
585 if (mask & ~group->mask)
586 fsnotify_recalc_group_mask(group);
587 670
588out_err: 671out_err:
589 /* match the ref from fsnotify_init_markentry() */ 672 /* match the ref from fsnotify_init_mark() */
590 fsnotify_put_mark(&tmp_ientry->fsn_entry); 673 fsnotify_put_mark(&tmp_i_mark->fsn_mark);
591 674
592 return ret; 675 return ret;
593} 676}
@@ -616,11 +699,8 @@ retry:
616static struct fsnotify_group *inotify_new_group(struct user_struct *user, unsigned int max_events) 699static struct fsnotify_group *inotify_new_group(struct user_struct *user, unsigned int max_events)
617{ 700{
618 struct fsnotify_group *group; 701 struct fsnotify_group *group;
619 unsigned int grp_num;
620 702
621 /* fsnotify_obtain_group took a reference to group, we put this when we kill the file in the end */ 703 group = fsnotify_alloc_group(&inotify_fsnotify_ops);
622 grp_num = (INOTIFY_GROUP_NUM - atomic_inc_return(&inotify_grp_num));
623 group = fsnotify_obtain_group(grp_num, 0, &inotify_fsnotify_ops);
624 if (IS_ERR(group)) 704 if (IS_ERR(group))
625 return group; 705 return group;
626 706
@@ -726,7 +806,7 @@ fput_and_out:
726SYSCALL_DEFINE2(inotify_rm_watch, int, fd, __s32, wd) 806SYSCALL_DEFINE2(inotify_rm_watch, int, fd, __s32, wd)
727{ 807{
728 struct fsnotify_group *group; 808 struct fsnotify_group *group;
729 struct fsnotify_mark_entry *entry; 809 struct inotify_inode_mark *i_mark;
730 struct file *filp; 810 struct file *filp;
731 int ret = 0, fput_needed; 811 int ret = 0, fput_needed;
732 812
@@ -735,25 +815,23 @@ SYSCALL_DEFINE2(inotify_rm_watch, int, fd, __s32, wd)
735 return -EBADF; 815 return -EBADF;
736 816
737 /* verify that this is indeed an inotify instance */ 817 /* verify that this is indeed an inotify instance */
738 if (unlikely(filp->f_op != &inotify_fops)) { 818 ret = -EINVAL;
739 ret = -EINVAL; 819 if (unlikely(filp->f_op != &inotify_fops))
740 goto out; 820 goto out;
741 }
742 821
743 group = filp->private_data; 822 group = filp->private_data;
744 823
745 spin_lock(&group->inotify_data.idr_lock); 824 ret = -EINVAL;
746 entry = idr_find(&group->inotify_data.idr, wd); 825 i_mark = inotify_idr_find(group, wd);
747 if (unlikely(!entry)) { 826 if (unlikely(!i_mark))
748 spin_unlock(&group->inotify_data.idr_lock);
749 ret = -EINVAL;
750 goto out; 827 goto out;
751 }
752 fsnotify_get_mark(entry);
753 spin_unlock(&group->inotify_data.idr_lock);
754 828
755 fsnotify_destroy_mark_by_entry(entry); 829 ret = 0;
756 fsnotify_put_mark(entry); 830
831 fsnotify_destroy_mark(&i_mark->fsn_mark);
832
833 /* match ref taken by inotify_idr_find */
834 fsnotify_put_mark(&i_mark->fsn_mark);
757 835
758out: 836out:
759 fput_light(filp, fput_needed); 837 fput_light(filp, fput_needed);
@@ -767,7 +845,28 @@ out:
767 */ 845 */
768static int __init inotify_user_setup(void) 846static int __init inotify_user_setup(void)
769{ 847{
770 inotify_inode_mark_cachep = KMEM_CACHE(inotify_inode_mark_entry, SLAB_PANIC); 848 BUILD_BUG_ON(IN_ACCESS != FS_ACCESS);
849 BUILD_BUG_ON(IN_MODIFY != FS_MODIFY);
850 BUILD_BUG_ON(IN_ATTRIB != FS_ATTRIB);
851 BUILD_BUG_ON(IN_CLOSE_WRITE != FS_CLOSE_WRITE);
852 BUILD_BUG_ON(IN_CLOSE_NOWRITE != FS_CLOSE_NOWRITE);
853 BUILD_BUG_ON(IN_OPEN != FS_OPEN);
854 BUILD_BUG_ON(IN_MOVED_FROM != FS_MOVED_FROM);
855 BUILD_BUG_ON(IN_MOVED_TO != FS_MOVED_TO);
856 BUILD_BUG_ON(IN_CREATE != FS_CREATE);
857 BUILD_BUG_ON(IN_DELETE != FS_DELETE);
858 BUILD_BUG_ON(IN_DELETE_SELF != FS_DELETE_SELF);
859 BUILD_BUG_ON(IN_MOVE_SELF != FS_MOVE_SELF);
860 BUILD_BUG_ON(IN_UNMOUNT != FS_UNMOUNT);
861 BUILD_BUG_ON(IN_Q_OVERFLOW != FS_Q_OVERFLOW);
862 BUILD_BUG_ON(IN_IGNORED != FS_IN_IGNORED);
863 BUILD_BUG_ON(IN_EXCL_UNLINK != FS_EXCL_UNLINK);
864 BUILD_BUG_ON(IN_ISDIR != FS_IN_ISDIR);
865 BUILD_BUG_ON(IN_ONESHOT != FS_IN_ONESHOT);
866
867 BUG_ON(hweight32(ALL_INOTIFY_BITS) != 21);
868
869 inotify_inode_mark_cachep = KMEM_CACHE(inotify_inode_mark, SLAB_PANIC);
771 event_priv_cachep = KMEM_CACHE(inotify_event_private_data, SLAB_PANIC); 870 event_priv_cachep = KMEM_CACHE(inotify_event_private_data, SLAB_PANIC);
772 871
773 inotify_max_queued_events = 16384; 872 inotify_max_queued_events = 16384;
diff --git a/fs/notify/mark.c b/fs/notify/mark.c
new file mode 100644
index 000000000000..325185e514bb
--- /dev/null
+++ b/fs/notify/mark.c
@@ -0,0 +1,371 @@
1/*
2 * Copyright (C) 2008 Red Hat, Inc., Eric Paris <eparis@redhat.com>
3 *
4 * This program is free software; you can redistribute it and/or modify
5 * it under the terms of the GNU General Public License as published by
6 * the Free Software Foundation; either version 2, or (at your option)
7 * any later version.
8 *
9 * This program is distributed in the hope that it will be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public License
15 * along with this program; see the file COPYING. If not, write to
16 * the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
17 */
18
19/*
20 * fsnotify inode mark locking/lifetime/and refcnting
21 *
22 * REFCNT:
23 * The mark->refcnt tells how many "things" in the kernel currently are
24 * referencing this object. The object typically will live inside the kernel
25 * with a refcnt of 2, one for each list it is on (i_list, g_list). Any task
 26 * which can find this object holding the appropriate locks, can take a reference
 27 * and the object itself is guaranteed to survive until the reference is dropped.
28 *
29 * LOCKING:
30 * There are 3 spinlocks involved with fsnotify inode marks and they MUST
31 * be taken in order as follows:
32 *
33 * mark->lock
34 * group->mark_lock
35 * inode->i_lock
36 *
37 * mark->lock protects 2 things, mark->group and mark->inode. You must hold
38 * that lock to dereference either of these things (they could be NULL even with
39 * the lock)
40 *
41 * group->mark_lock protects the marks_list anchored inside a given group
42 * and each mark is hooked via the g_list. It also sorta protects the
43 * free_g_list, which when used is anchored by a private list on the stack of the
44 * task which held the group->mark_lock.
45 *
46 * inode->i_lock protects the i_fsnotify_marks list anchored inside a
47 * given inode and each mark is hooked via the i_list. (and sorta the
48 * free_i_list)
49 *
50 *
51 * LIFETIME:
52 * Inode marks survive between when they are added to an inode and when their
53 * refcnt==0.
54 *
55 * The inode mark can be cleared for a number of different reasons including:
56 * - The inode is unlinked for the last time. (fsnotify_inode_remove)
57 * - The inode is being evicted from cache. (fsnotify_inode_delete)
58 * - The fs the inode is on is unmounted. (fsnotify_inode_delete/fsnotify_unmount_inodes)
59 * - Something explicitly requests that it be removed. (fsnotify_destroy_mark)
60 * - The fsnotify_group associated with the mark is going away and all such marks
61 * need to be cleaned up. (fsnotify_clear_marks_by_group)
62 *
63 * Worst case we are given an inode and need to clean up all the marks on that
64 * inode. We take i_lock and walk the i_fsnotify_marks safely. For each
65 * mark on the list we take a reference (so the mark can't disappear under us).
 66 * We remove that mark from the inode's list of marks and we add this mark to a
 67 * private list anchored on the stack using i_free_list; at this point we no
68 * longer fear anything finding the mark using the inode's list of marks.
69 *
70 * We can safely and locklessly run the private list on the stack of everything
71 * we just unattached from the original inode. For each mark on the private list
 72 * we grab the mark->lock and can thus dereference mark->group and mark->inode. If
73 * we see the group and inode are not NULL we take those locks. Now holding all
 74 * 3 locks we can completely remove the mark, so no other task can find it in the
 75 * future. Remember, 10 things might already be referencing this mark, but they
 76 * better be holding a ref. We drop the reference we took before we unhooked it
77 * from the inode. When the ref hits 0 we can free the mark.
78 *
79 * Very similarly for freeing by group, except we use free_g_list.
80 *
81 * This has the very interesting property of being able to run concurrently with
82 * any (or all) other directions.
83 */
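
A minimal sketch of the ordering rule above (illustrative only, not kernel code; it glosses over the fact that mark->group and mark->i.inode may be NULL and must be checked under mark->lock before use):

	static void example_nest_mark_locks(struct fsnotify_mark *mark)
	{
		spin_lock(&mark->lock);			/* 1st */
		spin_lock(&mark->group->mark_lock);	/* 2nd */
		spin_lock(&mark->i.inode->i_lock);	/* 3rd */
		/* all three held: safe to unhook the mark everywhere */
		spin_unlock(&mark->i.inode->i_lock);
		spin_unlock(&mark->group->mark_lock);
		spin_unlock(&mark->lock);
	}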
84
85#include <linux/fs.h>
86#include <linux/init.h>
87#include <linux/kernel.h>
88#include <linux/kthread.h>
89#include <linux/module.h>
90#include <linux/mutex.h>
91#include <linux/slab.h>
92#include <linux/spinlock.h>
93#include <linux/srcu.h>
94#include <linux/writeback.h> /* for inode_lock */
95
96#include <asm/atomic.h>
97
98#include <linux/fsnotify_backend.h>
99#include "fsnotify.h"
100
101struct srcu_struct fsnotify_mark_srcu;
102static DEFINE_SPINLOCK(destroy_lock);
103static LIST_HEAD(destroy_list);
104static DECLARE_WAIT_QUEUE_HEAD(destroy_waitq);
105
106void fsnotify_get_mark(struct fsnotify_mark *mark)
107{
108 atomic_inc(&mark->refcnt);
109}
110
111void fsnotify_put_mark(struct fsnotify_mark *mark)
112{
113 if (atomic_dec_and_test(&mark->refcnt))
114 mark->free_mark(mark);
115}
116
117/*
118 * Any time a mark is getting freed we end up here.
119 * The caller had better be holding a reference to this mark so we don't actually
120 * do the final put under the mark->lock
121 */
122void fsnotify_destroy_mark(struct fsnotify_mark *mark)
123{
124 struct fsnotify_group *group;
125 struct inode *inode = NULL;
126
127 spin_lock(&mark->lock);
128
129 group = mark->group;
130
131 /* something else already called this function on this mark */
132 if (!(mark->flags & FSNOTIFY_MARK_FLAG_ALIVE)) {
133 spin_unlock(&mark->lock);
134 return;
135 }
136
137 mark->flags &= ~FSNOTIFY_MARK_FLAG_ALIVE;
138
139 /* 1 from caller and 1 for being on i_list/g_list */
140 BUG_ON(atomic_read(&mark->refcnt) < 2);
141
142 spin_lock(&group->mark_lock);
143
144 if (mark->flags & FSNOTIFY_MARK_FLAG_INODE) {
145 inode = mark->i.inode;
146 fsnotify_destroy_inode_mark(mark);
147 } else if (mark->flags & FSNOTIFY_MARK_FLAG_VFSMOUNT)
148 fsnotify_destroy_vfsmount_mark(mark);
149 else
150 BUG();
151
152 list_del_init(&mark->g_list);
153
154 spin_unlock(&group->mark_lock);
155 spin_unlock(&mark->lock);
156
157 spin_lock(&destroy_lock);
158 list_add(&mark->destroy_list, &destroy_list);
159 spin_unlock(&destroy_lock);
160 wake_up(&destroy_waitq);
161
162 /*
163 * Some groups like to know that marks are being freed. This is a
164 * callback to the group function to let it know that this mark
165 * is being freed.
166 */
167 if (group->ops->freeing_mark)
168 group->ops->freeing_mark(mark, group);
169
170 /*
171 * __fsnotify_update_child_dentry_flags(inode);
172 *
173 * I really want to call that, but we can't, we have no idea if the inode
174 * still exists the second we drop the mark->lock.
175 *
 176	 * The next time an event arrives at this inode from one of its children
 177	 * __fsnotify_parent will see that the inode doesn't care about its
 178	 * children and will update all of these flags then. So really this
179 * is just a lazy update (and could be a perf win...)
180 */
181
182 if (inode && (mark->flags & FSNOTIFY_MARK_FLAG_OBJECT_PINNED))
183 iput(inode);
184
185 /*
 185	 * it's possible that this group tried to destroy itself, but this
 186	 * mark was simultaneously being freed by the inode. If that's the
188 * case, we finish freeing the group here.
189 */
190 if (unlikely(atomic_dec_and_test(&group->num_marks)))
191 fsnotify_final_destroy_group(group);
192}
193
194void fsnotify_set_mark_mask_locked(struct fsnotify_mark *mark, __u32 mask)
195{
196 assert_spin_locked(&mark->lock);
197
198 mark->mask = mask;
199
200 if (mark->flags & FSNOTIFY_MARK_FLAG_INODE)
201 fsnotify_set_inode_mark_mask_locked(mark, mask);
202}
203
204void fsnotify_set_mark_ignored_mask_locked(struct fsnotify_mark *mark, __u32 mask)
205{
206 assert_spin_locked(&mark->lock);
207
208 mark->ignored_mask = mask;
209}
210
211/*
212 * Attach an initialized mark to a given group and fs object.
213 * These marks may be used for the fsnotify backend to determine which
214 * event types should be delivered to which group.
215 */
216int fsnotify_add_mark(struct fsnotify_mark *mark,
217 struct fsnotify_group *group, struct inode *inode,
218 struct vfsmount *mnt, int allow_dups)
219{
220 int ret = 0;
221
222 BUG_ON(inode && mnt);
223 BUG_ON(!inode && !mnt);
224
225 /*
226 * LOCKING ORDER!!!!
227 * mark->lock
228 * group->mark_lock
229 * inode->i_lock
230 */
231 spin_lock(&mark->lock);
232 spin_lock(&group->mark_lock);
233
234 mark->flags |= FSNOTIFY_MARK_FLAG_ALIVE;
235
236 mark->group = group;
237 list_add(&mark->g_list, &group->marks_list);
238 atomic_inc(&group->num_marks);
239 fsnotify_get_mark(mark); /* for i_list and g_list */
240
241 if (inode) {
242 ret = fsnotify_add_inode_mark(mark, group, inode, allow_dups);
243 if (ret)
244 goto err;
245 } else if (mnt) {
246 ret = fsnotify_add_vfsmount_mark(mark, group, mnt, allow_dups);
247 if (ret)
248 goto err;
249 } else {
250 BUG();
251 }
252
253 spin_unlock(&group->mark_lock);
254
255 /* this will pin the object if appropriate */
256 fsnotify_set_mark_mask_locked(mark, mark->mask);
257
258 spin_unlock(&mark->lock);
259
260 if (inode)
261 __fsnotify_update_child_dentry_flags(inode);
262
263 return ret;
264err:
265 mark->flags &= ~FSNOTIFY_MARK_FLAG_ALIVE;
266 list_del_init(&mark->g_list);
267 mark->group = NULL;
268 atomic_dec(&group->num_marks);
269
270 spin_unlock(&group->mark_lock);
271 spin_unlock(&mark->lock);
272
273 spin_lock(&destroy_lock);
274 list_add(&mark->destroy_list, &destroy_list);
275 spin_unlock(&destroy_lock);
276 wake_up(&destroy_waitq);
277
278 return ret;
279}
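
A minimal sketch (not part of this commit) of how a backend might drive the mark API above; the group pointer, my_free_mark() and the chosen mask are hypothetical, and error handling is trimmed:

static void my_free_mark(struct fsnotify_mark *mark)
{
	kfree(mark);
}

static int my_watch_inode(struct fsnotify_group *group, struct inode *inode)
{
	struct fsnotify_mark *mark;
	int ret;

	mark = kmalloc(sizeof(*mark), GFP_KERNEL);
	if (!mark)
		return -ENOMEM;

	fsnotify_init_mark(mark, my_free_mark);		/* refcnt starts at 1 */
	mark->mask = FS_MODIFY | FS_DELETE_SELF;	/* set before adding */

	/* fsnotify_add_mark() takes its own reference for the i_list/g_list linkage */
	ret = fsnotify_add_mark(mark, group, inode, NULL, 0);

	/* drop the creation reference; on success the lists keep the mark
	 * alive, on failure the destroy thread already owns the list ref */
	fsnotify_put_mark(mark);
	return ret;
}
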
280
281/*
282 * clear any marks in a group in which mark->flags & flags is nonzero
283 */
284void fsnotify_clear_marks_by_group_flags(struct fsnotify_group *group,
285 unsigned int flags)
286{
287 struct fsnotify_mark *lmark, *mark;
288 LIST_HEAD(free_list);
289
290 spin_lock(&group->mark_lock);
291 list_for_each_entry_safe(mark, lmark, &group->marks_list, g_list) {
292 if (mark->flags & flags) {
293 list_add(&mark->free_g_list, &free_list);
294 list_del_init(&mark->g_list);
295 fsnotify_get_mark(mark);
296 }
297 }
298 spin_unlock(&group->mark_lock);
299
300 list_for_each_entry_safe(mark, lmark, &free_list, free_g_list) {
301 fsnotify_destroy_mark(mark);
302 fsnotify_put_mark(mark);
303 }
304}
305
306/*
307 * Given a group, destroy all of the marks associated with that group.
308 */
309void fsnotify_clear_marks_by_group(struct fsnotify_group *group)
310{
311 fsnotify_clear_marks_by_group_flags(group, (unsigned int)-1);
312}
313
314void fsnotify_duplicate_mark(struct fsnotify_mark *new, struct fsnotify_mark *old)
315{
316 assert_spin_locked(&old->lock);
317 new->i.inode = old->i.inode;
318 new->m.mnt = old->m.mnt;
319 new->group = old->group;
320 new->mask = old->mask;
321 new->free_mark = old->free_mark;
322}
323
324/*
325 * Nothing fancy, just initialize lists and locks and counters.
326 */
327void fsnotify_init_mark(struct fsnotify_mark *mark,
328 void (*free_mark)(struct fsnotify_mark *mark))
329{
330 memset(mark, 0, sizeof(*mark));
331 spin_lock_init(&mark->lock);
332 atomic_set(&mark->refcnt, 1);
333 mark->free_mark = free_mark;
334}
335
336static int fsnotify_mark_destroy(void *ignored)
337{
338 struct fsnotify_mark *mark, *next;
339 LIST_HEAD(private_destroy_list);
340
341 for (;;) {
342 spin_lock(&destroy_lock);
343 /* exchange the list head */
344 list_replace_init(&destroy_list, &private_destroy_list);
345 spin_unlock(&destroy_lock);
346
347 synchronize_srcu(&fsnotify_mark_srcu);
348
349 list_for_each_entry_safe(mark, next, &private_destroy_list, destroy_list) {
350 list_del_init(&mark->destroy_list);
351 fsnotify_put_mark(mark);
352 }
353
354 wait_event_interruptible(destroy_waitq, !list_empty(&destroy_list));
355 }
356
357 return 0;
358}
359
360static int __init fsnotify_mark_init(void)
361{
362 struct task_struct *thread;
363
364 thread = kthread_run(fsnotify_mark_destroy, NULL,
365 "fsnotify_mark");
366 if (IS_ERR(thread))
367 panic("unable to start fsnotify mark destruction thread.");
368
369 return 0;
370}
371device_initcall(fsnotify_mark_init);
diff --git a/fs/notify/notification.c b/fs/notify/notification.c
index b8bf53b4c108..d6c435adc7a2 100644
--- a/fs/notify/notification.c
+++ b/fs/notify/notification.c
@@ -31,6 +31,7 @@
31 * allocated and used. 31 * allocated and used.
32 */ 32 */
33 33
34#include <linux/file.h>
34#include <linux/fs.h> 35#include <linux/fs.h>
35#include <linux/init.h> 36#include <linux/init.h>
36#include <linux/kernel.h> 37#include <linux/kernel.h>
@@ -56,7 +57,7 @@ static struct kmem_cache *fsnotify_event_holder_cachep;
56 * it is needed. Its refcnt is set to 1 at kernel init time and will never 57 * it is needed. Its refcnt is set to 1 at kernel init time and will never
57 * get set to 0 so it will never get 'freed' 58 * get set to 0 so it will never get 'freed'
58 */ 59 */
59static struct fsnotify_event q_overflow_event; 60static struct fsnotify_event *q_overflow_event;
60static atomic_t fsnotify_sync_cookie = ATOMIC_INIT(0); 61static atomic_t fsnotify_sync_cookie = ATOMIC_INIT(0);
61 62
62/** 63/**
@@ -87,12 +88,15 @@ void fsnotify_put_event(struct fsnotify_event *event)
87 return; 88 return;
88 89
89 if (atomic_dec_and_test(&event->refcnt)) { 90 if (atomic_dec_and_test(&event->refcnt)) {
90 if (event->data_type == FSNOTIFY_EVENT_PATH) 91 pr_debug("%s: event=%p\n", __func__, event);
91 path_put(&event->path); 92
93 if (event->data_type == FSNOTIFY_EVENT_FILE)
94 fput(event->file);
92 95
93 BUG_ON(!list_empty(&event->private_data_list)); 96 BUG_ON(!list_empty(&event->private_data_list));
94 97
95 kfree(event->file_name); 98 kfree(event->file_name);
99 put_pid(event->tgid);
96 kmem_cache_free(fsnotify_event_cachep, event); 100 kmem_cache_free(fsnotify_event_cachep, event);
97 } 101 }
98} 102}
@@ -104,7 +108,8 @@ struct fsnotify_event_holder *fsnotify_alloc_event_holder(void)
104 108
105void fsnotify_destroy_event_holder(struct fsnotify_event_holder *holder) 109void fsnotify_destroy_event_holder(struct fsnotify_event_holder *holder)
106{ 110{
107 kmem_cache_free(fsnotify_event_holder_cachep, holder); 111 if (holder)
112 kmem_cache_free(fsnotify_event_holder_cachep, holder);
108} 113}
109 114
110/* 115/*
@@ -129,53 +134,20 @@ struct fsnotify_event_private_data *fsnotify_remove_priv_from_event(struct fsnot
129} 134}
130 135
131/* 136/*
132 * Check if 2 events contain the same information. We do not compare private data
133 * but at this moment that isn't a problem for any known fsnotify listeners.
134 */
135static bool event_compare(struct fsnotify_event *old, struct fsnotify_event *new)
136{
137 if ((old->mask == new->mask) &&
138 (old->to_tell == new->to_tell) &&
139 (old->data_type == new->data_type) &&
140 (old->name_len == new->name_len)) {
141 switch (old->data_type) {
142 case (FSNOTIFY_EVENT_INODE):
143 /* remember, after old was put on the wait_q we aren't
144 * allowed to look at the inode any more, only thing
145 * left to check was if the file_name is the same */
146 if (!old->name_len ||
147 !strcmp(old->file_name, new->file_name))
148 return true;
149 break;
150 case (FSNOTIFY_EVENT_PATH):
151 if ((old->path.mnt == new->path.mnt) &&
152 (old->path.dentry == new->path.dentry))
153 return true;
154 break;
155 case (FSNOTIFY_EVENT_NONE):
156 if (old->mask & FS_Q_OVERFLOW)
157 return true;
158 else if (old->mask & FS_IN_IGNORED)
159 return false;
160 return false;
161 };
162 }
163 return false;
164}
165
166/*
167 * Add an event to the group notification queue. The group can later pull this 137 * Add an event to the group notification queue. The group can later pull this
168 * event off the queue to deal with. If the event is successfully added to the 138 * event off the queue to deal with. If the event is successfully added to the
169 * group's notification queue, a reference is taken on event. 139 * group's notification queue, a reference is taken on event.
170 */ 140 */
171int fsnotify_add_notify_event(struct fsnotify_group *group, struct fsnotify_event *event, 141struct fsnotify_event *fsnotify_add_notify_event(struct fsnotify_group *group, struct fsnotify_event *event,
172 struct fsnotify_event_private_data *priv) 142 struct fsnotify_event_private_data *priv,
143 struct fsnotify_event *(*merge)(struct list_head *,
144 struct fsnotify_event *))
173{ 145{
146 struct fsnotify_event *return_event = NULL;
174 struct fsnotify_event_holder *holder = NULL; 147 struct fsnotify_event_holder *holder = NULL;
175 struct list_head *list = &group->notification_list; 148 struct list_head *list = &group->notification_list;
176 struct fsnotify_event_holder *last_holder; 149
177 struct fsnotify_event *last_event; 150 pr_debug("%s: group=%p event=%p priv=%p\n", __func__, group, event, priv);
178 int ret = 0;
179 151
180 /* 152 /*
181 * There is one fsnotify_event_holder embedded inside each fsnotify_event. 153 * There is one fsnotify_event_holder embedded inside each fsnotify_event.
@@ -189,18 +161,40 @@ int fsnotify_add_notify_event(struct fsnotify_group *group, struct fsnotify_even
189alloc_holder: 161alloc_holder:
190 holder = fsnotify_alloc_event_holder(); 162 holder = fsnotify_alloc_event_holder();
191 if (!holder) 163 if (!holder)
192 return -ENOMEM; 164 return ERR_PTR(-ENOMEM);
193 } 165 }
194 166
195 mutex_lock(&group->notification_mutex); 167 mutex_lock(&group->notification_mutex);
196 168
197 if (group->q_len >= group->max_events) { 169 if (group->q_len >= group->max_events) {
198 event = &q_overflow_event; 170 event = q_overflow_event;
199 ret = -EOVERFLOW; 171
172 /*
173 * we need to return the overflow event
174 * which means we need a ref
175 */
176 fsnotify_get_event(event);
177 return_event = event;
178
200 /* sorry, no private data on the overflow event */ 179 /* sorry, no private data on the overflow event */
201 priv = NULL; 180 priv = NULL;
202 } 181 }
203 182
183 if (!list_empty(list) && merge) {
184 struct fsnotify_event *tmp;
185
186 tmp = merge(list, event);
187 if (tmp) {
188 mutex_unlock(&group->notification_mutex);
189
190 if (return_event)
191 fsnotify_put_event(return_event);
192 if (holder != &event->holder)
193 fsnotify_destroy_event_holder(holder);
194 return tmp;
195 }
196 }
197
204 spin_lock(&event->lock); 198 spin_lock(&event->lock);
205 199
206 if (list_empty(&event->holder.event_list)) { 200 if (list_empty(&event->holder.event_list)) {
@@ -212,19 +206,13 @@ alloc_holder:
212 * event holder was used, go back and get a new one */ 206 * event holder was used, go back and get a new one */
213 spin_unlock(&event->lock); 207 spin_unlock(&event->lock);
214 mutex_unlock(&group->notification_mutex); 208 mutex_unlock(&group->notification_mutex);
215 goto alloc_holder;
216 }
217 209
218 if (!list_empty(list)) { 210 if (return_event) {
219 last_holder = list_entry(list->prev, struct fsnotify_event_holder, event_list); 211 fsnotify_put_event(return_event);
220 last_event = last_holder->event; 212 return_event = NULL;
221 if (event_compare(last_event, event)) {
222 spin_unlock(&event->lock);
223 mutex_unlock(&group->notification_mutex);
224 if (holder != &event->holder)
225 fsnotify_destroy_event_holder(holder);
226 return -EEXIST;
227 } 213 }
214
215 goto alloc_holder;
228 } 216 }
229 217
230 group->q_len++; 218 group->q_len++;
@@ -238,7 +226,7 @@ alloc_holder:
238 mutex_unlock(&group->notification_mutex); 226 mutex_unlock(&group->notification_mutex);
239 227
240 wake_up(&group->notification_waitq); 228 wake_up(&group->notification_waitq);
241 return ret; 229 return return_event;
242} 230}
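
A sketch (not in this patch) of the shape a group's merge callback can take now that the queue-tail comparison above has been pushed out to callers; my_events_equal() stands in for whatever comparison a backend actually wants, and a fancier callback could instead rewrite the queued event with the clone/replace helpers added further down:

static struct fsnotify_event *my_merge(struct list_head *list,
				       struct fsnotify_event *event)
{
	struct fsnotify_event_holder *last_holder;
	struct fsnotify_event *last_event;

	/* fsnotify_add_notify_event() only calls us with a non-empty list;
	 * only try to coalesce with the most recently queued event */
	last_holder = list_entry(list->prev, struct fsnotify_event_holder,
				 event_list);
	last_event = last_holder->event;

	if (!my_events_equal(last_event, event))
		return NULL;	/* no merge; queue the new event normally */

	/* the caller expects a referenced event back when a merge happened */
	fsnotify_get_event(last_event);
	return last_event;
}
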
243 231
244/* 232/*
@@ -253,6 +241,8 @@ struct fsnotify_event *fsnotify_remove_notify_event(struct fsnotify_group *group
253 241
254 BUG_ON(!mutex_is_locked(&group->notification_mutex)); 242 BUG_ON(!mutex_is_locked(&group->notification_mutex));
255 243
244 pr_debug("%s: group=%p\n", __func__, group);
245
256 holder = list_first_entry(&group->notification_list, struct fsnotify_event_holder, event_list); 246 holder = list_first_entry(&group->notification_list, struct fsnotify_event_holder, event_list);
257 247
258 event = holder->event; 248 event = holder->event;
@@ -314,25 +304,82 @@ void fsnotify_flush_notify(struct fsnotify_group *group)
314 304
315static void initialize_event(struct fsnotify_event *event) 305static void initialize_event(struct fsnotify_event *event)
316{ 306{
317 event->holder.event = NULL;
318 INIT_LIST_HEAD(&event->holder.event_list); 307 INIT_LIST_HEAD(&event->holder.event_list);
319 atomic_set(&event->refcnt, 1); 308 atomic_set(&event->refcnt, 1);
320 309
321 spin_lock_init(&event->lock); 310 spin_lock_init(&event->lock);
322 311
323 event->path.dentry = NULL;
324 event->path.mnt = NULL;
325 event->inode = NULL;
326 event->data_type = FSNOTIFY_EVENT_NONE;
327
328 INIT_LIST_HEAD(&event->private_data_list); 312 INIT_LIST_HEAD(&event->private_data_list);
313}
329 314
330 event->to_tell = NULL; 315/*
316 * Caller damn well better be holding whatever mutex is protecting the
317 * old_holder->event_list and the new_event must be a clean event which
318 * cannot be found anywhere else in the kernel.
319 */
320int fsnotify_replace_event(struct fsnotify_event_holder *old_holder,
321 struct fsnotify_event *new_event)
322{
323 struct fsnotify_event *old_event = old_holder->event;
324 struct fsnotify_event_holder *new_holder = &new_event->holder;
331 325
332 event->file_name = NULL; 326 enum event_spinlock_class {
333 event->name_len = 0; 327 SPINLOCK_OLD,
328 SPINLOCK_NEW,
329 };
334 330
335 event->sync_cookie = 0; 331 pr_debug("%s: old_event=%p new_event=%p\n", __func__, old_event, new_event);
332
333 /*
334 * if the new_event's embedded holder is in use someone
335 * screwed up and didn't give us a clean new event.
336 */
337 BUG_ON(!list_empty(&new_holder->event_list));
338
339 spin_lock_nested(&old_event->lock, SPINLOCK_OLD);
340 spin_lock_nested(&new_event->lock, SPINLOCK_NEW);
341
342 new_holder->event = new_event;
343 list_replace_init(&old_holder->event_list, &new_holder->event_list);
344
345 spin_unlock(&new_event->lock);
346 spin_unlock(&old_event->lock);
347
348 /* old_holder == &old_event->holder means we are referenced through the holder embedded in the event */
349 if (old_holder != &old_event->holder)
350 fsnotify_destroy_event_holder(old_holder);
351
352 fsnotify_get_event(new_event); /* on the list take reference */
353 fsnotify_put_event(old_event); /* off the list, drop reference */
354
355 return 0;
356}
357
358struct fsnotify_event *fsnotify_clone_event(struct fsnotify_event *old_event)
359{
360 struct fsnotify_event *event;
361
362 event = kmem_cache_alloc(fsnotify_event_cachep, GFP_KERNEL);
363 if (!event)
364 return NULL;
365
366 pr_debug("%s: old_event=%p new_event=%p\n", __func__, old_event, event);
367
368 memcpy(event, old_event, sizeof(*event));
369 initialize_event(event);
370
371 if (event->name_len) {
372 event->file_name = kstrdup(old_event->file_name, GFP_KERNEL);
373 if (!event->file_name) {
374 kmem_cache_free(fsnotify_event_cachep, event);
375 return NULL;
376 }
377 }
378 event->tgid = get_pid(old_event->tgid);
379 if (event->data_type == FSNOTIFY_EVENT_FILE)
380 get_file(event->file);
381
382 return event;
336} 383}
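
A sketch of how the two helpers above are meant to combine inside a merge callback: clone the queued event, fold the new mask in, and swap the clone into the old holder. The names are illustrative, not from this patch:

static struct fsnotify_event *merge_by_replace(struct fsnotify_event_holder *last_holder,
					       struct fsnotify_event *new_event)
{
	struct fsnotify_event *clone;

	clone = fsnotify_clone_event(last_holder->event);
	if (!clone)
		return NULL;

	clone->mask |= new_event->mask;			/* accumulate event types */
	fsnotify_replace_event(last_holder, clone);	/* list ref moves to the clone */

	return clone;	/* hand the allocation reference back to the caller */
}
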
337 384
338/* 385/*
@@ -348,15 +395,18 @@ static void initialize_event(struct fsnotify_event *event)
348 * @name the filename, if available 395 * @name the filename, if available
349 */ 396 */
350struct fsnotify_event *fsnotify_create_event(struct inode *to_tell, __u32 mask, void *data, 397struct fsnotify_event *fsnotify_create_event(struct inode *to_tell, __u32 mask, void *data,
351 int data_type, const char *name, u32 cookie, 398 int data_type, const unsigned char *name,
352 gfp_t gfp) 399 u32 cookie, gfp_t gfp)
353{ 400{
354 struct fsnotify_event *event; 401 struct fsnotify_event *event;
355 402
356 event = kmem_cache_alloc(fsnotify_event_cachep, gfp); 403 event = kmem_cache_zalloc(fsnotify_event_cachep, gfp);
357 if (!event) 404 if (!event)
358 return NULL; 405 return NULL;
359 406
407 pr_debug("%s: event=%p to_tell=%p mask=%x data=%p data_type=%d\n",
408 __func__, event, to_tell, mask, data, data_type);
409
360 initialize_event(event); 410 initialize_event(event);
361 411
362 if (name) { 412 if (name) {
@@ -368,35 +418,36 @@ struct fsnotify_event *fsnotify_create_event(struct inode *to_tell, __u32 mask,
368 event->name_len = strlen(event->file_name); 418 event->name_len = strlen(event->file_name);
369 } 419 }
370 420
421 event->tgid = get_pid(task_tgid(current));
371 event->sync_cookie = cookie; 422 event->sync_cookie = cookie;
372 event->to_tell = to_tell; 423 event->to_tell = to_tell;
424 event->data_type = data_type;
373 425
374 switch (data_type) { 426 switch (data_type) {
375 case FSNOTIFY_EVENT_FILE: { 427 case FSNOTIFY_EVENT_FILE: {
376 struct file *file = data; 428 event->file = data;
377 struct path *path = &file->f_path; 429 /*
378 event->path.dentry = path->dentry; 430 * if this file is about to disappear hold an extra reference
379 event->path.mnt = path->mnt; 431 * until we return to __fput so we don't have to worry about
380 path_get(&event->path); 432 * future get/put destroying the file under us or generating
381 event->data_type = FSNOTIFY_EVENT_PATH; 433 * additional events. Notice that we change f_mode without
382 break; 434 * holding f_lock. This is safe since this is the only possible
383 } 435 * reference to this object in the kernel (it was about to be
384 case FSNOTIFY_EVENT_PATH: { 436 * freed, remember?)
385 struct path *path = data; 437 */
386 event->path.dentry = path->dentry; 438 if (!atomic_long_read(&event->file->f_count)) {
387 event->path.mnt = path->mnt; 439 event->file->f_mode |= FMODE_NONOTIFY;
388 path_get(&event->path); 440 get_file(event->file);
389 event->data_type = FSNOTIFY_EVENT_PATH; 441 }
442 get_file(event->file);
390 break; 443 break;
391 } 444 }
392 case FSNOTIFY_EVENT_INODE: 445 case FSNOTIFY_EVENT_INODE:
393 event->inode = data; 446 event->inode = data;
394 event->data_type = FSNOTIFY_EVENT_INODE;
395 break; 447 break;
396 case FSNOTIFY_EVENT_NONE: 448 case FSNOTIFY_EVENT_NONE:
397 event->inode = NULL; 449 event->inode = NULL;
398 event->path.dentry = NULL; 450 event->file = NULL;
399 event->path.mnt = NULL;
400 break; 451 break;
401 default: 452 default:
402 BUG(); 453 BUG();
@@ -412,8 +463,11 @@ __init int fsnotify_notification_init(void)
412 fsnotify_event_cachep = KMEM_CACHE(fsnotify_event, SLAB_PANIC); 463 fsnotify_event_cachep = KMEM_CACHE(fsnotify_event, SLAB_PANIC);
413 fsnotify_event_holder_cachep = KMEM_CACHE(fsnotify_event_holder, SLAB_PANIC); 464 fsnotify_event_holder_cachep = KMEM_CACHE(fsnotify_event_holder, SLAB_PANIC);
414 465
415 initialize_event(&q_overflow_event); 466 q_overflow_event = fsnotify_create_event(NULL, FS_Q_OVERFLOW, NULL,
416 q_overflow_event.mask = FS_Q_OVERFLOW; 467 FSNOTIFY_EVENT_NONE, NULL, 0,
468 GFP_KERNEL);
469 if (!q_overflow_event)
470 panic("unable to allocate fsnotify q_overflow_event\n");
417 471
418 return 0; 472 return 0;
419} 473}
diff --git a/fs/notify/vfsmount_mark.c b/fs/notify/vfsmount_mark.c
new file mode 100644
index 000000000000..56772b578fbd
--- /dev/null
+++ b/fs/notify/vfsmount_mark.c
@@ -0,0 +1,187 @@
1/*
2 * Copyright (C) 2008 Red Hat, Inc., Eric Paris <eparis@redhat.com>
3 *
4 * This program is free software; you can redistribute it and/or modify
5 * it under the terms of the GNU General Public License as published by
6 * the Free Software Foundation; either version 2, or (at your option)
7 * any later version.
8 *
9 * This program is distributed in the hope that it will be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public License
15 * along with this program; see the file COPYING. If not, write to
16 * the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
17 */
18
19#include <linux/fs.h>
20#include <linux/init.h>
21#include <linux/kernel.h>
22#include <linux/module.h>
23#include <linux/mount.h>
24#include <linux/mutex.h>
25#include <linux/spinlock.h>
26#include <linux/writeback.h> /* for inode_lock */
27
28#include <asm/atomic.h>
29
30#include <linux/fsnotify_backend.h>
31#include "fsnotify.h"
32
33void fsnotify_clear_marks_by_mount(struct vfsmount *mnt)
34{
35 struct fsnotify_mark *mark, *lmark;
36 struct hlist_node *pos, *n;
37 LIST_HEAD(free_list);
38
39 spin_lock(&mnt->mnt_root->d_lock);
40 hlist_for_each_entry_safe(mark, pos, n, &mnt->mnt_fsnotify_marks, m.m_list) {
41 list_add(&mark->m.free_m_list, &free_list);
42 hlist_del_init_rcu(&mark->m.m_list);
43 fsnotify_get_mark(mark);
44 }
45 spin_unlock(&mnt->mnt_root->d_lock);
46
47 list_for_each_entry_safe(mark, lmark, &free_list, m.free_m_list) {
48 fsnotify_destroy_mark(mark);
49 fsnotify_put_mark(mark);
50 }
51}
52
53void fsnotify_clear_vfsmount_marks_by_group(struct fsnotify_group *group)
54{
55 fsnotify_clear_marks_by_group_flags(group, FSNOTIFY_MARK_FLAG_VFSMOUNT);
56}
57
58/*
59 * Recalculate the mask of events relevant to a given vfsmount; caller must hold mnt->mnt_root->d_lock.
60 */
61static void fsnotify_recalc_vfsmount_mask_locked(struct vfsmount *mnt)
62{
63 struct fsnotify_mark *mark;
64 struct hlist_node *pos;
65 __u32 new_mask = 0;
66
67 assert_spin_locked(&mnt->mnt_root->d_lock);
68
69 hlist_for_each_entry(mark, pos, &mnt->mnt_fsnotify_marks, m.m_list)
70 new_mask |= mark->mask;
71 mnt->mnt_fsnotify_mask = new_mask;
72}
73
74/*
75 * Recalculate the mnt->mnt_fsnotify_mask, or the mask of all FS_* event types
76 * any notifier is interested in hearing about for this mount point
77 */
78void fsnotify_recalc_vfsmount_mask(struct vfsmount *mnt)
79{
80 spin_lock(&mnt->mnt_root->d_lock);
81 fsnotify_recalc_vfsmount_mask_locked(mnt);
82 spin_unlock(&mnt->mnt_root->d_lock);
83}
84
85void fsnotify_destroy_vfsmount_mark(struct fsnotify_mark *mark)
86{
87 struct vfsmount *mnt = mark->m.mnt;
88
89 assert_spin_locked(&mark->lock);
90 assert_spin_locked(&mark->group->mark_lock);
91
92 spin_lock(&mnt->mnt_root->d_lock);
93
94 hlist_del_init_rcu(&mark->m.m_list);
95 mark->m.mnt = NULL;
96
97 fsnotify_recalc_vfsmount_mask_locked(mnt);
98
99 spin_unlock(&mnt->mnt_root->d_lock);
100}
101
102static struct fsnotify_mark *fsnotify_find_vfsmount_mark_locked(struct fsnotify_group *group,
103 struct vfsmount *mnt)
104{
105 struct fsnotify_mark *mark;
106 struct hlist_node *pos;
107
108 assert_spin_locked(&mnt->mnt_root->d_lock);
109
110 hlist_for_each_entry(mark, pos, &mnt->mnt_fsnotify_marks, m.m_list) {
111 if (mark->group == group) {
112 fsnotify_get_mark(mark);
113 return mark;
114 }
115 }
116 return NULL;
117}
118
119/*
120 * given a group and vfsmount, find the mark associated with that combination.
121 * if found take a reference to that mark and return it, else return NULL
122 */
123struct fsnotify_mark *fsnotify_find_vfsmount_mark(struct fsnotify_group *group,
124 struct vfsmount *mnt)
125{
126 struct fsnotify_mark *mark;
127
128 spin_lock(&mnt->mnt_root->d_lock);
129 mark = fsnotify_find_vfsmount_mark_locked(group, mnt);
130 spin_unlock(&mnt->mnt_root->d_lock);
131
132 return mark;
133}
134
135/*
136 * Attach an initialized mark to a given group and vfsmount.
137 * These marks may be used by the fsnotify backend to determine which
138 * event types should be delivered to which groups.
139 */
140int fsnotify_add_vfsmount_mark(struct fsnotify_mark *mark,
141 struct fsnotify_group *group, struct vfsmount *mnt,
142 int allow_dups)
143{
144 struct fsnotify_mark *lmark;
145 struct hlist_node *node, *last = NULL;
146 int ret = 0;
147
148 mark->flags |= FSNOTIFY_MARK_FLAG_VFSMOUNT;
149
150 assert_spin_locked(&mark->lock);
151 assert_spin_locked(&group->mark_lock);
152
153 spin_lock(&mnt->mnt_root->d_lock);
154
155 mark->m.mnt = mnt;
156
157 /* is mark the first mark? */
158 if (hlist_empty(&mnt->mnt_fsnotify_marks)) {
159 hlist_add_head_rcu(&mark->m.m_list, &mnt->mnt_fsnotify_marks);
160 goto out;
161 }
162
163 /* should mark be in the middle of the current list? */
164 hlist_for_each_entry(lmark, node, &mnt->mnt_fsnotify_marks, m.m_list) {
165 last = node;
166
167 if ((lmark->group == group) && !allow_dups) {
168 ret = -EEXIST;
169 goto out;
170 }
171
172 if (mark->group < lmark->group)
173 continue;
174
175 hlist_add_before_rcu(&mark->m.m_list, &lmark->m.m_list);
176 goto out;
177 }
178
179 BUG_ON(last == NULL);
180 /* mark should be the last entry. last is the current last entry */
181 hlist_add_after_rcu(last, &mark->m.m_list);
182out:
183 fsnotify_recalc_vfsmount_mask_locked(mnt);
184 spin_unlock(&mnt->mnt_root->d_lock);
185
186 return ret;
187}
diff --git a/fs/open.c b/fs/open.c
index 0d1fa3dc0efb..b715d06fbe36 100644
--- a/fs/open.c
+++ b/fs/open.c
@@ -29,6 +29,7 @@
29#include <linux/falloc.h> 29#include <linux/falloc.h>
30#include <linux/fs_struct.h> 30#include <linux/fs_struct.h>
31#include <linux/ima.h> 31#include <linux/ima.h>
32#include <linux/dnotify.h>
32 33
33#include "internal.h" 34#include "internal.h"
34 35
@@ -887,7 +888,7 @@ long do_sys_open(int dfd, const char __user *filename, int flags, int mode)
887 put_unused_fd(fd); 888 put_unused_fd(fd);
888 fd = PTR_ERR(f); 889 fd = PTR_ERR(f);
889 } else { 890 } else {
890 fsnotify_open(f->f_path.dentry); 891 fsnotify_open(f);
891 fd_install(fd, f); 892 fd_install(fd, f);
892 } 893 }
893 } 894 }
diff --git a/fs/read_write.c b/fs/read_write.c
index 9c0485236e68..74e36586e4d3 100644
--- a/fs/read_write.c
+++ b/fs/read_write.c
@@ -311,7 +311,7 @@ ssize_t vfs_read(struct file *file, char __user *buf, size_t count, loff_t *pos)
311 else 311 else
312 ret = do_sync_read(file, buf, count, pos); 312 ret = do_sync_read(file, buf, count, pos);
313 if (ret > 0) { 313 if (ret > 0) {
314 fsnotify_access(file->f_path.dentry); 314 fsnotify_access(file);
315 add_rchar(current, ret); 315 add_rchar(current, ret);
316 } 316 }
317 inc_syscr(current); 317 inc_syscr(current);
@@ -367,7 +367,7 @@ ssize_t vfs_write(struct file *file, const char __user *buf, size_t count, loff_
367 else 367 else
368 ret = do_sync_write(file, buf, count, pos); 368 ret = do_sync_write(file, buf, count, pos);
369 if (ret > 0) { 369 if (ret > 0) {
370 fsnotify_modify(file->f_path.dentry); 370 fsnotify_modify(file);
371 add_wchar(current, ret); 371 add_wchar(current, ret);
372 } 372 }
373 inc_syscw(current); 373 inc_syscw(current);
@@ -675,9 +675,9 @@ out:
675 kfree(iov); 675 kfree(iov);
676 if ((ret + (type == READ)) > 0) { 676 if ((ret + (type == READ)) > 0) {
677 if (type == READ) 677 if (type == READ)
678 fsnotify_access(file->f_path.dentry); 678 fsnotify_access(file);
679 else 679 else
680 fsnotify_modify(file->f_path.dentry); 680 fsnotify_modify(file);
681 } 681 }
682 return ret; 682 return ret;
683} 683}
diff --git a/include/asm-generic/fcntl.h b/include/asm-generic/fcntl.h
index fcd268ce0674..e3cbc38bdcc2 100644
--- a/include/asm-generic/fcntl.h
+++ b/include/asm-generic/fcntl.h
@@ -3,6 +3,14 @@
3 3
4#include <linux/types.h> 4#include <linux/types.h>
5 5
6/*
7 * FMODE_EXEC is 0x20
8 * FMODE_NONOTIFY is 0x1000000
9 * These cannot be used by userspace O_* until internal and external open
10 * flags are split.
11 * -Eric Paris
12 */
13
6#define O_ACCMODE 00000003 14#define O_ACCMODE 00000003
7#define O_RDONLY 00000000 15#define O_RDONLY 00000000
8#define O_WRONLY 00000001 16#define O_WRONLY 00000001
diff --git a/include/linux/Kbuild b/include/linux/Kbuild
index 9aa9bcadf869..2547daf2aef2 100644
--- a/include/linux/Kbuild
+++ b/include/linux/Kbuild
@@ -210,6 +210,7 @@ unifdef-y += ethtool.h
210unifdef-y += eventpoll.h 210unifdef-y += eventpoll.h
211unifdef-y += signalfd.h 211unifdef-y += signalfd.h
212unifdef-y += ext2_fs.h 212unifdef-y += ext2_fs.h
213unifdef-y += fanotify.h
213unifdef-y += fb.h 214unifdef-y += fb.h
214unifdef-y += fcntl.h 215unifdef-y += fcntl.h
215unifdef-y += filter.h 216unifdef-y += filter.h
diff --git a/include/linux/dnotify.h b/include/linux/dnotify.h
index ecc06286226d..3290555a52ee 100644
--- a/include/linux/dnotify.h
+++ b/include/linux/dnotify.h
@@ -28,6 +28,7 @@ struct dnotify_struct {
28 FS_CREATE | FS_DN_RENAME |\ 28 FS_CREATE | FS_DN_RENAME |\
29 FS_MOVED_FROM | FS_MOVED_TO) 29 FS_MOVED_FROM | FS_MOVED_TO)
30 30
31extern int dir_notify_enable;
31extern void dnotify_flush(struct file *, fl_owner_t); 32extern void dnotify_flush(struct file *, fl_owner_t);
32extern int fcntl_dirnotify(int, struct file *, unsigned long); 33extern int fcntl_dirnotify(int, struct file *, unsigned long);
33 34
diff --git a/include/linux/fanotify.h b/include/linux/fanotify.h
new file mode 100644
index 000000000000..f0949a57ca9d
--- /dev/null
+++ b/include/linux/fanotify.h
@@ -0,0 +1,105 @@
1#ifndef _LINUX_FANOTIFY_H
2#define _LINUX_FANOTIFY_H
3
4#include <linux/types.h>
5
6/* the following are events that user-space can register for */
7#define FAN_ACCESS 0x00000001 /* File was accessed */
8#define FAN_MODIFY 0x00000002 /* File was modified */
9#define FAN_CLOSE_WRITE 0x00000008 /* Writable file closed */
10#define FAN_CLOSE_NOWRITE 0x00000010 /* Unwritable file closed */
11#define FAN_OPEN 0x00000020 /* File was opened */
12
13#define FAN_EVENT_ON_CHILD 0x08000000 /* interested in child events */
14
15/* FIXME currently Q's have no limit.... */
16#define FAN_Q_OVERFLOW 0x00004000 /* Event queued overflowed */
17
18#define FAN_OPEN_PERM 0x00010000 /* File open in perm check */
19#define FAN_ACCESS_PERM 0x00020000 /* File accessed in perm check */
20
21/* helper events */
22#define FAN_CLOSE (FAN_CLOSE_WRITE | FAN_CLOSE_NOWRITE) /* close */
23
24/* flags used for fanotify_init() */
25#define FAN_CLOEXEC 0x00000001
26#define FAN_NONBLOCK 0x00000002
27
28#define FAN_ALL_INIT_FLAGS (FAN_CLOEXEC | FAN_NONBLOCK)
29
30/* flags used for fanotify_modify_mark() */
31#define FAN_MARK_ADD 0x00000001
32#define FAN_MARK_REMOVE 0x00000002
33#define FAN_MARK_DONT_FOLLOW 0x00000004
34#define FAN_MARK_ONLYDIR 0x00000008
35#define FAN_MARK_MOUNT 0x00000010
36#define FAN_MARK_IGNORED_MASK 0x00000020
37#define FAN_MARK_IGNORED_SURV_MODIFY 0x00000040
38#define FAN_MARK_FLUSH 0x00000080
39
40#define FAN_ALL_MARK_FLAGS (FAN_MARK_ADD |\
41 FAN_MARK_REMOVE |\
42 FAN_MARK_DONT_FOLLOW |\
43 FAN_MARK_ONLYDIR |\
44 FAN_MARK_MOUNT |\
45 FAN_MARK_IGNORED_MASK |\
46 FAN_MARK_IGNORED_SURV_MODIFY)
47
48/*
49 * All of the events - we build the list by hand so that we can add flags in
50 * the future and not break backward compatibility. Apps will get only the
51 * events that they originally wanted. Be sure to add new events here!
52 */
53#define FAN_ALL_EVENTS (FAN_ACCESS |\
54 FAN_MODIFY |\
55 FAN_CLOSE |\
56 FAN_OPEN)
57
58/*
59 * All events which require a permission response from userspace
60 */
61#define FAN_ALL_PERM_EVENTS (FAN_OPEN_PERM |\
62 FAN_ACCESS_PERM)
63
64#define FAN_ALL_OUTGOING_EVENTS (FAN_ALL_EVENTS |\
65 FAN_ALL_PERM_EVENTS |\
66 FAN_Q_OVERFLOW)
67
68#define FANOTIFY_METADATA_VERSION 1
69
70struct fanotify_event_metadata {
71 __u32 event_len;
72 __u32 vers;
73 __s32 fd;
74 __u64 mask;
75 __s64 pid;
76} __attribute__ ((packed));
77
78struct fanotify_response {
79 __s32 fd;
80 __u32 response;
81} __attribute__ ((packed));
82
83/* Legit userspace responses to a _PERM event */
84#define FAN_ALLOW 0x01
85#define FAN_DENY 0x02
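
A userspace sketch (not part of this header) of answering a _PERM event; fanotify_fd and metadata are assumed to come from a read loop like the one shown after the helper macros below:

#include <unistd.h>

static int allow_access(int fanotify_fd,
			const struct fanotify_event_metadata *metadata)
{
	struct fanotify_response response = {
		.fd = metadata->fd,
		.response = FAN_ALLOW,	/* or FAN_DENY to refuse the access */
	};

	/* the verdict is delivered by writing the response back to the fd */
	if (write(fanotify_fd, &response, sizeof(response)) != sizeof(response))
		return -1;
	return 0;
}
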
86
87/* Helper functions to deal with fanotify_event_metadata buffers */
88#define FAN_EVENT_METADATA_LEN (sizeof(struct fanotify_event_metadata))
89
90#define FAN_EVENT_NEXT(meta, len) ((len) -= (meta)->event_len, \
91 (struct fanotify_event_metadata*)(((char *)(meta)) + \
92 (meta)->event_len))
93
94#define FAN_EVENT_OK(meta, len) ((long)(len) >= (long)FAN_EVENT_METADATA_LEN && \
95 (long)(meta)->event_len >= (long)FAN_EVENT_METADATA_LEN && \
96 (long)(meta)->event_len <= (long)(len))
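
A userspace sketch of draining events with the two macros above; how the fanotify fd is obtained (the new fanotify_init() syscall) is outside this header, everything else uses only definitions from this file:

#include <unistd.h>

static void drain_events(int fanotify_fd)
{
	char buf[4096];
	struct fanotify_event_metadata *metadata;
	ssize_t len;

	len = read(fanotify_fd, buf, sizeof(buf));
	metadata = (struct fanotify_event_metadata *)buf;

	while (FAN_EVENT_OK(metadata, len)) {
		if (metadata->vers != FANOTIFY_METADATA_VERSION)
			break;			/* kernel/userspace mismatch */
		/* overflow pseudo-events carry no usable fd */
		if (!(metadata->mask & FAN_Q_OVERFLOW))
			close(metadata->fd);	/* done with the opened fd */
		metadata = FAN_EVENT_NEXT(metadata, len);
	}
}
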
97
98#ifdef __KERNEL__
99
100struct fanotify_wait {
101 struct fsnotify_event *event;
102 __s32 fd;
103};
104#endif /* __KERNEL__ */
105#endif /* _LINUX_FANOTIFY_H */
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 9e221016a6a9..a8ccf85b8691 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -91,6 +91,9 @@ struct inodes_stat_t {
91/* Expect random access pattern */ 91/* Expect random access pattern */
92#define FMODE_RANDOM ((__force fmode_t)0x1000) 92#define FMODE_RANDOM ((__force fmode_t)0x1000)
93 93
94/* File was opened by fanotify and shouldn't generate fanotify events */
95#define FMODE_NONOTIFY ((__force fmode_t)16777216) /* 0x1000000 */
96
94/* 97/*
95 * The below are the various read and write types that we support. Some of 98 * The below are the various read and write types that we support. Some of
96 * them include behavioral modifiers that send information down to the 99 * them include behavioral modifiers that send information down to the
@@ -409,9 +412,6 @@ extern int get_max_files(void);
409extern int sysctl_nr_open; 412extern int sysctl_nr_open;
410extern struct inodes_stat_t inodes_stat; 413extern struct inodes_stat_t inodes_stat;
411extern int leases_enable, lease_break_time; 414extern int leases_enable, lease_break_time;
412#ifdef CONFIG_DNOTIFY
413extern int dir_notify_enable;
414#endif
415 415
416struct buffer_head; 416struct buffer_head;
417typedef int (get_block_t)(struct inode *inode, sector_t iblock, 417typedef int (get_block_t)(struct inode *inode, sector_t iblock,
@@ -772,12 +772,7 @@ struct inode {
772 772
773#ifdef CONFIG_FSNOTIFY 773#ifdef CONFIG_FSNOTIFY
774 __u32 i_fsnotify_mask; /* all events this inode cares about */ 774 __u32 i_fsnotify_mask; /* all events this inode cares about */
775 struct hlist_head i_fsnotify_mark_entries; /* fsnotify mark entries */ 775 struct hlist_head i_fsnotify_marks;
776#endif
777
778#ifdef CONFIG_INOTIFY
779 struct list_head inotify_watches; /* watches on this inode */
780 struct mutex inotify_mutex; /* protects the watches list */
781#endif 776#endif
782 777
783 unsigned long i_state; 778 unsigned long i_state;
@@ -2484,7 +2479,8 @@ int proc_nr_files(struct ctl_table *table, int write,
2484int __init get_filesystem_list(char *buf); 2479int __init get_filesystem_list(char *buf);
2485 2480
2486#define ACC_MODE(x) ("\004\002\006\006"[(x)&O_ACCMODE]) 2481#define ACC_MODE(x) ("\004\002\006\006"[(x)&O_ACCMODE])
2487#define OPEN_FMODE(flag) ((__force fmode_t)((flag + 1) & O_ACCMODE)) 2482#define OPEN_FMODE(flag) ((__force fmode_t)(((flag + 1) & O_ACCMODE) | \
2483 (flag & FMODE_NONOTIFY)))
2488 2484
2489#endif /* __KERNEL__ */ 2485#endif /* __KERNEL__ */
2490#endif /* _LINUX_FS_H */ 2486#endif /* _LINUX_FS_H */
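
The two macros in the final hunk are terse; a worked expansion (values read off the definitions themselves, with FMODE_READ/FMODE_WRITE being 0x1/0x2 and MAY_WRITE/MAY_READ being 2/4):

/*
 * ACC_MODE indexes a 4-byte lookup table with the two O_ACCMODE bits:
 *   O_RDONLY (0) -> \004 = MAY_READ
 *   O_WRONLY (1) -> \002 = MAY_WRITE
 *   O_RDWR   (2) -> \006 = MAY_READ | MAY_WRITE
 *   3 (invalid)  -> \006
 * OPEN_FMODE exploits (flag + 1) mapping the same two bits onto fmode bits:
 *   O_RDONLY (0) + 1 -> FMODE_READ
 *   O_WRONLY (1) + 1 -> FMODE_WRITE
 *   O_RDWR   (2) + 1 -> FMODE_READ | FMODE_WRITE
 * and now also carries FMODE_NONOTIFY through, so a file opened by
 * fanotify itself never feeds events back into fanotify.
 */
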
diff --git a/include/linux/fsnotify.h b/include/linux/fsnotify.h
index 01755909ce81..e4e2204187ee 100644
--- a/include/linux/fsnotify.h
+++ b/include/linux/fsnotify.h
@@ -11,8 +11,6 @@
11 * (C) Copyright 2005 Robert Love 11 * (C) Copyright 2005 Robert Love
12 */ 12 */
13 13
14#include <linux/dnotify.h>
15#include <linux/inotify.h>
16#include <linux/fsnotify_backend.h> 14#include <linux/fsnotify_backend.h>
17#include <linux/audit.h> 15#include <linux/audit.h>
18#include <linux/slab.h> 16#include <linux/slab.h>
@@ -21,35 +19,52 @@
21 * fsnotify_d_instantiate - instantiate a dentry for inode 19 * fsnotify_d_instantiate - instantiate a dentry for inode
22 * Called with dcache_lock held. 20 * Called with dcache_lock held.
23 */ 21 */
24static inline void fsnotify_d_instantiate(struct dentry *entry, 22static inline void fsnotify_d_instantiate(struct dentry *dentry,
25 struct inode *inode) 23 struct inode *inode)
26{ 24{
27 __fsnotify_d_instantiate(entry, inode); 25 __fsnotify_d_instantiate(dentry, inode);
28
29 inotify_d_instantiate(entry, inode);
30} 26}
31 27
32/* Notify this dentry's parent about a child's events. */ 28/* Notify this dentry's parent about a child's events. */
33static inline void fsnotify_parent(struct dentry *dentry, __u32 mask) 29static inline void fsnotify_parent(struct file *file, struct dentry *dentry, __u32 mask)
34{ 30{
35 __fsnotify_parent(dentry, mask); 31 if (!dentry)
32 dentry = file->f_path.dentry;
33
34 __fsnotify_parent(file, dentry, mask);
35}
36 36
37 inotify_dentry_parent_queue_event(dentry, mask, 0, dentry->d_name.name); 37/* simple call site for access decisions */
38static inline int fsnotify_perm(struct file *file, int mask)
39{
40 struct inode *inode = file->f_path.dentry->d_inode;
41 __u32 fsnotify_mask = 0;
42
43 if (file->f_mode & FMODE_NONOTIFY)
44 return 0;
45 if (!(mask & (MAY_READ | MAY_OPEN)))
46 return 0;
47 if (mask & MAY_OPEN)
48 fsnotify_mask = FS_OPEN_PERM;
49 else if (mask & MAY_READ)
50 fsnotify_mask = FS_ACCESS_PERM;
51 else
52 BUG();
53
54 return fsnotify(inode, fsnotify_mask, file, FSNOTIFY_EVENT_FILE, NULL, 0);
38} 55}
39 56
40/* 57/*
41 * fsnotify_d_move - entry has been moved 58 * fsnotify_d_move - dentry has been moved
42 * Called with dcache_lock and entry->d_lock held. 59 * Called with dcache_lock and dentry->d_lock held.
43 */ 60 */
44static inline void fsnotify_d_move(struct dentry *entry) 61static inline void fsnotify_d_move(struct dentry *dentry)
45{ 62{
46 /* 63 /*
47 * On move we need to update entry->d_flags to indicate if the new parent 64 * On move we need to update dentry->d_flags to indicate if the new parent
48 * cares about events from this entry. 65 * cares about events from this dentry.
49 */ 66 */
50 __fsnotify_update_dcache_flags(entry); 67 __fsnotify_update_dcache_flags(dentry);
51
52 inotify_d_move(entry);
53} 68}
54 69
55/* 70/*
@@ -57,8 +72,6 @@ static inline void fsnotify_d_move(struct dentry *entry)
57 */ 72 */
58static inline void fsnotify_link_count(struct inode *inode) 73static inline void fsnotify_link_count(struct inode *inode)
59{ 74{
60 inotify_inode_queue_event(inode, IN_ATTRIB, 0, NULL, NULL);
61
62 fsnotify(inode, FS_ATTRIB, inode, FSNOTIFY_EVENT_INODE, NULL, 0); 75 fsnotify(inode, FS_ATTRIB, inode, FSNOTIFY_EVENT_INODE, NULL, 0);
63} 76}
64 77
@@ -66,45 +79,31 @@ static inline void fsnotify_link_count(struct inode *inode)
66 * fsnotify_move - file old_name at old_dir was moved to new_name at new_dir 79 * fsnotify_move - file old_name at old_dir was moved to new_name at new_dir
67 */ 80 */
68static inline void fsnotify_move(struct inode *old_dir, struct inode *new_dir, 81static inline void fsnotify_move(struct inode *old_dir, struct inode *new_dir,
69 const char *old_name, 82 const unsigned char *old_name,
70 int isdir, struct inode *target, struct dentry *moved) 83 int isdir, struct inode *target, struct dentry *moved)
71{ 84{
72 struct inode *source = moved->d_inode; 85 struct inode *source = moved->d_inode;
73 u32 in_cookie = inotify_get_cookie();
74 u32 fs_cookie = fsnotify_get_cookie(); 86 u32 fs_cookie = fsnotify_get_cookie();
75 __u32 old_dir_mask = (FS_EVENT_ON_CHILD | FS_MOVED_FROM); 87 __u32 old_dir_mask = (FS_EVENT_ON_CHILD | FS_MOVED_FROM);
76 __u32 new_dir_mask = (FS_EVENT_ON_CHILD | FS_MOVED_TO); 88 __u32 new_dir_mask = (FS_EVENT_ON_CHILD | FS_MOVED_TO);
77 const char *new_name = moved->d_name.name; 89 const unsigned char *new_name = moved->d_name.name;
78 90
79 if (old_dir == new_dir) 91 if (old_dir == new_dir)
80 old_dir_mask |= FS_DN_RENAME; 92 old_dir_mask |= FS_DN_RENAME;
81 93
82 if (isdir) { 94 if (isdir) {
83 isdir = IN_ISDIR;
84 old_dir_mask |= FS_IN_ISDIR; 95 old_dir_mask |= FS_IN_ISDIR;
85 new_dir_mask |= FS_IN_ISDIR; 96 new_dir_mask |= FS_IN_ISDIR;
86 } 97 }
87 98
88 inotify_inode_queue_event(old_dir, IN_MOVED_FROM|isdir, in_cookie, old_name,
89 source);
90 inotify_inode_queue_event(new_dir, IN_MOVED_TO|isdir, in_cookie, new_name,
91 source);
92
93 fsnotify(old_dir, old_dir_mask, old_dir, FSNOTIFY_EVENT_INODE, old_name, fs_cookie); 99 fsnotify(old_dir, old_dir_mask, old_dir, FSNOTIFY_EVENT_INODE, old_name, fs_cookie);
94 fsnotify(new_dir, new_dir_mask, new_dir, FSNOTIFY_EVENT_INODE, new_name, fs_cookie); 100 fsnotify(new_dir, new_dir_mask, new_dir, FSNOTIFY_EVENT_INODE, new_name, fs_cookie);
95 101
96 if (target) { 102 if (target)
97 inotify_inode_queue_event(target, IN_DELETE_SELF, 0, NULL, NULL);
98 inotify_inode_is_dead(target);
99
100 /* this is really a link_count change not a removal */
101 fsnotify_link_count(target); 103 fsnotify_link_count(target);
102 }
103 104
104 if (source) { 105 if (source)
105 inotify_inode_queue_event(source, IN_MOVE_SELF, 0, NULL, NULL);
106 fsnotify(source, FS_MOVE_SELF, moved->d_inode, FSNOTIFY_EVENT_INODE, NULL, 0); 106 fsnotify(source, FS_MOVE_SELF, moved->d_inode, FSNOTIFY_EVENT_INODE, NULL, 0);
107 }
108 audit_inode_child(moved, new_dir); 107 audit_inode_child(moved, new_dir);
109} 108}
110 109
@@ -117,6 +116,14 @@ static inline void fsnotify_inode_delete(struct inode *inode)
117} 116}
118 117
119/* 118/*
119 * fsnotify_vfsmount_delete - a vfsmount is being destroyed, clean up is needed
120 */
121static inline void fsnotify_vfsmount_delete(struct vfsmount *mnt)
122{
123 __fsnotify_vfsmount_delete(mnt);
124}
125
126/*
120 * fsnotify_nameremove - a filename was removed from a directory 127 * fsnotify_nameremove - a filename was removed from a directory
121 */ 128 */
122static inline void fsnotify_nameremove(struct dentry *dentry, int isdir) 129static inline void fsnotify_nameremove(struct dentry *dentry, int isdir)
@@ -126,7 +133,7 @@ static inline void fsnotify_nameremove(struct dentry *dentry, int isdir)
126 if (isdir) 133 if (isdir)
127 mask |= FS_IN_ISDIR; 134 mask |= FS_IN_ISDIR;
128 135
129 fsnotify_parent(dentry, mask); 136 fsnotify_parent(NULL, dentry, mask);
130} 137}
131 138
132/* 139/*
@@ -134,9 +141,6 @@ static inline void fsnotify_nameremove(struct dentry *dentry, int isdir)
134 */ 141 */
135static inline void fsnotify_inoderemove(struct inode *inode) 142static inline void fsnotify_inoderemove(struct inode *inode)
136{ 143{
137 inotify_inode_queue_event(inode, IN_DELETE_SELF, 0, NULL, NULL);
138 inotify_inode_is_dead(inode);
139
140 fsnotify(inode, FS_DELETE_SELF, inode, FSNOTIFY_EVENT_INODE, NULL, 0); 144 fsnotify(inode, FS_DELETE_SELF, inode, FSNOTIFY_EVENT_INODE, NULL, 0);
141 __fsnotify_inode_delete(inode); 145 __fsnotify_inode_delete(inode);
142} 146}
@@ -146,8 +150,6 @@ static inline void fsnotify_inoderemove(struct inode *inode)
146 */ 150 */
147static inline void fsnotify_create(struct inode *inode, struct dentry *dentry) 151static inline void fsnotify_create(struct inode *inode, struct dentry *dentry)
148{ 152{
149 inotify_inode_queue_event(inode, IN_CREATE, 0, dentry->d_name.name,
150 dentry->d_inode);
151 audit_inode_child(dentry, inode); 153 audit_inode_child(dentry, inode);
152 154
153 fsnotify(inode, FS_CREATE, dentry->d_inode, FSNOTIFY_EVENT_INODE, dentry->d_name.name, 0); 155 fsnotify(inode, FS_CREATE, dentry->d_inode, FSNOTIFY_EVENT_INODE, dentry->d_name.name, 0);
@@ -160,8 +162,6 @@ static inline void fsnotify_create(struct inode *inode, struct dentry *dentry)
160 */ 162 */
161static inline void fsnotify_link(struct inode *dir, struct inode *inode, struct dentry *new_dentry) 163static inline void fsnotify_link(struct inode *dir, struct inode *inode, struct dentry *new_dentry)
162{ 164{
163 inotify_inode_queue_event(dir, IN_CREATE, 0, new_dentry->d_name.name,
164 inode);
165 fsnotify_link_count(inode); 165 fsnotify_link_count(inode);
166 audit_inode_child(new_dentry, dir); 166 audit_inode_child(new_dentry, dir);
167 167
@@ -176,7 +176,6 @@ static inline void fsnotify_mkdir(struct inode *inode, struct dentry *dentry)
176 __u32 mask = (FS_CREATE | FS_IN_ISDIR); 176 __u32 mask = (FS_CREATE | FS_IN_ISDIR);
177 struct inode *d_inode = dentry->d_inode; 177 struct inode *d_inode = dentry->d_inode;
178 178
179 inotify_inode_queue_event(inode, mask, 0, dentry->d_name.name, d_inode);
180 audit_inode_child(dentry, inode); 179 audit_inode_child(dentry, inode);
181 180
182 fsnotify(inode, mask, d_inode, FSNOTIFY_EVENT_INODE, dentry->d_name.name, 0); 181 fsnotify(inode, mask, d_inode, FSNOTIFY_EVENT_INODE, dentry->d_name.name, 0);
@@ -185,52 +184,52 @@ static inline void fsnotify_mkdir(struct inode *inode, struct dentry *dentry)
185/* 184/*
186 * fsnotify_access - file was read 185 * fsnotify_access - file was read
187 */ 186 */
188static inline void fsnotify_access(struct dentry *dentry) 187static inline void fsnotify_access(struct file *file)
189{ 188{
190 struct inode *inode = dentry->d_inode; 189 struct inode *inode = file->f_path.dentry->d_inode;
191 __u32 mask = FS_ACCESS; 190 __u32 mask = FS_ACCESS;
192 191
193 if (S_ISDIR(inode->i_mode)) 192 if (S_ISDIR(inode->i_mode))
194 mask |= FS_IN_ISDIR; 193 mask |= FS_IN_ISDIR;
195 194
196 inotify_inode_queue_event(inode, mask, 0, NULL, NULL); 195 if (!(file->f_mode & FMODE_NONOTIFY)) {
197 196 fsnotify_parent(file, NULL, mask);
198 fsnotify_parent(dentry, mask); 197 fsnotify(inode, mask, file, FSNOTIFY_EVENT_FILE, NULL, 0);
199 fsnotify(inode, mask, inode, FSNOTIFY_EVENT_INODE, NULL, 0); 198 }
200} 199}
201 200
202/* 201/*
203 * fsnotify_modify - file was modified 202 * fsnotify_modify - file was modified
204 */ 203 */
205static inline void fsnotify_modify(struct dentry *dentry) 204static inline void fsnotify_modify(struct file *file)
206{ 205{
207 struct inode *inode = dentry->d_inode; 206 struct inode *inode = file->f_path.dentry->d_inode;
208 __u32 mask = FS_MODIFY; 207 __u32 mask = FS_MODIFY;
209 208
210 if (S_ISDIR(inode->i_mode)) 209 if (S_ISDIR(inode->i_mode))
211 mask |= FS_IN_ISDIR; 210 mask |= FS_IN_ISDIR;
212 211
213 inotify_inode_queue_event(inode, mask, 0, NULL, NULL); 212 if (!(file->f_mode & FMODE_NONOTIFY)) {
214 213 fsnotify_parent(file, NULL, mask);
215 fsnotify_parent(dentry, mask); 214 fsnotify(inode, mask, file, FSNOTIFY_EVENT_FILE, NULL, 0);
216 fsnotify(inode, mask, inode, FSNOTIFY_EVENT_INODE, NULL, 0); 215 }
217} 216}
218 217
219/* 218/*
220 * fsnotify_open - file was opened 219 * fsnotify_open - file was opened
221 */ 220 */
222static inline void fsnotify_open(struct dentry *dentry) 221static inline void fsnotify_open(struct file *file)
223{ 222{
224 struct inode *inode = dentry->d_inode; 223 struct inode *inode = file->f_path.dentry->d_inode;
225 __u32 mask = FS_OPEN; 224 __u32 mask = FS_OPEN;
226 225
227 if (S_ISDIR(inode->i_mode)) 226 if (S_ISDIR(inode->i_mode))
228 mask |= FS_IN_ISDIR; 227 mask |= FS_IN_ISDIR;
229 228
230 inotify_inode_queue_event(inode, mask, 0, NULL, NULL); 229 if (!(file->f_mode & FMODE_NONOTIFY)) {
231 230 fsnotify_parent(file, NULL, mask);
232 fsnotify_parent(dentry, mask); 231 fsnotify(inode, mask, file, FSNOTIFY_EVENT_FILE, NULL, 0);
233 fsnotify(inode, mask, inode, FSNOTIFY_EVENT_INODE, NULL, 0); 232 }
234} 233}
235 234
236/* 235/*
@@ -238,18 +237,17 @@ static inline void fsnotify_open(struct dentry *dentry)
238 */ 237 */
239static inline void fsnotify_close(struct file *file) 238static inline void fsnotify_close(struct file *file)
240{ 239{
241 struct dentry *dentry = file->f_path.dentry; 240 struct inode *inode = file->f_path.dentry->d_inode;
242 struct inode *inode = dentry->d_inode;
243 fmode_t mode = file->f_mode; 241 fmode_t mode = file->f_mode;
244 __u32 mask = (mode & FMODE_WRITE) ? FS_CLOSE_WRITE : FS_CLOSE_NOWRITE; 242 __u32 mask = (mode & FMODE_WRITE) ? FS_CLOSE_WRITE : FS_CLOSE_NOWRITE;
245 243
246 if (S_ISDIR(inode->i_mode)) 244 if (S_ISDIR(inode->i_mode))
247 mask |= FS_IN_ISDIR; 245 mask |= FS_IN_ISDIR;
248 246
249 inotify_inode_queue_event(inode, mask, 0, NULL, NULL); 247 if (!(file->f_mode & FMODE_NONOTIFY)) {
250 248 fsnotify_parent(file, NULL, mask);
251 fsnotify_parent(dentry, mask); 249 fsnotify(inode, mask, file, FSNOTIFY_EVENT_FILE, NULL, 0);
252 fsnotify(inode, mask, file, FSNOTIFY_EVENT_FILE, NULL, 0); 250 }
253} 251}
254 252
255/* 253/*
@@ -263,9 +261,7 @@ static inline void fsnotify_xattr(struct dentry *dentry)
263 if (S_ISDIR(inode->i_mode)) 261 if (S_ISDIR(inode->i_mode))
264 mask |= FS_IN_ISDIR; 262 mask |= FS_IN_ISDIR;
265 263
266 inotify_inode_queue_event(inode, mask, 0, NULL, NULL); 264 fsnotify_parent(NULL, dentry, mask);
267
268 fsnotify_parent(dentry, mask);
269 fsnotify(inode, mask, inode, FSNOTIFY_EVENT_INODE, NULL, 0); 265 fsnotify(inode, mask, inode, FSNOTIFY_EVENT_INODE, NULL, 0);
270} 266}
271 267
@@ -299,19 +295,18 @@ static inline void fsnotify_change(struct dentry *dentry, unsigned int ia_valid)
299 if (mask) { 295 if (mask) {
300 if (S_ISDIR(inode->i_mode)) 296 if (S_ISDIR(inode->i_mode))
301 mask |= FS_IN_ISDIR; 297 mask |= FS_IN_ISDIR;
302 inotify_inode_queue_event(inode, mask, 0, NULL, NULL);
303 298
304 fsnotify_parent(dentry, mask); 299 fsnotify_parent(NULL, dentry, mask);
305 fsnotify(inode, mask, inode, FSNOTIFY_EVENT_INODE, NULL, 0); 300 fsnotify(inode, mask, inode, FSNOTIFY_EVENT_INODE, NULL, 0);
306 } 301 }
307} 302}
308 303
309#if defined(CONFIG_INOTIFY) || defined(CONFIG_FSNOTIFY) /* notify helpers */ 304#if defined(CONFIG_FSNOTIFY) /* notify helpers */
310 305
311/* 306/*
312 * fsnotify_oldname_init - save off the old filename before we change it 307 * fsnotify_oldname_init - save off the old filename before we change it
313 */ 308 */
314static inline const char *fsnotify_oldname_init(const char *name) 309static inline const unsigned char *fsnotify_oldname_init(const unsigned char *name)
315{ 310{
316 return kstrdup(name, GFP_KERNEL); 311 return kstrdup(name, GFP_KERNEL);
317} 312}
@@ -319,22 +314,22 @@ static inline const char *fsnotify_oldname_init(const char *name)
319/* 314/*
320 * fsnotify_oldname_free - free the name we got from fsnotify_oldname_init 315 * fsnotify_oldname_free - free the name we got from fsnotify_oldname_init
321 */ 316 */
322static inline void fsnotify_oldname_free(const char *old_name) 317static inline void fsnotify_oldname_free(const unsigned char *old_name)
323{ 318{
324 kfree(old_name); 319 kfree(old_name);
325} 320}
326 321
327#else /* CONFIG_INOTIFY || CONFIG_FSNOTIFY */ 322#else /* CONFIG_FSNOTIFY */
328 323
329static inline const char *fsnotify_oldname_init(const char *name) 324static inline const char *fsnotify_oldname_init(const unsigned char *name)
330{ 325{
331 return NULL; 326 return NULL;
332} 327}
333 328
334static inline void fsnotify_oldname_free(const char *old_name) 329static inline void fsnotify_oldname_free(const unsigned char *old_name)
335{ 330{
336} 331}
337 332
338#endif /* ! CONFIG_INOTIFY */ 333#endif /* CONFIG_FSNOTIFY */
339 334
340#endif /* _LINUX_FS_NOTIFY_H */ 335#endif /* _LINUX_FS_NOTIFY_H */
diff --git a/include/linux/fsnotify_backend.h b/include/linux/fsnotify_backend.h
index 4d6f47b51189..9bbfd7204b04 100644
--- a/include/linux/fsnotify_backend.h
+++ b/include/linux/fsnotify_backend.h
@@ -41,6 +41,10 @@
41#define FS_Q_OVERFLOW 0x00004000 /* Event queued overflowed */ 41#define FS_Q_OVERFLOW 0x00004000 /* Event queued overflowed */
42#define FS_IN_IGNORED 0x00008000 /* last inotify event here */ 42#define FS_IN_IGNORED 0x00008000 /* last inotify event here */
43 43
44#define FS_OPEN_PERM 0x00010000 /* open event in a permission hook */
45#define FS_ACCESS_PERM 0x00020000 /* access event in a permission hook */
46
47#define FS_EXCL_UNLINK 0x04000000 /* do not send events if object is unlinked */
44#define FS_IN_ISDIR 0x40000000 /* event occurred against dir */ 48#define FS_IN_ISDIR 0x40000000 /* event occurred against dir */
45#define FS_IN_ONESHOT 0x80000000 /* only send event once */ 49#define FS_IN_ONESHOT 0x80000000 /* only send event once */
46 50
@@ -58,13 +62,20 @@
58 FS_MOVED_FROM | FS_MOVED_TO | FS_CREATE |\ 62 FS_MOVED_FROM | FS_MOVED_TO | FS_CREATE |\
59 FS_DELETE) 63 FS_DELETE)
60 64
61/* listeners that hard code group numbers near the top */ 65#define FS_MOVE (FS_MOVED_FROM | FS_MOVED_TO)
62#define DNOTIFY_GROUP_NUM UINT_MAX 66
63#define INOTIFY_GROUP_NUM (DNOTIFY_GROUP_NUM-1) 67#define ALL_FSNOTIFY_EVENTS (FS_ACCESS | FS_MODIFY | FS_ATTRIB | \
68 FS_CLOSE_WRITE | FS_CLOSE_NOWRITE | FS_OPEN | \
69 FS_MOVED_FROM | FS_MOVED_TO | FS_CREATE | \
70 FS_DELETE | FS_DELETE_SELF | FS_MOVE_SELF | \
71 FS_UNMOUNT | FS_Q_OVERFLOW | FS_IN_IGNORED | \
72 FS_OPEN_PERM | FS_ACCESS_PERM | FS_EXCL_UNLINK | \
73 FS_IN_ISDIR | FS_IN_ONESHOT | FS_DN_RENAME | \
74 FS_DN_MULTISHOT | FS_EVENT_ON_CHILD)
64 75
65struct fsnotify_group; 76struct fsnotify_group;
66struct fsnotify_event; 77struct fsnotify_event;
67struct fsnotify_mark_entry; 78struct fsnotify_mark;
68struct fsnotify_event_private_data; 79struct fsnotify_event_private_data;
69 80
70/* 81/*
@@ -80,10 +91,16 @@ struct fsnotify_event_private_data;
80 * valid group and inode to use to clean up. 91 * valid group and inode to use to clean up.
81 */ 92 */
82struct fsnotify_ops { 93struct fsnotify_ops {
83 bool (*should_send_event)(struct fsnotify_group *group, struct inode *inode, __u32 mask); 94 bool (*should_send_event)(struct fsnotify_group *group, struct inode *inode,
84 int (*handle_event)(struct fsnotify_group *group, struct fsnotify_event *event); 95 struct fsnotify_mark *inode_mark,
96 struct fsnotify_mark *vfsmount_mark,
97 __u32 mask, void *data, int data_type);
98 int (*handle_event)(struct fsnotify_group *group,
99 struct fsnotify_mark *inode_mark,
100 struct fsnotify_mark *vfsmount_mark,
101 struct fsnotify_event *event);
85 void (*free_group_priv)(struct fsnotify_group *group); 102 void (*free_group_priv)(struct fsnotify_group *group);
86 void (*freeing_mark)(struct fsnotify_mark_entry *entry, struct fsnotify_group *group); 103 void (*freeing_mark)(struct fsnotify_mark *mark, struct fsnotify_group *group);
87 void (*free_event_priv)(struct fsnotify_event_private_data *priv); 104 void (*free_event_priv)(struct fsnotify_event_private_data *priv);
88}; 105};
89 106
@@ -95,22 +112,6 @@ struct fsnotify_ops {
95 */ 112 */
96struct fsnotify_group { 113struct fsnotify_group {
97 /* 114 /*
98 * global list of all groups receiving events from fsnotify.
99 * anchored by fsnotify_groups and protected by either fsnotify_grp_mutex
100 * or fsnotify_grp_srcu depending on write vs read.
101 */
102 struct list_head group_list;
103
104 /*
105 * Defines all of the event types in which this group is interested.
106 * This mask is a bitwise OR of the FS_* events from above. Each time
107 * this mask changes for a group (if it changes) the correct functions
108 * must be called to update the global structures which indicate global
109 * interest in event types.
110 */
111 __u32 mask;
112
113 /*
114 * How the refcnt is used is up to each group. When the refcnt hits 0 115 * How the refcnt is used is up to each group. When the refcnt hits 0
115 * fsnotify will clean up all of the resources associated with this group. 116 * fsnotify will clean up all of the resources associated with this group.
116 * As an example, the dnotify group will always have a refcnt=1 and that 117 * As an example, the dnotify group will always have a refcnt=1 and that
@@ -119,7 +120,6 @@ struct fsnotify_group {
119 * closed. 120 * closed.
120 */ 121 */
121 atomic_t refcnt; /* things with interest in this group */ 122 atomic_t refcnt; /* things with interest in this group */
122 unsigned int group_num; /* simply prevents accidental group collision */
123 123
124 const struct fsnotify_ops *ops; /* how this group handles things */ 124 const struct fsnotify_ops *ops; /* how this group handles things */
125 125
@@ -130,15 +130,12 @@ struct fsnotify_group {
130 unsigned int q_len; /* events on the queue */ 130 unsigned int q_len; /* events on the queue */
131 unsigned int max_events; /* maximum events allowed on the list */ 131 unsigned int max_events; /* maximum events allowed on the list */
132 132
133 /* stores all fastapth entries assoc with this group so they can be cleaned on unregister */ 133 /* stores all fastpath marks assoc with this group so they can be cleaned on unregister */
134 spinlock_t mark_lock; /* protect mark_entries list */ 134 spinlock_t mark_lock; /* protect marks_list */
135 atomic_t num_marks; /* 1 for each mark entry and 1 for not being 135 atomic_t num_marks; /* 1 for each mark and 1 for not being
136 * past the point of no return when freeing 136 * past the point of no return when freeing
137 * a group */ 137 * a group */
138 struct list_head mark_entries; /* all inode mark entries for this group */ 138 struct list_head marks_list; /* all inode marks for this group */
139
140 /* prevents double list_del of group_list. protected by global fsnotify_grp_mutex */
141 bool on_group_list;
142 139
143 /* groups can define private fields here or use the void *private */ 140 /* groups can define private fields here or use the void *private */
144 union { 141 union {
@@ -152,6 +149,17 @@ struct fsnotify_group {
152 struct user_struct *user; 149 struct user_struct *user;
153 } inotify_data; 150 } inotify_data;
154#endif 151#endif
152#ifdef CONFIG_FANOTIFY
153 struct fanotify_group_private_data {
154#ifdef CONFIG_FANOTIFY_ACCESS_PERMISSIONS
155 /* allows a group to block waiting for a userspace response */
156 struct mutex access_mutex;
157 struct list_head access_list;
158 wait_queue_head_t access_waitq;
159#endif /* CONFIG_FANOTIFY_ACCESS_PERMISSIONS */
160 int f_flags;
161 } fanotify_data;
162#endif /* CONFIG_FANOTIFY */
155 }; 163 };
156}; 164};
157 165
@@ -195,35 +203,57 @@ struct fsnotify_event {
195 /* to_tell may ONLY be dereferenced during handle_event(). */ 203 /* to_tell may ONLY be dereferenced during handle_event(). */
196 struct inode *to_tell; /* either the inode the event happened to or its parent */ 204 struct inode *to_tell; /* either the inode the event happened to or its parent */
197 /* 205 /*
198 * depending on the event type we should have either a path or inode 206 * depending on the event type we should have either a file or inode
199 * We hold a reference on path, but NOT on inode. Since we have the ref on 207 * We hold a reference on file, but NOT on inode. Since we have the ref on
200 * the path, it may be dereferenced at any point during this object's 208 * the file, it may be dereferenced at any point during this object's
201 * lifetime. That reference is dropped when this object's refcnt hits 209 * lifetime. That reference is dropped when this object's refcnt hits
202 * 0. If this event contains an inode instead of a path, the inode may 210 * 0. If this event contains an inode instead of a file, the inode may
203 * ONLY be used during handle_event(). 211 * ONLY be used during handle_event().
204 */ 212 */
205 union { 213 union {
206 struct path path; 214 struct file *file;
207 struct inode *inode; 215 struct inode *inode;
208 }; 216 };
209/* when calling fsnotify tell it if the data is a path or inode */ 217/* when calling fsnotify tell it if the data is a file or inode */
210#define FSNOTIFY_EVENT_NONE 0 218#define FSNOTIFY_EVENT_NONE 0
211#define FSNOTIFY_EVENT_PATH 1 219#define FSNOTIFY_EVENT_FILE 1
212#define FSNOTIFY_EVENT_INODE 2 220#define FSNOTIFY_EVENT_INODE 2
213#define FSNOTIFY_EVENT_FILE 3
214 int data_type; /* which of the above union we have */ 221 int data_type; /* which of the above union we have */
215 atomic_t refcnt; /* how many groups still are using/need to send this event */ 222 atomic_t refcnt; /* how many groups still are using/need to send this event */
216 __u32 mask; /* the type of access, bitwise OR for FS_* event types */ 223 __u32 mask; /* the type of access, bitwise OR for FS_* event types */
217 224
218 u32 sync_cookie; /* used to correlate events, namely inotify mv events */ 225 u32 sync_cookie; /* used to correlate events, namely inotify mv events */
219 char *file_name; 226 const unsigned char *file_name;
220 size_t name_len; 227 size_t name_len;
228 struct pid *tgid;
229
230#ifdef CONFIG_FANOTIFY_ACCESS_PERMISSIONS
231 __u32 response; /* userspace answer to question */
232#endif /* CONFIG_FANOTIFY_ACCESS_PERMISSIONS */
221 233
222 struct list_head private_data_list; /* groups can store private data here */ 234 struct list_head private_data_list; /* groups can store private data here */
223}; 235};
224 236
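Since only one union member is valid at a time, a consumer is expected to switch on data_type before touching the union. A minimal sketch (the demo function and pr_debug lines are illustrative, not from the patch):

static void demo_report(struct fsnotify_event *event)
{
	switch (event->data_type) {
	case FSNOTIFY_EVENT_FILE:
		/* the event holds a file reference for its whole lifetime */
		pr_debug("event on %s\n",
			 event->file->f_path.dentry->d_name.name);
		break;
	case FSNOTIFY_EVENT_INODE:
		/* no reference held: only safe inside handle_event() */
		pr_debug("event on inode %lu\n", event->inode->i_ino);
		break;
	}
}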
225/* 237/*
226 * a mark is simply an entry attached to an in core inode which allows an 238 * Inode specific fields in an fsnotify_mark
239 */
240struct fsnotify_inode_mark {
241 struct inode *inode; /* inode this mark is associated with */
242 struct hlist_node i_list; /* list of marks by inode->i_fsnotify_marks */
243 struct list_head free_i_list; /* tmp list used when freeing this mark */
244};
245
246/*
247 * Mount point specific fields in an fsnotify_mark
248 */
249struct fsnotify_vfsmount_mark {
250 struct vfsmount *mnt; /* vfsmount this mark is associated with */
251 struct hlist_node m_list; /* list of marks by mnt->mnt_fsnotify_marks */
252 struct list_head free_m_list; /* tmp list used when freeing this mark */
253};
254
255/*
256 * a mark is simply an object attached to an in core inode which allows an
227 * fsnotify listener to indicate they are either no longer interested in events 257 * fsnotify listener to indicate they are either no longer interested in events
228 * of a type matching mask or only interested in those events. 258 * of a type matching mask or only interested in those events.
229 * 259 *
@@ -232,19 +262,28 @@ struct fsnotify_event {
232 * (such as dnotify) will flush these when the open fd is closed and not at 262 * (such as dnotify) will flush these when the open fd is closed and not at
233 * inode eviction or modification. 263 * inode eviction or modification.
234 */ 264 */
235struct fsnotify_mark_entry { 265struct fsnotify_mark {
236 __u32 mask; /* mask this mark entry is for */ 266 __u32 mask; /* mask this mark is for */
237 /* we hold ref for each i_list and g_list. also one ref for each 'thing' 267 /* we hold ref for each i_list and g_list. also one ref for each 'thing'
238 * in kernel that found and may be using this mark. */ 268 * in kernel that found and may be using this mark. */
239 atomic_t refcnt; /* active things looking at this mark */ 269 atomic_t refcnt; /* active things looking at this mark */
240 struct inode *inode; /* inode this entry is associated with */ 270 struct fsnotify_group *group; /* group this mark is for */
241 struct fsnotify_group *group; /* group this mark entry is for */ 271 struct list_head g_list; /* list of marks by group->marks_list */
242 struct hlist_node i_list; /* list of mark_entries by inode->i_fsnotify_mark_entries */ 272 spinlock_t lock; /* protect group and inode */
243 struct list_head g_list; /* list of mark_entries by group->i_fsnotify_mark_entries */ 273 union {
244 spinlock_t lock; /* protect group, inode, and killme */ 274 struct fsnotify_inode_mark i;
245 struct list_head free_i_list; /* tmp list used when freeing this mark */ 275 struct fsnotify_vfsmount_mark m;
276 };
277 __u32 ignored_mask; /* event types to ignore */
246 struct list_head free_g_list; /* tmp list used when freeing this mark */ 278 struct list_head free_g_list; /* tmp list used when freeing this mark */
247 void (*free_mark)(struct fsnotify_mark_entry *entry); /* called on final put+free */ 279#define FSNOTIFY_MARK_FLAG_INODE 0x01
280#define FSNOTIFY_MARK_FLAG_VFSMOUNT 0x02
281#define FSNOTIFY_MARK_FLAG_OBJECT_PINNED 0x04
282#define FSNOTIFY_MARK_FLAG_IGNORED_SURV_MODIFY 0x08
283#define FSNOTIFY_MARK_FLAG_ALIVE 0x10
284 unsigned int flags; /* vfsmount or inode mark? */
285 struct list_head destroy_list;
286 void (*free_mark)(struct fsnotify_mark *mark); /* called on final put+free */
248}; 287};
249 288
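The flags field is what tells you which union member is live. A hedged helper sketch (the function is made up for illustration):

static struct inode *demo_mark_inode(struct fsnotify_mark *mark)
{
	if (mark->flags & FSNOTIFY_MARK_FLAG_INODE)
		return mark->i.inode;
	/* a vfsmount mark carries mark->m.mnt instead */
	return NULL;
}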
250#ifdef CONFIG_FSNOTIFY 289#ifdef CONFIG_FSNOTIFY
@@ -252,10 +291,11 @@ struct fsnotify_mark_entry {
252/* called from the vfs helpers */ 291/* called from the vfs helpers */
253 292
254/* main fsnotify call to send events */ 293/* main fsnotify call to send events */
255extern void fsnotify(struct inode *to_tell, __u32 mask, void *data, int data_is, 294extern int fsnotify(struct inode *to_tell, __u32 mask, void *data, int data_is,
256 const char *name, u32 cookie); 295 const unsigned char *name, u32 cookie);
257extern void __fsnotify_parent(struct dentry *dentry, __u32 mask); 296extern void __fsnotify_parent(struct file *file, struct dentry *dentry, __u32 mask);
258extern void __fsnotify_inode_delete(struct inode *inode); 297extern void __fsnotify_inode_delete(struct inode *inode);
298extern void __fsnotify_vfsmount_delete(struct vfsmount *mnt);
259extern u32 fsnotify_get_cookie(void); 299extern u32 fsnotify_get_cookie(void);
260 300
261static inline int fsnotify_inode_watches_children(struct inode *inode) 301static inline int fsnotify_inode_watches_children(struct inode *inode)
@@ -304,15 +344,9 @@ static inline void __fsnotify_d_instantiate(struct dentry *dentry, struct inode
304 344
305/* called from fsnotify listeners, such as fanotify or dnotify */ 345/* called from fsnotify listeners, such as fanotify or dnotify */
306 346
307/* must call when a group changes its ->mask */
308extern void fsnotify_recalc_global_mask(void);
309/* get a reference to an existing or create a new group */ 347/* get a reference to an existing or create a new group */
310extern struct fsnotify_group *fsnotify_obtain_group(unsigned int group_num, 348extern struct fsnotify_group *fsnotify_alloc_group(const struct fsnotify_ops *ops);
311 __u32 mask, 349/* drop reference on a group from fsnotify_alloc_group */
312 const struct fsnotify_ops *ops);
313/* run all marks associated with this group and update group->mask */
314extern void fsnotify_recalc_group_mask(struct fsnotify_group *group);
315/* drop reference on a group from fsnotify_obtain_group */
316extern void fsnotify_put_group(struct fsnotify_group *group); 350extern void fsnotify_put_group(struct fsnotify_group *group);
317 351
318/* take a reference to an event */ 352/* take a reference to an event */
@@ -323,8 +357,11 @@ extern struct fsnotify_event_private_data *fsnotify_remove_priv_from_event(struc
323 struct fsnotify_event *event); 357 struct fsnotify_event *event);
324 358
325/* attach the event to the group notification queue */ 359/* attach the event to the group notification queue */
326extern int fsnotify_add_notify_event(struct fsnotify_group *group, struct fsnotify_event *event, 360extern struct fsnotify_event *fsnotify_add_notify_event(struct fsnotify_group *group,
327 struct fsnotify_event_private_data *priv); 361 struct fsnotify_event *event,
362 struct fsnotify_event_private_data *priv,
363 struct fsnotify_event *(*merge)(struct list_head *,
364 struct fsnotify_event *));
328/* true if the group notification queue is empty */ 365/* true if the group notification queue is empty */
329extern bool fsnotify_notify_queue_is_empty(struct fsnotify_group *group); 366extern bool fsnotify_notify_queue_is_empty(struct fsnotify_group *group);
330/* return, but do not dequeue the first event on the notification queue */ 367/* return, but do not dequeue the first event on the notification queue */
@@ -334,38 +371,66 @@ extern struct fsnotify_event *fsnotify_remove_notify_event(struct fsnotify_group
334 371
335/* functions used to manipulate the marks attached to inodes */ 372/* functions used to manipulate the marks attached to inodes */
336 373
374/* run all marks associated with a vfsmount and update mnt->mnt_fsnotify_mask */
375extern void fsnotify_recalc_vfsmount_mask(struct vfsmount *mnt);
337/* run all marks associated with an inode and update inode->i_fsnotify_mask */ 376/* run all marks associated with an inode and update inode->i_fsnotify_mask */
338extern void fsnotify_recalc_inode_mask(struct inode *inode); 377extern void fsnotify_recalc_inode_mask(struct inode *inode);
339extern void fsnotify_init_mark(struct fsnotify_mark_entry *entry, void (*free_mark)(struct fsnotify_mark_entry *entry)); 378extern void fsnotify_init_mark(struct fsnotify_mark *mark, void (*free_mark)(struct fsnotify_mark *mark));
340/* find (and take a reference to) a mark associated with group and inode */ 379/* find (and take a reference to) a mark associated with group and inode */
341extern struct fsnotify_mark_entry *fsnotify_find_mark_entry(struct fsnotify_group *group, struct inode *inode); 380extern struct fsnotify_mark *fsnotify_find_inode_mark(struct fsnotify_group *group, struct inode *inode);
381/* find (and take a reference to) a mark associated with group and vfsmount */
382extern struct fsnotify_mark *fsnotify_find_vfsmount_mark(struct fsnotify_group *group, struct vfsmount *mnt);
383/* copy the values from old into new */
384extern void fsnotify_duplicate_mark(struct fsnotify_mark *new, struct fsnotify_mark *old);
385/* set the ignored_mask of a mark */
386extern void fsnotify_set_mark_ignored_mask_locked(struct fsnotify_mark *mark, __u32 mask);
387/* set the mask of a mark (might pin the object into memory) */
388extern void fsnotify_set_mark_mask_locked(struct fsnotify_mark *mark, __u32 mask);
342/* attach the mark to both the group and the inode */ 389/* attach the mark to both the group and the inode */
343extern int fsnotify_add_mark(struct fsnotify_mark_entry *entry, struct fsnotify_group *group, struct inode *inode); 390extern int fsnotify_add_mark(struct fsnotify_mark *mark, struct fsnotify_group *group,
391 struct inode *inode, struct vfsmount *mnt, int allow_dups);
344/* given a mark, flag it to be freed when all references are dropped */ 392/* given a mark, flag it to be freed when all references are dropped */
345extern void fsnotify_destroy_mark_by_entry(struct fsnotify_mark_entry *entry); 393extern void fsnotify_destroy_mark(struct fsnotify_mark *mark);
394/* run all the marks in a group, and clear all of the vfsmount marks */
395extern void fsnotify_clear_vfsmount_marks_by_group(struct fsnotify_group *group);
396/* run all the marks in a group, and clear all of the inode marks */
397extern void fsnotify_clear_inode_marks_by_group(struct fsnotify_group *group);
398/* run all the marks in a group, and clear all of the marks where mark->flags & flags is true */
399extern void fsnotify_clear_marks_by_group_flags(struct fsnotify_group *group, unsigned int flags);
346/* run all the marks in a group, and flag them to be freed */ 400/* run all the marks in a group, and flag them to be freed */
347extern void fsnotify_clear_marks_by_group(struct fsnotify_group *group); 401extern void fsnotify_clear_marks_by_group(struct fsnotify_group *group);
348extern void fsnotify_get_mark(struct fsnotify_mark_entry *entry); 402extern void fsnotify_get_mark(struct fsnotify_mark *mark);
349extern void fsnotify_put_mark(struct fsnotify_mark_entry *entry); 403extern void fsnotify_put_mark(struct fsnotify_mark *mark);
350extern void fsnotify_unmount_inodes(struct list_head *list); 404extern void fsnotify_unmount_inodes(struct list_head *list);
351 405
352/* put here because inotify does some weird stuff when destroying watches */ 406/* put here because inotify does some weird stuff when destroying watches */
353extern struct fsnotify_event *fsnotify_create_event(struct inode *to_tell, __u32 mask, 407extern struct fsnotify_event *fsnotify_create_event(struct inode *to_tell, __u32 mask,
354 void *data, int data_is, const char *name, 408 void *data, int data_is,
409 const unsigned char *name,
355 u32 cookie, gfp_t gfp); 410 u32 cookie, gfp_t gfp);
356 411
412/* fanotify likes to change events after they are on lists... */
413extern struct fsnotify_event *fsnotify_clone_event(struct fsnotify_event *old_event);
414extern int fsnotify_replace_event(struct fsnotify_event_holder *old_holder,
415 struct fsnotify_event *new_event);
416
357#else 417#else
358 418
359static inline void fsnotify(struct inode *to_tell, __u32 mask, void *data, int data_is, 419static inline int fsnotify(struct inode *to_tell, __u32 mask, void *data, int data_is,
360 const char *name, u32 cookie) 420 const unsigned char *name, u32 cookie)
361{} 421{
422 return 0;
423}
362 424
363static inline void __fsnotify_parent(struct dentry *dentry, __u32 mask) 425static inline void __fsnotify_parent(struct file *file, struct dentry *dentry, __u32 mask)
364{} 426{}
365 427
366static inline void __fsnotify_inode_delete(struct inode *inode) 428static inline void __fsnotify_inode_delete(struct inode *inode)
367{} 429{}
368 430
431static inline void __fsnotify_vfsmount_delete(struct vfsmount *mnt)
432{}
433
369static inline void __fsnotify_update_dcache_flags(struct dentry *dentry) 434static inline void __fsnotify_update_dcache_flags(struct dentry *dentry)
370{} 435{}
371 436
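Note the widened fsnotify_add_mark() signature above: exactly one of inode/mnt should be non-NULL, selecting the object type, and allow_dups controls whether a group may place a second mark on the same object. A minimal sketch of the calling convention (the wrapper name is hypothetical):

static int demo_add_mark(struct fsnotify_mark *mark,
			 struct fsnotify_group *group,
			 struct inode *inode, struct vfsmount *mnt)
{
	/* inode mark:    fsnotify_add_mark(mark, group, inode, NULL, 0)
	 * vfsmount mark: fsnotify_add_mark(mark, group, NULL, mnt, 0)  */
	return fsnotify_add_mark(mark, group, inode, mnt, 0);
}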
diff --git a/include/linux/inotify.h b/include/linux/inotify.h
index 37ea2894b3c0..d33041e2a42a 100644
--- a/include/linux/inotify.h
+++ b/include/linux/inotify.h
@@ -51,6 +51,7 @@ struct inotify_event {
51/* special flags */ 51/* special flags */
52#define IN_ONLYDIR 0x01000000 /* only watch the path if it is a directory */ 52#define IN_ONLYDIR 0x01000000 /* only watch the path if it is a directory */
53#define IN_DONT_FOLLOW 0x02000000 /* don't follow a sym link */ 53#define IN_DONT_FOLLOW 0x02000000 /* don't follow a sym link */
54#define IN_EXCL_UNLINK 0x04000000 /* exclude events on unlinked objects */
54#define IN_MASK_ADD 0x20000000 /* add to the mask of an already existing watch */ 55#define IN_MASK_ADD 0x20000000 /* add to the mask of an already existing watch */
55#define IN_ISDIR 0x40000000 /* event occurred against dir */ 56#define IN_ISDIR 0x40000000 /* event occurred against dir */
56#define IN_ONESHOT 0x80000000 /* only send event once */ 57#define IN_ONESHOT 0x80000000 /* only send event once */
@@ -70,177 +71,17 @@ struct inotify_event {
70#define IN_NONBLOCK O_NONBLOCK 71#define IN_NONBLOCK O_NONBLOCK
71 72
72#ifdef __KERNEL__ 73#ifdef __KERNEL__
73 74#include <linux/sysctl.h>
74#include <linux/dcache.h> 75extern struct ctl_table inotify_table[]; /* for sysctl */
75#include <linux/fs.h> 76
76 77#define ALL_INOTIFY_BITS (IN_ACCESS | IN_MODIFY | IN_ATTRIB | IN_CLOSE_WRITE | \
77/* 78 IN_CLOSE_NOWRITE | IN_OPEN | IN_MOVED_FROM | \
78 * struct inotify_watch - represents a watch request on a specific inode 79 IN_MOVED_TO | IN_CREATE | IN_DELETE | \
79 * 80 IN_DELETE_SELF | IN_MOVE_SELF | IN_UNMOUNT | \
80 * h_list is protected by ih->mutex of the associated inotify_handle. 81 IN_Q_OVERFLOW | IN_IGNORED | IN_ONLYDIR | \
81 * i_list, mask are protected by inode->inotify_mutex of the associated inode. 82 IN_DONT_FOLLOW | IN_EXCL_UNLINK | IN_MASK_ADD | \
82 * ih, inode, and wd are never written to once the watch is created. 83 IN_ISDIR | IN_ONESHOT)
83 * 84
84 * Callers must use the established inotify interfaces to access inotify_watch 85#endif
85 * contents. The content of this structure is private to the inotify
86 * implementation.
87 */
88struct inotify_watch {
89 struct list_head h_list; /* entry in inotify_handle's list */
90 struct list_head i_list; /* entry in inode's list */
91 atomic_t count; /* reference count */
92 struct inotify_handle *ih; /* associated inotify handle */
93 struct inode *inode; /* associated inode */
94 __s32 wd; /* watch descriptor */
95 __u32 mask; /* event mask for this watch */
96};
97
98struct inotify_operations {
99 void (*handle_event)(struct inotify_watch *, u32, u32, u32,
100 const char *, struct inode *);
101 void (*destroy_watch)(struct inotify_watch *);
102};
103
104#ifdef CONFIG_INOTIFY
105
106/* Kernel API for producing events */
107
108extern void inotify_d_instantiate(struct dentry *, struct inode *);
109extern void inotify_d_move(struct dentry *);
110extern void inotify_inode_queue_event(struct inode *, __u32, __u32,
111 const char *, struct inode *);
112extern void inotify_dentry_parent_queue_event(struct dentry *, __u32, __u32,
113 const char *);
114extern void inotify_unmount_inodes(struct list_head *);
115extern void inotify_inode_is_dead(struct inode *);
116extern u32 inotify_get_cookie(void);
117
118/* Kernel Consumer API */
119
120extern struct inotify_handle *inotify_init(const struct inotify_operations *);
121extern void inotify_init_watch(struct inotify_watch *);
122extern void inotify_destroy(struct inotify_handle *);
123extern __s32 inotify_find_watch(struct inotify_handle *, struct inode *,
124 struct inotify_watch **);
125extern __s32 inotify_find_update_watch(struct inotify_handle *, struct inode *,
126 u32);
127extern __s32 inotify_add_watch(struct inotify_handle *, struct inotify_watch *,
128 struct inode *, __u32);
129extern __s32 inotify_clone_watch(struct inotify_watch *, struct inotify_watch *);
130extern void inotify_evict_watch(struct inotify_watch *);
131extern int inotify_rm_watch(struct inotify_handle *, struct inotify_watch *);
132extern int inotify_rm_wd(struct inotify_handle *, __u32);
133extern void inotify_remove_watch_locked(struct inotify_handle *,
134 struct inotify_watch *);
135extern void get_inotify_watch(struct inotify_watch *);
136extern void put_inotify_watch(struct inotify_watch *);
137extern int pin_inotify_watch(struct inotify_watch *);
138extern void unpin_inotify_watch(struct inotify_watch *);
139
140#else
141
142static inline void inotify_d_instantiate(struct dentry *dentry,
143 struct inode *inode)
144{
145}
146
147static inline void inotify_d_move(struct dentry *dentry)
148{
149}
150
151static inline void inotify_inode_queue_event(struct inode *inode,
152 __u32 mask, __u32 cookie,
153 const char *filename,
154 struct inode *n_inode)
155{
156}
157
158static inline void inotify_dentry_parent_queue_event(struct dentry *dentry,
159 __u32 mask, __u32 cookie,
160 const char *filename)
161{
162}
163
164static inline void inotify_unmount_inodes(struct list_head *list)
165{
166}
167
168static inline void inotify_inode_is_dead(struct inode *inode)
169{
170}
171
172static inline u32 inotify_get_cookie(void)
173{
174 return 0;
175}
176
177static inline struct inotify_handle *inotify_init(const struct inotify_operations *ops)
178{
179 return ERR_PTR(-EOPNOTSUPP);
180}
181
182static inline void inotify_init_watch(struct inotify_watch *watch)
183{
184}
185
186static inline void inotify_destroy(struct inotify_handle *ih)
187{
188}
189
190static inline __s32 inotify_find_watch(struct inotify_handle *ih, struct inode *inode,
191 struct inotify_watch **watchp)
192{
193 return -EOPNOTSUPP;
194}
195
196static inline __s32 inotify_find_update_watch(struct inotify_handle *ih,
197 struct inode *inode, u32 mask)
198{
199 return -EOPNOTSUPP;
200}
201
202static inline __s32 inotify_add_watch(struct inotify_handle *ih,
203 struct inotify_watch *watch,
204 struct inode *inode, __u32 mask)
205{
206 return -EOPNOTSUPP;
207}
208
209static inline int inotify_rm_watch(struct inotify_handle *ih,
210 struct inotify_watch *watch)
211{
212 return -EOPNOTSUPP;
213}
214
215static inline int inotify_rm_wd(struct inotify_handle *ih, __u32 wd)
216{
217 return -EOPNOTSUPP;
218}
219
220static inline void inotify_remove_watch_locked(struct inotify_handle *ih,
221 struct inotify_watch *watch)
222{
223}
224
225static inline void get_inotify_watch(struct inotify_watch *watch)
226{
227}
228
229static inline void put_inotify_watch(struct inotify_watch *watch)
230{
231}
232
233extern inline int pin_inotify_watch(struct inotify_watch *watch)
234{
235 return 0;
236}
237
238extern inline void unpin_inotify_watch(struct inotify_watch *watch)
239{
240}
241
242#endif /* CONFIG_INOTIFY */
243
244#endif /* __KERNEL __ */
245 86
246#endif /* _LINUX_INOTIFY_H */ 87#endif /* _LINUX_INOTIFY_H */
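The only user-visible addition here is IN_EXCL_UNLINK, which stops a watch from reporting events on children once they have been unlinked (useful for /tmp-style churn). A hedged userspace sketch; older header sets may lack the constant, hence the fallback define, and "/tmp" is just an example path:

#include <sys/inotify.h>
#include <stdio.h>

#ifndef IN_EXCL_UNLINK
#define IN_EXCL_UNLINK 0x04000000
#endif

int main(void)
{
	int fd = inotify_init1(IN_CLOEXEC);
	if (fd < 0)
		return 1;
	int wd = inotify_add_watch(fd, "/tmp",
				   IN_CREATE | IN_DELETE | IN_EXCL_UNLINK);
	printf("wd = %d\n", wd);
	return wd < 0;
}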
diff --git a/include/linux/mount.h b/include/linux/mount.h
index 4bd05474d11d..907210bd9f9c 100644
--- a/include/linux/mount.h
+++ b/include/linux/mount.h
@@ -56,7 +56,11 @@ struct vfsmount {
56 struct list_head mnt_mounts; /* list of children, anchored here */ 56 struct list_head mnt_mounts; /* list of children, anchored here */
57 struct list_head mnt_child; /* and going through their mnt_child */ 57 struct list_head mnt_child; /* and going through their mnt_child */
58 int mnt_flags; 58 int mnt_flags;
59 /* 4 bytes hole on 64bits arches */ 59 /* 4 bytes hole on 64bits arches without fsnotify */
60#ifdef CONFIG_FSNOTIFY
61 __u32 mnt_fsnotify_mask;
62 struct hlist_head mnt_fsnotify_marks;
63#endif
60 const char *mnt_devname; /* Name of device e.g. /dev/dsk/hda1 */ 64 const char *mnt_devname; /* Name of device e.g. /dev/dsk/hda1 */
61 struct list_head mnt_list; 65 struct list_head mnt_list;
62 struct list_head mnt_expire; /* link in fs-specific expiry list */ 66 struct list_head mnt_expire; /* link in fs-specific expiry list */
diff --git a/include/linux/security.h b/include/linux/security.h
index 723a93df756a..5bcb395a49d4 100644
--- a/include/linux/security.h
+++ b/include/linux/security.h
@@ -23,6 +23,7 @@
23#define __LINUX_SECURITY_H 23#define __LINUX_SECURITY_H
24 24
25#include <linux/fs.h> 25#include <linux/fs.h>
26#include <linux/fsnotify.h>
26#include <linux/binfmts.h> 27#include <linux/binfmts.h>
27#include <linux/signal.h> 28#include <linux/signal.h>
28#include <linux/resource.h> 29#include <linux/resource.h>
diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h
index a6bfd1367d2a..2ab198a1e38d 100644
--- a/include/linux/syscalls.h
+++ b/include/linux/syscalls.h
@@ -811,6 +811,10 @@ asmlinkage long sys_pselect6(int, fd_set __user *, fd_set __user *,
811asmlinkage long sys_ppoll(struct pollfd __user *, unsigned int, 811asmlinkage long sys_ppoll(struct pollfd __user *, unsigned int,
812 struct timespec __user *, const sigset_t __user *, 812 struct timespec __user *, const sigset_t __user *,
813 size_t); 813 size_t);
814asmlinkage long sys_fanotify_init(unsigned int flags, unsigned int event_f_flags);
815asmlinkage long sys_fanotify_mark(int fanotify_fd, unsigned int flags,
816 u64 mask, int fd,
817 const char __user *pathname);
814 818
815int kernel_execve(const char *filename, char *const argv[], char *const envp[]); 819int kernel_execve(const char *filename, char *const argv[], char *const envp[]);
816 820
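With no glibc wrappers at this point, the two new syscalls are reachable via syscall(2). A hedged sketch: the __NR_ numbers exist only in headers from a patched kernel, the wrapper names are made up, and passing the 64-bit mask through varargs is only shown for 64-bit ABIs:

#include <sys/syscall.h>
#include <unistd.h>

static int demo_fanotify_init(unsigned int flags, unsigned int event_f_flags)
{
	return syscall(__NR_fanotify_init, flags, event_f_flags);
}

static int demo_fanotify_mark(int fanotify_fd, unsigned int flags,
			      unsigned long long mask, int dfd,
			      const char *pathname)
{
	return syscall(__NR_fanotify_mark, fanotify_fd, flags, mask,
		       dfd, pathname);
}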
diff --git a/init/Kconfig b/init/Kconfig
index cb64c5889e02..24932b9c03e8 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -320,13 +320,17 @@ config AUDITSYSCALL
320 help 320 help
321 Enable low-overhead system-call auditing infrastructure that 321 Enable low-overhead system-call auditing infrastructure that
322 can be used independently or with another kernel subsystem, 322 can be used independently or with another kernel subsystem,
323 such as SELinux. To use audit's filesystem watch feature, please 323 such as SELinux.
324 ensure that INOTIFY is configured. 324
325config AUDIT_WATCH
326 def_bool y
327 depends on AUDITSYSCALL
328 select FSNOTIFY
325 329
326config AUDIT_TREE 330config AUDIT_TREE
327 def_bool y 331 def_bool y
328 depends on AUDITSYSCALL 332 depends on AUDITSYSCALL
329 select INOTIFY 333 select FSNOTIFY
330 334
331menu "RCU Subsystem" 335menu "RCU Subsystem"
332 336
diff --git a/kernel/Makefile b/kernel/Makefile
index c53e491e25a8..0b72d1a74be0 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -70,10 +70,11 @@ obj-$(CONFIG_IKCONFIG) += configs.o
70obj-$(CONFIG_RESOURCE_COUNTERS) += res_counter.o 70obj-$(CONFIG_RESOURCE_COUNTERS) += res_counter.o
71obj-$(CONFIG_SMP) += stop_machine.o 71obj-$(CONFIG_SMP) += stop_machine.o
72obj-$(CONFIG_KPROBES_SANITY_TEST) += test_kprobes.o 72obj-$(CONFIG_KPROBES_SANITY_TEST) += test_kprobes.o
73obj-$(CONFIG_AUDIT) += audit.o auditfilter.o audit_watch.o 73obj-$(CONFIG_AUDIT) += audit.o auditfilter.o
74obj-$(CONFIG_AUDITSYSCALL) += auditsc.o 74obj-$(CONFIG_AUDITSYSCALL) += auditsc.o
75obj-$(CONFIG_GCOV_KERNEL) += gcov/ 75obj-$(CONFIG_AUDIT_WATCH) += audit_watch.o
76obj-$(CONFIG_AUDIT_TREE) += audit_tree.o 76obj-$(CONFIG_AUDIT_TREE) += audit_tree.o
77obj-$(CONFIG_GCOV_KERNEL) += gcov/
77obj-$(CONFIG_KPROBES) += kprobes.o 78obj-$(CONFIG_KPROBES) += kprobes.o
78obj-$(CONFIG_KGDB) += debug/ 79obj-$(CONFIG_KGDB) += debug/
79obj-$(CONFIG_DETECT_HUNG_TASK) += hung_task.o 80obj-$(CONFIG_DETECT_HUNG_TASK) += hung_task.o
diff --git a/kernel/audit.c b/kernel/audit.c
index 8296aa516c5a..d96045789b54 100644
--- a/kernel/audit.c
+++ b/kernel/audit.c
@@ -56,7 +56,6 @@
56#include <net/netlink.h> 56#include <net/netlink.h>
57#include <linux/skbuff.h> 57#include <linux/skbuff.h>
58#include <linux/netlink.h> 58#include <linux/netlink.h>
59#include <linux/inotify.h>
60#include <linux/freezer.h> 59#include <linux/freezer.h>
61#include <linux/tty.h> 60#include <linux/tty.h>
62 61
diff --git a/kernel/audit.h b/kernel/audit.h
index 208687be4f30..f7206db4e13d 100644
--- a/kernel/audit.h
+++ b/kernel/audit.h
@@ -103,21 +103,27 @@ extern struct mutex audit_filter_mutex;
103extern void audit_free_rule_rcu(struct rcu_head *); 103extern void audit_free_rule_rcu(struct rcu_head *);
104extern struct list_head audit_filter_list[]; 104extern struct list_head audit_filter_list[];
105 105
106extern struct audit_entry *audit_dupe_rule(struct audit_krule *old);
107
106/* audit watch functions */ 108/* audit watch functions */
107extern unsigned long audit_watch_inode(struct audit_watch *watch); 109#ifdef CONFIG_AUDIT_WATCH
108extern dev_t audit_watch_dev(struct audit_watch *watch);
109extern void audit_put_watch(struct audit_watch *watch); 110extern void audit_put_watch(struct audit_watch *watch);
110extern void audit_get_watch(struct audit_watch *watch); 111extern void audit_get_watch(struct audit_watch *watch);
111extern int audit_to_watch(struct audit_krule *krule, char *path, int len, u32 op); 112extern int audit_to_watch(struct audit_krule *krule, char *path, int len, u32 op);
112extern int audit_add_watch(struct audit_krule *krule); 113extern int audit_add_watch(struct audit_krule *krule, struct list_head **list);
113extern void audit_remove_watch(struct audit_watch *watch); 114extern void audit_remove_watch_rule(struct audit_krule *krule);
114extern void audit_remove_watch_rule(struct audit_krule *krule, struct list_head *list);
115extern void audit_inotify_unregister(struct list_head *in_list);
116extern char *audit_watch_path(struct audit_watch *watch); 115extern char *audit_watch_path(struct audit_watch *watch);
117extern struct list_head *audit_watch_rules(struct audit_watch *watch); 116extern int audit_watch_compare(struct audit_watch *watch, unsigned long ino, dev_t dev);
118 117#else
119extern struct audit_entry *audit_dupe_rule(struct audit_krule *old, 118#define audit_put_watch(w) {}
120 struct audit_watch *watch); 119#define audit_get_watch(w) {}
120#define audit_to_watch(k, p, l, o) (-EINVAL)
121#define audit_add_watch(k, l) (-EINVAL)
122#define audit_remove_watch_rule(k) BUG()
123#define audit_watch_path(w) ""
124#define audit_watch_compare(w, i, d) 0
125
126#endif /* CONFIG_AUDIT_WATCH */
121 127
122#ifdef CONFIG_AUDIT_TREE 128#ifdef CONFIG_AUDIT_TREE
123extern struct audit_chunk *audit_tree_lookup(const struct inode *); 129extern struct audit_chunk *audit_tree_lookup(const struct inode *);
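The stub macros above keep callers free of #ifdefs: on a !CONFIG_AUDIT_WATCH kernel, audit_to_watch() folds to -EINVAL at compile time and the watch path becomes dead code. A sketch of the calling pattern (the function name is illustrative):

static int demo_rule_watch(struct audit_krule *krule, char *path,
			   int len, u32 op)
{
	int err = audit_to_watch(krule, path, len, op);

	if (err)	/* always -EINVAL when watches are compiled out */
		return err;
	/* ... later, under audit_filter_mutex:
	 *	audit_add_watch(krule, &list);
	 */
	return 0;
}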
diff --git a/kernel/audit_tree.c b/kernel/audit_tree.c
index 46a57b57a335..7f18d3a4527e 100644
--- a/kernel/audit_tree.c
+++ b/kernel/audit_tree.c
@@ -1,5 +1,5 @@
1#include "audit.h" 1#include "audit.h"
2#include <linux/inotify.h> 2#include <linux/fsnotify_backend.h>
3#include <linux/namei.h> 3#include <linux/namei.h>
4#include <linux/mount.h> 4#include <linux/mount.h>
5#include <linux/kthread.h> 5#include <linux/kthread.h>
@@ -22,7 +22,7 @@ struct audit_tree {
22 22
23struct audit_chunk { 23struct audit_chunk {
24 struct list_head hash; 24 struct list_head hash;
25 struct inotify_watch watch; 25 struct fsnotify_mark mark;
26 struct list_head trees; /* with root here */ 26 struct list_head trees; /* with root here */
27 int dead; 27 int dead;
28 int count; 28 int count;
@@ -59,7 +59,7 @@ static LIST_HEAD(prune_list);
59 * tree is refcounted; one reference for "some rules on rules_list refer to 59 * tree is refcounted; one reference for "some rules on rules_list refer to
60 * it", one for each chunk with pointer to it. 60 * it", one for each chunk with pointer to it.
61 * 61 *
62 * chunk is refcounted by embedded inotify_watch + .refs (non-zero refcount 62 * chunk is refcounted by embedded fsnotify_mark + .refs (non-zero refcount
63 * of watch contributes 1 to .refs). 63 * of watch contributes 1 to .refs).
64 * 64 *
65 * node.index allows to get from node.list to containing chunk. 65 * node.index allows to get from node.list to containing chunk.
@@ -68,7 +68,7 @@ static LIST_HEAD(prune_list);
68 * that makes a difference. Some. 68 * that makes a difference. Some.
69 */ 69 */
70 70
71static struct inotify_handle *rtree_ih; 71static struct fsnotify_group *audit_tree_group;
72 72
73static struct audit_tree *alloc_tree(const char *s) 73static struct audit_tree *alloc_tree(const char *s)
74{ 74{
@@ -111,29 +111,6 @@ const char *audit_tree_path(struct audit_tree *tree)
111 return tree->pathname; 111 return tree->pathname;
112} 112}
113 113
114static struct audit_chunk *alloc_chunk(int count)
115{
116 struct audit_chunk *chunk;
117 size_t size;
118 int i;
119
120 size = offsetof(struct audit_chunk, owners) + count * sizeof(struct node);
121 chunk = kzalloc(size, GFP_KERNEL);
122 if (!chunk)
123 return NULL;
124
125 INIT_LIST_HEAD(&chunk->hash);
126 INIT_LIST_HEAD(&chunk->trees);
127 chunk->count = count;
128 atomic_long_set(&chunk->refs, 1);
129 for (i = 0; i < count; i++) {
130 INIT_LIST_HEAD(&chunk->owners[i].list);
131 chunk->owners[i].index = i;
132 }
133 inotify_init_watch(&chunk->watch);
134 return chunk;
135}
136
137static void free_chunk(struct audit_chunk *chunk) 114static void free_chunk(struct audit_chunk *chunk)
138{ 115{
139 int i; 116 int i;
@@ -157,6 +134,35 @@ static void __put_chunk(struct rcu_head *rcu)
157 audit_put_chunk(chunk); 134 audit_put_chunk(chunk);
158} 135}
159 136
137static void audit_tree_destroy_watch(struct fsnotify_mark *entry)
138{
139 struct audit_chunk *chunk = container_of(entry, struct audit_chunk, mark);
140 call_rcu(&chunk->head, __put_chunk);
141}
142
143static struct audit_chunk *alloc_chunk(int count)
144{
145 struct audit_chunk *chunk;
146 size_t size;
147 int i;
148
149 size = offsetof(struct audit_chunk, owners) + count * sizeof(struct node);
150 chunk = kzalloc(size, GFP_KERNEL);
151 if (!chunk)
152 return NULL;
153
154 INIT_LIST_HEAD(&chunk->hash);
155 INIT_LIST_HEAD(&chunk->trees);
156 chunk->count = count;
157 atomic_long_set(&chunk->refs, 1);
158 for (i = 0; i < count; i++) {
159 INIT_LIST_HEAD(&chunk->owners[i].list);
160 chunk->owners[i].index = i;
161 }
162 fsnotify_init_mark(&chunk->mark, audit_tree_destroy_watch);
163 return chunk;
164}
165
160enum {HASH_SIZE = 128}; 166enum {HASH_SIZE = 128};
161static struct list_head chunk_hash_heads[HASH_SIZE]; 167static struct list_head chunk_hash_heads[HASH_SIZE];
162static __cacheline_aligned_in_smp DEFINE_SPINLOCK(hash_lock); 168static __cacheline_aligned_in_smp DEFINE_SPINLOCK(hash_lock);
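alloc_chunk() sizes a single kzalloc() with offsetof() so that the header and the variable-length owners[] tail come from one allocation. The idiom, distilled (the demo_* names are illustrative; struct node is the per-owner slot defined earlier in this file):

struct demo_chunk {
	int count;
	struct node owners[];	/* flexible array member */
};

static struct demo_chunk *demo_alloc(int count)
{
	size_t size = offsetof(struct demo_chunk, owners) +
		      count * sizeof(struct node);
	struct demo_chunk *chunk = kzalloc(size, GFP_KERNEL);

	if (chunk)
		chunk->count = count;
	return chunk;
}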
@@ -167,10 +173,15 @@ static inline struct list_head *chunk_hash(const struct inode *inode)
167 return chunk_hash_heads + n % HASH_SIZE; 173 return chunk_hash_heads + n % HASH_SIZE;
168} 174}
169 175
170/* hash_lock is held by caller */ 176/* hash_lock & entry->lock are held by caller */
171static void insert_hash(struct audit_chunk *chunk) 177static void insert_hash(struct audit_chunk *chunk)
172{ 178{
173 struct list_head *list = chunk_hash(chunk->watch.inode); 179 struct fsnotify_mark *entry = &chunk->mark;
180 struct list_head *list;
181
182 if (!entry->i.inode)
183 return;
184 list = chunk_hash(entry->i.inode);
174 list_add_rcu(&chunk->hash, list); 185 list_add_rcu(&chunk->hash, list);
175} 186}
176 187
@@ -181,7 +192,8 @@ struct audit_chunk *audit_tree_lookup(const struct inode *inode)
181 struct audit_chunk *p; 192 struct audit_chunk *p;
182 193
183 list_for_each_entry_rcu(p, list, hash) { 194 list_for_each_entry_rcu(p, list, hash) {
184 if (p->watch.inode == inode) { 195 /* mark.inode may have gone NULL, but who cares? */
196 if (p->mark.i.inode == inode) {
185 atomic_long_inc(&p->refs); 197 atomic_long_inc(&p->refs);
186 return p; 198 return p;
187 } 199 }
@@ -210,38 +222,19 @@ static struct audit_chunk *find_chunk(struct node *p)
210static void untag_chunk(struct node *p) 222static void untag_chunk(struct node *p)
211{ 223{
212 struct audit_chunk *chunk = find_chunk(p); 224 struct audit_chunk *chunk = find_chunk(p);
225 struct fsnotify_mark *entry = &chunk->mark;
213 struct audit_chunk *new; 226 struct audit_chunk *new;
214 struct audit_tree *owner; 227 struct audit_tree *owner;
215 int size = chunk->count - 1; 228 int size = chunk->count - 1;
216 int i, j; 229 int i, j;
217 230
218 if (!pin_inotify_watch(&chunk->watch)) { 231 fsnotify_get_mark(entry);
219 /*
220 * Filesystem is shutting down; all watches are getting
221 * evicted, just take it off the node list for this
222 * tree and let the eviction logics take care of the
223 * rest.
224 */
225 owner = p->owner;
226 if (owner->root == chunk) {
227 list_del_init(&owner->same_root);
228 owner->root = NULL;
229 }
230 list_del_init(&p->list);
231 p->owner = NULL;
232 put_tree(owner);
233 return;
234 }
235 232
236 spin_unlock(&hash_lock); 233 spin_unlock(&hash_lock);
237 234
238 /* 235 spin_lock(&entry->lock);
239 * pin_inotify_watch() succeeded, so the watch won't go away 236 if (chunk->dead || !entry->i.inode) {
240 * from under us. 237 spin_unlock(&entry->lock);
241 */
242 mutex_lock(&chunk->watch.inode->inotify_mutex);
243 if (chunk->dead) {
244 mutex_unlock(&chunk->watch.inode->inotify_mutex);
245 goto out; 238 goto out;
246 } 239 }
247 240
@@ -256,16 +249,17 @@ static void untag_chunk(struct node *p)
256 list_del_init(&p->list); 249 list_del_init(&p->list);
257 list_del_rcu(&chunk->hash); 250 list_del_rcu(&chunk->hash);
258 spin_unlock(&hash_lock); 251 spin_unlock(&hash_lock);
259 inotify_evict_watch(&chunk->watch); 252 spin_unlock(&entry->lock);
260 mutex_unlock(&chunk->watch.inode->inotify_mutex); 253 fsnotify_destroy_mark(entry);
261 put_inotify_watch(&chunk->watch); 254 fsnotify_put_mark(entry);
262 goto out; 255 goto out;
263 } 256 }
264 257
265 new = alloc_chunk(size); 258 new = alloc_chunk(size);
266 if (!new) 259 if (!new)
267 goto Fallback; 260 goto Fallback;
268 if (inotify_clone_watch(&chunk->watch, &new->watch) < 0) { 261 fsnotify_duplicate_mark(&new->mark, entry);
262 if (fsnotify_add_mark(&new->mark, new->mark.group, new->mark.i.inode, NULL, 1)) {
269 free_chunk(new); 263 free_chunk(new);
270 goto Fallback; 264 goto Fallback;
271 } 265 }
@@ -298,9 +292,9 @@ static void untag_chunk(struct node *p)
298 list_for_each_entry(owner, &new->trees, same_root) 292 list_for_each_entry(owner, &new->trees, same_root)
299 owner->root = new; 293 owner->root = new;
300 spin_unlock(&hash_lock); 294 spin_unlock(&hash_lock);
301 inotify_evict_watch(&chunk->watch); 295 spin_unlock(&entry->lock);
302 mutex_unlock(&chunk->watch.inode->inotify_mutex); 296 fsnotify_destroy_mark(entry);
303 put_inotify_watch(&chunk->watch); 297 fsnotify_put_mark(entry);
304 goto out; 298 goto out;
305 299
306Fallback: 300Fallback:
@@ -314,31 +308,33 @@ Fallback:
314 p->owner = NULL; 308 p->owner = NULL;
315 put_tree(owner); 309 put_tree(owner);
316 spin_unlock(&hash_lock); 310 spin_unlock(&hash_lock);
317 mutex_unlock(&chunk->watch.inode->inotify_mutex); 311 spin_unlock(&entry->lock);
318out: 312out:
319 unpin_inotify_watch(&chunk->watch); 313 fsnotify_put_mark(entry);
320 spin_lock(&hash_lock); 314 spin_lock(&hash_lock);
321} 315}
322 316
323static int create_chunk(struct inode *inode, struct audit_tree *tree) 317static int create_chunk(struct inode *inode, struct audit_tree *tree)
324{ 318{
319 struct fsnotify_mark *entry;
325 struct audit_chunk *chunk = alloc_chunk(1); 320 struct audit_chunk *chunk = alloc_chunk(1);
326 if (!chunk) 321 if (!chunk)
327 return -ENOMEM; 322 return -ENOMEM;
328 323
329 if (inotify_add_watch(rtree_ih, &chunk->watch, inode, IN_IGNORED | IN_DELETE_SELF) < 0) { 324 entry = &chunk->mark;
325 if (fsnotify_add_mark(entry, audit_tree_group, inode, NULL, 0)) {
330 free_chunk(chunk); 326 free_chunk(chunk);
331 return -ENOSPC; 327 return -ENOSPC;
332 } 328 }
333 329
334 mutex_lock(&inode->inotify_mutex); 330 spin_lock(&entry->lock);
335 spin_lock(&hash_lock); 331 spin_lock(&hash_lock);
336 if (tree->goner) { 332 if (tree->goner) {
337 spin_unlock(&hash_lock); 333 spin_unlock(&hash_lock);
338 chunk->dead = 1; 334 chunk->dead = 1;
339 inotify_evict_watch(&chunk->watch); 335 spin_unlock(&entry->lock);
340 mutex_unlock(&inode->inotify_mutex); 336 fsnotify_destroy_mark(entry);
341 put_inotify_watch(&chunk->watch); 337 fsnotify_put_mark(entry);
342 return 0; 338 return 0;
343 } 339 }
344 chunk->owners[0].index = (1U << 31); 340 chunk->owners[0].index = (1U << 31);
@@ -351,30 +347,31 @@ static int create_chunk(struct inode *inode, struct audit_tree *tree)
351 } 347 }
352 insert_hash(chunk); 348 insert_hash(chunk);
353 spin_unlock(&hash_lock); 349 spin_unlock(&hash_lock);
354 mutex_unlock(&inode->inotify_mutex); 350 spin_unlock(&entry->lock);
355 return 0; 351 return 0;
356} 352}
357 353
358/* the first tagged inode becomes root of tree */ 354/* the first tagged inode becomes root of tree */
359static int tag_chunk(struct inode *inode, struct audit_tree *tree) 355static int tag_chunk(struct inode *inode, struct audit_tree *tree)
360{ 356{
361 struct inotify_watch *watch; 357 struct fsnotify_mark *old_entry, *chunk_entry;
362 struct audit_tree *owner; 358 struct audit_tree *owner;
363 struct audit_chunk *chunk, *old; 359 struct audit_chunk *chunk, *old;
364 struct node *p; 360 struct node *p;
365 int n; 361 int n;
366 362
367 if (inotify_find_watch(rtree_ih, inode, &watch) < 0) 363 old_entry = fsnotify_find_inode_mark(audit_tree_group, inode);
364 if (!old_entry)
368 return create_chunk(inode, tree); 365 return create_chunk(inode, tree);
369 366
370 old = container_of(watch, struct audit_chunk, watch); 367 old = container_of(old_entry, struct audit_chunk, mark);
371 368
372 /* are we already there? */ 369 /* are we already there? */
373 spin_lock(&hash_lock); 370 spin_lock(&hash_lock);
374 for (n = 0; n < old->count; n++) { 371 for (n = 0; n < old->count; n++) {
375 if (old->owners[n].owner == tree) { 372 if (old->owners[n].owner == tree) {
376 spin_unlock(&hash_lock); 373 spin_unlock(&hash_lock);
377 put_inotify_watch(&old->watch); 374 fsnotify_put_mark(old_entry);
378 return 0; 375 return 0;
379 } 376 }
380 } 377 }
@@ -382,25 +379,44 @@ static int tag_chunk(struct inode *inode, struct audit_tree *tree)
382 379
383 chunk = alloc_chunk(old->count + 1); 380 chunk = alloc_chunk(old->count + 1);
384 if (!chunk) { 381 if (!chunk) {
385 put_inotify_watch(&old->watch); 382 fsnotify_put_mark(old_entry);
386 return -ENOMEM; 383 return -ENOMEM;
387 } 384 }
388 385
389 mutex_lock(&inode->inotify_mutex); 386 chunk_entry = &chunk->mark;
390 if (inotify_clone_watch(&old->watch, &chunk->watch) < 0) { 387
391 mutex_unlock(&inode->inotify_mutex); 388 spin_lock(&old_entry->lock);
392 put_inotify_watch(&old->watch); 389 if (!old_entry->i.inode) {
390 /* old_entry is being shot, let's just lie */
391 spin_unlock(&old_entry->lock);
392 fsnotify_put_mark(old_entry);
393 free_chunk(chunk); 393 free_chunk(chunk);
394 return -ENOENT;
395 }
396
397 fsnotify_duplicate_mark(chunk_entry, old_entry);
398 if (fsnotify_add_mark(chunk_entry, chunk_entry->group, chunk_entry->i.inode, NULL, 1)) {
399 spin_unlock(&old_entry->lock);
400 free_chunk(chunk);
401 fsnotify_put_mark(old_entry);
394 return -ENOSPC; 402 return -ENOSPC;
395 } 403 }
404
405 /* even though we hold old_entry->lock, this is safe since chunk_entry->lock could NEVER have been grabbed before */
406 spin_lock(&chunk_entry->lock);
396 spin_lock(&hash_lock); 407 spin_lock(&hash_lock);
408
409 /* we now hold old_entry->lock, chunk_entry->lock, and hash_lock */
397 if (tree->goner) { 410 if (tree->goner) {
398 spin_unlock(&hash_lock); 411 spin_unlock(&hash_lock);
399 chunk->dead = 1; 412 chunk->dead = 1;
400 inotify_evict_watch(&chunk->watch); 413 spin_unlock(&chunk_entry->lock);
401 mutex_unlock(&inode->inotify_mutex); 414 spin_unlock(&old_entry->lock);
402 put_inotify_watch(&old->watch); 415
403 put_inotify_watch(&chunk->watch); 416 fsnotify_destroy_mark(chunk_entry);
417
418 fsnotify_put_mark(chunk_entry);
419 fsnotify_put_mark(old_entry);
404 return 0; 420 return 0;
405 } 421 }
406 list_replace_init(&old->trees, &chunk->trees); 422 list_replace_init(&old->trees, &chunk->trees);
@@ -426,10 +442,11 @@ static int tag_chunk(struct inode *inode, struct audit_tree *tree)
426 list_add(&tree->same_root, &chunk->trees); 442 list_add(&tree->same_root, &chunk->trees);
427 } 443 }
428 spin_unlock(&hash_lock); 444 spin_unlock(&hash_lock);
429 inotify_evict_watch(&old->watch); 445 spin_unlock(&chunk_entry->lock);
430 mutex_unlock(&inode->inotify_mutex); 446 spin_unlock(&old_entry->lock);
431 put_inotify_watch(&old->watch); /* pair to inotify_find_watch */ 447 fsnotify_destroy_mark(old_entry);
431 put_inotify_watch(&old->watch); /* pair to inotify_find_watch */ 448 fsnotify_put_mark(old_entry); /* pair to fsnotify_find_inode_mark */
449 fsnotify_put_mark(old_entry); /* and kill it */
433 return 0; 450 return 0;
434} 451}
435 452
@@ -584,7 +601,9 @@ void audit_trim_trees(void)
584 601
585 spin_lock(&hash_lock); 602 spin_lock(&hash_lock);
586 list_for_each_entry(node, &tree->chunks, list) { 603 list_for_each_entry(node, &tree->chunks, list) {
587 struct inode *inode = find_chunk(node)->watch.inode; 604 struct audit_chunk *chunk = find_chunk(node);
605 /* this could be NULL if the watch is dying elsewhere... */
606 struct inode *inode = chunk->mark.i.inode;
588 node->index |= 1U<<31; 607 node->index |= 1U<<31;
589 if (iterate_mounts(compare_root, inode, root_mnt)) 608 if (iterate_mounts(compare_root, inode, root_mnt))
590 node->index &= ~(1U<<31); 609 node->index &= ~(1U<<31);
@@ -846,7 +865,6 @@ void audit_kill_trees(struct list_head *list)
846 * Here comes the stuff asynchronous to auditctl operations 865 * Here comes the stuff asynchronous to auditctl operations
847 */ 866 */
848 867
849/* inode->inotify_mutex is locked */
850static void evict_chunk(struct audit_chunk *chunk) 868static void evict_chunk(struct audit_chunk *chunk)
851{ 869{
852 struct audit_tree *owner; 870 struct audit_tree *owner;
@@ -885,35 +903,46 @@ static void evict_chunk(struct audit_chunk *chunk)
885 mutex_unlock(&audit_filter_mutex); 903 mutex_unlock(&audit_filter_mutex);
886} 904}
887 905
888static void handle_event(struct inotify_watch *watch, u32 wd, u32 mask, 906static int audit_tree_handle_event(struct fsnotify_group *group,
889 u32 cookie, const char *dname, struct inode *inode) 907 struct fsnotify_mark *inode_mark,
908 struct fsnotify_mark *vfsmount_mark,
909 struct fsnotify_event *event)
910{
911 BUG();
912 return -EOPNOTSUPP;
913}
914
915static void audit_tree_freeing_mark(struct fsnotify_mark *entry, struct fsnotify_group *group)
890{ 916{
891 struct audit_chunk *chunk = container_of(watch, struct audit_chunk, watch); 917 struct audit_chunk *chunk = container_of(entry, struct audit_chunk, mark);
892 918
893 if (mask & IN_IGNORED) { 919 evict_chunk(chunk);
894 evict_chunk(chunk); 920 fsnotify_put_mark(entry);
895 put_inotify_watch(watch);
896 }
897} 921}
898 922
899static void destroy_watch(struct inotify_watch *watch) 923static bool audit_tree_send_event(struct fsnotify_group *group, struct inode *inode,
924 struct fsnotify_mark *inode_mark,
925 struct fsnotify_mark *vfsmount_mark,
926 __u32 mask, void *data, int data_type)
900{ 927{
901 struct audit_chunk *chunk = container_of(watch, struct audit_chunk, watch); 928 return false;
902 call_rcu(&chunk->head, __put_chunk);
903} 929}
904 930
905static const struct inotify_operations rtree_inotify_ops = { 931static const struct fsnotify_ops audit_tree_ops = {
906 .handle_event = handle_event, 932 .handle_event = audit_tree_handle_event,
907 .destroy_watch = destroy_watch, 933 .should_send_event = audit_tree_send_event,
934 .free_group_priv = NULL,
935 .free_event_priv = NULL,
936 .freeing_mark = audit_tree_freeing_mark,
908}; 937};
909 938
910static int __init audit_tree_init(void) 939static int __init audit_tree_init(void)
911{ 940{
912 int i; 941 int i;
913 942
914 rtree_ih = inotify_init(&rtree_inotify_ops); 943 audit_tree_group = fsnotify_alloc_group(&audit_tree_ops);
915 if (IS_ERR(rtree_ih)) 944 if (IS_ERR(audit_tree_group))
916 audit_panic("cannot initialize inotify handle for rectree watches"); 945 audit_panic("cannot initialize fsnotify group for rectree watches");
917 946
918 for (i = 0; i < HASH_SIZE; i++) 947 for (i = 0; i < HASH_SIZE; i++)
919 INIT_LIST_HEAD(&chunk_hash_heads[i]); 948 INIT_LIST_HEAD(&chunk_hash_heads[i]);
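fsnotify_alloc_group() hands back an ERR_PTR() rather than NULL on failure, which is why the init code above checks IS_ERR(). The same pattern in isolation (the demo names are illustrative):

static struct fsnotify_group *demo_group;

static int __init demo_init(void)
{
	demo_group = fsnotify_alloc_group(&audit_tree_ops);
	if (IS_ERR(demo_group))
		return PTR_ERR(demo_group);
	return 0;
}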
diff --git a/kernel/audit_watch.c b/kernel/audit_watch.c
index 8df43696f4ba..6bf2306be7d6 100644
--- a/kernel/audit_watch.c
+++ b/kernel/audit_watch.c
@@ -24,18 +24,18 @@
24#include <linux/kthread.h> 24#include <linux/kthread.h>
25#include <linux/mutex.h> 25#include <linux/mutex.h>
26#include <linux/fs.h> 26#include <linux/fs.h>
27#include <linux/fsnotify_backend.h>
27#include <linux/namei.h> 28#include <linux/namei.h>
28#include <linux/netlink.h> 29#include <linux/netlink.h>
29#include <linux/sched.h> 30#include <linux/sched.h>
30#include <linux/slab.h> 31#include <linux/slab.h>
31#include <linux/inotify.h>
32#include <linux/security.h> 32#include <linux/security.h>
33#include "audit.h" 33#include "audit.h"
34 34
35/* 35/*
36 * Reference counting: 36 * Reference counting:
37 * 37 *
38 * audit_parent: lifetime is from audit_init_parent() to receipt of an IN_IGNORED 38 * audit_parent: lifetime is from audit_init_parent() to receipt of an FS_IGNORED
39 * event. Each audit_watch holds a reference to its associated parent. 39 * event. Each audit_watch holds a reference to its associated parent.
40 * 40 *
41 * audit_watch: if added to lists, lifetime is from audit_init_watch() to 41 * audit_watch: if added to lists, lifetime is from audit_init_watch() to
@@ -51,40 +51,61 @@ struct audit_watch {
51 unsigned long ino; /* associated inode number */ 51 unsigned long ino; /* associated inode number */
52 struct audit_parent *parent; /* associated parent */ 52 struct audit_parent *parent; /* associated parent */
53 struct list_head wlist; /* entry in parent->watches list */ 53 struct list_head wlist; /* entry in parent->watches list */
54 struct list_head rules; /* associated rules */ 54 struct list_head rules; /* anchor for krule->rlist */
55}; 55};
56 56
57struct audit_parent { 57struct audit_parent {
58 struct list_head ilist; /* entry in inotify registration list */ 58 struct list_head watches; /* anchor for audit_watch->wlist */
59 struct list_head watches; /* associated watches */ 59 struct fsnotify_mark mark; /* fsnotify mark on the inode */
60 struct inotify_watch wdata; /* inotify watch data */
61 unsigned flags; /* status flags */
62}; 60};
63 61
64/* Inotify handle. */ 62/* fsnotify handle. */
65struct inotify_handle *audit_ih; 63struct fsnotify_group *audit_watch_group;
66 64
67/* 65/* fsnotify events we care about. */
68 * audit_parent status flags: 66#define AUDIT_FS_WATCH (FS_MOVE | FS_CREATE | FS_DELETE | FS_DELETE_SELF |\
69 * 67 FS_MOVE_SELF | FS_EVENT_ON_CHILD)
70 * AUDIT_PARENT_INVALID - set anytime rules/watches are auto-removed due to
71 * a filesystem event to ensure we're adding audit watches to a valid parent.
72 * Technically not needed for IN_DELETE_SELF or IN_UNMOUNT events, as we cannot
73 * receive them while we have nameidata, but must be used for IN_MOVE_SELF which
74 * we can receive while holding nameidata.
75 */
76#define AUDIT_PARENT_INVALID 0x001
77 68
78/* Inotify events we care about. */ 69static void audit_free_parent(struct audit_parent *parent)
79#define AUDIT_IN_WATCH IN_MOVE|IN_CREATE|IN_DELETE|IN_DELETE_SELF|IN_MOVE_SELF 70{
71 WARN_ON(!list_empty(&parent->watches));
72 kfree(parent);
73}
80 74
81static void audit_free_parent(struct inotify_watch *i_watch) 75static void audit_watch_free_mark(struct fsnotify_mark *entry)
82{ 76{
83 struct audit_parent *parent; 77 struct audit_parent *parent;
84 78
85 parent = container_of(i_watch, struct audit_parent, wdata); 79 parent = container_of(entry, struct audit_parent, mark);
86 WARN_ON(!list_empty(&parent->watches)); 80 audit_free_parent(parent);
87 kfree(parent); 81}
82
83static void audit_get_parent(struct audit_parent *parent)
84{
85 if (likely(parent))
86 fsnotify_get_mark(&parent->mark);
87}
88
89static void audit_put_parent(struct audit_parent *parent)
90{
91 if (likely(parent))
92 fsnotify_put_mark(&parent->mark);
93}
94
95/*
96 * Find and return the audit_parent on the given inode. If found, a reference
97 * is taken on this parent.
98 */
99static inline struct audit_parent *audit_find_parent(struct inode *inode)
100{
101 struct audit_parent *parent = NULL;
102 struct fsnotify_mark *entry;
103
104 entry = fsnotify_find_inode_mark(audit_watch_group, inode);
105 if (entry)
106 parent = container_of(entry, struct audit_parent, mark);
107
108 return parent;
88} 109}
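Because audit_find_parent() goes through fsnotify_find_inode_mark(), a successful lookup returns with a mark reference held, so every hit must be paired with audit_put_parent(). A sketch of that pairing (the demo name is illustrative):

static bool demo_inode_is_watched(struct inode *inode)
{
	struct audit_parent *parent = audit_find_parent(inode);

	if (!parent)
		return false;
	/* ... inspect parent->watches under audit_filter_mutex ... */
	audit_put_parent(parent);	/* drop the lookup reference */
	return true;
}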
89 110
90void audit_get_watch(struct audit_watch *watch) 111void audit_get_watch(struct audit_watch *watch)
@@ -105,7 +126,7 @@ void audit_put_watch(struct audit_watch *watch)
105void audit_remove_watch(struct audit_watch *watch) 126void audit_remove_watch(struct audit_watch *watch)
106{ 127{
107 list_del(&watch->wlist); 128 list_del(&watch->wlist);
108 put_inotify_watch(&watch->parent->wdata); 129 audit_put_parent(watch->parent);
109 watch->parent = NULL; 130 watch->parent = NULL;
110 audit_put_watch(watch); /* match initial get */ 131 audit_put_watch(watch); /* match initial get */
111} 132}
@@ -115,42 +136,32 @@ char *audit_watch_path(struct audit_watch *watch)
115 return watch->path; 136 return watch->path;
116} 137}
117 138
118struct list_head *audit_watch_rules(struct audit_watch *watch) 139int audit_watch_compare(struct audit_watch *watch, unsigned long ino, dev_t dev)
119{
120 return &watch->rules;
121}
122
123unsigned long audit_watch_inode(struct audit_watch *watch)
124{ 140{
125 return watch->ino; 141 return (watch->ino != (unsigned long)-1) &&
126} 142 (watch->ino == ino) &&
127 143 (watch->dev == dev);
128dev_t audit_watch_dev(struct audit_watch *watch)
129{
130 return watch->dev;
131} 144}
132 145
133/* Initialize a parent watch entry. */ 146/* Initialize a parent watch entry. */
134static struct audit_parent *audit_init_parent(struct nameidata *ndp) 147static struct audit_parent *audit_init_parent(struct nameidata *ndp)
135{ 148{
149 struct inode *inode = ndp->path.dentry->d_inode;
136 struct audit_parent *parent; 150 struct audit_parent *parent;
137 s32 wd; 151 int ret;
138 152
139 parent = kzalloc(sizeof(*parent), GFP_KERNEL); 153 parent = kzalloc(sizeof(*parent), GFP_KERNEL);
140 if (unlikely(!parent)) 154 if (unlikely(!parent))
141 return ERR_PTR(-ENOMEM); 155 return ERR_PTR(-ENOMEM);
142 156
143 INIT_LIST_HEAD(&parent->watches); 157 INIT_LIST_HEAD(&parent->watches);
144 parent->flags = 0; 158
145 159 fsnotify_init_mark(&parent->mark, audit_watch_free_mark);
146 inotify_init_watch(&parent->wdata); 160 parent->mark.mask = AUDIT_FS_WATCH;
147 /* grab a ref so inotify watch hangs around until we take audit_filter_mutex */ 161 ret = fsnotify_add_mark(&parent->mark, audit_watch_group, inode, NULL, 0);
148 get_inotify_watch(&parent->wdata); 162 if (ret < 0) {
149 wd = inotify_add_watch(audit_ih, &parent->wdata, 163 audit_free_parent(parent);
150 ndp->path.dentry->d_inode, AUDIT_IN_WATCH); 164 return ERR_PTR(ret);
151 if (wd < 0) {
152 audit_free_parent(&parent->wdata);
153 return ERR_PTR(wd);
154 } 165 }
155 166
156 return parent; 167 return parent;
@@ -179,7 +190,7 @@ int audit_to_watch(struct audit_krule *krule, char *path, int len, u32 op)
179{ 190{
180 struct audit_watch *watch; 191 struct audit_watch *watch;
181 192
182 if (!audit_ih) 193 if (!audit_watch_group)
183 return -EOPNOTSUPP; 194 return -EOPNOTSUPP;
184 195
185 if (path[0] != '/' || path[len-1] == '/' || 196 if (path[0] != '/' || path[len-1] == '/' ||
@@ -217,7 +228,7 @@ static struct audit_watch *audit_dupe_watch(struct audit_watch *old)
217 228
218 new->dev = old->dev; 229 new->dev = old->dev;
219 new->ino = old->ino; 230 new->ino = old->ino;
220 get_inotify_watch(&old->parent->wdata); 231 audit_get_parent(old->parent);
221 new->parent = old->parent; 232 new->parent = old->parent;
222 233
223out: 234out:
@@ -251,15 +262,19 @@ static void audit_update_watch(struct audit_parent *parent,
251 struct audit_entry *oentry, *nentry; 262 struct audit_entry *oentry, *nentry;
252 263
253 mutex_lock(&audit_filter_mutex); 264 mutex_lock(&audit_filter_mutex);
265 /* Run all of the watches on this parent looking for the one that
266 * matches the given dname */
254 list_for_each_entry_safe(owatch, nextw, &parent->watches, wlist) { 267 list_for_each_entry_safe(owatch, nextw, &parent->watches, wlist) {
255 if (audit_compare_dname_path(dname, owatch->path, NULL)) 268 if (audit_compare_dname_path(dname, owatch->path, NULL))
256 continue; 269 continue;
257 270
258 /* If the update involves invalidating rules, do the inode-based 271 /* If the update involves invalidating rules, do the inode-based
259 * filtering now, so we don't omit records. */ 272 * filtering now, so we don't omit records. */
260 if (invalidating && current->audit_context) 273 if (invalidating && !audit_dummy_context())
261 audit_filter_inodes(current, current->audit_context); 274 audit_filter_inodes(current, current->audit_context);
262 275
276 /* updating ino will likely change which audit_hash_list we
277 * are on so we need a new watch for the new list */
263 nwatch = audit_dupe_watch(owatch); 278 nwatch = audit_dupe_watch(owatch);
264 if (IS_ERR(nwatch)) { 279 if (IS_ERR(nwatch)) {
265 mutex_unlock(&audit_filter_mutex); 280 mutex_unlock(&audit_filter_mutex);
@@ -275,12 +290,21 @@ static void audit_update_watch(struct audit_parent *parent,
 		list_del(&oentry->rule.rlist);
 		list_del_rcu(&oentry->list);
 
-		nentry = audit_dupe_rule(&oentry->rule, nwatch);
+		nentry = audit_dupe_rule(&oentry->rule);
 		if (IS_ERR(nentry)) {
 			list_del(&oentry->rule.list);
 			audit_panic("error updating watch, removing");
 		} else {
 			int h = audit_hash_ino((u32)ino);
+
+			/*
+			 * nentry->rule.watch == oentry->rule.watch so
+			 * we must drop that reference and set it to our
+			 * new watch.
+			 */
+			audit_put_watch(nentry->rule.watch);
+			audit_get_watch(nwatch);
+			nentry->rule.watch = nwatch;
 			list_add(&nentry->rule.rlist, &nwatch->rules);
 			list_add_rcu(&nentry->list, &audit_inode_hash[h]);
 			list_replace(&oentry->rule.list,
@@ -312,7 +336,6 @@ static void audit_remove_parent_watches(struct audit_parent *parent)
 	struct audit_entry *e;
 
 	mutex_lock(&audit_filter_mutex);
-	parent->flags |= AUDIT_PARENT_INVALID;
 	list_for_each_entry_safe(w, nextw, &parent->watches, wlist) {
 		list_for_each_entry_safe(r, nextr, &w->rules, rlist) {
 			e = container_of(r, struct audit_entry, rule);
@@ -325,20 +348,8 @@ static void audit_remove_parent_watches(struct audit_parent *parent)
 		audit_remove_watch(w);
 	}
 	mutex_unlock(&audit_filter_mutex);
-}
-
-/* Unregister inotify watches for parents on in_list.
- * Generates an IN_IGNORED event. */
-void audit_inotify_unregister(struct list_head *in_list)
-{
-	struct audit_parent *p, *n;
 
-	list_for_each_entry_safe(p, n, in_list, ilist) {
-		list_del(&p->ilist);
-		inotify_rm_watch(audit_ih, &p->wdata);
-		/* the unpin matching the pin in audit_do_del_rule() */
-		unpin_inotify_watch(&p->wdata);
-	}
+	fsnotify_destroy_mark(&parent->mark);
 }
 
 /* Get path information necessary for adding watches. */
@@ -389,7 +400,7 @@ static void audit_put_nd(struct nameidata *ndp, struct nameidata *ndw)
 	}
 }
 
-/* Associate the given rule with an existing parent inotify_watch.
+/* Associate the given rule with an existing parent.
  * Caller must hold audit_filter_mutex. */
 static void audit_add_to_parent(struct audit_krule *krule,
 				struct audit_parent *parent)
@@ -397,6 +408,8 @@ static void audit_add_to_parent(struct audit_krule *krule,
 	struct audit_watch *w, *watch = krule->watch;
 	int watch_found = 0;
 
+	BUG_ON(!mutex_is_locked(&audit_filter_mutex));
+
 	list_for_each_entry(w, &parent->watches, wlist) {
 		if (strcmp(watch->path, w->path))
 			continue;
@@ -413,7 +426,7 @@ static void audit_add_to_parent(struct audit_krule *krule,
 	}
 
 	if (!watch_found) {
-		get_inotify_watch(&parent->wdata);
+		audit_get_parent(parent);
 		watch->parent = parent;
 
 		list_add(&watch->wlist, &parent->watches);
@@ -423,13 +436,12 @@ static void audit_add_to_parent(struct audit_krule *krule,
 
 /* Find a matching watch entry, or add this one.
  * Caller must hold audit_filter_mutex. */
-int audit_add_watch(struct audit_krule *krule)
+int audit_add_watch(struct audit_krule *krule, struct list_head **list)
 {
 	struct audit_watch *watch = krule->watch;
-	struct inotify_watch *i_watch;
 	struct audit_parent *parent;
 	struct nameidata *ndp = NULL, *ndw = NULL;
-	int ret = 0;
+	int h, ret = 0;
 
 	mutex_unlock(&audit_filter_mutex);
 
@@ -441,47 +453,38 @@ int audit_add_watch(struct audit_krule *krule)
 		goto error;
 	}
 
+	mutex_lock(&audit_filter_mutex);
+
 	/* update watch filter fields */
 	if (ndw) {
 		watch->dev = ndw->path.dentry->d_inode->i_sb->s_dev;
 		watch->ino = ndw->path.dentry->d_inode->i_ino;
 	}
 
-	/* The audit_filter_mutex must not be held during inotify calls because
-	 * we hold it during inotify event callback processing. If an existing
-	 * inotify watch is found, inotify_find_watch() grabs a reference before
-	 * returning.
-	 */
-	if (inotify_find_watch(audit_ih, ndp->path.dentry->d_inode,
-			       &i_watch) < 0) {
+	/* either find an old parent or attach a new one */
+	parent = audit_find_parent(ndp->path.dentry->d_inode);
+	if (!parent) {
 		parent = audit_init_parent(ndp);
 		if (IS_ERR(parent)) {
-			/* caller expects mutex locked */
-			mutex_lock(&audit_filter_mutex);
 			ret = PTR_ERR(parent);
 			goto error;
 		}
-	} else
-		parent = container_of(i_watch, struct audit_parent, wdata);
-
-	mutex_lock(&audit_filter_mutex);
+	}
 
-	/* parent was moved before we took audit_filter_mutex */
-	if (parent->flags & AUDIT_PARENT_INVALID)
-		ret = -ENOENT;
-	else
-		audit_add_to_parent(krule, parent);
+	audit_add_to_parent(krule, parent);
 
-	/* match get in audit_init_parent or inotify_find_watch */
-	put_inotify_watch(&parent->wdata);
+	/* match get in audit_find_parent or audit_init_parent */
+	audit_put_parent(parent);
 
+	h = audit_hash_ino((u32)watch->ino);
+	*list = &audit_inode_hash[h];
 error:
 	audit_put_nd(ndp, ndw);	/* NULL args OK */
 	return ret;
 
 }
 
-void audit_remove_watch_rule(struct audit_krule *krule, struct list_head *list)
+void audit_remove_watch_rule(struct audit_krule *krule)
 {
 	struct audit_watch *watch = krule->watch;
 	struct audit_parent *parent = watch->parent;
@@ -492,53 +495,74 @@ void audit_remove_watch_rule(struct audit_krule *krule, struct list_head *list)
 		audit_remove_watch(watch);
 
 		if (list_empty(&parent->watches)) {
-			/* Put parent on the inotify un-registration
-			 * list. Grab a reference before releasing
-			 * audit_filter_mutex, to be released in
-			 * audit_inotify_unregister().
-			 * If filesystem is going away, just leave
-			 * the sucker alone, eviction will take
-			 * care of it. */
-			if (pin_inotify_watch(&parent->wdata))
-				list_add(&parent->ilist, list);
+			audit_get_parent(parent);
+			fsnotify_destroy_mark(&parent->mark);
+			audit_put_parent(parent);
 		}
 	}
 }
 
-/* Update watch data in audit rules based on inotify events. */
-static void audit_handle_ievent(struct inotify_watch *i_watch, u32 wd, u32 mask,
-				u32 cookie, const char *dname, struct inode *inode)
+static bool audit_watch_should_send_event(struct fsnotify_group *group, struct inode *inode,
+					  struct fsnotify_mark *inode_mark,
+					  struct fsnotify_mark *vfsmount_mark,
+					  __u32 mask, void *data, int data_type)
+{
+	return true;
+}
+
+/* Update watch data in audit rules based on fsnotify events. */
+static int audit_watch_handle_event(struct fsnotify_group *group,
+				    struct fsnotify_mark *inode_mark,
+				    struct fsnotify_mark *vfsmount_mark,
+				    struct fsnotify_event *event)
 {
+	struct inode *inode;
+	__u32 mask = event->mask;
+	const char *dname = event->file_name;
 	struct audit_parent *parent;
 
-	parent = container_of(i_watch, struct audit_parent, wdata);
+	parent = container_of(inode_mark, struct audit_parent, mark);
 
-	if (mask & (IN_CREATE|IN_MOVED_TO) && inode)
-		audit_update_watch(parent, dname, inode->i_sb->s_dev,
-				   inode->i_ino, 0);
-	else if (mask & (IN_DELETE|IN_MOVED_FROM))
+	BUG_ON(group != audit_watch_group);
+
+	switch (event->data_type) {
+	case (FSNOTIFY_EVENT_FILE):
+		inode = event->file->f_path.dentry->d_inode;
+		break;
+	case (FSNOTIFY_EVENT_INODE):
+		inode = event->inode;
+		break;
+	default:
+		BUG();
+		inode = NULL;
+		break;
+	};
+
+	if (mask & (FS_CREATE|FS_MOVED_TO) && inode)
+		audit_update_watch(parent, dname, inode->i_sb->s_dev, inode->i_ino, 0);
+	else if (mask & (FS_DELETE|FS_MOVED_FROM))
 		audit_update_watch(parent, dname, (dev_t)-1, (unsigned long)-1, 1);
-	/* inotify automatically removes the watch and sends IN_IGNORED */
-	else if (mask & (IN_DELETE_SELF|IN_UNMOUNT))
-		audit_remove_parent_watches(parent);
-	/* inotify does not remove the watch, so remove it manually */
-	else if(mask & IN_MOVE_SELF) {
+	else if (mask & (FS_DELETE_SELF|FS_UNMOUNT|FS_MOVE_SELF))
 		audit_remove_parent_watches(parent);
-		inotify_remove_watch_locked(audit_ih, i_watch);
-	} else if (mask & IN_IGNORED)
-		put_inotify_watch(i_watch);
+
+	return 0;
 }
 
-static const struct inotify_operations audit_inotify_ops = {
-	.handle_event	= audit_handle_ievent,
-	.destroy_watch	= audit_free_parent,
+static const struct fsnotify_ops audit_watch_fsnotify_ops = {
+	.should_send_event = audit_watch_should_send_event,
+	.handle_event = audit_watch_handle_event,
+	.free_group_priv = NULL,
+	.freeing_mark = NULL,
+	.free_event_priv = NULL,
 };
 
 static int __init audit_watch_init(void)
 {
-	audit_ih = inotify_init(&audit_inotify_ops);
-	if (IS_ERR(audit_ih))
-		audit_panic("cannot initialize inotify handle");
+	audit_watch_group = fsnotify_alloc_group(&audit_watch_fsnotify_ops);
+	if (IS_ERR(audit_watch_group)) {
+		audit_watch_group = NULL;
+		audit_panic("cannot create audit fsnotify group");
+	}
 	return 0;
 }
-subsys_initcall(audit_watch_init);
+device_initcall(audit_watch_init);
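
[Annotation] The audit_watch.c conversion above is a compact template for any fsnotify consumer: embed a struct fsnotify_mark in per-object state, recover it with container_of() in the callbacks, supply a struct fsnotify_ops, and allocate one group at init. The sketch below restates that pattern outside of audit. It uses only calls visible in this diff (fsnotify_alloc_group(), the should_send_event()/handle_event() signatures, device_initcall()); the my_-prefixed names are illustrative and not part of the kernel, so treat this as a sketch against this series' API, not a drop-in module.

/* sketch: a minimal fsnotify backend in the style of audit_watch.c */
#include <linux/err.h>
#include <linux/init.h>
#include <linux/kernel.h>
#include <linux/fsnotify_backend.h>

struct my_watch {
	struct fsnotify_mark mark;	/* embedded; recovered via container_of() */
};

static struct fsnotify_group *my_group;

static bool my_should_send_event(struct fsnotify_group *group, struct inode *inode,
				 struct fsnotify_mark *inode_mark,
				 struct fsnotify_mark *vfsmount_mark,
				 __u32 mask, void *data, int data_type)
{
	return true;	/* no pre-filtering; decide everything in handle_event() */
}

static int my_handle_event(struct fsnotify_group *group,
			   struct fsnotify_mark *inode_mark,
			   struct fsnotify_mark *vfsmount_mark,
			   struct fsnotify_event *event)
{
	struct my_watch *w = container_of(inode_mark, struct my_watch, mark);

	if (event->mask & (FS_CREATE | FS_MOVED_TO))
		printk(KERN_INFO "my_watch %p: child appeared: %s\n",
		       w, event->file_name);
	return 0;
}

static const struct fsnotify_ops my_fsnotify_ops = {
	.should_send_event = my_should_send_event,
	.handle_event	   = my_handle_event,
};

static int __init my_watch_init(void)
{
	my_group = fsnotify_alloc_group(&my_fsnotify_ops);
	if (IS_ERR(my_group))
		return PTR_ERR(my_group);
	return 0;
}
device_initcall(my_watch_init);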
diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c
index ce08041f578d..eb7675499fb5 100644
--- a/kernel/auditfilter.c
+++ b/kernel/auditfilter.c
@@ -71,6 +71,7 @@ static inline void audit_free_rule(struct audit_entry *e)
 {
 	int i;
 	struct audit_krule *erule = &e->rule;
+
 	/* some rules don't have associated watches */
 	if (erule->watch)
 		audit_put_watch(erule->watch);
@@ -746,8 +747,7 @@ static inline int audit_dupe_lsm_field(struct audit_field *df,
  * rule with the new rule in the filterlist, then free the old rule.
  * The rlist element is undefined; list manipulations are handled apart from
  * the initial copy. */
-struct audit_entry *audit_dupe_rule(struct audit_krule *old,
-				    struct audit_watch *watch)
+struct audit_entry *audit_dupe_rule(struct audit_krule *old)
 {
 	u32 fcount = old->field_count;
 	struct audit_entry *entry;
@@ -769,8 +769,8 @@ struct audit_entry *audit_dupe_rule(struct audit_krule *old,
 	new->prio = old->prio;
 	new->buflen = old->buflen;
 	new->inode_f = old->inode_f;
-	new->watch = NULL;
 	new->field_count = old->field_count;
+
 	/*
 	 * note that we are OK with not refcounting here; audit_match_tree()
 	 * never dereferences tree and we can't get false positives there
@@ -811,9 +811,9 @@ struct audit_entry *audit_dupe_rule(struct audit_krule *old,
 		}
 	}
 
-	if (watch) {
-		audit_get_watch(watch);
-		new->watch = watch;
+	if (old->watch) {
+		audit_get_watch(old->watch);
+		new->watch = old->watch;
 	}
 
 	return entry;
@@ -866,7 +866,7 @@ static inline int audit_add_rule(struct audit_entry *entry)
 	struct audit_watch *watch = entry->rule.watch;
 	struct audit_tree *tree = entry->rule.tree;
 	struct list_head *list;
-	int h, err;
+	int err;
 #ifdef CONFIG_AUDITSYSCALL
 	int dont_count = 0;
 
@@ -889,15 +889,11 @@ static inline int audit_add_rule(struct audit_entry *entry)
 
 	if (watch) {
 		/* audit_filter_mutex is dropped and re-taken during this call */
-		err = audit_add_watch(&entry->rule);
+		err = audit_add_watch(&entry->rule, &list);
 		if (err) {
 			mutex_unlock(&audit_filter_mutex);
 			goto error;
 		}
-		/* entry->rule.watch may have changed during audit_add_watch() */
-		watch = entry->rule.watch;
-		h = audit_hash_ino((u32)audit_watch_inode(watch));
-		list = &audit_inode_hash[h];
 	}
 	if (tree) {
 		err = audit_add_tree_rule(&entry->rule);
@@ -949,7 +945,6 @@ static inline int audit_del_rule(struct audit_entry *entry)
 	struct audit_watch *watch = entry->rule.watch;
 	struct audit_tree *tree = entry->rule.tree;
 	struct list_head *list;
-	LIST_HEAD(inotify_list);
 	int ret = 0;
 #ifdef CONFIG_AUDITSYSCALL
 	int dont_count = 0;
@@ -969,7 +964,7 @@ static inline int audit_del_rule(struct audit_entry *entry)
 	}
 
 	if (e->rule.watch)
-		audit_remove_watch_rule(&e->rule, &inotify_list);
+		audit_remove_watch_rule(&e->rule);
 
 	if (e->rule.tree)
 		audit_remove_tree_rule(&e->rule);
@@ -987,9 +982,6 @@ static inline int audit_del_rule(struct audit_entry *entry)
 #endif
 	mutex_unlock(&audit_filter_mutex);
 
-	if (!list_empty(&inotify_list))
-		audit_inotify_unregister(&inotify_list);
-
 out:
 	if (watch)
 		audit_put_watch(watch); /* match initial get */
@@ -1323,30 +1315,23 @@ static int update_lsm_rule(struct audit_krule *r)
 {
 	struct audit_entry *entry = container_of(r, struct audit_entry, rule);
 	struct audit_entry *nentry;
-	struct audit_watch *watch;
-	struct audit_tree *tree;
 	int err = 0;
 
 	if (!security_audit_rule_known(r))
 		return 0;
 
-	watch = r->watch;
-	tree = r->tree;
-	nentry = audit_dupe_rule(r, watch);
+	nentry = audit_dupe_rule(r);
 	if (IS_ERR(nentry)) {
 		/* save the first error encountered for the
 		 * return value */
 		err = PTR_ERR(nentry);
 		audit_panic("error updating LSM filters");
-		if (watch)
+		if (r->watch)
 			list_del(&r->rlist);
 		list_del_rcu(&entry->list);
 		list_del(&r->list);
 	} else {
-		if (watch) {
-			list_add(&nentry->rule.rlist, audit_watch_rules(watch));
-			list_del(&r->rlist);
-		} else if (tree)
+		if (r->watch || r->tree)
 			list_replace_init(&r->rlist, &nentry->rule.rlist);
 		list_replace_rcu(&entry->list, &nentry->list);
 		list_replace(&r->list, &nentry->rule.list);
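
[Annotation] With this change, audit_dupe_rule() pins the watch itself: the duplicate takes its own reference on old->watch, and a caller that wants the copy to point at a different watch (as audit_update_watch() does above) must drop that reference and take one on the replacement. A generic sketch of the retarget idiom follows, using kref for illustration; audit's audit_get_watch()/audit_put_watch() are its own helpers whose bodies are not part of these hunks.

#include <linux/kref.h>
#include <linux/slab.h>

struct thing {
	struct kref ref;
};

static void thing_release(struct kref *ref)
{
	kfree(container_of(ref, struct thing, ref));
}

/* Retarget a counted pointer slot. Taking the new reference before
 * dropping the old one means the slot never transits through a state
 * where it points at freed memory; the diff above can put-then-get
 * only because the caller already holds a separate reference on nwatch. */
static void thing_retarget(struct thing **slot, struct thing *new)
{
	struct thing *old = *slot;

	kref_get(&new->ref);			/* plays the role of audit_get_watch(nwatch) */
	*slot = new;
	kref_put(&old->ref, thing_release);	/* plays the role of audit_put_watch(...) */
}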
diff --git a/kernel/auditsc.c b/kernel/auditsc.c
index 3828ad5fb8f1..b87a63beb66c 100644
--- a/kernel/auditsc.c
+++ b/kernel/auditsc.c
@@ -65,7 +65,6 @@
 #include <linux/binfmts.h>
 #include <linux/highmem.h>
 #include <linux/syscalls.h>
-#include <linux/inotify.h>
 #include <linux/capability.h>
 #include <linux/fs_struct.h>
 
@@ -549,9 +548,8 @@ static int audit_filter_rules(struct task_struct *tsk,
 			}
 			break;
 		case AUDIT_WATCH:
-			if (name && audit_watch_inode(rule->watch) != (unsigned long)-1)
-				result = (name->dev == audit_watch_dev(rule->watch) &&
-					  name->ino == audit_watch_inode(rule->watch));
+			if (name)
+				result = audit_watch_compare(rule->watch, name->ino, name->dev);
 			break;
 		case AUDIT_DIR:
 			if (ctx)
@@ -1726,7 +1724,7 @@ static inline void handle_one(const struct inode *inode)
 	struct audit_tree_refs *p;
 	struct audit_chunk *chunk;
 	int count;
-	if (likely(list_empty(&inode->inotify_watches)))
+	if (likely(hlist_empty(&inode->i_fsnotify_marks)))
 		return;
 	context = current->audit_context;
 	p = context->trees;
@@ -1769,7 +1767,7 @@ retry:
 	seq = read_seqbegin(&rename_lock);
 	for(;;) {
 		struct inode *inode = d->d_inode;
-		if (inode && unlikely(!list_empty(&inode->inotify_watches))) {
+		if (inode && unlikely(!hlist_empty(&inode->i_fsnotify_marks))) {
 			struct audit_chunk *chunk;
 			chunk = audit_tree_lookup(inode);
 			if (chunk) {
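
[Annotation] audit_watch_compare() folds the open-coded dev/ino test that the AUDIT_WATCH hunk above removes into a helper in kernel/audit_watch.c. Its definition is not part of these hunks, so the field names below are an assumption inferred from the removed code and from the watch->dev/watch->ino assignments visible in audit_add_watch():

/* inferred shape of the new helper; not taken verbatim from this diff */
static int audit_watch_compare(struct audit_watch *watch, unsigned long ino, dev_t dev)
{
	return (watch->ino != (unsigned long)-1) &&
	       (watch->ino == ino) &&
	       (watch->dev == dev);
}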
diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c
index 70f2ea758ffe..bad369ec5403 100644
--- a/kernel/sys_ni.c
+++ b/kernel/sys_ni.c
@@ -181,3 +181,7 @@ cond_syscall(sys_eventfd2);
 
 /* performance counters: */
 cond_syscall(sys_perf_event_open);
+
+/* fanotify! */
+cond_syscall(sys_fanotify_init);
+cond_syscall(sys_fanotify_mark);
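
[Annotation] The cond_syscall() stubs above wire up the two new fanotify syscalls for configurations that compile fanotify out. From userspace, a CAP_SYS_ADMIN process drives them through fanotify_init(2)/fanotify_mark(2). The sketch below uses the API as it later stabilized in glibc's <sys/fanotify.h> (the FAN_CLASS_NOTIF init flag postdates this merge, and contemporary code would have called syscall(2) directly), so read it as a sketch of the interface, not of this exact kernel revision:

#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>
#include <sys/fanotify.h>

int main(void)
{
	char buf[4096];
	ssize_t len;
	/* one notification group; O_RDONLY applies to the fds handed back in events */
	int fd = fanotify_init(FAN_CLASS_NOTIF, O_RDONLY);
	if (fd < 0) { perror("fanotify_init"); return 1; }

	/* watch open/close of everything on the mount containing "/" */
	if (fanotify_mark(fd, FAN_MARK_ADD | FAN_MARK_MOUNT,
			  FAN_OPEN | FAN_CLOSE, AT_FDCWD, "/") < 0) {
		perror("fanotify_mark");
		return 1;
	}

	while ((len = read(fd, buf, sizeof(buf))) > 0) {
		struct fanotify_event_metadata *md = (void *)buf;
		while (FAN_EVENT_OK(md, len)) {
			printf("mask 0x%llx pid %d\n",
			       (unsigned long long)md->mask, (int)md->pid);
			if (md->fd >= 0)
				close(md->fd);	/* each event carries an open fd */
			md = FAN_EVENT_NEXT(md, len);
		}
	}
	return 0;
}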
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 6b005e4912b5..ca38e8e3e907 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -44,6 +44,7 @@
 #include <linux/times.h>
 #include <linux/limits.h>
 #include <linux/dcache.h>
+#include <linux/dnotify.h>
 #include <linux/syscalls.h>
 #include <linux/vmstat.h>
 #include <linux/nfs_fs.h>
@@ -131,6 +132,9 @@ static int min_percpu_pagelist_fract = 8;
 
 static int ngroups_max = NGROUPS_MAX;
 
+#ifdef CONFIG_INOTIFY_USER
+#include <linux/inotify.h>
+#endif
 #ifdef CONFIG_SPARC
 #include <asm/system.h>
 #endif
@@ -207,9 +211,6 @@ static struct ctl_table fs_table[];
 static struct ctl_table debug_table[];
 static struct ctl_table dev_table[];
 extern struct ctl_table random_table[];
-#ifdef CONFIG_INOTIFY_USER
-extern struct ctl_table inotify_table[];
-#endif
 #ifdef CONFIG_EPOLL
 extern struct ctl_table epoll_table[];
 #endif
diff --git a/security/security.c b/security/security.c
index e8c87b8601b4..7461b1bc296c 100644
--- a/security/security.c
+++ b/security/security.c
@@ -619,7 +619,13 @@ void security_inode_getsecid(const struct inode *inode, u32 *secid)
 
 int security_file_permission(struct file *file, int mask)
 {
-	return security_ops->file_permission(file, mask);
+	int ret;
+
+	ret = security_ops->file_permission(file, mask);
+	if (ret)
+		return ret;
+
+	return fsnotify_perm(file, mask);
 }
 
 int security_file_alloc(struct file *file)
@@ -683,7 +689,13 @@ int security_file_receive(struct file *file)
 
 int security_dentry_open(struct file *file, const struct cred *cred)
 {
-	return security_ops->dentry_open(file, cred);
+	int ret;
+
+	ret = security_ops->dentry_open(file, cred);
+	if (ret)
+		return ret;
+
+	return fsnotify_perm(file, MAY_OPEN);
 }
 
 int security_task_create(unsigned long clone_flags)
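
[Annotation] The fsnotify_perm() calls added to the two security hooks above are what let a fanotify listener veto file access: the LSM check runs first, and only if it allows the operation does fsnotify get a chance to block it. From userspace that surfaces as fanotify permission events, answered by writing a struct fanotify_response back to the group fd. The sketch below again uses the later-stabilized API (FAN_CLASS_CONTENT postdates this merge), so treat the init flags as assumptions:

#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>
#include <sys/fanotify.h>

int main(void)
{
	char buf[4096];
	ssize_t len;
	int fd = fanotify_init(FAN_CLASS_CONTENT, O_RDONLY);
	if (fd < 0) { perror("fanotify_init"); return 1; }

	/* ask to arbitrate every open of one (hypothetical) file */
	if (fanotify_mark(fd, FAN_MARK_ADD, FAN_OPEN_PERM,
			  AT_FDCWD, "/tmp/guarded") < 0) {
		perror("fanotify_mark");
		return 1;
	}

	while ((len = read(fd, buf, sizeof(buf))) > 0) {
		struct fanotify_event_metadata *md = (void *)buf;
		while (FAN_EVENT_OK(md, len)) {
			if (md->mask & FAN_OPEN_PERM) {
				struct fanotify_response resp = {
					.fd = md->fd,
					.response = FAN_ALLOW,	/* or FAN_DENY to block */
				};
				write(fd, &resp, sizeof(resp));
			}
			if (md->fd >= 0)
				close(md->fd);
			md = FAN_EVENT_NEXT(md, len);
		}
	}
	return 0;
}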