[PATCH] inotify (1/5): split kernel API from userspace support

The following series of patches introduces a kernel API for inotify, making it possible for kernel modules to benefit from inotify's mechanism for watching inodes. With these patches, inotify will maintain for each caller a list of watches (via an embedded struct inotify_watch), where each inotify_watch is associated with a corresponding struct inode. The caller registers an event handler and specifies for which filesystem events their event handler should be called per inotify_watch.

Signed-off-by: Amy Griffis <amy.griffis@hp.com>
Acked-by: Robert Love <rml@novell.com>
Acked-by: John McCutchan <john@johnmccutchan.com>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
author: Amy Griffis <amy.griffis@hp.com> 2006-06-01 16:10:59 -0400
committer: Al Viro <viro@zeniv.linux.org.uk> 2006-06-20 05:25:17 -0400
commit: 2d9048e201bfb67ba21f05e647b1286b8a4a5667 (patch)
tree: 1df2ca6780d403f3209cf445f8b0b27f45098434 /fs/inotify_user.c
parent: 90204e0b7b51e9f2a6905adca12dc331128602c7 (diff)
1 files changed, 717 insertions, 0 deletions
diff --git a/fs/inotify_user.c b/fs/inotify_user.c
new file mode 100644
index 000000000000..845dc79a4e9c
--- /dev/null
+++ b/fs/inotify_user.c
@@ -0,0 +1,717 @@
+/*
+ * fs/inotify_user.c - inotify support for userspace
+ *
+ * Authors:
+ *      John McCutchan  <ttb@tentacle.dhs.org>
+ *      Robert Love     <rml@novell.com>
+ *
+ * Copyright (C) 2005 John McCutchan
+ * Copyright 2006 Hewlett-Packard Development Company, L.P.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the
+ * Free Software Foundation; either version 2, or (at your option) any
+ * later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ */
+#include <linux/kernel.h>
+#include <linux/sched.h>
+#include <linux/slab.h>
+#include <linux/fs.h>
+#include <linux/file.h>
+#include <linux/mount.h>
+#include <linux/namei.h>
+#include <linux/poll.h>
+#include <linux/init.h>
+#include <linux/list.h>
+#include <linux/inotify.h>
+#include <linux/syscalls.h>
+#include <asm/ioctls.h>
+static kmem_cache_t *watch_cachep __read_mostly;       /* slab cache for struct inotify_user_watch */
+static kmem_cache_t *event_cachep __read_mostly;       /* slab cache for struct inotify_kernel_event */
+static struct vfsmount *inotify_mnt __read_mostly;     /* internal mount created in inotify_user_setup() */
+/* these are configurable via /proc/sys/fs/inotify/ */
+int inotify_max_user_instances __read_mostly;  /* limit checked against user->inotify_devs in sys_inotify_init() */
+int inotify_max_user_watches __read_mostly;    /* limit checked against user->inotify_watches in create_watch() */
+int inotify_max_queued_events __read_mostly;   /* copied into dev->max_events by sys_inotify_init() */
+/*
+ * Lock ordering:
+ *
+ * inotify_dev->up_mutex (ensures we don't re-add the same watch)
+ *      inode->inotify_mutex (protects inode's watch list)
+ *              inotify_handle->mutex (protects inotify_handle's watch list)
+ *                      inotify_dev->ev_mutex (protects device's event queue)
+ */
+/*
+ * Lifetimes of the main data structures:
+ *
+ * inotify_device: Lifetime is managed by reference count, from
+ * sys_inotify_init() until release.  Additional references can bump the count
+ * via get_inotify_dev() and drop the count via put_inotify_dev().
+ *
+ * inotify_user_watch: Lifetime is from create_watch() to the receipt of an
+ * IN_IGNORED event from inotify, or when using IN_ONESHOT, to receipt of the
+ * first event, or to inotify_destroy().
+ */
+/*
+ * struct inotify_device - represents an inotify instance
+ *
+ * The event queue ('events', 'event_count' and 'queue_size') is protected by
+ * 'ev_mutex'.  'up_mutex' serializes watch updates (see the lock ordering
+ * comment at the top of this file).  'count' is an atomic reference count
+ * manipulated via get_inotify_dev() and put_inotify_dev().
+ */
+struct inotify_device {
+        wait_queue_head_t       wq;             /* wait queue for i/o */
+        struct mutex            ev_mutex;       /* protects event queue */
+        struct mutex            up_mutex;       /* synchronizes watch updates */
+        struct list_head        events;         /* list of queued events */
+        atomic_t                count;          /* reference count */
+        struct user_struct      *user;          /* user who opened this dev */
+        struct inotify_handle   *ih;            /* inotify handle */
+        unsigned int            queue_size;     /* size of the queue (bytes) */
+        unsigned int            event_count;    /* number of pending events */
+        unsigned int            max_events;     /* maximum number of events */
+};
+/*
+ * struct inotify_kernel_event - An inotify event, originating from a watch and
+ * queued for user-space.  A list of these is attached to each instance of the
+ * device.  In read(), this list is walked and all events that can fit in the
+ * buffer are returned.
+ *
+ * 'name', when non-NULL, is a kmalloc()ed buffer of event.len bytes: the
+ * NUL-terminated filename followed by zero padding (see kernel_event()).  It
+ * is released together with the event in remove_kevent().
+ *
+ * Protected by dev->ev_mutex of the device in which we are queued.
+ */
+struct inotify_kernel_event {
+        struct inotify_event    event;  /* the user-space event */
+        struct list_head        list;   /* entry in inotify_device's list */
+        char                    *name;  /* filename, if any */
+};
+/*
+ * struct inotify_user_watch - our version of an inotify_watch, we add
+ * a reference to the associated inotify_device.
+ *
+ * Callbacks receive a pointer to the embedded 'wdata' and recover the
+ * enclosing structure with container_of().  The reference on 'dev' is taken
+ * in create_watch() and dropped in free_inotify_user_watch().
+ */
+struct inotify_user_watch {
+        struct inotify_device   *dev;   /* associated device */
+        struct inotify_watch    wdata;  /* inotify watch data */
+};
+#ifdef CONFIG_SYSCTL
+#include <linux/sysctl.h>
+static int zero;       /* referenced by .extra1 of every entry below: the minimum accepted by proc_dointvec_minmax */
+ctl_table inotify_table[] = {  /* one entry per tunable limit; each exposes an int, mode 0644 */
+        {
+                .ctl_name       = INOTIFY_MAX_USER_INSTANCES,
+                .procname       = "max_user_instances",
+                .data           = &inotify_max_user_instances,
+                .maxlen         = sizeof(int),
+                .mode           = 0644,
+                .proc_handler   = &proc_dointvec_minmax,
+                .strategy       = &sysctl_intvec,
+                .extra1         = &zero,
+        },
+        {
+                .ctl_name       = INOTIFY_MAX_USER_WATCHES,
+                .procname       = "max_user_watches",
+                .data           = &inotify_max_user_watches,
+                .maxlen         = sizeof(int),
+                .mode           = 0644,
+                .proc_handler   = &proc_dointvec_minmax,
+                .strategy       = &sysctl_intvec,
+                .extra1         = &zero,
+        },
+        {
+                .ctl_name       = INOTIFY_MAX_QUEUED_EVENTS,
+                .procname       = "max_queued_events",
+                .data           = &inotify_max_queued_events,
+                .maxlen         = sizeof(int),
+                .mode           = 0644,
+                .proc_handler   = &proc_dointvec_minmax,
+                .strategy       = &sysctl_intvec,
+                .extra1         = &zero
+        },
+        { .ctl_name = 0 }      /* terminating entry */
+};
+#endif /* CONFIG_SYSCTL */
+/*
+ * get_inotify_dev - take an additional reference on the device
+ *
+ * Paired with put_inotify_dev().
+ */
+static inline void get_inotify_dev(struct inotify_device *dev)
+{
+        atomic_inc(&dev->count);
+}
+/*
+ * put_inotify_dev - drop a reference on the device
+ *
+ * When the last reference goes away, the owning user's instance count is
+ * decremented, the reference on the user is released and the device is freed.
+ */
+static inline void put_inotify_dev(struct inotify_device *dev)
+{
+        /* somebody else still holds a reference: nothing more to do */
+        if (!atomic_dec_and_test(&dev->count))
+                return;
+
+        atomic_dec(&dev->user->inotify_devs);
+        free_uid(dev->user);
+        kfree(dev);
+}
+/*
+ * free_inotify_user_watch - release a watch together with what it holds
+ *
+ * Registered as the destroy_watch callback.  Undoes create_watch(): the
+ * user's watch count is decremented, the device reference is dropped and the
+ * watch goes back to its slab cache.
+ */
+static void free_inotify_user_watch(struct inotify_watch *w)
+{
+        struct inotify_user_watch *uwatch =
+                container_of(w, struct inotify_user_watch, wdata);
+        struct inotify_device *owner = uwatch->dev;
+
+        atomic_dec(&owner->user->inotify_watches);
+        put_inotify_dev(owner);
+        kmem_cache_free(watch_cachep, uwatch);
+}
+/*
+ * kernel_event - allocate and fill in a new kernel event
+ *
+ * Returns the new event, or NULL if either allocation fails.  When a name is
+ * given it is copied into a separately allocated buffer whose size is the
+ * name length (including the terminating NUL) rounded up to a multiple of
+ * sizeof(struct inotify_event); the tail is zero filled and event.len is set
+ * to the padded size.  Without a name, event.len is 0 and name is NULL.
+ *
+ * This function can sleep.
+ */
+static struct inotify_kernel_event * kernel_event(s32 wd, u32 mask, u32 cookie,
+                                                  const char *name)
+{
+        const size_t unit = sizeof(struct inotify_event);
+        struct inotify_kernel_event *ev;
+        size_t copy_len, padded_len;
+
+        ev = kmem_cache_alloc(event_cachep, GFP_KERNEL);
+        if (unlikely(ev == NULL))
+                return NULL;
+
+        /* the header is copied to user space as is: start from all zeroes */
+        memset(&ev->event, 0, unit);
+        ev->event.cookie = cookie;
+        ev->event.mask = mask;
+        ev->event.wd = wd;
+        INIT_LIST_HEAD(&ev->list);
+
+        if (name == NULL) {
+                ev->name = NULL;
+                ev->event.len = 0;
+                return ev;
+        }
+
+        /*
+         * Pad the filename so that an array of inotify_event structures
+         * stays properly aligned: round the length up to the next multiple
+         * of the structure's size.
+         */
+        copy_len = strlen(name) + 1;
+        padded_len = ((copy_len + unit - 1) / unit) * unit;
+
+        ev->name = kmalloc(padded_len, GFP_KERNEL);
+        if (unlikely(ev->name == NULL)) {
+                kmem_cache_free(event_cachep, ev);
+                return NULL;
+        }
+
+        memcpy(ev->name, name, copy_len);
+        if (padded_len != copy_len)
+                memset(ev->name + copy_len, 0, padded_len - copy_len);
+        ev->event.len = padded_len;
+
+        return ev;
+}
+/*
+ * inotify_dev_get_event - return the next event in the given dev's queue
+ *
+ * "Next" is the entry at the head of dev->events, i.e. the oldest queued
+ * event, since events are appended with list_add_tail().
+ *
+ * The list must not be empty: on an empty list dev->events.next points back
+ * at the list head itself, so the returned pointer would not refer to a real
+ * inotify_kernel_event.  Callers are expected to check list_empty() first.
+ *
+ * Caller must hold dev->ev_mutex.
+ */
+static inline struct inotify_kernel_event *
+inotify_dev_get_event(struct inotify_device *dev)
+{
+        return list_entry(dev->events.next, struct inotify_kernel_event, list);
+}
+/*
+ * inotify_dev_queue_event - event handler registered with core inotify, adds
+ * a new event to the given device
+ *
+ * @w:      the watch that fired (embedded in our inotify_user_watch)
+ * @wd:     watch descriptor to report to user space
+ * @mask:   event mask to report
+ * @cookie: cookie to report
+ * @name:   filename associated with the event, or NULL
+ *
+ * The event is silently dropped if it duplicates the most recently queued
+ * event, if the queue has already overflowed, or if allocation fails.  When
+ * the queue is exactly full, an IN_Q_OVERFLOW event is queued in place of the
+ * requested one.
+ *
+ * Can sleep (calls kernel_event()).
+ */
+static void inotify_dev_queue_event(struct inotify_watch *w, u32 wd, u32 mask,
+                                    u32 cookie, const char *name)
+{
+        struct inotify_user_watch *watch;
+        struct inotify_device *dev;
+        struct inotify_kernel_event *kevent, *last = NULL;
+
+        watch = container_of(w, struct inotify_user_watch, wdata);
+        dev = watch->dev;
+
+        mutex_lock(&dev->ev_mutex);
+
+        /* we can safely put the watch as we don't reference it while
+         * generating the event
+         *
+         * NOTE(review): 'mask' is whatever the core passes in; confirm that
+         * it carries IN_ONESHOT for one-shot watches, otherwise this put is
+         * only reached for IN_IGNORED.
+         */
+        if (mask & IN_IGNORED || mask & IN_ONESHOT)
+                put_inotify_watch(w); /* final put */
+
+        /*
+         * coalescing: drop this event if it is a dupe of the previous one.
+         * "Previous" is the most recently queued event, i.e. the tail of the
+         * list.  An empty list has nothing to coalesce with and must not be
+         * handed to list_entry(), which would return a pointer derived from
+         * the list head rather than from a real event.
+         */
+        if (!list_empty(&dev->events))
+                last = list_entry(dev->events.prev,
+                                  struct inotify_kernel_event, list);
+        if (last && last->event.mask == mask && last->event.wd == wd &&
+                        last->event.cookie == cookie) {
+                const char *lastname = last->name;
+
+                if (!name && !lastname)
+                        goto out;
+                if (name && lastname && !strcmp(lastname, name))
+                        goto out;
+        }
+
+        /* the queue overflowed and we already sent the Q_OVERFLOW event */
+        if (unlikely(dev->event_count > dev->max_events))
+                goto out;
+
+        /* if the queue overflows, we need to notify user space */
+        if (unlikely(dev->event_count == dev->max_events))
+                kevent = kernel_event(-1, IN_Q_OVERFLOW, cookie, NULL);
+        else
+                kevent = kernel_event(wd, mask, cookie, name);
+
+        if (unlikely(!kevent))
+                goto out;
+
+        /* queue the event and wake up anyone waiting */
+        dev->event_count++;
+        dev->queue_size += sizeof(struct inotify_event) + kevent->event.len;
+        list_add_tail(&kevent->list, &dev->events);
+        wake_up_interruptible(&dev->wq);
+
+out:
+        mutex_unlock(&dev->ev_mutex);
+}
+/*
+ * remove_kevent - unlink the given kevent from its device and free it
+ *
+ * Updates the device's event count and byte accounting, then frees the name
+ * buffer (if any) and the event itself.
+ *
+ * Caller must hold dev->ev_mutex.
+ */
+static void remove_kevent(struct inotify_device *dev,
+                          struct inotify_kernel_event *kevent)
+{
+        /* bytes this event would have occupied in a read() buffer */
+        size_t bytes = sizeof(struct inotify_event) + kevent->event.len;
+
+        list_del(&kevent->list);
+
+        dev->queue_size -= bytes;
+        dev->event_count--;
+
+        kfree(kevent->name);
+        kmem_cache_free(event_cachep, kevent);
+}
+/*
+ * inotify_dev_event_dequeue - destroy the oldest event on the given device
+ *
+ * Does nothing when the queue is empty.
+ *
+ * Caller must hold dev->ev_mutex.
+ */
+static void inotify_dev_event_dequeue(struct inotify_device *dev)
+{
+        if (list_empty(&dev->events))
+                return;
+
+        remove_kevent(dev, inotify_dev_get_event(dev));
+}
+/*
+ * find_inode - resolve a user-given path and fill in the caller's nameidata
+ *
+ * Returns 0 on success, in which case the caller owns the reference held by
+ * 'nd' and must path_release() it.  On failure the error from the lookup or
+ * from the permission check is returned and 'nd' holds nothing.
+ */
+static int find_inode(const char __user *dirname, struct nameidata *nd,
+                      unsigned flags)
+{
+        int rc;
+
+        rc = __user_walk(dirname, flags, nd);
+        if (!rc) {
+                /* watching an inode requires read permission on it */
+                rc = vfs_permission(nd, MAY_READ);
+                if (rc)
+                        path_release(nd);
+        }
+
+        return rc;
+}
+/*
+ * create_watch - creates a watch on the given device.
+ *
+ * Returns the value of inotify_add_watch() on success, -ENOSPC if the user
+ * already holds inotify_max_user_watches watches, -ENOMEM if the watch cannot
+ * be allocated, or the negative error from inotify_add_watch().
+ *
+ * Callers must hold dev->up_mutex.
+ */
+static int create_watch(struct inotify_device *dev, struct inode *inode,
+                        u32 mask)
+{
+        struct user_struct *owner = dev->user;
+        struct inotify_user_watch *uwatch;
+        int wd;
+
+        if (atomic_read(&owner->inotify_watches) >= inotify_max_user_watches)
+                return -ENOSPC;
+
+        uwatch = kmem_cache_alloc(watch_cachep, GFP_KERNEL);
+        if (unlikely(uwatch == NULL))
+                return -ENOMEM;
+
+        /* the watch pins the device and counts against the user's quota */
+        get_inotify_dev(dev);
+        uwatch->dev = dev;
+        atomic_inc(&owner->inotify_watches);
+
+        wd = inotify_add_watch(dev->ih, &uwatch->wdata, inode, mask);
+        if (wd >= 0)
+                return wd;
+
+        /* undo the accounting above and free the watch */
+        free_inotify_user_watch(&uwatch->wdata);
+        return wd;
+}
+/* Device Interface */
+
+/*
+ * inotify_poll - report the instance as readable when events are queued
+ */
+static unsigned int inotify_poll(struct file *file, poll_table *wait)
+{
+        struct inotify_device *dev = file->private_data;
+        int ready;
+
+        poll_wait(file, &dev->wq, wait);
+
+        mutex_lock(&dev->ev_mutex);
+        ready = list_empty(&dev->events) ? 0 : (POLLIN | POLLRDNORM);
+        mutex_unlock(&dev->ev_mutex);
+
+        return ready;
+}
+/*
+ * inotify_read - copy queued events to user space
+ *
+ * Waits until at least one event is queued (unless O_NONBLOCK is set), then
+ * copies as many whole events as fit into the caller's buffer and removes
+ * them from the queue.
+ *
+ * Returns the number of bytes copied.  Returns -EAGAIN if nothing is queued
+ * and the file is non-blocking, -EINTR if a signal arrives while waiting,
+ * -EFAULT if copying to user space fails, and -EINVAL if the buffer is too
+ * small to hold even the first queued event (returning 0 there would be
+ * indistinguishable from end-of-file for the caller).
+ */
+static ssize_t inotify_read(struct file *file, char __user *buf,
+                            size_t count, loff_t *pos)
+{
+        size_t event_size = sizeof (struct inotify_event);
+        struct inotify_device *dev;
+        char __user *start;
+        int ret;
+        DEFINE_WAIT(wait);
+
+        start = buf;
+        dev = file->private_data;
+
+        /* phase 1: wait for the queue to become non-empty */
+        while (1) {
+                int events;
+
+                prepare_to_wait(&dev->wq, &wait, TASK_INTERRUPTIBLE);
+
+                mutex_lock(&dev->ev_mutex);
+                events = !list_empty(&dev->events);
+                mutex_unlock(&dev->ev_mutex);
+                if (events) {
+                        ret = 0;
+                        break;
+                }
+
+                if (file->f_flags & O_NONBLOCK) {
+                        ret = -EAGAIN;
+                        break;
+                }
+
+                if (signal_pending(current)) {
+                        ret = -EINTR;
+                        break;
+                }
+
+                schedule();
+        }
+
+        finish_wait(&dev->wq, &wait);
+        if (ret)
+                return ret;
+
+        /*
+         * phase 2: drain.  The lock was dropped above, so the queue may have
+         * been emptied by another reader in the meantime; that case simply
+         * returns 0 bytes.
+         */
+        mutex_lock(&dev->ev_mutex);
+        while (1) {
+                struct inotify_kernel_event *kevent;
+
+                /* bytes handed out so far; the result unless an error hits */
+                ret = buf - start;
+                if (list_empty(&dev->events))
+                        break;
+
+                kevent = inotify_dev_get_event(dev);
+                if (event_size + kevent->event.len > count) {
+                        /*
+                         * Not even a single event fits: say so instead of
+                         * returning 0.
+                         */
+                        if (ret == 0 && count > 0)
+                                ret = -EINVAL;
+                        break;
+                }
+
+                if (copy_to_user(buf, &kevent->event, event_size)) {
+                        ret = -EFAULT;
+                        break;
+                }
+                buf += event_size;
+                count -= event_size;
+
+                if (kevent->name) {
+                        if (copy_to_user(buf, kevent->name, kevent->event.len)){
+                                ret = -EFAULT;
+                                break;
+                        }
+                        buf += kevent->event.len;
+                        count -= kevent->event.len;
+                }
+
+                /* only events that were fully copied leave the queue */
+                remove_kevent(dev, kevent);
+        }
+        mutex_unlock(&dev->ev_mutex);
+
+        return ret;
+}
+/*
+ * inotify_release - called when the inotify file is released
+ *
+ * Destroys the core handle, discards every event still queued and drops the
+ * reference taken on the device in sys_inotify_init().  Always returns 0.
+ */
+static int inotify_release(struct inode *ignored, struct file *file)
+{
+        struct inotify_device *dev = file->private_data;
+
+        inotify_destroy(dev->ih);
+
+        /* throw away whatever user space never read */
+        mutex_lock(&dev->ev_mutex);
+        while (!list_empty(&dev->events))
+                remove_kevent(dev, inotify_dev_get_event(dev));
+        mutex_unlock(&dev->ev_mutex);
+
+        /* the put matching the get in sys_inotify_init() */
+        put_inotify_dev(dev);
+
+        return 0;
+}
+/*
+ * inotify_ioctl - ioctl handler for inotify instances
+ *
+ * Only FIONREAD is supported: it stores dev->queue_size through the int
+ * pointer passed in 'arg' and returns the result of put_user().  Any other
+ * command returns -ENOTTY.
+ */
+static long inotify_ioctl(struct file *file, unsigned int cmd,
+                          unsigned long arg)
+{
+        struct inotify_device *dev = file->private_data;
+        int __user *uptr = (int __user *) arg;
+
+        if (cmd != FIONREAD)
+                return -ENOTTY;
+
+        return put_user(dev->queue_size, uptr);
+}
+/*
+ * File operations for inotify instances.  The address of this table is also
+ * compared against filp->f_op by sys_inotify_add_watch() and
+ * sys_inotify_rm_watch() to verify that a descriptor really refers to an
+ * inotify instance.
+ */
+static const struct file_operations inotify_fops = {
+        .poll           = inotify_poll,
+        .read           = inotify_read,
+        .release        = inotify_release,
+        .unlocked_ioctl = inotify_ioctl,
+        .compat_ioctl   = inotify_ioctl,        /* same handler as unlocked_ioctl */
+};
+/* callbacks handed to the inotify core via inotify_init() in sys_inotify_init() */
+static const struct inotify_operations inotify_user_ops = {
+        .handle_event   = inotify_dev_queue_event,      /* queues the event for user space */
+        .destroy_watch  = free_inotify_user_watch,      /* frees our inotify_user_watch */
+};
+/*
+ * sys_inotify_init - create a new inotify instance and return a descriptor
+ *
+ * Reserves a descriptor and an empty file, takes a reference on the calling
+ * user, enforces inotify_max_user_instances, then allocates the
+ * inotify_device and its core inotify handle and wires them to the file.
+ *
+ * Returns the new file descriptor on success.  On failure returns the error
+ * from get_unused_fd(), -ENFILE if no file could be obtained, -EMFILE if the
+ * user is at the instance limit, -ENOMEM if the device cannot be allocated,
+ * or the error encoded in the pointer returned by inotify_init().
+ */
+asmlinkage long sys_inotify_init(void)
+{
+        struct inotify_device *dev;
+        struct inotify_handle *ih;
+        struct user_struct *user;
+        struct file *filp;
+        int fd, ret;
+        fd = get_unused_fd();
+        if (fd < 0)
+                return fd;
+        filp = get_empty_filp();
+        if (!filp) {
+                ret = -ENFILE;
+                goto out_put_fd;
+        }
+        user = get_uid(current->user);
+        if (unlikely(atomic_read(&user->inotify_devs) >=
+                        inotify_max_user_instances)) {
+                ret = -EMFILE;
+                goto out_free_uid;
+        }
+        dev = kmalloc(sizeof(struct inotify_device), GFP_KERNEL);
+        if (unlikely(!dev)) {
+                ret = -ENOMEM;
+                goto out_free_uid;
+        }
+        ih = inotify_init(&inotify_user_ops);
+        if (unlikely(IS_ERR(ih))) {
+                ret = PTR_ERR(ih);
+                goto out_free_dev;
+        }
+        dev->ih = ih;
+        /* the file lives on the internal inotify mount and is read-only */
+        filp->f_op = &inotify_fops;
+        filp->f_vfsmnt = mntget(inotify_mnt);
+        filp->f_dentry = dget(inotify_mnt->mnt_root);
+        filp->f_mapping = filp->f_dentry->d_inode->i_mapping;
+        filp->f_mode = FMODE_READ;
+        filp->f_flags = O_RDONLY;
+        filp->private_data = dev;
+        INIT_LIST_HEAD(&dev->events);
+        init_waitqueue_head(&dev->wq);
+        mutex_init(&dev->ev_mutex);
+        mutex_init(&dev->up_mutex);
+        dev->event_count = 0;
+        dev->queue_size = 0;
+        dev->max_events = inotify_max_queued_events;   /* limit is sampled once, here */
+        dev->user = user;
+        atomic_set(&dev->count, 0);
+        get_inotify_dev(dev);  /* the file's reference; dropped in inotify_release() */
+        atomic_inc(&user->inotify_devs);
+        fd_install(fd, filp);
+        return fd;
+/* error unwinding, in reverse order of acquisition */
+out_free_dev:
+        kfree(dev);
+out_free_uid:
+        free_uid(user);
+        put_filp(filp);        /* every jump to out_free_uid happens after filp was obtained */
+out_put_fd:
+        put_unused_fd(fd);
+        return ret;
+}
+/*
+ * sys_inotify_add_watch - add a watch to, or update a watch on, an instance
+ *
+ * Resolves 'path' (following a trailing symlink unless IN_DONT_FOLLOW is set,
+ * and insisting on a directory if IN_ONLYDIR is set), then updates the
+ * existing watch on that inode or, if there is none, creates a new one.
+ *
+ * Returns the result of inotify_find_update_watch() or create_watch(),
+ * -EBADF if 'fd' is not an open descriptor, -EINVAL if it is not an inotify
+ * instance, or the error from find_inode().
+ */
+asmlinkage long sys_inotify_add_watch(int fd, const char __user *path, u32 mask)
+{
+        struct inotify_device *dev;
+        struct nameidata nd;
+        struct inode *target;
+        struct file *filp;
+        unsigned lookup_flags;
+        int fput_needed;
+        int ret;
+
+        filp = fget_light(fd, &fput_needed);
+        if (unlikely(filp == NULL))
+                return -EBADF;
+
+        /* only descriptors created by sys_inotify_init() are acceptable */
+        if (unlikely(filp->f_op != &inotify_fops)) {
+                ret = -EINVAL;
+                goto fput_and_out;
+        }
+
+        lookup_flags = (mask & IN_DONT_FOLLOW) ? 0 : LOOKUP_FOLLOW;
+        if (mask & IN_ONLYDIR)
+                lookup_flags |= LOOKUP_DIRECTORY;
+
+        ret = find_inode(path, &nd, lookup_flags);
+        if (unlikely(ret))
+                goto fput_and_out;
+
+        /* inode held in place by reference to nd; dev by fget on fd */
+        dev = filp->private_data;
+        target = nd.dentry->d_inode;
+
+        /* up_mutex keeps two callers from adding the same watch twice */
+        mutex_lock(&dev->up_mutex);
+        ret = inotify_find_update_watch(dev->ih, target, mask);
+        if (ret == -ENOENT)
+                ret = create_watch(dev, target, mask);
+        mutex_unlock(&dev->up_mutex);
+
+        path_release(&nd);
+fput_and_out:
+        fput_light(filp, fput_needed);
+        return ret;
+}
+/*
+ * sys_inotify_rm_watch - remove the watch with descriptor 'wd' from an instance
+ *
+ * Returns the result of inotify_rm_wd(), -EBADF if 'fd' is not an open
+ * descriptor, or -EINVAL if it does not refer to an inotify instance.
+ */
+asmlinkage long sys_inotify_rm_watch(int fd, u32 wd)
+{
+        struct inotify_device *dev;
+        struct file *filp;
+        int fput_needed;
+        int ret;
+
+        filp = fget_light(fd, &fput_needed);
+        if (unlikely(filp == NULL))
+                return -EBADF;
+
+        if (unlikely(filp->f_op != &inotify_fops)) {
+                /* not one of ours */
+                ret = -EINVAL;
+        } else {
+                dev = filp->private_data;
+                /* we free our watch data when we get IN_IGNORED */
+                ret = inotify_rm_wd(dev->ih, wd);
+        }
+
+        fput_light(filp, fput_needed);
+        return ret;
+}
+/*
+ * inotify_get_sb - get_sb callback of the internal inotify filesystem
+ *
+ * Delegates to get_sb_pseudo() with the name "inotify", a NULL third
+ * argument and 0xBAD1DEA as the last one (presumably the superblock magic
+ * number -- confirm against get_sb_pseudo()).  'flags', 'dev_name' and 'data'
+ * are not used.
+ */
+static struct super_block *
+inotify_get_sb(struct file_system_type *fs_type, int flags,
+               const char *dev_name, void *data)
+{
+    return get_sb_pseudo(fs_type, "inotify", NULL, 0xBAD1DEA);
+}
+/* filesystem type registered and mounted by inotify_user_setup() */
+static struct file_system_type inotify_fs_type = {
+    .name           = "inotifyfs",
+    .get_sb         = inotify_get_sb,
+    .kill_sb        = kill_anon_super,
+};
+/*
+ * inotify_user_setup - initialization of the userspace interface.
+ *
+ * Registers and mounts the internal inotify filesystem, sets the default
+ * limits and creates the slab caches.  Note that we cannot return an error
+ * because we have compiled-in VFS hooks, so an (unlikely) failure here must
+ * result in panic().
+ */
+static int __init inotify_user_setup(void)
+{
+        int err = register_filesystem(&inotify_fs_type);
+
+        if (unlikely(err))
+                panic("inotify: register_filesystem returned %d!\n", err);
+
+        inotify_mnt = kern_mount(&inotify_fs_type);
+        if (IS_ERR(inotify_mnt))
+                panic("inotify: kern_mount ret %ld!\n", PTR_ERR(inotify_mnt));
+
+        /* default limits */
+        inotify_max_user_watches = 8192;
+        inotify_max_user_instances = 128;
+        inotify_max_queued_events = 16384;
+
+        /* both caches are created with SLAB_PANIC; results are not checked */
+        watch_cachep = kmem_cache_create("inotify_watch_cache",
+                                         sizeof(struct inotify_user_watch),
+                                         0, SLAB_PANIC, NULL, NULL);
+        event_cachep = kmem_cache_create("inotify_event_cache",
+                                         sizeof(struct inotify_kernel_event),
+                                         0, SLAB_PANIC, NULL, NULL);
+
+        return 0;
+}
+module_init(inotify_user_setup);
author	Amy Griffis <amy.griffis@hp.com>	2006-06-01 16:10:59 -0400
committer	Al Viro <viro@zeniv.linux.org.uk>	2006-06-20 05:25:17 -0400
commit	2d9048e201bfb67ba21f05e647b1286b8a4a5667 (patch)
tree	1df2ca6780d403f3209cf445f8b0b27f45098434 /fs/inotify_user.c
parent	90204e0b7b51e9f2a6905adca12dc331128602c7 (diff)

diff --git a/fs/inotify_user.c b/fs/inotify_user.c
new file mode 100644
index 000000000000..845dc79a4e9c
--- /dev/null
+++ b/fs/inotify_user.c
@@ -0,0 +1,717 @@
	1	/*
	2	* fs/inotify_user.c - inotify support for userspace
	3	*
	4	* Authors:
	5	* John McCutchan <ttb@tentacle.dhs.org>
	6	* Robert Love <rml@novell.com>
	7	*
	8	* Copyright (C) 2005 John McCutchan
	9	* Copyright 2006 Hewlett-Packard Development Company, L.P.
	10	*
	11	* This program is free software; you can redistribute it and/or modify it
	12	* under the terms of the GNU General Public License as published by the
	13	* Free Software Foundation; either version 2, or (at your option) any
	14	* later version.
	15	*
	16	* This program is distributed in the hope that it will be useful, but
	17	* WITHOUT ANY WARRANTY; without even the implied warranty of
	18	* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
	19	* General Public License for more details.
	20	*/
	21
	22	#include <linux/kernel.h>
	23	#include <linux/sched.h>
	24	#include <linux/slab.h>
	25	#include <linux/fs.h>
	26	#include <linux/file.h>
	27	#include <linux/mount.h>
	28	#include <linux/namei.h>
	29	#include <linux/poll.h>
	30	#include <linux/init.h>
	31	#include <linux/list.h>
	32	#include <linux/inotify.h>
	33	#include <linux/syscalls.h>
	34
	35	#include <asm/ioctls.h>
	36
	37	static kmem_cache_t *watch_cachep __read_mostly;
	38	static kmem_cache_t *event_cachep __read_mostly;
	39
	40	static struct vfsmount *inotify_mnt __read_mostly;
	41
	42	/* these are configurable via /proc/sys/fs/inotify/ */
	43	int inotify_max_user_instances __read_mostly;
	44	int inotify_max_user_watches __read_mostly;
	45	int inotify_max_queued_events __read_mostly;
	46
	47	/*
	48	* Lock ordering:
	49	*
	50	* inotify_dev->up_mutex (ensures we don't re-add the same watch)
	51	* inode->inotify_mutex (protects inode's watch list)
	52	* inotify_handle->mutex (protects inotify_handle's watch list)
	53	* inotify_dev->ev_mutex (protects device's event queue)
	54	*/
	55
	56	/*
	57	* Lifetimes of the main data structures:
	58	*
	59	* inotify_device: Lifetime is managed by reference count, from
	60	* sys_inotify_init() until release. Additional references can bump the count
	61	* via get_inotify_dev() and drop the count via put_inotify_dev().
	62	*
	63	* inotify_user_watch: Lifetime is from create_watch() to the receipt of an
	64	* IN_IGNORED event from inotify, or when using IN_ONESHOT, to receipt of the
	65	* first event, or to inotify_destroy().
	66	*/
	67
	68	/*
	69	* struct inotify_device - represents an inotify instance
	70	*
	71	* This structure is protected by the mutex 'mutex'.
	72	*/
	73	struct inotify_device {
	74	wait_queue_head_t wq; /* wait queue for i/o */
	75	struct mutex ev_mutex; /* protects event queue */
	76	struct mutex up_mutex; /* synchronizes watch updates */
	77	struct list_head events; /* list of queued events */
	78	atomic_t count; /* reference count */
	79	struct user_struct *user; /* user who opened this dev */
	80	struct inotify_handle *ih; /* inotify handle */
	81	unsigned int queue_size; /* size of the queue (bytes) */
	82	unsigned int event_count; /* number of pending events */
	83	unsigned int max_events; /* maximum number of events */
	84	};
	85
	86	/*
	87	* struct inotify_kernel_event - An inotify event, originating from a watch and
	88	* queued for user-space. A list of these is attached to each instance of the
	89	* device. In read(), this list is walked and all events that can fit in the
	90	* buffer are returned.
	91	*
	92	* Protected by dev->ev_mutex of the device in which we are queued.
	93	*/
	94	struct inotify_kernel_event {
	95	struct inotify_event event; /* the user-space event */
	96	struct list_head list; /* entry in inotify_device's list */
	97	char *name; /* filename, if any */
	98	};
	99
	100	/*
	101	* struct inotify_user_watch - our version of an inotify_watch, we add
	102	* a reference to the associated inotify_device.
	103	*/
	104	struct inotify_user_watch {
	105	struct inotify_device *dev; /* associated device */
	106	struct inotify_watch wdata; /* inotify watch data */
	107	};
	108
	109	#ifdef CONFIG_SYSCTL
	110
	111	#include <linux/sysctl.h>
	112
	113	static int zero;
	114
	115	ctl_table inotify_table[] = {
	116	{
	117	.ctl_name = INOTIFY_MAX_USER_INSTANCES,
	118	.procname = "max_user_instances",
	119	.data = &inotify_max_user_instances,
	120	.maxlen = sizeof(int),
	121	.mode = 0644,
	122	.proc_handler = &proc_dointvec_minmax,
	123	.strategy = &sysctl_intvec,
	124	.extra1 = &zero,
	125	},
	126	{
	127	.ctl_name = INOTIFY_MAX_USER_WATCHES,
	128	.procname = "max_user_watches",
	129	.data = &inotify_max_user_watches,
	130	.maxlen = sizeof(int),
	131	.mode = 0644,
	132	.proc_handler = &proc_dointvec_minmax,
	133	.strategy = &sysctl_intvec,
	134	.extra1 = &zero,
	135	},
	136	{
	137	.ctl_name = INOTIFY_MAX_QUEUED_EVENTS,
	138	.procname = "max_queued_events",
	139	.data = &inotify_max_queued_events,
	140	.maxlen = sizeof(int),
	141	.mode = 0644,
	142	.proc_handler = &proc_dointvec_minmax,
	143	.strategy = &sysctl_intvec,
	144	.extra1 = &zero
	145	},
	146	{ .ctl_name = 0 }
	147	};
	148	#endif /* CONFIG_SYSCTL */
	149
	150	static inline void get_inotify_dev(struct inotify_device *dev)
	151	{
	152	atomic_inc(&dev->count);
	153	}
	154
	155	static inline void put_inotify_dev(struct inotify_device *dev)
	156	{
	157	if (atomic_dec_and_test(&dev->count)) {
	158	atomic_dec(&dev->user->inotify_devs);
	159	free_uid(dev->user);
	160	kfree(dev);
	161	}
	162	}
	163
	164	/*
	165	* free_inotify_user_watch - cleans up the watch and its references
	166	*/
	167	static void free_inotify_user_watch(struct inotify_watch *w)
	168	{
	169	struct inotify_user_watch *watch;
	170	struct inotify_device *dev;
	171
	172	watch = container_of(w, struct inotify_user_watch, wdata);
	173	dev = watch->dev;
	174
	175	atomic_dec(&dev->user->inotify_watches);
	176	put_inotify_dev(dev);
	177	kmem_cache_free(watch_cachep, watch);
	178	}
	179
	180	/*
	181	* kernel_event - create a new kernel event with the given parameters
	182	*
	183	* This function can sleep.
	184	*/
	185	static struct inotify_kernel_event * kernel_event(s32 wd, u32 mask, u32 cookie,
	186	const char *name)
	187	{
	188	struct inotify_kernel_event *kevent;
	189
	190	kevent = kmem_cache_alloc(event_cachep, GFP_KERNEL);
	191	if (unlikely(!kevent))
	192	return NULL;
	193
	194	/* we hand this out to user-space, so zero it just in case */
	195	memset(&kevent->event, 0, sizeof(struct inotify_event));
	196
	197	kevent->event.wd = wd;
	198	kevent->event.mask = mask;
	199	kevent->event.cookie = cookie;
	200
	201	INIT_LIST_HEAD(&kevent->list);
	202
	203	if (name) {
	204	size_t len, rem, event_size = sizeof(struct inotify_event);
	205
	206	/*
	207	* We need to pad the filename so as to properly align an
	208	* array of inotify_event structures. Because the structure is
	209	* small and the common case is a small filename, we just round
	210	* up to the next multiple of the structure's sizeof. This is
	211	* simple and safe for all architectures.
	212	*/
	213	len = strlen(name) + 1;
	214	rem = event_size - len;
	215	if (len > event_size) {
	216	rem = event_size - (len % event_size);
	217	if (len % event_size == 0)
	218	rem = 0;
	219	}
	220
	221	kevent->name = kmalloc(len + rem, GFP_KERNEL);
	222	if (unlikely(!kevent->name)) {
	223	kmem_cache_free(event_cachep, kevent);
	224	return NULL;
	225	}
	226	memcpy(kevent->name, name, len);
	227	if (rem)
	228	memset(kevent->name + len, 0, rem);
	229	kevent->event.len = len + rem;
	230	} else {
	231	kevent->event.len = 0;
	232	kevent->name = NULL;
	233	}
	234
	235	return kevent;
	236	}
	237
	238	/*
	239	* inotify_dev_get_event - return the next event in the given dev's queue
	240	*
	241	* Caller must hold dev->ev_mutex.
	242	*/
	243	static inline struct inotify_kernel_event *
	244	inotify_dev_get_event(struct inotify_device *dev)
	245	{
	246	return list_entry(dev->events.next, struct inotify_kernel_event, list);
	247	}
	248
	249	/*
	250	* inotify_dev_queue_event - event handler registered with core inotify, adds
	251	* a new event to the given device
	252	*
	253	* Can sleep (calls kernel_event()).
	254	*/
	255	static void inotify_dev_queue_event(struct inotify_watch *w, u32 wd, u32 mask,
	256	u32 cookie, const char *name)
	257	{
	258	struct inotify_user_watch *watch;
	259	struct inotify_device *dev;
	260	struct inotify_kernel_event kevent, last;
	261
	262	watch = container_of(w, struct inotify_user_watch, wdata);
	263	dev = watch->dev;
	264
	265	mutex_lock(&dev->ev_mutex);
	266
	267	/* we can safely put the watch as we don't reference it while
	268	* generating the event
	269	*/
	270	if (mask & IN_IGNORED \|\| mask & IN_ONESHOT)
	271	put_inotify_watch(w); /* final put */
	272
	273	/* coalescing: drop this event if it is a dupe of the previous */
	274	last = inotify_dev_get_event(dev);
	275	if (last && last->event.mask == mask && last->event.wd == wd &&
	276	last->event.cookie == cookie) {
	277	const char *lastname = last->name;
	278
	279	if (!name && !lastname)
	280	goto out;
	281	if (name && lastname && !strcmp(lastname, name))
	282	goto out;
	283	}
	284
	285	/* the queue overflowed and we already sent the Q_OVERFLOW event */
	286	if (unlikely(dev->event_count > dev->max_events))
	287	goto out;
	288
	289	/* if the queue overflows, we need to notify user space */
	290	if (unlikely(dev->event_count == dev->max_events))
	291	kevent = kernel_event(-1, IN_Q_OVERFLOW, cookie, NULL);
	292	else
	293	kevent = kernel_event(wd, mask, cookie, name);
	294
	295	if (unlikely(!kevent))
	296	goto out;
	297
	298	/* queue the event and wake up anyone waiting */
	299	dev->event_count++;
	300	dev->queue_size += sizeof(struct inotify_event) + kevent->event.len;
	301	list_add_tail(&kevent->list, &dev->events);
	302	wake_up_interruptible(&dev->wq);
	303
	304	out:
	305	mutex_unlock(&dev->ev_mutex);
	306	}
	307
	308	/*
	309	* remove_kevent - cleans up and ultimately frees the given kevent
	310	*
	311	* Caller must hold dev->ev_mutex.
	312	*/
	313	static void remove_kevent(struct inotify_device *dev,
	314	struct inotify_kernel_event *kevent)
	315	{
	316	list_del(&kevent->list);
	317
	318	dev->event_count--;
	319	dev->queue_size -= sizeof(struct inotify_event) + kevent->event.len;
	320
	321	kfree(kevent->name);
	322	kmem_cache_free(event_cachep, kevent);
	323	}
	324
	325	/*
	326	* inotify_dev_event_dequeue - destroy an event on the given device
	327	*
	328	* Caller must hold dev->ev_mutex.
	329	*/
	330	static void inotify_dev_event_dequeue(struct inotify_device *dev)
	331	{
	332	if (!list_empty(&dev->events)) {
	333	struct inotify_kernel_event *kevent;
	334	kevent = inotify_dev_get_event(dev);
	335	remove_kevent(dev, kevent);
	336	}
	337	}
	338
	339	/*
	340	* find_inode - resolve a user-given path to a specific inode and return a nd
	341	*/
	342	static int find_inode(const char __user dirname, struct nameidata nd,
	343	unsigned flags)
	344	{
	345	int error;
	346
	347	error = __user_walk(dirname, flags, nd);
	348	if (error)
	349	return error;
	350	/* you can only watch an inode if you have read permissions on it */
	351	error = vfs_permission(nd, MAY_READ);
	352	if (error)
	353	path_release(nd);
	354	return error;
	355	}
	356
	357	/*
	358	* create_watch - creates a watch on the given device.
	359	*
	360	* Callers must hold dev->up_mutex.
	361	*/
	362	static int create_watch(struct inotify_device dev, struct inode inode,
	363	u32 mask)
	364	{
	365	struct inotify_user_watch *watch;
	366	int ret;
	367
	368	if (atomic_read(&dev->user->inotify_watches) >=
	369	inotify_max_user_watches)
	370	return -ENOSPC;
	371
	372	watch = kmem_cache_alloc(watch_cachep, GFP_KERNEL);
	373	if (unlikely(!watch))
	374	return -ENOMEM;
	375
	376	/* save a reference to device and bump the count to make it official */
	377	get_inotify_dev(dev);
	378	watch->dev = dev;
	379
	380	atomic_inc(&dev->user->inotify_watches);
	381
	382	ret = inotify_add_watch(dev->ih, &watch->wdata, inode, mask);
	383	if (ret < 0)
	384	free_inotify_user_watch(&watch->wdata);
	385
	386	return ret;
	387	}
	388
	389	/* Device Interface */
	390
	391	static unsigned int inotify_poll(struct file file, poll_table wait)
	392	{
	393	struct inotify_device *dev = file->private_data;
	394	int ret = 0;
	395
	396	poll_wait(file, &dev->wq, wait);
	397	mutex_lock(&dev->ev_mutex);
	398	if (!list_empty(&dev->events))
	399	ret = POLLIN \| POLLRDNORM;
	400	mutex_unlock(&dev->ev_mutex);
	401
	402	return ret;
	403	}
	404
	405	static ssize_t inotify_read(struct file file, char __user buf,
	406	size_t count, loff_t *pos)
	407	{
	408	size_t event_size = sizeof (struct inotify_event);
	409	struct inotify_device *dev;
	410	char __user *start;
	411	int ret;
	412	DEFINE_WAIT(wait);
	413
	414	start = buf;
	415	dev = file->private_data;
	416
	417	while (1) {
	418	int events;
	419
	420	prepare_to_wait(&dev->wq, &wait, TASK_INTERRUPTIBLE);
	421
	422	mutex_lock(&dev->ev_mutex);
	423	events = !list_empty(&dev->events);
	424	mutex_unlock(&dev->ev_mutex);
	425	if (events) {
	426	ret = 0;
	427	break;
	428	}
	429
	430	if (file->f_flags & O_NONBLOCK) {
	431	ret = -EAGAIN;
	432	break;
	433	}
	434
	435	if (signal_pending(current)) {
	436	ret = -EINTR;
	437	break;
	438	}
	439
	440	schedule();
	441	}
	442
	443	finish_wait(&dev->wq, &wait);
	444	if (ret)
	445	return ret;
	446
	447	mutex_lock(&dev->ev_mutex);
	448	while (1) {
	449	struct inotify_kernel_event *kevent;
	450
	451	ret = buf - start;
	452	if (list_empty(&dev->events))
	453	break;
	454
	455	kevent = inotify_dev_get_event(dev);
	456	if (event_size + kevent->event.len > count)
	457	break;
	458
	459	if (copy_to_user(buf, &kevent->event, event_size)) {
	460	ret = -EFAULT;
	461	break;
	462	}
	463	buf += event_size;
	464	count -= event_size;
	465
	466	if (kevent->name) {
	467	if (copy_to_user(buf, kevent->name, kevent->event.len)){
	468	ret = -EFAULT;
	469	break;
	470	}
	471	buf += kevent->event.len;
	472	count -= kevent->event.len;
	473	}
	474
	475	remove_kevent(dev, kevent);
	476	}
	477	mutex_unlock(&dev->ev_mutex);
	478
	479	return ret;
	480	}
	481
	482	static int inotify_release(struct inode ignored, struct file file)
	483	{
	484	struct inotify_device *dev = file->private_data;
	485
	486	inotify_destroy(dev->ih);
	487
	488	/* destroy all of the events on this device */
	489	mutex_lock(&dev->ev_mutex);
	490	while (!list_empty(&dev->events))
	491	inotify_dev_event_dequeue(dev);
	492	mutex_unlock(&dev->ev_mutex);
	493
	494	/* free this device: the put matching the get in inotify_init() */
	495	put_inotify_dev(dev);
	496
	497	return 0;
	498	}
	499
	500	static long inotify_ioctl(struct file *file, unsigned int cmd,
	501	unsigned long arg)
	502	{
	503	struct inotify_device *dev;
	504	void __user *p;
	505	int ret = -ENOTTY;
	506
	507	dev = file->private_data;
	508	p = (void __user *) arg;
	509
	510	switch (cmd) {
	511	case FIONREAD:
	512	ret = put_user(dev->queue_size, (int __user *) p);
	513	break;
	514	}
	515
	516	return ret;
	517	}
	518
	519	static const struct file_operations inotify_fops = {
	520	.poll = inotify_poll,
	521	.read = inotify_read,
	522	.release = inotify_release,
	523	.unlocked_ioctl = inotify_ioctl,
	524	.compat_ioctl = inotify_ioctl,
	525	};
	526
	527	static const struct inotify_operations inotify_user_ops = {
	528	.handle_event = inotify_dev_queue_event,
	529	.destroy_watch = free_inotify_user_watch,
	530	};
	531
	532	asmlinkage long sys_inotify_init(void)
	533	{
	534	struct inotify_device *dev;
	535	struct inotify_handle *ih;
	536	struct user_struct *user;
	537	struct file *filp;
	538	int fd, ret;
	539
	540	fd = get_unused_fd();
	541	if (fd < 0)
	542	return fd;
	543
	544	filp = get_empty_filp();
	545	if (!filp) {
	546	ret = -ENFILE;
	547	goto out_put_fd;
	548	}
	549
	550	user = get_uid(current->user);
	551	if (unlikely(atomic_read(&user->inotify_devs) >=
	552	inotify_max_user_instances)) {
	553	ret = -EMFILE;
	554	goto out_free_uid;
	555	}
	556
	557	dev = kmalloc(sizeof(struct inotify_device), GFP_KERNEL);
	558	if (unlikely(!dev)) {
	559	ret = -ENOMEM;
	560	goto out_free_uid;
	561	}
	562
	563	ih = inotify_init(&inotify_user_ops);
	564	if (unlikely(IS_ERR(ih))) {
	565	ret = PTR_ERR(ih);
	566	goto out_free_dev;
	567	}
	568	dev->ih = ih;
	569
	570	filp->f_op = &inotify_fops;
	571	filp->f_vfsmnt = mntget(inotify_mnt);
	572	filp->f_dentry = dget(inotify_mnt->mnt_root);
	573	filp->f_mapping = filp->f_dentry->d_inode->i_mapping;
	574	filp->f_mode = FMODE_READ;
	575	filp->f_flags = O_RDONLY;
	576	filp->private_data = dev;
	577
	578	INIT_LIST_HEAD(&dev->events);
	579	init_waitqueue_head(&dev->wq);
	580	mutex_init(&dev->ev_mutex);
	581	mutex_init(&dev->up_mutex);
	582	dev->event_count = 0;
	583	dev->queue_size = 0;
	584	dev->max_events = inotify_max_queued_events;
	585	dev->user = user;
	586	atomic_set(&dev->count, 0);
	587
	588	get_inotify_dev(dev);
	589	atomic_inc(&user->inotify_devs);
	590	fd_install(fd, filp);
	591
	592	return fd;
	593	out_free_dev:
	594	kfree(dev);
	595	out_free_uid:
	596	free_uid(user);
	597	put_filp(filp);
	598	out_put_fd:
	599	put_unused_fd(fd);
	600	return ret;
	601	}
	602
	603	asmlinkage long sys_inotify_add_watch(int fd, const char __user *path, u32 mask)
	604	{
	605	struct inode *inode;
	606	struct inotify_device *dev;
	607	struct nameidata nd;
	608	struct file *filp;
	609	int ret, fput_needed;
	610	unsigned flags = 0;
	611
	612	filp = fget_light(fd, &fput_needed);
	613	if (unlikely(!filp))
	614	return -EBADF;
	615
	616	/* verify that this is indeed an inotify instance */
	617	if (unlikely(filp->f_op != &inotify_fops)) {
	618	ret = -EINVAL;
	619	goto fput_and_out;
	620	}
	621
	622	if (!(mask & IN_DONT_FOLLOW))
	623	flags \|= LOOKUP_FOLLOW;
	624	if (mask & IN_ONLYDIR)
	625	flags \|= LOOKUP_DIRECTORY;
	626
	627	ret = find_inode(path, &nd, flags);
	628	if (unlikely(ret))
	629	goto fput_and_out;
	630
	631	/* inode held in place by reference to nd; dev by fget on fd */
	632	inode = nd.dentry->d_inode;
	633	dev = filp->private_data;
	634
	635	mutex_lock(&dev->up_mutex);
	636	ret = inotify_find_update_watch(dev->ih, inode, mask);
	637	if (ret == -ENOENT)
	638	ret = create_watch(dev, inode, mask);
	639	mutex_unlock(&dev->up_mutex);
	640
	641	path_release(&nd);
	642	fput_and_out:
	643	fput_light(filp, fput_needed);
	644	return ret;
	645	}
	646
	647	asmlinkage long sys_inotify_rm_watch(int fd, u32 wd)
	648	{
	649	struct file *filp;
	650	struct inotify_device *dev;
	651	int ret, fput_needed;
	652
	653	filp = fget_light(fd, &fput_needed);
	654	if (unlikely(!filp))
	655	return -EBADF;
	656
	657	/* verify that this is indeed an inotify instance */
	658	if (unlikely(filp->f_op != &inotify_fops)) {
	659	ret = -EINVAL;
	660	goto out;
	661	}
	662
	663	dev = filp->private_data;
	664
	665	/* we free our watch data when we get IN_IGNORED */
	666	ret = inotify_rm_wd(dev->ih, wd);
	667
	668	out:
	669	fput_light(filp, fput_needed);
	670	return ret;
	671	}
	672
	673	static struct super_block *
	674	inotify_get_sb(struct file_system_type *fs_type, int flags,
	675	const char dev_name, void data)
	676	{
	677	return get_sb_pseudo(fs_type, "inotify", NULL, 0xBAD1DEA);
	678	}
	679
	680	static struct file_system_type inotify_fs_type = {
	681	.name = "inotifyfs",
	682	.get_sb = inotify_get_sb,
	683	.kill_sb = kill_anon_super,
	684	};
	685
	686	/*
	687	* inotify_user_setup - Our initialization function. Note that we cannnot return
	688	* error because we have compiled-in VFS hooks. So an (unlikely) failure here
	689	* must result in panic().
	690	*/
	691	static int __init inotify_user_setup(void)
	692	{
	693	int ret;
	694
	695	ret = register_filesystem(&inotify_fs_type);
	696	if (unlikely(ret))
	697	panic("inotify: register_filesystem returned %d!\n", ret);
	698
	699	inotify_mnt = kern_mount(&inotify_fs_type);
	700	if (IS_ERR(inotify_mnt))
	701	panic("inotify: kern_mount ret %ld!\n", PTR_ERR(inotify_mnt));
	702
	703	inotify_max_queued_events = 16384;
	704	inotify_max_user_instances = 128;
	705	inotify_max_user_watches = 8192;
	706
	707	watch_cachep = kmem_cache_create("inotify_watch_cache",
	708	sizeof(struct inotify_user_watch),
	709	0, SLAB_PANIC, NULL, NULL);
	710	event_cachep = kmem_cache_create("inotify_event_cache",
	711	sizeof(struct inotify_kernel_event),
	712	0, SLAB_PANIC, NULL, NULL);
	713
	714	return 0;
	715	}
	716
	717	module_init(inotify_user_setup);