Diffstat (limited to 'fs')
-rw-r--r--  fs/anon_inodes.c              |  11
-rw-r--r--  fs/autofs4/autofs_i.h         |  28
-rw-r--r--  fs/autofs4/expire.c           |  91
-rw-r--r--  fs/autofs4/inode.c            |  33
-rw-r--r--  fs/autofs4/root.c             | 589
-rw-r--r--  fs/autofs4/waitq.c            | 267
-rw-r--r--  fs/binfmt_elf.c               |   1
-rw-r--r--  fs/binfmt_misc.c              |  20
-rw-r--r--  fs/coda/psdev.c               |   5
-rw-r--r--  fs/compat.c                   |  22
-rw-r--r--  fs/compat_ioctl.c             |   2
-rw-r--r--  fs/dcache.c                   | 335
-rw-r--r--  fs/debugfs/inode.c            | 114
-rw-r--r--  fs/ecryptfs/Makefile          |   2
-rw-r--r--  fs/ecryptfs/crypto.c          |  37
-rw-r--r--  fs/ecryptfs/ecryptfs_kernel.h |  23
-rw-r--r--  fs/ecryptfs/file.c            |  17
-rw-r--r--  fs/ecryptfs/inode.c           |  31
-rw-r--r--  fs/ecryptfs/keystore.c        |   9
-rw-r--r--  fs/ecryptfs/kthread.c         | 203
-rw-r--r--  fs/ecryptfs/main.c            |  79
-rw-r--r--  fs/ecryptfs/miscdev.c         |  59
-rw-r--r--  fs/ecryptfs/mmap.c            |  11
-rw-r--r--  fs/eventfd.c                  |  17
-rw-r--r--  fs/eventpoll.c                |  30
-rw-r--r--  fs/exec.c                     |   4
-rw-r--r--  fs/fcntl.c                    |  15
-rw-r--r--  fs/hugetlbfs/inode.c          | 101
-rw-r--r--  fs/inotify_user.c             |  18
-rw-r--r--  fs/open.c                     |   3
-rw-r--r--  fs/partitions/check.c         |   2
-rw-r--r--  fs/pipe.c                     |  35
-rw-r--r--  fs/proc/proc_misc.c           |  19
-rw-r--r--  fs/proc/proc_net.c            |  11
-rw-r--r--  fs/proc/task_mmu.c            |   2
-rw-r--r--  fs/signalfd.c                 |  19
-rw-r--r--  fs/super.c                    |   1
-rw-r--r--  fs/sync.c                     |   3
-rw-r--r--  fs/sysfs/dir.c                |  37
-rw-r--r--  fs/sysfs/file.c               |   5
-rw-r--r--  fs/sysfs/symlink.c            |  41
-rw-r--r--  fs/sysfs/sysfs.h              |   1
-rw-r--r--  fs/timerfd.c                  |   9
-rw-r--r--  fs/ufs/super.c                |   2
44 files changed, 1428 insertions(+), 936 deletions(-)
diff --git a/fs/anon_inodes.c b/fs/anon_inodes.c
index 977ef208c051..3662dd44896b 100644
--- a/fs/anon_inodes.c
+++ b/fs/anon_inodes.c
@@ -58,8 +58,9 @@ static struct dentry_operations anon_inodefs_dentry_operations = {
58 * of the file 58 * of the file
59 * 59 *
60 * @name: [in] name of the "class" of the new file 60 * @name: [in] name of the "class" of the new file
61 * @fops [in] file operations for the new file 61 * @fops: [in] file operations for the new file
62 * @priv [in] private data for the new file (will be file's private_data) 62 * @priv: [in] private data for the new file (will be file's private_data)
63 * @flags: [in] flags
63 * 64 *
64 * Creates a new file by hooking it on a single inode. This is useful for files 65 * Creates a new file by hooking it on a single inode. This is useful for files
65 * that do not need to have a full-fledged inode in order to operate correctly. 66 * that do not need to have a full-fledged inode in order to operate correctly.
@@ -68,7 +69,7 @@ static struct dentry_operations anon_inodefs_dentry_operations = {
68 * setup. Returns new descriptor or -error. 69 * setup. Returns new descriptor or -error.
69 */ 70 */
70int anon_inode_getfd(const char *name, const struct file_operations *fops, 71int anon_inode_getfd(const char *name, const struct file_operations *fops,
71 void *priv) 72 void *priv, int flags)
72{ 73{
73 struct qstr this; 74 struct qstr this;
74 struct dentry *dentry; 75 struct dentry *dentry;
@@ -78,7 +79,7 @@ int anon_inode_getfd(const char *name, const struct file_operations *fops,
78 if (IS_ERR(anon_inode_inode)) 79 if (IS_ERR(anon_inode_inode))
79 return -ENODEV; 80 return -ENODEV;
80 81
81 error = get_unused_fd(); 82 error = get_unused_fd_flags(flags);
82 if (error < 0) 83 if (error < 0)
83 return error; 84 return error;
84 fd = error; 85 fd = error;
@@ -115,7 +116,7 @@ int anon_inode_getfd(const char *name, const struct file_operations *fops,
115 file->f_mapping = anon_inode_inode->i_mapping; 116 file->f_mapping = anon_inode_inode->i_mapping;
116 117
117 file->f_pos = 0; 118 file->f_pos = 0;
118 file->f_flags = O_RDWR; 119 file->f_flags = O_RDWR | (flags & O_NONBLOCK);
119 file->f_version = 0; 120 file->f_version = 0;
120 file->private_data = priv; 121 file->private_data = priv;
121 122
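
The change above threads a flags argument through anon_inode_getfd() so callers can pass along the open flags they received from userspace: get_unused_fd_flags() allocates the descriptor with those flags (e.g. O_CLOEXEC) and O_NONBLOCK is preserved in file->f_flags. A minimal sketch of a caller using the new signature; the demo_fops table and function name are hypothetical, only the anon_inode_getfd() prototype comes from the diff above:

#include <linux/anon_inodes.h>
#include <linux/fs.h>
#include <linux/module.h>

/* Hypothetical file operations for the anonymous file. */
static const struct file_operations demo_fops = {
	.owner = THIS_MODULE,
};

/*
 * Create an anonymous fd wrapping 'priv'.  'flags' would typically be
 * the flags word taken from the creating system call, so something
 * like O_CLOEXEC | O_NONBLOCK can be honoured at fd allocation time.
 */
static int demo_create_fd(void *priv, int flags)
{
	return anon_inode_getfd("[demo]", &demo_fops, priv, flags);
}
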
diff --git a/fs/autofs4/autofs_i.h b/fs/autofs4/autofs_i.h
index c3d352d7fa93..69a2f5c92319 100644
--- a/fs/autofs4/autofs_i.h
+++ b/fs/autofs4/autofs_i.h
@@ -52,7 +52,10 @@ struct autofs_info {
52 52
53 int flags; 53 int flags;
54 54
55 struct list_head rehash; 55 struct completion expire_complete;
56
57 struct list_head active;
58 struct list_head expiring;
56 59
57 struct autofs_sb_info *sbi; 60 struct autofs_sb_info *sbi;
58 unsigned long last_used; 61 unsigned long last_used;
@@ -68,15 +71,14 @@ struct autofs_info {
68}; 71};
69 72
70#define AUTOFS_INF_EXPIRING (1<<0) /* dentry is in the process of expiring */ 73#define AUTOFS_INF_EXPIRING (1<<0) /* dentry is in the process of expiring */
74#define AUTOFS_INF_MOUNTPOINT (1<<1) /* mountpoint status for direct expire */
71 75
72struct autofs_wait_queue { 76struct autofs_wait_queue {
73 wait_queue_head_t queue; 77 wait_queue_head_t queue;
74 struct autofs_wait_queue *next; 78 struct autofs_wait_queue *next;
75 autofs_wqt_t wait_queue_token; 79 autofs_wqt_t wait_queue_token;
76 /* We use the following to see what we are waiting for */ 80 /* We use the following to see what we are waiting for */
77 unsigned int hash; 81 struct qstr name;
78 unsigned int len;
79 char *name;
80 u32 dev; 82 u32 dev;
81 u64 ino; 83 u64 ino;
82 uid_t uid; 84 uid_t uid;
@@ -85,7 +87,7 @@ struct autofs_wait_queue {
85 pid_t tgid; 87 pid_t tgid;
86 /* This is for status reporting upon return */ 88 /* This is for status reporting upon return */
87 int status; 89 int status;
88 atomic_t wait_ctr; 90 unsigned int wait_ctr;
89}; 91};
90 92
91#define AUTOFS_SBI_MAGIC 0x6d4a556d 93#define AUTOFS_SBI_MAGIC 0x6d4a556d
@@ -112,8 +114,9 @@ struct autofs_sb_info {
112 struct mutex wq_mutex; 114 struct mutex wq_mutex;
113 spinlock_t fs_lock; 115 spinlock_t fs_lock;
114 struct autofs_wait_queue *queues; /* Wait queue pointer */ 116 struct autofs_wait_queue *queues; /* Wait queue pointer */
115 spinlock_t rehash_lock; 117 spinlock_t lookup_lock;
116 struct list_head rehash_list; 118 struct list_head active_list;
119 struct list_head expiring_list;
117}; 120};
118 121
119static inline struct autofs_sb_info *autofs4_sbi(struct super_block *sb) 122static inline struct autofs_sb_info *autofs4_sbi(struct super_block *sb)
@@ -138,18 +141,14 @@ static inline int autofs4_oz_mode(struct autofs_sb_info *sbi) {
138static inline int autofs4_ispending(struct dentry *dentry) 141static inline int autofs4_ispending(struct dentry *dentry)
139{ 142{
140 struct autofs_info *inf = autofs4_dentry_ino(dentry); 143 struct autofs_info *inf = autofs4_dentry_ino(dentry);
141 int pending = 0;
142 144
143 if (dentry->d_flags & DCACHE_AUTOFS_PENDING) 145 if (dentry->d_flags & DCACHE_AUTOFS_PENDING)
144 return 1; 146 return 1;
145 147
146 if (inf) { 148 if (inf->flags & AUTOFS_INF_EXPIRING)
147 spin_lock(&inf->sbi->fs_lock); 149 return 1;
148 pending = inf->flags & AUTOFS_INF_EXPIRING;
149 spin_unlock(&inf->sbi->fs_lock);
150 }
151 150
152 return pending; 151 return 0;
153} 152}
154 153
155static inline void autofs4_copy_atime(struct file *src, struct file *dst) 154static inline void autofs4_copy_atime(struct file *src, struct file *dst)
@@ -164,6 +163,7 @@ void autofs4_free_ino(struct autofs_info *);
164 163
165/* Expiration */ 164/* Expiration */
166int is_autofs4_dentry(struct dentry *); 165int is_autofs4_dentry(struct dentry *);
166int autofs4_expire_wait(struct dentry *dentry);
167int autofs4_expire_run(struct super_block *, struct vfsmount *, 167int autofs4_expire_run(struct super_block *, struct vfsmount *,
168 struct autofs_sb_info *, 168 struct autofs_sb_info *,
169 struct autofs_packet_expire __user *); 169 struct autofs_packet_expire __user *);
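
Note that the reworked autofs4_ispending() above only tests the dentry and autofs_info flags and no longer takes sbi->fs_lock itself; callers that need a stable view of AUTOFS_INF_EXPIRING are now expected to hold the lock around the call, which is what the ->d_revalidate() change in fs/autofs4/root.c further down does. A rough sketch of that calling pattern, assuming autofs_i.h is included; the wrapper name is invented for illustration:

#include "autofs_i.h"

/* Sketch only: sample the pending/expiring state under sbi->fs_lock. */
static int demo_dentry_pending(struct dentry *dentry)
{
	struct autofs_sb_info *sbi = autofs4_sbi(dentry->d_sb);
	int pending;

	spin_lock(&sbi->fs_lock);
	pending = autofs4_ispending(dentry);
	spin_unlock(&sbi->fs_lock);

	return pending;
}
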
diff --git a/fs/autofs4/expire.c b/fs/autofs4/expire.c
index 894fee54d4d8..cdabb796ff01 100644
--- a/fs/autofs4/expire.c
+++ b/fs/autofs4/expire.c
@@ -259,13 +259,15 @@ static struct dentry *autofs4_expire_direct(struct super_block *sb,
259 now = jiffies; 259 now = jiffies;
260 timeout = sbi->exp_timeout; 260 timeout = sbi->exp_timeout;
261 261
262 /* Lock the tree as we must expire as a whole */
263 spin_lock(&sbi->fs_lock); 262 spin_lock(&sbi->fs_lock);
264 if (!autofs4_direct_busy(mnt, root, timeout, do_now)) { 263 if (!autofs4_direct_busy(mnt, root, timeout, do_now)) {
265 struct autofs_info *ino = autofs4_dentry_ino(root); 264 struct autofs_info *ino = autofs4_dentry_ino(root);
266 265 if (d_mountpoint(root)) {
267 /* Set this flag early to catch sys_chdir and the like */ 266 ino->flags |= AUTOFS_INF_MOUNTPOINT;
267 root->d_mounted--;
268 }
268 ino->flags |= AUTOFS_INF_EXPIRING; 269 ino->flags |= AUTOFS_INF_EXPIRING;
270 init_completion(&ino->expire_complete);
269 spin_unlock(&sbi->fs_lock); 271 spin_unlock(&sbi->fs_lock);
270 return root; 272 return root;
271 } 273 }
@@ -292,6 +294,8 @@ static struct dentry *autofs4_expire_indirect(struct super_block *sb,
292 struct list_head *next; 294 struct list_head *next;
293 int do_now = how & AUTOFS_EXP_IMMEDIATE; 295 int do_now = how & AUTOFS_EXP_IMMEDIATE;
294 int exp_leaves = how & AUTOFS_EXP_LEAVES; 296 int exp_leaves = how & AUTOFS_EXP_LEAVES;
297 struct autofs_info *ino;
298 unsigned int ino_count;
295 299
296 if (!root) 300 if (!root)
297 return NULL; 301 return NULL;
@@ -316,6 +320,9 @@ static struct dentry *autofs4_expire_indirect(struct super_block *sb,
316 dentry = dget(dentry); 320 dentry = dget(dentry);
317 spin_unlock(&dcache_lock); 321 spin_unlock(&dcache_lock);
318 322
323 spin_lock(&sbi->fs_lock);
324 ino = autofs4_dentry_ino(dentry);
325
319 /* 326 /*
320 * Case 1: (i) indirect mount or top level pseudo direct mount 327 * Case 1: (i) indirect mount or top level pseudo direct mount
321 * (autofs-4.1). 328 * (autofs-4.1).
@@ -326,6 +333,11 @@ static struct dentry *autofs4_expire_indirect(struct super_block *sb,
326 DPRINTK("checking mountpoint %p %.*s", 333 DPRINTK("checking mountpoint %p %.*s",
327 dentry, (int)dentry->d_name.len, dentry->d_name.name); 334 dentry, (int)dentry->d_name.len, dentry->d_name.name);
328 335
336 /* Path walk currently on this dentry? */
337 ino_count = atomic_read(&ino->count) + 2;
338 if (atomic_read(&dentry->d_count) > ino_count)
339 goto next;
340
329 /* Can we umount this guy */ 341 /* Can we umount this guy */
330 if (autofs4_mount_busy(mnt, dentry)) 342 if (autofs4_mount_busy(mnt, dentry))
331 goto next; 343 goto next;
@@ -343,23 +355,25 @@ static struct dentry *autofs4_expire_indirect(struct super_block *sb,
343 355
344 /* Case 2: tree mount, expire iff entire tree is not busy */ 356 /* Case 2: tree mount, expire iff entire tree is not busy */
345 if (!exp_leaves) { 357 if (!exp_leaves) {
346 /* Lock the tree as we must expire as a whole */ 358 /* Path walk currently on this dentry? */
347 spin_lock(&sbi->fs_lock); 359 ino_count = atomic_read(&ino->count) + 1;
348 if (!autofs4_tree_busy(mnt, dentry, timeout, do_now)) { 360 if (atomic_read(&dentry->d_count) > ino_count)
349 struct autofs_info *inf = autofs4_dentry_ino(dentry); 361 goto next;
350 362
351 /* Set this flag early to catch sys_chdir and the like */ 363 if (!autofs4_tree_busy(mnt, dentry, timeout, do_now)) {
352 inf->flags |= AUTOFS_INF_EXPIRING;
353 spin_unlock(&sbi->fs_lock);
354 expired = dentry; 364 expired = dentry;
355 goto found; 365 goto found;
356 } 366 }
357 spin_unlock(&sbi->fs_lock);
358 /* 367 /*
359 * Case 3: pseudo direct mount, expire individual leaves 368 * Case 3: pseudo direct mount, expire individual leaves
360 * (autofs-4.1). 369 * (autofs-4.1).
361 */ 370 */
362 } else { 371 } else {
372 /* Path walk currently on this dentry? */
373 ino_count = atomic_read(&ino->count) + 1;
374 if (atomic_read(&dentry->d_count) > ino_count)
375 goto next;
376
363 expired = autofs4_check_leaves(mnt, dentry, timeout, do_now); 377 expired = autofs4_check_leaves(mnt, dentry, timeout, do_now);
364 if (expired) { 378 if (expired) {
365 dput(dentry); 379 dput(dentry);
@@ -367,6 +381,7 @@ static struct dentry *autofs4_expire_indirect(struct super_block *sb,
367 } 381 }
368 } 382 }
369next: 383next:
384 spin_unlock(&sbi->fs_lock);
370 dput(dentry); 385 dput(dentry);
371 spin_lock(&dcache_lock); 386 spin_lock(&dcache_lock);
372 next = next->next; 387 next = next->next;
@@ -377,12 +392,45 @@ next:
377found: 392found:
378 DPRINTK("returning %p %.*s", 393 DPRINTK("returning %p %.*s",
379 expired, (int)expired->d_name.len, expired->d_name.name); 394 expired, (int)expired->d_name.len, expired->d_name.name);
395 ino = autofs4_dentry_ino(expired);
396 ino->flags |= AUTOFS_INF_EXPIRING;
397 init_completion(&ino->expire_complete);
398 spin_unlock(&sbi->fs_lock);
380 spin_lock(&dcache_lock); 399 spin_lock(&dcache_lock);
381 list_move(&expired->d_parent->d_subdirs, &expired->d_u.d_child); 400 list_move(&expired->d_parent->d_subdirs, &expired->d_u.d_child);
382 spin_unlock(&dcache_lock); 401 spin_unlock(&dcache_lock);
383 return expired; 402 return expired;
384} 403}
385 404
405int autofs4_expire_wait(struct dentry *dentry)
406{
407 struct autofs_sb_info *sbi = autofs4_sbi(dentry->d_sb);
408 struct autofs_info *ino = autofs4_dentry_ino(dentry);
409 int status;
410
411 /* Block on any pending expire */
412 spin_lock(&sbi->fs_lock);
413 if (ino->flags & AUTOFS_INF_EXPIRING) {
414 spin_unlock(&sbi->fs_lock);
415
416 DPRINTK("waiting for expire %p name=%.*s",
417 dentry, dentry->d_name.len, dentry->d_name.name);
418
419 status = autofs4_wait(sbi, dentry, NFY_NONE);
420 wait_for_completion(&ino->expire_complete);
421
422 DPRINTK("expire done status=%d", status);
423
424 if (d_unhashed(dentry))
425 return -EAGAIN;
426
427 return status;
428 }
429 spin_unlock(&sbi->fs_lock);
430
431 return 0;
432}
433
386/* Perform an expiry operation */ 434/* Perform an expiry operation */
387int autofs4_expire_run(struct super_block *sb, 435int autofs4_expire_run(struct super_block *sb,
388 struct vfsmount *mnt, 436 struct vfsmount *mnt,
@@ -390,7 +438,9 @@ int autofs4_expire_run(struct super_block *sb,
390 struct autofs_packet_expire __user *pkt_p) 438 struct autofs_packet_expire __user *pkt_p)
391{ 439{
392 struct autofs_packet_expire pkt; 440 struct autofs_packet_expire pkt;
441 struct autofs_info *ino;
393 struct dentry *dentry; 442 struct dentry *dentry;
443 int ret = 0;
394 444
395 memset(&pkt,0,sizeof pkt); 445 memset(&pkt,0,sizeof pkt);
396 446
@@ -406,9 +456,15 @@ int autofs4_expire_run(struct super_block *sb,
406 dput(dentry); 456 dput(dentry);
407 457
408 if ( copy_to_user(pkt_p, &pkt, sizeof(struct autofs_packet_expire)) ) 458 if ( copy_to_user(pkt_p, &pkt, sizeof(struct autofs_packet_expire)) )
409 return -EFAULT; 459 ret = -EFAULT;
410 460
411 return 0; 461 spin_lock(&sbi->fs_lock);
462 ino = autofs4_dentry_ino(dentry);
463 ino->flags &= ~AUTOFS_INF_EXPIRING;
464 complete_all(&ino->expire_complete);
465 spin_unlock(&sbi->fs_lock);
466
467 return ret;
412} 468}
413 469
414/* Call repeatedly until it returns -EAGAIN, meaning there's nothing 470/* Call repeatedly until it returns -EAGAIN, meaning there's nothing
@@ -433,9 +489,16 @@ int autofs4_expire_multi(struct super_block *sb, struct vfsmount *mnt,
433 489
434 /* This is synchronous because it makes the daemon a 490 /* This is synchronous because it makes the daemon a
435 little easier */ 491 little easier */
436 ino->flags |= AUTOFS_INF_EXPIRING;
437 ret = autofs4_wait(sbi, dentry, NFY_EXPIRE); 492 ret = autofs4_wait(sbi, dentry, NFY_EXPIRE);
493
494 spin_lock(&sbi->fs_lock);
495 if (ino->flags & AUTOFS_INF_MOUNTPOINT) {
496 sb->s_root->d_mounted++;
497 ino->flags &= ~AUTOFS_INF_MOUNTPOINT;
498 }
438 ino->flags &= ~AUTOFS_INF_EXPIRING; 499 ino->flags &= ~AUTOFS_INF_EXPIRING;
500 complete_all(&ino->expire_complete);
501 spin_unlock(&sbi->fs_lock);
439 dput(dentry); 502 dput(dentry);
440 } 503 }
441 504
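
The expire rework above replaces the old flag polling with a completion: whoever selects a dentry for expiry sets AUTOFS_INF_EXPIRING and re-arms ino->expire_complete with init_completion(), autofs4_expire_wait() blocks in wait_for_completion(), and the paths that finish the expire clear the flag and wake every waiter with complete_all(). A stripped-down sketch of that pattern using stand-in names, independent of the autofs structures:

#include <linux/completion.h>
#include <linux/spinlock.h>

/* Stand-in for the ino->flags / ino->expire_complete pair used above. */
struct demo_expire {
	spinlock_t lock;		/* cf. sbi->fs_lock */
	int expiring;			/* cf. AUTOFS_INF_EXPIRING */
	struct completion done;		/* cf. ino->expire_complete */
};

static void demo_expire_start(struct demo_expire *e)
{
	spin_lock(&e->lock);
	e->expiring = 1;
	init_completion(&e->done);	/* re-arm before announcing */
	spin_unlock(&e->lock);
}

static void demo_expire_wait(struct demo_expire *e)
{
	spin_lock(&e->lock);
	if (!e->expiring) {
		spin_unlock(&e->lock);
		return;
	}
	spin_unlock(&e->lock);
	wait_for_completion(&e->done);	/* sleeps until complete_all() */
}

static void demo_expire_finish(struct demo_expire *e)
{
	spin_lock(&e->lock);
	e->expiring = 0;
	complete_all(&e->done);		/* release every waiter at once */
	spin_unlock(&e->lock);
}
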
diff --git a/fs/autofs4/inode.c b/fs/autofs4/inode.c
index 2fdcf5e1d236..7bb3e5ba0537 100644
--- a/fs/autofs4/inode.c
+++ b/fs/autofs4/inode.c
@@ -24,8 +24,10 @@
24 24
25static void ino_lnkfree(struct autofs_info *ino) 25static void ino_lnkfree(struct autofs_info *ino)
26{ 26{
27 kfree(ino->u.symlink); 27 if (ino->u.symlink) {
28 ino->u.symlink = NULL; 28 kfree(ino->u.symlink);
29 ino->u.symlink = NULL;
30 }
29} 31}
30 32
31struct autofs_info *autofs4_init_ino(struct autofs_info *ino, 33struct autofs_info *autofs4_init_ino(struct autofs_info *ino,
@@ -41,16 +43,18 @@ struct autofs_info *autofs4_init_ino(struct autofs_info *ino,
41 if (ino == NULL) 43 if (ino == NULL)
42 return NULL; 44 return NULL;
43 45
44 ino->flags = 0; 46 if (!reinit) {
45 ino->mode = mode; 47 ino->flags = 0;
46 ino->inode = NULL; 48 ino->inode = NULL;
47 ino->dentry = NULL; 49 ino->dentry = NULL;
48 ino->size = 0; 50 ino->size = 0;
49 51 INIT_LIST_HEAD(&ino->active);
50 INIT_LIST_HEAD(&ino->rehash); 52 INIT_LIST_HEAD(&ino->expiring);
53 atomic_set(&ino->count, 0);
54 }
51 55
56 ino->mode = mode;
52 ino->last_used = jiffies; 57 ino->last_used = jiffies;
53 atomic_set(&ino->count, 0);
54 58
55 ino->sbi = sbi; 59 ino->sbi = sbi;
56 60
@@ -159,8 +163,8 @@ void autofs4_kill_sb(struct super_block *sb)
159 if (!sbi) 163 if (!sbi)
160 goto out_kill_sb; 164 goto out_kill_sb;
161 165
162 if (!sbi->catatonic) 166 /* Free wait queues, close pipe */
163 autofs4_catatonic_mode(sbi); /* Free wait queues, close pipe */ 167 autofs4_catatonic_mode(sbi);
164 168
165 /* Clean up and release dangling references */ 169 /* Clean up and release dangling references */
166 autofs4_force_release(sbi); 170 autofs4_force_release(sbi);
@@ -338,8 +342,9 @@ int autofs4_fill_super(struct super_block *s, void *data, int silent)
338 mutex_init(&sbi->wq_mutex); 342 mutex_init(&sbi->wq_mutex);
339 spin_lock_init(&sbi->fs_lock); 343 spin_lock_init(&sbi->fs_lock);
340 sbi->queues = NULL; 344 sbi->queues = NULL;
341 spin_lock_init(&sbi->rehash_lock); 345 spin_lock_init(&sbi->lookup_lock);
342 INIT_LIST_HEAD(&sbi->rehash_list); 346 INIT_LIST_HEAD(&sbi->active_list);
347 INIT_LIST_HEAD(&sbi->expiring_list);
343 s->s_blocksize = 1024; 348 s->s_blocksize = 1024;
344 s->s_blocksize_bits = 10; 349 s->s_blocksize_bits = 10;
345 s->s_magic = AUTOFS_SUPER_MAGIC; 350 s->s_magic = AUTOFS_SUPER_MAGIC;
diff --git a/fs/autofs4/root.c b/fs/autofs4/root.c
index edf5b6bddb52..bcfb2dc0a61b 100644
--- a/fs/autofs4/root.c
+++ b/fs/autofs4/root.c
@@ -25,25 +25,25 @@ static int autofs4_dir_rmdir(struct inode *,struct dentry *);
25static int autofs4_dir_mkdir(struct inode *,struct dentry *,int); 25static int autofs4_dir_mkdir(struct inode *,struct dentry *,int);
26static int autofs4_root_ioctl(struct inode *, struct file *,unsigned int,unsigned long); 26static int autofs4_root_ioctl(struct inode *, struct file *,unsigned int,unsigned long);
27static int autofs4_dir_open(struct inode *inode, struct file *file); 27static int autofs4_dir_open(struct inode *inode, struct file *file);
28static int autofs4_dir_close(struct inode *inode, struct file *file);
29static int autofs4_dir_readdir(struct file * filp, void * dirent, filldir_t filldir);
30static int autofs4_root_readdir(struct file * filp, void * dirent, filldir_t filldir);
31static struct dentry *autofs4_lookup(struct inode *,struct dentry *, struct nameidata *); 28static struct dentry *autofs4_lookup(struct inode *,struct dentry *, struct nameidata *);
32static void *autofs4_follow_link(struct dentry *, struct nameidata *); 29static void *autofs4_follow_link(struct dentry *, struct nameidata *);
33 30
31#define TRIGGER_FLAGS (LOOKUP_CONTINUE | LOOKUP_DIRECTORY)
32#define TRIGGER_INTENTS (LOOKUP_OPEN | LOOKUP_CREATE)
33
34const struct file_operations autofs4_root_operations = { 34const struct file_operations autofs4_root_operations = {
35 .open = dcache_dir_open, 35 .open = dcache_dir_open,
36 .release = dcache_dir_close, 36 .release = dcache_dir_close,
37 .read = generic_read_dir, 37 .read = generic_read_dir,
38 .readdir = autofs4_root_readdir, 38 .readdir = dcache_readdir,
39 .ioctl = autofs4_root_ioctl, 39 .ioctl = autofs4_root_ioctl,
40}; 40};
41 41
42const struct file_operations autofs4_dir_operations = { 42const struct file_operations autofs4_dir_operations = {
43 .open = autofs4_dir_open, 43 .open = autofs4_dir_open,
44 .release = autofs4_dir_close, 44 .release = dcache_dir_close,
45 .read = generic_read_dir, 45 .read = generic_read_dir,
46 .readdir = autofs4_dir_readdir, 46 .readdir = dcache_readdir,
47}; 47};
48 48
49const struct inode_operations autofs4_indirect_root_inode_operations = { 49const struct inode_operations autofs4_indirect_root_inode_operations = {
@@ -70,42 +70,10 @@ const struct inode_operations autofs4_dir_inode_operations = {
70 .rmdir = autofs4_dir_rmdir, 70 .rmdir = autofs4_dir_rmdir,
71}; 71};
72 72
73static int autofs4_root_readdir(struct file *file, void *dirent,
74 filldir_t filldir)
75{
76 struct autofs_sb_info *sbi = autofs4_sbi(file->f_path.dentry->d_sb);
77 int oz_mode = autofs4_oz_mode(sbi);
78
79 DPRINTK("called, filp->f_pos = %lld", file->f_pos);
80
81 /*
82 * Don't set reghost flag if:
83 * 1) f_pos is larger than zero -- we've already been here.
84 * 2) we haven't even enabled reghosting in the 1st place.
85 * 3) this is the daemon doing a readdir
86 */
87 if (oz_mode && file->f_pos == 0 && sbi->reghost_enabled)
88 sbi->needs_reghost = 1;
89
90 DPRINTK("needs_reghost = %d", sbi->needs_reghost);
91
92 return dcache_readdir(file, dirent, filldir);
93}
94
95static int autofs4_dir_open(struct inode *inode, struct file *file) 73static int autofs4_dir_open(struct inode *inode, struct file *file)
96{ 74{
97 struct dentry *dentry = file->f_path.dentry; 75 struct dentry *dentry = file->f_path.dentry;
98 struct vfsmount *mnt = file->f_path.mnt;
99 struct autofs_sb_info *sbi = autofs4_sbi(dentry->d_sb); 76 struct autofs_sb_info *sbi = autofs4_sbi(dentry->d_sb);
100 struct dentry *cursor;
101 int status;
102
103 status = dcache_dir_open(inode, file);
104 if (status)
105 goto out;
106
107 cursor = file->private_data;
108 cursor->d_fsdata = NULL;
109 77
110 DPRINTK("file=%p dentry=%p %.*s", 78 DPRINTK("file=%p dentry=%p %.*s",
111 file, dentry, dentry->d_name.len, dentry->d_name.name); 79 file, dentry, dentry->d_name.len, dentry->d_name.name);
@@ -113,159 +81,32 @@ static int autofs4_dir_open(struct inode *inode, struct file *file)
113 if (autofs4_oz_mode(sbi)) 81 if (autofs4_oz_mode(sbi))
114 goto out; 82 goto out;
115 83
116 if (autofs4_ispending(dentry)) { 84 /*
117 DPRINTK("dentry busy"); 85 * An empty directory in an autofs file system is always a
118 dcache_dir_close(inode, file); 86 * mount point. The daemon must have failed to mount this
119 status = -EBUSY; 87 * during lookup so it doesn't exist. This can happen, for
120 goto out; 88 * example, if user space returns an incorrect status for a
121 } 89 * mount request. Otherwise we're doing a readdir on the
122 90 * autofs file system so just let the libfs routines handle
123 status = -ENOENT; 91 * it.
124 if (!d_mountpoint(dentry) && dentry->d_op && dentry->d_op->d_revalidate) { 92 */
125 struct nameidata nd; 93 spin_lock(&dcache_lock);
126 int empty, ret; 94 if (!d_mountpoint(dentry) && __simple_empty(dentry)) {
127
128 /* In case there are stale directory dentrys from a failed mount */
129 spin_lock(&dcache_lock);
130 empty = list_empty(&dentry->d_subdirs);
131 spin_unlock(&dcache_lock); 95 spin_unlock(&dcache_lock);
132 96 return -ENOENT;
133 if (!empty)
134 d_invalidate(dentry);
135
136 nd.flags = LOOKUP_DIRECTORY;
137 ret = (dentry->d_op->d_revalidate)(dentry, &nd);
138
139 if (ret <= 0) {
140 if (ret < 0)
141 status = ret;
142 dcache_dir_close(inode, file);
143 goto out;
144 }
145 } 97 }
98 spin_unlock(&dcache_lock);
146 99
147 if (d_mountpoint(dentry)) {
148 struct file *fp = NULL;
149 struct path fp_path = { .dentry = dentry, .mnt = mnt };
150
151 path_get(&fp_path);
152
153 if (!autofs4_follow_mount(&fp_path.mnt, &fp_path.dentry)) {
154 path_put(&fp_path);
155 dcache_dir_close(inode, file);
156 goto out;
157 }
158
159 fp = dentry_open(fp_path.dentry, fp_path.mnt, file->f_flags);
160 status = PTR_ERR(fp);
161 if (IS_ERR(fp)) {
162 dcache_dir_close(inode, file);
163 goto out;
164 }
165 cursor->d_fsdata = fp;
166 }
167 return 0;
168out:
169 return status;
170}
171
172static int autofs4_dir_close(struct inode *inode, struct file *file)
173{
174 struct dentry *dentry = file->f_path.dentry;
175 struct autofs_sb_info *sbi = autofs4_sbi(dentry->d_sb);
176 struct dentry *cursor = file->private_data;
177 int status = 0;
178
179 DPRINTK("file=%p dentry=%p %.*s",
180 file, dentry, dentry->d_name.len, dentry->d_name.name);
181
182 if (autofs4_oz_mode(sbi))
183 goto out;
184
185 if (autofs4_ispending(dentry)) {
186 DPRINTK("dentry busy");
187 status = -EBUSY;
188 goto out;
189 }
190
191 if (d_mountpoint(dentry)) {
192 struct file *fp = cursor->d_fsdata;
193 if (!fp) {
194 status = -ENOENT;
195 goto out;
196 }
197 filp_close(fp, current->files);
198 }
199out:
200 dcache_dir_close(inode, file);
201 return status;
202}
203
204static int autofs4_dir_readdir(struct file *file, void *dirent, filldir_t filldir)
205{
206 struct dentry *dentry = file->f_path.dentry;
207 struct autofs_sb_info *sbi = autofs4_sbi(dentry->d_sb);
208 struct dentry *cursor = file->private_data;
209 int status;
210
211 DPRINTK("file=%p dentry=%p %.*s",
212 file, dentry, dentry->d_name.len, dentry->d_name.name);
213
214 if (autofs4_oz_mode(sbi))
215 goto out;
216
217 if (autofs4_ispending(dentry)) {
218 DPRINTK("dentry busy");
219 return -EBUSY;
220 }
221
222 if (d_mountpoint(dentry)) {
223 struct file *fp = cursor->d_fsdata;
224
225 if (!fp)
226 return -ENOENT;
227
228 if (!fp->f_op || !fp->f_op->readdir)
229 goto out;
230
231 status = vfs_readdir(fp, filldir, dirent);
232 file->f_pos = fp->f_pos;
233 if (status)
234 autofs4_copy_atime(file, fp);
235 return status;
236 }
237out: 100out:
238 return dcache_readdir(file, dirent, filldir); 101 return dcache_dir_open(inode, file);
239} 102}
240 103
241static int try_to_fill_dentry(struct dentry *dentry, int flags) 104static int try_to_fill_dentry(struct dentry *dentry, int flags)
242{ 105{
243 struct autofs_sb_info *sbi = autofs4_sbi(dentry->d_sb); 106 struct autofs_sb_info *sbi = autofs4_sbi(dentry->d_sb);
244 struct autofs_info *ino = autofs4_dentry_ino(dentry); 107 struct autofs_info *ino = autofs4_dentry_ino(dentry);
245 struct dentry *new;
246 int status; 108 int status;
247 109
248 /* Block on any pending expiry here; invalidate the dentry
249 when expiration is done to trigger mount request with a new
250 dentry */
251 if (ino && (ino->flags & AUTOFS_INF_EXPIRING)) {
252 DPRINTK("waiting for expire %p name=%.*s",
253 dentry, dentry->d_name.len, dentry->d_name.name);
254
255 status = autofs4_wait(sbi, dentry, NFY_NONE);
256
257 DPRINTK("expire done status=%d", status);
258
259 /*
260 * If the directory still exists the mount request must
261 * continue otherwise it can't be followed at the right
262 * time during the walk.
263 */
264 status = d_invalidate(dentry);
265 if (status != -EBUSY)
266 return -EAGAIN;
267 }
268
269 DPRINTK("dentry=%p %.*s ino=%p", 110 DPRINTK("dentry=%p %.*s ino=%p",
270 dentry, dentry->d_name.len, dentry->d_name.name, dentry->d_inode); 111 dentry, dentry->d_name.len, dentry->d_name.name, dentry->d_inode);
271 112
@@ -292,7 +133,8 @@ static int try_to_fill_dentry(struct dentry *dentry, int flags)
292 return status; 133 return status;
293 } 134 }
294 /* Trigger mount for path component or follow link */ 135 /* Trigger mount for path component or follow link */
295 } else if (flags & (LOOKUP_CONTINUE | LOOKUP_DIRECTORY) || 136 } else if (dentry->d_flags & DCACHE_AUTOFS_PENDING ||
137 flags & (TRIGGER_FLAGS | TRIGGER_INTENTS) ||
296 current->link_count) { 138 current->link_count) {
297 DPRINTK("waiting for mount name=%.*s", 139 DPRINTK("waiting for mount name=%.*s",
298 dentry->d_name.len, dentry->d_name.name); 140 dentry->d_name.len, dentry->d_name.name);
@@ -320,26 +162,6 @@ static int try_to_fill_dentry(struct dentry *dentry, int flags)
320 dentry->d_flags &= ~DCACHE_AUTOFS_PENDING; 162 dentry->d_flags &= ~DCACHE_AUTOFS_PENDING;
321 spin_unlock(&dentry->d_lock); 163 spin_unlock(&dentry->d_lock);
322 164
323 /*
324 * The dentry that is passed in from lookup may not be the one
325 * we end up using, as mkdir can create a new one. If this
326 * happens, and another process tries the lookup at the same time,
327 * it will set the PENDING flag on this new dentry, but add itself
328 * to our waitq. Then, if after the lookup succeeds, the first
329 * process that requested the mount performs another lookup of the
330 * same directory, it will show up as still pending! So, we need
331 * to redo the lookup here and clear pending on that dentry.
332 */
333 if (d_unhashed(dentry)) {
334 new = d_lookup(dentry->d_parent, &dentry->d_name);
335 if (new) {
336 spin_lock(&new->d_lock);
337 new->d_flags &= ~DCACHE_AUTOFS_PENDING;
338 spin_unlock(&new->d_lock);
339 dput(new);
340 }
341 }
342
343 return 0; 165 return 0;
344} 166}
345 167
@@ -355,51 +177,63 @@ static void *autofs4_follow_link(struct dentry *dentry, struct nameidata *nd)
355 DPRINTK("dentry=%p %.*s oz_mode=%d nd->flags=%d", 177 DPRINTK("dentry=%p %.*s oz_mode=%d nd->flags=%d",
356 dentry, dentry->d_name.len, dentry->d_name.name, oz_mode, 178 dentry, dentry->d_name.len, dentry->d_name.name, oz_mode,
357 nd->flags); 179 nd->flags);
358 180 /*
359 /* If it's our master or we shouldn't trigger a mount we're done */ 181 * For an expire of a covered direct or offset mount we need
360 lookup_type = nd->flags & (LOOKUP_CONTINUE | LOOKUP_DIRECTORY); 182 * to break out of follow_down() at the autofs mount trigger
361 if (oz_mode || !lookup_type) 183 * (d_mounted--), so we can see the expiring flag, and manage
184 * the blocking and following here until the expire is completed.
185 */
186 if (oz_mode) {
187 spin_lock(&sbi->fs_lock);
188 if (ino->flags & AUTOFS_INF_EXPIRING) {
189 spin_unlock(&sbi->fs_lock);
190 /* Follow down to our covering mount. */
191 if (!follow_down(&nd->path.mnt, &nd->path.dentry))
192 goto done;
193 goto follow;
194 }
195 spin_unlock(&sbi->fs_lock);
362 goto done; 196 goto done;
197 }
363 198
364 /* If an expire request is pending wait for it. */ 199 /* If an expire request is pending everyone must wait. */
365 if (ino && (ino->flags & AUTOFS_INF_EXPIRING)) { 200 autofs4_expire_wait(dentry);
366 DPRINTK("waiting for active request %p name=%.*s",
367 dentry, dentry->d_name.len, dentry->d_name.name);
368
369 status = autofs4_wait(sbi, dentry, NFY_NONE);
370 201
371 DPRINTK("request done status=%d", status); 202 /* We trigger a mount for almost all flags */
372 } 203 lookup_type = nd->flags & (TRIGGER_FLAGS | TRIGGER_INTENTS);
204 if (!(lookup_type || dentry->d_flags & DCACHE_AUTOFS_PENDING))
205 goto follow;
373 206
374 /* 207 /*
375 * If the dentry contains directories then it is an 208 * If the dentry contains directories then it is an autofs
376 * autofs multi-mount with no root mount offset. So 209 * multi-mount with no root mount offset. So don't try to
377 * don't try to mount it again. 210 * mount it again.
378 */ 211 */
379 spin_lock(&dcache_lock); 212 spin_lock(&dcache_lock);
380 if (!d_mountpoint(dentry) && __simple_empty(dentry)) { 213 if (dentry->d_flags & DCACHE_AUTOFS_PENDING ||
214 (!d_mountpoint(dentry) && __simple_empty(dentry))) {
381 spin_unlock(&dcache_lock); 215 spin_unlock(&dcache_lock);
382 216
383 status = try_to_fill_dentry(dentry, 0); 217 status = try_to_fill_dentry(dentry, 0);
384 if (status) 218 if (status)
385 goto out_error; 219 goto out_error;
386 220
387 /* 221 goto follow;
388 * The mount succeeded but if there is no root mount
389 * it must be an autofs multi-mount with no root offset
390 * so we don't need to follow the mount.
391 */
392 if (d_mountpoint(dentry)) {
393 if (!autofs4_follow_mount(&nd->path.mnt,
394 &nd->path.dentry)) {
395 status = -ENOENT;
396 goto out_error;
397 }
398 }
399
400 goto done;
401 } 222 }
402 spin_unlock(&dcache_lock); 223 spin_unlock(&dcache_lock);
224follow:
225 /*
226 * If there is no root mount it must be an autofs
227 * multi-mount with no root offset so we don't need
228 * to follow it.
229 */
230 if (d_mountpoint(dentry)) {
231 if (!autofs4_follow_mount(&nd->path.mnt,
232 &nd->path.dentry)) {
233 status = -ENOENT;
234 goto out_error;
235 }
236 }
403 237
404done: 238done:
405 return NULL; 239 return NULL;
@@ -424,12 +258,23 @@ static int autofs4_revalidate(struct dentry *dentry, struct nameidata *nd)
424 int status = 1; 258 int status = 1;
425 259
426 /* Pending dentry */ 260 /* Pending dentry */
261 spin_lock(&sbi->fs_lock);
427 if (autofs4_ispending(dentry)) { 262 if (autofs4_ispending(dentry)) {
428 /* The daemon never causes a mount to trigger */ 263 /* The daemon never causes a mount to trigger */
264 spin_unlock(&sbi->fs_lock);
265
429 if (oz_mode) 266 if (oz_mode)
430 return 1; 267 return 1;
431 268
432 /* 269 /*
270 * If the directory has gone away due to an expire
271 * we have been called as ->d_revalidate() and so
272 * we need to return false and proceed to ->lookup().
273 */
274 if (autofs4_expire_wait(dentry) == -EAGAIN)
275 return 0;
276
277 /*
433 * A zero status is success otherwise we have a 278 * A zero status is success otherwise we have a
434 * negative error code. 279 * negative error code.
435 */ 280 */
@@ -437,17 +282,9 @@ static int autofs4_revalidate(struct dentry *dentry, struct nameidata *nd)
437 if (status == 0) 282 if (status == 0)
438 return 1; 283 return 1;
439 284
440 /*
441 * A status of EAGAIN here means that the dentry has gone
442 * away while waiting for an expire to complete. If we are
443 * racing with expire lookup will wait for it so this must
444 * be a revalidate and we need to send it to lookup.
445 */
446 if (status == -EAGAIN)
447 return 0;
448
449 return status; 285 return status;
450 } 286 }
287 spin_unlock(&sbi->fs_lock);
451 288
452 /* Negative dentry.. invalidate if "old" */ 289 /* Negative dentry.. invalidate if "old" */
453 if (dentry->d_inode == NULL) 290 if (dentry->d_inode == NULL)
@@ -461,6 +298,7 @@ static int autofs4_revalidate(struct dentry *dentry, struct nameidata *nd)
461 DPRINTK("dentry=%p %.*s, emptydir", 298 DPRINTK("dentry=%p %.*s, emptydir",
462 dentry, dentry->d_name.len, dentry->d_name.name); 299 dentry, dentry->d_name.len, dentry->d_name.name);
463 spin_unlock(&dcache_lock); 300 spin_unlock(&dcache_lock);
301
464 /* The daemon never causes a mount to trigger */ 302 /* The daemon never causes a mount to trigger */
465 if (oz_mode) 303 if (oz_mode)
466 return 1; 304 return 1;
@@ -493,10 +331,12 @@ void autofs4_dentry_release(struct dentry *de)
493 struct autofs_sb_info *sbi = autofs4_sbi(de->d_sb); 331 struct autofs_sb_info *sbi = autofs4_sbi(de->d_sb);
494 332
495 if (sbi) { 333 if (sbi) {
496 spin_lock(&sbi->rehash_lock); 334 spin_lock(&sbi->lookup_lock);
497 if (!list_empty(&inf->rehash)) 335 if (!list_empty(&inf->active))
498 list_del(&inf->rehash); 336 list_del(&inf->active);
499 spin_unlock(&sbi->rehash_lock); 337 if (!list_empty(&inf->expiring))
338 list_del(&inf->expiring);
339 spin_unlock(&sbi->lookup_lock);
500 } 340 }
501 341
502 inf->dentry = NULL; 342 inf->dentry = NULL;
@@ -518,7 +358,7 @@ static struct dentry_operations autofs4_dentry_operations = {
518 .d_release = autofs4_dentry_release, 358 .d_release = autofs4_dentry_release,
519}; 359};
520 360
521static struct dentry *autofs4_lookup_unhashed(struct autofs_sb_info *sbi, struct dentry *parent, struct qstr *name) 361static struct dentry *autofs4_lookup_active(struct autofs_sb_info *sbi, struct dentry *parent, struct qstr *name)
522{ 362{
523 unsigned int len = name->len; 363 unsigned int len = name->len;
524 unsigned int hash = name->hash; 364 unsigned int hash = name->hash;
@@ -526,14 +366,66 @@ static struct dentry *autofs4_lookup_unhashed(struct autofs_sb_info *sbi, struct
526 struct list_head *p, *head; 366 struct list_head *p, *head;
527 367
528 spin_lock(&dcache_lock); 368 spin_lock(&dcache_lock);
529 spin_lock(&sbi->rehash_lock); 369 spin_lock(&sbi->lookup_lock);
530 head = &sbi->rehash_list; 370 head = &sbi->active_list;
531 list_for_each(p, head) { 371 list_for_each(p, head) {
532 struct autofs_info *ino; 372 struct autofs_info *ino;
533 struct dentry *dentry; 373 struct dentry *dentry;
534 struct qstr *qstr; 374 struct qstr *qstr;
535 375
536 ino = list_entry(p, struct autofs_info, rehash); 376 ino = list_entry(p, struct autofs_info, active);
377 dentry = ino->dentry;
378
379 spin_lock(&dentry->d_lock);
380
381 /* Already gone? */
382 if (atomic_read(&dentry->d_count) == 0)
383 goto next;
384
385 qstr = &dentry->d_name;
386
387 if (dentry->d_name.hash != hash)
388 goto next;
389 if (dentry->d_parent != parent)
390 goto next;
391
392 if (qstr->len != len)
393 goto next;
394 if (memcmp(qstr->name, str, len))
395 goto next;
396
397 if (d_unhashed(dentry)) {
398 dget(dentry);
399 spin_unlock(&dentry->d_lock);
400 spin_unlock(&sbi->lookup_lock);
401 spin_unlock(&dcache_lock);
402 return dentry;
403 }
404next:
405 spin_unlock(&dentry->d_lock);
406 }
407 spin_unlock(&sbi->lookup_lock);
408 spin_unlock(&dcache_lock);
409
410 return NULL;
411}
412
413static struct dentry *autofs4_lookup_expiring(struct autofs_sb_info *sbi, struct dentry *parent, struct qstr *name)
414{
415 unsigned int len = name->len;
416 unsigned int hash = name->hash;
417 const unsigned char *str = name->name;
418 struct list_head *p, *head;
419
420 spin_lock(&dcache_lock);
421 spin_lock(&sbi->lookup_lock);
422 head = &sbi->expiring_list;
423 list_for_each(p, head) {
424 struct autofs_info *ino;
425 struct dentry *dentry;
426 struct qstr *qstr;
427
428 ino = list_entry(p, struct autofs_info, expiring);
537 dentry = ino->dentry; 429 dentry = ino->dentry;
538 430
539 spin_lock(&dentry->d_lock); 431 spin_lock(&dentry->d_lock);
@@ -555,33 +447,16 @@ static struct dentry *autofs4_lookup_unhashed(struct autofs_sb_info *sbi, struct
555 goto next; 447 goto next;
556 448
557 if (d_unhashed(dentry)) { 449 if (d_unhashed(dentry)) {
558 struct inode *inode = dentry->d_inode;
559
560 ino = autofs4_dentry_ino(dentry);
561 list_del_init(&ino->rehash);
562 dget(dentry); 450 dget(dentry);
563 /*
564 * Make the rehashed dentry negative so the VFS
565 * behaves as it should.
566 */
567 if (inode) {
568 dentry->d_inode = NULL;
569 list_del_init(&dentry->d_alias);
570 spin_unlock(&dentry->d_lock);
571 spin_unlock(&sbi->rehash_lock);
572 spin_unlock(&dcache_lock);
573 iput(inode);
574 return dentry;
575 }
576 spin_unlock(&dentry->d_lock); 451 spin_unlock(&dentry->d_lock);
577 spin_unlock(&sbi->rehash_lock); 452 spin_unlock(&sbi->lookup_lock);
578 spin_unlock(&dcache_lock); 453 spin_unlock(&dcache_lock);
579 return dentry; 454 return dentry;
580 } 455 }
581next: 456next:
582 spin_unlock(&dentry->d_lock); 457 spin_unlock(&dentry->d_lock);
583 } 458 }
584 spin_unlock(&sbi->rehash_lock); 459 spin_unlock(&sbi->lookup_lock);
585 spin_unlock(&dcache_lock); 460 spin_unlock(&dcache_lock);
586 461
587 return NULL; 462 return NULL;
@@ -591,7 +466,8 @@ next:
591static struct dentry *autofs4_lookup(struct inode *dir, struct dentry *dentry, struct nameidata *nd) 466static struct dentry *autofs4_lookup(struct inode *dir, struct dentry *dentry, struct nameidata *nd)
592{ 467{
593 struct autofs_sb_info *sbi; 468 struct autofs_sb_info *sbi;
594 struct dentry *unhashed; 469 struct autofs_info *ino;
470 struct dentry *expiring, *unhashed;
595 int oz_mode; 471 int oz_mode;
596 472
597 DPRINTK("name = %.*s", 473 DPRINTK("name = %.*s",
@@ -607,8 +483,26 @@ static struct dentry *autofs4_lookup(struct inode *dir, struct dentry *dentry, s
607 DPRINTK("pid = %u, pgrp = %u, catatonic = %d, oz_mode = %d", 483 DPRINTK("pid = %u, pgrp = %u, catatonic = %d, oz_mode = %d",
608 current->pid, task_pgrp_nr(current), sbi->catatonic, oz_mode); 484 current->pid, task_pgrp_nr(current), sbi->catatonic, oz_mode);
609 485
610 unhashed = autofs4_lookup_unhashed(sbi, dentry->d_parent, &dentry->d_name); 486 expiring = autofs4_lookup_expiring(sbi, dentry->d_parent, &dentry->d_name);
611 if (!unhashed) { 487 if (expiring) {
488 /*
489 * If we are racing with expire the request might not
490 * be quite complete but the directory has been removed
491 * so it must have been successful, so just wait for it.
492 */
493 ino = autofs4_dentry_ino(expiring);
494 autofs4_expire_wait(expiring);
495 spin_lock(&sbi->lookup_lock);
496 if (!list_empty(&ino->expiring))
497 list_del_init(&ino->expiring);
498 spin_unlock(&sbi->lookup_lock);
499 dput(expiring);
500 }
501
502 unhashed = autofs4_lookup_active(sbi, dentry->d_parent, &dentry->d_name);
503 if (unhashed)
504 dentry = unhashed;
505 else {
612 /* 506 /*
613 * Mark the dentry incomplete but don't hash it. We do this 507 * Mark the dentry incomplete but don't hash it. We do this
614 * to serialize our inode creation operations (symlink and 508 * to serialize our inode creation operations (symlink and
@@ -622,39 +516,34 @@ static struct dentry *autofs4_lookup(struct inode *dir, struct dentry *dentry, s
622 */ 516 */
623 dentry->d_op = &autofs4_root_dentry_operations; 517 dentry->d_op = &autofs4_root_dentry_operations;
624 518
625 dentry->d_fsdata = NULL;
626 d_instantiate(dentry, NULL);
627 } else {
628 struct autofs_info *ino = autofs4_dentry_ino(unhashed);
629 DPRINTK("rehash %p with %p", dentry, unhashed);
630 /* 519 /*
631 * If we are racing with expire the request might not 520 * And we need to ensure that the same dentry is used for
632 * be quite complete but the directory has been removed 521 * all following lookup calls until it is hashed so that
633 * so it must have been successful, so just wait for it. 522 * the dentry flags are persistent throughout the request.
634 * We need to ensure the AUTOFS_INF_EXPIRING flag is clear
635 * before continuing as revalidate may fail when calling
636 * try_to_fill_dentry (returning EAGAIN) if we don't.
637 */ 523 */
638 while (ino && (ino->flags & AUTOFS_INF_EXPIRING)) { 524 ino = autofs4_init_ino(NULL, sbi, 0555);
639 DPRINTK("wait for incomplete expire %p name=%.*s", 525 if (!ino)
640 unhashed, unhashed->d_name.len, 526 return ERR_PTR(-ENOMEM);
641 unhashed->d_name.name); 527
642 autofs4_wait(sbi, unhashed, NFY_NONE); 528 dentry->d_fsdata = ino;
643 DPRINTK("request completed"); 529 ino->dentry = dentry;
644 } 530
645 dentry = unhashed; 531 spin_lock(&sbi->lookup_lock);
532 list_add(&ino->active, &sbi->active_list);
533 spin_unlock(&sbi->lookup_lock);
534
535 d_instantiate(dentry, NULL);
646 } 536 }
647 537
648 if (!oz_mode) { 538 if (!oz_mode) {
649 spin_lock(&dentry->d_lock); 539 spin_lock(&dentry->d_lock);
650 dentry->d_flags |= DCACHE_AUTOFS_PENDING; 540 dentry->d_flags |= DCACHE_AUTOFS_PENDING;
651 spin_unlock(&dentry->d_lock); 541 spin_unlock(&dentry->d_lock);
652 } 542 if (dentry->d_op && dentry->d_op->d_revalidate) {
653 543 mutex_unlock(&dir->i_mutex);
654 if (dentry->d_op && dentry->d_op->d_revalidate) { 544 (dentry->d_op->d_revalidate)(dentry, nd);
655 mutex_unlock(&dir->i_mutex); 545 mutex_lock(&dir->i_mutex);
656 (dentry->d_op->d_revalidate)(dentry, nd); 546 }
657 mutex_lock(&dir->i_mutex);
658 } 547 }
659 548
660 /* 549 /*
@@ -673,9 +562,11 @@ static struct dentry *autofs4_lookup(struct inode *dir, struct dentry *dentry, s
673 return ERR_PTR(-ERESTARTNOINTR); 562 return ERR_PTR(-ERESTARTNOINTR);
674 } 563 }
675 } 564 }
676 spin_lock(&dentry->d_lock); 565 if (!oz_mode) {
677 dentry->d_flags &= ~DCACHE_AUTOFS_PENDING; 566 spin_lock(&dentry->d_lock);
678 spin_unlock(&dentry->d_lock); 567 dentry->d_flags &= ~DCACHE_AUTOFS_PENDING;
568 spin_unlock(&dentry->d_lock);
569 }
679 } 570 }
680 571
681 /* 572 /*
@@ -706,7 +597,7 @@ static struct dentry *autofs4_lookup(struct inode *dir, struct dentry *dentry, s
706 } 597 }
707 598
708 if (unhashed) 599 if (unhashed)
709 return dentry; 600 return unhashed;
710 601
711 return NULL; 602 return NULL;
712} 603}
@@ -728,20 +619,31 @@ static int autofs4_dir_symlink(struct inode *dir,
728 return -EACCES; 619 return -EACCES;
729 620
730 ino = autofs4_init_ino(ino, sbi, S_IFLNK | 0555); 621 ino = autofs4_init_ino(ino, sbi, S_IFLNK | 0555);
731 if (ino == NULL) 622 if (!ino)
732 return -ENOSPC; 623 return -ENOMEM;
733 624
734 ino->size = strlen(symname); 625 spin_lock(&sbi->lookup_lock);
735 ino->u.symlink = cp = kmalloc(ino->size + 1, GFP_KERNEL); 626 if (!list_empty(&ino->active))
627 list_del_init(&ino->active);
628 spin_unlock(&sbi->lookup_lock);
736 629
737 if (cp == NULL) { 630 ino->size = strlen(symname);
738 kfree(ino); 631 cp = kmalloc(ino->size + 1, GFP_KERNEL);
739 return -ENOSPC; 632 if (!cp) {
633 if (!dentry->d_fsdata)
634 kfree(ino);
635 return -ENOMEM;
740 } 636 }
741 637
742 strcpy(cp, symname); 638 strcpy(cp, symname);
743 639
744 inode = autofs4_get_inode(dir->i_sb, ino); 640 inode = autofs4_get_inode(dir->i_sb, ino);
641 if (!inode) {
642 kfree(cp);
643 if (!dentry->d_fsdata)
644 kfree(ino);
645 return -ENOMEM;
646 }
745 d_add(dentry, inode); 647 d_add(dentry, inode);
746 648
747 if (dir == dir->i_sb->s_root->d_inode) 649 if (dir == dir->i_sb->s_root->d_inode)
@@ -757,6 +659,7 @@ static int autofs4_dir_symlink(struct inode *dir,
757 atomic_inc(&p_ino->count); 659 atomic_inc(&p_ino->count);
758 ino->inode = inode; 660 ino->inode = inode;
759 661
662 ino->u.symlink = cp;
760 dir->i_mtime = CURRENT_TIME; 663 dir->i_mtime = CURRENT_TIME;
761 664
762 return 0; 665 return 0;
@@ -769,9 +672,8 @@ static int autofs4_dir_symlink(struct inode *dir,
769 * that the file no longer exists. However, doing that means that the 672 * that the file no longer exists. However, doing that means that the
770 * VFS layer can turn the dentry into a negative dentry. We don't want 673 * VFS layer can turn the dentry into a negative dentry. We don't want
771 * this, because the unlink is probably the result of an expire. 674 * this, because the unlink is probably the result of an expire.
772 * We simply d_drop it and add it to a rehash candidates list in the 675 * We simply d_drop it and add it to a expiring list in the super block,
773 * super block, which allows the dentry lookup to reuse it retaining 676 * which allows the dentry lookup to check for an incomplete expire.
774 * the flags, such as expire in progress, in case we're racing with expire.
775 * 677 *
776 * If a process is blocked on the dentry waiting for the expire to finish, 678 * If a process is blocked on the dentry waiting for the expire to finish,
777 * it will invalidate the dentry and try to mount with a new one. 679 * it will invalidate the dentry and try to mount with a new one.
@@ -801,9 +703,10 @@ static int autofs4_dir_unlink(struct inode *dir, struct dentry *dentry)
801 dir->i_mtime = CURRENT_TIME; 703 dir->i_mtime = CURRENT_TIME;
802 704
803 spin_lock(&dcache_lock); 705 spin_lock(&dcache_lock);
804 spin_lock(&sbi->rehash_lock); 706 spin_lock(&sbi->lookup_lock);
805 list_add(&ino->rehash, &sbi->rehash_list); 707 if (list_empty(&ino->expiring))
806 spin_unlock(&sbi->rehash_lock); 708 list_add(&ino->expiring, &sbi->expiring_list);
709 spin_unlock(&sbi->lookup_lock);
807 spin_lock(&dentry->d_lock); 710 spin_lock(&dentry->d_lock);
808 __d_drop(dentry); 711 __d_drop(dentry);
809 spin_unlock(&dentry->d_lock); 712 spin_unlock(&dentry->d_lock);
@@ -829,9 +732,10 @@ static int autofs4_dir_rmdir(struct inode *dir, struct dentry *dentry)
829 spin_unlock(&dcache_lock); 732 spin_unlock(&dcache_lock);
830 return -ENOTEMPTY; 733 return -ENOTEMPTY;
831 } 734 }
832 spin_lock(&sbi->rehash_lock); 735 spin_lock(&sbi->lookup_lock);
833 list_add(&ino->rehash, &sbi->rehash_list); 736 if (list_empty(&ino->expiring))
834 spin_unlock(&sbi->rehash_lock); 737 list_add(&ino->expiring, &sbi->expiring_list);
738 spin_unlock(&sbi->lookup_lock);
835 spin_lock(&dentry->d_lock); 739 spin_lock(&dentry->d_lock);
836 __d_drop(dentry); 740 __d_drop(dentry);
837 spin_unlock(&dentry->d_lock); 741 spin_unlock(&dentry->d_lock);
@@ -866,10 +770,20 @@ static int autofs4_dir_mkdir(struct inode *dir, struct dentry *dentry, int mode)
866 dentry, dentry->d_name.len, dentry->d_name.name); 770 dentry, dentry->d_name.len, dentry->d_name.name);
867 771
868 ino = autofs4_init_ino(ino, sbi, S_IFDIR | 0555); 772 ino = autofs4_init_ino(ino, sbi, S_IFDIR | 0555);
869 if (ino == NULL) 773 if (!ino)
870 return -ENOSPC; 774 return -ENOMEM;
775
776 spin_lock(&sbi->lookup_lock);
777 if (!list_empty(&ino->active))
778 list_del_init(&ino->active);
779 spin_unlock(&sbi->lookup_lock);
871 780
872 inode = autofs4_get_inode(dir->i_sb, ino); 781 inode = autofs4_get_inode(dir->i_sb, ino);
782 if (!inode) {
783 if (!dentry->d_fsdata)
784 kfree(ino);
785 return -ENOMEM;
786 }
873 d_add(dentry, inode); 787 d_add(dentry, inode);
874 788
875 if (dir == dir->i_sb->s_root->d_inode) 789 if (dir == dir->i_sb->s_root->d_inode)
@@ -922,44 +836,6 @@ static inline int autofs4_get_protosubver(struct autofs_sb_info *sbi, int __user
922} 836}
923 837
924/* 838/*
925 * Tells the daemon whether we need to reghost or not. Also, clears
926 * the reghost_needed flag.
927 */
928static inline int autofs4_ask_reghost(struct autofs_sb_info *sbi, int __user *p)
929{
930 int status;
931
932 DPRINTK("returning %d", sbi->needs_reghost);
933
934 status = put_user(sbi->needs_reghost, p);
935 if (status)
936 return status;
937
938 sbi->needs_reghost = 0;
939 return 0;
940}
941
942/*
943 * Enable / Disable reghosting ioctl() operation
944 */
945static inline int autofs4_toggle_reghost(struct autofs_sb_info *sbi, int __user *p)
946{
947 int status;
948 int val;
949
950 status = get_user(val, p);
951
952 DPRINTK("reghost = %d", val);
953
954 if (status)
955 return status;
956
957 /* turn on/off reghosting, with the val */
958 sbi->reghost_enabled = val;
959 return 0;
960}
961
962/*
963* Tells the daemon whether it can umount the autofs mount. 839* Tells the daemon whether it can umount the autofs mount.
964*/ 840*/
965static inline int autofs4_ask_umount(struct vfsmount *mnt, int __user *p) 841static inline int autofs4_ask_umount(struct vfsmount *mnt, int __user *p)
@@ -1023,11 +899,6 @@ static int autofs4_root_ioctl(struct inode *inode, struct file *filp,
1023 case AUTOFS_IOC_SETTIMEOUT: 899 case AUTOFS_IOC_SETTIMEOUT:
1024 return autofs4_get_set_timeout(sbi, p); 900 return autofs4_get_set_timeout(sbi, p);
1025 901
1026 case AUTOFS_IOC_TOGGLEREGHOST:
1027 return autofs4_toggle_reghost(sbi, p);
1028 case AUTOFS_IOC_ASKREGHOST:
1029 return autofs4_ask_reghost(sbi, p);
1030
1031 case AUTOFS_IOC_ASKUMOUNT: 902 case AUTOFS_IOC_ASKUMOUNT:
1032 return autofs4_ask_umount(filp->f_path.mnt, p); 903 return autofs4_ask_umount(filp->f_path.mnt, p);
1033 904
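
Taken together, the root.c changes replace the old rehash list with two lists hung off the super block: unlink/rmdir put the autofs_info on sbi->expiring_list, lookup scans sbi->active_list and sbi->expiring_list for a matching unhashed dentry, and autofs4_dentry_release() unlinks the entry again, all under sbi->lookup_lock. A condensed sketch of that lookup discipline with an invented function name; the real autofs4_lookup_expiring() above additionally holds dcache_lock and the per-dentry d_lock and skips dentries whose d_count has already dropped to zero:

static struct dentry *demo_find_expiring(struct autofs_sb_info *sbi,
					 struct dentry *parent,
					 struct qstr *name)
{
	struct autofs_info *ino;
	struct dentry *found = NULL;

	spin_lock(&sbi->lookup_lock);
	list_for_each_entry(ino, &sbi->expiring_list, expiring) {
		struct dentry *dentry = ino->dentry;

		if (dentry->d_parent != parent)
			continue;
		if (dentry->d_name.hash != name->hash ||
		    dentry->d_name.len != name->len ||
		    memcmp(dentry->d_name.name, name->name, name->len))
			continue;

		/* Grab a reference before dropping the lock. */
		found = dget(dentry);
		break;
	}
	spin_unlock(&sbi->lookup_lock);

	return found;
}
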
diff --git a/fs/autofs4/waitq.c b/fs/autofs4/waitq.c
index 75e5955c3f6d..35216d18d8b5 100644
--- a/fs/autofs4/waitq.c
+++ b/fs/autofs4/waitq.c
@@ -28,6 +28,12 @@ void autofs4_catatonic_mode(struct autofs_sb_info *sbi)
28{ 28{
29 struct autofs_wait_queue *wq, *nwq; 29 struct autofs_wait_queue *wq, *nwq;
30 30
31 mutex_lock(&sbi->wq_mutex);
32 if (sbi->catatonic) {
33 mutex_unlock(&sbi->wq_mutex);
34 return;
35 }
36
31 DPRINTK("entering catatonic mode"); 37 DPRINTK("entering catatonic mode");
32 38
33 sbi->catatonic = 1; 39 sbi->catatonic = 1;
@@ -36,13 +42,18 @@ void autofs4_catatonic_mode(struct autofs_sb_info *sbi)
36 while (wq) { 42 while (wq) {
37 nwq = wq->next; 43 nwq = wq->next;
38 wq->status = -ENOENT; /* Magic is gone - report failure */ 44 wq->status = -ENOENT; /* Magic is gone - report failure */
39 kfree(wq->name); 45 if (wq->name.name) {
40 wq->name = NULL; 46 kfree(wq->name.name);
47 wq->name.name = NULL;
48 }
49 wq->wait_ctr--;
41 wake_up_interruptible(&wq->queue); 50 wake_up_interruptible(&wq->queue);
42 wq = nwq; 51 wq = nwq;
43 } 52 }
44 fput(sbi->pipe); /* Close the pipe */ 53 fput(sbi->pipe); /* Close the pipe */
45 sbi->pipe = NULL; 54 sbi->pipe = NULL;
55 sbi->pipefd = -1;
56 mutex_unlock(&sbi->wq_mutex);
46} 57}
47 58
48static int autofs4_write(struct file *file, const void *addr, int bytes) 59static int autofs4_write(struct file *file, const void *addr, int bytes)
@@ -89,10 +100,11 @@ static void autofs4_notify_daemon(struct autofs_sb_info *sbi,
89 union autofs_packet_union v4_pkt; 100 union autofs_packet_union v4_pkt;
90 union autofs_v5_packet_union v5_pkt; 101 union autofs_v5_packet_union v5_pkt;
91 } pkt; 102 } pkt;
103 struct file *pipe = NULL;
92 size_t pktsz; 104 size_t pktsz;
93 105
94 DPRINTK("wait id = 0x%08lx, name = %.*s, type=%d", 106 DPRINTK("wait id = 0x%08lx, name = %.*s, type=%d",
95 wq->wait_queue_token, wq->len, wq->name, type); 107 wq->wait_queue_token, wq->name.len, wq->name.name, type);
96 108
97 memset(&pkt,0,sizeof pkt); /* For security reasons */ 109 memset(&pkt,0,sizeof pkt); /* For security reasons */
98 110
@@ -107,9 +119,9 @@ static void autofs4_notify_daemon(struct autofs_sb_info *sbi,
107 pktsz = sizeof(*mp); 119 pktsz = sizeof(*mp);
108 120
109 mp->wait_queue_token = wq->wait_queue_token; 121 mp->wait_queue_token = wq->wait_queue_token;
110 mp->len = wq->len; 122 mp->len = wq->name.len;
111 memcpy(mp->name, wq->name, wq->len); 123 memcpy(mp->name, wq->name.name, wq->name.len);
112 mp->name[wq->len] = '\0'; 124 mp->name[wq->name.len] = '\0';
113 break; 125 break;
114 } 126 }
115 case autofs_ptype_expire_multi: 127 case autofs_ptype_expire_multi:
@@ -119,9 +131,9 @@ static void autofs4_notify_daemon(struct autofs_sb_info *sbi,
119 pktsz = sizeof(*ep); 131 pktsz = sizeof(*ep);
120 132
121 ep->wait_queue_token = wq->wait_queue_token; 133 ep->wait_queue_token = wq->wait_queue_token;
122 ep->len = wq->len; 134 ep->len = wq->name.len;
123 memcpy(ep->name, wq->name, wq->len); 135 memcpy(ep->name, wq->name.name, wq->name.len);
124 ep->name[wq->len] = '\0'; 136 ep->name[wq->name.len] = '\0';
125 break; 137 break;
126 } 138 }
127 /* 139 /*
@@ -138,9 +150,9 @@ static void autofs4_notify_daemon(struct autofs_sb_info *sbi,
138 pktsz = sizeof(*packet); 150 pktsz = sizeof(*packet);
139 151
140 packet->wait_queue_token = wq->wait_queue_token; 152 packet->wait_queue_token = wq->wait_queue_token;
141 packet->len = wq->len; 153 packet->len = wq->name.len;
142 memcpy(packet->name, wq->name, wq->len); 154 memcpy(packet->name, wq->name.name, wq->name.len);
143 packet->name[wq->len] = '\0'; 155 packet->name[wq->name.len] = '\0';
144 packet->dev = wq->dev; 156 packet->dev = wq->dev;
145 packet->ino = wq->ino; 157 packet->ino = wq->ino;
146 packet->uid = wq->uid; 158 packet->uid = wq->uid;
@@ -154,8 +166,19 @@ static void autofs4_notify_daemon(struct autofs_sb_info *sbi,
154 return; 166 return;
155 } 167 }
156 168
157 if (autofs4_write(sbi->pipe, &pkt, pktsz)) 169 /* Check if we have become catatonic */
158 autofs4_catatonic_mode(sbi); 170 mutex_lock(&sbi->wq_mutex);
171 if (!sbi->catatonic) {
172 pipe = sbi->pipe;
173 get_file(pipe);
174 }
175 mutex_unlock(&sbi->wq_mutex);
176
177 if (pipe) {
178 if (autofs4_write(pipe, &pkt, pktsz))
179 autofs4_catatonic_mode(sbi);
180 fput(pipe);
181 }
159} 182}
160 183
161static int autofs4_getpath(struct autofs_sb_info *sbi, 184static int autofs4_getpath(struct autofs_sb_info *sbi,
@@ -191,58 +214,55 @@ static int autofs4_getpath(struct autofs_sb_info *sbi,
191} 214}
192 215
193static struct autofs_wait_queue * 216static struct autofs_wait_queue *
194autofs4_find_wait(struct autofs_sb_info *sbi, 217autofs4_find_wait(struct autofs_sb_info *sbi, struct qstr *qstr)
195 char *name, unsigned int hash, unsigned int len)
196{ 218{
197 struct autofs_wait_queue *wq; 219 struct autofs_wait_queue *wq;
198 220
199 for (wq = sbi->queues; wq; wq = wq->next) { 221 for (wq = sbi->queues; wq; wq = wq->next) {
200 if (wq->hash == hash && 222 if (wq->name.hash == qstr->hash &&
201 wq->len == len && 223 wq->name.len == qstr->len &&
202 wq->name && !memcmp(wq->name, name, len)) 224 wq->name.name &&
225 !memcmp(wq->name.name, qstr->name, qstr->len))
203 break; 226 break;
204 } 227 }
205 return wq; 228 return wq;
206} 229}
207 230
208int autofs4_wait(struct autofs_sb_info *sbi, struct dentry *dentry, 231/*
209 enum autofs_notify notify) 232 * Check if we have a valid request.
233 * Returns
234 * 1 if the request should continue.
235 * In this case we can return an autofs_wait_queue entry if one is
236 * found or NULL to indicate a new wait needs to be created.
237 * 0 or a negative errno if the request shouldn't continue.
238 */
239static int validate_request(struct autofs_wait_queue **wait,
240 struct autofs_sb_info *sbi,
241 struct qstr *qstr,
242 struct dentry*dentry, enum autofs_notify notify)
210{ 243{
211 struct autofs_info *ino;
212 struct autofs_wait_queue *wq; 244 struct autofs_wait_queue *wq;
213 char *name; 245 struct autofs_info *ino;
214 unsigned int len = 0;
215 unsigned int hash = 0;
216 int status, type;
217
218 /* In catatonic mode, we don't wait for nobody */
219 if (sbi->catatonic)
220 return -ENOENT;
221
222 name = kmalloc(NAME_MAX + 1, GFP_KERNEL);
223 if (!name)
224 return -ENOMEM;
225 246
226 /* If this is a direct mount request create a dummy name */ 247 /* Wait in progress, continue; */
227 if (IS_ROOT(dentry) && (sbi->type & AUTOFS_TYPE_DIRECT)) 248 wq = autofs4_find_wait(sbi, qstr);
228 len = sprintf(name, "%p", dentry); 249 if (wq) {
229 else { 250 *wait = wq;
230 len = autofs4_getpath(sbi, dentry, &name); 251 return 1;
231 if (!len) {
232 kfree(name);
233 return -ENOENT;
234 }
235 } 252 }
236 hash = full_name_hash(name, len);
237 253
238 if (mutex_lock_interruptible(&sbi->wq_mutex)) { 254 *wait = NULL;
239 kfree(name);
240 return -EINTR;
241 }
242 255
243 wq = autofs4_find_wait(sbi, name, hash, len); 256 /* If we don't yet have any info this is a new request */
244 ino = autofs4_dentry_ino(dentry); 257 ino = autofs4_dentry_ino(dentry);
245 if (!wq && ino && notify == NFY_NONE) { 258 if (!ino)
259 return 1;
260
261 /*
262 * If we've been asked to wait on an existing expire (NFY_NONE)
263 * but there is no wait in the queue ...
264 */
265 if (notify == NFY_NONE) {
246 /* 266 /*
 247 * Either we've beaten the pending expire to post its 267
248 * wait or it finished while we waited on the mutex. 268 * wait or it finished while we waited on the mutex.
@@ -253,13 +273,14 @@ int autofs4_wait(struct autofs_sb_info *sbi, struct dentry *dentry,
253 while (ino->flags & AUTOFS_INF_EXPIRING) { 273 while (ino->flags & AUTOFS_INF_EXPIRING) {
254 mutex_unlock(&sbi->wq_mutex); 274 mutex_unlock(&sbi->wq_mutex);
255 schedule_timeout_interruptible(HZ/10); 275 schedule_timeout_interruptible(HZ/10);
256 if (mutex_lock_interruptible(&sbi->wq_mutex)) { 276 if (mutex_lock_interruptible(&sbi->wq_mutex))
257 kfree(name);
258 return -EINTR; 277 return -EINTR;
278
279 wq = autofs4_find_wait(sbi, qstr);
280 if (wq) {
281 *wait = wq;
282 return 1;
259 } 283 }
260 wq = autofs4_find_wait(sbi, name, hash, len);
261 if (wq)
262 break;
263 } 284 }
264 285
265 /* 286 /*
@@ -267,18 +288,96 @@ int autofs4_wait(struct autofs_sb_info *sbi, struct dentry *dentry,
267 * cases where we wait on NFY_NONE neither depend on the 288 * cases where we wait on NFY_NONE neither depend on the
268 * return status of the wait. 289 * return status of the wait.
269 */ 290 */
270 if (!wq) { 291 return 0;
292 }
293
294 /*
295 * If we've been asked to trigger a mount and the request
296 * completed while we waited on the mutex ...
297 */
298 if (notify == NFY_MOUNT) {
299 /*
300 * If the dentry isn't hashed just go ahead and try the
301 * mount again with a new wait (not much else we can do).
302 */
303 if (!d_unhashed(dentry)) {
304 /*
305 * But if the dentry is hashed, that means that we
306 * got here through the revalidate path. Thus, we
307 * need to check if the dentry has been mounted
308 * while we waited on the wq_mutex. If it has,
309 * simply return success.
310 */
311 if (d_mountpoint(dentry))
312 return 0;
313 }
314 }
315
316 return 1;
317}
318
319int autofs4_wait(struct autofs_sb_info *sbi, struct dentry *dentry,
320 enum autofs_notify notify)
321{
322 struct autofs_wait_queue *wq;
323 struct qstr qstr;
324 char *name;
325 int status, ret, type;
326
 327 /* In catatonic mode, we don't wait for anybody */
328 if (sbi->catatonic)
329 return -ENOENT;
330
331 if (!dentry->d_inode) {
332 /*
333 * A wait for a negative dentry is invalid for certain
334 * cases. A direct or offset mount "always" has its mount
335 * point directory created and so the request dentry must
336 * be positive or the map key doesn't exist. The situation
 337 * is very similar for indirect mounts except only dentries
338 * in the root of the autofs file system may be negative.
339 */
340 if (sbi->type & (AUTOFS_TYPE_DIRECT|AUTOFS_TYPE_OFFSET))
341 return -ENOENT;
342 else if (!IS_ROOT(dentry->d_parent))
343 return -ENOENT;
344 }
345
346 name = kmalloc(NAME_MAX + 1, GFP_KERNEL);
347 if (!name)
348 return -ENOMEM;
349
350 /* If this is a direct mount request create a dummy name */
351 if (IS_ROOT(dentry) && (sbi->type & AUTOFS_TYPE_DIRECT))
352 qstr.len = sprintf(name, "%p", dentry);
353 else {
354 qstr.len = autofs4_getpath(sbi, dentry, &name);
355 if (!qstr.len) {
271 kfree(name); 356 kfree(name);
272 mutex_unlock(&sbi->wq_mutex); 357 return -ENOENT;
273 return 0;
274 } 358 }
275 } 359 }
360 qstr.name = name;
361 qstr.hash = full_name_hash(name, qstr.len);
362
363 if (mutex_lock_interruptible(&sbi->wq_mutex)) {
364 kfree(qstr.name);
365 return -EINTR;
366 }
367
368 ret = validate_request(&wq, sbi, &qstr, dentry, notify);
369 if (ret <= 0) {
370 if (ret == 0)
371 mutex_unlock(&sbi->wq_mutex);
372 kfree(qstr.name);
373 return ret;
374 }
276 375
277 if (!wq) { 376 if (!wq) {
278 /* Create a new wait queue */ 377 /* Create a new wait queue */
279 wq = kmalloc(sizeof(struct autofs_wait_queue),GFP_KERNEL); 378 wq = kmalloc(sizeof(struct autofs_wait_queue),GFP_KERNEL);
280 if (!wq) { 379 if (!wq) {
281 kfree(name); 380 kfree(qstr.name);
282 mutex_unlock(&sbi->wq_mutex); 381 mutex_unlock(&sbi->wq_mutex);
283 return -ENOMEM; 382 return -ENOMEM;
284 } 383 }
@@ -289,9 +388,7 @@ int autofs4_wait(struct autofs_sb_info *sbi, struct dentry *dentry,
289 wq->next = sbi->queues; 388 wq->next = sbi->queues;
290 sbi->queues = wq; 389 sbi->queues = wq;
291 init_waitqueue_head(&wq->queue); 390 init_waitqueue_head(&wq->queue);
292 wq->hash = hash; 391 memcpy(&wq->name, &qstr, sizeof(struct qstr));
293 wq->name = name;
294 wq->len = len;
295 wq->dev = autofs4_get_dev(sbi); 392 wq->dev = autofs4_get_dev(sbi);
296 wq->ino = autofs4_get_ino(sbi); 393 wq->ino = autofs4_get_ino(sbi);
297 wq->uid = current->uid; 394 wq->uid = current->uid;
@@ -299,7 +396,7 @@ int autofs4_wait(struct autofs_sb_info *sbi, struct dentry *dentry,
299 wq->pid = current->pid; 396 wq->pid = current->pid;
300 wq->tgid = current->tgid; 397 wq->tgid = current->tgid;
301 wq->status = -EINTR; /* Status return if interrupted */ 398 wq->status = -EINTR; /* Status return if interrupted */
302 atomic_set(&wq->wait_ctr, 2); 399 wq->wait_ctr = 2;
303 mutex_unlock(&sbi->wq_mutex); 400 mutex_unlock(&sbi->wq_mutex);
304 401
305 if (sbi->version < 5) { 402 if (sbi->version < 5) {
@@ -319,28 +416,25 @@ int autofs4_wait(struct autofs_sb_info *sbi, struct dentry *dentry,
319 } 416 }
320 417
321 DPRINTK("new wait id = 0x%08lx, name = %.*s, nfy=%d\n", 418 DPRINTK("new wait id = 0x%08lx, name = %.*s, nfy=%d\n",
322 (unsigned long) wq->wait_queue_token, wq->len, wq->name, notify); 419 (unsigned long) wq->wait_queue_token, wq->name.len,
420 wq->name.name, notify);
323 421
324 /* autofs4_notify_daemon() may block */ 422 /* autofs4_notify_daemon() may block */
325 autofs4_notify_daemon(sbi, wq, type); 423 autofs4_notify_daemon(sbi, wq, type);
326 } else { 424 } else {
327 atomic_inc(&wq->wait_ctr); 425 wq->wait_ctr++;
328 mutex_unlock(&sbi->wq_mutex); 426 mutex_unlock(&sbi->wq_mutex);
329 kfree(name); 427 kfree(qstr.name);
330 DPRINTK("existing wait id = 0x%08lx, name = %.*s, nfy=%d", 428 DPRINTK("existing wait id = 0x%08lx, name = %.*s, nfy=%d",
331 (unsigned long) wq->wait_queue_token, wq->len, wq->name, notify); 429 (unsigned long) wq->wait_queue_token, wq->name.len,
332 } 430 wq->name.name, notify);
333
334 /* wq->name is NULL if and only if the lock is already released */
335
336 if (sbi->catatonic) {
337 /* We might have slept, so check again for catatonic mode */
338 wq->status = -ENOENT;
339 kfree(wq->name);
340 wq->name = NULL;
341 } 431 }
342 432
343 if (wq->name) { 433 /*
434 * wq->name.name is NULL iff the lock is already released
435 * or the mount has been made catatonic.
436 */
437 if (wq->name.name) {
344 /* Block all but "shutdown" signals while waiting */ 438 /* Block all but "shutdown" signals while waiting */
345 sigset_t oldset; 439 sigset_t oldset;
346 unsigned long irqflags; 440 unsigned long irqflags;
@@ -351,7 +445,7 @@ int autofs4_wait(struct autofs_sb_info *sbi, struct dentry *dentry,
351 recalc_sigpending(); 445 recalc_sigpending();
352 spin_unlock_irqrestore(&current->sighand->siglock, irqflags); 446 spin_unlock_irqrestore(&current->sighand->siglock, irqflags);
353 447
354 wait_event_interruptible(wq->queue, wq->name == NULL); 448 wait_event_interruptible(wq->queue, wq->name.name == NULL);
355 449
356 spin_lock_irqsave(&current->sighand->siglock, irqflags); 450 spin_lock_irqsave(&current->sighand->siglock, irqflags);
357 current->blocked = oldset; 451 current->blocked = oldset;
@@ -364,8 +458,10 @@ int autofs4_wait(struct autofs_sb_info *sbi, struct dentry *dentry,
364 status = wq->status; 458 status = wq->status;
365 459
366 /* Are we the last process to need status? */ 460 /* Are we the last process to need status? */
367 if (atomic_dec_and_test(&wq->wait_ctr)) 461 mutex_lock(&sbi->wq_mutex);
462 if (!--wq->wait_ctr)
368 kfree(wq); 463 kfree(wq);
464 mutex_unlock(&sbi->wq_mutex);
369 465
370 return status; 466 return status;
371} 467}
@@ -387,16 +483,13 @@ int autofs4_wait_release(struct autofs_sb_info *sbi, autofs_wqt_t wait_queue_tok
387 } 483 }
388 484
389 *wql = wq->next; /* Unlink from chain */ 485 *wql = wq->next; /* Unlink from chain */
390 mutex_unlock(&sbi->wq_mutex); 486 kfree(wq->name.name);
391 kfree(wq->name); 487 wq->name.name = NULL; /* Do not wait on this queue */
392 wq->name = NULL; /* Do not wait on this queue */
393
394 wq->status = status; 488 wq->status = status;
395 489 wake_up_interruptible(&wq->queue);
396 if (atomic_dec_and_test(&wq->wait_ctr)) /* Is anyone still waiting for this guy? */ 490 if (!--wq->wait_ctr)
397 kfree(wq); 491 kfree(wq);
398 else 492 mutex_unlock(&sbi->wq_mutex);
399 wake_up_interruptible(&wq->queue);
400 493
401 return 0; 494 return 0;
402} 495}
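Taken together, the waitq.c hunks replace the atomic wait_ctr with a plain counter that is only ever touched under sbi->wq_mutex, and make wq->name.name == NULL the single condition waiters sleep on. A condensed sketch of both sides of the protocol as it now stands (extracted from the hunks above, not a drop-in replacement):

	/* waiter side: tail of autofs4_wait() */
	wait_event_interruptible(wq->queue, wq->name.name == NULL);
	status = wq->status;
	mutex_lock(&sbi->wq_mutex);
	if (!--wq->wait_ctr)		/* last user frees the entry */
		kfree(wq);
	mutex_unlock(&sbi->wq_mutex);

	/* releaser side: autofs4_wait_release(), wq_mutex already held */
	kfree(wq->name.name);
	wq->name.name = NULL;		/* satisfies the wait condition above */
	wq->status = status;
	wake_up_interruptible(&wq->queue);
	if (!--wq->wait_ctr)
		kfree(wq);
	mutex_unlock(&sbi->wq_mutex);

Because every manipulation of wait_ctr is serialised on wq_mutex, the counter no longer needs to be atomic, and whichever side drops the last reference does the kfree() only after the other side has released the mutex and is finished with the entry.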
diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c
index d48ff5f370f4..639d2d8b5710 100644
--- a/fs/binfmt_elf.c
+++ b/fs/binfmt_elf.c
@@ -204,6 +204,7 @@ create_elf_tables(struct linux_binprm *bprm, struct elfhdr *exec,
204 NEW_AUX_ENT(AT_GID, tsk->gid); 204 NEW_AUX_ENT(AT_GID, tsk->gid);
205 NEW_AUX_ENT(AT_EGID, tsk->egid); 205 NEW_AUX_ENT(AT_EGID, tsk->egid);
206 NEW_AUX_ENT(AT_SECURE, security_bprm_secureexec(bprm)); 206 NEW_AUX_ENT(AT_SECURE, security_bprm_secureexec(bprm));
207 NEW_AUX_ENT(AT_EXECFN, bprm->exec);
207 if (k_platform) { 208 if (k_platform) {
208 NEW_AUX_ENT(AT_PLATFORM, 209 NEW_AUX_ENT(AT_PLATFORM,
209 (elf_addr_t)(unsigned long)u_platform); 210 (elf_addr_t)(unsigned long)u_platform);
diff --git a/fs/binfmt_misc.c b/fs/binfmt_misc.c
index 7191306367c5..756205314c24 100644
--- a/fs/binfmt_misc.c
+++ b/fs/binfmt_misc.c
@@ -27,6 +27,7 @@
27#include <linux/namei.h> 27#include <linux/namei.h>
28#include <linux/mount.h> 28#include <linux/mount.h>
29#include <linux/syscalls.h> 29#include <linux/syscalls.h>
30#include <linux/fs.h>
30 31
31#include <asm/uaccess.h> 32#include <asm/uaccess.h>
32 33
@@ -535,31 +536,16 @@ static ssize_t
535bm_entry_read(struct file * file, char __user * buf, size_t nbytes, loff_t *ppos) 536bm_entry_read(struct file * file, char __user * buf, size_t nbytes, loff_t *ppos)
536{ 537{
537 Node *e = file->f_path.dentry->d_inode->i_private; 538 Node *e = file->f_path.dentry->d_inode->i_private;
538 loff_t pos = *ppos;
539 ssize_t res; 539 ssize_t res;
540 char *page; 540 char *page;
541 int len;
542 541
543 if (!(page = (char*) __get_free_page(GFP_KERNEL))) 542 if (!(page = (char*) __get_free_page(GFP_KERNEL)))
544 return -ENOMEM; 543 return -ENOMEM;
545 544
546 entry_status(e, page); 545 entry_status(e, page);
547 len = strlen(page);
548 546
549 res = -EINVAL; 547 res = simple_read_from_buffer(buf, nbytes, ppos, page, strlen(page));
550 if (pos < 0) 548
551 goto out;
552 res = 0;
553 if (pos >= len)
554 goto out;
555 if (len < pos + nbytes)
556 nbytes = len - pos;
557 res = -EFAULT;
558 if (copy_to_user(buf, page + pos, nbytes))
559 goto out;
560 *ppos = pos + nbytes;
561 res = nbytes;
562out:
563 free_page((unsigned long) page); 549 free_page((unsigned long) page);
564 return res; 550 return res;
565} 551}
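bm_entry_read() now delegates the bounds checking and copy_to_user() to simple_read_from_buffer(). A minimal sketch of the resulting shape for a read-only status file (hypothetical names, shown only to illustrate the helper's semantics):

static const char my_status[] = "enabled\n";

static ssize_t my_read(struct file *file, char __user *buf,
		       size_t nbytes, loff_t *ppos)
{
	/* handles *ppos < 0, reads past the end, short copies and the
	 * *ppos update, and returns the number of bytes copied */
	return simple_read_from_buffer(buf, nbytes, ppos,
				       my_status, sizeof(my_status) - 1);
}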
diff --git a/fs/coda/psdev.c b/fs/coda/psdev.c
index e3eb3556622b..40c36f7352a6 100644
--- a/fs/coda/psdev.c
+++ b/fs/coda/psdev.c
@@ -362,8 +362,9 @@ static int init_coda_psdev(void)
362 goto out_chrdev; 362 goto out_chrdev;
363 } 363 }
364 for (i = 0; i < MAX_CODADEVS; i++) 364 for (i = 0; i < MAX_CODADEVS; i++)
365 device_create(coda_psdev_class, NULL, 365 device_create_drvdata(coda_psdev_class, NULL,
366 MKDEV(CODA_PSDEV_MAJOR,i), "cfs%d", i); 366 MKDEV(CODA_PSDEV_MAJOR, i),
367 NULL, "cfs%d", i);
367 coda_sysctl_init(); 368 coda_sysctl_init();
368 goto out; 369 goto out;
369 370
diff --git a/fs/compat.c b/fs/compat.c
index ed43e17a5dc6..106eba28ec5a 100644
--- a/fs/compat.c
+++ b/fs/compat.c
@@ -197,8 +197,8 @@ static int put_compat_statfs(struct compat_statfs __user *ubuf, struct kstatfs *
197{ 197{
198 198
199 if (sizeof ubuf->f_blocks == 4) { 199 if (sizeof ubuf->f_blocks == 4) {
200 if ((kbuf->f_blocks | kbuf->f_bfree | kbuf->f_bavail) & 200 if ((kbuf->f_blocks | kbuf->f_bfree | kbuf->f_bavail |
201 0xffffffff00000000ULL) 201 kbuf->f_bsize | kbuf->f_frsize) & 0xffffffff00000000ULL)
202 return -EOVERFLOW; 202 return -EOVERFLOW;
203 /* f_files and f_ffree may be -1; it's okay 203 /* f_files and f_ffree may be -1; it's okay
204 * to stuff that into 32 bits */ 204 * to stuff that into 32 bits */
@@ -271,8 +271,8 @@ out:
271static int put_compat_statfs64(struct compat_statfs64 __user *ubuf, struct kstatfs *kbuf) 271static int put_compat_statfs64(struct compat_statfs64 __user *ubuf, struct kstatfs *kbuf)
272{ 272{
273 if (sizeof ubuf->f_blocks == 4) { 273 if (sizeof ubuf->f_blocks == 4) {
274 if ((kbuf->f_blocks | kbuf->f_bfree | kbuf->f_bavail) & 274 if ((kbuf->f_blocks | kbuf->f_bfree | kbuf->f_bavail |
275 0xffffffff00000000ULL) 275 kbuf->f_bsize | kbuf->f_frsize) & 0xffffffff00000000ULL)
276 return -EOVERFLOW; 276 return -EOVERFLOW;
277 /* f_files and f_ffree may be -1; it's okay 277 /* f_files and f_ffree may be -1; it's okay
278 * to stuff that into 32 bits */ 278 * to stuff that into 32 bits */
@@ -2131,9 +2131,9 @@ asmlinkage long compat_sys_epoll_pwait(int epfd,
2131 2131
2132#ifdef CONFIG_SIGNALFD 2132#ifdef CONFIG_SIGNALFD
2133 2133
2134asmlinkage long compat_sys_signalfd(int ufd, 2134asmlinkage long compat_sys_signalfd4(int ufd,
2135 const compat_sigset_t __user *sigmask, 2135 const compat_sigset_t __user *sigmask,
2136 compat_size_t sigsetsize) 2136 compat_size_t sigsetsize, int flags)
2137{ 2137{
2138 compat_sigset_t ss32; 2138 compat_sigset_t ss32;
2139 sigset_t tmp; 2139 sigset_t tmp;
@@ -2148,9 +2148,15 @@ asmlinkage long compat_sys_signalfd(int ufd,
2148 if (copy_to_user(ksigmask, &tmp, sizeof(sigset_t))) 2148 if (copy_to_user(ksigmask, &tmp, sizeof(sigset_t)))
2149 return -EFAULT; 2149 return -EFAULT;
2150 2150
2151 return sys_signalfd(ufd, ksigmask, sizeof(sigset_t)); 2151 return sys_signalfd4(ufd, ksigmask, sizeof(sigset_t), flags);
2152} 2152}
2153 2153
2154asmlinkage long compat_sys_signalfd(int ufd,
2155 const compat_sigset_t __user *sigmask,
2156 compat_size_t sigsetsize)
2157{
2158 return compat_sys_signalfd4(ufd, sigmask, sigsetsize, 0);
2159}
2154#endif /* CONFIG_SIGNALFD */ 2160#endif /* CONFIG_SIGNALFD */
2155 2161
2156#ifdef CONFIG_TIMERFD 2162#ifdef CONFIG_TIMERFD
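The signalfd change follows the pattern used for the new *4 syscalls in this series: the flag-taking variant carries the implementation and the legacy entry point becomes a one-line wrapper passing flags == 0, so existing callers keep their old semantics. The native side in fs/signalfd.c (listed in the diffstat but not shown in this excerpt) presumably mirrors the compat wrapper above, along the lines of:

asmlinkage long sys_signalfd(int ufd, sigset_t __user *user_mask,
			     size_t sizemask)
{
	/* legacy entry point: identical behaviour, no new flags */
	return sys_signalfd4(ufd, user_mask, sizemask, 0);
}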
diff --git a/fs/compat_ioctl.c b/fs/compat_ioctl.c
index 7b3a03c7c6a9..18e2c548161d 100644
--- a/fs/compat_ioctl.c
+++ b/fs/compat_ioctl.c
@@ -2297,8 +2297,6 @@ COMPATIBLE_IOCTL(AUTOFS_IOC_PROTOVER)
2297COMPATIBLE_IOCTL(AUTOFS_IOC_EXPIRE) 2297COMPATIBLE_IOCTL(AUTOFS_IOC_EXPIRE)
2298COMPATIBLE_IOCTL(AUTOFS_IOC_EXPIRE_MULTI) 2298COMPATIBLE_IOCTL(AUTOFS_IOC_EXPIRE_MULTI)
2299COMPATIBLE_IOCTL(AUTOFS_IOC_PROTOSUBVER) 2299COMPATIBLE_IOCTL(AUTOFS_IOC_PROTOSUBVER)
2300COMPATIBLE_IOCTL(AUTOFS_IOC_ASKREGHOST)
2301COMPATIBLE_IOCTL(AUTOFS_IOC_TOGGLEREGHOST)
2302COMPATIBLE_IOCTL(AUTOFS_IOC_ASKUMOUNT) 2300COMPATIBLE_IOCTL(AUTOFS_IOC_ASKUMOUNT)
2303/* Raw devices */ 2301/* Raw devices */
2304COMPATIBLE_IOCTL(RAW_SETBIND) 2302COMPATIBLE_IOCTL(RAW_SETBIND)
diff --git a/fs/dcache.c b/fs/dcache.c
index 6068c25b393c..3818d6ab76ca 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -61,7 +61,6 @@ static struct kmem_cache *dentry_cache __read_mostly;
61static unsigned int d_hash_mask __read_mostly; 61static unsigned int d_hash_mask __read_mostly;
62static unsigned int d_hash_shift __read_mostly; 62static unsigned int d_hash_shift __read_mostly;
63static struct hlist_head *dentry_hashtable __read_mostly; 63static struct hlist_head *dentry_hashtable __read_mostly;
64static LIST_HEAD(dentry_unused);
65 64
66/* Statistics gathering. */ 65/* Statistics gathering. */
67struct dentry_stat_t dentry_stat = { 66struct dentry_stat_t dentry_stat = {
@@ -96,14 +95,6 @@ static void d_free(struct dentry *dentry)
96 call_rcu(&dentry->d_u.d_rcu, d_callback); 95 call_rcu(&dentry->d_u.d_rcu, d_callback);
97} 96}
98 97
99static void dentry_lru_remove(struct dentry *dentry)
100{
101 if (!list_empty(&dentry->d_lru)) {
102 list_del_init(&dentry->d_lru);
103 dentry_stat.nr_unused--;
104 }
105}
106
107/* 98/*
108 * Release the dentry's inode, using the filesystem 99 * Release the dentry's inode, using the filesystem
109 * d_iput() operation if defined. 100 * d_iput() operation if defined.
@@ -130,6 +121,41 @@ static void dentry_iput(struct dentry * dentry)
130 } 121 }
131} 122}
132 123
124/*
125 * dentry_lru_(add|add_tail|del|del_init) must be called with dcache_lock held.
126 */
127static void dentry_lru_add(struct dentry *dentry)
128{
129 list_add(&dentry->d_lru, &dentry->d_sb->s_dentry_lru);
130 dentry->d_sb->s_nr_dentry_unused++;
131 dentry_stat.nr_unused++;
132}
133
134static void dentry_lru_add_tail(struct dentry *dentry)
135{
136 list_add_tail(&dentry->d_lru, &dentry->d_sb->s_dentry_lru);
137 dentry->d_sb->s_nr_dentry_unused++;
138 dentry_stat.nr_unused++;
139}
140
141static void dentry_lru_del(struct dentry *dentry)
142{
143 if (!list_empty(&dentry->d_lru)) {
144 list_del(&dentry->d_lru);
145 dentry->d_sb->s_nr_dentry_unused--;
146 dentry_stat.nr_unused--;
147 }
148}
149
150static void dentry_lru_del_init(struct dentry *dentry)
151{
152 if (likely(!list_empty(&dentry->d_lru))) {
153 list_del_init(&dentry->d_lru);
154 dentry->d_sb->s_nr_dentry_unused--;
155 dentry_stat.nr_unused--;
156 }
157}
158
133/** 159/**
134 * d_kill - kill dentry and return parent 160 * d_kill - kill dentry and return parent
135 * @dentry: dentry to kill 161 * @dentry: dentry to kill
@@ -212,8 +238,7 @@ repeat:
212 goto kill_it; 238 goto kill_it;
213 if (list_empty(&dentry->d_lru)) { 239 if (list_empty(&dentry->d_lru)) {
214 dentry->d_flags |= DCACHE_REFERENCED; 240 dentry->d_flags |= DCACHE_REFERENCED;
215 list_add(&dentry->d_lru, &dentry_unused); 241 dentry_lru_add(dentry);
216 dentry_stat.nr_unused++;
217 } 242 }
218 spin_unlock(&dentry->d_lock); 243 spin_unlock(&dentry->d_lock);
219 spin_unlock(&dcache_lock); 244 spin_unlock(&dcache_lock);
@@ -222,7 +247,8 @@ repeat:
222unhash_it: 247unhash_it:
223 __d_drop(dentry); 248 __d_drop(dentry);
224kill_it: 249kill_it:
225 dentry_lru_remove(dentry); 250 /* if dentry was on the d_lru list delete it from there */
251 dentry_lru_del(dentry);
226 dentry = d_kill(dentry); 252 dentry = d_kill(dentry);
227 if (dentry) 253 if (dentry)
228 goto repeat; 254 goto repeat;
@@ -290,7 +316,7 @@ int d_invalidate(struct dentry * dentry)
290static inline struct dentry * __dget_locked(struct dentry *dentry) 316static inline struct dentry * __dget_locked(struct dentry *dentry)
291{ 317{
292 atomic_inc(&dentry->d_count); 318 atomic_inc(&dentry->d_count);
293 dentry_lru_remove(dentry); 319 dentry_lru_del_init(dentry);
294 return dentry; 320 return dentry;
295} 321}
296 322
@@ -406,133 +432,167 @@ static void prune_one_dentry(struct dentry * dentry)
406 432
407 if (dentry->d_op && dentry->d_op->d_delete) 433 if (dentry->d_op && dentry->d_op->d_delete)
408 dentry->d_op->d_delete(dentry); 434 dentry->d_op->d_delete(dentry);
409 dentry_lru_remove(dentry); 435 dentry_lru_del_init(dentry);
410 __d_drop(dentry); 436 __d_drop(dentry);
411 dentry = d_kill(dentry); 437 dentry = d_kill(dentry);
412 spin_lock(&dcache_lock); 438 spin_lock(&dcache_lock);
413 } 439 }
414} 440}
415 441
416/** 442/*
417 * prune_dcache - shrink the dcache 443 * Shrink the dentry LRU on a given superblock.
418 * @count: number of entries to try and free 444 * @sb : superblock to shrink dentry LRU.
419 * @sb: if given, ignore dentries for other superblocks 445 * @count: If count is NULL, we prune all dentries on superblock.
420 * which are being unmounted. 446 * @flags: If flags is non-zero, we need to do special processing based on
421 * 447 * which flags are set. This means we don't need to maintain multiple
422 * Shrink the dcache. This is done when we need 448 * similar copies of this loop.
423 * more memory, or simply when we need to unmount
424 * something (at which point we need to unuse
425 * all dentries).
426 *
427 * This function may fail to free any resources if
428 * all the dentries are in use.
429 */ 449 */
430 450static void __shrink_dcache_sb(struct super_block *sb, int *count, int flags)
431static void prune_dcache(int count, struct super_block *sb)
432{ 451{
433 spin_lock(&dcache_lock); 452 LIST_HEAD(referenced);
434 for (; count ; count--) { 453 LIST_HEAD(tmp);
435 struct dentry *dentry; 454 struct dentry *dentry;
436 struct list_head *tmp; 455 int cnt = 0;
437 struct rw_semaphore *s_umount;
438
439 cond_resched_lock(&dcache_lock);
440 456
441 tmp = dentry_unused.prev; 457 BUG_ON(!sb);
442 if (sb) { 458 BUG_ON((flags & DCACHE_REFERENCED) && count == NULL);
443 /* Try to find a dentry for this sb, but don't try 459 spin_lock(&dcache_lock);
444 * too hard, if they aren't near the tail they will 460 if (count != NULL)
445 * be moved down again soon 461 /* called from prune_dcache() and shrink_dcache_parent() */
462 cnt = *count;
463restart:
464 if (count == NULL)
465 list_splice_init(&sb->s_dentry_lru, &tmp);
466 else {
467 while (!list_empty(&sb->s_dentry_lru)) {
468 dentry = list_entry(sb->s_dentry_lru.prev,
469 struct dentry, d_lru);
470 BUG_ON(dentry->d_sb != sb);
471
472 spin_lock(&dentry->d_lock);
473 /*
474 * If we are honouring the DCACHE_REFERENCED flag and
475 * the dentry has this flag set, don't free it. Clear
476 * the flag and put it back on the LRU.
446 */ 477 */
447 int skip = count; 478 if ((flags & DCACHE_REFERENCED)
448 while (skip && tmp != &dentry_unused && 479 && (dentry->d_flags & DCACHE_REFERENCED)) {
449 list_entry(tmp, struct dentry, d_lru)->d_sb != sb) { 480 dentry->d_flags &= ~DCACHE_REFERENCED;
450 skip--; 481 list_move_tail(&dentry->d_lru, &referenced);
451 tmp = tmp->prev; 482 spin_unlock(&dentry->d_lock);
483 } else {
484 list_move_tail(&dentry->d_lru, &tmp);
485 spin_unlock(&dentry->d_lock);
486 cnt--;
487 if (!cnt)
488 break;
452 } 489 }
453 } 490 }
454 if (tmp == &dentry_unused) 491 }
455 break; 492 while (!list_empty(&tmp)) {
456 list_del_init(tmp); 493 dentry = list_entry(tmp.prev, struct dentry, d_lru);
457 prefetch(dentry_unused.prev); 494 dentry_lru_del_init(dentry);
458 dentry_stat.nr_unused--; 495 spin_lock(&dentry->d_lock);
459 dentry = list_entry(tmp, struct dentry, d_lru);
460
461 spin_lock(&dentry->d_lock);
462 /* 496 /*
463 * We found an inuse dentry which was not removed from 497 * We found an inuse dentry which was not removed from
464 * dentry_unused because of laziness during lookup. Do not free 498 * the LRU because of laziness during lookup. Do not free
465 * it - just keep it off the dentry_unused list. 499 * it - just keep it off the LRU list.
466 */ 500 */
467 if (atomic_read(&dentry->d_count)) { 501 if (atomic_read(&dentry->d_count)) {
468 spin_unlock(&dentry->d_lock); 502 spin_unlock(&dentry->d_lock);
469 continue; 503 continue;
470 } 504 }
471 /* If the dentry was recently referenced, don't free it. */ 505 prune_one_dentry(dentry);
472 if (dentry->d_flags & DCACHE_REFERENCED) { 506 /* dentry->d_lock was dropped in prune_one_dentry() */
473 dentry->d_flags &= ~DCACHE_REFERENCED; 507 cond_resched_lock(&dcache_lock);
474 list_add(&dentry->d_lru, &dentry_unused); 508 }
475 dentry_stat.nr_unused++; 509 if (count == NULL && !list_empty(&sb->s_dentry_lru))
476 spin_unlock(&dentry->d_lock); 510 goto restart;
511 if (count != NULL)
512 *count = cnt;
513 if (!list_empty(&referenced))
514 list_splice(&referenced, &sb->s_dentry_lru);
515 spin_unlock(&dcache_lock);
516}
517
518/**
519 * prune_dcache - shrink the dcache
520 * @count: number of entries to try to free
521 *
522 * Shrink the dcache. This is done when we need more memory, or simply when we
523 * need to unmount something (at which point we need to unuse all dentries).
524 *
525 * This function may fail to free any resources if all the dentries are in use.
526 */
527static void prune_dcache(int count)
528{
529 struct super_block *sb;
530 int w_count;
531 int unused = dentry_stat.nr_unused;
532 int prune_ratio;
533 int pruned;
534
535 if (unused == 0 || count == 0)
536 return;
537 spin_lock(&dcache_lock);
538restart:
539 if (count >= unused)
540 prune_ratio = 1;
541 else
542 prune_ratio = unused / count;
543 spin_lock(&sb_lock);
544 list_for_each_entry(sb, &super_blocks, s_list) {
545 if (sb->s_nr_dentry_unused == 0)
477 continue; 546 continue;
478 } 547 sb->s_count++;
 479 /* 548 /* Now, we reclaim unused dentries with fairness.
 480 * If the dentry is not DCACHED_REFERENCED, it is time 549 * We reclaim the same percentage from each superblock.
 481 * to remove it from the dcache, provided the super block is 550 * We calculate the number of dentries to scan on this sb
482 * NULL (which means we are trying to reclaim memory) 551 * as follows, but the implementation is arranged to avoid
483 * or this dentry belongs to the same super block that 552 * overflows:
484 * we want to shrink. 553 * number of dentries to scan on this sb =
485 */ 554 * count * (number of dentries on this sb /
486 /* 555 * number of dentries in the machine)
487 * If this dentry is for "my" filesystem, then I can prune it
488 * without taking the s_umount lock (I already hold it).
489 */ 556 */
490 if (sb && dentry->d_sb == sb) { 557 spin_unlock(&sb_lock);
491 prune_one_dentry(dentry); 558 if (prune_ratio != 1)
492 continue; 559 w_count = (sb->s_nr_dentry_unused / prune_ratio) + 1;
493 } 560 else
561 w_count = sb->s_nr_dentry_unused;
562 pruned = w_count;
494 /* 563 /*
495 * ...otherwise we need to be sure this filesystem isn't being 564 * We need to be sure this filesystem isn't being unmounted,
496 * unmounted, otherwise we could race with 565 * otherwise we could race with generic_shutdown_super(), and
497 * generic_shutdown_super(), and end up holding a reference to 566 * end up holding a reference to an inode while the filesystem
498 * an inode while the filesystem is unmounted. 567 * is unmounted. So we try to get s_umount, and make sure
499 * So we try to get s_umount, and make sure s_root isn't NULL. 568 * s_root isn't NULL.
500 * (Take a local copy of s_umount to avoid a use-after-free of
501 * `dentry').
502 */ 569 */
503 s_umount = &dentry->d_sb->s_umount; 570 if (down_read_trylock(&sb->s_umount)) {
504 if (down_read_trylock(s_umount)) { 571 if ((sb->s_root != NULL) &&
505 if (dentry->d_sb->s_root != NULL) { 572 (!list_empty(&sb->s_dentry_lru))) {
506 prune_one_dentry(dentry); 573 spin_unlock(&dcache_lock);
507 up_read(s_umount); 574 __shrink_dcache_sb(sb, &w_count,
508 continue; 575 DCACHE_REFERENCED);
576 pruned -= w_count;
577 spin_lock(&dcache_lock);
509 } 578 }
510 up_read(s_umount); 579 up_read(&sb->s_umount);
511 } 580 }
512 spin_unlock(&dentry->d_lock); 581 spin_lock(&sb_lock);
582 count -= pruned;
513 /* 583 /*
514 * Insert dentry at the head of the list as inserting at the 584 * restart only when sb is no longer on the list and
515 * tail leads to a cycle. 585 * we have more work to do.
516 */ 586 */
517 list_add(&dentry->d_lru, &dentry_unused); 587 if (__put_super_and_need_restart(sb) && count > 0) {
518 dentry_stat.nr_unused++; 588 spin_unlock(&sb_lock);
589 goto restart;
590 }
519 } 591 }
592 spin_unlock(&sb_lock);
520 spin_unlock(&dcache_lock); 593 spin_unlock(&dcache_lock);
521} 594}
522 595
523/*
524 * Shrink the dcache for the specified super block.
525 * This allows us to unmount a device without disturbing
526 * the dcache for the other devices.
527 *
528 * This implementation makes just two traversals of the
529 * unused list. On the first pass we move the selected
530 * dentries to the most recent end, and on the second
531 * pass we free them. The second pass must restart after
532 * each dput(), but since the target dentries are all at
533 * the end, it's really just a single traversal.
534 */
535
536/** 596/**
537 * shrink_dcache_sb - shrink dcache for a superblock 597 * shrink_dcache_sb - shrink dcache for a superblock
538 * @sb: superblock 598 * @sb: superblock
@@ -541,44 +601,9 @@ static void prune_dcache(int count, struct super_block *sb)
541 * is used to free the dcache before unmounting a file 601 * is used to free the dcache before unmounting a file
542 * system 602 * system
543 */ 603 */
544
545void shrink_dcache_sb(struct super_block * sb) 604void shrink_dcache_sb(struct super_block * sb)
546{ 605{
547 struct list_head *tmp, *next; 606 __shrink_dcache_sb(sb, NULL, 0);
548 struct dentry *dentry;
549
550 /*
551 * Pass one ... move the dentries for the specified
552 * superblock to the most recent end of the unused list.
553 */
554 spin_lock(&dcache_lock);
555 list_for_each_prev_safe(tmp, next, &dentry_unused) {
556 dentry = list_entry(tmp, struct dentry, d_lru);
557 if (dentry->d_sb != sb)
558 continue;
559 list_move_tail(tmp, &dentry_unused);
560 }
561
562 /*
563 * Pass two ... free the dentries for this superblock.
564 */
565repeat:
566 list_for_each_prev_safe(tmp, next, &dentry_unused) {
567 dentry = list_entry(tmp, struct dentry, d_lru);
568 if (dentry->d_sb != sb)
569 continue;
570 dentry_stat.nr_unused--;
571 list_del_init(tmp);
572 spin_lock(&dentry->d_lock);
573 if (atomic_read(&dentry->d_count)) {
574 spin_unlock(&dentry->d_lock);
575 continue;
576 }
577 prune_one_dentry(dentry);
578 cond_resched_lock(&dcache_lock);
579 goto repeat;
580 }
581 spin_unlock(&dcache_lock);
582} 607}
583 608
584/* 609/*
@@ -595,7 +620,7 @@ static void shrink_dcache_for_umount_subtree(struct dentry *dentry)
595 620
596 /* detach this root from the system */ 621 /* detach this root from the system */
597 spin_lock(&dcache_lock); 622 spin_lock(&dcache_lock);
598 dentry_lru_remove(dentry); 623 dentry_lru_del_init(dentry);
599 __d_drop(dentry); 624 __d_drop(dentry);
600 spin_unlock(&dcache_lock); 625 spin_unlock(&dcache_lock);
601 626
@@ -609,7 +634,7 @@ static void shrink_dcache_for_umount_subtree(struct dentry *dentry)
609 spin_lock(&dcache_lock); 634 spin_lock(&dcache_lock);
610 list_for_each_entry(loop, &dentry->d_subdirs, 635 list_for_each_entry(loop, &dentry->d_subdirs,
611 d_u.d_child) { 636 d_u.d_child) {
612 dentry_lru_remove(loop); 637 dentry_lru_del_init(loop);
613 __d_drop(loop); 638 __d_drop(loop);
614 cond_resched_lock(&dcache_lock); 639 cond_resched_lock(&dcache_lock);
615 } 640 }
@@ -791,14 +816,13 @@ resume:
791 struct dentry *dentry = list_entry(tmp, struct dentry, d_u.d_child); 816 struct dentry *dentry = list_entry(tmp, struct dentry, d_u.d_child);
792 next = tmp->next; 817 next = tmp->next;
793 818
794 dentry_lru_remove(dentry); 819 dentry_lru_del_init(dentry);
795 /* 820 /*
796 * move only zero ref count dentries to the end 821 * move only zero ref count dentries to the end
797 * of the unused list for prune_dcache 822 * of the unused list for prune_dcache
798 */ 823 */
799 if (!atomic_read(&dentry->d_count)) { 824 if (!atomic_read(&dentry->d_count)) {
800 list_add_tail(&dentry->d_lru, &dentry_unused); 825 dentry_lru_add_tail(dentry);
801 dentry_stat.nr_unused++;
802 found++; 826 found++;
803 } 827 }
804 828
@@ -840,10 +864,11 @@ out:
840 864
841void shrink_dcache_parent(struct dentry * parent) 865void shrink_dcache_parent(struct dentry * parent)
842{ 866{
867 struct super_block *sb = parent->d_sb;
843 int found; 868 int found;
844 869
845 while ((found = select_parent(parent)) != 0) 870 while ((found = select_parent(parent)) != 0)
846 prune_dcache(found, parent->d_sb); 871 __shrink_dcache_sb(sb, &found, 0);
847} 872}
848 873
849/* 874/*
@@ -863,7 +888,7 @@ static int shrink_dcache_memory(int nr, gfp_t gfp_mask)
863 if (nr) { 888 if (nr) {
864 if (!(gfp_mask & __GFP_FS)) 889 if (!(gfp_mask & __GFP_FS))
865 return -1; 890 return -1;
866 prune_dcache(nr, NULL); 891 prune_dcache(nr);
867 } 892 }
868 return (dentry_stat.nr_unused / 100) * sysctl_vfs_cache_pressure; 893 return (dentry_stat.nr_unused / 100) * sysctl_vfs_cache_pressure;
869} 894}
@@ -1215,7 +1240,7 @@ struct dentry *d_splice_alias(struct inode *inode, struct dentry *dentry)
1215 * rcu_read_lock() and rcu_read_unlock() are used to disable preemption while 1240 * rcu_read_lock() and rcu_read_unlock() are used to disable preemption while
1216 * lookup is going on. 1241 * lookup is going on.
1217 * 1242 *
1218 * dentry_unused list is not updated even if lookup finds the required dentry 1243 * The dentry unused LRU is not updated even if lookup finds the required dentry
1219 * in there. It is updated in places such as prune_dcache, shrink_dcache_sb, 1244 * in there. It is updated in places such as prune_dcache, shrink_dcache_sb,
1220 * select_parent and __dget_locked. This laziness saves lookup from dcache_lock 1245 * select_parent and __dget_locked. This laziness saves lookup from dcache_lock
1221 * acquisition. 1246 * acquisition.
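The LRU helpers and __shrink_dcache_sb() above depend on per-superblock state that is declared outside fs/ and therefore does not appear in this diffstat. Presumably struct super_block gains two fields along these lines:

	struct list_head	s_dentry_lru;		/* unused dentry LRU */
	int			s_nr_dentry_unused;	/* # of dentries on the LRU */

with the matching INIT_LIST_HEAD(&s->s_dentry_lru) in alloc_super() most likely being the single-line fs/super.c change in the diffstat above.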
diff --git a/fs/debugfs/inode.c b/fs/debugfs/inode.c
index e9602d85c11d..08e28c9bb416 100644
--- a/fs/debugfs/inode.c
+++ b/fs/debugfs/inode.c
@@ -309,6 +309,31 @@ struct dentry *debugfs_create_symlink(const char *name, struct dentry *parent,
309} 309}
310EXPORT_SYMBOL_GPL(debugfs_create_symlink); 310EXPORT_SYMBOL_GPL(debugfs_create_symlink);
311 311
312static void __debugfs_remove(struct dentry *dentry, struct dentry *parent)
313{
314 int ret = 0;
315
316 if (debugfs_positive(dentry)) {
317 if (dentry->d_inode) {
318 dget(dentry);
319 switch (dentry->d_inode->i_mode & S_IFMT) {
320 case S_IFDIR:
321 ret = simple_rmdir(parent->d_inode, dentry);
322 break;
323 case S_IFLNK:
324 kfree(dentry->d_inode->i_private);
325 /* fall through */
326 default:
327 simple_unlink(parent->d_inode, dentry);
328 break;
329 }
330 if (!ret)
331 d_delete(dentry);
332 dput(dentry);
333 }
334 }
335}
336
312/** 337/**
313 * debugfs_remove - removes a file or directory from the debugfs filesystem 338 * debugfs_remove - removes a file or directory from the debugfs filesystem
 314 * @dentry: a pointer to the dentry of the file or directory to be 339
@@ -325,7 +350,6 @@ EXPORT_SYMBOL_GPL(debugfs_create_symlink);
325void debugfs_remove(struct dentry *dentry) 350void debugfs_remove(struct dentry *dentry)
326{ 351{
327 struct dentry *parent; 352 struct dentry *parent;
328 int ret = 0;
329 353
330 if (!dentry) 354 if (!dentry)
331 return; 355 return;
@@ -335,29 +359,83 @@ void debugfs_remove(struct dentry *dentry)
335 return; 359 return;
336 360
337 mutex_lock(&parent->d_inode->i_mutex); 361 mutex_lock(&parent->d_inode->i_mutex);
338 if (debugfs_positive(dentry)) { 362 __debugfs_remove(dentry, parent);
339 if (dentry->d_inode) { 363 mutex_unlock(&parent->d_inode->i_mutex);
340 dget(dentry); 364 simple_release_fs(&debugfs_mount, &debugfs_mount_count);
341 switch (dentry->d_inode->i_mode & S_IFMT) { 365}
342 case S_IFDIR: 366EXPORT_SYMBOL_GPL(debugfs_remove);
343 ret = simple_rmdir(parent->d_inode, dentry); 367
344 break; 368/**
345 case S_IFLNK: 369 * debugfs_remove_recursive - recursively removes a directory
 346 kfree(dentry->d_inode->i_private); 370 * @dentry: a pointer to the dentry of the directory to be removed.
347 /* fall through */ 371 *
348 default: 372 * This function recursively removes a directory tree in debugfs that
349 simple_unlink(parent->d_inode, dentry); 373 * was previously created with a call to another debugfs function
374 * (like debugfs_create_file() or variants thereof.)
375 *
376 * This function is required to be called in order for the file to be
 377 * removed; no automatic cleanup of files will happen when a module is
 378 * removed, so you are responsible here.
379 */
380void debugfs_remove_recursive(struct dentry *dentry)
381{
382 struct dentry *child;
383 struct dentry *parent;
384
385 if (!dentry)
386 return;
387
388 parent = dentry->d_parent;
389 if (!parent || !parent->d_inode)
390 return;
391
392 parent = dentry;
393 mutex_lock(&parent->d_inode->i_mutex);
394
395 while (1) {
396 /*
 397 * When all dentries under "parent" have been removed,
398 * walk up the tree until we reach our starting point.
399 */
400 if (list_empty(&parent->d_subdirs)) {
401 mutex_unlock(&parent->d_inode->i_mutex);
402 if (parent == dentry)
350 break; 403 break;
351 } 404 parent = parent->d_parent;
352 if (!ret) 405 mutex_lock(&parent->d_inode->i_mutex);
353 d_delete(dentry); 406 }
354 dput(dentry); 407 child = list_entry(parent->d_subdirs.next, struct dentry,
408 d_u.d_child);
409
410 /*
411 * If "child" isn't empty, walk down the tree and
412 * remove all its descendants first.
413 */
414 if (!list_empty(&child->d_subdirs)) {
415 mutex_unlock(&parent->d_inode->i_mutex);
416 parent = child;
417 mutex_lock(&parent->d_inode->i_mutex);
418 continue;
355 } 419 }
420 __debugfs_remove(child, parent);
421 if (parent->d_subdirs.next == &child->d_u.d_child) {
422 /*
423 * Avoid infinite loop if we fail to remove
424 * one dentry.
425 */
426 mutex_unlock(&parent->d_inode->i_mutex);
427 break;
428 }
429 simple_release_fs(&debugfs_mount, &debugfs_mount_count);
356 } 430 }
431
432 parent = dentry->d_parent;
433 mutex_lock(&parent->d_inode->i_mutex);
434 __debugfs_remove(dentry, parent);
357 mutex_unlock(&parent->d_inode->i_mutex); 435 mutex_unlock(&parent->d_inode->i_mutex);
358 simple_release_fs(&debugfs_mount, &debugfs_mount_count); 436 simple_release_fs(&debugfs_mount, &debugfs_mount_count);
359} 437}
360EXPORT_SYMBOL_GPL(debugfs_remove); 438EXPORT_SYMBOL_GPL(debugfs_remove_recursive);
361 439
362/** 440/**
363 * debugfs_rename - rename a file/directory in the debugfs filesystem 441 * debugfs_rename - rename a file/directory in the debugfs filesystem
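A usage sketch for the new debugfs_remove_recursive() (hypothetical module and names, shown only to illustrate the intended call pattern):

#include <linux/module.h>
#include <linux/debugfs.h>

static struct dentry *my_debug_dir;
static u32 my_counter;

static int __init my_init(void)
{
	my_debug_dir = debugfs_create_dir("my_driver", NULL);
	if (!my_debug_dir)
		return -ENODEV;
	debugfs_create_u32("counter", 0444, my_debug_dir, &my_counter);
	return 0;
}

static void __exit my_exit(void)
{
	/* tears down "counter" and then "my_driver" itself, so the module
	 * does not need to keep and remove every dentry individually */
	debugfs_remove_recursive(my_debug_dir);
}

module_init(my_init);
module_exit(my_exit);
MODULE_LICENSE("GPL");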
diff --git a/fs/ecryptfs/Makefile b/fs/ecryptfs/Makefile
index 1e34a7fd4884..b4755a85996e 100644
--- a/fs/ecryptfs/Makefile
+++ b/fs/ecryptfs/Makefile
@@ -4,4 +4,4 @@
4 4
5obj-$(CONFIG_ECRYPT_FS) += ecryptfs.o 5obj-$(CONFIG_ECRYPT_FS) += ecryptfs.o
6 6
7ecryptfs-objs := dentry.o file.o inode.o main.o super.o mmap.o read_write.o crypto.o keystore.o messaging.o netlink.o miscdev.o debug.o 7ecryptfs-objs := dentry.o file.o inode.o main.o super.o mmap.o read_write.o crypto.o keystore.o messaging.o netlink.o miscdev.o kthread.o debug.o
diff --git a/fs/ecryptfs/crypto.c b/fs/ecryptfs/crypto.c
index e2832bc7869a..7b99917ffadc 100644
--- a/fs/ecryptfs/crypto.c
+++ b/fs/ecryptfs/crypto.c
@@ -33,6 +33,7 @@
33#include <linux/crypto.h> 33#include <linux/crypto.h>
34#include <linux/file.h> 34#include <linux/file.h>
35#include <linux/scatterlist.h> 35#include <linux/scatterlist.h>
36#include <asm/unaligned.h>
36#include "ecryptfs_kernel.h" 37#include "ecryptfs_kernel.h"
37 38
38static int 39static int
@@ -1032,10 +1033,8 @@ static int contains_ecryptfs_marker(char *data)
1032{ 1033{
1033 u32 m_1, m_2; 1034 u32 m_1, m_2;
1034 1035
1035 memcpy(&m_1, data, 4); 1036 m_1 = get_unaligned_be32(data);
1036 m_1 = be32_to_cpu(m_1); 1037 m_2 = get_unaligned_be32(data + 4);
1037 memcpy(&m_2, (data + 4), 4);
1038 m_2 = be32_to_cpu(m_2);
1039 if ((m_1 ^ MAGIC_ECRYPTFS_MARKER) == m_2) 1038 if ((m_1 ^ MAGIC_ECRYPTFS_MARKER) == m_2)
1040 return 1; 1039 return 1;
1041 ecryptfs_printk(KERN_DEBUG, "m_1 = [0x%.8x]; m_2 = [0x%.8x]; " 1040 ecryptfs_printk(KERN_DEBUG, "m_1 = [0x%.8x]; m_2 = [0x%.8x]; "
@@ -1073,8 +1072,7 @@ static int ecryptfs_process_flags(struct ecryptfs_crypt_stat *crypt_stat,
1073 int i; 1072 int i;
1074 u32 flags; 1073 u32 flags;
1075 1074
1076 memcpy(&flags, page_virt, 4); 1075 flags = get_unaligned_be32(page_virt);
1077 flags = be32_to_cpu(flags);
1078 for (i = 0; i < ((sizeof(ecryptfs_flag_map) 1076 for (i = 0; i < ((sizeof(ecryptfs_flag_map)
1079 / sizeof(struct ecryptfs_flag_map_elem))); i++) 1077 / sizeof(struct ecryptfs_flag_map_elem))); i++)
1080 if (flags & ecryptfs_flag_map[i].file_flag) { 1078 if (flags & ecryptfs_flag_map[i].file_flag) {
@@ -1100,11 +1098,9 @@ static void write_ecryptfs_marker(char *page_virt, size_t *written)
1100 1098
1101 get_random_bytes(&m_1, (MAGIC_ECRYPTFS_MARKER_SIZE_BYTES / 2)); 1099 get_random_bytes(&m_1, (MAGIC_ECRYPTFS_MARKER_SIZE_BYTES / 2));
1102 m_2 = (m_1 ^ MAGIC_ECRYPTFS_MARKER); 1100 m_2 = (m_1 ^ MAGIC_ECRYPTFS_MARKER);
1103 m_1 = cpu_to_be32(m_1); 1101 put_unaligned_be32(m_1, page_virt);
1104 memcpy(page_virt, &m_1, (MAGIC_ECRYPTFS_MARKER_SIZE_BYTES / 2)); 1102 page_virt += (MAGIC_ECRYPTFS_MARKER_SIZE_BYTES / 2);
1105 m_2 = cpu_to_be32(m_2); 1103 put_unaligned_be32(m_2, page_virt);
1106 memcpy(page_virt + (MAGIC_ECRYPTFS_MARKER_SIZE_BYTES / 2), &m_2,
1107 (MAGIC_ECRYPTFS_MARKER_SIZE_BYTES / 2));
1108 (*written) = MAGIC_ECRYPTFS_MARKER_SIZE_BYTES; 1104 (*written) = MAGIC_ECRYPTFS_MARKER_SIZE_BYTES;
1109} 1105}
1110 1106
@@ -1121,8 +1117,7 @@ write_ecryptfs_flags(char *page_virt, struct ecryptfs_crypt_stat *crypt_stat,
1121 flags |= ecryptfs_flag_map[i].file_flag; 1117 flags |= ecryptfs_flag_map[i].file_flag;
1122 /* Version is in top 8 bits of the 32-bit flag vector */ 1118 /* Version is in top 8 bits of the 32-bit flag vector */
1123 flags |= ((((u8)crypt_stat->file_version) << 24) & 0xFF000000); 1119 flags |= ((((u8)crypt_stat->file_version) << 24) & 0xFF000000);
1124 flags = cpu_to_be32(flags); 1120 put_unaligned_be32(flags, page_virt);
1125 memcpy(page_virt, &flags, 4);
1126 (*written) = 4; 1121 (*written) = 4;
1127} 1122}
1128 1123
@@ -1238,11 +1233,9 @@ ecryptfs_write_header_metadata(char *virt,
1238 num_header_extents_at_front = 1233 num_header_extents_at_front =
1239 (u16)(crypt_stat->num_header_bytes_at_front 1234 (u16)(crypt_stat->num_header_bytes_at_front
1240 / crypt_stat->extent_size); 1235 / crypt_stat->extent_size);
1241 header_extent_size = cpu_to_be32(header_extent_size); 1236 put_unaligned_be32(header_extent_size, virt);
1242 memcpy(virt, &header_extent_size, 4);
1243 virt += 4; 1237 virt += 4;
1244 num_header_extents_at_front = cpu_to_be16(num_header_extents_at_front); 1238 put_unaligned_be16(num_header_extents_at_front, virt);
1245 memcpy(virt, &num_header_extents_at_front, 2);
1246 (*written) = 6; 1239 (*written) = 6;
1247} 1240}
1248 1241
@@ -1410,15 +1403,13 @@ static int parse_header_metadata(struct ecryptfs_crypt_stat *crypt_stat,
1410 u32 header_extent_size; 1403 u32 header_extent_size;
1411 u16 num_header_extents_at_front; 1404 u16 num_header_extents_at_front;
1412 1405
1413 memcpy(&header_extent_size, virt, sizeof(u32)); 1406 header_extent_size = get_unaligned_be32(virt);
1414 header_extent_size = be32_to_cpu(header_extent_size); 1407 virt += sizeof(__be32);
1415 virt += sizeof(u32); 1408 num_header_extents_at_front = get_unaligned_be16(virt);
1416 memcpy(&num_header_extents_at_front, virt, sizeof(u16));
1417 num_header_extents_at_front = be16_to_cpu(num_header_extents_at_front);
1418 crypt_stat->num_header_bytes_at_front = 1409 crypt_stat->num_header_bytes_at_front =
1419 (((size_t)num_header_extents_at_front 1410 (((size_t)num_header_extents_at_front
1420 * (size_t)header_extent_size)); 1411 * (size_t)header_extent_size));
1421 (*bytes_read) = (sizeof(u32) + sizeof(u16)); 1412 (*bytes_read) = (sizeof(__be32) + sizeof(__be16));
1422 if ((validate_header_size == ECRYPTFS_VALIDATE_HEADER_SIZE) 1413 if ((validate_header_size == ECRYPTFS_VALIDATE_HEADER_SIZE)
1423 && (crypt_stat->num_header_bytes_at_front 1414 && (crypt_stat->num_header_bytes_at_front
1424 < ECRYPTFS_MINIMUM_HEADER_EXTENT_SIZE)) { 1415 < ECRYPTFS_MINIMUM_HEADER_EXTENT_SIZE)) {
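The crypto.c conversions all follow one pattern: an open-coded memcpy() plus cpu_to_be32()/be32_to_cpu() pair becomes a single put_unaligned_be32()/get_unaligned_be32() call, which is also safe on architectures that fault on misaligned accesses. A minimal sketch (illustrative helper names):

#include <linux/types.h>
#include <asm/unaligned.h>

/* read a big-endian 32-bit field from an arbitrary header offset */
static u32 read_be32_field(const char *virt)
{
	return get_unaligned_be32(virt);	/* was: memcpy + be32_to_cpu */
}

/* write the field back in big-endian order */
static void write_be32_field(char *virt, u32 val)
{
	put_unaligned_be32(val, virt);		/* was: cpu_to_be32 + memcpy */
}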
diff --git a/fs/ecryptfs/ecryptfs_kernel.h b/fs/ecryptfs/ecryptfs_kernel.h
index c15c25745e05..b73fb752c5f8 100644
--- a/fs/ecryptfs/ecryptfs_kernel.h
+++ b/fs/ecryptfs/ecryptfs_kernel.h
@@ -559,10 +559,25 @@ extern struct kmem_cache *ecryptfs_key_record_cache;
559extern struct kmem_cache *ecryptfs_key_sig_cache; 559extern struct kmem_cache *ecryptfs_key_sig_cache;
560extern struct kmem_cache *ecryptfs_global_auth_tok_cache; 560extern struct kmem_cache *ecryptfs_global_auth_tok_cache;
561extern struct kmem_cache *ecryptfs_key_tfm_cache; 561extern struct kmem_cache *ecryptfs_key_tfm_cache;
562extern struct kmem_cache *ecryptfs_open_req_cache;
562 563
564struct ecryptfs_open_req {
565#define ECRYPTFS_REQ_PROCESSED 0x00000001
566#define ECRYPTFS_REQ_DROPPED 0x00000002
567#define ECRYPTFS_REQ_ZOMBIE 0x00000004
568 u32 flags;
569 struct file **lower_file;
570 struct dentry *lower_dentry;
571 struct vfsmount *lower_mnt;
572 wait_queue_head_t wait;
573 struct mutex mux;
574 struct list_head kthread_ctl_list;
575};
576
577#define ECRYPTFS_INTERPOSE_FLAG_D_ADD 0x00000001
563int ecryptfs_interpose(struct dentry *hidden_dentry, 578int ecryptfs_interpose(struct dentry *hidden_dentry,
564 struct dentry *this_dentry, struct super_block *sb, 579 struct dentry *this_dentry, struct super_block *sb,
565 int flag); 580 u32 flags);
566int ecryptfs_fill_zeros(struct file *file, loff_t new_length); 581int ecryptfs_fill_zeros(struct file *file, loff_t new_length);
567int ecryptfs_decode_filename(struct ecryptfs_crypt_stat *crypt_stat, 582int ecryptfs_decode_filename(struct ecryptfs_crypt_stat *crypt_stat,
568 const char *name, int length, 583 const char *name, int length,
@@ -690,5 +705,11 @@ void ecryptfs_msg_ctx_alloc_to_free(struct ecryptfs_msg_ctx *msg_ctx);
690int 705int
691ecryptfs_spawn_daemon(struct ecryptfs_daemon **daemon, uid_t euid, 706ecryptfs_spawn_daemon(struct ecryptfs_daemon **daemon, uid_t euid,
692 struct user_namespace *user_ns, struct pid *pid); 707 struct user_namespace *user_ns, struct pid *pid);
708int ecryptfs_init_kthread(void);
709void ecryptfs_destroy_kthread(void);
710int ecryptfs_privileged_open(struct file **lower_file,
711 struct dentry *lower_dentry,
712 struct vfsmount *lower_mnt);
713int ecryptfs_init_persistent_file(struct dentry *ecryptfs_dentry);
693 714
694#endif /* #ifndef ECRYPTFS_KERNEL_H */ 715#endif /* #ifndef ECRYPTFS_KERNEL_H */
diff --git a/fs/ecryptfs/file.c b/fs/ecryptfs/file.c
index 24749bf0668f..9244d653743e 100644
--- a/fs/ecryptfs/file.c
+++ b/fs/ecryptfs/file.c
@@ -192,6 +192,23 @@ static int ecryptfs_open(struct inode *inode, struct file *file)
192 | ECRYPTFS_ENCRYPTED); 192 | ECRYPTFS_ENCRYPTED);
193 } 193 }
194 mutex_unlock(&crypt_stat->cs_mutex); 194 mutex_unlock(&crypt_stat->cs_mutex);
195 if ((ecryptfs_inode_to_private(inode)->lower_file->f_flags & O_RDONLY)
196 && !(file->f_flags & O_RDONLY)) {
197 rc = -EPERM;
198 printk(KERN_WARNING "%s: Lower persistent file is RO; eCryptfs "
199 "file must hence be opened RO\n", __func__);
200 goto out;
201 }
202 if (!ecryptfs_inode_to_private(inode)->lower_file) {
203 rc = ecryptfs_init_persistent_file(ecryptfs_dentry);
204 if (rc) {
205 printk(KERN_ERR "%s: Error attempting to initialize "
206 "the persistent file for the dentry with name "
207 "[%s]; rc = [%d]\n", __func__,
208 ecryptfs_dentry->d_name.name, rc);
209 goto out;
210 }
211 }
195 ecryptfs_set_file_lower( 212 ecryptfs_set_file_lower(
196 file, ecryptfs_inode_to_private(inode)->lower_file); 213 file, ecryptfs_inode_to_private(inode)->lower_file);
197 if (S_ISDIR(ecryptfs_dentry->d_inode->i_mode)) { 214 if (S_ISDIR(ecryptfs_dentry->d_inode->i_mode)) {
diff --git a/fs/ecryptfs/inode.c b/fs/ecryptfs/inode.c
index c92cc1c00aae..d755455e3bff 100644
--- a/fs/ecryptfs/inode.c
+++ b/fs/ecryptfs/inode.c
@@ -31,6 +31,7 @@
31#include <linux/mount.h> 31#include <linux/mount.h>
32#include <linux/crypto.h> 32#include <linux/crypto.h>
33#include <linux/fs_stack.h> 33#include <linux/fs_stack.h>
34#include <asm/unaligned.h>
34#include "ecryptfs_kernel.h" 35#include "ecryptfs_kernel.h"
35 36
36static struct dentry *lock_parent(struct dentry *dentry) 37static struct dentry *lock_parent(struct dentry *dentry)
@@ -188,6 +189,16 @@ static int ecryptfs_initialize_file(struct dentry *ecryptfs_dentry)
188 "context; rc = [%d]\n", rc); 189 "context; rc = [%d]\n", rc);
189 goto out; 190 goto out;
190 } 191 }
192 if (!ecryptfs_inode_to_private(ecryptfs_dentry->d_inode)->lower_file) {
193 rc = ecryptfs_init_persistent_file(ecryptfs_dentry);
194 if (rc) {
195 printk(KERN_ERR "%s: Error attempting to initialize "
196 "the persistent file for the dentry with name "
197 "[%s]; rc = [%d]\n", __func__,
198 ecryptfs_dentry->d_name.name, rc);
199 goto out;
200 }
201 }
191 rc = ecryptfs_write_metadata(ecryptfs_dentry); 202 rc = ecryptfs_write_metadata(ecryptfs_dentry);
192 if (rc) { 203 if (rc) {
193 printk(KERN_ERR "Error writing headers; rc = [%d]\n", rc); 204 printk(KERN_ERR "Error writing headers; rc = [%d]\n", rc);
@@ -307,10 +318,11 @@ static struct dentry *ecryptfs_lookup(struct inode *dir, struct dentry *dentry,
307 d_add(dentry, NULL); 318 d_add(dentry, NULL);
308 goto out; 319 goto out;
309 } 320 }
310 rc = ecryptfs_interpose(lower_dentry, dentry, dir->i_sb, 1); 321 rc = ecryptfs_interpose(lower_dentry, dentry, dir->i_sb,
322 ECRYPTFS_INTERPOSE_FLAG_D_ADD);
311 if (rc) { 323 if (rc) {
312 ecryptfs_printk(KERN_ERR, "Error interposing\n"); 324 ecryptfs_printk(KERN_ERR, "Error interposing\n");
313 goto out_dput; 325 goto out;
314 } 326 }
315 if (S_ISDIR(lower_inode->i_mode)) { 327 if (S_ISDIR(lower_inode->i_mode)) {
316 ecryptfs_printk(KERN_DEBUG, "Is a directory; returning\n"); 328 ecryptfs_printk(KERN_DEBUG, "Is a directory; returning\n");
@@ -336,11 +348,21 @@ static struct dentry *ecryptfs_lookup(struct inode *dir, struct dentry *dentry,
336 rc = -ENOMEM; 348 rc = -ENOMEM;
337 ecryptfs_printk(KERN_ERR, 349 ecryptfs_printk(KERN_ERR,
338 "Cannot ecryptfs_kmalloc a page\n"); 350 "Cannot ecryptfs_kmalloc a page\n");
339 goto out_dput; 351 goto out;
340 } 352 }
341 crypt_stat = &ecryptfs_inode_to_private(dentry->d_inode)->crypt_stat; 353 crypt_stat = &ecryptfs_inode_to_private(dentry->d_inode)->crypt_stat;
342 if (!(crypt_stat->flags & ECRYPTFS_POLICY_APPLIED)) 354 if (!(crypt_stat->flags & ECRYPTFS_POLICY_APPLIED))
343 ecryptfs_set_default_sizes(crypt_stat); 355 ecryptfs_set_default_sizes(crypt_stat);
356 if (!ecryptfs_inode_to_private(dentry->d_inode)->lower_file) {
357 rc = ecryptfs_init_persistent_file(dentry);
358 if (rc) {
359 printk(KERN_ERR "%s: Error attempting to initialize "
360 "the persistent file for the dentry with name "
361 "[%s]; rc = [%d]\n", __func__,
362 dentry->d_name.name, rc);
363 goto out;
364 }
365 }
344 rc = ecryptfs_read_and_validate_header_region(page_virt, 366 rc = ecryptfs_read_and_validate_header_region(page_virt,
345 dentry->d_inode); 367 dentry->d_inode);
346 if (rc) { 368 if (rc) {
@@ -364,8 +386,7 @@ static struct dentry *ecryptfs_lookup(struct inode *dir, struct dentry *dentry,
364 else 386 else
365 file_size = i_size_read(lower_dentry->d_inode); 387 file_size = i_size_read(lower_dentry->d_inode);
366 } else { 388 } else {
367 memcpy(&file_size, page_virt, sizeof(file_size)); 389 file_size = get_unaligned_be64(page_virt);
368 file_size = be64_to_cpu(file_size);
369 } 390 }
370 i_size_write(dentry->d_inode, (loff_t)file_size); 391 i_size_write(dentry->d_inode, (loff_t)file_size);
371 kmem_cache_free(ecryptfs_header_cache_2, page_virt); 392 kmem_cache_free(ecryptfs_header_cache_2, page_virt);
diff --git a/fs/ecryptfs/keystore.c b/fs/ecryptfs/keystore.c
index e82b457180be..f5b76a331b9c 100644
--- a/fs/ecryptfs/keystore.c
+++ b/fs/ecryptfs/keystore.c
@@ -44,15 +44,15 @@ static int process_request_key_err(long err_code)
44 int rc = 0; 44 int rc = 0;
45 45
46 switch (err_code) { 46 switch (err_code) {
47 case ENOKEY: 47 case -ENOKEY:
48 ecryptfs_printk(KERN_WARNING, "No key\n"); 48 ecryptfs_printk(KERN_WARNING, "No key\n");
49 rc = -ENOENT; 49 rc = -ENOENT;
50 break; 50 break;
51 case EKEYEXPIRED: 51 case -EKEYEXPIRED:
52 ecryptfs_printk(KERN_WARNING, "Key expired\n"); 52 ecryptfs_printk(KERN_WARNING, "Key expired\n");
53 rc = -ETIME; 53 rc = -ETIME;
54 break; 54 break;
55 case EKEYREVOKED: 55 case -EKEYREVOKED:
56 ecryptfs_printk(KERN_WARNING, "Key revoked\n"); 56 ecryptfs_printk(KERN_WARNING, "Key revoked\n");
57 rc = -EINVAL; 57 rc = -EINVAL;
58 break; 58 break;
@@ -963,8 +963,7 @@ int ecryptfs_keyring_auth_tok_for_sig(struct key **auth_tok_key,
963 if (!(*auth_tok_key) || IS_ERR(*auth_tok_key)) { 963 if (!(*auth_tok_key) || IS_ERR(*auth_tok_key)) {
964 printk(KERN_ERR "Could not find key with description: [%s]\n", 964 printk(KERN_ERR "Could not find key with description: [%s]\n",
965 sig); 965 sig);
966 process_request_key_err(PTR_ERR(*auth_tok_key)); 966 rc = process_request_key_err(PTR_ERR(*auth_tok_key));
967 rc = -EINVAL;
968 goto out; 967 goto out;
969 } 968 }
970 (*auth_tok) = ecryptfs_get_key_payload_data(*auth_tok_key); 969 (*auth_tok) = ecryptfs_get_key_payload_data(*auth_tok_key);
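The sign change in the process_request_key_err() switch matters because request_key() reports failure as an ERR_PTR-encoded negative errno, so PTR_ERR() already yields -ENOKEY, -EKEYEXPIRED and friends. A condensed, illustrative sketch of the corrected lookup path (find_auth_tok_key is a made-up wrapper; the request_key() call is reconstructed for context and is not part of the hunk shown):

static int find_auth_tok_key(struct key **auth_tok_key, char *sig)
{
	*auth_tok_key = request_key(&key_type_user, sig, NULL);
	if (!(*auth_tok_key) || IS_ERR(*auth_tok_key)) {
		printk(KERN_ERR "Could not find key with description: [%s]\n",
		       sig);
		/* the switch now matches the negative constants and its
		 * translated error is propagated to the caller */
		return process_request_key_err(PTR_ERR(*auth_tok_key));
	}
	return 0;
}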
diff --git a/fs/ecryptfs/kthread.c b/fs/ecryptfs/kthread.c
new file mode 100644
index 000000000000..c440c6b58b2d
--- /dev/null
+++ b/fs/ecryptfs/kthread.c
@@ -0,0 +1,203 @@
1/**
2 * eCryptfs: Linux filesystem encryption layer
3 *
4 * Copyright (C) 2008 International Business Machines Corp.
5 * Author(s): Michael A. Halcrow <mahalcro@us.ibm.com>
6 *
7 * This program is free software; you can redistribute it and/or
8 * modify it under the terms of the GNU General Public License as
9 * published by the Free Software Foundation; either version 2 of the
10 * License, or (at your option) any later version.
11 *
12 * This program is distributed in the hope that it will be useful, but
13 * WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15 * General Public License for more details.
16 *
17 * You should have received a copy of the GNU General Public License
18 * along with this program; if not, write to the Free Software
19 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA
20 * 02111-1307, USA.
21 */
22
23#include <linux/kthread.h>
24#include <linux/freezer.h>
25#include <linux/wait.h>
26#include <linux/mount.h>
27#include "ecryptfs_kernel.h"
28
29struct kmem_cache *ecryptfs_open_req_cache;
30
31static struct ecryptfs_kthread_ctl {
32#define ECRYPTFS_KTHREAD_ZOMBIE 0x00000001
33 u32 flags;
34 struct mutex mux;
35 struct list_head req_list;
36 wait_queue_head_t wait;
37} ecryptfs_kthread_ctl;
38
39static struct task_struct *ecryptfs_kthread;
40
41/**
42 * ecryptfs_threadfn
43 * @ignored: ignored
44 *
45 * The eCryptfs kernel thread that has the responsibility of getting
46 * the lower persistent file with RW permissions.
47 *
48 * Returns zero on success; non-zero otherwise
49 */
50static int ecryptfs_threadfn(void *ignored)
51{
52 set_freezable();
53 while (1) {
54 struct ecryptfs_open_req *req;
55
56 wait_event_freezable(
57 ecryptfs_kthread_ctl.wait,
58 (!list_empty(&ecryptfs_kthread_ctl.req_list)
59 || kthread_should_stop()));
60 mutex_lock(&ecryptfs_kthread_ctl.mux);
61 if (ecryptfs_kthread_ctl.flags & ECRYPTFS_KTHREAD_ZOMBIE) {
62 mutex_unlock(&ecryptfs_kthread_ctl.mux);
63 goto out;
64 }
65 while (!list_empty(&ecryptfs_kthread_ctl.req_list)) {
66 req = list_first_entry(&ecryptfs_kthread_ctl.req_list,
67 struct ecryptfs_open_req,
68 kthread_ctl_list);
69 mutex_lock(&req->mux);
70 list_del(&req->kthread_ctl_list);
71 if (!(req->flags & ECRYPTFS_REQ_ZOMBIE)) {
72 dget(req->lower_dentry);
73 mntget(req->lower_mnt);
74 (*req->lower_file) = dentry_open(
75 req->lower_dentry, req->lower_mnt,
76 (O_RDWR | O_LARGEFILE));
77 req->flags |= ECRYPTFS_REQ_PROCESSED;
78 }
79 wake_up(&req->wait);
80 mutex_unlock(&req->mux);
81 }
82 mutex_unlock(&ecryptfs_kthread_ctl.mux);
83 }
84out:
85 return 0;
86}
87
88int ecryptfs_init_kthread(void)
89{
90 int rc = 0;
91
92 mutex_init(&ecryptfs_kthread_ctl.mux);
93 init_waitqueue_head(&ecryptfs_kthread_ctl.wait);
94 INIT_LIST_HEAD(&ecryptfs_kthread_ctl.req_list);
95 ecryptfs_kthread = kthread_run(&ecryptfs_threadfn, NULL,
96 "ecryptfs-kthread");
97 if (IS_ERR(ecryptfs_kthread)) {
98 rc = PTR_ERR(ecryptfs_kthread);
99 printk(KERN_ERR "%s: Failed to create kernel thread; rc = [%d]"
100 "\n", __func__, rc);
101 }
102 return rc;
103}
104
105void ecryptfs_destroy_kthread(void)
106{
107 struct ecryptfs_open_req *req;
108
109 mutex_lock(&ecryptfs_kthread_ctl.mux);
110 ecryptfs_kthread_ctl.flags |= ECRYPTFS_KTHREAD_ZOMBIE;
111 list_for_each_entry(req, &ecryptfs_kthread_ctl.req_list,
112 kthread_ctl_list) {
113 mutex_lock(&req->mux);
114 req->flags |= ECRYPTFS_REQ_ZOMBIE;
115 wake_up(&req->wait);
116 mutex_unlock(&req->mux);
117 }
118 mutex_unlock(&ecryptfs_kthread_ctl.mux);
119 kthread_stop(ecryptfs_kthread);
120 wake_up(&ecryptfs_kthread_ctl.wait);
121}
122
123/**
124 * ecryptfs_privileged_open
125 * @lower_file: Result of dentry_open by root on lower dentry
126 * @lower_dentry: Lower dentry for file to open
127 * @lower_mnt: Lower vfsmount for file to open
128 *
 129 * This function gets an r/w file opened against the lower dentry.
130 *
131 * Returns zero on success; non-zero otherwise
132 */
133int ecryptfs_privileged_open(struct file **lower_file,
134 struct dentry *lower_dentry,
135 struct vfsmount *lower_mnt)
136{
137 struct ecryptfs_open_req *req;
138 int rc = 0;
139
140	/* Corresponding dput() and mntput() are done when the
141	 * persistent file is fput() as the eCryptfs inode is
142	 * destroyed. */
143 dget(lower_dentry);
144 mntget(lower_mnt);
145 (*lower_file) = dentry_open(lower_dentry, lower_mnt,
146 (O_RDWR | O_LARGEFILE));
147 if (!IS_ERR(*lower_file))
148 goto out;
149 req = kmem_cache_alloc(ecryptfs_open_req_cache, GFP_KERNEL);
150 if (!req) {
151 rc = -ENOMEM;
152 goto out;
153 }
154 mutex_init(&req->mux);
155 req->lower_file = lower_file;
156 req->lower_dentry = lower_dentry;
157 req->lower_mnt = lower_mnt;
158 init_waitqueue_head(&req->wait);
159 req->flags = 0;
160 mutex_lock(&ecryptfs_kthread_ctl.mux);
161 if (ecryptfs_kthread_ctl.flags & ECRYPTFS_KTHREAD_ZOMBIE) {
162 rc = -EIO;
163 mutex_unlock(&ecryptfs_kthread_ctl.mux);
164 printk(KERN_ERR "%s: We are in the middle of shutting down; "
165 "aborting privileged request to open lower file\n",
166 __func__);
167 goto out_free;
168 }
169 list_add_tail(&req->kthread_ctl_list, &ecryptfs_kthread_ctl.req_list);
170 mutex_unlock(&ecryptfs_kthread_ctl.mux);
171 wake_up(&ecryptfs_kthread_ctl.wait);
172 wait_event(req->wait, (req->flags != 0));
173 mutex_lock(&req->mux);
174 BUG_ON(req->flags == 0);
175 if (req->flags & ECRYPTFS_REQ_DROPPED
176 || req->flags & ECRYPTFS_REQ_ZOMBIE) {
177 rc = -EIO;
178 printk(KERN_WARNING "%s: Privileged open request dropped\n",
179 __func__);
180 goto out_unlock;
181 }
182 if (IS_ERR(*req->lower_file)) {
183 rc = PTR_ERR(*req->lower_file);
184 dget(lower_dentry);
185 mntget(lower_mnt);
186 (*lower_file) = dentry_open(lower_dentry, lower_mnt,
187 (O_RDONLY | O_LARGEFILE));
188 if (IS_ERR(*lower_file)) {
189 rc = PTR_ERR(*req->lower_file);
190 (*lower_file) = NULL;
191 printk(KERN_WARNING "%s: Error attempting privileged "
192 "open of lower file with either RW or RO "
193 "perms; rc = [%d]. Giving up.\n",
194 __func__, rc);
195 }
196 }
197out_unlock:
198 mutex_unlock(&req->mux);
199out_free:
200 kmem_cache_free(ecryptfs_open_req_cache, req);
201out:
202 return rc;
203}
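
The request handshake above follows a standard pattern: ecryptfs_privileged_open() queues a request, wakes the kthread, and sleeps in wait_event() until req->flags becomes non-zero, while the kthread sets ECRYPTFS_REQ_PROCESSED (or the shutdown path sets ECRYPTFS_REQ_ZOMBIE) before waking the waiter. For readers who find the flow easier to follow without the freezer and shutdown handling, a rough userspace analogue with pthreads; open_req and REQ_PROCESSED below are illustrative names, not kernel symbols.

/* build: cc handshake.c -lpthread */
#include <pthread.h>
#include <stdio.h>

#define REQ_PROCESSED 0x1

struct open_req {
	unsigned int flags;
	pthread_mutex_t mux;
	pthread_cond_t wait;
};

static void *worker(void *arg)
{
	struct open_req *req = arg;

	pthread_mutex_lock(&req->mux);
	/* ... do the privileged work here ... */
	req->flags |= REQ_PROCESSED;		/* like ECRYPTFS_REQ_PROCESSED */
	pthread_cond_signal(&req->wait);	/* like wake_up(&req->wait) */
	pthread_mutex_unlock(&req->mux);
	return NULL;
}

int main(void)
{
	struct open_req req = { 0, PTHREAD_MUTEX_INITIALIZER,
				PTHREAD_COND_INITIALIZER };
	pthread_t t;

	pthread_create(&t, NULL, worker, &req);
	pthread_mutex_lock(&req.mux);
	while (req.flags == 0)			/* wait_event(req->wait, flags != 0) */
		pthread_cond_wait(&req.wait, &req.mux);
	pthread_mutex_unlock(&req.mux);
	pthread_join(t, NULL);
	printf("request processed: flags=0x%x\n", req.flags);
	return 0;
}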
diff --git a/fs/ecryptfs/main.c b/fs/ecryptfs/main.c
index d603631601eb..6f403cfba14f 100644
--- a/fs/ecryptfs/main.c
+++ b/fs/ecryptfs/main.c
@@ -117,7 +117,7 @@ void __ecryptfs_printk(const char *fmt, ...)
117 * 117 *
118 * Returns zero on success; non-zero otherwise 118 * Returns zero on success; non-zero otherwise
119 */ 119 */
120static int ecryptfs_init_persistent_file(struct dentry *ecryptfs_dentry) 120int ecryptfs_init_persistent_file(struct dentry *ecryptfs_dentry)
121{ 121{
122 struct ecryptfs_inode_info *inode_info = 122 struct ecryptfs_inode_info *inode_info =
123 ecryptfs_inode_to_private(ecryptfs_dentry->d_inode); 123 ecryptfs_inode_to_private(ecryptfs_dentry->d_inode);
@@ -130,26 +130,12 @@ static int ecryptfs_init_persistent_file(struct dentry *ecryptfs_dentry)
130 ecryptfs_dentry_to_lower_mnt(ecryptfs_dentry); 130 ecryptfs_dentry_to_lower_mnt(ecryptfs_dentry);
131 131
132 lower_dentry = ecryptfs_dentry_to_lower(ecryptfs_dentry); 132 lower_dentry = ecryptfs_dentry_to_lower(ecryptfs_dentry);
133 /* Corresponding dput() and mntput() are done when the 133 rc = ecryptfs_privileged_open(&inode_info->lower_file,
134 * persistent file is fput() when the eCryptfs inode 134 lower_dentry, lower_mnt);
135 * is destroyed. */ 135 if (rc || IS_ERR(inode_info->lower_file)) {
136 dget(lower_dentry);
137 mntget(lower_mnt);
138 inode_info->lower_file = dentry_open(lower_dentry,
139 lower_mnt,
140 (O_RDWR | O_LARGEFILE));
141 if (IS_ERR(inode_info->lower_file)) {
142 dget(lower_dentry);
143 mntget(lower_mnt);
144 inode_info->lower_file = dentry_open(lower_dentry,
145 lower_mnt,
146 (O_RDONLY
147 | O_LARGEFILE));
148 }
149 if (IS_ERR(inode_info->lower_file)) {
150 printk(KERN_ERR "Error opening lower persistent file " 136 printk(KERN_ERR "Error opening lower persistent file "
151 "for lower_dentry [0x%p] and lower_mnt [0x%p]\n", 137 "for lower_dentry [0x%p] and lower_mnt [0x%p]; "
152 lower_dentry, lower_mnt); 138 "rc = [%d]\n", lower_dentry, lower_mnt, rc);
153 rc = PTR_ERR(inode_info->lower_file); 139 rc = PTR_ERR(inode_info->lower_file);
154 inode_info->lower_file = NULL; 140 inode_info->lower_file = NULL;
155 } 141 }
@@ -163,14 +149,14 @@ static int ecryptfs_init_persistent_file(struct dentry *ecryptfs_dentry)
163 * @lower_dentry: Existing dentry in the lower filesystem 149 * @lower_dentry: Existing dentry in the lower filesystem
164 * @dentry: ecryptfs' dentry 150 * @dentry: ecryptfs' dentry
165 * @sb: ecryptfs's super_block 151 * @sb: ecryptfs's super_block
166 * @flag: If set to true, then d_add is called, else d_instantiate is called 152 * @flags: flags to govern behavior of interpose procedure
167 * 153 *
168 * Interposes upper and lower dentries. 154 * Interposes upper and lower dentries.
169 * 155 *
170 * Returns zero on success; non-zero otherwise 156 * Returns zero on success; non-zero otherwise
171 */ 157 */
172int ecryptfs_interpose(struct dentry *lower_dentry, struct dentry *dentry, 158int ecryptfs_interpose(struct dentry *lower_dentry, struct dentry *dentry,
173 struct super_block *sb, int flag) 159 struct super_block *sb, u32 flags)
174{ 160{
175 struct inode *lower_inode; 161 struct inode *lower_inode;
176 struct inode *inode; 162 struct inode *inode;
@@ -207,7 +193,7 @@ int ecryptfs_interpose(struct dentry *lower_dentry, struct dentry *dentry,
207 init_special_inode(inode, lower_inode->i_mode, 193 init_special_inode(inode, lower_inode->i_mode,
208 lower_inode->i_rdev); 194 lower_inode->i_rdev);
209 dentry->d_op = &ecryptfs_dops; 195 dentry->d_op = &ecryptfs_dops;
210 if (flag) 196 if (flags & ECRYPTFS_INTERPOSE_FLAG_D_ADD)
211 d_add(dentry, inode); 197 d_add(dentry, inode);
212 else 198 else
213 d_instantiate(dentry, inode); 199 d_instantiate(dentry, inode);
@@ -215,13 +201,6 @@ int ecryptfs_interpose(struct dentry *lower_dentry, struct dentry *dentry,
215 /* This size will be overwritten for real files w/ headers and 201 /* This size will be overwritten for real files w/ headers and
216 * other metadata */ 202 * other metadata */
217 fsstack_copy_inode_size(inode, lower_inode); 203 fsstack_copy_inode_size(inode, lower_inode);
218 rc = ecryptfs_init_persistent_file(dentry);
219 if (rc) {
220 printk(KERN_ERR "%s: Error attempting to initialize the "
221 "persistent file for the dentry with name [%s]; "
222 "rc = [%d]\n", __func__, dentry->d_name.name, rc);
223 goto out;
224 }
225out: 204out:
226 return rc; 205 return rc;
227} 206}
@@ -262,10 +241,11 @@ static int ecryptfs_init_global_auth_toks(
262 "session keyring for sig specified in mount " 241 "session keyring for sig specified in mount "
263 "option: [%s]\n", global_auth_tok->sig); 242 "option: [%s]\n", global_auth_tok->sig);
264 global_auth_tok->flags |= ECRYPTFS_AUTH_TOK_INVALID; 243 global_auth_tok->flags |= ECRYPTFS_AUTH_TOK_INVALID;
265 rc = 0; 244 goto out;
266 } else 245 } else
267 global_auth_tok->flags &= ~ECRYPTFS_AUTH_TOK_INVALID; 246 global_auth_tok->flags &= ~ECRYPTFS_AUTH_TOK_INVALID;
268 } 247 }
248out:
269 return rc; 249 return rc;
270} 250}
271 251
@@ -314,7 +294,6 @@ static int ecryptfs_parse_options(struct super_block *sb, char *options)
314 char *cipher_name_dst; 294 char *cipher_name_dst;
315 char *cipher_name_src; 295 char *cipher_name_src;
316 char *cipher_key_bytes_src; 296 char *cipher_key_bytes_src;
317 int cipher_name_len;
318 297
319 if (!options) { 298 if (!options) {
320 rc = -EINVAL; 299 rc = -EINVAL;
@@ -395,17 +374,12 @@ static int ecryptfs_parse_options(struct super_block *sb, char *options)
395 goto out; 374 goto out;
396 } 375 }
397 if (!cipher_name_set) { 376 if (!cipher_name_set) {
398 cipher_name_len = strlen(ECRYPTFS_DEFAULT_CIPHER); 377 int cipher_name_len = strlen(ECRYPTFS_DEFAULT_CIPHER);
399 if (unlikely(cipher_name_len 378
400 >= ECRYPTFS_MAX_CIPHER_NAME_SIZE)) { 379 BUG_ON(cipher_name_len >= ECRYPTFS_MAX_CIPHER_NAME_SIZE);
401 rc = -EINVAL; 380
402 BUG(); 381 strcpy(mount_crypt_stat->global_default_cipher_name,
403 goto out; 382 ECRYPTFS_DEFAULT_CIPHER);
404 }
405 memcpy(mount_crypt_stat->global_default_cipher_name,
406 ECRYPTFS_DEFAULT_CIPHER, cipher_name_len);
407 mount_crypt_stat->global_default_cipher_name[cipher_name_len]
408 = '\0';
409 } 383 }
410 if (!cipher_key_bytes_set) { 384 if (!cipher_key_bytes_set) {
411 mount_crypt_stat->global_default_cipher_key_size = 0; 385 mount_crypt_stat->global_default_cipher_key_size = 0;
@@ -430,7 +404,6 @@ static int ecryptfs_parse_options(struct super_block *sb, char *options)
430 printk(KERN_WARNING "One or more global auth toks could not " 404 printk(KERN_WARNING "One or more global auth toks could not "
431 "properly register; rc = [%d]\n", rc); 405 "properly register; rc = [%d]\n", rc);
432 } 406 }
433 rc = 0;
434out: 407out:
435 return rc; 408 return rc;
436} 409}
@@ -679,6 +652,11 @@ static struct ecryptfs_cache_info {
679 .name = "ecryptfs_key_tfm_cache", 652 .name = "ecryptfs_key_tfm_cache",
680 .size = sizeof(struct ecryptfs_key_tfm), 653 .size = sizeof(struct ecryptfs_key_tfm),
681 }, 654 },
655 {
656 .cache = &ecryptfs_open_req_cache,
657 .name = "ecryptfs_open_req_cache",
658 .size = sizeof(struct ecryptfs_open_req),
659 },
682}; 660};
683 661
684static void ecryptfs_free_kmem_caches(void) 662static void ecryptfs_free_kmem_caches(void)
@@ -795,11 +773,17 @@ static int __init ecryptfs_init(void)
795 printk(KERN_ERR "sysfs registration failed\n"); 773 printk(KERN_ERR "sysfs registration failed\n");
796 goto out_unregister_filesystem; 774 goto out_unregister_filesystem;
797 } 775 }
776 rc = ecryptfs_init_kthread();
777 if (rc) {
778 printk(KERN_ERR "%s: kthread initialization failed; "
779 "rc = [%d]\n", __func__, rc);
780 goto out_do_sysfs_unregistration;
781 }
798 rc = ecryptfs_init_messaging(ecryptfs_transport); 782 rc = ecryptfs_init_messaging(ecryptfs_transport);
799 if (rc) { 783 if (rc) {
800 ecryptfs_printk(KERN_ERR, "Failure occured while attempting to " 784 printk(KERN_ERR "Failure occurred while attempting to "
801 "initialize the eCryptfs netlink socket\n"); 785 "initialize the eCryptfs netlink socket\n");
802 goto out_do_sysfs_unregistration; 786 goto out_destroy_kthread;
803 } 787 }
804 rc = ecryptfs_init_crypto(); 788 rc = ecryptfs_init_crypto();
805 if (rc) { 789 if (rc) {
@@ -814,6 +798,8 @@ static int __init ecryptfs_init(void)
814 goto out; 798 goto out;
815out_release_messaging: 799out_release_messaging:
816 ecryptfs_release_messaging(ecryptfs_transport); 800 ecryptfs_release_messaging(ecryptfs_transport);
801out_destroy_kthread:
802 ecryptfs_destroy_kthread();
817out_do_sysfs_unregistration: 803out_do_sysfs_unregistration:
818 do_sysfs_unregistration(); 804 do_sysfs_unregistration();
819out_unregister_filesystem: 805out_unregister_filesystem:
@@ -833,6 +819,7 @@ static void __exit ecryptfs_exit(void)
833 printk(KERN_ERR "Failure whilst attempting to destroy crypto; " 819 printk(KERN_ERR "Failure whilst attempting to destroy crypto; "
834 "rc = [%d]\n", rc); 820 "rc = [%d]\n", rc);
835 ecryptfs_release_messaging(ecryptfs_transport); 821 ecryptfs_release_messaging(ecryptfs_transport);
822 ecryptfs_destroy_kthread();
836 do_sysfs_unregistration(); 823 do_sysfs_unregistration();
837 unregister_filesystem(&ecryptfs_fs_type); 824 unregister_filesystem(&ecryptfs_fs_type);
838 ecryptfs_free_kmem_caches(); 825 ecryptfs_free_kmem_caches();
diff --git a/fs/ecryptfs/miscdev.c b/fs/ecryptfs/miscdev.c
index 09a4522f65e6..b484792a0996 100644
--- a/fs/ecryptfs/miscdev.c
+++ b/fs/ecryptfs/miscdev.c
@@ -358,46 +358,6 @@ out_unlock_daemon:
358} 358}
359 359
360/** 360/**
361 * ecryptfs_miscdev_helo
362 * @euid: effective user id of miscdevess sending helo packet
363 * @user_ns: The namespace in which @euid applies
364 * @pid: miscdevess id of miscdevess sending helo packet
365 *
366 * Returns zero on success; non-zero otherwise
367 */
368static int ecryptfs_miscdev_helo(uid_t euid, struct user_namespace *user_ns,
369 struct pid *pid)
370{
371 int rc;
372
373 rc = ecryptfs_process_helo(ECRYPTFS_TRANSPORT_MISCDEV, euid, user_ns,
374 pid);
375 if (rc)
376 printk(KERN_WARNING "Error processing HELO; rc = [%d]\n", rc);
377 return rc;
378}
379
380/**
381 * ecryptfs_miscdev_quit
382 * @euid: effective user id of miscdevess sending quit packet
383 * @user_ns: The namespace in which @euid applies
384 * @pid: miscdevess id of miscdevess sending quit packet
385 *
386 * Returns zero on success; non-zero otherwise
387 */
388static int ecryptfs_miscdev_quit(uid_t euid, struct user_namespace *user_ns,
389 struct pid *pid)
390{
391 int rc;
392
393 rc = ecryptfs_process_quit(euid, user_ns, pid);
394 if (rc)
395 printk(KERN_WARNING
396 "Error processing QUIT message; rc = [%d]\n", rc);
397 return rc;
398}
399
400/**
401 * ecryptfs_miscdev_response - miscdevess response to message previously sent to daemon 361 * ecryptfs_miscdev_response - miscdevess response to message previously sent to daemon
402 * @data: Bytes comprising struct ecryptfs_message 362 * @data: Bytes comprising struct ecryptfs_message
403 * @data_size: sizeof(struct ecryptfs_message) + data len 363 * @data_size: sizeof(struct ecryptfs_message) + data len
@@ -512,26 +472,7 @@ ecryptfs_miscdev_write(struct file *file, const char __user *buf,
512 __func__, rc); 472 __func__, rc);
513 break; 473 break;
514 case ECRYPTFS_MSG_HELO: 474 case ECRYPTFS_MSG_HELO:
515 rc = ecryptfs_miscdev_helo(current->euid,
516 current->nsproxy->user_ns,
517 task_pid(current));
518 if (rc) {
519 printk(KERN_ERR "%s: Error attempting to process "
520 "helo from pid [0x%p]; rc = [%d]\n", __func__,
521 task_pid(current), rc);
522 goto out_free;
523 }
524 break;
525 case ECRYPTFS_MSG_QUIT: 475 case ECRYPTFS_MSG_QUIT:
526 rc = ecryptfs_miscdev_quit(current->euid,
527 current->nsproxy->user_ns,
528 task_pid(current));
529 if (rc) {
530 printk(KERN_ERR "%s: Error attempting to process "
531 "quit from pid [0x%p]; rc = [%d]\n", __func__,
532 task_pid(current), rc);
533 goto out_free;
534 }
535 break; 476 break;
536 default: 477 default:
537 ecryptfs_printk(KERN_WARNING, "Dropping miscdev " 478 ecryptfs_printk(KERN_WARNING, "Dropping miscdev "
diff --git a/fs/ecryptfs/mmap.c b/fs/ecryptfs/mmap.c
index 2b6fe1e6e8ba..245c2dc02d5c 100644
--- a/fs/ecryptfs/mmap.c
+++ b/fs/ecryptfs/mmap.c
@@ -32,6 +32,7 @@
32#include <linux/file.h> 32#include <linux/file.h>
33#include <linux/crypto.h> 33#include <linux/crypto.h>
34#include <linux/scatterlist.h> 34#include <linux/scatterlist.h>
35#include <asm/unaligned.h>
35#include "ecryptfs_kernel.h" 36#include "ecryptfs_kernel.h"
36 37
37/** 38/**
@@ -372,7 +373,6 @@ out:
372 */ 373 */
373static int ecryptfs_write_inode_size_to_header(struct inode *ecryptfs_inode) 374static int ecryptfs_write_inode_size_to_header(struct inode *ecryptfs_inode)
374{ 375{
375 u64 file_size;
376 char *file_size_virt; 376 char *file_size_virt;
377 int rc; 377 int rc;
378 378
@@ -381,9 +381,7 @@ static int ecryptfs_write_inode_size_to_header(struct inode *ecryptfs_inode)
381 rc = -ENOMEM; 381 rc = -ENOMEM;
382 goto out; 382 goto out;
383 } 383 }
384 file_size = (u64)i_size_read(ecryptfs_inode); 384 put_unaligned_be64(i_size_read(ecryptfs_inode), file_size_virt);
385 file_size = cpu_to_be64(file_size);
386 memcpy(file_size_virt, &file_size, sizeof(u64));
387 rc = ecryptfs_write_lower(ecryptfs_inode, file_size_virt, 0, 385 rc = ecryptfs_write_lower(ecryptfs_inode, file_size_virt, 0,
388 sizeof(u64)); 386 sizeof(u64));
389 kfree(file_size_virt); 387 kfree(file_size_virt);
@@ -403,7 +401,6 @@ static int ecryptfs_write_inode_size_to_xattr(struct inode *ecryptfs_inode)
403 struct dentry *lower_dentry = 401 struct dentry *lower_dentry =
404 ecryptfs_inode_to_private(ecryptfs_inode)->lower_file->f_dentry; 402 ecryptfs_inode_to_private(ecryptfs_inode)->lower_file->f_dentry;
405 struct inode *lower_inode = lower_dentry->d_inode; 403 struct inode *lower_inode = lower_dentry->d_inode;
406 u64 file_size;
407 int rc; 404 int rc;
408 405
409 if (!lower_inode->i_op->getxattr || !lower_inode->i_op->setxattr) { 406 if (!lower_inode->i_op->getxattr || !lower_inode->i_op->setxattr) {
@@ -424,9 +421,7 @@ static int ecryptfs_write_inode_size_to_xattr(struct inode *ecryptfs_inode)
424 xattr_virt, PAGE_CACHE_SIZE); 421 xattr_virt, PAGE_CACHE_SIZE);
425 if (size < 0) 422 if (size < 0)
426 size = 8; 423 size = 8;
427 file_size = (u64)i_size_read(ecryptfs_inode); 424 put_unaligned_be64(i_size_read(ecryptfs_inode), xattr_virt);
428 file_size = cpu_to_be64(file_size);
429 memcpy(xattr_virt, &file_size, sizeof(u64));
430 rc = lower_inode->i_op->setxattr(lower_dentry, ECRYPTFS_XATTR_NAME, 425 rc = lower_inode->i_op->setxattr(lower_dentry, ECRYPTFS_XATTR_NAME,
431 xattr_virt, size, 0); 426 xattr_virt, size, 0);
432 mutex_unlock(&lower_inode->i_mutex); 427 mutex_unlock(&lower_inode->i_mutex);
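
The two mmap.c hunks swap the open-coded cpu_to_be64() plus memcpy() sequence for put_unaligned_be64(), which stores the 64-bit i_size big-endian at a destination that need not be 8-byte aligned. A small standalone sketch of the equivalent byte layout, using a hypothetical helper name rather than the kernel's <asm/unaligned.h> implementation:

#include <stdint.h>
#include <stdio.h>

/* Userspace sketch of what the kernel helper does: write val big-endian,
 * byte by byte, so alignment of the destination does not matter. */
static void put_unaligned_be64_sketch(uint64_t val, void *p)
{
	unsigned char *b = p;
	int i;

	for (i = 0; i < 8; i++)
		b[i] = (unsigned char)(val >> (56 - 8 * i));
}

int main(void)
{
	unsigned char buf[8];

	put_unaligned_be64_sketch(0x0123456789abcdefULL, buf);
	printf("%02x %02x ... %02x\n", buf[0], buf[1], buf[7]);
	return 0;
}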
diff --git a/fs/eventfd.c b/fs/eventfd.c
index 343942deeec1..08bf558d0408 100644
--- a/fs/eventfd.c
+++ b/fs/eventfd.c
@@ -198,11 +198,18 @@ struct file *eventfd_fget(int fd)
198 return file; 198 return file;
199} 199}
200 200
201asmlinkage long sys_eventfd(unsigned int count) 201asmlinkage long sys_eventfd2(unsigned int count, int flags)
202{ 202{
203 int fd; 203 int fd;
204 struct eventfd_ctx *ctx; 204 struct eventfd_ctx *ctx;
205 205
206 /* Check the EFD_* constants for consistency. */
207 BUILD_BUG_ON(EFD_CLOEXEC != O_CLOEXEC);
208 BUILD_BUG_ON(EFD_NONBLOCK != O_NONBLOCK);
209
210 if (flags & ~(EFD_CLOEXEC | EFD_NONBLOCK))
211 return -EINVAL;
212
206 ctx = kmalloc(sizeof(*ctx), GFP_KERNEL); 213 ctx = kmalloc(sizeof(*ctx), GFP_KERNEL);
207 if (!ctx) 214 if (!ctx)
208 return -ENOMEM; 215 return -ENOMEM;
@@ -214,9 +221,15 @@ asmlinkage long sys_eventfd(unsigned int count)
214 * When we call this, the initialization must be complete, since 221 * When we call this, the initialization must be complete, since
215 * anon_inode_getfd() will install the fd. 222 * anon_inode_getfd() will install the fd.
216 */ 223 */
217 fd = anon_inode_getfd("[eventfd]", &eventfd_fops, ctx); 224 fd = anon_inode_getfd("[eventfd]", &eventfd_fops, ctx,
225 flags & (O_CLOEXEC | O_NONBLOCK));
218 if (fd < 0) 226 if (fd < 0)
219 kfree(ctx); 227 kfree(ctx);
220 return fd; 228 return fd;
221} 229}
222 230
231asmlinkage long sys_eventfd(unsigned int count)
232{
233 return sys_eventfd2(count, 0);
234}
235
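
sys_eventfd2() rejects anything outside EFD_CLOEXEC | EFD_NONBLOCK, and the BUILD_BUG_ON() lines pin those constants to O_CLOEXEC and O_NONBLOCK so they can be passed straight through to anon_inode_getfd(). A minimal userspace sketch, assuming a libc that already exposes the flag-taking eventfd() wrapper (older libcs would need syscall(__NR_eventfd2, ...)):

#include <sys/eventfd.h>
#include <stdint.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	uint64_t val = 1;
	int fd = eventfd(0, EFD_CLOEXEC | EFD_NONBLOCK);

	if (fd < 0) {
		perror("eventfd");
		return 1;
	}
	if (write(fd, &val, sizeof(val)) != sizeof(val))	/* post one event */
		perror("write");
	if (read(fd, &val, sizeof(val)) == sizeof(val))		/* drains counter */
		printf("counter read back: %llu\n", (unsigned long long)val);
	close(fd);
	return 0;
}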
diff --git a/fs/eventpoll.c b/fs/eventpoll.c
index 990c01d2d66b..0c87474f7917 100644
--- a/fs/eventpoll.c
+++ b/fs/eventpoll.c
@@ -1046,20 +1046,25 @@ retry:
1046 * RB tree. With the current implementation, the "size" parameter is ignored 1046 * RB tree. With the current implementation, the "size" parameter is ignored
1047 * (besides sanity checks). 1047 * (besides sanity checks).
1048 */ 1048 */
1049asmlinkage long sys_epoll_create(int size) 1049asmlinkage long sys_epoll_create1(int flags)
1050{ 1050{
1051 int error, fd = -1; 1051 int error, fd = -1;
1052 struct eventpoll *ep; 1052 struct eventpoll *ep;
1053 1053
1054 /* Check the EPOLL_* constant for consistency. */
1055 BUILD_BUG_ON(EPOLL_CLOEXEC != O_CLOEXEC);
1056
1057 if (flags & ~EPOLL_CLOEXEC)
1058 return -EINVAL;
1059
1054 DNPRINTK(3, (KERN_INFO "[%p] eventpoll: sys_epoll_create(%d)\n", 1060 DNPRINTK(3, (KERN_INFO "[%p] eventpoll: sys_epoll_create(%d)\n",
1055 current, size)); 1061 current, flags));
1056 1062
1057 /* 1063 /*
1058 * Sanity check on the size parameter, and create the internal data 1064 * Create the internal data structure ( "struct eventpoll" ).
1059 * structure ( "struct eventpoll" ).
1060 */ 1065 */
1061 error = -EINVAL; 1066 error = ep_alloc(&ep);
1062 if (size <= 0 || (error = ep_alloc(&ep)) < 0) { 1067 if (error < 0) {
1063 fd = error; 1068 fd = error;
1064 goto error_return; 1069 goto error_return;
1065 } 1070 }
@@ -1068,17 +1073,26 @@ asmlinkage long sys_epoll_create(int size)
1068 * Creates all the items needed to setup an eventpoll file. That is, 1073 * Creates all the items needed to setup an eventpoll file. That is,
1069 * a file structure and a free file descriptor. 1074 * a file structure and a free file descriptor.
1070 */ 1075 */
1071 fd = anon_inode_getfd("[eventpoll]", &eventpoll_fops, ep); 1076 fd = anon_inode_getfd("[eventpoll]", &eventpoll_fops, ep,
1077 flags & O_CLOEXEC);
1072 if (fd < 0) 1078 if (fd < 0)
1073 ep_free(ep); 1079 ep_free(ep);
1074 1080
1075error_return: 1081error_return:
1076 DNPRINTK(3, (KERN_INFO "[%p] eventpoll: sys_epoll_create(%d) = %d\n", 1082 DNPRINTK(3, (KERN_INFO "[%p] eventpoll: sys_epoll_create(%d) = %d\n",
1077 current, size, fd)); 1083 current, flags, fd));
1078 1084
1079 return fd; 1085 return fd;
1080} 1086}
1081 1087
1088asmlinkage long sys_epoll_create(int size)
1089{
1090 if (size < 0)
1091 return -EINVAL;
1092
1093 return sys_epoll_create1(0);
1094}
1095
1082/* 1096/*
1083 * The following function implements the controller interface for 1097 * The following function implements the controller interface for
1084 * the eventpoll file that enables the insertion/removal/change of 1098 * the eventpoll file that enables the insertion/removal/change of
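
sys_epoll_create1() drops the long-ignored size hint and takes EPOLL_CLOEXEC instead, while sys_epoll_create() keeps rejecting size < 0 and then forwards to the new call. A usage sketch, assuming a libc wrapper named epoll_create1():

#include <sys/epoll.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	/* The descriptor is created with close-on-exec already set, so there
	 * is no window for a concurrent fork()/exec() to leak it. */
	int epfd = epoll_create1(EPOLL_CLOEXEC);

	if (epfd < 0) {
		perror("epoll_create1");
		return 1;
	}
	close(epfd);
	return 0;
}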
diff --git a/fs/exec.c b/fs/exec.c
index fd9234379e8d..190ed1f92774 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -541,7 +541,7 @@ static int shift_arg_pages(struct vm_area_struct *vma, unsigned long shift)
541 /* 541 /*
542 * when the old and new regions overlap clear from new_end. 542 * when the old and new regions overlap clear from new_end.
543 */ 543 */
544 free_pgd_range(&tlb, new_end, old_end, new_end, 544 free_pgd_range(tlb, new_end, old_end, new_end,
545 vma->vm_next ? vma->vm_next->vm_start : 0); 545 vma->vm_next ? vma->vm_next->vm_start : 0);
546 } else { 546 } else {
547 /* 547 /*
@@ -550,7 +550,7 @@ static int shift_arg_pages(struct vm_area_struct *vma, unsigned long shift)
550 * have constraints on va-space that make this illegal (IA64) - 550 * have constraints on va-space that make this illegal (IA64) -
551 * for the others its just a little faster. 551 * for the others its just a little faster.
552 */ 552 */
553 free_pgd_range(&tlb, old_start, old_end, new_end, 553 free_pgd_range(tlb, old_start, old_end, new_end,
554 vma->vm_next ? vma->vm_next->vm_start : 0); 554 vma->vm_next ? vma->vm_next->vm_start : 0);
555 } 555 }
556 tlb_finish_mmu(tlb, new_end, old_end); 556 tlb_finish_mmu(tlb, new_end, old_end);
diff --git a/fs/fcntl.c b/fs/fcntl.c
index 330a7d782591..9679fcbdeaa0 100644
--- a/fs/fcntl.c
+++ b/fs/fcntl.c
@@ -125,13 +125,16 @@ static int dupfd(struct file *file, unsigned int start, int cloexec)
125 return fd; 125 return fd;
126} 126}
127 127
128asmlinkage long sys_dup2(unsigned int oldfd, unsigned int newfd) 128asmlinkage long sys_dup3(unsigned int oldfd, unsigned int newfd, int flags)
129{ 129{
130 int err = -EBADF; 130 int err = -EBADF;
131 struct file * file, *tofree; 131 struct file * file, *tofree;
132 struct files_struct * files = current->files; 132 struct files_struct * files = current->files;
133 struct fdtable *fdt; 133 struct fdtable *fdt;
134 134
135 if ((flags & ~O_CLOEXEC) != 0)
136 return -EINVAL;
137
135 spin_lock(&files->file_lock); 138 spin_lock(&files->file_lock);
136 if (!(file = fcheck(oldfd))) 139 if (!(file = fcheck(oldfd)))
137 goto out_unlock; 140 goto out_unlock;
@@ -163,7 +166,10 @@ asmlinkage long sys_dup2(unsigned int oldfd, unsigned int newfd)
163 166
164 rcu_assign_pointer(fdt->fd[newfd], file); 167 rcu_assign_pointer(fdt->fd[newfd], file);
165 FD_SET(newfd, fdt->open_fds); 168 FD_SET(newfd, fdt->open_fds);
166 FD_CLR(newfd, fdt->close_on_exec); 169 if (flags & O_CLOEXEC)
170 FD_SET(newfd, fdt->close_on_exec);
171 else
172 FD_CLR(newfd, fdt->close_on_exec);
167 spin_unlock(&files->file_lock); 173 spin_unlock(&files->file_lock);
168 174
169 if (tofree) 175 if (tofree)
@@ -181,6 +187,11 @@ out_fput:
181 goto out; 187 goto out;
182} 188}
183 189
190asmlinkage long sys_dup2(unsigned int oldfd, unsigned int newfd)
191{
192 return sys_dup3(oldfd, newfd, 0);
193}
194
184asmlinkage long sys_dup(unsigned int fildes) 195asmlinkage long sys_dup(unsigned int fildes)
185{ 196{
186 int ret = -EBADF; 197 int ret = -EBADF;
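
sys_dup3() is sys_dup2() plus a flags argument; the only accepted flag is O_CLOEXEC, which sets the close-on-exec bit on newfd inside the same file_lock critical section instead of requiring a separate fcntl(). A userspace sketch, assuming a libc that exposes dup3():

#define _GNU_SOURCE
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	/* Duplicate stdout onto descriptor 10 with close-on-exec set
	 * atomically, avoiding the dup2() + fcntl(F_SETFD) race. */
	int fd = dup3(STDOUT_FILENO, 10, O_CLOEXEC);

	if (fd < 0) {
		perror("dup3");
		return 1;
	}
	dprintf(fd, "hello via dup3\n");
	close(fd);
	return 0;
}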
diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c
index aeabf80f81a5..dbd01d262ca4 100644
--- a/fs/hugetlbfs/inode.c
+++ b/fs/hugetlbfs/inode.c
@@ -53,6 +53,7 @@ int sysctl_hugetlb_shm_group;
53enum { 53enum {
54 Opt_size, Opt_nr_inodes, 54 Opt_size, Opt_nr_inodes,
55 Opt_mode, Opt_uid, Opt_gid, 55 Opt_mode, Opt_uid, Opt_gid,
56 Opt_pagesize,
56 Opt_err, 57 Opt_err,
57}; 58};
58 59
@@ -62,6 +63,7 @@ static match_table_t tokens = {
62 {Opt_mode, "mode=%o"}, 63 {Opt_mode, "mode=%o"},
63 {Opt_uid, "uid=%u"}, 64 {Opt_uid, "uid=%u"},
64 {Opt_gid, "gid=%u"}, 65 {Opt_gid, "gid=%u"},
66 {Opt_pagesize, "pagesize=%s"},
65 {Opt_err, NULL}, 67 {Opt_err, NULL},
66}; 68};
67 69
@@ -80,6 +82,7 @@ static int hugetlbfs_file_mmap(struct file *file, struct vm_area_struct *vma)
80 struct inode *inode = file->f_path.dentry->d_inode; 82 struct inode *inode = file->f_path.dentry->d_inode;
81 loff_t len, vma_len; 83 loff_t len, vma_len;
82 int ret; 84 int ret;
85 struct hstate *h = hstate_file(file);
83 86
84 /* 87 /*
85 * vma address alignment (but not the pgoff alignment) has 88 * vma address alignment (but not the pgoff alignment) has
@@ -92,7 +95,7 @@ static int hugetlbfs_file_mmap(struct file *file, struct vm_area_struct *vma)
92 vma->vm_flags |= VM_HUGETLB | VM_RESERVED; 95 vma->vm_flags |= VM_HUGETLB | VM_RESERVED;
93 vma->vm_ops = &hugetlb_vm_ops; 96 vma->vm_ops = &hugetlb_vm_ops;
94 97
95 if (vma->vm_pgoff & ~(HPAGE_MASK >> PAGE_SHIFT)) 98 if (vma->vm_pgoff & ~(huge_page_mask(h) >> PAGE_SHIFT))
96 return -EINVAL; 99 return -EINVAL;
97 100
98 vma_len = (loff_t)(vma->vm_end - vma->vm_start); 101 vma_len = (loff_t)(vma->vm_end - vma->vm_start);
@@ -103,9 +106,9 @@ static int hugetlbfs_file_mmap(struct file *file, struct vm_area_struct *vma)
103 ret = -ENOMEM; 106 ret = -ENOMEM;
104 len = vma_len + ((loff_t)vma->vm_pgoff << PAGE_SHIFT); 107 len = vma_len + ((loff_t)vma->vm_pgoff << PAGE_SHIFT);
105 108
106 if (vma->vm_flags & VM_MAYSHARE && 109 if (hugetlb_reserve_pages(inode,
107 hugetlb_reserve_pages(inode, vma->vm_pgoff >> (HPAGE_SHIFT-PAGE_SHIFT), 110 vma->vm_pgoff >> huge_page_order(h),
108 len >> HPAGE_SHIFT)) 111 len >> huge_page_shift(h), vma))
109 goto out; 112 goto out;
110 113
111 ret = 0; 114 ret = 0;
@@ -130,20 +133,21 @@ hugetlb_get_unmapped_area(struct file *file, unsigned long addr,
130 struct mm_struct *mm = current->mm; 133 struct mm_struct *mm = current->mm;
131 struct vm_area_struct *vma; 134 struct vm_area_struct *vma;
132 unsigned long start_addr; 135 unsigned long start_addr;
136 struct hstate *h = hstate_file(file);
133 137
134 if (len & ~HPAGE_MASK) 138 if (len & ~huge_page_mask(h))
135 return -EINVAL; 139 return -EINVAL;
136 if (len > TASK_SIZE) 140 if (len > TASK_SIZE)
137 return -ENOMEM; 141 return -ENOMEM;
138 142
139 if (flags & MAP_FIXED) { 143 if (flags & MAP_FIXED) {
140 if (prepare_hugepage_range(addr, len)) 144 if (prepare_hugepage_range(file, addr, len))
141 return -EINVAL; 145 return -EINVAL;
142 return addr; 146 return addr;
143 } 147 }
144 148
145 if (addr) { 149 if (addr) {
146 addr = ALIGN(addr, HPAGE_SIZE); 150 addr = ALIGN(addr, huge_page_size(h));
147 vma = find_vma(mm, addr); 151 vma = find_vma(mm, addr);
148 if (TASK_SIZE - len >= addr && 152 if (TASK_SIZE - len >= addr &&
149 (!vma || addr + len <= vma->vm_start)) 153 (!vma || addr + len <= vma->vm_start))
@@ -156,7 +160,7 @@ hugetlb_get_unmapped_area(struct file *file, unsigned long addr,
156 start_addr = TASK_UNMAPPED_BASE; 160 start_addr = TASK_UNMAPPED_BASE;
157 161
158full_search: 162full_search:
159 addr = ALIGN(start_addr, HPAGE_SIZE); 163 addr = ALIGN(start_addr, huge_page_size(h));
160 164
161 for (vma = find_vma(mm, addr); ; vma = vma->vm_next) { 165 for (vma = find_vma(mm, addr); ; vma = vma->vm_next) {
162 /* At this point: (!vma || addr < vma->vm_end). */ 166 /* At this point: (!vma || addr < vma->vm_end). */
@@ -174,7 +178,7 @@ full_search:
174 178
175 if (!vma || addr + len <= vma->vm_start) 179 if (!vma || addr + len <= vma->vm_start)
176 return addr; 180 return addr;
177 addr = ALIGN(vma->vm_end, HPAGE_SIZE); 181 addr = ALIGN(vma->vm_end, huge_page_size(h));
178 } 182 }
179} 183}
180#endif 184#endif
@@ -225,10 +229,11 @@ hugetlbfs_read_actor(struct page *page, unsigned long offset,
225static ssize_t hugetlbfs_read(struct file *filp, char __user *buf, 229static ssize_t hugetlbfs_read(struct file *filp, char __user *buf,
226 size_t len, loff_t *ppos) 230 size_t len, loff_t *ppos)
227{ 231{
232 struct hstate *h = hstate_file(filp);
228 struct address_space *mapping = filp->f_mapping; 233 struct address_space *mapping = filp->f_mapping;
229 struct inode *inode = mapping->host; 234 struct inode *inode = mapping->host;
230 unsigned long index = *ppos >> HPAGE_SHIFT; 235 unsigned long index = *ppos >> huge_page_shift(h);
231 unsigned long offset = *ppos & ~HPAGE_MASK; 236 unsigned long offset = *ppos & ~huge_page_mask(h);
232 unsigned long end_index; 237 unsigned long end_index;
233 loff_t isize; 238 loff_t isize;
234 ssize_t retval = 0; 239 ssize_t retval = 0;
@@ -243,17 +248,17 @@ static ssize_t hugetlbfs_read(struct file *filp, char __user *buf,
243 if (!isize) 248 if (!isize)
244 goto out; 249 goto out;
245 250
246 end_index = (isize - 1) >> HPAGE_SHIFT; 251 end_index = (isize - 1) >> huge_page_shift(h);
247 for (;;) { 252 for (;;) {
248 struct page *page; 253 struct page *page;
249 int nr, ret; 254 unsigned long nr, ret;
250 255
251 /* nr is the maximum number of bytes to copy from this page */ 256 /* nr is the maximum number of bytes to copy from this page */
252 nr = HPAGE_SIZE; 257 nr = huge_page_size(h);
253 if (index >= end_index) { 258 if (index >= end_index) {
254 if (index > end_index) 259 if (index > end_index)
255 goto out; 260 goto out;
256 nr = ((isize - 1) & ~HPAGE_MASK) + 1; 261 nr = ((isize - 1) & ~huge_page_mask(h)) + 1;
257 if (nr <= offset) { 262 if (nr <= offset) {
258 goto out; 263 goto out;
259 } 264 }
@@ -287,8 +292,8 @@ static ssize_t hugetlbfs_read(struct file *filp, char __user *buf,
287 offset += ret; 292 offset += ret;
288 retval += ret; 293 retval += ret;
289 len -= ret; 294 len -= ret;
290 index += offset >> HPAGE_SHIFT; 295 index += offset >> huge_page_shift(h);
291 offset &= ~HPAGE_MASK; 296 offset &= ~huge_page_mask(h);
292 297
293 if (page) 298 if (page)
294 page_cache_release(page); 299 page_cache_release(page);
@@ -298,7 +303,7 @@ static ssize_t hugetlbfs_read(struct file *filp, char __user *buf,
298 break; 303 break;
299 } 304 }
300out: 305out:
301 *ppos = ((loff_t)index << HPAGE_SHIFT) + offset; 306 *ppos = ((loff_t)index << huge_page_shift(h)) + offset;
302 mutex_unlock(&inode->i_mutex); 307 mutex_unlock(&inode->i_mutex);
303 return retval; 308 return retval;
304} 309}
@@ -339,8 +344,9 @@ static void truncate_huge_page(struct page *page)
339 344
340static void truncate_hugepages(struct inode *inode, loff_t lstart) 345static void truncate_hugepages(struct inode *inode, loff_t lstart)
341{ 346{
347 struct hstate *h = hstate_inode(inode);
342 struct address_space *mapping = &inode->i_data; 348 struct address_space *mapping = &inode->i_data;
343 const pgoff_t start = lstart >> HPAGE_SHIFT; 349 const pgoff_t start = lstart >> huge_page_shift(h);
344 struct pagevec pvec; 350 struct pagevec pvec;
345 pgoff_t next; 351 pgoff_t next;
346 int i, freed = 0; 352 int i, freed = 0;
@@ -441,7 +447,7 @@ hugetlb_vmtruncate_list(struct prio_tree_root *root, pgoff_t pgoff)
441 v_offset = 0; 447 v_offset = 0;
442 448
443 __unmap_hugepage_range(vma, 449 __unmap_hugepage_range(vma,
444 vma->vm_start + v_offset, vma->vm_end); 450 vma->vm_start + v_offset, vma->vm_end, NULL);
445 } 451 }
446} 452}
447 453
@@ -449,8 +455,9 @@ static int hugetlb_vmtruncate(struct inode *inode, loff_t offset)
449{ 455{
450 pgoff_t pgoff; 456 pgoff_t pgoff;
451 struct address_space *mapping = inode->i_mapping; 457 struct address_space *mapping = inode->i_mapping;
458 struct hstate *h = hstate_inode(inode);
452 459
453 BUG_ON(offset & ~HPAGE_MASK); 460 BUG_ON(offset & ~huge_page_mask(h));
454 pgoff = offset >> PAGE_SHIFT; 461 pgoff = offset >> PAGE_SHIFT;
455 462
456 i_size_write(inode, offset); 463 i_size_write(inode, offset);
@@ -465,6 +472,7 @@ static int hugetlb_vmtruncate(struct inode *inode, loff_t offset)
465static int hugetlbfs_setattr(struct dentry *dentry, struct iattr *attr) 472static int hugetlbfs_setattr(struct dentry *dentry, struct iattr *attr)
466{ 473{
467 struct inode *inode = dentry->d_inode; 474 struct inode *inode = dentry->d_inode;
475 struct hstate *h = hstate_inode(inode);
468 int error; 476 int error;
469 unsigned int ia_valid = attr->ia_valid; 477 unsigned int ia_valid = attr->ia_valid;
470 478
@@ -476,7 +484,7 @@ static int hugetlbfs_setattr(struct dentry *dentry, struct iattr *attr)
476 484
477 if (ia_valid & ATTR_SIZE) { 485 if (ia_valid & ATTR_SIZE) {
478 error = -EINVAL; 486 error = -EINVAL;
479 if (!(attr->ia_size & ~HPAGE_MASK)) 487 if (!(attr->ia_size & ~huge_page_mask(h)))
480 error = hugetlb_vmtruncate(inode, attr->ia_size); 488 error = hugetlb_vmtruncate(inode, attr->ia_size);
481 if (error) 489 if (error)
482 goto out; 490 goto out;
@@ -610,9 +618,10 @@ static int hugetlbfs_set_page_dirty(struct page *page)
610static int hugetlbfs_statfs(struct dentry *dentry, struct kstatfs *buf) 618static int hugetlbfs_statfs(struct dentry *dentry, struct kstatfs *buf)
611{ 619{
612 struct hugetlbfs_sb_info *sbinfo = HUGETLBFS_SB(dentry->d_sb); 620 struct hugetlbfs_sb_info *sbinfo = HUGETLBFS_SB(dentry->d_sb);
621 struct hstate *h = hstate_inode(dentry->d_inode);
613 622
614 buf->f_type = HUGETLBFS_MAGIC; 623 buf->f_type = HUGETLBFS_MAGIC;
615 buf->f_bsize = HPAGE_SIZE; 624 buf->f_bsize = huge_page_size(h);
616 if (sbinfo) { 625 if (sbinfo) {
617 spin_lock(&sbinfo->stat_lock); 626 spin_lock(&sbinfo->stat_lock);
618 /* If no limits set, just report 0 for max/free/used 627 /* If no limits set, just report 0 for max/free/used
@@ -743,6 +752,8 @@ hugetlbfs_parse_options(char *options, struct hugetlbfs_config *pconfig)
743 char *p, *rest; 752 char *p, *rest;
744 substring_t args[MAX_OPT_ARGS]; 753 substring_t args[MAX_OPT_ARGS];
745 int option; 754 int option;
755 unsigned long long size = 0;
756 enum { NO_SIZE, SIZE_STD, SIZE_PERCENT } setsize = NO_SIZE;
746 757
747 if (!options) 758 if (!options)
748 return 0; 759 return 0;
@@ -773,17 +784,13 @@ hugetlbfs_parse_options(char *options, struct hugetlbfs_config *pconfig)
773 break; 784 break;
774 785
775 case Opt_size: { 786 case Opt_size: {
776 unsigned long long size;
777 /* memparse() will accept a K/M/G without a digit */ 787 /* memparse() will accept a K/M/G without a digit */
778 if (!isdigit(*args[0].from)) 788 if (!isdigit(*args[0].from))
779 goto bad_val; 789 goto bad_val;
780 size = memparse(args[0].from, &rest); 790 size = memparse(args[0].from, &rest);
781 if (*rest == '%') { 791 setsize = SIZE_STD;
782 size <<= HPAGE_SHIFT; 792 if (*rest == '%')
783 size *= max_huge_pages; 793 setsize = SIZE_PERCENT;
784 do_div(size, 100);
785 }
786 pconfig->nr_blocks = (size >> HPAGE_SHIFT);
787 break; 794 break;
788 } 795 }
789 796
@@ -794,6 +801,19 @@ hugetlbfs_parse_options(char *options, struct hugetlbfs_config *pconfig)
794 pconfig->nr_inodes = memparse(args[0].from, &rest); 801 pconfig->nr_inodes = memparse(args[0].from, &rest);
795 break; 802 break;
796 803
804 case Opt_pagesize: {
805 unsigned long ps;
806 ps = memparse(args[0].from, &rest);
807 pconfig->hstate = size_to_hstate(ps);
808 if (!pconfig->hstate) {
809 printk(KERN_ERR
810 "hugetlbfs: Unsupported page size %lu MB\n",
811 ps >> 20);
812 return -EINVAL;
813 }
814 break;
815 }
816
797 default: 817 default:
798 printk(KERN_ERR "hugetlbfs: Bad mount option: \"%s\"\n", 818 printk(KERN_ERR "hugetlbfs: Bad mount option: \"%s\"\n",
799 p); 819 p);
@@ -801,6 +821,18 @@ hugetlbfs_parse_options(char *options, struct hugetlbfs_config *pconfig)
801 break; 821 break;
802 } 822 }
803 } 823 }
824
825 /* Do size after hstate is set up */
826 if (setsize > NO_SIZE) {
827 struct hstate *h = pconfig->hstate;
828 if (setsize == SIZE_PERCENT) {
829 size <<= huge_page_shift(h);
830 size *= h->max_huge_pages;
831 do_div(size, 100);
832 }
833 pconfig->nr_blocks = (size >> huge_page_shift(h));
834 }
835
804 return 0; 836 return 0;
805 837
806bad_val: 838bad_val:
@@ -825,6 +857,7 @@ hugetlbfs_fill_super(struct super_block *sb, void *data, int silent)
825 config.uid = current->fsuid; 857 config.uid = current->fsuid;
826 config.gid = current->fsgid; 858 config.gid = current->fsgid;
827 config.mode = 0755; 859 config.mode = 0755;
860 config.hstate = &default_hstate;
828 ret = hugetlbfs_parse_options(data, &config); 861 ret = hugetlbfs_parse_options(data, &config);
829 if (ret) 862 if (ret)
830 return ret; 863 return ret;
@@ -833,14 +866,15 @@ hugetlbfs_fill_super(struct super_block *sb, void *data, int silent)
833 if (!sbinfo) 866 if (!sbinfo)
834 return -ENOMEM; 867 return -ENOMEM;
835 sb->s_fs_info = sbinfo; 868 sb->s_fs_info = sbinfo;
869 sbinfo->hstate = config.hstate;
836 spin_lock_init(&sbinfo->stat_lock); 870 spin_lock_init(&sbinfo->stat_lock);
837 sbinfo->max_blocks = config.nr_blocks; 871 sbinfo->max_blocks = config.nr_blocks;
838 sbinfo->free_blocks = config.nr_blocks; 872 sbinfo->free_blocks = config.nr_blocks;
839 sbinfo->max_inodes = config.nr_inodes; 873 sbinfo->max_inodes = config.nr_inodes;
840 sbinfo->free_inodes = config.nr_inodes; 874 sbinfo->free_inodes = config.nr_inodes;
841 sb->s_maxbytes = MAX_LFS_FILESIZE; 875 sb->s_maxbytes = MAX_LFS_FILESIZE;
842 sb->s_blocksize = HPAGE_SIZE; 876 sb->s_blocksize = huge_page_size(config.hstate);
843 sb->s_blocksize_bits = HPAGE_SHIFT; 877 sb->s_blocksize_bits = huge_page_shift(config.hstate);
844 sb->s_magic = HUGETLBFS_MAGIC; 878 sb->s_magic = HUGETLBFS_MAGIC;
845 sb->s_op = &hugetlbfs_ops; 879 sb->s_op = &hugetlbfs_ops;
846 sb->s_time_gran = 1; 880 sb->s_time_gran = 1;
@@ -942,7 +976,8 @@ struct file *hugetlb_file_setup(const char *name, size_t size)
942 goto out_dentry; 976 goto out_dentry;
943 977
944 error = -ENOMEM; 978 error = -ENOMEM;
945 if (hugetlb_reserve_pages(inode, 0, size >> HPAGE_SHIFT)) 979 if (hugetlb_reserve_pages(inode, 0,
980 size >> huge_page_shift(hstate_inode(inode)), NULL))
946 goto out_inode; 981 goto out_inode;
947 982
948 d_instantiate(dentry, inode); 983 d_instantiate(dentry, inode);
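
With the new Opt_pagesize handling, a hugetlbfs mount can select any huge page size the kernel has an hstate for, and size= percentages are resolved only after that hstate is known. A hedged example of mounting with an explicit page size via mount(2); the mount point and the availability of 2 MB pages are assumptions, not something this patch guarantees:

#include <sys/mount.h>
#include <stdio.h>

int main(void)
{
	/* Needs CAP_SYS_ADMIN, an existing /mnt/huge directory, and a kernel
	 * hstate for 2 MB pages; size=50% is interpreted against that hstate. */
	if (mount("none", "/mnt/huge", "hugetlbfs", 0,
		  "pagesize=2M,size=50%") < 0) {
		perror("mount hugetlbfs");
		return 1;
	}
	return 0;
}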
diff --git a/fs/inotify_user.c b/fs/inotify_user.c
index 6676c06bb7c1..fe79c25d95dc 100644
--- a/fs/inotify_user.c
+++ b/fs/inotify_user.c
@@ -566,7 +566,7 @@ static const struct inotify_operations inotify_user_ops = {
566 .destroy_watch = free_inotify_user_watch, 566 .destroy_watch = free_inotify_user_watch,
567}; 567};
568 568
569asmlinkage long sys_inotify_init(void) 569asmlinkage long sys_inotify_init1(int flags)
570{ 570{
571 struct inotify_device *dev; 571 struct inotify_device *dev;
572 struct inotify_handle *ih; 572 struct inotify_handle *ih;
@@ -574,7 +574,14 @@ asmlinkage long sys_inotify_init(void)
574 struct file *filp; 574 struct file *filp;
575 int fd, ret; 575 int fd, ret;
576 576
577 fd = get_unused_fd(); 577 /* Check the IN_* constants for consistency. */
578 BUILD_BUG_ON(IN_CLOEXEC != O_CLOEXEC);
579 BUILD_BUG_ON(IN_NONBLOCK != O_NONBLOCK);
580
581 if (flags & ~(IN_CLOEXEC | IN_NONBLOCK))
582 return -EINVAL;
583
584 fd = get_unused_fd_flags(flags & O_CLOEXEC);
578 if (fd < 0) 585 if (fd < 0)
579 return fd; 586 return fd;
580 587
@@ -610,7 +617,7 @@ asmlinkage long sys_inotify_init(void)
610 filp->f_path.dentry = dget(inotify_mnt->mnt_root); 617 filp->f_path.dentry = dget(inotify_mnt->mnt_root);
611 filp->f_mapping = filp->f_path.dentry->d_inode->i_mapping; 618 filp->f_mapping = filp->f_path.dentry->d_inode->i_mapping;
612 filp->f_mode = FMODE_READ; 619 filp->f_mode = FMODE_READ;
613 filp->f_flags = O_RDONLY; 620 filp->f_flags = O_RDONLY | (flags & O_NONBLOCK);
614 filp->private_data = dev; 621 filp->private_data = dev;
615 622
616 INIT_LIST_HEAD(&dev->events); 623 INIT_LIST_HEAD(&dev->events);
@@ -638,6 +645,11 @@ out_put_fd:
638 return ret; 645 return ret;
639} 646}
640 647
648asmlinkage long sys_inotify_init(void)
649{
650 return sys_inotify_init1(0);
651}
652
641asmlinkage long sys_inotify_add_watch(int fd, const char __user *path, u32 mask) 653asmlinkage long sys_inotify_add_watch(int fd, const char __user *path, u32 mask)
642{ 654{
643 struct inode *inode; 655 struct inode *inode;
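
sys_inotify_init1() mirrors the other new flag-taking syscalls: IN_CLOEXEC and IN_NONBLOCK are checked against O_CLOEXEC and O_NONBLOCK at build time and then applied to the descriptor and the file's f_flags. A userspace sketch, assuming an inotify_init1() wrapper in libc:

#include <sys/inotify.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	int fd = inotify_init1(IN_CLOEXEC | IN_NONBLOCK);

	if (fd < 0) {
		perror("inotify_init1");
		return 1;
	}
	/* With IN_NONBLOCK, reads return -EAGAIN instead of sleeping. */
	if (inotify_add_watch(fd, "/tmp", IN_CREATE | IN_DELETE) < 0)
		perror("inotify_add_watch");
	close(fd);
	return 0;
}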
diff --git a/fs/open.c b/fs/open.c
index a99ad09c3197..bb98d2fe809f 100644
--- a/fs/open.c
+++ b/fs/open.c
@@ -64,7 +64,8 @@ static int vfs_statfs_native(struct dentry *dentry, struct statfs *buf)
64 memcpy(buf, &st, sizeof(st)); 64 memcpy(buf, &st, sizeof(st));
65 else { 65 else {
66 if (sizeof buf->f_blocks == 4) { 66 if (sizeof buf->f_blocks == 4) {
67 if ((st.f_blocks | st.f_bfree | st.f_bavail) & 67 if ((st.f_blocks | st.f_bfree | st.f_bavail |
68 st.f_bsize | st.f_frsize) &
68 0xffffffff00000000ULL) 69 0xffffffff00000000ULL)
69 return -EOVERFLOW; 70 return -EOVERFLOW;
70 /* 71 /*
diff --git a/fs/partitions/check.c b/fs/partitions/check.c
index 6149e4b58c88..efef715135d3 100644
--- a/fs/partitions/check.c
+++ b/fs/partitions/check.c
@@ -401,7 +401,7 @@ void register_disk(struct gendisk *disk)
401 disk->dev.parent = disk->driverfs_dev; 401 disk->dev.parent = disk->driverfs_dev;
402 disk->dev.devt = MKDEV(disk->major, disk->first_minor); 402 disk->dev.devt = MKDEV(disk->major, disk->first_minor);
403 403
404 strlcpy(disk->dev.bus_id, disk->disk_name, KOBJ_NAME_LEN); 404 strlcpy(disk->dev.bus_id, disk->disk_name, BUS_ID_SIZE);
405 /* ewww... some of these buggers have / in the name... */ 405 /* ewww... some of these buggers have / in the name... */
406 s = strchr(disk->dev.bus_id, '/'); 406 s = strchr(disk->dev.bus_id, '/');
407 if (s) 407 if (s)
diff --git a/fs/pipe.c b/fs/pipe.c
index 700f4e0d9572..10c4e9aa5c49 100644
--- a/fs/pipe.c
+++ b/fs/pipe.c
@@ -950,7 +950,7 @@ fail_inode:
950 return NULL; 950 return NULL;
951} 951}
952 952
953struct file *create_write_pipe(void) 953struct file *create_write_pipe(int flags)
954{ 954{
955 int err; 955 int err;
956 struct inode *inode; 956 struct inode *inode;
@@ -983,7 +983,7 @@ struct file *create_write_pipe(void)
983 goto err_dentry; 983 goto err_dentry;
984 f->f_mapping = inode->i_mapping; 984 f->f_mapping = inode->i_mapping;
985 985
986 f->f_flags = O_WRONLY; 986 f->f_flags = O_WRONLY | (flags & O_NONBLOCK);
987 f->f_version = 0; 987 f->f_version = 0;
988 988
989 return f; 989 return f;
@@ -1007,7 +1007,7 @@ void free_write_pipe(struct file *f)
1007 put_filp(f); 1007 put_filp(f);
1008} 1008}
1009 1009
1010struct file *create_read_pipe(struct file *wrf) 1010struct file *create_read_pipe(struct file *wrf, int flags)
1011{ 1011{
1012 struct file *f = get_empty_filp(); 1012 struct file *f = get_empty_filp();
1013 if (!f) 1013 if (!f)
@@ -1019,7 +1019,7 @@ struct file *create_read_pipe(struct file *wrf)
1019 f->f_mapping = wrf->f_path.dentry->d_inode->i_mapping; 1019 f->f_mapping = wrf->f_path.dentry->d_inode->i_mapping;
1020 1020
1021 f->f_pos = 0; 1021 f->f_pos = 0;
1022 f->f_flags = O_RDONLY; 1022 f->f_flags = O_RDONLY | (flags & O_NONBLOCK);
1023 f->f_op = &read_pipe_fops; 1023 f->f_op = &read_pipe_fops;
1024 f->f_mode = FMODE_READ; 1024 f->f_mode = FMODE_READ;
1025 f->f_version = 0; 1025 f->f_version = 0;
@@ -1027,26 +1027,29 @@ struct file *create_read_pipe(struct file *wrf)
1027 return f; 1027 return f;
1028} 1028}
1029 1029
1030int do_pipe(int *fd) 1030int do_pipe_flags(int *fd, int flags)
1031{ 1031{
1032 struct file *fw, *fr; 1032 struct file *fw, *fr;
1033 int error; 1033 int error;
1034 int fdw, fdr; 1034 int fdw, fdr;
1035 1035
1036 fw = create_write_pipe(); 1036 if (flags & ~(O_CLOEXEC | O_NONBLOCK))
1037 return -EINVAL;
1038
1039 fw = create_write_pipe(flags);
1037 if (IS_ERR(fw)) 1040 if (IS_ERR(fw))
1038 return PTR_ERR(fw); 1041 return PTR_ERR(fw);
1039 fr = create_read_pipe(fw); 1042 fr = create_read_pipe(fw, flags);
1040 error = PTR_ERR(fr); 1043 error = PTR_ERR(fr);
1041 if (IS_ERR(fr)) 1044 if (IS_ERR(fr))
1042 goto err_write_pipe; 1045 goto err_write_pipe;
1043 1046
1044 error = get_unused_fd(); 1047 error = get_unused_fd_flags(flags);
1045 if (error < 0) 1048 if (error < 0)
1046 goto err_read_pipe; 1049 goto err_read_pipe;
1047 fdr = error; 1050 fdr = error;
1048 1051
1049 error = get_unused_fd(); 1052 error = get_unused_fd_flags(flags);
1050 if (error < 0) 1053 if (error < 0)
1051 goto err_fdr; 1054 goto err_fdr;
1052 fdw = error; 1055 fdw = error;
@@ -1074,16 +1077,21 @@ int do_pipe(int *fd)
1074 return error; 1077 return error;
1075} 1078}
1076 1079
1080int do_pipe(int *fd)
1081{
1082 return do_pipe_flags(fd, 0);
1083}
1084
1077/* 1085/*
1078 * sys_pipe() is the normal C calling standard for creating 1086 * sys_pipe() is the normal C calling standard for creating
1079 * a pipe. It's not the way Unix traditionally does this, though. 1087 * a pipe. It's not the way Unix traditionally does this, though.
1080 */ 1088 */
1081asmlinkage long __weak sys_pipe(int __user *fildes) 1089asmlinkage long __weak sys_pipe2(int __user *fildes, int flags)
1082{ 1090{
1083 int fd[2]; 1091 int fd[2];
1084 int error; 1092 int error;
1085 1093
1086 error = do_pipe(fd); 1094 error = do_pipe_flags(fd, flags);
1087 if (!error) { 1095 if (!error) {
1088 if (copy_to_user(fildes, fd, sizeof(fd))) { 1096 if (copy_to_user(fildes, fd, sizeof(fd))) {
1089 sys_close(fd[0]); 1097 sys_close(fd[0]);
@@ -1094,6 +1102,11 @@ asmlinkage long __weak sys_pipe(int __user *fildes)
1094 return error; 1102 return error;
1095} 1103}
1096 1104
1105asmlinkage long __weak sys_pipe(int __user *fildes)
1106{
1107 return sys_pipe2(fildes, 0);
1108}
1109
1097/* 1110/*
1098 * pipefs should _never_ be mounted by userland - too much of security hassle, 1111 * pipefs should _never_ be mounted by userland - too much of security hassle,
1099 * no real gain from having the whole whorehouse mounted. So we don't need 1112 * no real gain from having the whole whorehouse mounted. So we don't need
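
do_pipe_flags() threads O_CLOEXEC and O_NONBLOCK through create_write_pipe(), create_read_pipe() and get_unused_fd_flags(), and sys_pipe2() exposes that to userspace while sys_pipe() stays a zero-flags wrapper. A minimal sketch, assuming a libc pipe2() wrapper:

#define _GNU_SOURCE
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	int fd[2];

	/* Both ends come back non-blocking and close-on-exec in one call. */
	if (pipe2(fd, O_NONBLOCK | O_CLOEXEC) < 0) {
		perror("pipe2");
		return 1;
	}
	if (write(fd[1], "x", 1) != 1)
		perror("write");
	close(fd[0]);
	close(fd[1]);
	return 0;
}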
diff --git a/fs/proc/proc_misc.c b/fs/proc/proc_misc.c
index c652d469dc08..ded969862960 100644
--- a/fs/proc/proc_misc.c
+++ b/fs/proc/proc_misc.c
@@ -232,7 +232,6 @@ static int meminfo_read_proc(char *page, char **start, off_t off,
232#undef K 232#undef K
233} 233}
234 234
235extern const struct seq_operations fragmentation_op;
236static int fragmentation_open(struct inode *inode, struct file *file) 235static int fragmentation_open(struct inode *inode, struct file *file)
237{ 236{
238 (void)inode; 237 (void)inode;
@@ -246,7 +245,6 @@ static const struct file_operations fragmentation_file_operations = {
246 .release = seq_release, 245 .release = seq_release,
247}; 246};
248 247
249extern const struct seq_operations pagetypeinfo_op;
250static int pagetypeinfo_open(struct inode *inode, struct file *file) 248static int pagetypeinfo_open(struct inode *inode, struct file *file)
251{ 249{
252 return seq_open(file, &pagetypeinfo_op); 250 return seq_open(file, &pagetypeinfo_op);
@@ -259,7 +257,6 @@ static const struct file_operations pagetypeinfo_file_ops = {
259 .release = seq_release, 257 .release = seq_release,
260}; 258};
261 259
262extern const struct seq_operations zoneinfo_op;
263static int zoneinfo_open(struct inode *inode, struct file *file) 260static int zoneinfo_open(struct inode *inode, struct file *file)
264{ 261{
265 return seq_open(file, &zoneinfo_op); 262 return seq_open(file, &zoneinfo_op);
@@ -356,7 +353,6 @@ static const struct file_operations proc_devinfo_operations = {
356 .release = seq_release, 353 .release = seq_release,
357}; 354};
358 355
359extern const struct seq_operations vmstat_op;
360static int vmstat_open(struct inode *inode, struct file *file) 356static int vmstat_open(struct inode *inode, struct file *file)
361{ 357{
362 return seq_open(file, &vmstat_op); 358 return seq_open(file, &vmstat_op);
@@ -468,14 +464,25 @@ static const struct file_operations proc_slabstats_operations = {
468#ifdef CONFIG_MMU 464#ifdef CONFIG_MMU
469static int vmalloc_open(struct inode *inode, struct file *file) 465static int vmalloc_open(struct inode *inode, struct file *file)
470{ 466{
471 return seq_open(file, &vmalloc_op); 467 unsigned int *ptr = NULL;
468 int ret;
469
470 if (NUMA_BUILD)
471 ptr = kmalloc(nr_node_ids * sizeof(unsigned int), GFP_KERNEL);
472 ret = seq_open(file, &vmalloc_op);
473 if (!ret) {
474 struct seq_file *m = file->private_data;
475 m->private = ptr;
476 } else
477 kfree(ptr);
478 return ret;
472} 479}
473 480
474static const struct file_operations proc_vmalloc_operations = { 481static const struct file_operations proc_vmalloc_operations = {
475 .open = vmalloc_open, 482 .open = vmalloc_open,
476 .read = seq_read, 483 .read = seq_read,
477 .llseek = seq_lseek, 484 .llseek = seq_lseek,
478 .release = seq_release, 485 .release = seq_release_private,
479}; 486};
480#endif 487#endif
481 488
diff --git a/fs/proc/proc_net.c b/fs/proc/proc_net.c
index b224a28e0c15..7bc296f424ae 100644
--- a/fs/proc/proc_net.c
+++ b/fs/proc/proc_net.c
@@ -27,6 +27,11 @@
27#include "internal.h" 27#include "internal.h"
28 28
29 29
30static struct net *get_proc_net(const struct inode *inode)
31{
32 return maybe_get_net(PDE_NET(PDE(inode)));
33}
34
30int seq_open_net(struct inode *ino, struct file *f, 35int seq_open_net(struct inode *ino, struct file *f,
31 const struct seq_operations *ops, int size) 36 const struct seq_operations *ops, int size)
32{ 37{
@@ -185,12 +190,6 @@ void proc_net_remove(struct net *net, const char *name)
185} 190}
186EXPORT_SYMBOL_GPL(proc_net_remove); 191EXPORT_SYMBOL_GPL(proc_net_remove);
187 192
188struct net *get_proc_net(const struct inode *inode)
189{
190 return maybe_get_net(PDE_NET(PDE(inode)));
191}
192EXPORT_SYMBOL_GPL(get_proc_net);
193
194static __net_init int proc_net_ns_init(struct net *net) 193static __net_init int proc_net_ns_init(struct net *net)
195{ 194{
196 struct proc_dir_entry *netd, *net_statd; 195 struct proc_dir_entry *netd, *net_statd;
diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index 164bd9f9ede3..7546a918f790 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -636,7 +636,7 @@ static ssize_t pagemap_read(struct file *file, char __user *buf,
636 struct pagemapread pm; 636 struct pagemapread pm;
637 int pagecount; 637 int pagecount;
638 int ret = -ESRCH; 638 int ret = -ESRCH;
639 struct mm_walk pagemap_walk; 639 struct mm_walk pagemap_walk = {};
640 unsigned long src; 640 unsigned long src;
641 unsigned long svpfn; 641 unsigned long svpfn;
642 unsigned long start_vaddr; 642 unsigned long start_vaddr;
diff --git a/fs/signalfd.c b/fs/signalfd.c
index 619725644c75..9c39bc7f8431 100644
--- a/fs/signalfd.c
+++ b/fs/signalfd.c
@@ -205,11 +205,19 @@ static const struct file_operations signalfd_fops = {
205 .read = signalfd_read, 205 .read = signalfd_read,
206}; 206};
207 207
208asmlinkage long sys_signalfd(int ufd, sigset_t __user *user_mask, size_t sizemask) 208asmlinkage long sys_signalfd4(int ufd, sigset_t __user *user_mask,
209 size_t sizemask, int flags)
209{ 210{
210 sigset_t sigmask; 211 sigset_t sigmask;
211 struct signalfd_ctx *ctx; 212 struct signalfd_ctx *ctx;
212 213
214 /* Check the SFD_* constants for consistency. */
215 BUILD_BUG_ON(SFD_CLOEXEC != O_CLOEXEC);
216 BUILD_BUG_ON(SFD_NONBLOCK != O_NONBLOCK);
217
218 if (flags & ~(SFD_CLOEXEC | SFD_NONBLOCK))
219 return -EINVAL;
220
213 if (sizemask != sizeof(sigset_t) || 221 if (sizemask != sizeof(sigset_t) ||
214 copy_from_user(&sigmask, user_mask, sizeof(sigmask))) 222 copy_from_user(&sigmask, user_mask, sizeof(sigmask)))
215 return -EINVAL; 223 return -EINVAL;
@@ -227,7 +235,8 @@ asmlinkage long sys_signalfd(int ufd, sigset_t __user *user_mask, size_t sizemas
227 * When we call this, the initialization must be complete, since 235 * When we call this, the initialization must be complete, since
228 * anon_inode_getfd() will install the fd. 236 * anon_inode_getfd() will install the fd.
229 */ 237 */
230 ufd = anon_inode_getfd("[signalfd]", &signalfd_fops, ctx); 238 ufd = anon_inode_getfd("[signalfd]", &signalfd_fops, ctx,
239 flags & (O_CLOEXEC | O_NONBLOCK));
231 if (ufd < 0) 240 if (ufd < 0)
232 kfree(ctx); 241 kfree(ctx);
233 } else { 242 } else {
@@ -249,3 +258,9 @@ asmlinkage long sys_signalfd(int ufd, sigset_t __user *user_mask, size_t sizemas
249 258
250 return ufd; 259 return ufd;
251} 260}
261
262asmlinkage long sys_signalfd(int ufd, sigset_t __user *user_mask,
263 size_t sizemask)
264{
265 return sys_signalfd4(ufd, user_mask, sizemask, 0);
266}
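
sys_signalfd4() adds SFD_CLOEXEC and SFD_NONBLOCK on top of the existing mask handling, and sys_signalfd() simply calls it with flags of zero. A userspace sketch, assuming the usual libc signalfd() wrapper (which ends up in the new syscall on kernels that have it):

#include <sys/signalfd.h>
#include <signal.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	sigset_t mask;
	struct signalfd_siginfo si;
	int fd;

	sigemptyset(&mask);
	sigaddset(&mask, SIGINT);
	sigprocmask(SIG_BLOCK, &mask, NULL);	/* signals must be blocked */

	fd = signalfd(-1, &mask, SFD_CLOEXEC);
	if (fd < 0) {
		perror("signalfd");
		return 1;
	}
	if (read(fd, &si, sizeof(si)) == sizeof(si))	/* blocks until ^C */
		printf("got signal %u\n", si.ssi_signo);
	close(fd);
	return 0;
}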
diff --git a/fs/super.c b/fs/super.c
index 453877c5697b..e931ae9511fe 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -70,6 +70,7 @@ static struct super_block *alloc_super(struct file_system_type *type)
70 INIT_LIST_HEAD(&s->s_instances); 70 INIT_LIST_HEAD(&s->s_instances);
71 INIT_HLIST_HEAD(&s->s_anon); 71 INIT_HLIST_HEAD(&s->s_anon);
72 INIT_LIST_HEAD(&s->s_inodes); 72 INIT_LIST_HEAD(&s->s_inodes);
73 INIT_LIST_HEAD(&s->s_dentry_lru);
73 init_rwsem(&s->s_umount); 74 init_rwsem(&s->s_umount);
74 mutex_init(&s->s_lock); 75 mutex_init(&s->s_lock);
75 lockdep_set_class(&s->s_umount, &type->s_umount_key); 76 lockdep_set_class(&s->s_umount, &type->s_umount_key);
diff --git a/fs/sync.c b/fs/sync.c
index 228e17b5e9ee..2967562d416f 100644
--- a/fs/sync.c
+++ b/fs/sync.c
@@ -139,7 +139,8 @@ asmlinkage long sys_fdatasync(unsigned int fd)
139 * before performing the write. 139 * before performing the write.
140 * 140 *
141 * SYNC_FILE_RANGE_WRITE: initiate writeout of all those dirty pages in the 141 * SYNC_FILE_RANGE_WRITE: initiate writeout of all those dirty pages in the
142 * range which are not presently under writeback. 142 * range which are not presently under writeback. Note that this may block for
143 * significant periods due to exhaustion of disk request structures.
143 * 144 *
144 * SYNC_FILE_RANGE_WAIT_AFTER: wait upon writeout of all pages in the range 145 * SYNC_FILE_RANGE_WAIT_AFTER: wait upon writeout of all pages in the range
145 * after performing the write. 146 * after performing the write.
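
The sync.c change is documentation only, warning that SYNC_FILE_RANGE_WRITE can block when the block layer runs out of request structures. For reference, a small sketch of the call it documents; the file name and byte range are arbitrary:

#define _GNU_SOURCE
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	int fd = open("/tmp/example.dat", O_WRONLY | O_CREAT | O_TRUNC, 0644);

	if (fd < 0) {
		perror("open");
		return 1;
	}
	if (write(fd, "data", 4) != 4)
		perror("write");
	/* Starts writeback of the first 4 bytes; per the updated comment this
	 * may still block if disk request structures are exhausted. */
	if (sync_file_range(fd, 0, 4, SYNC_FILE_RANGE_WRITE) < 0)
		perror("sync_file_range");
	close(fd);
	return 0;
}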
diff --git a/fs/sysfs/dir.c b/fs/sysfs/dir.c
index 8c0e4b92574f..c1a7efb310bf 100644
--- a/fs/sysfs/dir.c
+++ b/fs/sysfs/dir.c
@@ -398,7 +398,7 @@ void sysfs_addrm_start(struct sysfs_addrm_cxt *acxt,
398} 398}
399 399
400/** 400/**
401 * sysfs_add_one - add sysfs_dirent to parent 401 * __sysfs_add_one - add sysfs_dirent to parent without warning
402 * @acxt: addrm context to use 402 * @acxt: addrm context to use
403 * @sd: sysfs_dirent to be added 403 * @sd: sysfs_dirent to be added
404 * 404 *
@@ -417,7 +417,7 @@ void sysfs_addrm_start(struct sysfs_addrm_cxt *acxt,
417 * 0 on success, -EEXIST if entry with the given name already 417 * 0 on success, -EEXIST if entry with the given name already
418 * exists. 418 * exists.
419 */ 419 */
420int sysfs_add_one(struct sysfs_addrm_cxt *acxt, struct sysfs_dirent *sd) 420int __sysfs_add_one(struct sysfs_addrm_cxt *acxt, struct sysfs_dirent *sd)
421{ 421{
422 if (sysfs_find_dirent(acxt->parent_sd, sd->s_name)) 422 if (sysfs_find_dirent(acxt->parent_sd, sd->s_name))
423 return -EEXIST; 423 return -EEXIST;
@@ -435,6 +435,39 @@ int sysfs_add_one(struct sysfs_addrm_cxt *acxt, struct sysfs_dirent *sd)
435} 435}
436 436
437/** 437/**
438 * sysfs_add_one - add sysfs_dirent to parent
439 * @acxt: addrm context to use
440 * @sd: sysfs_dirent to be added
441 *
442 * Get @acxt->parent_sd and set sd->s_parent to it and increment
443 * nlink of parent inode if @sd is a directory and link into the
444 * children list of the parent.
445 *
446 * This function should be called between calls to
447 * sysfs_addrm_start() and sysfs_addrm_finish() and should be
448 * passed the same @acxt as passed to sysfs_addrm_start().
449 *
450 * LOCKING:
451 * Determined by sysfs_addrm_start().
452 *
453 * RETURNS:
454 * 0 on success, -EEXIST if entry with the given name already
455 * exists.
456 */
457int sysfs_add_one(struct sysfs_addrm_cxt *acxt, struct sysfs_dirent *sd)
458{
459 int ret;
460
461 ret = __sysfs_add_one(acxt, sd);
462 if (ret == -EEXIST) {
463 printk(KERN_WARNING "sysfs: duplicate filename '%s' "
464 "cannot be created\n", sd->s_name);
465 WARN_ON(1);
466 }
467 return ret;
468}
469
470/**
438 * sysfs_remove_one - remove sysfs_dirent from parent 471 * sysfs_remove_one - remove sysfs_dirent from parent
439 * @acxt: addrm context to use 472 * @acxt: addrm context to use
440 * @sd: sysfs_dirent to be removed 473 * @sd: sysfs_dirent to be removed
diff --git a/fs/sysfs/file.c b/fs/sysfs/file.c
index e7735f643cd1..3f07893ff896 100644
--- a/fs/sysfs/file.c
+++ b/fs/sysfs/file.c
@@ -14,6 +14,7 @@
14#include <linux/kobject.h> 14#include <linux/kobject.h>
15#include <linux/kallsyms.h> 15#include <linux/kallsyms.h>
16#include <linux/slab.h> 16#include <linux/slab.h>
17#include <linux/fsnotify.h>
17#include <linux/namei.h> 18#include <linux/namei.h>
18#include <linux/poll.h> 19#include <linux/poll.h>
19#include <linux/list.h> 20#include <linux/list.h>
@@ -585,9 +586,11 @@ int sysfs_chmod_file(struct kobject *kobj, struct attribute *attr, mode_t mode)
585 586
586 newattrs.ia_mode = (mode & S_IALLUGO) | (inode->i_mode & ~S_IALLUGO); 587 newattrs.ia_mode = (mode & S_IALLUGO) | (inode->i_mode & ~S_IALLUGO);
587 newattrs.ia_valid = ATTR_MODE | ATTR_CTIME; 588 newattrs.ia_valid = ATTR_MODE | ATTR_CTIME;
588 rc = notify_change(victim, &newattrs); 589 newattrs.ia_ctime = current_fs_time(inode->i_sb);
590 rc = sysfs_setattr(victim, &newattrs);
589 591
590 if (rc == 0) { 592 if (rc == 0) {
593 fsnotify_change(victim, newattrs.ia_valid);
591 mutex_lock(&sysfs_mutex); 594 mutex_lock(&sysfs_mutex);
592 victim_sd->s_mode = newattrs.ia_mode; 595 victim_sd->s_mode = newattrs.ia_mode;
593 mutex_unlock(&sysfs_mutex); 596 mutex_unlock(&sysfs_mutex);
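
For context, sysfs_chmod_file() is the entry point drivers use to adjust an attribute's permissions at run time. A hedged sketch of such a caller; the device, attribute, and helper names are placeholders, not anything defined by this patch:

	/* Illustrative only: make a previously read-only attribute writable
	 * by root. "dev_attr_enable" stands in for a DEVICE_ATTR() the
	 * driver already declares. */
	static int example_unlock_attr(struct device *dev)
	{
		return sysfs_chmod_file(&dev->kobj, &dev_attr_enable.attr,
					S_IRUGO | S_IWUSR);
	}
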
diff --git a/fs/sysfs/symlink.c b/fs/sysfs/symlink.c
index 817f5966edca..a3ba217fbe74 100644
--- a/fs/sysfs/symlink.c
+++ b/fs/sysfs/symlink.c
@@ -19,13 +19,8 @@
19 19
20#include "sysfs.h" 20#include "sysfs.h"
21 21
22/** 22static int sysfs_do_create_link(struct kobject *kobj, struct kobject *target,
23 * sysfs_create_link - create symlink between two objects. 23 const char *name, int warn)
24 * @kobj: object whose directory we're creating the link in.
25 * @target: object we're pointing to.
26 * @name: name of the symlink.
27 */
28int sysfs_create_link(struct kobject * kobj, struct kobject * target, const char * name)
29{ 24{
30 struct sysfs_dirent *parent_sd = NULL; 25 struct sysfs_dirent *parent_sd = NULL;
31 struct sysfs_dirent *target_sd = NULL; 26 struct sysfs_dirent *target_sd = NULL;
@@ -65,7 +60,10 @@ int sysfs_create_link(struct kobject * kobj, struct kobject * target, const char
65 target_sd = NULL; /* reference is now owned by the symlink */ 60 target_sd = NULL; /* reference is now owned by the symlink */
66 61
67 sysfs_addrm_start(&acxt, parent_sd); 62 sysfs_addrm_start(&acxt, parent_sd);
68 error = sysfs_add_one(&acxt, sd); 63 if (warn)
64 error = sysfs_add_one(&acxt, sd);
65 else
66 error = __sysfs_add_one(&acxt, sd);
69 sysfs_addrm_finish(&acxt); 67 sysfs_addrm_finish(&acxt);
70 68
71 if (error) 69 if (error)
@@ -80,6 +78,33 @@ int sysfs_create_link(struct kobject * kobj, struct kobject * target, const char
80} 78}
81 79
82/** 80/**
81 * sysfs_create_link - create symlink between two objects.
82 * @kobj: object whose directory we're creating the link in.
83 * @target: object we're pointing to.
84 * @name: name of the symlink.
85 */
86int sysfs_create_link(struct kobject *kobj, struct kobject *target,
87 const char *name)
88{
89 return sysfs_do_create_link(kobj, target, name, 1);
90}
91
92/**
93 * sysfs_create_link_nowarn - create symlink between two objects.
94 * @kobj: object whose directory we're creating the link in.
95 * @target: object we're pointing to.
96 * @name: name of the symlink.
97 *
 98 * This function does the same as sysfs_create_link(), but it
99 * doesn't warn if the link already exists.
100 */
101int sysfs_create_link_nowarn(struct kobject *kobj, struct kobject *target,
102 const char *name)
103{
104 return sysfs_do_create_link(kobj, target, name, 0);
105}
106
107/**
83 * sysfs_remove_link - remove symlink in object's directory. 108 * sysfs_remove_link - remove symlink in object's directory.
84 * @kobj: object we're acting for. 109 * @kobj: object we're acting for.
85 * @name: name of the symlink to remove. 110 * @name: name of the symlink to remove.
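
The only difference between the two exported helpers is the -EEXIST warning. A hedged sketch of a caller that may legitimately attempt the same link twice and therefore prefers the silent variant (the kobject parameters and the "device" name are illustrative):

	/* Illustrative only: create a symlink that may already exist, e.g.
	 * on a re-registration path, without triggering the sysfs WARN. */
	static int example_link_device(struct kobject *parent,
				       struct kobject *target)
	{
		int error;

		error = sysfs_create_link_nowarn(parent, target, "device");
		if (error && error != -EEXIST)
			return error;
		return 0;
	}
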
diff --git a/fs/sysfs/sysfs.h b/fs/sysfs/sysfs.h
index ce4e15f8aaeb..a5db496f71c7 100644
--- a/fs/sysfs/sysfs.h
+++ b/fs/sysfs/sysfs.h
@@ -107,6 +107,7 @@ struct sysfs_dirent *sysfs_get_active_two(struct sysfs_dirent *sd);
107void sysfs_put_active_two(struct sysfs_dirent *sd); 107void sysfs_put_active_two(struct sysfs_dirent *sd);
108void sysfs_addrm_start(struct sysfs_addrm_cxt *acxt, 108void sysfs_addrm_start(struct sysfs_addrm_cxt *acxt,
109 struct sysfs_dirent *parent_sd); 109 struct sysfs_dirent *parent_sd);
110int __sysfs_add_one(struct sysfs_addrm_cxt *acxt, struct sysfs_dirent *sd);
110int sysfs_add_one(struct sysfs_addrm_cxt *acxt, struct sysfs_dirent *sd); 111int sysfs_add_one(struct sysfs_addrm_cxt *acxt, struct sysfs_dirent *sd);
111void sysfs_remove_one(struct sysfs_addrm_cxt *acxt, struct sysfs_dirent *sd); 112void sysfs_remove_one(struct sysfs_addrm_cxt *acxt, struct sysfs_dirent *sd);
112void sysfs_addrm_finish(struct sysfs_addrm_cxt *acxt); 113void sysfs_addrm_finish(struct sysfs_addrm_cxt *acxt);
diff --git a/fs/timerfd.c b/fs/timerfd.c
index d87d354ec424..c502c60e4f54 100644
--- a/fs/timerfd.c
+++ b/fs/timerfd.c
@@ -184,7 +184,11 @@ asmlinkage long sys_timerfd_create(int clockid, int flags)
184 int ufd; 184 int ufd;
185 struct timerfd_ctx *ctx; 185 struct timerfd_ctx *ctx;
186 186
187 if (flags) 187 /* Check the TFD_* constants for consistency. */
188 BUILD_BUG_ON(TFD_CLOEXEC != O_CLOEXEC);
189 BUILD_BUG_ON(TFD_NONBLOCK != O_NONBLOCK);
190
191 if (flags & ~(TFD_CLOEXEC | TFD_NONBLOCK))
188 return -EINVAL; 192 return -EINVAL;
189 if (clockid != CLOCK_MONOTONIC && 193 if (clockid != CLOCK_MONOTONIC &&
190 clockid != CLOCK_REALTIME) 194 clockid != CLOCK_REALTIME)
@@ -198,7 +202,8 @@ asmlinkage long sys_timerfd_create(int clockid, int flags)
198 ctx->clockid = clockid; 202 ctx->clockid = clockid;
199 hrtimer_init(&ctx->tmr, clockid, HRTIMER_MODE_ABS); 203 hrtimer_init(&ctx->tmr, clockid, HRTIMER_MODE_ABS);
200 204
201 ufd = anon_inode_getfd("[timerfd]", &timerfd_fops, ctx); 205 ufd = anon_inode_getfd("[timerfd]", &timerfd_fops, ctx,
206 flags & (O_CLOEXEC | O_NONBLOCK));
202 if (ufd < 0) 207 if (ufd < 0)
203 kfree(ctx); 208 kfree(ctx);
204 209
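
From user space the new flags map directly onto descriptor semantics. A minimal sketch, assuming a libc that ships the timerfd wrappers in <sys/timerfd.h>:

	#include <sys/timerfd.h>
	#include <stdio.h>
	#include <unistd.h>

	int main(void)
	{
		/* Close-on-exec, non-blocking timer descriptor: these flags
		 * are accepted now that the EINVAL check permits them. */
		int fd = timerfd_create(CLOCK_MONOTONIC,
					TFD_CLOEXEC | TFD_NONBLOCK);
		struct itimerspec its = {
			.it_value = { .tv_sec = 1 },	/* fire once after 1s */
		};

		if (fd < 0 || timerfd_settime(fd, 0, &its, NULL) < 0) {
			perror("timerfd");
			return 1;
		}
		close(fd);
		return 0;
	}
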
diff --git a/fs/ufs/super.c b/fs/ufs/super.c
index 85b22b5977fa..506f724055c2 100644
--- a/fs/ufs/super.c
+++ b/fs/ufs/super.c
@@ -1232,7 +1232,7 @@ static int ufs_show_options(struct seq_file *seq, struct vfsmount *vfs)
1232{ 1232{
1233 struct ufs_sb_info *sbi = UFS_SB(vfs->mnt_sb); 1233 struct ufs_sb_info *sbi = UFS_SB(vfs->mnt_sb);
1234 unsigned mval = sbi->s_mount_opt & UFS_MOUNT_UFSTYPE; 1234 unsigned mval = sbi->s_mount_opt & UFS_MOUNT_UFSTYPE;
1235 struct match_token *tp = tokens; 1235 const struct match_token *tp = tokens;
1236 1236
1237 while (tp->token != Opt_onerror_panic && tp->token != mval) 1237 while (tp->token != Opt_onerror_panic && tp->token != mval)
1238 ++tp; 1238 ++tp;