12 files changed, 587 insertions, 83 deletions
diff --git a/fs/Kconfig.binfmt b/fs/Kconfig.binfmt
index 79e2ca7973b7..e95d1b64082c 100644
--- a/fs/Kconfig.binfmt
+++ b/fs/Kconfig.binfmt
@@ -27,6 +27,9 @@ config COMPAT_BINFMT_ELF
        bool
        depends on COMPAT && BINFMT_ELF
+config ARCH_BINFMT_ELF_RANDOMIZE_PIE
+        bool
 config BINFMT_ELF_FDPIC
        bool "Kernel support for FDPIC ELF binaries"
        default y
diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c
index 21ac5ee4b43f..bcb884e2d613 100644
--- a/fs/binfmt_elf.c
+++ b/fs/binfmt_elf.c
@@ -794,7 +794,7 @@ static int load_elf_binary(struct linux_binprm *bprm, struct pt_regs *regs)
                         * default mmap base, as well as whatever program they
                         * might try to exec.  This is because the brk will
                         * follow the loader, and is not movable.  */
-#if defined(CONFIG_X86) || defined(CONFIG_ARM)
+#ifdef CONFIG_ARCH_BINFMT_ELF_RANDOMIZE_PIE
                        /* Memory randomization might have been switched off
                         * in runtime via sysctl.
                         * If that is the case, retain the original non-zero
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index 97fbe939c050..20375e6691c3 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -1081,7 +1081,7 @@ static noinline int prepare_pages(struct btrfs_root *root, struct file *file,
 again:
        for (i = 0; i < num_pages; i++) {
                pages[i] = find_or_create_page(inode->i_mapping, index + i,
-                                               mask);
+                                               mask | __GFP_WRITE);
                if (!pages[i]) {
                        faili = i - 1;
                        err = -ENOMEM;
diff --git a/fs/exec.c b/fs/exec.c
index 3f64b9f26e7d..aeb135c7ff5c 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -59,6 +59,8 @@
 #include <asm/uaccess.h>
 #include <asm/mmu_context.h>
 #include <asm/tlb.h>
+#include <trace/events/task.h>
 #include "internal.h"
 int core_uses_pid;
@@ -1054,6 +1056,8 @@ void set_task_comm(struct task_struct *tsk, char *buf)
 {
        task_lock(tsk);
+        trace_task_rename(tsk, buf);
        /*
         * Threads may access current->comm without holding
         * the task lock, so write the string carefully.
diff --git a/fs/inode.c b/fs/inode.c
index 87535753ab04..4fa4f0916af9 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -776,6 +776,8 @@ void prune_icache_sb(struct super_block *sb, int nr_to_scan)
        else
                __count_vm_events(PGINODESTEAL, reap);
        spin_unlock(&sb->s_inode_lru_lock);
+        if (current->reclaim_state)
+                current->reclaim_state->reclaimed_slab += reap;
        dispose_list(&freeable);
 }
diff --git a/fs/proc/base.c b/fs/proc/base.c
index a1dddda999f2..8173dfd89cb2 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -83,9 +83,11 @@
 #include <linux/pid_namespace.h>
 #include <linux/fs_struct.h>
 #include <linux/slab.h>
+#include <linux/flex_array.h>
 #ifdef CONFIG_HARDWALL
 #include <asm/hardwall.h>
 #endif
+#include <trace/events/oom.h>
 #include "internal.h"
 /* NOTE:
@@ -133,6 +135,8 @@ struct pid_entry {
                NULL, &proc_single_file_operations,     \
                { .proc_show = show } )
+static int proc_fd_permission(struct inode *inode, int mask);
 /*
 * Count the number of hardlinks for the pid_entry table, excluding the .
 * and .. links.
@@ -165,9 +169,9 @@ static int get_task_root(struct task_struct *task, struct path *root)
        return result;
 }
-static int proc_cwd_link(struct inode *inode, struct path *path)
+static int proc_cwd_link(struct dentry *dentry, struct path *path)
 {
-        struct task_struct *task = get_proc_task(inode);
+        struct task_struct *task = get_proc_task(dentry->d_inode);
        int result = -ENOENT;
        if (task) {
@@ -182,9 +186,9 @@ static int proc_cwd_link(struct inode *inode, struct path *path)
        return result;
 }
-static int proc_root_link(struct inode *inode, struct path *path)
+static int proc_root_link(struct dentry *dentry, struct path *path)
 {
-        struct task_struct *task = get_proc_task(inode);
+        struct task_struct *task = get_proc_task(dentry->d_inode);
        int result = -ENOENT;
        if (task) {
@@ -627,6 +631,50 @@ int proc_setattr(struct dentry *dentry, struct iattr *attr)
        return 0;
 }
+/*
+ * Decide whether the current process may see information about @task in
+ * the procfs instance described by @pid.  @hide_pid_min is the lowest
+ * hide_pid level at which the information is restricted: callers pass 1
+ * for sched/cmdline info and 2 for euid/egid.
+ *
+ * Access is granted when the instance's hide_pid level is below
+ * @hide_pid_min, when current is a member of the pid_gid group, or when
+ * current is allowed ptrace read access to @task.
+ */
+static bool has_pid_permissions(struct pid_namespace *pid,
+                                 struct task_struct *task,
+                                 int hide_pid_min)
+{
+        return pid->hide_pid < hide_pid_min ||
+               in_group_p(pid->pid_gid) ||
+               ptrace_may_access(task, PTRACE_MODE_READ);
+}
+/*
+ * ->permission handler that applies the hidepid= policy before falling
+ * back to generic_permission().
+ *
+ * Returns -ESRCH when the task behind @inode no longer exists, -ENOENT
+ * when access is denied and hide_pid == 2, -EPERM when access is denied
+ * at any other hide_pid level, and otherwise whatever
+ * generic_permission() returns for @mask.
+ */
+static int proc_pid_permission(struct inode *inode, int mask)
+{
+        struct pid_namespace *pid = inode->i_sb->s_fs_info;
+        struct task_struct *task;
+        bool has_perms;
+        task = get_proc_task(inode);
+        /*
+         * get_proc_task() returns NULL once the task is gone; bail out
+         * instead of handing a NULL task to has_pid_permissions() and
+         * put_task_struct().
+         */
+        if (!task)
+                return -ESRCH;
+        has_perms = has_pid_permissions(pid, task, 1);
+        put_task_struct(task);
+        if (!has_perms) {
+                if (pid->hide_pid == 2) {
+                        /*
+                         * Let's make getdents(), stat(), and open()
+                         * consistent with each other.  If a process
+                         * may not stat() a file, it shouldn't be seen
+                         * in procfs at all.
+                         */
+                        return -ENOENT;
+                }
+                return -EPERM;
+        }
+        return generic_permission(inode, mask);
+}
 static const struct inode_operations proc_def_inode_operations = {
        .setattr        = proc_setattr,
 };
@@ -1010,6 +1058,7 @@ static ssize_t oom_adjust_write(struct file *file, const char __user *buf,
        else
                task->signal->oom_score_adj = (oom_adjust * OOM_SCORE_ADJ_MAX) /
                                                                -OOM_DISABLE;
+        trace_oom_score_adj_update(task);
 err_sighand:
        unlock_task_sighand(task, &flags);
 err_task_lock:
@@ -1097,6 +1146,7 @@ static ssize_t oom_score_adj_write(struct file *file, const char __user *buf,
        task->signal->oom_score_adj = oom_score_adj;
        if (has_capability_noaudit(current, CAP_SYS_RESOURCE))
                task->signal->oom_score_adj_min = oom_score_adj;
+        trace_oom_score_adj_update(task);
        /*
         * Scale /proc/pid/oom_adj appropriately ensuring that OOM_DISABLE is
         * always attainable.
@@ -1453,13 +1503,13 @@ static const struct file_operations proc_pid_set_comm_operations = {
        .release        = single_release,
 };
-static int proc_exe_link(struct inode *inode, struct path *exe_path)
+static int proc_exe_link(struct dentry *dentry, struct path *exe_path)
 {
        struct task_struct *task;
        struct mm_struct *mm;
        struct file *exe_file;
-        task = get_proc_task(inode);
+        task = get_proc_task(dentry->d_inode);
        if (!task)
                return -ENOENT;
        mm = get_task_mm(task);
@@ -1489,7 +1539,7 @@ static void *proc_pid_follow_link(struct dentry *dentry, struct nameidata *nd)
        if (!proc_fd_access_allowed(inode))
                goto out;
-        error = PROC_I(inode)->op.proc_get_link(inode, &nd->path);
+        error = PROC_I(inode)->op.proc_get_link(dentry, &nd->path);
 out:
        return ERR_PTR(error);
 }
@@ -1528,7 +1578,7 @@ static int proc_pid_readlink(struct dentry * dentry, char __user * buffer, int b
        if (!proc_fd_access_allowed(inode))
                goto out;
-        error = PROC_I(inode)->op.proc_get_link(inode, &path);
+        error = PROC_I(inode)->op.proc_get_link(dentry, &path);
        if (error)
                goto out;
@@ -1609,6 +1659,7 @@ int pid_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat)
        struct inode *inode = dentry->d_inode;
        struct task_struct *task;
        const struct cred *cred;
+        struct pid_namespace *pid = dentry->d_sb->s_fs_info;
        generic_fillattr(inode, stat);
@@ -1617,6 +1668,14 @@ int pid_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat)
        stat->gid = 0;
        task = pid_task(proc_pid(inode), PIDTYPE_PID);
        if (task) {
+                if (!has_pid_permissions(pid, task, 2)) {
+                        rcu_read_unlock();
+                        /*
+                         * This doesn't prevent learning whether PID exists,
+                         * it only makes getattr() consistent with readdir().
+                         */
+                        return -ENOENT;
+                }
                if ((inode->i_mode == (S_IFDIR|S_IRUGO|S_IXUGO)) ||
                    task_dumpable(task)) {
                        cred = __task_cred(task);
@@ -1820,9 +1879,9 @@ static int proc_fd_info(struct inode *inode, struct path *path, char *info)
        return -ENOENT;
 }
-static int proc_fd_link(struct inode *inode, struct path *path)
+static int proc_fd_link(struct dentry *dentry, struct path *path)
 {
-        return proc_fd_info(inode, path, NULL);
+        return proc_fd_info(dentry->d_inode, path, NULL);
 }
 static int tid_fd_revalidate(struct dentry *dentry, struct nameidata *nd)
@@ -2043,6 +2102,355 @@ static const struct file_operations proc_fd_operations = {
        .llseek         = default_llseek,
 };
+#ifdef CONFIG_CHECKPOINT_RESTORE
+/*
+ * dname_to_vma_addr - parse a dentry name of the form "<start>-<end>"
+ * (two hexadecimal numbers) into the vma start and end addresses.
+ *
+ * Returns 0 when both values were parsed, -EINVAL otherwise.
+ */
+static int dname_to_vma_addr(struct dentry *dentry,
+                             unsigned long *start, unsigned long *end)
+{
+        int nr_parsed = sscanf(dentry->d_name.name, "%lx-%lx", start, end);
+        return nr_parsed == 2 ? 0 : -EINVAL;
+}
+/*
+ * ->d_revalidate for /proc/<pid>/map_files/<start>-<end> entries.
+ *
+ * Returns 1 when the task still has a vma whose range exactly matches the
+ * dentry name; in that case the inode's owner is refreshed from the task.
+ * Returns -ECHILD when called with LOOKUP_RCU set, -EACCES when the
+ * caller lacks CAP_SYS_ADMIN, and 0 in every other case.  Whenever the
+ * result is <= 0 (other than the early -ECHILD return) the dentry is
+ * dropped.
+ */
+static int map_files_d_revalidate(struct dentry *dentry, struct nameidata *nd)
+{
+        unsigned long vm_start, vm_end;
+        bool exact_vma_exists = false;
+        struct mm_struct *mm = NULL;
+        struct task_struct *task;
+        const struct cred *cred;
+        struct inode *inode;
+        int status = 0;
+        /* This handler does not revalidate in RCU lookup mode. */
+        if (nd && nd->flags & LOOKUP_RCU)
+                return -ECHILD;
+        if (!capable(CAP_SYS_ADMIN)) {
+                status = -EACCES;
+                goto out_notask;
+        }
+        inode = dentry->d_inode;
+        task = get_proc_task(inode);
+        if (!task)
+                goto out_notask;
+        if (!ptrace_may_access(task, PTRACE_MODE_READ))
+                goto out;
+        mm = get_task_mm(task);
+        if (!mm)
+                goto out;
+        /* An unparsable name simply leaves exact_vma_exists false. */
+        if (!dname_to_vma_addr(dentry, &vm_start, &vm_end)) {
+                down_read(&mm->mmap_sem);
+                exact_vma_exists = !!find_exact_vma(mm, vm_start, vm_end);
+                up_read(&mm->mmap_sem);
+        }
+        mmput(mm);
+        if (exact_vma_exists) {
+                /*
+                 * Dumpable tasks expose their effective uid/gid as the
+                 * owner; otherwise the entry is owned by uid/gid 0.
+                 */
+                if (task_dumpable(task)) {
+                        rcu_read_lock();
+                        cred = __task_cred(task);
+                        inode->i_uid = cred->euid;
+                        inode->i_gid = cred->egid;
+                        rcu_read_unlock();
+                } else {
+                        inode->i_uid = 0;
+                        inode->i_gid = 0;
+                }
+                security_task_to_inode(task, inode);
+                status = 1;
+        }
+out:
+        put_task_struct(task);
+out_notask:
+        /* Invalid (0) or error (<0): unhash the dentry. */
+        if (status <= 0)
+                d_drop(dentry);
+        return status;
+}
+/*
+ * Dentry operations installed on map_files entries by
+ * proc_map_files_instantiate().
+ */
+static const struct dentry_operations tid_map_files_dentry_operations = {
+        .d_revalidate   = map_files_d_revalidate,
+        .d_delete       = pid_delete_dentry,
+};
+/*
+ * proc_get_link handler for /proc/<pid>/map_files/<start>-<end>.
+ *
+ * Looks up the vma of the task whose range exactly matches the dentry
+ * name and, if it is file backed, stores the file's path in @path with a
+ * reference taken via path_get().
+ *
+ * Returns 0 on success (and only then is @path valid), -EINVAL if the
+ * dentry name cannot be parsed, and -ENOENT if the task or its mm is
+ * gone or no matching file-backed vma exists.
+ */
+static int proc_map_files_get_link(struct dentry *dentry, struct path *path)
+{
+        unsigned long vm_start, vm_end;
+        struct vm_area_struct *vma;
+        struct task_struct *task;
+        struct mm_struct *mm;
+        int rc;
+        rc = -ENOENT;
+        task = get_proc_task(dentry->d_inode);
+        if (!task)
+                goto out;
+        mm = get_task_mm(task);
+        put_task_struct(task);
+        if (!mm)
+                goto out;
+        rc = dname_to_vma_addr(dentry, &vm_start, &vm_end);
+        if (rc)
+                goto out_mmput;
+        /*
+         * rc is 0 here; reset it so that a missing or anonymous vma is
+         * reported as -ENOENT rather than as success with @path left
+         * untouched.
+         */
+        rc = -ENOENT;
+        down_read(&mm->mmap_sem);
+        vma = find_exact_vma(mm, vm_start, vm_end);
+        if (vma && vma->vm_file) {
+                *path = vma->vm_file->f_path;
+                path_get(path);
+                rc = 0;
+        }
+        up_read(&mm->mmap_sem);
+out_mmput:
+        mmput(mm);
+out:
+        return rc;
+}
+/*
+ * Per-entry record collected by proc_map_files_readdir() while mmap_sem
+ * is held, so that the directory entries can be emitted after the lock
+ * has been released.
+ */
+struct map_files_info {
+        struct file     *file;  /* vma->vm_file, with a reference held */
+        unsigned long   len;    /* length of name as returned by snprintf() */
+        unsigned char   name[4*sizeof(long)+2]; /* max: %lx-%lx\0 */
+};
+/*
+ * Create the symlink inode for one map_files entry and attach it to
+ * @dentry.  @ptr is the struct file backing the vma; the link's owner
+ * read/write permission bits mirror the FMODE_READ/FMODE_WRITE bits of
+ * that file.
+ *
+ * Returns NULL on success and ERR_PTR(-ENOENT) when @ptr is NULL or the
+ * inode cannot be created.
+ */
+static struct dentry *
+proc_map_files_instantiate(struct inode *dir, struct dentry *dentry,
+                           struct task_struct *task, const void *ptr)
+{
+        const struct file *mapped = ptr;
+        struct inode *link;
+        if (!mapped)
+                return ERR_PTR(-ENOENT);
+        link = proc_pid_make_inode(dir->i_sb, task);
+        if (!link)
+                return ERR_PTR(-ENOENT);
+        PROC_I(link)->op.proc_get_link = proc_map_files_get_link;
+        link->i_op = &proc_pid_link_inode_operations;
+        link->i_size = 64;
+        link->i_mode = S_IFLNK |
+                       ((mapped->f_mode & FMODE_READ) ? S_IRUSR : 0) |
+                       ((mapped->f_mode & FMODE_WRITE) ? S_IWUSR : 0);
+        d_set_d_op(dentry, &tid_map_files_dentry_operations);
+        d_add(dentry, link);
+        return NULL;
+}
+/*
+ * ->lookup for /proc/<pid>/map_files.
+ *
+ * Requires CAP_SYS_ADMIN and a successful lock_trace() on the task,
+ * otherwise ERR_PTR(-EACCES) is returned.  ERR_PTR(-ENOENT) is returned
+ * when the task or its mm is gone, the name does not parse as
+ * "<start>-<end>", or no vma with exactly that range exists.  Otherwise
+ * the result of proc_map_files_instantiate() is returned.
+ */
+static struct dentry *proc_map_files_lookup(struct inode *dir,
+                struct dentry *dentry, struct nameidata *nd)
+{
+        unsigned long vm_start, vm_end;
+        struct vm_area_struct *vma;
+        struct task_struct *task;
+        struct dentry *result;
+        struct mm_struct *mm;
+        if (!capable(CAP_SYS_ADMIN))
+                return ERR_PTR(-EACCES);
+        task = get_proc_task(dir);
+        if (!task)
+                return ERR_PTR(-ENOENT);
+        if (lock_trace(task)) {
+                result = ERR_PTR(-EACCES);
+                goto out_put_task;
+        }
+        /* Every failure from here on is reported as "no such entry". */
+        result = ERR_PTR(-ENOENT);
+        if (dname_to_vma_addr(dentry, &vm_start, &vm_end))
+                goto out_unlock;
+        mm = get_task_mm(task);
+        if (!mm)
+                goto out_unlock;
+        down_read(&mm->mmap_sem);
+        vma = find_exact_vma(mm, vm_start, vm_end);
+        if (vma)
+                result = proc_map_files_instantiate(dir, dentry, task,
+                                                    vma->vm_file);
+        up_read(&mm->mmap_sem);
+        mmput(mm);
+out_unlock:
+        unlock_trace(task);
+out_put_task:
+        put_task_struct(task);
+        return result;
+}
+/* Inode operations of the /proc/<pid>/map_files directory itself. */
+static const struct inode_operations proc_map_files_inode_operations = {
+        .lookup         = proc_map_files_lookup,
+        .permission     = proc_fd_permission,
+        .setattr        = proc_setattr,
+};
+/*
+ * ->readdir for /proc/<pid>/map_files: emits ".", ".." and then one
+ * "<start>-<end>" entry per file-backed vma of the task.
+ *
+ * Returns -EACCES when the caller lacks CAP_SYS_ADMIN or lock_trace()
+ * fails, -ENOENT when the task is gone, -ENOMEM when the temporary array
+ * cannot be allocated, otherwise 0 or the first non-zero value returned
+ * by proc_fill_cache().
+ */
+static int
+proc_map_files_readdir(struct file *filp, void *dirent, filldir_t filldir)
+{
+        struct dentry *dentry = filp->f_path.dentry;
+        struct inode *inode = dentry->d_inode;
+        struct vm_area_struct *vma;
+        struct task_struct *task;
+        struct mm_struct *mm;
+        ino_t ino;
+        int ret;
+        ret = -EACCES;
+        if (!capable(CAP_SYS_ADMIN))
+                goto out;
+        ret = -ENOENT;
+        task = get_proc_task(inode);
+        if (!task)
+                goto out;
+        ret = -EACCES;
+        if (lock_trace(task))
+                goto out_put_task;
+        ret = 0;
+        switch (filp->f_pos) {
+        case 0:
+                ino = inode->i_ino;
+                if (filldir(dirent, ".", 1, 0, ino, DT_DIR) < 0)
+                        goto out_unlock;
+                filp->f_pos++;
+                /* fall through */
+        case 1:
+                ino = parent_ino(dentry);
+                if (filldir(dirent, "..", 2, 1, ino, DT_DIR) < 0)
+                        goto out_unlock;
+                filp->f_pos++;
+                /* fall through */
+        default:
+        {
+                unsigned long nr_files, pos, i;
+                struct flex_array *fa = NULL;
+                struct map_files_info info;
+                struct map_files_info *p;
+                mm = get_task_mm(task);
+                if (!mm)
+                        goto out_unlock;
+                down_read(&mm->mmap_sem);
+                nr_files = 0;
+                /*
+                 * We need two passes here:
+                 *
+                 *  1) Collect vmas of mapped files with mmap_sem taken
+                 *  2) Release mmap_sem and instantiate entries
+                 *
+                 * otherwise lockdep complains, since the filldir()
+                 * routine might need to take mmap_sem in might_fault().
+                 */
+                /*
+                 * Count the file-backed vmas at or beyond the current
+                 * position; positions 0 and 1 are "." and "..".
+                 */
+                for (vma = mm->mmap, pos = 2; vma; vma = vma->vm_next) {
+                        if (vma->vm_file && ++pos > filp->f_pos)
+                                nr_files++;
+                }
+                if (nr_files) {
+                        fa = flex_array_alloc(sizeof(info), nr_files,
+                                                GFP_KERNEL);
+                        if (!fa || flex_array_prealloc(fa, 0, nr_files,
+                                                        GFP_KERNEL)) {
+                                ret = -ENOMEM;
+                                if (fa)
+                                        flex_array_free(fa);
+                                up_read(&mm->mmap_sem);
+                                mmput(mm);
+                                goto out_unlock;
+                        }
+                        /*
+                         * Pass 1: record name and file of each entry,
+                         * taking a file reference that is dropped after
+                         * the entry has been handled below.
+                         */
+                        for (i = 0, vma = mm->mmap, pos = 2; vma;
+                                        vma = vma->vm_next) {
+                                if (!vma->vm_file)
+                                        continue;
+                                if (++pos <= filp->f_pos)
+                                        continue;
+                                get_file(vma->vm_file);
+                                info.file = vma->vm_file;
+                                info.len = snprintf(info.name,
+                                                sizeof(info.name), "%lx-%lx",
+                                                vma->vm_start, vma->vm_end);
+                                if (flex_array_put(fa, i++, &info, GFP_KERNEL))
+                                        BUG();
+                        }
+                }
+                up_read(&mm->mmap_sem);
+                /* Pass 2: emit the collected entries without mmap_sem. */
+                for (i = 0; i < nr_files; i++) {
+                        p = flex_array_get(fa, i);
+                        ret = proc_fill_cache(filp, dirent, filldir,
+                                              p->name, p->len,
+                                              proc_map_files_instantiate,
+                                              task, p->file);
+                        if (ret)
+                                break;
+                        filp->f_pos++;
+                        fput(p->file);
+                }
+                for (; i < nr_files; i++) {
+                        /*
+                         * In case of error don't forget
+                         * to put rest of file refs.
+                         */
+                        p = flex_array_get(fa, i);
+                        fput(p->file);
+                }
+                if (fa)
+                        flex_array_free(fa);
+                mmput(mm);
+        }
+        }
+out_unlock:
+        unlock_trace(task);
+out_put_task:
+        put_task_struct(task);
+out:
+        return ret;
+}
+/* File operations of the /proc/<pid>/map_files directory. */
+static const struct file_operations proc_map_files_operations = {
+        .read           = generic_read_dir,
+        .readdir        = proc_map_files_readdir,
+        .llseek         = default_llseek,
+};
+#endif /* CONFIG_CHECKPOINT_RESTORE */
 /*
 * /proc/pid/fd needs a special permission handler so that a process can still
 * access /proc/self/fd after it has executed a setuid().
@@ -2658,6 +3066,9 @@ static const struct inode_operations proc_task_inode_operations;
 static const struct pid_entry tgid_base_stuff[] = {
        DIR("task",       S_IRUGO|S_IXUGO, proc_task_inode_operations, proc_task_operations),
        DIR("fd",         S_IRUSR|S_IXUSR, proc_fd_inode_operations, proc_fd_operations),
+#ifdef CONFIG_CHECKPOINT_RESTORE
+        DIR("map_files",  S_IRUSR|S_IXUSR, proc_map_files_inode_operations, proc_map_files_operations),
+#endif
        DIR("fdinfo",     S_IRUSR|S_IXUSR, proc_fdinfo_inode_operations, proc_fdinfo_operations),
        DIR("ns",         S_IRUSR|S_IXUGO, proc_ns_dir_inode_operations, proc_ns_dir_operations),
 #ifdef CONFIG_NET
@@ -2761,6 +3172,7 @@ static const struct inode_operations proc_tgid_base_inode_operations = {
        .lookup         = proc_tgid_base_lookup,
        .getattr        = pid_getattr,
        .setattr        = proc_setattr,
+        .permission     = proc_pid_permission,
 };
 static void proc_flush_task_mnt(struct vfsmount *mnt, pid_t pid, pid_t tgid)
@@ -2964,6 +3376,12 @@ static int proc_pid_fill_cache(struct file *filp, void *dirent, filldir_t filldi
                                proc_pid_instantiate, iter.task, NULL);
 }
+/*
+ * filldir_t callback that ignores the entry and reports success.
+ * proc_pid_readdir() uses it in place of the caller's filldir for tasks
+ * that fail has_pid_permissions(ns, task, 2), so that nothing is emitted
+ * for them.
+ */
+static int fake_filldir(void *buf, const char *name, int namelen,
+                        loff_t offset, u64 ino, unsigned d_type)
+{
+        return 0;
+}
 /* for the /proc/ directory itself, after non-process stuff has been done */
 int proc_pid_readdir(struct file * filp, void * dirent, filldir_t filldir)
 {
@@ -2971,6 +3389,7 @@ int proc_pid_readdir(struct file * filp, void * dirent, filldir_t filldir)
        struct task_struct *reaper;
        struct tgid_iter iter;
        struct pid_namespace *ns;
+        filldir_t __filldir;
        if (filp->f_pos >= PID_MAX_LIMIT + TGID_OFFSET)
                goto out_no_task;
@@ -2992,8 +3411,13 @@ int proc_pid_readdir(struct file * filp, void * dirent, filldir_t filldir)
        for (iter = next_tgid(ns, iter);
             iter.task;
             iter.tgid += 1, iter = next_tgid(ns, iter)) {
+                if (has_pid_permissions(ns, iter.task, 2))
+                        __filldir = filldir;
+                else
+                        __filldir = fake_filldir;
                filp->f_pos = iter.tgid + TGID_OFFSET;
-                if (proc_pid_fill_cache(filp, dirent, filldir, iter) < 0) {
+                if (proc_pid_fill_cache(filp, dirent, __filldir, iter) < 0) {
                        put_task_struct(iter.task);
                        goto out;
                }
@@ -3328,6 +3752,7 @@ static const struct inode_operations proc_task_inode_operations = {
        .lookup         = proc_task_lookup,
        .getattr        = proc_task_getattr,
        .setattr        = proc_setattr,
+        .permission     = proc_pid_permission,
 };
 static const struct file_operations proc_task_operations = {
diff --git a/fs/proc/inode.c b/fs/proc/inode.c
index 51a176622b8f..84fd3235a590 100644
--- a/fs/proc/inode.c
+++ b/fs/proc/inode.c
@@ -7,6 +7,7 @@
 #include <linux/time.h>
 #include <linux/proc_fs.h>
 #include <linux/kernel.h>
+#include <linux/pid_namespace.h>
 #include <linux/mm.h>
 #include <linux/string.h>
 #include <linux/stat.h>
@@ -17,7 +18,9 @@
 #include <linux/init.h>
 #include <linux/module.h>
 #include <linux/sysctl.h>
+#include <linux/seq_file.h>
 #include <linux/slab.h>
+#include <linux/mount.h>
 #include <asm/system.h>
 #include <asm/uaccess.h>
@@ -101,12 +104,27 @@ void __init proc_init_inodecache(void)
                                             init_once);
 }
+/*
+ * ->show_options: print the non-default procfs mount options.  "gid=" is
+ * shown when pid_gid is non-zero and "hidepid=" when hide_pid is
+ * non-zero.  Always returns 0.
+ */
+static int proc_show_options(struct seq_file *seq, struct dentry *root)
+{
+        struct pid_namespace *ns = root->d_sb->s_fs_info;
+        if (ns->pid_gid)
+                seq_printf(seq, ",gid=%lu", (unsigned long)ns->pid_gid);
+        if (ns->hide_pid)
+                seq_printf(seq, ",hidepid=%u", ns->hide_pid);
+        return 0;
+}
 static const struct super_operations proc_sops = {
        .alloc_inode    = proc_alloc_inode,
        .destroy_inode  = proc_destroy_inode,
        .drop_inode     = generic_delete_inode,
        .evict_inode    = proc_evict_inode,
        .statfs         = simple_statfs,
+        .remount_fs     = proc_remount,
+        .show_options   = proc_show_options,
 };
 static void __pde_users_dec(struct proc_dir_entry *pde)
diff --git a/fs/proc/internal.h b/fs/proc/internal.h
index 7838e5cfec14..292577531ad1 100644
--- a/fs/proc/internal.h
+++ b/fs/proc/internal.h
@@ -117,6 +117,7 @@ void pde_put(struct proc_dir_entry *pde);
 int proc_fill_super(struct super_block *);
 struct inode *proc_get_inode(struct super_block *, struct proc_dir_entry *);
+int proc_remount(struct super_block *sb, int *flags, char *data);
 /*
 * These are generic /proc routines that use the internal
diff --git a/fs/proc/root.c b/fs/proc/root.c
index 03102d978180..46a15d8a29ca 100644
--- a/fs/proc/root.c
+++ b/fs/proc/root.c
@@ -18,6 +18,7 @@
 #include <linux/bitops.h>
 #include <linux/mount.h>
 #include <linux/pid_namespace.h>
+#include <linux/parser.h>
 #include "internal.h"
@@ -36,6 +37,63 @@ static int proc_set_super(struct super_block *sb, void *data)
        return err;
 }
+/* Token ids for the mount options understood by proc_parse_options(). */
+enum {
+        Opt_gid, Opt_hidepid, Opt_err,
+};
+/* match_token() table mapping option patterns to the token ids above. */
+static const match_table_t tokens = {
+        {Opt_hidepid, "hidepid=%u"},
+        {Opt_gid, "gid=%u"},
+        {Opt_err, NULL},
+};
+/*
+ * Parse a comma separated procfs mount option string and store the
+ * results in @pid.  Recognised options are "gid=<n>" and "hidepid=<n>"
+ * with <n> in the range 0..2; empty items are skipped.
+ *
+ * Note the return convention: 1 on success (including a NULL @options),
+ * 0 on any parse error.
+ */
+static int proc_parse_options(char *options, struct pid_namespace *pid)
+{
+        substring_t args[MAX_OPT_ARGS];
+        char *opt;
+        int value;
+        if (!options)
+                return 1;
+        for (opt = strsep(&options, ","); opt; opt = strsep(&options, ",")) {
+                if (!*opt)
+                        continue;
+                args[0].to = args[0].from = NULL;
+                switch (match_token(opt, tokens, args)) {
+                case Opt_gid:
+                        if (match_int(&args[0], &value))
+                                return 0;
+                        pid->pid_gid = value;
+                        break;
+                case Opt_hidepid:
+                        if (match_int(&args[0], &value))
+                                return 0;
+                        if (value < 0 || value > 2) {
+                                pr_err("proc: hidepid value must be between 0 and 2.\n");
+                                return 0;
+                        }
+                        pid->hide_pid = value;
+                        break;
+                default:
+                        pr_err("proc: unrecognized mount option \"%s\" "
+                               "or missing value\n", opt);
+                        return 0;
+                }
+        }
+        return 1;
+}
+/*
+ * ->remount_fs for procfs: re-parse the mount options in @data into the
+ * pid namespace attached to @sb.  @flags is not used.
+ *
+ * Returns 0 on success and -EINVAL when the options cannot be parsed,
+ * matching the error proc_mount() reports for the same failure.
+ */
+int proc_remount(struct super_block *sb, int *flags, char *data)
+{
+        struct pid_namespace *pid = sb->s_fs_info;
+        /* proc_parse_options() returns 1 on success and 0 on error. */
+        if (!proc_parse_options(data, pid))
+                return -EINVAL;
+        return 0;
+}
 static struct dentry *proc_mount(struct file_system_type *fs_type,
        int flags, const char *dev_name, void *data)
 {
@@ -43,11 +101,15 @@ static struct dentry *proc_mount(struct file_system_type *fs_type,
        struct super_block *sb;
        struct pid_namespace *ns;
        struct proc_inode *ei;
+        char *options;
-        if (flags & MS_KERNMOUNT)
+        if (flags & MS_KERNMOUNT) {
                ns = (struct pid_namespace *)data;
-        else
+                options = NULL;
+        } else {
                ns = current->nsproxy->pid_ns;
+                options = data;
+        }
        sb = sget(fs_type, proc_test_super, proc_set_super, ns);
        if (IS_ERR(sb))
@@ -55,6 +117,10 @@ static struct dentry *proc_mount(struct file_system_type *fs_type,
        if (!sb->s_root) {
                sb->s_flags = flags;
+                if (!proc_parse_options(options, ns)) {
+                        deactivate_locked_super(sb);
+                        return ERR_PTR(-EINVAL);
+                }
                err = proc_fill_super(sb);
                if (err) {
                        deactivate_locked_super(sb);
diff --git a/fs/reiserfs/bitmap.c b/fs/reiserfs/bitmap.c
index a945cd265228..70de42f09f1d 100644
--- a/fs/reiserfs/bitmap.c
+++ b/fs/reiserfs/bitmap.c
@@ -1364,10 +1364,7 @@ int reiserfs_init_bitmap_cache(struct super_block *sb)
        struct reiserfs_bitmap_info *bitmap;
        unsigned int bmap_nr = reiserfs_bmap_count(sb);
-        /* Avoid lock recursion in fault case */
-        reiserfs_write_unlock(sb);
        bitmap = vmalloc(sizeof(*bitmap) * bmap_nr);
-        reiserfs_write_lock(sb);
        if (bitmap == NULL)
                return -ENOMEM;
diff --git a/fs/reiserfs/journal.c b/fs/reiserfs/journal.c
index eb711060a6f2..c3cf54fd4de3 100644
--- a/fs/reiserfs/journal.c
+++ b/fs/reiserfs/journal.c
@@ -2678,16 +2678,10 @@ int journal_init(struct super_block *sb, const char *j_dev_name,
        char b[BDEVNAME_SIZE];
        int ret;
-        /*
-         * Unlock here to avoid various RECLAIM-FS-ON <-> IN-RECLAIM-FS
-         * dependency inversion warnings.
-         */
-        reiserfs_write_unlock(sb);
        journal = SB_JOURNAL(sb) = vzalloc(sizeof(struct reiserfs_journal));
        if (!journal) {
                reiserfs_warning(sb, "journal-1256",
                                 "unable to get memory for journal structure");
-                reiserfs_write_lock(sb);
                return 1;
        }
        INIT_LIST_HEAD(&journal->j_bitmap_nodes);
@@ -2695,10 +2689,8 @@ int journal_init(struct super_block *sb, const char *j_dev_name,
        INIT_LIST_HEAD(&journal->j_working_list);
        INIT_LIST_HEAD(&journal->j_journal_list);
        journal->j_persistent_trans = 0;
-        ret = reiserfs_allocate_list_bitmaps(sb, journal->j_list_bitmap,
+        if (reiserfs_allocate_list_bitmaps(sb, journal->j_list_bitmap,
-                                           reiserfs_bmap_count(sb));
+                                           reiserfs_bmap_count(sb)))
-        reiserfs_write_lock(sb);
-        if (ret)
                goto free_and_return;
        allocate_bitmap_nodes(sb);
@@ -2727,27 +2719,11 @@ int journal_init(struct super_block *sb, const char *j_dev_name,
                goto free_and_return;
        }
-        /*
-         * We need to unlock here to avoid creating the following
-         * dependency:
-         * reiserfs_lock -> sysfs_mutex
-         * Because the reiserfs mmap path creates the following dependency:
-         * mm->mmap -> reiserfs_lock, hence we have
-         * mm->mmap -> reiserfs_lock ->sysfs_mutex
-         * This would ends up in a circular dependency with sysfs readdir path
-         * which does sysfs_mutex -> mm->mmap_sem
-         * This is fine because the reiserfs lock is useless in mount path,
-         * at least until we call journal_begin. We keep it for paranoid
-         * reasons.
-         */
-        reiserfs_write_unlock(sb);
        if (journal_init_dev(sb, journal, j_dev_name) != 0) {
-                reiserfs_write_lock(sb);
                reiserfs_warning(sb, "sh-462",
                                 "unable to initialize jornal device");
                goto free_and_return;
        }
-        reiserfs_write_lock(sb);
        rs = SB_DISK_SUPER_BLOCK(sb);
@@ -2829,9 +2805,7 @@ int journal_init(struct super_block *sb, const char *j_dev_name,
        journal->j_mount_id = 10;
        journal->j_state = 0;
        atomic_set(&(journal->j_jlock), 0);
-        reiserfs_write_unlock(sb);
        journal->j_cnode_free_list = allocate_cnodes(num_cnodes);
-        reiserfs_write_lock(sb);
        journal->j_cnode_free_orig = journal->j_cnode_free_list;
        journal->j_cnode_free = journal->j_cnode_free_list ? num_cnodes : 0;
        journal->j_cnode_used = 0;
@@ -2848,24 +2822,37 @@ int journal_init(struct super_block *sb, const char *j_dev_name,
        init_journal_hash(sb);
        jl = journal->j_current_jl;
+        /*
+         * get_list_bitmap() may call flush_commit_list() which
+         * requires the lock. Calling flush_commit_list() shouldn't happen
+         * this early but I like to be paranoid.
+         */
+        reiserfs_write_lock(sb);
        jl->j_list_bitmap = get_list_bitmap(sb, jl);
+        reiserfs_write_unlock(sb);
        if (!jl->j_list_bitmap) {
                reiserfs_warning(sb, "journal-2005",
                                 "get_list_bitmap failed for journal list 0");
                goto free_and_return;
        }
-        if (journal_read(sb) < 0) {
+        /*
+         * journal_read() needs to be inspected in order to push the
+         * lock down further inside it (or even remove the lock).
+         */
+        reiserfs_write_lock(sb);
+        ret = journal_read(sb);
+        reiserfs_write_unlock(sb);
+        if (ret < 0) {
                reiserfs_warning(sb, "reiserfs-2006",
                                 "Replay Failure, unable to mount");
                goto free_and_return;
        }
        reiserfs_mounted_fs_count++;
-        if (reiserfs_mounted_fs_count <= 1) {
+        if (reiserfs_mounted_fs_count <= 1)
-                reiserfs_write_unlock(sb);
                commit_wq = alloc_workqueue("reiserfs", WQ_MEM_RECLAIM, 0);
-                reiserfs_write_lock(sb);
-        }
        INIT_DELAYED_WORK(&journal->j_work, flush_async_commits);
        journal->j_work_sb = sb;
@@ -2896,14 +2883,13 @@ int journal_transaction_should_end(struct reiserfs_transaction_handle *th,
            journal->j_cnode_free < (journal->j_trans_max * 3)) {
                return 1;
        }
-        /* protected by the BKL here */
        journal->j_len_alloc += new_alloc;
        th->t_blocks_allocated += new_alloc ;
        return 0;
 }
-/* this must be called inside a transaction, and requires the
+/* this must be called inside a transaction
-** kernel_lock to be held
 */
 void reiserfs_block_writes(struct reiserfs_transaction_handle *th)
 {
@@ -2914,8 +2900,7 @@ void reiserfs_block_writes(struct reiserfs_transaction_handle *th)
        return;
 }
-/* this must be called without a transaction started, and does not
+/* this must be called without a transaction started
-** require BKL
 */
 void reiserfs_allow_writes(struct super_block *s)
 {
@@ -2924,8 +2909,7 @@ void reiserfs_allow_writes(struct super_block *s)
        wake_up(&journal->j_join_wait);
 }
-/* this must be called without a transaction started, and does not
+/* this must be called without a transaction started
-** require BKL
 */
 void reiserfs_wait_on_write_block(struct super_block *s)
 {
diff --git a/fs/reiserfs/super.c b/fs/reiserfs/super.c
index 1d42e707d5fa..e12d8b97cd4d 100644
--- a/fs/reiserfs/super.c
+++ b/fs/reiserfs/super.c
@@ -1519,9 +1519,7 @@ static int read_super_block(struct super_block *s, int offset)
 static int reread_meta_blocks(struct super_block *s)
 {
        ll_rw_block(READ, 1, &(SB_BUFFER_WITH_SB(s)));
-        reiserfs_write_unlock(s);
        wait_on_buffer(SB_BUFFER_WITH_SB(s));
-        reiserfs_write_lock(s);
        if (!buffer_uptodate(SB_BUFFER_WITH_SB(s))) {
                reiserfs_warning(s, "reiserfs-2504", "error reading the super");
                return 1;
@@ -1746,22 +1744,11 @@ static int reiserfs_fill_super(struct super_block *s, void *data, int silent)
        mutex_init(&REISERFS_SB(s)->lock);
        REISERFS_SB(s)->lock_depth = -1;
-        /*
-         * This function is called with the bkl, which also was the old
-         * locking used here.
-         * do_journal_begin() will soon check if we hold the lock (ie: was the
-         * bkl). This is likely because do_journal_begin() has several another
-         * callers because at this time, it doesn't seem to be necessary to
-         * protect against anything.
-         * Anyway, let's be conservative and lock for now.
-         */
-        reiserfs_write_lock(s);
        jdev_name = NULL;
        if (reiserfs_parse_options
            (s, (char *)data, &(sbi->s_mount_opt), &blocks, &jdev_name,
             &commit_max_age, qf_names, &qfmt) == 0) {
-                goto error;
+                goto error_unlocked;
        }
        if (jdev_name && jdev_name[0]) {
                REISERFS_SB(s)->s_jdev = kstrdup(jdev_name, GFP_KERNEL);
@@ -1777,7 +1764,7 @@ static int reiserfs_fill_super(struct super_block *s, void *data, int silent)
        if (blocks) {
                SWARN(silent, s, "jmacd-7", "resize option for remount only");
-                goto error;
+                goto error_unlocked;
        }
        /* try old format (undistributed bitmap, super block in 8-th 1k block of a device) */
@@ -1787,7 +1774,7 @@ static int reiserfs_fill_super(struct super_block *s, void *data, int silent)
        else if (read_super_block(s, REISERFS_DISK_OFFSET_IN_BYTES)) {
                SWARN(silent, s, "sh-2021", "can not find reiserfs on %s",
                      reiserfs_bdevname(s));
-                goto error;
+                goto error_unlocked;
        }
        rs = SB_DISK_SUPER_BLOCK(s);
@@ -1803,7 +1790,7 @@ static int reiserfs_fill_super(struct super_block *s, void *data, int silent)
                      "or increase size of your LVM partition");
                SWARN(silent, s, "", "Or may be you forgot to "
                      "reboot after fdisk when it told you to");
-                goto error;
+                goto error_unlocked;
        }
        sbi->s_mount_state = SB_REISERFS_STATE(s);
@@ -1811,8 +1798,9 @@ static int reiserfs_fill_super(struct super_block *s, void *data, int silent)
        if ((errval = reiserfs_init_bitmap_cache(s))) {
                SWARN(silent, s, "jmacd-8", "unable to read bitmap");
-                goto error;
+                goto error_unlocked;
        }
        errval = -EINVAL;
 #ifdef CONFIG_REISERFS_CHECK
        SWARN(silent, s, "", "CONFIG_REISERFS_CHECK is set ON");
@@ -1835,24 +1823,26 @@ static int reiserfs_fill_super(struct super_block *s, void *data, int silent)
        if (reiserfs_barrier_flush(s)) {
                printk("reiserfs: using flush barriers\n");
        }
        // set_device_ro(s->s_dev, 1) ;
        if (journal_init(s, jdev_name, old_format, commit_max_age)) {
                SWARN(silent, s, "sh-2022",
                      "unable to initialize journal space");
-                goto error;
+                goto error_unlocked;
        } else {
                jinit_done = 1; /* once this is set, journal_release must be called
                                 ** if we error out of the mount
                                 */
        }
        if (reread_meta_blocks(s)) {
                SWARN(silent, s, "jmacd-9",
                      "unable to reread meta blocks after journal init");
-                goto error;
+                goto error_unlocked;
        }
        if (replay_only(s))
-                goto error;
+                goto error_unlocked;
        if (bdev_read_only(s->s_bdev) && !(s->s_flags & MS_RDONLY)) {
                SWARN(silent, s, "clm-7000",
@@ -1866,9 +1856,19 @@ static int reiserfs_fill_super(struct super_block *s, void *data, int silent)
                         reiserfs_init_locked_inode, (void *)(&args));
        if (!root_inode) {
                SWARN(silent, s, "jmacd-10", "get root inode failed");
-                goto error;
+                goto error_unlocked;
        }
+        /*
+         * This path was assumed to be called with the BKL in the old days.
+         * Now we have inherited the big reiserfs lock from it and many
+         * reiserfs helpers called in the mount path and elsewhere require
+         * this lock to be held even if it's not always necessary. Let's be
+         * conservative and hold it early. The window can be reduced after
+         * careful review of the code.
+         */
+        reiserfs_write_lock(s);
        if (root_inode->i_state & I_NEW) {
                reiserfs_read_locked_inode(root_inode, &args);
                unlock_new_inode(root_inode);
@@ -1995,12 +1995,16 @@ static int reiserfs_fill_super(struct super_block *s, void *data, int silent)
        return (0);
 error:
-        if (jinit_done) {       /* kill the commit thread, free journal ram */
+        reiserfs_write_unlock(s);
+error_unlocked:
+        /* kill the commit thread, free journal ram */
+        if (jinit_done) {
+                reiserfs_write_lock(s);
                journal_release_error(NULL, s);
+                reiserfs_write_unlock(s);
        }
-        reiserfs_write_unlock(s);
        reiserfs_free_bitmap_cache(s);
        if (SB_BUFFER_WITH_SB(s))
                brelse(SB_BUFFER_WITH_SB(s));