Diffstat (limited to 'kernel')
-rw-r--r-- kernel/Makefile | 1
-rw-r--r-- kernel/auditsc.c | 1
-rw-r--r-- kernel/cgroup.c | 430
-rw-r--r-- kernel/cgroup_debug.c | 2
-rw-r--r-- kernel/cpuset.c | 254
-rw-r--r-- kernel/exec_domain.c | 23
-rw-r--r-- kernel/exit.c | 245
-rw-r--r-- kernel/extable.c | 29
-rw-r--r-- kernel/fork.c | 70
-rw-r--r-- kernel/kallsyms.c | 19
-rw-r--r-- kernel/kexec.c | 3
-rw-r--r-- kernel/kmod.c | 10
-rw-r--r-- kernel/lockdep.c | 16
-rw-r--r-- kernel/module.c | 274
-rw-r--r-- kernel/ns_cgroup.c | 14
-rw-r--r-- kernel/panic.c | 115
-rw-r--r-- kernel/params.c | 26
-rw-r--r-- kernel/pid.c | 33
-rw-r--r-- kernel/pid_namespace.c | 15
-rw-r--r-- kernel/power/disk.c | 5
-rw-r--r-- kernel/power/snapshot.c | 9
-rw-r--r-- kernel/power/swsusp.c | 18
-rw-r--r-- kernel/printk.c | 26
-rw-r--r-- kernel/ptrace.c | 103
-rw-r--r-- kernel/rcupdate.c | 44
-rw-r--r-- kernel/relay.c | 8
-rw-r--r-- kernel/sched.c | 68
-rw-r--r-- kernel/signal.c | 63
-rw-r--r-- kernel/slow-work.c | 640
-rw-r--r-- kernel/smp.c | 432
-rw-r--r-- kernel/softirq.c | 6
-rw-r--r-- kernel/spinlock.c | 18
-rw-r--r-- kernel/sys.c | 5
-rw-r--r-- kernel/sysctl.c | 26
-rw-r--r-- kernel/trace/Kconfig | 9
-rw-r--r-- kernel/trace/ftrace.c | 2
-rw-r--r-- kernel/utsname_sysctl.c | 2
-rw-r--r-- kernel/workqueue.c | 41
38 files changed, 2146 insertions(+), 959 deletions(-)
diff --git a/kernel/Makefile b/kernel/Makefile
index e4791b3ba55d..bab1dffe37e9 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -93,6 +93,7 @@ obj-$(CONFIG_HAVE_GENERIC_DMA_COHERENT) += dma-coherent.o
93obj-$(CONFIG_FUNCTION_TRACER) += trace/ 93obj-$(CONFIG_FUNCTION_TRACER) += trace/
94obj-$(CONFIG_TRACING) += trace/ 94obj-$(CONFIG_TRACING) += trace/
95obj-$(CONFIG_SMP) += sched_cpupri.o 95obj-$(CONFIG_SMP) += sched_cpupri.o
96obj-$(CONFIG_SLOW_WORK) += slow-work.o
96 97
97ifneq ($(CONFIG_SCHED_OMIT_FRAME_POINTER),y) 98ifneq ($(CONFIG_SCHED_OMIT_FRAME_POINTER),y)
98# According to Alan Modra <alan@linuxcare.com.au>, the -fno-omit-frame-pointer is 99# According to Alan Modra <alan@linuxcare.com.au>, the -fno-omit-frame-pointer is
diff --git a/kernel/auditsc.c b/kernel/auditsc.c
index 8cbddff6c283..2bfc64786765 100644
--- a/kernel/auditsc.c
+++ b/kernel/auditsc.c
@@ -66,6 +66,7 @@
66#include <linux/syscalls.h> 66#include <linux/syscalls.h>
67#include <linux/inotify.h> 67#include <linux/inotify.h>
68#include <linux/capability.h> 68#include <linux/capability.h>
69#include <linux/fs_struct.h>
69 70
70#include "audit.h" 71#include "audit.h"
71 72
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index c500ca7239b2..382109b5baeb 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -94,7 +94,6 @@ struct cgroupfs_root {
94 char release_agent_path[PATH_MAX]; 94 char release_agent_path[PATH_MAX];
95}; 95};
96 96
97
98/* 97/*
99 * The "rootnode" hierarchy is the "dummy hierarchy", reserved for the 98 * The "rootnode" hierarchy is the "dummy hierarchy", reserved for the
100 * subsystems that are otherwise unattached - it never has more than a 99 * subsystems that are otherwise unattached - it never has more than a
@@ -102,6 +101,39 @@ struct cgroupfs_root {
102 */ 101 */
103static struct cgroupfs_root rootnode; 102static struct cgroupfs_root rootnode;
104 103
104/*
 105 * CSS ID -- ID per subsys's Cgroup Subsys State (CSS). Used only when
106 * cgroup_subsys->use_id != 0.
107 */
108#define CSS_ID_MAX (65535)
109struct css_id {
110 /*
111 * The css to which this ID points. This pointer is set to valid value
112 * after cgroup is populated. If cgroup is removed, this will be NULL.
113 * This pointer is expected to be RCU-safe because destroy()
114 * is called after synchronize_rcu(). But for safe use, css_is_removed()
115 * css_tryget() should be used for avoiding race.
116 */
117 struct cgroup_subsys_state *css;
118 /*
119 * ID of this css.
120 */
121 unsigned short id;
122 /*
123 * Depth in hierarchy which this ID belongs to.
124 */
125 unsigned short depth;
126 /*
127 * ID is freed by RCU. (and lookup routine is RCU safe.)
128 */
129 struct rcu_head rcu_head;
130 /*
 131 * Hierarchy this CSS ID belongs to.
132 */
133 unsigned short stack[0]; /* Array of Length (depth+1) */
134};
135
136
105/* The list of hierarchy roots */ 137/* The list of hierarchy roots */
106 138
107static LIST_HEAD(roots); 139static LIST_HEAD(roots);
@@ -185,6 +217,8 @@ struct cg_cgroup_link {
185static struct css_set init_css_set; 217static struct css_set init_css_set;
186static struct cg_cgroup_link init_css_set_link; 218static struct cg_cgroup_link init_css_set_link;
187 219
220static int cgroup_subsys_init_idr(struct cgroup_subsys *ss);
221
188/* css_set_lock protects the list of css_set objects, and the 222/* css_set_lock protects the list of css_set objects, and the
189 * chain of tasks off each css_set. Nests outside task->alloc_lock 223 * chain of tasks off each css_set. Nests outside task->alloc_lock
190 * due to cgroup_iter_start() */ 224 * due to cgroup_iter_start() */
@@ -567,6 +601,9 @@ static struct backing_dev_info cgroup_backing_dev_info = {
567 .capabilities = BDI_CAP_NO_ACCT_AND_WRITEBACK, 601 .capabilities = BDI_CAP_NO_ACCT_AND_WRITEBACK,
568}; 602};
569 603
604static int alloc_css_id(struct cgroup_subsys *ss,
605 struct cgroup *parent, struct cgroup *child);
606
570static struct inode *cgroup_new_inode(mode_t mode, struct super_block *sb) 607static struct inode *cgroup_new_inode(mode_t mode, struct super_block *sb)
571{ 608{
572 struct inode *inode = new_inode(sb); 609 struct inode *inode = new_inode(sb);
@@ -585,13 +622,18 @@ static struct inode *cgroup_new_inode(mode_t mode, struct super_block *sb)
585 * Call subsys's pre_destroy handler. 622 * Call subsys's pre_destroy handler.
586 * This is called before css refcnt check. 623 * This is called before css refcnt check.
587 */ 624 */
588static void cgroup_call_pre_destroy(struct cgroup *cgrp) 625static int cgroup_call_pre_destroy(struct cgroup *cgrp)
589{ 626{
590 struct cgroup_subsys *ss; 627 struct cgroup_subsys *ss;
628 int ret = 0;
629
591 for_each_subsys(cgrp->root, ss) 630 for_each_subsys(cgrp->root, ss)
592 if (ss->pre_destroy) 631 if (ss->pre_destroy) {
593 ss->pre_destroy(ss, cgrp); 632 ret = ss->pre_destroy(ss, cgrp);
594 return; 633 if (ret)
634 break;
635 }
636 return ret;
595} 637}
596 638
597static void free_cgroup_rcu(struct rcu_head *obj) 639static void free_cgroup_rcu(struct rcu_head *obj)
@@ -685,6 +727,22 @@ static void cgroup_d_remove_dir(struct dentry *dentry)
685 remove_dir(dentry); 727 remove_dir(dentry);
686} 728}
687 729
730/*
 731 * A wait queue for tasks doing cgroup rmdir(). A task will sleep when
 732 * cgroup->count == 0 && list_empty(&cgroup->children) && subsys has some
 733 * reference to css->refcnt. In general, this refcnt is expected to go down
 734 * to zero soon.
735 *
736 * CGRP_WAIT_ON_RMDIR flag is modified under cgroup's inode->i_mutex;
737 */
738DECLARE_WAIT_QUEUE_HEAD(cgroup_rmdir_waitq);
739
740static void cgroup_wakeup_rmdir_waiters(const struct cgroup *cgrp)
741{
742 if (unlikely(test_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags)))
743 wake_up_all(&cgroup_rmdir_waitq);
744}
745
688static int rebind_subsystems(struct cgroupfs_root *root, 746static int rebind_subsystems(struct cgroupfs_root *root,
689 unsigned long final_bits) 747 unsigned long final_bits)
690{ 748{
@@ -857,16 +915,16 @@ static int cgroup_remount(struct super_block *sb, int *flags, char *data)
857 } 915 }
858 916
859 ret = rebind_subsystems(root, opts.subsys_bits); 917 ret = rebind_subsystems(root, opts.subsys_bits);
918 if (ret)
919 goto out_unlock;
860 920
861 /* (re)populate subsystem files */ 921 /* (re)populate subsystem files */
862 if (!ret) 922 cgroup_populate_dir(cgrp);
863 cgroup_populate_dir(cgrp);
864 923
865 if (opts.release_agent) 924 if (opts.release_agent)
866 strcpy(root->release_agent_path, opts.release_agent); 925 strcpy(root->release_agent_path, opts.release_agent);
867 out_unlock: 926 out_unlock:
868 if (opts.release_agent) 927 kfree(opts.release_agent);
869 kfree(opts.release_agent);
870 mutex_unlock(&cgroup_mutex); 928 mutex_unlock(&cgroup_mutex);
871 mutex_unlock(&cgrp->dentry->d_inode->i_mutex); 929 mutex_unlock(&cgrp->dentry->d_inode->i_mutex);
872 return ret; 930 return ret;
@@ -969,15 +1027,13 @@ static int cgroup_get_sb(struct file_system_type *fs_type,
969 /* First find the desired set of subsystems */ 1027 /* First find the desired set of subsystems */
970 ret = parse_cgroupfs_options(data, &opts); 1028 ret = parse_cgroupfs_options(data, &opts);
971 if (ret) { 1029 if (ret) {
972 if (opts.release_agent) 1030 kfree(opts.release_agent);
973 kfree(opts.release_agent);
974 return ret; 1031 return ret;
975 } 1032 }
976 1033
977 root = kzalloc(sizeof(*root), GFP_KERNEL); 1034 root = kzalloc(sizeof(*root), GFP_KERNEL);
978 if (!root) { 1035 if (!root) {
979 if (opts.release_agent) 1036 kfree(opts.release_agent);
980 kfree(opts.release_agent);
981 return -ENOMEM; 1037 return -ENOMEM;
982 } 1038 }
983 1039
@@ -1280,6 +1336,12 @@ int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk)
1280 set_bit(CGRP_RELEASABLE, &oldcgrp->flags); 1336 set_bit(CGRP_RELEASABLE, &oldcgrp->flags);
1281 synchronize_rcu(); 1337 synchronize_rcu();
1282 put_css_set(cg); 1338 put_css_set(cg);
1339
1340 /*
1341 * wake up rmdir() waiter. the rmdir should fail since the cgroup
1342 * is no longer empty.
1343 */
1344 cgroup_wakeup_rmdir_waiters(cgrp);
1283 return 0; 1345 return 0;
1284} 1346}
1285 1347
@@ -1625,7 +1687,7 @@ static struct inode_operations cgroup_dir_inode_operations = {
1625 .rename = cgroup_rename, 1687 .rename = cgroup_rename,
1626}; 1688};
1627 1689
1628static int cgroup_create_file(struct dentry *dentry, int mode, 1690static int cgroup_create_file(struct dentry *dentry, mode_t mode,
1629 struct super_block *sb) 1691 struct super_block *sb)
1630{ 1692{
1631 static const struct dentry_operations cgroup_dops = { 1693 static const struct dentry_operations cgroup_dops = {
@@ -1671,7 +1733,7 @@ static int cgroup_create_file(struct dentry *dentry, int mode,
1671 * @mode: mode to set on new directory. 1733 * @mode: mode to set on new directory.
1672 */ 1734 */
1673static int cgroup_create_dir(struct cgroup *cgrp, struct dentry *dentry, 1735static int cgroup_create_dir(struct cgroup *cgrp, struct dentry *dentry,
1674 int mode) 1736 mode_t mode)
1675{ 1737{
1676 struct dentry *parent; 1738 struct dentry *parent;
1677 int error = 0; 1739 int error = 0;
@@ -1689,6 +1751,33 @@ static int cgroup_create_dir(struct cgroup *cgrp, struct dentry *dentry,
1689 return error; 1751 return error;
1690} 1752}
1691 1753
1754/**
1755 * cgroup_file_mode - deduce file mode of a control file
1756 * @cft: the control file in question
1757 *
1758 * returns cft->mode if ->mode is not 0
1759 * returns S_IRUGO|S_IWUSR if it has both a read and a write handler
1760 * returns S_IRUGO if it has only a read handler
 1761 * returns S_IWUSR if it has only a write handler
1762 */
1763static mode_t cgroup_file_mode(const struct cftype *cft)
1764{
1765 mode_t mode = 0;
1766
1767 if (cft->mode)
1768 return cft->mode;
1769
1770 if (cft->read || cft->read_u64 || cft->read_s64 ||
1771 cft->read_map || cft->read_seq_string)
1772 mode |= S_IRUGO;
1773
1774 if (cft->write || cft->write_u64 || cft->write_s64 ||
1775 cft->write_string || cft->trigger)
1776 mode |= S_IWUSR;
1777
1778 return mode;
1779}
1780
1692int cgroup_add_file(struct cgroup *cgrp, 1781int cgroup_add_file(struct cgroup *cgrp,
1693 struct cgroup_subsys *subsys, 1782 struct cgroup_subsys *subsys,
1694 const struct cftype *cft) 1783 const struct cftype *cft)
@@ -1696,6 +1785,7 @@ int cgroup_add_file(struct cgroup *cgrp,
1696 struct dentry *dir = cgrp->dentry; 1785 struct dentry *dir = cgrp->dentry;
1697 struct dentry *dentry; 1786 struct dentry *dentry;
1698 int error; 1787 int error;
1788 mode_t mode;
1699 1789
1700 char name[MAX_CGROUP_TYPE_NAMELEN + MAX_CFTYPE_NAME + 2] = { 0 }; 1790 char name[MAX_CGROUP_TYPE_NAMELEN + MAX_CFTYPE_NAME + 2] = { 0 };
1701 if (subsys && !test_bit(ROOT_NOPREFIX, &cgrp->root->flags)) { 1791 if (subsys && !test_bit(ROOT_NOPREFIX, &cgrp->root->flags)) {
@@ -1706,7 +1796,8 @@ int cgroup_add_file(struct cgroup *cgrp,
1706 BUG_ON(!mutex_is_locked(&dir->d_inode->i_mutex)); 1796 BUG_ON(!mutex_is_locked(&dir->d_inode->i_mutex));
1707 dentry = lookup_one_len(name, dir, strlen(name)); 1797 dentry = lookup_one_len(name, dir, strlen(name));
1708 if (!IS_ERR(dentry)) { 1798 if (!IS_ERR(dentry)) {
1709 error = cgroup_create_file(dentry, 0644 | S_IFREG, 1799 mode = cgroup_file_mode(cft);
1800 error = cgroup_create_file(dentry, mode | S_IFREG,
1710 cgrp->root->sb); 1801 cgrp->root->sb);
1711 if (!error) 1802 if (!error)
1712 dentry->d_fsdata = (void *)cft; 1803 dentry->d_fsdata = (void *)cft;
@@ -2288,6 +2379,7 @@ static struct cftype files[] = {
2288 .write_u64 = cgroup_tasks_write, 2379 .write_u64 = cgroup_tasks_write,
2289 .release = cgroup_tasks_release, 2380 .release = cgroup_tasks_release,
2290 .private = FILE_TASKLIST, 2381 .private = FILE_TASKLIST,
2382 .mode = S_IRUGO | S_IWUSR,
2291 }, 2383 },
2292 2384
2293 { 2385 {
@@ -2327,6 +2419,17 @@ static int cgroup_populate_dir(struct cgroup *cgrp)
2327 if (ss->populate && (err = ss->populate(ss, cgrp)) < 0) 2419 if (ss->populate && (err = ss->populate(ss, cgrp)) < 0)
2328 return err; 2420 return err;
2329 } 2421 }
2422 /* This cgroup is ready now */
2423 for_each_subsys(cgrp->root, ss) {
2424 struct cgroup_subsys_state *css = cgrp->subsys[ss->subsys_id];
2425 /*
2426 * Update id->css pointer and make this css visible from
 2427 * CSS ID functions. This pointer will be dereferenced
2428 * from RCU-read-side without locks.
2429 */
2430 if (css->id)
2431 rcu_assign_pointer(css->id->css, css);
2432 }
2330 2433
2331 return 0; 2434 return 0;
2332} 2435}
@@ -2338,6 +2441,7 @@ static void init_cgroup_css(struct cgroup_subsys_state *css,
2338 css->cgroup = cgrp; 2441 css->cgroup = cgrp;
2339 atomic_set(&css->refcnt, 1); 2442 atomic_set(&css->refcnt, 1);
2340 css->flags = 0; 2443 css->flags = 0;
2444 css->id = NULL;
2341 if (cgrp == dummytop) 2445 if (cgrp == dummytop)
2342 set_bit(CSS_ROOT, &css->flags); 2446 set_bit(CSS_ROOT, &css->flags);
2343 BUG_ON(cgrp->subsys[ss->subsys_id]); 2447 BUG_ON(cgrp->subsys[ss->subsys_id]);
@@ -2376,7 +2480,7 @@ static void cgroup_unlock_hierarchy(struct cgroupfs_root *root)
2376 * Must be called with the mutex on the parent inode held 2480 * Must be called with the mutex on the parent inode held
2377 */ 2481 */
2378static long cgroup_create(struct cgroup *parent, struct dentry *dentry, 2482static long cgroup_create(struct cgroup *parent, struct dentry *dentry,
2379 int mode) 2483 mode_t mode)
2380{ 2484{
2381 struct cgroup *cgrp; 2485 struct cgroup *cgrp;
2382 struct cgroupfs_root *root = parent->root; 2486 struct cgroupfs_root *root = parent->root;
@@ -2413,6 +2517,10 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry,
2413 goto err_destroy; 2517 goto err_destroy;
2414 } 2518 }
2415 init_cgroup_css(css, ss, cgrp); 2519 init_cgroup_css(css, ss, cgrp);
2520 if (ss->use_id)
2521 if (alloc_css_id(ss, parent, cgrp))
2522 goto err_destroy;
2523 /* At error, ->destroy() callback has to free assigned ID. */
2416 } 2524 }
2417 2525
2418 cgroup_lock_hierarchy(root); 2526 cgroup_lock_hierarchy(root);
@@ -2555,9 +2663,11 @@ static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry)
2555 struct cgroup *cgrp = dentry->d_fsdata; 2663 struct cgroup *cgrp = dentry->d_fsdata;
2556 struct dentry *d; 2664 struct dentry *d;
2557 struct cgroup *parent; 2665 struct cgroup *parent;
2666 DEFINE_WAIT(wait);
2667 int ret;
2558 2668
2559 /* the vfs holds both inode->i_mutex already */ 2669 /* the vfs holds both inode->i_mutex already */
2560 2670again:
2561 mutex_lock(&cgroup_mutex); 2671 mutex_lock(&cgroup_mutex);
2562 if (atomic_read(&cgrp->count) != 0) { 2672 if (atomic_read(&cgrp->count) != 0) {
2563 mutex_unlock(&cgroup_mutex); 2673 mutex_unlock(&cgroup_mutex);
@@ -2573,17 +2683,39 @@ static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry)
2573 * Call pre_destroy handlers of subsys. Notify subsystems 2683 * Call pre_destroy handlers of subsys. Notify subsystems
2574 * that rmdir() request comes. 2684 * that rmdir() request comes.
2575 */ 2685 */
2576 cgroup_call_pre_destroy(cgrp); 2686 ret = cgroup_call_pre_destroy(cgrp);
2687 if (ret)
2688 return ret;
2577 2689
2578 mutex_lock(&cgroup_mutex); 2690 mutex_lock(&cgroup_mutex);
2579 parent = cgrp->parent; 2691 parent = cgrp->parent;
2580 2692 if (atomic_read(&cgrp->count) || !list_empty(&cgrp->children)) {
2581 if (atomic_read(&cgrp->count)
2582 || !list_empty(&cgrp->children)
2583 || !cgroup_clear_css_refs(cgrp)) {
2584 mutex_unlock(&cgroup_mutex); 2693 mutex_unlock(&cgroup_mutex);
2585 return -EBUSY; 2694 return -EBUSY;
2586 } 2695 }
2696 /*
2697 * css_put/get is provided for subsys to grab refcnt to css. In typical
2698 * case, subsystem has no reference after pre_destroy(). But, under
 2699 * hierarchy management, some *temporary* refcnt can be held.
 2700 * To avoid returning -EBUSY to a user, a waitqueue is used. If subsys
2701 * is really busy, it should return -EBUSY at pre_destroy(). wake_up
2702 * is called when css_put() is called and refcnt goes down to 0.
2703 */
2704 set_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags);
2705 prepare_to_wait(&cgroup_rmdir_waitq, &wait, TASK_INTERRUPTIBLE);
2706
2707 if (!cgroup_clear_css_refs(cgrp)) {
2708 mutex_unlock(&cgroup_mutex);
2709 schedule();
2710 finish_wait(&cgroup_rmdir_waitq, &wait);
2711 clear_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags);
2712 if (signal_pending(current))
2713 return -EINTR;
2714 goto again;
2715 }
 2716 /* NO css_tryget() can succeed after this point. */
2717 finish_wait(&cgroup_rmdir_waitq, &wait);
2718 clear_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags);
2587 2719
2588 spin_lock(&release_list_lock); 2720 spin_lock(&release_list_lock);
2589 set_bit(CGRP_REMOVED, &cgrp->flags); 2721 set_bit(CGRP_REMOVED, &cgrp->flags);
@@ -2708,6 +2840,8 @@ int __init cgroup_init(void)
2708 struct cgroup_subsys *ss = subsys[i]; 2840 struct cgroup_subsys *ss = subsys[i];
2709 if (!ss->early_init) 2841 if (!ss->early_init)
2710 cgroup_init_subsys(ss); 2842 cgroup_init_subsys(ss);
2843 if (ss->use_id)
2844 cgroup_subsys_init_idr(ss);
2711 } 2845 }
2712 2846
2713 /* Add init_css_set to the hash table */ 2847 /* Add init_css_set to the hash table */
@@ -3084,18 +3218,19 @@ int cgroup_clone(struct task_struct *tsk, struct cgroup_subsys *subsys,
3084} 3218}
3085 3219
3086/** 3220/**
3087 * cgroup_is_descendant - see if @cgrp is a descendant of current task's cgrp 3221 * cgroup_is_descendant - see if @cgrp is a descendant of @task's cgrp
3088 * @cgrp: the cgroup in question 3222 * @cgrp: the cgroup in question
3223 * @task: the task in question
3089 * 3224 *
3090 * See if @cgrp is a descendant of the current task's cgroup in 3225 * See if @cgrp is a descendant of @task's cgroup in the appropriate
3091 * the appropriate hierarchy. 3226 * hierarchy.
3092 * 3227 *
3093 * If we are sending in dummytop, then presumably we are creating 3228 * If we are sending in dummytop, then presumably we are creating
3094 * the top cgroup in the subsystem. 3229 * the top cgroup in the subsystem.
3095 * 3230 *
3096 * Called only by the ns (nsproxy) cgroup. 3231 * Called only by the ns (nsproxy) cgroup.
3097 */ 3232 */
3098int cgroup_is_descendant(const struct cgroup *cgrp) 3233int cgroup_is_descendant(const struct cgroup *cgrp, struct task_struct *task)
3099{ 3234{
3100 int ret; 3235 int ret;
3101 struct cgroup *target; 3236 struct cgroup *target;
@@ -3105,7 +3240,7 @@ int cgroup_is_descendant(const struct cgroup *cgrp)
3105 return 1; 3240 return 1;
3106 3241
3107 get_first_subsys(cgrp, NULL, &subsys_id); 3242 get_first_subsys(cgrp, NULL, &subsys_id);
3108 target = task_cgroup(current, subsys_id); 3243 target = task_cgroup(task, subsys_id);
3109 while (cgrp != target && cgrp!= cgrp->top_cgroup) 3244 while (cgrp != target && cgrp!= cgrp->top_cgroup)
3110 cgrp = cgrp->parent; 3245 cgrp = cgrp->parent;
3111 ret = (cgrp == target); 3246 ret = (cgrp == target);
@@ -3138,10 +3273,12 @@ void __css_put(struct cgroup_subsys_state *css)
3138{ 3273{
3139 struct cgroup *cgrp = css->cgroup; 3274 struct cgroup *cgrp = css->cgroup;
3140 rcu_read_lock(); 3275 rcu_read_lock();
3141 if ((atomic_dec_return(&css->refcnt) == 1) && 3276 if (atomic_dec_return(&css->refcnt) == 1) {
3142 notify_on_release(cgrp)) { 3277 if (notify_on_release(cgrp)) {
3143 set_bit(CGRP_RELEASABLE, &cgrp->flags); 3278 set_bit(CGRP_RELEASABLE, &cgrp->flags);
3144 check_for_release(cgrp); 3279 check_for_release(cgrp);
3280 }
3281 cgroup_wakeup_rmdir_waiters(cgrp);
3145 } 3282 }
3146 rcu_read_unlock(); 3283 rcu_read_unlock();
3147} 3284}
@@ -3241,3 +3378,232 @@ static int __init cgroup_disable(char *str)
3241 return 1; 3378 return 1;
3242} 3379}
3243__setup("cgroup_disable=", cgroup_disable); 3380__setup("cgroup_disable=", cgroup_disable);
3381
3382/*
 3383 * Functions for CSS ID.
3384 */
3385
3386/*
 3387 * To get an ID other than 0, this should be called when !cgroup_is_removed().
3388 */
3389unsigned short css_id(struct cgroup_subsys_state *css)
3390{
3391 struct css_id *cssid = rcu_dereference(css->id);
3392
3393 if (cssid)
3394 return cssid->id;
3395 return 0;
3396}
3397
3398unsigned short css_depth(struct cgroup_subsys_state *css)
3399{
3400 struct css_id *cssid = rcu_dereference(css->id);
3401
3402 if (cssid)
3403 return cssid->depth;
3404 return 0;
3405}
3406
3407bool css_is_ancestor(struct cgroup_subsys_state *child,
3408 const struct cgroup_subsys_state *root)
3409{
3410 struct css_id *child_id = rcu_dereference(child->id);
3411 struct css_id *root_id = rcu_dereference(root->id);
3412
3413 if (!child_id || !root_id || (child_id->depth < root_id->depth))
3414 return false;
3415 return child_id->stack[root_id->depth] == root_id->id;
3416}
3417
3418static void __free_css_id_cb(struct rcu_head *head)
3419{
3420 struct css_id *id;
3421
3422 id = container_of(head, struct css_id, rcu_head);
3423 kfree(id);
3424}
3425
3426void free_css_id(struct cgroup_subsys *ss, struct cgroup_subsys_state *css)
3427{
3428 struct css_id *id = css->id;
3429 /* When this is called before css_id initialization, id can be NULL */
3430 if (!id)
3431 return;
3432
3433 BUG_ON(!ss->use_id);
3434
3435 rcu_assign_pointer(id->css, NULL);
3436 rcu_assign_pointer(css->id, NULL);
3437 spin_lock(&ss->id_lock);
3438 idr_remove(&ss->idr, id->id);
3439 spin_unlock(&ss->id_lock);
3440 call_rcu(&id->rcu_head, __free_css_id_cb);
3441}
3442
3443/*
3444 * This is called by init or create(). Then, calls to this function are
3445 * always serialized (By cgroup_mutex() at create()).
3446 */
3447
3448static struct css_id *get_new_cssid(struct cgroup_subsys *ss, int depth)
3449{
3450 struct css_id *newid;
3451 int myid, error, size;
3452
3453 BUG_ON(!ss->use_id);
3454
3455 size = sizeof(*newid) + sizeof(unsigned short) * (depth + 1);
3456 newid = kzalloc(size, GFP_KERNEL);
3457 if (!newid)
3458 return ERR_PTR(-ENOMEM);
3459 /* get id */
3460 if (unlikely(!idr_pre_get(&ss->idr, GFP_KERNEL))) {
3461 error = -ENOMEM;
3462 goto err_out;
3463 }
3464 spin_lock(&ss->id_lock);
 3465 /* Don't use 0. Allocate an ID in the range 1-65535 */
3466 error = idr_get_new_above(&ss->idr, newid, 1, &myid);
3467 spin_unlock(&ss->id_lock);
3468
 3469 /* Returns an error when there is no free space for a new ID. */
3470 if (error) {
3471 error = -ENOSPC;
3472 goto err_out;
3473 }
3474 if (myid > CSS_ID_MAX)
3475 goto remove_idr;
3476
3477 newid->id = myid;
3478 newid->depth = depth;
3479 return newid;
3480remove_idr:
3481 error = -ENOSPC;
3482 spin_lock(&ss->id_lock);
3483 idr_remove(&ss->idr, myid);
3484 spin_unlock(&ss->id_lock);
3485err_out:
3486 kfree(newid);
3487 return ERR_PTR(error);
3488
3489}
3490
3491static int __init cgroup_subsys_init_idr(struct cgroup_subsys *ss)
3492{
3493 struct css_id *newid;
3494 struct cgroup_subsys_state *rootcss;
3495
3496 spin_lock_init(&ss->id_lock);
3497 idr_init(&ss->idr);
3498
3499 rootcss = init_css_set.subsys[ss->subsys_id];
3500 newid = get_new_cssid(ss, 0);
3501 if (IS_ERR(newid))
3502 return PTR_ERR(newid);
3503
3504 newid->stack[0] = newid->id;
3505 newid->css = rootcss;
3506 rootcss->id = newid;
3507 return 0;
3508}
3509
3510static int alloc_css_id(struct cgroup_subsys *ss, struct cgroup *parent,
3511 struct cgroup *child)
3512{
3513 int subsys_id, i, depth = 0;
3514 struct cgroup_subsys_state *parent_css, *child_css;
3515 struct css_id *child_id, *parent_id = NULL;
3516
3517 subsys_id = ss->subsys_id;
3518 parent_css = parent->subsys[subsys_id];
3519 child_css = child->subsys[subsys_id];
3520 depth = css_depth(parent_css) + 1;
3521 parent_id = parent_css->id;
3522
3523 child_id = get_new_cssid(ss, depth);
3524 if (IS_ERR(child_id))
3525 return PTR_ERR(child_id);
3526
3527 for (i = 0; i < depth; i++)
3528 child_id->stack[i] = parent_id->stack[i];
3529 child_id->stack[depth] = child_id->id;
3530 /*
3531 * child_id->css pointer will be set after this cgroup is available
3532 * see cgroup_populate_dir()
3533 */
3534 rcu_assign_pointer(child_css->id, child_id);
3535
3536 return 0;
3537}
3538
3539/**
3540 * css_lookup - lookup css by id
3541 * @ss: cgroup subsys to be looked into.
3542 * @id: the id
3543 *
 3544 * Returns a pointer to the cgroup_subsys_state if there is a valid one with id,
 3545 * NULL if not. Should be called under rcu_read_lock().
3546 */
3547struct cgroup_subsys_state *css_lookup(struct cgroup_subsys *ss, int id)
3548{
3549 struct css_id *cssid = NULL;
3550
3551 BUG_ON(!ss->use_id);
3552 cssid = idr_find(&ss->idr, id);
3553
3554 if (unlikely(!cssid))
3555 return NULL;
3556
3557 return rcu_dereference(cssid->css);
3558}
3559
3560/**
3561 * css_get_next - lookup next cgroup under specified hierarchy.
3562 * @ss: pointer to subsystem
3563 * @id: current position of iteration.
3564 * @root: pointer to css. search tree under this.
3565 * @foundid: position of found object.
3566 *
3567 * Search next css under the specified hierarchy of rootid. Calling under
3568 * rcu_read_lock() is necessary. Returns NULL if it reaches the end.
3569 */
3570struct cgroup_subsys_state *
3571css_get_next(struct cgroup_subsys *ss, int id,
3572 struct cgroup_subsys_state *root, int *foundid)
3573{
3574 struct cgroup_subsys_state *ret = NULL;
3575 struct css_id *tmp;
3576 int tmpid;
3577 int rootid = css_id(root);
3578 int depth = css_depth(root);
3579
3580 if (!rootid)
3581 return NULL;
3582
3583 BUG_ON(!ss->use_id);
3584 /* fill start point for scan */
3585 tmpid = id;
3586 while (1) {
3587 /*
3588 * scan next entry from bitmap(tree), tmpid is updated after
3589 * idr_get_next().
3590 */
3591 spin_lock(&ss->id_lock);
3592 tmp = idr_get_next(&ss->idr, &tmpid);
3593 spin_unlock(&ss->id_lock);
3594
3595 if (!tmp)
3596 break;
3597 if (tmp->depth >= depth && tmp->stack[depth] == rootid) {
3598 ret = rcu_dereference(tmp->css);
3599 if (ret) {
3600 *foundid = tmpid;
3601 break;
3602 }
3603 }
3604 /* continue to scan from next id */
3605 tmpid = tmpid + 1;
3606 }
3607 return ret;
3608}
3609
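
The CSS ID helpers added above (css_id(), css_depth(), css_is_ancestor(),
css_lookup() and css_get_next()) are intended for subsystems that set
->use_id. A minimal sketch of walking every css under a given root with
this API follows; the function name scan_hierarchy() and the pr_info()
reporting are purely illustrative assumptions, not code from this commit:

	/* Visit every css under @root in @ss's hierarchy. */
	static void scan_hierarchy(struct cgroup_subsys *ss,
				   struct cgroup_subsys_state *root)
	{
		struct cgroup_subsys_state *pos;
		int id = 1;	/* valid IDs are 1..CSS_ID_MAX */
		int found;

		/* css_get_next() and css_id() must run under rcu_read_lock() */
		rcu_read_lock();
		while ((pos = css_get_next(ss, id, root, &found)) != NULL) {
			pr_info("css id=%u depth=%u\n",
				css_id(pos), css_depth(pos));
			id = found + 1;	/* resume the scan after the last hit */
		}
		rcu_read_unlock();
	}
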
diff --git a/kernel/cgroup_debug.c b/kernel/cgroup_debug.c
index daca6209202d..0c92d797baa6 100644
--- a/kernel/cgroup_debug.c
+++ b/kernel/cgroup_debug.c
@@ -40,9 +40,7 @@ static u64 taskcount_read(struct cgroup *cont, struct cftype *cft)
40{ 40{
41 u64 count; 41 u64 count;
42 42
43 cgroup_lock();
44 count = cgroup_task_count(cont); 43 count = cgroup_task_count(cont);
45 cgroup_unlock();
46 return count; 44 return count;
47} 45}
48 46
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index f76db9dcaa05..026faccca869 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -128,10 +128,6 @@ static inline struct cpuset *task_cs(struct task_struct *task)
128 return container_of(task_subsys_state(task, cpuset_subsys_id), 128 return container_of(task_subsys_state(task, cpuset_subsys_id),
129 struct cpuset, css); 129 struct cpuset, css);
130} 130}
131struct cpuset_hotplug_scanner {
132 struct cgroup_scanner scan;
133 struct cgroup *to;
134};
135 131
136/* bits in struct cpuset flags field */ 132/* bits in struct cpuset flags field */
137typedef enum { 133typedef enum {
@@ -521,6 +517,7 @@ static int validate_change(const struct cpuset *cur, const struct cpuset *trial)
521 return 0; 517 return 0;
522} 518}
523 519
520#ifdef CONFIG_SMP
524/* 521/*
525 * Helper routine for generate_sched_domains(). 522 * Helper routine for generate_sched_domains().
526 * Do cpusets a, b have overlapping cpus_allowed masks? 523 * Do cpusets a, b have overlapping cpus_allowed masks?
@@ -815,6 +812,18 @@ static void do_rebuild_sched_domains(struct work_struct *unused)
815 812
816 put_online_cpus(); 813 put_online_cpus();
817} 814}
815#else /* !CONFIG_SMP */
816static void do_rebuild_sched_domains(struct work_struct *unused)
817{
818}
819
820static int generate_sched_domains(struct cpumask **domains,
821 struct sched_domain_attr **attributes)
822{
823 *domains = NULL;
824 return 1;
825}
826#endif /* CONFIG_SMP */
818 827
819static DECLARE_WORK(rebuild_sched_domains_work, do_rebuild_sched_domains); 828static DECLARE_WORK(rebuild_sched_domains_work, do_rebuild_sched_domains);
820 829
@@ -1026,101 +1035,70 @@ static void cpuset_migrate_mm(struct mm_struct *mm, const nodemask_t *from,
1026 mutex_unlock(&callback_mutex); 1035 mutex_unlock(&callback_mutex);
1027} 1036}
1028 1037
1038/*
1039 * Rebind task's vmas to cpuset's new mems_allowed, and migrate pages to new
1040 * nodes if memory_migrate flag is set. Called with cgroup_mutex held.
1041 */
1042static void cpuset_change_nodemask(struct task_struct *p,
1043 struct cgroup_scanner *scan)
1044{
1045 struct mm_struct *mm;
1046 struct cpuset *cs;
1047 int migrate;
1048 const nodemask_t *oldmem = scan->data;
1049
1050 mm = get_task_mm(p);
1051 if (!mm)
1052 return;
1053
1054 cs = cgroup_cs(scan->cg);
1055 migrate = is_memory_migrate(cs);
1056
1057 mpol_rebind_mm(mm, &cs->mems_allowed);
1058 if (migrate)
1059 cpuset_migrate_mm(mm, oldmem, &cs->mems_allowed);
1060 mmput(mm);
1061}
1062
1029static void *cpuset_being_rebound; 1063static void *cpuset_being_rebound;
1030 1064
1031/** 1065/**
1032 * update_tasks_nodemask - Update the nodemasks of tasks in the cpuset. 1066 * update_tasks_nodemask - Update the nodemasks of tasks in the cpuset.
1033 * @cs: the cpuset in which each task's mems_allowed mask needs to be changed 1067 * @cs: the cpuset in which each task's mems_allowed mask needs to be changed
1034 * @oldmem: old mems_allowed of cpuset cs 1068 * @oldmem: old mems_allowed of cpuset cs
1069 * @heap: if NULL, defer allocating heap memory to cgroup_scan_tasks()
1035 * 1070 *
1036 * Called with cgroup_mutex held 1071 * Called with cgroup_mutex held
1037 * Return 0 if successful, -errno if not. 1072 * No return value. It's guaranteed that cgroup_scan_tasks() always returns 0
1073 * if @heap != NULL.
1038 */ 1074 */
1039static int update_tasks_nodemask(struct cpuset *cs, const nodemask_t *oldmem) 1075static void update_tasks_nodemask(struct cpuset *cs, const nodemask_t *oldmem,
1076 struct ptr_heap *heap)
1040{ 1077{
1041 struct task_struct *p; 1078 struct cgroup_scanner scan;
1042 struct mm_struct **mmarray;
1043 int i, n, ntasks;
1044 int migrate;
1045 int fudge;
1046 struct cgroup_iter it;
1047 int retval;
1048 1079
1049 cpuset_being_rebound = cs; /* causes mpol_dup() rebind */ 1080 cpuset_being_rebound = cs; /* causes mpol_dup() rebind */
1050 1081
1051 fudge = 10; /* spare mmarray[] slots */ 1082 scan.cg = cs->css.cgroup;
1052 fudge += cpumask_weight(cs->cpus_allowed);/* imagine 1 fork-bomb/cpu */ 1083 scan.test_task = NULL;
1053 retval = -ENOMEM; 1084 scan.process_task = cpuset_change_nodemask;
1054 1085 scan.heap = heap;
1055 /* 1086 scan.data = (nodemask_t *)oldmem;
1056 * Allocate mmarray[] to hold mm reference for each task
1057 * in cpuset cs. Can't kmalloc GFP_KERNEL while holding
1058 * tasklist_lock. We could use GFP_ATOMIC, but with a
1059 * few more lines of code, we can retry until we get a big
1060 * enough mmarray[] w/o using GFP_ATOMIC.
1061 */
1062 while (1) {
1063 ntasks = cgroup_task_count(cs->css.cgroup); /* guess */
1064 ntasks += fudge;
1065 mmarray = kmalloc(ntasks * sizeof(*mmarray), GFP_KERNEL);
1066 if (!mmarray)
1067 goto done;
1068 read_lock(&tasklist_lock); /* block fork */
1069 if (cgroup_task_count(cs->css.cgroup) <= ntasks)
1070 break; /* got enough */
1071 read_unlock(&tasklist_lock); /* try again */
1072 kfree(mmarray);
1073 }
1074
1075 n = 0;
1076
1077 /* Load up mmarray[] with mm reference for each task in cpuset. */
1078 cgroup_iter_start(cs->css.cgroup, &it);
1079 while ((p = cgroup_iter_next(cs->css.cgroup, &it))) {
1080 struct mm_struct *mm;
1081
1082 if (n >= ntasks) {
1083 printk(KERN_WARNING
1084 "Cpuset mempolicy rebind incomplete.\n");
1085 break;
1086 }
1087 mm = get_task_mm(p);
1088 if (!mm)
1089 continue;
1090 mmarray[n++] = mm;
1091 }
1092 cgroup_iter_end(cs->css.cgroup, &it);
1093 read_unlock(&tasklist_lock);
1094 1087
1095 /* 1088 /*
1096 * Now that we've dropped the tasklist spinlock, we can 1089 * The mpol_rebind_mm() call takes mmap_sem, which we couldn't
1097 * rebind the vma mempolicies of each mm in mmarray[] to their 1090 * take while holding tasklist_lock. Forks can happen - the
1098 * new cpuset, and release that mm. The mpol_rebind_mm() 1091 * mpol_dup() cpuset_being_rebound check will catch such forks,
1099 * call takes mmap_sem, which we couldn't take while holding 1092 * and rebind their vma mempolicies too. Because we still hold
1100 * tasklist_lock. Forks can happen again now - the mpol_dup() 1093 * the global cgroup_mutex, we know that no other rebind effort
1101 * cpuset_being_rebound check will catch such forks, and rebind 1094 * will be contending for the global variable cpuset_being_rebound.
1102 * their vma mempolicies too. Because we still hold the global
1103 * cgroup_mutex, we know that no other rebind effort will
1104 * be contending for the global variable cpuset_being_rebound.
1105 * It's ok if we rebind the same mm twice; mpol_rebind_mm() 1095 * It's ok if we rebind the same mm twice; mpol_rebind_mm()
1106 * is idempotent. Also migrate pages in each mm to new nodes. 1096 * is idempotent. Also migrate pages in each mm to new nodes.
1107 */ 1097 */
1108 migrate = is_memory_migrate(cs); 1098 cgroup_scan_tasks(&scan);
1109 for (i = 0; i < n; i++) {
1110 struct mm_struct *mm = mmarray[i];
1111
1112 mpol_rebind_mm(mm, &cs->mems_allowed);
1113 if (migrate)
1114 cpuset_migrate_mm(mm, oldmem, &cs->mems_allowed);
1115 mmput(mm);
1116 }
1117 1099
1118 /* We're done rebinding vmas to this cpuset's new mems_allowed. */ 1100 /* We're done rebinding vmas to this cpuset's new mems_allowed. */
1119 kfree(mmarray);
1120 cpuset_being_rebound = NULL; 1101 cpuset_being_rebound = NULL;
1121 retval = 0;
1122done:
1123 return retval;
1124} 1102}
1125 1103
1126/* 1104/*
@@ -1141,6 +1119,7 @@ static int update_nodemask(struct cpuset *cs, struct cpuset *trialcs,
1141{ 1119{
1142 nodemask_t oldmem; 1120 nodemask_t oldmem;
1143 int retval; 1121 int retval;
1122 struct ptr_heap heap;
1144 1123
1145 /* 1124 /*
1146 * top_cpuset.mems_allowed tracks node_stats[N_HIGH_MEMORY]; 1125 * top_cpuset.mems_allowed tracks node_stats[N_HIGH_MEMORY];
@@ -1175,12 +1154,18 @@ static int update_nodemask(struct cpuset *cs, struct cpuset *trialcs,
1175 if (retval < 0) 1154 if (retval < 0)
1176 goto done; 1155 goto done;
1177 1156
1157 retval = heap_init(&heap, PAGE_SIZE, GFP_KERNEL, NULL);
1158 if (retval < 0)
1159 goto done;
1160
1178 mutex_lock(&callback_mutex); 1161 mutex_lock(&callback_mutex);
1179 cs->mems_allowed = trialcs->mems_allowed; 1162 cs->mems_allowed = trialcs->mems_allowed;
1180 cs->mems_generation = cpuset_mems_generation++; 1163 cs->mems_generation = cpuset_mems_generation++;
1181 mutex_unlock(&callback_mutex); 1164 mutex_unlock(&callback_mutex);
1182 1165
1183 retval = update_tasks_nodemask(cs, &oldmem); 1166 update_tasks_nodemask(cs, &oldmem, &heap);
1167
1168 heap_free(&heap);
1184done: 1169done:
1185 return retval; 1170 return retval;
1186} 1171}
@@ -1192,8 +1177,10 @@ int current_cpuset_is_being_rebound(void)
1192 1177
1193static int update_relax_domain_level(struct cpuset *cs, s64 val) 1178static int update_relax_domain_level(struct cpuset *cs, s64 val)
1194{ 1179{
1180#ifdef CONFIG_SMP
1195 if (val < -1 || val >= SD_LV_MAX) 1181 if (val < -1 || val >= SD_LV_MAX)
1196 return -EINVAL; 1182 return -EINVAL;
1183#endif
1197 1184
1198 if (val != cs->relax_domain_level) { 1185 if (val != cs->relax_domain_level) {
1199 cs->relax_domain_level = val; 1186 cs->relax_domain_level = val;
@@ -1355,19 +1342,22 @@ static int cpuset_can_attach(struct cgroup_subsys *ss,
1355 struct cgroup *cont, struct task_struct *tsk) 1342 struct cgroup *cont, struct task_struct *tsk)
1356{ 1343{
1357 struct cpuset *cs = cgroup_cs(cont); 1344 struct cpuset *cs = cgroup_cs(cont);
1358 int ret = 0;
1359 1345
1360 if (cpumask_empty(cs->cpus_allowed) || nodes_empty(cs->mems_allowed)) 1346 if (cpumask_empty(cs->cpus_allowed) || nodes_empty(cs->mems_allowed))
1361 return -ENOSPC; 1347 return -ENOSPC;
1362 1348
1363 if (tsk->flags & PF_THREAD_BOUND) { 1349 /*
1364 mutex_lock(&callback_mutex); 1350 * Kthreads bound to specific cpus cannot be moved to a new cpuset; we
1365 if (!cpumask_equal(&tsk->cpus_allowed, cs->cpus_allowed)) 1351 * cannot change their cpu affinity and isolating such threads by their
1366 ret = -EINVAL; 1352 * set of allowed nodes is unnecessary. Thus, cpusets are not
1367 mutex_unlock(&callback_mutex); 1353 * applicable for such threads. This prevents checking for success of
1368 } 1354 * set_cpus_allowed_ptr() on all attached tasks before cpus_allowed may
1355 * be changed.
1356 */
1357 if (tsk->flags & PF_THREAD_BOUND)
1358 return -EINVAL;
1369 1359
1370 return ret < 0 ? ret : security_task_setscheduler(tsk, 0, NULL); 1360 return security_task_setscheduler(tsk, 0, NULL);
1371} 1361}
1372 1362
1373static void cpuset_attach(struct cgroup_subsys *ss, 1363static void cpuset_attach(struct cgroup_subsys *ss,
@@ -1706,6 +1696,7 @@ static struct cftype files[] = {
1706 .read_u64 = cpuset_read_u64, 1696 .read_u64 = cpuset_read_u64,
1707 .write_u64 = cpuset_write_u64, 1697 .write_u64 = cpuset_write_u64,
1708 .private = FILE_MEMORY_PRESSURE, 1698 .private = FILE_MEMORY_PRESSURE,
1699 .mode = S_IRUGO,
1709 }, 1700 },
1710 1701
1711 { 1702 {
@@ -1913,10 +1904,9 @@ int __init cpuset_init(void)
1913static void cpuset_do_move_task(struct task_struct *tsk, 1904static void cpuset_do_move_task(struct task_struct *tsk,
1914 struct cgroup_scanner *scan) 1905 struct cgroup_scanner *scan)
1915{ 1906{
1916 struct cpuset_hotplug_scanner *chsp; 1907 struct cgroup *new_cgroup = scan->data;
1917 1908
1918 chsp = container_of(scan, struct cpuset_hotplug_scanner, scan); 1909 cgroup_attach_task(new_cgroup, tsk);
1919 cgroup_attach_task(chsp->to, tsk);
1920} 1910}
1921 1911
1922/** 1912/**
@@ -1932,15 +1922,15 @@ static void cpuset_do_move_task(struct task_struct *tsk,
1932 */ 1922 */
1933static void move_member_tasks_to_cpuset(struct cpuset *from, struct cpuset *to) 1923static void move_member_tasks_to_cpuset(struct cpuset *from, struct cpuset *to)
1934{ 1924{
1935 struct cpuset_hotplug_scanner scan; 1925 struct cgroup_scanner scan;
1936 1926
1937 scan.scan.cg = from->css.cgroup; 1927 scan.cg = from->css.cgroup;
1938 scan.scan.test_task = NULL; /* select all tasks in cgroup */ 1928 scan.test_task = NULL; /* select all tasks in cgroup */
1939 scan.scan.process_task = cpuset_do_move_task; 1929 scan.process_task = cpuset_do_move_task;
1940 scan.scan.heap = NULL; 1930 scan.heap = NULL;
1941 scan.to = to->css.cgroup; 1931 scan.data = to->css.cgroup;
1942 1932
1943 if (cgroup_scan_tasks(&scan.scan)) 1933 if (cgroup_scan_tasks(&scan))
1944 printk(KERN_ERR "move_member_tasks_to_cpuset: " 1934 printk(KERN_ERR "move_member_tasks_to_cpuset: "
1945 "cgroup_scan_tasks failed\n"); 1935 "cgroup_scan_tasks failed\n");
1946} 1936}
@@ -2033,7 +2023,7 @@ static void scan_for_empty_cpusets(struct cpuset *root)
2033 remove_tasks_in_empty_cpuset(cp); 2023 remove_tasks_in_empty_cpuset(cp);
2034 else { 2024 else {
2035 update_tasks_cpumask(cp, NULL); 2025 update_tasks_cpumask(cp, NULL);
2036 update_tasks_nodemask(cp, &oldmems); 2026 update_tasks_nodemask(cp, &oldmems, NULL);
2037 } 2027 }
2038 } 2028 }
2039} 2029}
@@ -2069,7 +2059,9 @@ static int cpuset_track_online_cpus(struct notifier_block *unused_nb,
2069 } 2059 }
2070 2060
2071 cgroup_lock(); 2061 cgroup_lock();
2062 mutex_lock(&callback_mutex);
2072 cpumask_copy(top_cpuset.cpus_allowed, cpu_online_mask); 2063 cpumask_copy(top_cpuset.cpus_allowed, cpu_online_mask);
2064 mutex_unlock(&callback_mutex);
2073 scan_for_empty_cpusets(&top_cpuset); 2065 scan_for_empty_cpusets(&top_cpuset);
2074 ndoms = generate_sched_domains(&doms, &attr); 2066 ndoms = generate_sched_domains(&doms, &attr);
2075 cgroup_unlock(); 2067 cgroup_unlock();
@@ -2092,11 +2084,12 @@ static int cpuset_track_online_nodes(struct notifier_block *self,
2092 cgroup_lock(); 2084 cgroup_lock();
2093 switch (action) { 2085 switch (action) {
2094 case MEM_ONLINE: 2086 case MEM_ONLINE:
2095 top_cpuset.mems_allowed = node_states[N_HIGH_MEMORY];
2096 break;
2097 case MEM_OFFLINE: 2087 case MEM_OFFLINE:
2088 mutex_lock(&callback_mutex);
2098 top_cpuset.mems_allowed = node_states[N_HIGH_MEMORY]; 2089 top_cpuset.mems_allowed = node_states[N_HIGH_MEMORY];
2099 scan_for_empty_cpusets(&top_cpuset); 2090 mutex_unlock(&callback_mutex);
2091 if (action == MEM_OFFLINE)
2092 scan_for_empty_cpusets(&top_cpuset);
2100 break; 2093 break;
2101 default: 2094 default:
2102 break; 2095 break;
@@ -2206,26 +2199,24 @@ static const struct cpuset *nearest_hardwall_ancestor(const struct cpuset *cs)
2206} 2199}
2207 2200
2208/** 2201/**
2209 * cpuset_zone_allowed_softwall - Can we allocate on zone z's memory node? 2202 * cpuset_node_allowed_softwall - Can we allocate on a memory node?
2210 * @z: is this zone on an allowed node? 2203 * @node: is this an allowed node?
2211 * @gfp_mask: memory allocation flags 2204 * @gfp_mask: memory allocation flags
2212 * 2205 *
2213 * If we're in interrupt, yes, we can always allocate. If 2206 * If we're in interrupt, yes, we can always allocate. If __GFP_THISNODE is
2214 * __GFP_THISNODE is set, yes, we can always allocate. If zone 2207 * set, yes, we can always allocate. If node is in our task's mems_allowed,
2215 * z's node is in our tasks mems_allowed, yes. If it's not a 2208 * yes. If it's not a __GFP_HARDWALL request and this node is in the nearest
2216 * __GFP_HARDWALL request and this zone's nodes is in the nearest 2209 * hardwalled cpuset ancestor to this task's cpuset, yes. If the task has been
2217 * hardwalled cpuset ancestor to this tasks cpuset, yes. 2210 * OOM killed and has access to memory reserves as specified by the TIF_MEMDIE
2218 * If the task has been OOM killed and has access to memory reserves 2211 * flag, yes.
2219 * as specified by the TIF_MEMDIE flag, yes.
2220 * Otherwise, no. 2212 * Otherwise, no.
2221 * 2213 *
2222 * If __GFP_HARDWALL is set, cpuset_zone_allowed_softwall() 2214 * If __GFP_HARDWALL is set, cpuset_node_allowed_softwall() reduces to
2223 * reduces to cpuset_zone_allowed_hardwall(). Otherwise, 2215 * cpuset_node_allowed_hardwall(). Otherwise, cpuset_node_allowed_softwall()
2224 * cpuset_zone_allowed_softwall() might sleep, and might allow a zone 2216 * might sleep, and might allow a node from an enclosing cpuset.
2225 * from an enclosing cpuset.
2226 * 2217 *
2227 * cpuset_zone_allowed_hardwall() only handles the simpler case of 2218 * cpuset_node_allowed_hardwall() only handles the simpler case of hardwall
2228 * hardwall cpusets, and never sleeps. 2219 * cpusets, and never sleeps.
2229 * 2220 *
2230 * The __GFP_THISNODE placement logic is really handled elsewhere, 2221 * The __GFP_THISNODE placement logic is really handled elsewhere,
2231 * by forcibly using a zonelist starting at a specified node, and by 2222 * by forcibly using a zonelist starting at a specified node, and by
@@ -2264,20 +2255,17 @@ static const struct cpuset *nearest_hardwall_ancestor(const struct cpuset *cs)
2264 * GFP_USER - only nodes in current tasks mems allowed ok. 2255 * GFP_USER - only nodes in current tasks mems allowed ok.
2265 * 2256 *
2266 * Rule: 2257 * Rule:
2267 * Don't call cpuset_zone_allowed_softwall if you can't sleep, unless you 2258 * Don't call cpuset_node_allowed_softwall if you can't sleep, unless you
2268 * pass in the __GFP_HARDWALL flag set in gfp_flag, which disables 2259 * pass in the __GFP_HARDWALL flag set in gfp_flag, which disables
2269 * the code that might scan up ancestor cpusets and sleep. 2260 * the code that might scan up ancestor cpusets and sleep.
2270 */ 2261 */
2271 2262int __cpuset_node_allowed_softwall(int node, gfp_t gfp_mask)
2272int __cpuset_zone_allowed_softwall(struct zone *z, gfp_t gfp_mask)
2273{ 2263{
2274 int node; /* node that zone z is on */
2275 const struct cpuset *cs; /* current cpuset ancestors */ 2264 const struct cpuset *cs; /* current cpuset ancestors */
2276 int allowed; /* is allocation in zone z allowed? */ 2265 int allowed; /* is allocation in zone z allowed? */
2277 2266
2278 if (in_interrupt() || (gfp_mask & __GFP_THISNODE)) 2267 if (in_interrupt() || (gfp_mask & __GFP_THISNODE))
2279 return 1; 2268 return 1;
2280 node = zone_to_nid(z);
2281 might_sleep_if(!(gfp_mask & __GFP_HARDWALL)); 2269 might_sleep_if(!(gfp_mask & __GFP_HARDWALL));
2282 if (node_isset(node, current->mems_allowed)) 2270 if (node_isset(node, current->mems_allowed))
2283 return 1; 2271 return 1;
@@ -2306,15 +2294,15 @@ int __cpuset_zone_allowed_softwall(struct zone *z, gfp_t gfp_mask)
2306} 2294}
2307 2295
2308/* 2296/*
2309 * cpuset_zone_allowed_hardwall - Can we allocate on zone z's memory node? 2297 * cpuset_node_allowed_hardwall - Can we allocate on a memory node?
2310 * @z: is this zone on an allowed node? 2298 * @node: is this an allowed node?
2311 * @gfp_mask: memory allocation flags 2299 * @gfp_mask: memory allocation flags
2312 * 2300 *
2313 * If we're in interrupt, yes, we can always allocate. 2301 * If we're in interrupt, yes, we can always allocate. If __GFP_THISNODE is
2314 * If __GFP_THISNODE is set, yes, we can always allocate. If zone 2302 * set, yes, we can always allocate. If node is in our task's mems_allowed,
2315 * z's node is in our tasks mems_allowed, yes. If the task has been 2303 * yes. If the task has been OOM killed and has access to memory reserves as
2316 * OOM killed and has access to memory reserves as specified by the 2304 * specified by the TIF_MEMDIE flag, yes.
2317 * TIF_MEMDIE flag, yes. Otherwise, no. 2305 * Otherwise, no.
2318 * 2306 *
2319 * The __GFP_THISNODE placement logic is really handled elsewhere, 2307 * The __GFP_THISNODE placement logic is really handled elsewhere,
2320 * by forcibly using a zonelist starting at a specified node, and by 2308 * by forcibly using a zonelist starting at a specified node, and by
@@ -2322,20 +2310,16 @@ int __cpuset_zone_allowed_softwall(struct zone *z, gfp_t gfp_mask)
2322 * any node on the zonelist except the first. By the time any such 2310 * any node on the zonelist except the first. By the time any such
2323 * calls get to this routine, we should just shut up and say 'yes'. 2311 * calls get to this routine, we should just shut up and say 'yes'.
2324 * 2312 *
2325 * Unlike the cpuset_zone_allowed_softwall() variant, above, 2313 * Unlike the cpuset_node_allowed_softwall() variant, above,
2326 * this variant requires that the zone be in the current tasks 2314 * this variant requires that the node be in the current task's
2327 * mems_allowed or that we're in interrupt. It does not scan up the 2315 * mems_allowed or that we're in interrupt. It does not scan up the
2328 * cpuset hierarchy for the nearest enclosing mem_exclusive cpuset. 2316 * cpuset hierarchy for the nearest enclosing mem_exclusive cpuset.
2329 * It never sleeps. 2317 * It never sleeps.
2330 */ 2318 */
2331 2319int __cpuset_node_allowed_hardwall(int node, gfp_t gfp_mask)
2332int __cpuset_zone_allowed_hardwall(struct zone *z, gfp_t gfp_mask)
2333{ 2320{
2334 int node; /* node that zone z is on */
2335
2336 if (in_interrupt() || (gfp_mask & __GFP_THISNODE)) 2321 if (in_interrupt() || (gfp_mask & __GFP_THISNODE))
2337 return 1; 2322 return 1;
2338 node = zone_to_nid(z);
2339 if (node_isset(node, current->mems_allowed)) 2323 if (node_isset(node, current->mems_allowed))
2340 return 1; 2324 return 1;
2341 /* 2325 /*
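
The cpuset changes above drop both the private mmarray loop and the
cpuset_hotplug_scanner wrapper in favor of the generic cgroup_scanner.
A rough sketch of that pattern, with the hypothetical frob_* names
standing in for a real per-task operation:

	static void frob_one_task(struct task_struct *p,
				  struct cgroup_scanner *scan)
	{
		/* per-task work; scan->data carries the caller's argument */
	}

	static void frob_all_tasks(struct cpuset *cs, void *arg)
	{
		struct cgroup_scanner scan;

		scan.cg = cs->css.cgroup;
		scan.test_task = NULL;		/* NULL selects every task */
		scan.process_task = frob_one_task;
		scan.heap = NULL;		/* cgroup_scan_tasks() allocates the heap */
		scan.data = arg;

		if (cgroup_scan_tasks(&scan))
			printk(KERN_ERR "frob_all_tasks: cgroup_scan_tasks failed\n");
	}
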
diff --git a/kernel/exec_domain.c b/kernel/exec_domain.c
index 667c841c2952..c35452cadded 100644
--- a/kernel/exec_domain.c
+++ b/kernel/exec_domain.c
@@ -18,6 +18,7 @@
18#include <linux/syscalls.h> 18#include <linux/syscalls.h>
19#include <linux/sysctl.h> 19#include <linux/sysctl.h>
20#include <linux/types.h> 20#include <linux/types.h>
21#include <linux/fs_struct.h>
21 22
22 23
23static void default_handler(int, struct pt_regs *); 24static void default_handler(int, struct pt_regs *);
@@ -145,28 +146,6 @@ __set_personality(u_long personality)
145 return 0; 146 return 0;
146 } 147 }
147 148
148 if (atomic_read(&current->fs->count) != 1) {
149 struct fs_struct *fsp, *ofsp;
150
151 fsp = copy_fs_struct(current->fs);
152 if (fsp == NULL) {
153 module_put(ep->module);
154 return -ENOMEM;
155 }
156
157 task_lock(current);
158 ofsp = current->fs;
159 current->fs = fsp;
160 task_unlock(current);
161
162 put_fs_struct(ofsp);
163 }
164
165 /*
166 * At that point we are guaranteed to be the sole owner of
167 * current->fs.
168 */
169
170 current->personality = personality; 149 current->personality = personality;
171 oep = current_thread_info()->exec_domain; 150 oep = current_thread_info()->exec_domain;
172 current_thread_info()->exec_domain = ep; 151 current_thread_info()->exec_domain = ep;
diff --git a/kernel/exit.c b/kernel/exit.c
index 167e1e3ad7c6..6686ed1e4aa3 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -46,6 +46,7 @@
46#include <linux/blkdev.h> 46#include <linux/blkdev.h>
47#include <linux/task_io_accounting_ops.h> 47#include <linux/task_io_accounting_ops.h>
48#include <linux/tracehook.h> 48#include <linux/tracehook.h>
49#include <linux/fs_struct.h>
49#include <linux/init_task.h> 50#include <linux/init_task.h>
50#include <trace/sched.h> 51#include <trace/sched.h>
51 52
@@ -61,11 +62,6 @@ DEFINE_TRACE(sched_process_wait);
61 62
62static void exit_mm(struct task_struct * tsk); 63static void exit_mm(struct task_struct * tsk);
63 64
64static inline int task_detached(struct task_struct *p)
65{
66 return p->exit_signal == -1;
67}
68
69static void __unhash_process(struct task_struct *p) 65static void __unhash_process(struct task_struct *p)
70{ 66{
71 nr_threads--; 67 nr_threads--;
@@ -362,16 +358,12 @@ static void reparent_to_kthreadd(void)
362void __set_special_pids(struct pid *pid) 358void __set_special_pids(struct pid *pid)
363{ 359{
364 struct task_struct *curr = current->group_leader; 360 struct task_struct *curr = current->group_leader;
365 pid_t nr = pid_nr(pid);
366 361
367 if (task_session(curr) != pid) { 362 if (task_session(curr) != pid)
368 change_pid(curr, PIDTYPE_SID, pid); 363 change_pid(curr, PIDTYPE_SID, pid);
369 set_task_session(curr, nr); 364
370 } 365 if (task_pgrp(curr) != pid)
371 if (task_pgrp(curr) != pid) {
372 change_pid(curr, PIDTYPE_PGID, pid); 366 change_pid(curr, PIDTYPE_PGID, pid);
373 set_task_pgrp(curr, nr);
374 }
375} 367}
376 368
377static void set_special_pids(struct pid *pid) 369static void set_special_pids(struct pid *pid)
@@ -429,7 +421,6 @@ EXPORT_SYMBOL(disallow_signal);
429void daemonize(const char *name, ...) 421void daemonize(const char *name, ...)
430{ 422{
431 va_list args; 423 va_list args;
432 struct fs_struct *fs;
433 sigset_t blocked; 424 sigset_t blocked;
434 425
435 va_start(args, name); 426 va_start(args, name);
@@ -462,11 +453,7 @@ void daemonize(const char *name, ...)
462 453
463 /* Become as one with the init task */ 454 /* Become as one with the init task */
464 455
465 exit_fs(current); /* current->fs->count--; */ 456 daemonize_fs_struct();
466 fs = init_task.fs;
467 current->fs = fs;
468 atomic_inc(&fs->count);
469
470 exit_files(current); 457 exit_files(current);
471 current->files = init_task.files; 458 current->files = init_task.files;
472 atomic_inc(&current->files->count); 459 atomic_inc(&current->files->count);
@@ -565,30 +552,6 @@ void exit_files(struct task_struct *tsk)
565 } 552 }
566} 553}
567 554
568void put_fs_struct(struct fs_struct *fs)
569{
570 /* No need to hold fs->lock if we are killing it */
571 if (atomic_dec_and_test(&fs->count)) {
572 path_put(&fs->root);
573 path_put(&fs->pwd);
574 kmem_cache_free(fs_cachep, fs);
575 }
576}
577
578void exit_fs(struct task_struct *tsk)
579{
580 struct fs_struct * fs = tsk->fs;
581
582 if (fs) {
583 task_lock(tsk);
584 tsk->fs = NULL;
585 task_unlock(tsk);
586 put_fs_struct(fs);
587 }
588}
589
590EXPORT_SYMBOL_GPL(exit_fs);
591
592#ifdef CONFIG_MM_OWNER 555#ifdef CONFIG_MM_OWNER
593/* 556/*
594 * Task p is exiting and it owned mm, lets find a new owner for it 557 * Task p is exiting and it owned mm, lets find a new owner for it
@@ -732,119 +695,6 @@ static void exit_mm(struct task_struct * tsk)
732} 695}
733 696
734/* 697/*
735 * Return nonzero if @parent's children should reap themselves.
736 *
737 * Called with write_lock_irq(&tasklist_lock) held.
738 */
739static int ignoring_children(struct task_struct *parent)
740{
741 int ret;
742 struct sighand_struct *psig = parent->sighand;
743 unsigned long flags;
744 spin_lock_irqsave(&psig->siglock, flags);
745 ret = (psig->action[SIGCHLD-1].sa.sa_handler == SIG_IGN ||
746 (psig->action[SIGCHLD-1].sa.sa_flags & SA_NOCLDWAIT));
747 spin_unlock_irqrestore(&psig->siglock, flags);
748 return ret;
749}
750
751/*
752 * Detach all tasks we were using ptrace on.
753 * Any that need to be release_task'd are put on the @dead list.
754 *
755 * Called with write_lock(&tasklist_lock) held.
756 */
757static void ptrace_exit(struct task_struct *parent, struct list_head *dead)
758{
759 struct task_struct *p, *n;
760 int ign = -1;
761
762 list_for_each_entry_safe(p, n, &parent->ptraced, ptrace_entry) {
763 __ptrace_unlink(p);
764
765 if (p->exit_state != EXIT_ZOMBIE)
766 continue;
767
768 /*
769 * If it's a zombie, our attachedness prevented normal
770 * parent notification or self-reaping. Do notification
771 * now if it would have happened earlier. If it should
772 * reap itself, add it to the @dead list. We can't call
773 * release_task() here because we already hold tasklist_lock.
774 *
775 * If it's our own child, there is no notification to do.
776 * But if our normal children self-reap, then this child
777 * was prevented by ptrace and we must reap it now.
778 */
779 if (!task_detached(p) && thread_group_empty(p)) {
780 if (!same_thread_group(p->real_parent, parent))
781 do_notify_parent(p, p->exit_signal);
782 else {
783 if (ign < 0)
784 ign = ignoring_children(parent);
785 if (ign)
786 p->exit_signal = -1;
787 }
788 }
789
790 if (task_detached(p)) {
791 /*
792 * Mark it as in the process of being reaped.
793 */
794 p->exit_state = EXIT_DEAD;
795 list_add(&p->ptrace_entry, dead);
796 }
797 }
798}
799
800/*
801 * Finish up exit-time ptrace cleanup.
802 *
803 * Called without locks.
804 */
805static void ptrace_exit_finish(struct task_struct *parent,
806 struct list_head *dead)
807{
808 struct task_struct *p, *n;
809
810 BUG_ON(!list_empty(&parent->ptraced));
811
812 list_for_each_entry_safe(p, n, dead, ptrace_entry) {
813 list_del_init(&p->ptrace_entry);
814 release_task(p);
815 }
816}
817
818static void reparent_thread(struct task_struct *p, struct task_struct *father)
819{
820 if (p->pdeath_signal)
821 /* We already hold the tasklist_lock here. */
822 group_send_sig_info(p->pdeath_signal, SEND_SIG_NOINFO, p);
823
824 list_move_tail(&p->sibling, &p->real_parent->children);
825
826 /* If this is a threaded reparent there is no need to
827 * notify anyone anything has happened.
828 */
829 if (same_thread_group(p->real_parent, father))
830 return;
831
832 /* We don't want people slaying init. */
833 if (!task_detached(p))
834 p->exit_signal = SIGCHLD;
835
836 /* If we'd notified the old parent about this child's death,
837 * also notify the new parent.
838 */
839 if (!ptrace_reparented(p) &&
840 p->exit_state == EXIT_ZOMBIE &&
841 !task_detached(p) && thread_group_empty(p))
842 do_notify_parent(p, p->exit_signal);
843
844 kill_orphaned_pgrp(p, father);
845}
846
847/*
848 * When we die, we re-parent all our children. 698 * When we die, we re-parent all our children.
849 * Try to give them to another thread in our thread 699 * Try to give them to another thread in our thread
850 * group, and if no such member exists, give it to 700 * group, and if no such member exists, give it to
@@ -883,17 +733,51 @@ static struct task_struct *find_new_reaper(struct task_struct *father)
883 return pid_ns->child_reaper; 733 return pid_ns->child_reaper;
884} 734}
885 735
736/*
 737 * Any that need to be release_task'd are put on the @dead list.
738 */
739static void reparent_thread(struct task_struct *father, struct task_struct *p,
740 struct list_head *dead)
741{
742 if (p->pdeath_signal)
743 group_send_sig_info(p->pdeath_signal, SEND_SIG_NOINFO, p);
744
745 list_move_tail(&p->sibling, &p->real_parent->children);
746
747 if (task_detached(p))
748 return;
749 /*
750 * If this is a threaded reparent there is no need to
751 * notify anyone anything has happened.
752 */
753 if (same_thread_group(p->real_parent, father))
754 return;
755
756 /* We don't want people slaying init. */
757 p->exit_signal = SIGCHLD;
758
759 /* If it has exited, notify the new parent about this child's death. */
760 if (!p->ptrace &&
761 p->exit_state == EXIT_ZOMBIE && thread_group_empty(p)) {
762 do_notify_parent(p, p->exit_signal);
763 if (task_detached(p)) {
764 p->exit_state = EXIT_DEAD;
765 list_move_tail(&p->sibling, dead);
766 }
767 }
768
769 kill_orphaned_pgrp(p, father);
770}
771
886static void forget_original_parent(struct task_struct *father) 772static void forget_original_parent(struct task_struct *father)
887{ 773{
888 struct task_struct *p, *n, *reaper; 774 struct task_struct *p, *n, *reaper;
889 LIST_HEAD(ptrace_dead); 775 LIST_HEAD(dead_children);
776
777 exit_ptrace(father);
890 778
891 write_lock_irq(&tasklist_lock); 779 write_lock_irq(&tasklist_lock);
892 reaper = find_new_reaper(father); 780 reaper = find_new_reaper(father);
893 /*
894 * First clean up ptrace if we were using it.
895 */
896 ptrace_exit(father, &ptrace_dead);
897 781
898 list_for_each_entry_safe(p, n, &father->children, sibling) { 782 list_for_each_entry_safe(p, n, &father->children, sibling) {
899 p->real_parent = reaper; 783 p->real_parent = reaper;
@@ -901,13 +785,16 @@ static void forget_original_parent(struct task_struct *father)
901 BUG_ON(p->ptrace); 785 BUG_ON(p->ptrace);
902 p->parent = p->real_parent; 786 p->parent = p->real_parent;
903 } 787 }
904 reparent_thread(p, father); 788 reparent_thread(father, p, &dead_children);
905 } 789 }
906
907 write_unlock_irq(&tasklist_lock); 790 write_unlock_irq(&tasklist_lock);
791
908 BUG_ON(!list_empty(&father->children)); 792 BUG_ON(!list_empty(&father->children));
909 793
910 ptrace_exit_finish(father, &ptrace_dead); 794 list_for_each_entry_safe(p, n, &dead_children, sibling) {
795 list_del_init(&p->sibling);
796 release_task(p);
797 }
911} 798}
912 799
913/* 800/*
@@ -1417,6 +1304,18 @@ static int wait_task_zombie(struct task_struct *p, int options,
1417 return retval; 1304 return retval;
1418} 1305}
1419 1306
1307static int *task_stopped_code(struct task_struct *p, bool ptrace)
1308{
1309 if (ptrace) {
1310 if (task_is_stopped_or_traced(p))
1311 return &p->exit_code;
1312 } else {
1313 if (p->signal->flags & SIGNAL_STOP_STOPPED)
1314 return &p->signal->group_exit_code;
1315 }
1316 return NULL;
1317}
1318
1420/* 1319/*
1421 * Handle sys_wait4 work for one task in state TASK_STOPPED. We hold 1320 * Handle sys_wait4 work for one task in state TASK_STOPPED. We hold
1422 * read_lock(&tasklist_lock) on entry. If we return zero, we still hold 1321 * read_lock(&tasklist_lock) on entry. If we return zero, we still hold
@@ -1427,7 +1326,7 @@ static int wait_task_stopped(int ptrace, struct task_struct *p,
1427 int options, struct siginfo __user *infop, 1326 int options, struct siginfo __user *infop,
1428 int __user *stat_addr, struct rusage __user *ru) 1327 int __user *stat_addr, struct rusage __user *ru)
1429{ 1328{
1430 int retval, exit_code, why; 1329 int retval, exit_code, *p_code, why;
1431 uid_t uid = 0; /* unneeded, required by compiler */ 1330 uid_t uid = 0; /* unneeded, required by compiler */
1432 pid_t pid; 1331 pid_t pid;
1433 1332
@@ -1437,22 +1336,16 @@ static int wait_task_stopped(int ptrace, struct task_struct *p,
1437 exit_code = 0; 1336 exit_code = 0;
1438 spin_lock_irq(&p->sighand->siglock); 1337 spin_lock_irq(&p->sighand->siglock);
1439 1338
1440 if (unlikely(!task_is_stopped_or_traced(p))) 1339 p_code = task_stopped_code(p, ptrace);
1441 goto unlock_sig; 1340 if (unlikely(!p_code))
1442
1443 if (!ptrace && p->signal->group_stop_count > 0)
1444 /*
1445 * A group stop is in progress and this is the group leader.
1446 * We won't report until all threads have stopped.
1447 */
1448 goto unlock_sig; 1341 goto unlock_sig;
1449 1342
1450 exit_code = p->exit_code; 1343 exit_code = *p_code;
1451 if (!exit_code) 1344 if (!exit_code)
1452 goto unlock_sig; 1345 goto unlock_sig;
1453 1346
1454 if (!unlikely(options & WNOWAIT)) 1347 if (!unlikely(options & WNOWAIT))
1455 p->exit_code = 0; 1348 *p_code = 0;
1456 1349
1457 /* don't need the RCU readlock here as we're holding a spinlock */ 1350 /* don't need the RCU readlock here as we're holding a spinlock */
1458 uid = __task_cred(p)->uid; 1351 uid = __task_cred(p)->uid;
@@ -1608,7 +1501,7 @@ static int wait_consider_task(struct task_struct *parent, int ptrace,
1608 */ 1501 */
1609 *notask_error = 0; 1502 *notask_error = 0;
1610 1503
1611 if (task_is_stopped_or_traced(p)) 1504 if (task_stopped_code(p, ptrace))
1612 return wait_task_stopped(ptrace, p, options, 1505 return wait_task_stopped(ptrace, p, options,
1613 infop, stat_addr, ru); 1506 infop, stat_addr, ru);
1614 1507
@@ -1812,7 +1705,7 @@ SYSCALL_DEFINE4(wait4, pid_t, upid, int __user *, stat_addr,
1812 pid = find_get_pid(-upid); 1705 pid = find_get_pid(-upid);
1813 } else if (upid == 0) { 1706 } else if (upid == 0) {
1814 type = PIDTYPE_PGID; 1707 type = PIDTYPE_PGID;
1815 pid = get_pid(task_pgrp(current)); 1708 pid = get_task_pid(current, PIDTYPE_PGID);
1816 } else /* upid > 0 */ { 1709 } else /* upid > 0 */ {
1817 type = PIDTYPE_PID; 1710 type = PIDTYPE_PID;
1818 pid = find_get_pid(upid); 1711 pid = find_get_pid(upid);
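
The exit.c changes above hinge on the SIGCHLD semantics that ignoring_children() tests: when the parent sets SIGCHLD to SIG_IGN (or SA_NOCLDWAIT), dead children are reaped by the kernel rather than left as zombies, which is why the reworked reparenting paths mark such children EXIT_DEAD and release them directly. A minimal userspace sketch of that behaviour (ordinary C, not kernel code):

#include <errno.h>
#include <signal.h>
#include <stdio.h>
#include <sys/wait.h>
#include <unistd.h>

int main(void)
{
	/* Tell the kernel we never wait: children are auto-reaped. */
	signal(SIGCHLD, SIG_IGN);

	if (fork() == 0)
		_exit(0);		/* child exits immediately */

	sleep(1);			/* give the child time to exit */

	/* No zombie is left behind, so wait() fails with ECHILD. */
	if (wait(NULL) == -1 && errno == ECHILD)
		puts("child was reaped by the kernel, not by wait()");
	return 0;
}

Running it prints the message because wait() fails with ECHILD once the kernel has already reaped the child on the parent's behalf.
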
diff --git a/kernel/extable.c b/kernel/extable.c
index e136ed8d82ba..81e99d1f0d5b 100644
--- a/kernel/extable.c
+++ b/kernel/extable.c
@@ -41,6 +41,14 @@ const struct exception_table_entry *search_exception_tables(unsigned long addr)
41 return e; 41 return e;
42} 42}
43 43
44static inline int init_kernel_text(unsigned long addr)
45{
46 if (addr >= (unsigned long)_sinittext &&
47 addr <= (unsigned long)_einittext)
48 return 1;
49 return 0;
50}
51
44__notrace_funcgraph int core_kernel_text(unsigned long addr) 52__notrace_funcgraph int core_kernel_text(unsigned long addr)
45{ 53{
46 if (addr >= (unsigned long)_stext && 54 if (addr >= (unsigned long)_stext &&
@@ -48,8 +56,7 @@ __notrace_funcgraph int core_kernel_text(unsigned long addr)
48 return 1; 56 return 1;
49 57
50 if (system_state == SYSTEM_BOOTING && 58 if (system_state == SYSTEM_BOOTING &&
51 addr >= (unsigned long)_sinittext && 59 init_kernel_text(addr))
52 addr <= (unsigned long)_einittext)
53 return 1; 60 return 1;
54 return 0; 61 return 0;
55} 62}
@@ -58,14 +65,26 @@ __notrace_funcgraph int __kernel_text_address(unsigned long addr)
58{ 65{
59 if (core_kernel_text(addr)) 66 if (core_kernel_text(addr))
60 return 1; 67 return 1;
61 return __module_text_address(addr) != NULL; 68 if (is_module_text_address(addr))
69 return 1;
70 /*
71 * There might be init symbols in saved stacktraces.
72 * Give those symbols a chance to be printed in
73 * backtraces (such as lockdep traces).
74 *
75 * Since we are after the module-symbols check, there's
76 * no danger of address overlap:
77 */
78 if (init_kernel_text(addr))
79 return 1;
80 return 0;
62} 81}
63 82
64int kernel_text_address(unsigned long addr) 83int kernel_text_address(unsigned long addr)
65{ 84{
66 if (core_kernel_text(addr)) 85 if (core_kernel_text(addr))
67 return 1; 86 return 1;
68 return module_text_address(addr) != NULL; 87 return is_module_text_address(addr);
69} 88}
70 89
71/* 90/*
@@ -81,5 +100,5 @@ int func_ptr_is_kernel_text(void *ptr)
81 addr = (unsigned long) dereference_function_descriptor(ptr); 100 addr = (unsigned long) dereference_function_descriptor(ptr);
82 if (core_kernel_text(addr)) 101 if (core_kernel_text(addr))
83 return 1; 102 return 1;
84 return module_text_address(addr) != NULL; 103 return is_module_text_address(addr);
85} 104}
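
The extable.c rework makes kernel_text_address() accept core kernel, module and (for saved stacktraces) init text. A hedged sketch of the kind of caller this serves, a backtrace-style filter that only prints stack words which look like return addresses; print_plausible_return_addrs() is an invented helper, not the real arch unwinder:

#include <linux/kernel.h>
#include <linux/kallsyms.h>

static void print_plausible_return_addrs(const unsigned long *stack, int words)
{
	int i;

	for (i = 0; i < words; i++) {
		unsigned long addr = stack[i];

		if (kernel_text_address(addr))	/* core, module or init text */
			print_ip_sym(addr);	/* "[<addr>] symbol+off/len" */
	}
}
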
diff --git a/kernel/fork.c b/kernel/fork.c
index 47c15840a381..660c2b8765bc 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -60,6 +60,7 @@
60#include <linux/tty.h> 60#include <linux/tty.h>
61#include <linux/proc_fs.h> 61#include <linux/proc_fs.h>
62#include <linux/blkdev.h> 62#include <linux/blkdev.h>
63#include <linux/fs_struct.h>
63#include <trace/sched.h> 64#include <trace/sched.h>
64#include <linux/magic.h> 65#include <linux/magic.h>
65 66
@@ -681,38 +682,21 @@ fail_nomem:
681 return retval; 682 return retval;
682} 683}
683 684
684static struct fs_struct *__copy_fs_struct(struct fs_struct *old)
685{
686 struct fs_struct *fs = kmem_cache_alloc(fs_cachep, GFP_KERNEL);
687 /* We don't need to lock fs - think why ;-) */
688 if (fs) {
689 atomic_set(&fs->count, 1);
690 rwlock_init(&fs->lock);
691 fs->umask = old->umask;
692 read_lock(&old->lock);
693 fs->root = old->root;
694 path_get(&old->root);
695 fs->pwd = old->pwd;
696 path_get(&old->pwd);
697 read_unlock(&old->lock);
698 }
699 return fs;
700}
701
702struct fs_struct *copy_fs_struct(struct fs_struct *old)
703{
704 return __copy_fs_struct(old);
705}
706
707EXPORT_SYMBOL_GPL(copy_fs_struct);
708
709static int copy_fs(unsigned long clone_flags, struct task_struct *tsk) 685static int copy_fs(unsigned long clone_flags, struct task_struct *tsk)
710{ 686{
687 struct fs_struct *fs = current->fs;
711 if (clone_flags & CLONE_FS) { 688 if (clone_flags & CLONE_FS) {
712 atomic_inc(&current->fs->count); 689 /* tsk->fs is already what we want */
690 write_lock(&fs->lock);
691 if (fs->in_exec) {
692 write_unlock(&fs->lock);
693 return -EAGAIN;
694 }
695 fs->users++;
696 write_unlock(&fs->lock);
713 return 0; 697 return 0;
714 } 698 }
715 tsk->fs = __copy_fs_struct(current->fs); 699 tsk->fs = copy_fs_struct(fs);
716 if (!tsk->fs) 700 if (!tsk->fs)
717 return -ENOMEM; 701 return -ENOMEM;
718 return 0; 702 return 0;
@@ -841,6 +825,8 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk)
841 atomic_set(&sig->live, 1); 825 atomic_set(&sig->live, 1);
842 init_waitqueue_head(&sig->wait_chldexit); 826 init_waitqueue_head(&sig->wait_chldexit);
843 sig->flags = 0; 827 sig->flags = 0;
828 if (clone_flags & CLONE_NEWPID)
829 sig->flags |= SIGNAL_UNKILLABLE;
844 sig->group_exit_code = 0; 830 sig->group_exit_code = 0;
845 sig->group_exit_task = NULL; 831 sig->group_exit_task = NULL;
846 sig->group_stop_count = 0; 832 sig->group_stop_count = 0;
@@ -1125,7 +1111,7 @@ static struct task_struct *copy_process(unsigned long clone_flags,
1125 goto bad_fork_cleanup_mm; 1111 goto bad_fork_cleanup_mm;
1126 if ((retval = copy_io(clone_flags, p))) 1112 if ((retval = copy_io(clone_flags, p)))
1127 goto bad_fork_cleanup_namespaces; 1113 goto bad_fork_cleanup_namespaces;
1128 retval = copy_thread(0, clone_flags, stack_start, stack_size, p, regs); 1114 retval = copy_thread(clone_flags, stack_start, stack_size, p, regs);
1129 if (retval) 1115 if (retval)
1130 goto bad_fork_cleanup_io; 1116 goto bad_fork_cleanup_io;
1131 1117
@@ -1263,8 +1249,6 @@ static struct task_struct *copy_process(unsigned long clone_flags,
1263 p->signal->leader_pid = pid; 1249 p->signal->leader_pid = pid;
1264 tty_kref_put(p->signal->tty); 1250 tty_kref_put(p->signal->tty);
1265 p->signal->tty = tty_kref_get(current->signal->tty); 1251 p->signal->tty = tty_kref_get(current->signal->tty);
1266 set_task_pgrp(p, task_pgrp_nr(current));
1267 set_task_session(p, task_session_nr(current));
1268 attach_pid(p, PIDTYPE_PGID, task_pgrp(current)); 1252 attach_pid(p, PIDTYPE_PGID, task_pgrp(current));
1269 attach_pid(p, PIDTYPE_SID, task_session(current)); 1253 attach_pid(p, PIDTYPE_SID, task_session(current));
1270 list_add_tail_rcu(&p->tasks, &init_task.tasks); 1254 list_add_tail_rcu(&p->tasks, &init_task.tasks);
@@ -1488,6 +1472,7 @@ void __init proc_caches_init(void)
1488 mm_cachep = kmem_cache_create("mm_struct", 1472 mm_cachep = kmem_cache_create("mm_struct",
1489 sizeof(struct mm_struct), ARCH_MIN_MMSTRUCT_ALIGN, 1473 sizeof(struct mm_struct), ARCH_MIN_MMSTRUCT_ALIGN,
1490 SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL); 1474 SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);
1475 vm_area_cachep = KMEM_CACHE(vm_area_struct, SLAB_PANIC);
1491 mmap_init(); 1476 mmap_init();
1492} 1477}
1493 1478
@@ -1543,12 +1528,16 @@ static int unshare_fs(unsigned long unshare_flags, struct fs_struct **new_fsp)
1543{ 1528{
1544 struct fs_struct *fs = current->fs; 1529 struct fs_struct *fs = current->fs;
1545 1530
1546 if ((unshare_flags & CLONE_FS) && 1531 if (!(unshare_flags & CLONE_FS) || !fs)
1547 (fs && atomic_read(&fs->count) > 1)) { 1532 return 0;
1548 *new_fsp = __copy_fs_struct(current->fs); 1533
1549 if (!*new_fsp) 1534 /* don't need lock here; in the worst case we'll do useless copy */
1550 return -ENOMEM; 1535 if (fs->users == 1)
1551 } 1536 return 0;
1537
1538 *new_fsp = copy_fs_struct(fs);
1539 if (!*new_fsp)
1540 return -ENOMEM;
1552 1541
1553 return 0; 1542 return 0;
1554} 1543}
@@ -1664,8 +1653,13 @@ SYSCALL_DEFINE1(unshare, unsigned long, unshare_flags)
1664 1653
1665 if (new_fs) { 1654 if (new_fs) {
1666 fs = current->fs; 1655 fs = current->fs;
1656 write_lock(&fs->lock);
1667 current->fs = new_fs; 1657 current->fs = new_fs;
1668 new_fs = fs; 1658 if (--fs->users)
1659 new_fs = NULL;
1660 else
1661 new_fs = fs;
1662 write_unlock(&fs->lock);
1669 } 1663 }
1670 1664
1671 if (new_mm) { 1665 if (new_mm) {
@@ -1704,7 +1698,7 @@ bad_unshare_cleanup_sigh:
1704 1698
1705bad_unshare_cleanup_fs: 1699bad_unshare_cleanup_fs:
1706 if (new_fs) 1700 if (new_fs)
1707 put_fs_struct(new_fs); 1701 free_fs_struct(new_fs);
1708 1702
1709bad_unshare_cleanup_thread: 1703bad_unshare_cleanup_thread:
1710bad_unshare_out: 1704bad_unshare_out:
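
The copy_fs()/unshare_fs() changes replace the fs_struct refcount with an explicit users count guarded by fs->lock, because CLONE_FS callers (most visibly threads) share one fs_struct: a chdir() in any thread moves the working directory of every thread. A small userspace illustration of that shared state (plain C with pthreads, not kernel code):

#include <pthread.h>
#include <stdio.h>
#include <unistd.h>

static void *worker(void *arg)
{
	(void)arg;
	if (chdir("/tmp"))		/* moves the cwd of the whole process */
		perror("chdir");
	return NULL;
}

int main(void)
{
	char buf[256];
	pthread_t t;

	pthread_create(&t, NULL, worker, NULL);
	pthread_join(t, NULL);

	if (getcwd(buf, sizeof(buf)))
		printf("cwd after the thread's chdir: %s\n", buf);
	return 0;
}

Built with -pthread, it prints /tmp: both threads see the same fs_struct, which is exactly the sharing that fs->users now counts.
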
diff --git a/kernel/kallsyms.c b/kernel/kallsyms.c
index 7b8b0f21a5b1..374faf9bfdc7 100644
--- a/kernel/kallsyms.c
+++ b/kernel/kallsyms.c
@@ -161,6 +161,25 @@ unsigned long kallsyms_lookup_name(const char *name)
161 return module_kallsyms_lookup_name(name); 161 return module_kallsyms_lookup_name(name);
162} 162}
163 163
164int kallsyms_on_each_symbol(int (*fn)(void *, const char *, struct module *,
165 unsigned long),
166 void *data)
167{
168 char namebuf[KSYM_NAME_LEN];
169 unsigned long i;
170 unsigned int off;
171 int ret;
172
173 for (i = 0, off = 0; i < kallsyms_num_syms; i++) {
174 off = kallsyms_expand_symbol(off, namebuf);
175 ret = fn(data, namebuf, NULL, kallsyms_addresses[i]);
176 if (ret != 0)
177 return ret;
178 }
179 return module_kallsyms_on_each_symbol(fn, data);
180}
181EXPORT_SYMBOL_GPL(kallsyms_on_each_symbol);
182
164static unsigned long get_symbol_pos(unsigned long addr, 183static unsigned long get_symbol_pos(unsigned long addr,
165 unsigned long *symbolsize, 184 unsigned long *symbolsize,
166 unsigned long *offset) 185 unsigned long *offset)
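
kallsyms_on_each_symbol() walks every built-in symbol and then delegates to module_kallsyms_on_each_symbol(), stopping as soon as the callback returns non-zero. A sketch of a caller that resolves a name to an address this way; lookup_symbol_addr(), match_symbol() and struct sym_query are made-up illustration names, while the callback signature and the stop-on-non-zero rule come from the code above:

#include <linux/kallsyms.h>
#include <linux/module.h>
#include <linux/string.h>

struct sym_query {
	const char	*name;
	unsigned long	addr;
};

static int match_symbol(void *data, const char *name,
			struct module *mod, unsigned long addr)
{
	struct sym_query *q = data;

	if (strcmp(name, q->name))
		return 0;	/* keep walking */
	q->addr = addr;
	return 1;		/* non-zero return stops the walk */
}

static unsigned long lookup_symbol_addr(const char *name)
{
	struct sym_query q = { .name = name, .addr = 0 };

	kallsyms_on_each_symbol(match_symbol, &q);
	return q.addr;
}

This effectively re-does kallsyms_lookup_name(), but the same pattern works for arbitrary per-symbol processing across both vmlinux and loaded modules.
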
diff --git a/kernel/kexec.c b/kernel/kexec.c
index 93eed85fe017..5a758c6e4950 100644
--- a/kernel/kexec.c
+++ b/kernel/kexec.c
@@ -42,7 +42,7 @@
42note_buf_t* crash_notes; 42note_buf_t* crash_notes;
43 43
44/* vmcoreinfo stuff */ 44/* vmcoreinfo stuff */
45unsigned char vmcoreinfo_data[VMCOREINFO_BYTES]; 45static unsigned char vmcoreinfo_data[VMCOREINFO_BYTES];
46u32 vmcoreinfo_note[VMCOREINFO_NOTE_SIZE/4]; 46u32 vmcoreinfo_note[VMCOREINFO_NOTE_SIZE/4];
47size_t vmcoreinfo_size; 47size_t vmcoreinfo_size;
48size_t vmcoreinfo_max_size = sizeof(vmcoreinfo_data); 48size_t vmcoreinfo_max_size = sizeof(vmcoreinfo_data);
@@ -1409,6 +1409,7 @@ static int __init crash_save_vmcoreinfo_init(void)
1409 VMCOREINFO_OFFSET(list_head, prev); 1409 VMCOREINFO_OFFSET(list_head, prev);
1410 VMCOREINFO_OFFSET(vm_struct, addr); 1410 VMCOREINFO_OFFSET(vm_struct, addr);
1411 VMCOREINFO_LENGTH(zone.free_area, MAX_ORDER); 1411 VMCOREINFO_LENGTH(zone.free_area, MAX_ORDER);
1412 log_buf_kexec_setup();
1412 VMCOREINFO_LENGTH(free_area.free_list, MIGRATE_TYPES); 1413 VMCOREINFO_LENGTH(free_area.free_list, MIGRATE_TYPES);
1413 VMCOREINFO_NUMBER(NR_FREE_PAGES); 1414 VMCOREINFO_NUMBER(NR_FREE_PAGES);
1414 VMCOREINFO_NUMBER(PG_lru); 1415 VMCOREINFO_NUMBER(PG_lru);
diff --git a/kernel/kmod.c b/kernel/kmod.c
index f0c8f545180d..b750675251e5 100644
--- a/kernel/kmod.c
+++ b/kernel/kmod.c
@@ -50,7 +50,8 @@ static struct workqueue_struct *khelper_wq;
50char modprobe_path[KMOD_PATH_LEN] = "/sbin/modprobe"; 50char modprobe_path[KMOD_PATH_LEN] = "/sbin/modprobe";
51 51
52/** 52/**
53 * request_module - try to load a kernel module 53 * __request_module - try to load a kernel module
54 * @wait: wait (or not) for the operation to complete
54 * @fmt: printf style format string for the name of the module 55 * @fmt: printf style format string for the name of the module
55 * @...: arguments as specified in the format string 56 * @...: arguments as specified in the format string
56 * 57 *
@@ -63,7 +64,7 @@ char modprobe_path[KMOD_PATH_LEN] = "/sbin/modprobe";
63 * If module auto-loading support is disabled then this function 64 * If module auto-loading support is disabled then this function
64 * becomes a no-operation. 65 * becomes a no-operation.
65 */ 66 */
66int request_module(const char *fmt, ...) 67int __request_module(bool wait, const char *fmt, ...)
67{ 68{
68 va_list args; 69 va_list args;
69 char module_name[MODULE_NAME_LEN]; 70 char module_name[MODULE_NAME_LEN];
@@ -108,11 +109,12 @@ int request_module(const char *fmt, ...)
108 return -ENOMEM; 109 return -ENOMEM;
109 } 110 }
110 111
111 ret = call_usermodehelper(modprobe_path, argv, envp, 1); 112 ret = call_usermodehelper(modprobe_path, argv, envp,
113 wait ? UMH_WAIT_PROC : UMH_WAIT_EXEC);
112 atomic_dec(&kmod_concurrent); 114 atomic_dec(&kmod_concurrent);
113 return ret; 115 return ret;
114} 116}
115EXPORT_SYMBOL(request_module); 117EXPORT_SYMBOL(__request_module);
116#endif /* CONFIG_MODULES */ 118#endif /* CONFIG_MODULES */
117 119
118struct subprocess_info { 120struct subprocess_info {
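
With request_module() turned into __request_module(bool wait, ...), synchronous and asynchronous module requests can share one implementation. The matching include/linux/kmod.h change is not part of this diff; the wrappers presumably look roughly like the following sketch (the names and exact form are an assumption):

#include <linux/types.h>

/* Presumed shape of the include/linux/kmod.h counterpart (not in this diff): */
extern int __request_module(bool wait, const char *name, ...);

#define request_module(mod...)        __request_module(true, mod)
#define request_module_nowait(mod...) __request_module(false, mod)

/* A caller that must not sleep waiting for modprobe to finish: */
static void preload_net_family(int family)
{
	request_module_nowait("net-pf-%d", family);
}

Existing callers keep the old blocking behaviour (UMH_WAIT_PROC), while the nowait form maps to UMH_WAIT_EXEC and returns as soon as modprobe has been launched.
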
diff --git a/kernel/lockdep.c b/kernel/lockdep.c
index 3673a3f44d9d..981cd4854281 100644
--- a/kernel/lockdep.c
+++ b/kernel/lockdep.c
@@ -433,13 +433,6 @@ atomic_t nr_find_usage_forwards_checks;
433atomic_t nr_find_usage_forwards_recursions; 433atomic_t nr_find_usage_forwards_recursions;
434atomic_t nr_find_usage_backwards_checks; 434atomic_t nr_find_usage_backwards_checks;
435atomic_t nr_find_usage_backwards_recursions; 435atomic_t nr_find_usage_backwards_recursions;
436# define debug_atomic_inc(ptr) atomic_inc(ptr)
437# define debug_atomic_dec(ptr) atomic_dec(ptr)
438# define debug_atomic_read(ptr) atomic_read(ptr)
439#else
440# define debug_atomic_inc(ptr) do { } while (0)
441# define debug_atomic_dec(ptr) do { } while (0)
442# define debug_atomic_read(ptr) 0
443#endif 436#endif
444 437
445/* 438/*
@@ -1900,9 +1893,9 @@ print_irq_inversion_bug(struct task_struct *curr, struct lock_class *other,
1900 curr->comm, task_pid_nr(curr)); 1893 curr->comm, task_pid_nr(curr));
1901 print_lock(this); 1894 print_lock(this);
1902 if (forwards) 1895 if (forwards)
1903 printk("but this lock took another, %s-irq-unsafe lock in the past:\n", irqclass); 1896 printk("but this lock took another, %s-unsafe lock in the past:\n", irqclass);
1904 else 1897 else
1905 printk("but this lock was taken by another, %s-irq-safe lock in the past:\n", irqclass); 1898 printk("but this lock was taken by another, %s-safe lock in the past:\n", irqclass);
1906 print_lock_name(other); 1899 print_lock_name(other);
1907 printk("\n\nand interrupts could create inverse lock ordering between them.\n\n"); 1900 printk("\n\nand interrupts could create inverse lock ordering between them.\n\n");
1908 1901
@@ -2015,7 +2008,8 @@ typedef int (*check_usage_f)(struct task_struct *, struct held_lock *,
2015 enum lock_usage_bit bit, const char *name); 2008 enum lock_usage_bit bit, const char *name);
2016 2009
2017static int 2010static int
2018mark_lock_irq(struct task_struct *curr, struct held_lock *this, int new_bit) 2011mark_lock_irq(struct task_struct *curr, struct held_lock *this,
2012 enum lock_usage_bit new_bit)
2019{ 2013{
2020 int excl_bit = exclusive_bit(new_bit); 2014 int excl_bit = exclusive_bit(new_bit);
2021 int read = new_bit & 1; 2015 int read = new_bit & 1;
@@ -2043,7 +2037,7 @@ mark_lock_irq(struct task_struct *curr, struct held_lock *this, int new_bit)
2043 * states. 2037 * states.
2044 */ 2038 */
2045 if ((!read || !dir || STRICT_READ_CHECKS) && 2039 if ((!read || !dir || STRICT_READ_CHECKS) &&
2046 !usage(curr, this, excl_bit, state_name(new_bit))) 2040 !usage(curr, this, excl_bit, state_name(new_bit & ~1)))
2047 return 0; 2041 return 0;
2048 2042
2049 /* 2043 /*
diff --git a/kernel/module.c b/kernel/module.c
index f77ac320d0b5..f6e08b7cff7c 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -68,7 +68,8 @@
68 68
69/* List of modules, protected by module_mutex or preempt_disable 69/* List of modules, protected by module_mutex or preempt_disable
70 * (delete uses stop_machine/add uses RCU list operations). */ 70 * (delete uses stop_machine/add uses RCU list operations). */
71static DEFINE_MUTEX(module_mutex); 71DEFINE_MUTEX(module_mutex);
72EXPORT_SYMBOL_GPL(module_mutex);
72static LIST_HEAD(modules); 73static LIST_HEAD(modules);
73 74
74/* Waiting for a module to finish initializing? */ 75/* Waiting for a module to finish initializing? */
@@ -76,7 +77,7 @@ static DECLARE_WAIT_QUEUE_HEAD(module_wq);
76 77
77static BLOCKING_NOTIFIER_HEAD(module_notify_list); 78static BLOCKING_NOTIFIER_HEAD(module_notify_list);
78 79
79/* Bounds of module allocation, for speeding __module_text_address */ 80/* Bounds of module allocation, for speeding __module_address */
80static unsigned long module_addr_min = -1UL, module_addr_max = 0; 81static unsigned long module_addr_min = -1UL, module_addr_max = 0;
81 82
82int register_module_notifier(struct notifier_block * nb) 83int register_module_notifier(struct notifier_block * nb)
@@ -186,17 +187,6 @@ extern const unsigned long __start___kcrctab_unused_gpl[];
186#define symversion(base, idx) ((base != NULL) ? ((base) + (idx)) : NULL) 187#define symversion(base, idx) ((base != NULL) ? ((base) + (idx)) : NULL)
187#endif 188#endif
188 189
189struct symsearch {
190 const struct kernel_symbol *start, *stop;
191 const unsigned long *crcs;
192 enum {
193 NOT_GPL_ONLY,
194 GPL_ONLY,
195 WILL_BE_GPL_ONLY,
196 } licence;
197 bool unused;
198};
199
200static bool each_symbol_in_section(const struct symsearch *arr, 190static bool each_symbol_in_section(const struct symsearch *arr,
201 unsigned int arrsize, 191 unsigned int arrsize,
202 struct module *owner, 192 struct module *owner,
@@ -217,10 +207,8 @@ static bool each_symbol_in_section(const struct symsearch *arr,
217} 207}
218 208
219/* Returns true as soon as fn returns true, otherwise false. */ 209/* Returns true as soon as fn returns true, otherwise false. */
220static bool each_symbol(bool (*fn)(const struct symsearch *arr, 210bool each_symbol(bool (*fn)(const struct symsearch *arr, struct module *owner,
221 struct module *owner, 211 unsigned int symnum, void *data), void *data)
222 unsigned int symnum, void *data),
223 void *data)
224{ 212{
225 struct module *mod; 213 struct module *mod;
226 const struct symsearch arr[] = { 214 const struct symsearch arr[] = {
@@ -273,6 +261,7 @@ static bool each_symbol(bool (*fn)(const struct symsearch *arr,
273 } 261 }
274 return false; 262 return false;
275} 263}
264EXPORT_SYMBOL_GPL(each_symbol);
276 265
277struct find_symbol_arg { 266struct find_symbol_arg {
278 /* Input */ 267 /* Input */
@@ -283,7 +272,7 @@ struct find_symbol_arg {
283 /* Output */ 272 /* Output */
284 struct module *owner; 273 struct module *owner;
285 const unsigned long *crc; 274 const unsigned long *crc;
286 unsigned long value; 275 const struct kernel_symbol *sym;
287}; 276};
288 277
289static bool find_symbol_in_section(const struct symsearch *syms, 278static bool find_symbol_in_section(const struct symsearch *syms,
@@ -324,17 +313,17 @@ static bool find_symbol_in_section(const struct symsearch *syms,
324 313
325 fsa->owner = owner; 314 fsa->owner = owner;
326 fsa->crc = symversion(syms->crcs, symnum); 315 fsa->crc = symversion(syms->crcs, symnum);
327 fsa->value = syms->start[symnum].value; 316 fsa->sym = &syms->start[symnum];
328 return true; 317 return true;
329} 318}
330 319
331/* Find a symbol, return value, (optional) crc and (optional) module 320/* Find a symbol and return it, along with, (optional) crc and
332 * which owns it */ 321 * (optional) module which owns it */
333static unsigned long find_symbol(const char *name, 322const struct kernel_symbol *find_symbol(const char *name,
334 struct module **owner, 323 struct module **owner,
335 const unsigned long **crc, 324 const unsigned long **crc,
336 bool gplok, 325 bool gplok,
337 bool warn) 326 bool warn)
338{ 327{
339 struct find_symbol_arg fsa; 328 struct find_symbol_arg fsa;
340 329
@@ -347,15 +336,16 @@ static unsigned long find_symbol(const char *name,
347 *owner = fsa.owner; 336 *owner = fsa.owner;
348 if (crc) 337 if (crc)
349 *crc = fsa.crc; 338 *crc = fsa.crc;
350 return fsa.value; 339 return fsa.sym;
351 } 340 }
352 341
353 DEBUGP("Failed to find symbol %s\n", name); 342 DEBUGP("Failed to find symbol %s\n", name);
354 return -ENOENT; 343 return NULL;
355} 344}
345EXPORT_SYMBOL_GPL(find_symbol);
356 346
357/* Search for module by name: must hold module_mutex. */ 347/* Search for module by name: must hold module_mutex. */
358static struct module *find_module(const char *name) 348struct module *find_module(const char *name)
359{ 349{
360 struct module *mod; 350 struct module *mod;
361 351
@@ -365,6 +355,7 @@ static struct module *find_module(const char *name)
365 } 355 }
366 return NULL; 356 return NULL;
367} 357}
358EXPORT_SYMBOL_GPL(find_module);
368 359
369#ifdef CONFIG_SMP 360#ifdef CONFIG_SMP
370 361
@@ -641,7 +632,7 @@ static int already_uses(struct module *a, struct module *b)
641} 632}
642 633
643/* Module a uses b */ 634/* Module a uses b */
644static int use_module(struct module *a, struct module *b) 635int use_module(struct module *a, struct module *b)
645{ 636{
646 struct module_use *use; 637 struct module_use *use;
647 int no_warn, err; 638 int no_warn, err;
@@ -674,6 +665,7 @@ static int use_module(struct module *a, struct module *b)
674 no_warn = sysfs_create_link(b->holders_dir, &a->mkobj.kobj, a->name); 665 no_warn = sysfs_create_link(b->holders_dir, &a->mkobj.kobj, a->name);
675 return 1; 666 return 1;
676} 667}
668EXPORT_SYMBOL_GPL(use_module);
677 669
678/* Clear the unload stuff of the module. */ 670/* Clear the unload stuff of the module. */
679static void module_unload_free(struct module *mod) 671static void module_unload_free(struct module *mod)
@@ -894,7 +886,7 @@ void __symbol_put(const char *symbol)
894 struct module *owner; 886 struct module *owner;
895 887
896 preempt_disable(); 888 preempt_disable();
897 if (IS_ERR_VALUE(find_symbol(symbol, &owner, NULL, true, false))) 889 if (!find_symbol(symbol, &owner, NULL, true, false))
898 BUG(); 890 BUG();
899 module_put(owner); 891 module_put(owner);
900 preempt_enable(); 892 preempt_enable();
@@ -908,8 +900,10 @@ void symbol_put_addr(void *addr)
908 if (core_kernel_text((unsigned long)addr)) 900 if (core_kernel_text((unsigned long)addr))
909 return; 901 return;
910 902
911 if (!(modaddr = module_text_address((unsigned long)addr))) 903 /* module_text_address is safe here: we're supposed to have reference
912 BUG(); 904 * to module from symbol_get, so it can't go away. */
905 modaddr = __module_text_address((unsigned long)addr);
906 BUG_ON(!modaddr);
913 module_put(modaddr); 907 module_put(modaddr);
914} 908}
915EXPORT_SYMBOL_GPL(symbol_put_addr); 909EXPORT_SYMBOL_GPL(symbol_put_addr);
@@ -949,10 +943,11 @@ static inline void module_unload_free(struct module *mod)
949{ 943{
950} 944}
951 945
952static inline int use_module(struct module *a, struct module *b) 946int use_module(struct module *a, struct module *b)
953{ 947{
954 return strong_try_module_get(b) == 0; 948 return strong_try_module_get(b) == 0;
955} 949}
950EXPORT_SYMBOL_GPL(use_module);
956 951
957static inline void module_unload_init(struct module *mod) 952static inline void module_unload_init(struct module *mod)
958{ 953{
@@ -995,12 +990,12 @@ static struct module_attribute *modinfo_attrs[] = {
995 990
996static const char vermagic[] = VERMAGIC_STRING; 991static const char vermagic[] = VERMAGIC_STRING;
997 992
998static int try_to_force_load(struct module *mod, const char *symname) 993static int try_to_force_load(struct module *mod, const char *reason)
999{ 994{
1000#ifdef CONFIG_MODULE_FORCE_LOAD 995#ifdef CONFIG_MODULE_FORCE_LOAD
1001 if (!test_taint(TAINT_FORCED_MODULE)) 996 if (!test_taint(TAINT_FORCED_MODULE))
1002 printk("%s: no version for \"%s\" found: kernel tainted.\n", 997 printk(KERN_WARNING "%s: %s: kernel tainted.\n",
1003 mod->name, symname); 998 mod->name, reason);
1004 add_taint_module(mod, TAINT_FORCED_MODULE); 999 add_taint_module(mod, TAINT_FORCED_MODULE);
1005 return 0; 1000 return 0;
1006#else 1001#else
@@ -1057,9 +1052,9 @@ static inline int check_modstruct_version(Elf_Shdr *sechdrs,
1057{ 1052{
1058 const unsigned long *crc; 1053 const unsigned long *crc;
1059 1054
1060 if (IS_ERR_VALUE(find_symbol("struct_module", NULL, &crc, true, false))) 1055 if (!find_symbol("module_layout", NULL, &crc, true, false))
1061 BUG(); 1056 BUG();
1062 return check_version(sechdrs, versindex, "struct_module", mod, crc); 1057 return check_version(sechdrs, versindex, "module_layout", mod, crc);
1063} 1058}
1064 1059
1065/* First part is kernel version, which we ignore if module has crcs. */ 1060/* First part is kernel version, which we ignore if module has crcs. */
@@ -1098,25 +1093,25 @@ static inline int same_magic(const char *amagic, const char *bmagic,
1098 1093
1099/* Resolve a symbol for this module. I.e. if we find one, record usage. 1094/* Resolve a symbol for this module. I.e. if we find one, record usage.
1100 Must be holding module_mutex. */ 1095 Must be holding module_mutex. */
1101static unsigned long resolve_symbol(Elf_Shdr *sechdrs, 1096static const struct kernel_symbol *resolve_symbol(Elf_Shdr *sechdrs,
1102 unsigned int versindex, 1097 unsigned int versindex,
1103 const char *name, 1098 const char *name,
1104 struct module *mod) 1099 struct module *mod)
1105{ 1100{
1106 struct module *owner; 1101 struct module *owner;
1107 unsigned long ret; 1102 const struct kernel_symbol *sym;
1108 const unsigned long *crc; 1103 const unsigned long *crc;
1109 1104
1110 ret = find_symbol(name, &owner, &crc, 1105 sym = find_symbol(name, &owner, &crc,
1111 !(mod->taints & (1 << TAINT_PROPRIETARY_MODULE)), true); 1106 !(mod->taints & (1 << TAINT_PROPRIETARY_MODULE)), true);
1112 if (!IS_ERR_VALUE(ret)) { 1107 /* use_module can fail due to OOM,
1113 /* use_module can fail due to OOM, 1108 or module initialization or unloading */
1114 or module initialization or unloading */ 1109 if (sym) {
1115 if (!check_version(sechdrs, versindex, name, mod, crc) || 1110 if (!check_version(sechdrs, versindex, name, mod, crc) ||
1116 !use_module(mod, owner)) 1111 !use_module(mod, owner))
1117 ret = -EINVAL; 1112 sym = NULL;
1118 } 1113 }
1119 return ret; 1114 return sym;
1120} 1115}
1121 1116
1122/* 1117/*
@@ -1491,6 +1486,9 @@ static void free_module(struct module *mod)
1491 /* Module unload stuff */ 1486 /* Module unload stuff */
1492 module_unload_free(mod); 1487 module_unload_free(mod);
1493 1488
1489 /* Free any allocated parameters. */
1490 destroy_params(mod->kp, mod->num_kp);
1491
1494 /* release any pointers to mcount in this module */ 1492 /* release any pointers to mcount in this module */
1495 ftrace_release(mod->module_core, mod->core_size); 1493 ftrace_release(mod->module_core, mod->core_size);
1496 1494
@@ -1513,17 +1511,15 @@ static void free_module(struct module *mod)
1513void *__symbol_get(const char *symbol) 1511void *__symbol_get(const char *symbol)
1514{ 1512{
1515 struct module *owner; 1513 struct module *owner;
1516 unsigned long value; 1514 const struct kernel_symbol *sym;
1517 1515
1518 preempt_disable(); 1516 preempt_disable();
1519 value = find_symbol(symbol, &owner, NULL, true, true); 1517 sym = find_symbol(symbol, &owner, NULL, true, true);
1520 if (IS_ERR_VALUE(value)) 1518 if (sym && strong_try_module_get(owner))
1521 value = 0; 1519 sym = NULL;
1522 else if (strong_try_module_get(owner))
1523 value = 0;
1524 preempt_enable(); 1520 preempt_enable();
1525 1521
1526 return (void *)value; 1522 return sym ? (void *)sym->value : NULL;
1527} 1523}
1528EXPORT_SYMBOL_GPL(__symbol_get); 1524EXPORT_SYMBOL_GPL(__symbol_get);
1529 1525
@@ -1551,8 +1547,7 @@ static int verify_export_symbols(struct module *mod)
1551 1547
1552 for (i = 0; i < ARRAY_SIZE(arr); i++) { 1548 for (i = 0; i < ARRAY_SIZE(arr); i++) {
1553 for (s = arr[i].sym; s < arr[i].sym + arr[i].num; s++) { 1549 for (s = arr[i].sym; s < arr[i].sym + arr[i].num; s++) {
1554 if (!IS_ERR_VALUE(find_symbol(s->name, &owner, 1550 if (find_symbol(s->name, &owner, NULL, true, false)) {
1555 NULL, true, false))) {
1556 printk(KERN_ERR 1551 printk(KERN_ERR
1557 "%s: exports duplicate symbol %s" 1552 "%s: exports duplicate symbol %s"
1558 " (owned by %s)\n", 1553 " (owned by %s)\n",
@@ -1576,6 +1571,7 @@ static int simplify_symbols(Elf_Shdr *sechdrs,
1576 unsigned long secbase; 1571 unsigned long secbase;
1577 unsigned int i, n = sechdrs[symindex].sh_size / sizeof(Elf_Sym); 1572 unsigned int i, n = sechdrs[symindex].sh_size / sizeof(Elf_Sym);
1578 int ret = 0; 1573 int ret = 0;
1574 const struct kernel_symbol *ksym;
1579 1575
1580 for (i = 1; i < n; i++) { 1576 for (i = 1; i < n; i++) {
1581 switch (sym[i].st_shndx) { 1577 switch (sym[i].st_shndx) {
@@ -1595,13 +1591,14 @@ static int simplify_symbols(Elf_Shdr *sechdrs,
1595 break; 1591 break;
1596 1592
1597 case SHN_UNDEF: 1593 case SHN_UNDEF:
1598 sym[i].st_value 1594 ksym = resolve_symbol(sechdrs, versindex,
1599 = resolve_symbol(sechdrs, versindex, 1595 strtab + sym[i].st_name, mod);
1600 strtab + sym[i].st_name, mod);
1601
1602 /* Ok if resolved. */ 1596 /* Ok if resolved. */
1603 if (!IS_ERR_VALUE(sym[i].st_value)) 1597 if (ksym) {
1598 sym[i].st_value = ksym->value;
1604 break; 1599 break;
1600 }
1601
1605 /* Ok if weak. */ 1602 /* Ok if weak. */
1606 if (ELF_ST_BIND(sym[i].st_info) == STB_WEAK) 1603 if (ELF_ST_BIND(sym[i].st_info) == STB_WEAK)
1607 break; 1604 break;
@@ -1676,8 +1673,7 @@ static void layout_sections(struct module *mod,
1676 if ((s->sh_flags & masks[m][0]) != masks[m][0] 1673 if ((s->sh_flags & masks[m][0]) != masks[m][0]
1677 || (s->sh_flags & masks[m][1]) 1674 || (s->sh_flags & masks[m][1])
1678 || s->sh_entsize != ~0UL 1675 || s->sh_entsize != ~0UL
1679 || strncmp(secstrings + s->sh_name, 1676 || strstarts(secstrings + s->sh_name, ".init"))
1680 ".init", 5) == 0)
1681 continue; 1677 continue;
1682 s->sh_entsize = get_offset(mod, &mod->core_size, s, i); 1678 s->sh_entsize = get_offset(mod, &mod->core_size, s, i);
1683 DEBUGP("\t%s\n", secstrings + s->sh_name); 1679 DEBUGP("\t%s\n", secstrings + s->sh_name);
@@ -1694,8 +1690,7 @@ static void layout_sections(struct module *mod,
1694 if ((s->sh_flags & masks[m][0]) != masks[m][0] 1690 if ((s->sh_flags & masks[m][0]) != masks[m][0]
1695 || (s->sh_flags & masks[m][1]) 1691 || (s->sh_flags & masks[m][1])
1696 || s->sh_entsize != ~0UL 1692 || s->sh_entsize != ~0UL
1697 || strncmp(secstrings + s->sh_name, 1693 || !strstarts(secstrings + s->sh_name, ".init"))
1698 ".init", 5) != 0)
1699 continue; 1694 continue;
1700 s->sh_entsize = (get_offset(mod, &mod->init_size, s, i) 1695 s->sh_entsize = (get_offset(mod, &mod->init_size, s, i)
1701 | INIT_OFFSET_MASK); 1696 | INIT_OFFSET_MASK);
@@ -1828,8 +1823,7 @@ static char elf_type(const Elf_Sym *sym,
1828 else 1823 else
1829 return 'b'; 1824 return 'b';
1830 } 1825 }
1831 if (strncmp(secstrings + sechdrs[sym->st_shndx].sh_name, 1826 if (strstarts(secstrings + sechdrs[sym->st_shndx].sh_name, ".debug"))
1832 ".debug", strlen(".debug")) == 0)
1833 return 'n'; 1827 return 'n';
1834 return '?'; 1828 return '?';
1835} 1829}
@@ -1898,8 +1892,7 @@ static noinline struct module *load_module(void __user *umod,
1898 unsigned int symindex = 0; 1892 unsigned int symindex = 0;
1899 unsigned int strindex = 0; 1893 unsigned int strindex = 0;
1900 unsigned int modindex, versindex, infoindex, pcpuindex; 1894 unsigned int modindex, versindex, infoindex, pcpuindex;
1901 unsigned int num_kp, num_mcount; 1895 unsigned int num_mcount;
1902 struct kernel_param *kp;
1903 struct module *mod; 1896 struct module *mod;
1904 long err = 0; 1897 long err = 0;
1905 void *percpu = NULL, *ptr = NULL; /* Stops spurious gcc warning */ 1898 void *percpu = NULL, *ptr = NULL; /* Stops spurious gcc warning */
@@ -1916,12 +1909,6 @@ static noinline struct module *load_module(void __user *umod,
1916 if (len > 64 * 1024 * 1024 || (hdr = vmalloc(len)) == NULL) 1909 if (len > 64 * 1024 * 1024 || (hdr = vmalloc(len)) == NULL)
1917 return ERR_PTR(-ENOMEM); 1910 return ERR_PTR(-ENOMEM);
1918 1911
1919 /* Create stop_machine threads since the error path relies on
1920 * a non-failing stop_machine call. */
1921 err = stop_machine_create();
1922 if (err)
1923 goto free_hdr;
1924
1925 if (copy_from_user(hdr, umod, len) != 0) { 1912 if (copy_from_user(hdr, umod, len) != 0) {
1926 err = -EFAULT; 1913 err = -EFAULT;
1927 goto free_hdr; 1914 goto free_hdr;
@@ -1962,9 +1949,12 @@ static noinline struct module *load_module(void __user *umod,
1962 } 1949 }
1963#ifndef CONFIG_MODULE_UNLOAD 1950#ifndef CONFIG_MODULE_UNLOAD
1964 /* Don't load .exit sections */ 1951 /* Don't load .exit sections */
1965 if (strncmp(secstrings+sechdrs[i].sh_name, ".exit", 5) == 0) 1952 if (strstarts(secstrings+sechdrs[i].sh_name, ".exit"))
1966 sechdrs[i].sh_flags &= ~(unsigned long)SHF_ALLOC; 1953 sechdrs[i].sh_flags &= ~(unsigned long)SHF_ALLOC;
1967#endif 1954#endif
1955 /* Don't keep __versions around; it's just for loading. */
1956 if (strcmp(secstrings + sechdrs[i].sh_name, "__versions") == 0)
1957 sechdrs[i].sh_flags &= ~(unsigned long)SHF_ALLOC;
1968 } 1958 }
1969 1959
1970 modindex = find_sec(hdr, sechdrs, secstrings, 1960 modindex = find_sec(hdr, sechdrs, secstrings,
@@ -2006,7 +1996,7 @@ static noinline struct module *load_module(void __user *umod,
2006 modmagic = get_modinfo(sechdrs, infoindex, "vermagic"); 1996 modmagic = get_modinfo(sechdrs, infoindex, "vermagic");
2007 /* This is allowed: modprobe --force will invalidate it. */ 1997 /* This is allowed: modprobe --force will invalidate it. */
2008 if (!modmagic) { 1998 if (!modmagic) {
2009 err = try_to_force_load(mod, "magic"); 1999 err = try_to_force_load(mod, "bad vermagic");
2010 if (err) 2000 if (err)
2011 goto free_hdr; 2001 goto free_hdr;
2012 } else if (!same_magic(modmagic, vermagic, versindex)) { 2002 } else if (!same_magic(modmagic, vermagic, versindex)) {
@@ -2144,8 +2134,8 @@ static noinline struct module *load_module(void __user *umod,
2144 2134
2145 /* Now we've got everything in the final locations, we can 2135 /* Now we've got everything in the final locations, we can
2146 * find optional sections. */ 2136 * find optional sections. */
2147 kp = section_objs(hdr, sechdrs, secstrings, "__param", sizeof(*kp), 2137 mod->kp = section_objs(hdr, sechdrs, secstrings, "__param",
2148 &num_kp); 2138 sizeof(*mod->kp), &mod->num_kp);
2149 mod->syms = section_objs(hdr, sechdrs, secstrings, "__ksymtab", 2139 mod->syms = section_objs(hdr, sechdrs, secstrings, "__ksymtab",
2150 sizeof(*mod->syms), &mod->num_syms); 2140 sizeof(*mod->syms), &mod->num_syms);
2151 mod->crcs = section_addr(hdr, sechdrs, secstrings, "__kcrctab"); 2141 mod->crcs = section_addr(hdr, sechdrs, secstrings, "__kcrctab");
@@ -2195,8 +2185,8 @@ static noinline struct module *load_module(void __user *umod,
2195 || (mod->num_unused_gpl_syms && !mod->unused_gpl_crcs) 2185 || (mod->num_unused_gpl_syms && !mod->unused_gpl_crcs)
2196#endif 2186#endif
2197 ) { 2187 ) {
2198 printk(KERN_WARNING "%s: No versions for exported symbols.\n", mod->name); 2188 err = try_to_force_load(mod,
2199 err = try_to_force_load(mod, "nocrc"); 2189 "no versions for exported symbols");
2200 if (err) 2190 if (err)
2201 goto cleanup; 2191 goto cleanup;
2202 } 2192 }
@@ -2291,11 +2281,11 @@ static noinline struct module *load_module(void __user *umod,
2291 */ 2281 */
2292 list_add_rcu(&mod->list, &modules); 2282 list_add_rcu(&mod->list, &modules);
2293 2283
2294 err = parse_args(mod->name, mod->args, kp, num_kp, NULL); 2284 err = parse_args(mod->name, mod->args, mod->kp, mod->num_kp, NULL);
2295 if (err < 0) 2285 if (err < 0)
2296 goto unlink; 2286 goto unlink;
2297 2287
2298 err = mod_sysfs_setup(mod, kp, num_kp); 2288 err = mod_sysfs_setup(mod, mod->kp, mod->num_kp);
2299 if (err < 0) 2289 if (err < 0)
2300 goto unlink; 2290 goto unlink;
2301 add_sect_attrs(mod, hdr->e_shnum, secstrings, sechdrs); 2291 add_sect_attrs(mod, hdr->e_shnum, secstrings, sechdrs);
@@ -2304,12 +2294,13 @@ static noinline struct module *load_module(void __user *umod,
2304 /* Get rid of temporary copy */ 2294 /* Get rid of temporary copy */
2305 vfree(hdr); 2295 vfree(hdr);
2306 2296
2307 stop_machine_destroy();
2308 /* Done! */ 2297 /* Done! */
2309 return mod; 2298 return mod;
2310 2299
2311 unlink: 2300 unlink:
2312 stop_machine(__unlink_module, mod, NULL); 2301 /* Unlink carefully: kallsyms could be walking list. */
2302 list_del_rcu(&mod->list);
2303 synchronize_sched();
2313 module_arch_cleanup(mod); 2304 module_arch_cleanup(mod);
2314 cleanup: 2305 cleanup:
2315 kobject_del(&mod->mkobj.kobj); 2306 kobject_del(&mod->mkobj.kobj);
@@ -2317,8 +2308,8 @@ static noinline struct module *load_module(void __user *umod,
2317 ftrace_release(mod->module_core, mod->core_size); 2308 ftrace_release(mod->module_core, mod->core_size);
2318 free_unload: 2309 free_unload:
2319 module_unload_free(mod); 2310 module_unload_free(mod);
2320 free_init:
2321#if defined(CONFIG_MODULE_UNLOAD) && defined(CONFIG_SMP) 2311#if defined(CONFIG_MODULE_UNLOAD) && defined(CONFIG_SMP)
2312 free_init:
2322 percpu_modfree(mod->refptr); 2313 percpu_modfree(mod->refptr);
2323#endif 2314#endif
2324 module_free(mod, mod->module_init); 2315 module_free(mod, mod->module_init);
@@ -2332,7 +2323,6 @@ static noinline struct module *load_module(void __user *umod,
2332 kfree(args); 2323 kfree(args);
2333 free_hdr: 2324 free_hdr:
2334 vfree(hdr); 2325 vfree(hdr);
2335 stop_machine_destroy();
2336 return ERR_PTR(err); 2326 return ERR_PTR(err);
2337 2327
2338 truncated: 2328 truncated:
@@ -2609,6 +2599,25 @@ unsigned long module_kallsyms_lookup_name(const char *name)
2609 preempt_enable(); 2599 preempt_enable();
2610 return ret; 2600 return ret;
2611} 2601}
2602
2603int module_kallsyms_on_each_symbol(int (*fn)(void *, const char *,
2604 struct module *, unsigned long),
2605 void *data)
2606{
2607 struct module *mod;
2608 unsigned int i;
2609 int ret;
2610
2611 list_for_each_entry(mod, &modules, list) {
2612 for (i = 0; i < mod->num_symtab; i++) {
2613 ret = fn(data, mod->strtab + mod->symtab[i].st_name,
2614 mod, mod->symtab[i].st_value);
2615 if (ret != 0)
2616 return ret;
2617 }
2618 }
2619 return 0;
2620}
2612#endif /* CONFIG_KALLSYMS */ 2621#endif /* CONFIG_KALLSYMS */
2613 2622
2614static char *module_flags(struct module *mod, char *buf) 2623static char *module_flags(struct module *mod, char *buf)
@@ -2744,29 +2753,31 @@ const struct exception_table_entry *search_module_extables(unsigned long addr)
2744} 2753}
2745 2754
2746/* 2755/*
2747 * Is this a valid module address? 2756 * is_module_address - is this address inside a module?
2757 * @addr: the address to check.
2758 *
2759 * See is_module_text_address() if you simply want to see if the address
2760 * is code (not data).
2748 */ 2761 */
2749int is_module_address(unsigned long addr) 2762bool is_module_address(unsigned long addr)
2750{ 2763{
2751 struct module *mod; 2764 bool ret;
2752 2765
2753 preempt_disable(); 2766 preempt_disable();
2754 2767 ret = __module_address(addr) != NULL;
2755 list_for_each_entry_rcu(mod, &modules, list) {
2756 if (within_module_core(addr, mod)) {
2757 preempt_enable();
2758 return 1;
2759 }
2760 }
2761
2762 preempt_enable(); 2768 preempt_enable();
2763 2769
2764 return 0; 2770 return ret;
2765} 2771}
2766 2772
2767 2773/*
2768/* Is this a valid kernel address? */ 2774 * __module_address - get the module which contains an address.
2769__notrace_funcgraph struct module *__module_text_address(unsigned long addr) 2775 * @addr: the address.
2776 *
2777 * Must be called with preempt disabled or module mutex held so that
2778 * module doesn't get freed during this.
2779 */
2780__notrace_funcgraph struct module *__module_address(unsigned long addr)
2770{ 2781{
2771 struct module *mod; 2782 struct module *mod;
2772 2783
@@ -2774,22 +2785,51 @@ __notrace_funcgraph struct module *__module_text_address(unsigned long addr)
2774 return NULL; 2785 return NULL;
2775 2786
2776 list_for_each_entry_rcu(mod, &modules, list) 2787 list_for_each_entry_rcu(mod, &modules, list)
2777 if (within(addr, mod->module_init, mod->init_text_size) 2788 if (within_module_core(addr, mod)
2778 || within(addr, mod->module_core, mod->core_text_size)) 2789 || within_module_init(addr, mod))
2779 return mod; 2790 return mod;
2780 return NULL; 2791 return NULL;
2781} 2792}
2793EXPORT_SYMBOL_GPL(__module_address);
2782 2794
2783struct module *module_text_address(unsigned long addr) 2795/*
2796 * is_module_text_address - is this address inside module code?
2797 * @addr: the address to check.
2798 *
2799 * See is_module_address() if you simply want to see if the address is
2800 * anywhere in a module. See kernel_text_address() for testing if an
2801 * address corresponds to kernel or module code.
2802 */
2803bool is_module_text_address(unsigned long addr)
2784{ 2804{
2785 struct module *mod; 2805 bool ret;
2786 2806
2787 preempt_disable(); 2807 preempt_disable();
2788 mod = __module_text_address(addr); 2808 ret = __module_text_address(addr) != NULL;
2789 preempt_enable(); 2809 preempt_enable();
2790 2810
2811 return ret;
2812}
2813
2814/*
2815 * __module_text_address - get the module whose code contains an address.
2816 * @addr: the address.
2817 *
2818 * Must be called with preempt disabled or module mutex held so that
2819 * module doesn't get freed during this.
2820 */
2821struct module *__module_text_address(unsigned long addr)
2822{
2823 struct module *mod = __module_address(addr);
2824 if (mod) {
2825 /* Make sure it's within the text section. */
2826 if (!within(addr, mod->module_init, mod->init_text_size)
2827 && !within(addr, mod->module_core, mod->core_text_size))
2828 mod = NULL;
2829 }
2791 return mod; 2830 return mod;
2792} 2831}
2832EXPORT_SYMBOL_GPL(__module_text_address);
2793 2833
2794/* Don't grab lock, we're oopsing. */ 2834/* Don't grab lock, we're oopsing. */
2795void print_modules(void) 2835void print_modules(void)
@@ -2809,9 +2849,17 @@ void print_modules(void)
2809} 2849}
2810 2850
2811#ifdef CONFIG_MODVERSIONS 2851#ifdef CONFIG_MODVERSIONS
2812/* Generate the signature for struct module here, too, for modversions. */ 2852/* Generate the signature for all relevant module structures here.
2813void struct_module(struct module *mod) { return; } 2853 * If these change, we don't want to try to parse the module. */
2814EXPORT_SYMBOL(struct_module); 2854void module_layout(struct module *mod,
2855 struct modversion_info *ver,
2856 struct kernel_param *kp,
2857 struct kernel_symbol *ks,
2858 struct marker *marker,
2859 struct tracepoint *tp)
2860{
2861}
2862EXPORT_SYMBOL(module_layout);
2815#endif 2863#endif
2816 2864
2817#ifdef CONFIG_MARKERS 2865#ifdef CONFIG_MARKERS
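
The module.c rework centres address lookups on __module_address(), with __module_text_address() narrowing the hit to the text sections and is_module_address()/is_module_text_address() providing the preempt-safe boolean forms. A hedged sketch of a diagnostic helper built on these (report_owner() is an invented name, not kernel API):

#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/preempt.h>

static void report_owner(unsigned long addr)
{
	struct module *mod;

	/* __module_address() needs preemption off (or module_mutex held)
	 * so the module can't be freed while we look at it. */
	preempt_disable();
	mod = __module_address(addr);
	if (!mod)
		printk(KERN_INFO "0x%lx: core kernel or unmapped\n", addr);
	else if (within_module_init(addr, mod))
		printk(KERN_INFO "0x%lx: init section of %s\n", addr, mod->name);
	else
		printk(KERN_INFO "0x%lx: %s\n", addr, mod->name);
	preempt_enable();
}
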
diff --git a/kernel/ns_cgroup.c b/kernel/ns_cgroup.c
index 78bc3fdac0d2..5aa854f9e5ae 100644
--- a/kernel/ns_cgroup.c
+++ b/kernel/ns_cgroup.c
@@ -34,7 +34,7 @@ int ns_cgroup_clone(struct task_struct *task, struct pid *pid)
34 34
35/* 35/*
36 * Rules: 36 * Rules:
37 * 1. you can only enter a cgroup which is a child of your current 37 * 1. you can only enter a cgroup which is a descendant of your current
38 * cgroup 38 * cgroup
39 * 2. you can only place another process into a cgroup if 39 * 2. you can only place another process into a cgroup if
40 * a. you have CAP_SYS_ADMIN 40 * a. you have CAP_SYS_ADMIN
@@ -45,21 +45,15 @@ int ns_cgroup_clone(struct task_struct *task, struct pid *pid)
45static int ns_can_attach(struct cgroup_subsys *ss, 45static int ns_can_attach(struct cgroup_subsys *ss,
46 struct cgroup *new_cgroup, struct task_struct *task) 46 struct cgroup *new_cgroup, struct task_struct *task)
47{ 47{
48 struct cgroup *orig;
49
50 if (current != task) { 48 if (current != task) {
51 if (!capable(CAP_SYS_ADMIN)) 49 if (!capable(CAP_SYS_ADMIN))
52 return -EPERM; 50 return -EPERM;
53 51
54 if (!cgroup_is_descendant(new_cgroup)) 52 if (!cgroup_is_descendant(new_cgroup, current))
55 return -EPERM; 53 return -EPERM;
56 } 54 }
57 55
58 if (atomic_read(&new_cgroup->count) != 0) 56 if (!cgroup_is_descendant(new_cgroup, task))
59 return -EPERM;
60
61 orig = task_cgroup(task, ns_subsys_id);
62 if (orig && orig != new_cgroup->parent)
63 return -EPERM; 57 return -EPERM;
64 58
65 return 0; 59 return 0;
@@ -77,7 +71,7 @@ static struct cgroup_subsys_state *ns_create(struct cgroup_subsys *ss,
77 71
78 if (!capable(CAP_SYS_ADMIN)) 72 if (!capable(CAP_SYS_ADMIN))
79 return ERR_PTR(-EPERM); 73 return ERR_PTR(-EPERM);
80 if (!cgroup_is_descendant(cgroup)) 74 if (!cgroup_is_descendant(cgroup, current))
81 return ERR_PTR(-EPERM); 75 return ERR_PTR(-EPERM);
82 76
83 ns_cgroup = kzalloc(sizeof(*ns_cgroup), GFP_KERNEL); 77 ns_cgroup = kzalloc(sizeof(*ns_cgroup), GFP_KERNEL);
diff --git a/kernel/panic.c b/kernel/panic.c
index 32fe4eff1b89..3fd8c5bf8b39 100644
--- a/kernel/panic.c
+++ b/kernel/panic.c
@@ -8,19 +8,19 @@
8 * This function is used through-out the kernel (including mm and fs) 8 * This function is used through-out the kernel (including mm and fs)
9 * to indicate a major problem. 9 * to indicate a major problem.
10 */ 10 */
11#include <linux/debug_locks.h>
12#include <linux/interrupt.h>
13#include <linux/kallsyms.h>
14#include <linux/notifier.h>
11#include <linux/module.h> 15#include <linux/module.h>
12#include <linux/sched.h> 16#include <linux/random.h>
13#include <linux/delay.h>
14#include <linux/reboot.h> 17#include <linux/reboot.h>
15#include <linux/notifier.h> 18#include <linux/delay.h>
16#include <linux/init.h> 19#include <linux/kexec.h>
20#include <linux/sched.h>
17#include <linux/sysrq.h> 21#include <linux/sysrq.h>
18#include <linux/interrupt.h> 22#include <linux/init.h>
19#include <linux/nmi.h> 23#include <linux/nmi.h>
20#include <linux/kexec.h>
21#include <linux/debug_locks.h>
22#include <linux/random.h>
23#include <linux/kallsyms.h>
24#include <linux/dmi.h> 24#include <linux/dmi.h>
25 25
26int panic_on_oops; 26int panic_on_oops;
@@ -52,19 +52,15 @@ EXPORT_SYMBOL(panic_blink);
52 * 52 *
53 * This function never returns. 53 * This function never returns.
54 */ 54 */
55
56NORET_TYPE void panic(const char * fmt, ...) 55NORET_TYPE void panic(const char * fmt, ...)
57{ 56{
58 long i;
59 static char buf[1024]; 57 static char buf[1024];
60 va_list args; 58 va_list args;
61#if defined(CONFIG_S390) 59 long i;
62 unsigned long caller = (unsigned long) __builtin_return_address(0);
63#endif
64 60
65 /* 61 /*
66 * It's possible to come here directly from a panic-assertion and not 62 * It's possible to come here directly from a panic-assertion and
67 * have preempt disabled. Some functions called from here want 63 * not have preempt disabled. Some functions called from here want
68 * preempt to be disabled. No point enabling it later though... 64 * preempt to be disabled. No point enabling it later though...
69 */ 65 */
70 preempt_disable(); 66 preempt_disable();
@@ -77,7 +73,6 @@ NORET_TYPE void panic(const char * fmt, ...)
77#ifdef CONFIG_DEBUG_BUGVERBOSE 73#ifdef CONFIG_DEBUG_BUGVERBOSE
78 dump_stack(); 74 dump_stack();
79#endif 75#endif
80 bust_spinlocks(0);
81 76
82 /* 77 /*
83 * If we have crashed and we have a crash kernel loaded let it handle 78 * If we have crashed and we have a crash kernel loaded let it handle
@@ -86,14 +81,12 @@ NORET_TYPE void panic(const char * fmt, ...)
86 */ 81 */
87 crash_kexec(NULL); 82 crash_kexec(NULL);
88 83
89#ifdef CONFIG_SMP
90 /* 84 /*
91 * Note smp_send_stop is the usual smp shutdown function, which 85 * Note smp_send_stop is the usual smp shutdown function, which
92 * unfortunately means it may not be hardened to work in a panic 86 * unfortunately means it may not be hardened to work in a panic
93 * situation. 87 * situation.
94 */ 88 */
95 smp_send_stop(); 89 smp_send_stop();
96#endif
97 90
98 atomic_notifier_call_chain(&panic_notifier_list, 0, buf); 91 atomic_notifier_call_chain(&panic_notifier_list, 0, buf);
99 92
@@ -102,19 +95,21 @@ NORET_TYPE void panic(const char * fmt, ...)
102 95
103 if (panic_timeout > 0) { 96 if (panic_timeout > 0) {
104 /* 97 /*
105 * Delay timeout seconds before rebooting the machine. 98 * Delay timeout seconds before rebooting the machine.
106 * We can't use the "normal" timers since we just panicked.. 99 * We can't use the "normal" timers since we just panicked.
107 */ 100 */
108 printk(KERN_EMERG "Rebooting in %d seconds..",panic_timeout); 101 printk(KERN_EMERG "Rebooting in %d seconds..", panic_timeout);
102
109 for (i = 0; i < panic_timeout*1000; ) { 103 for (i = 0; i < panic_timeout*1000; ) {
110 touch_nmi_watchdog(); 104 touch_nmi_watchdog();
111 i += panic_blink(i); 105 i += panic_blink(i);
112 mdelay(1); 106 mdelay(1);
113 i++; 107 i++;
114 } 108 }
115 /* This will not be a clean reboot, with everything 109 /*
116 * shutting down. But if there is a chance of 110 * This will not be a clean reboot, with everything
117 * rebooting the system it will be rebooted. 111 * shutting down. But if there is a chance of
112 * rebooting the system it will be rebooted.
118 */ 113 */
119 emergency_restart(); 114 emergency_restart();
120 } 115 }
@@ -127,38 +122,44 @@ NORET_TYPE void panic(const char * fmt, ...)
127 } 122 }
128#endif 123#endif
129#if defined(CONFIG_S390) 124#if defined(CONFIG_S390)
130 disabled_wait(caller); 125 {
126 unsigned long caller;
127
128 caller = (unsigned long)__builtin_return_address(0);
129 disabled_wait(caller);
130 }
131#endif 131#endif
132 local_irq_enable(); 132 local_irq_enable();
133 for (i = 0;;) { 133 for (i = 0; ; ) {
134 touch_softlockup_watchdog(); 134 touch_softlockup_watchdog();
135 i += panic_blink(i); 135 i += panic_blink(i);
136 mdelay(1); 136 mdelay(1);
137 i++; 137 i++;
138 } 138 }
139 bust_spinlocks(0);
139} 140}
140 141
141EXPORT_SYMBOL(panic); 142EXPORT_SYMBOL(panic);
142 143
143 144
144struct tnt { 145struct tnt {
145 u8 bit; 146 u8 bit;
146 char true; 147 char true;
147 char false; 148 char false;
148}; 149};
149 150
150static const struct tnt tnts[] = { 151static const struct tnt tnts[] = {
151 { TAINT_PROPRIETARY_MODULE, 'P', 'G' }, 152 { TAINT_PROPRIETARY_MODULE, 'P', 'G' },
152 { TAINT_FORCED_MODULE, 'F', ' ' }, 153 { TAINT_FORCED_MODULE, 'F', ' ' },
153 { TAINT_UNSAFE_SMP, 'S', ' ' }, 154 { TAINT_UNSAFE_SMP, 'S', ' ' },
154 { TAINT_FORCED_RMMOD, 'R', ' ' }, 155 { TAINT_FORCED_RMMOD, 'R', ' ' },
155 { TAINT_MACHINE_CHECK, 'M', ' ' }, 156 { TAINT_MACHINE_CHECK, 'M', ' ' },
156 { TAINT_BAD_PAGE, 'B', ' ' }, 157 { TAINT_BAD_PAGE, 'B', ' ' },
157 { TAINT_USER, 'U', ' ' }, 158 { TAINT_USER, 'U', ' ' },
158 { TAINT_DIE, 'D', ' ' }, 159 { TAINT_DIE, 'D', ' ' },
159 { TAINT_OVERRIDDEN_ACPI_TABLE, 'A', ' ' }, 160 { TAINT_OVERRIDDEN_ACPI_TABLE, 'A', ' ' },
160 { TAINT_WARN, 'W', ' ' }, 161 { TAINT_WARN, 'W', ' ' },
161 { TAINT_CRAP, 'C', ' ' }, 162 { TAINT_CRAP, 'C', ' ' },
162}; 163};
163 164
164/** 165/**
@@ -195,7 +196,8 @@ const char *print_tainted(void)
195 *s = 0; 196 *s = 0;
196 } else 197 } else
197 snprintf(buf, sizeof(buf), "Not tainted"); 198 snprintf(buf, sizeof(buf), "Not tainted");
198 return(buf); 199
200 return buf;
199} 201}
200 202
201int test_taint(unsigned flag) 203int test_taint(unsigned flag)
@@ -211,7 +213,8 @@ unsigned long get_taint(void)
211 213
212void add_taint(unsigned flag) 214void add_taint(unsigned flag)
213{ 215{
214 debug_locks = 0; /* can't trust the integrity of the kernel anymore */ 216 /* can't trust the integrity of the kernel anymore: */
217 debug_locks = 0;
215 set_bit(flag, &tainted_mask); 218 set_bit(flag, &tainted_mask);
216} 219}
217EXPORT_SYMBOL(add_taint); 220EXPORT_SYMBOL(add_taint);
@@ -266,8 +269,8 @@ static void do_oops_enter_exit(void)
266} 269}
267 270
268/* 271/*
269 * Return true if the calling CPU is allowed to print oops-related info. This 272 * Return true if the calling CPU is allowed to print oops-related info.
270 * is a bit racy.. 273 * This is a bit racy..
271 */ 274 */
272int oops_may_print(void) 275int oops_may_print(void)
273{ 276{
@@ -276,20 +279,22 @@ int oops_may_print(void)
276 279
277/* 280/*
278 * Called when the architecture enters its oops handler, before it prints 281 * Called when the architecture enters its oops handler, before it prints
279 * anything. If this is the first CPU to oops, and it's oopsing the first time 282 * anything. If this is the first CPU to oops, and it's oopsing the first
280 * then let it proceed. 283 * time then let it proceed.
281 * 284 *
282 * This is all enabled by the pause_on_oops kernel boot option. We do all this 285 * This is all enabled by the pause_on_oops kernel boot option. We do all
283 * to ensure that oopses don't scroll off the screen. It has the side-effect 286 * this to ensure that oopses don't scroll off the screen. It has the
284 * of preventing later-oopsing CPUs from mucking up the display, too. 287 * side-effect of preventing later-oopsing CPUs from mucking up the display,
288 * too.
285 * 289 *
286 * It turns out that the CPU which is allowed to print ends up pausing for the 290 * It turns out that the CPU which is allowed to print ends up pausing for
287 * right duration, whereas all the other CPUs pause for twice as long: once in 291 * the right duration, whereas all the other CPUs pause for twice as long:
288 * oops_enter(), once in oops_exit(). 292 * once in oops_enter(), once in oops_exit().
289 */ 293 */
290void oops_enter(void) 294void oops_enter(void)
291{ 295{
292 debug_locks_off(); /* can't trust the integrity of the kernel anymore */ 296 /* can't trust the integrity of the kernel anymore: */
297 debug_locks_off();
293 do_oops_enter_exit(); 298 do_oops_enter_exit();
294} 299}
295 300
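
The panic.c hunk above keeps the tnts[] table that maps each taint bit to a one-letter flag, which print_tainted() renders into the familiar "Tainted: ..." string. The following is a minimal userspace sketch of that bit-to-letter rendering, using hypothetical T_* bit names rather than the kernel's TAINT_* constants; it is an illustration of the table technique, not the kernel function.

#include <stdio.h>

/* Hypothetical stand-ins for the kernel's TAINT_* bit numbers. */
enum { T_PROPRIETARY = 0, T_FORCED = 1, T_SMP = 2, T_DIE = 7 };

struct tnt { unsigned bit; char set; char unset; };

static const struct tnt tnts[] = {
	{ T_PROPRIETARY, 'P', 'G' },
	{ T_FORCED,      'F', ' ' },
	{ T_SMP,         'S', ' ' },
	{ T_DIE,         'D', ' ' },
};

/* Render a taint mask into a string, one character per table entry. */
static void render_taint(unsigned long mask, char *buf)
{
	size_t i;

	for (i = 0; i < sizeof(tnts) / sizeof(tnts[0]); i++)
		*buf++ = (mask & (1UL << tnts[i].bit)) ? tnts[i].set
						       : tnts[i].unset;
	*buf = '\0';
}

int main(void)
{
	char buf[sizeof(tnts) / sizeof(tnts[0]) + 1];

	render_taint((1UL << T_PROPRIETARY) | (1UL << T_DIE), buf);
	printf("Tainted: %s\n", buf);	/* prints "Tainted: P  D" */
	return 0;
}
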
diff --git a/kernel/params.c b/kernel/params.c
index a1e3025b19a9..de273ec85bd2 100644
--- a/kernel/params.c
+++ b/kernel/params.c
@@ -24,6 +24,9 @@
24#include <linux/err.h> 24#include <linux/err.h>
25#include <linux/slab.h> 25#include <linux/slab.h>
26 26
27/* We abuse the high bits of "perm" to record whether we kmalloc'ed. */
28#define KPARAM_KMALLOCED 0x80000000
29
27#if 0 30#if 0
28#define DEBUGP printk 31#define DEBUGP printk
29#else 32#else
@@ -217,7 +220,19 @@ int param_set_charp(const char *val, struct kernel_param *kp)
217 return -ENOSPC; 220 return -ENOSPC;
218 } 221 }
219 222
220 *(char **)kp->arg = (char *)val; 223 if (kp->perm & KPARAM_KMALLOCED)
224 kfree(*(char **)kp->arg);
225
 226 /* This is a hack. We don't need to strdup in early boot, and we
 227 * must not; this mangled commandline is preserved. */
228 if (slab_is_available()) {
229 kp->perm |= KPARAM_KMALLOCED;
230 *(char **)kp->arg = kstrdup(val, GFP_KERNEL);
231 if (!kp->arg)
232 return -ENOMEM;
233 } else
234 *(const char **)kp->arg = val;
235
221 return 0; 236 return 0;
222} 237}
223 238
@@ -571,6 +586,15 @@ void module_param_sysfs_remove(struct module *mod)
571} 586}
572#endif 587#endif
573 588
589void destroy_params(const struct kernel_param *params, unsigned num)
590{
591 unsigned int i;
592
593 for (i = 0; i < num; i++)
594 if (params[i].perm & KPARAM_KMALLOCED)
595 kfree(*(char **)params[i].arg);
596}
597
574static void __init kernel_add_sysfs_param(const char *name, 598static void __init kernel_add_sysfs_param(const char *name,
575 struct kernel_param *kparam, 599 struct kernel_param *kparam,
576 unsigned int name_skip) 600 unsigned int name_skip)
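
The param_set_charp()/destroy_params() hunks above stash an ownership flag in the high bit of perm so that only strings the code kstrdup()'ed are later kfree()'d. Below is a small userspace analogue of that bookkeeping, with hypothetical names (struct param, PARAM_ALLOCED) and libc strdup()/free() standing in for the kernel allocator; it shows the flag-in-perm pattern only.

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#define PARAM_ALLOCED 0x80000000u	/* abuse a high perm bit as "we own it" */

struct param {
	unsigned int perm;
	char *value;
};

/* Point the parameter at val; duplicate it when the allocator is usable. */
static int param_set(struct param *p, const char *val, int heap_ok)
{
	if (p->perm & PARAM_ALLOCED)
		free(p->value);

	if (heap_ok) {
		p->value = strdup(val);
		if (!p->value)
			return -1;
		p->perm |= PARAM_ALLOCED;
	} else {
		p->value = (char *)val;	/* early "boot": keep the original */
		p->perm &= ~PARAM_ALLOCED;
	}
	return 0;
}

/* Free only the strings we duplicated ourselves. */
static void destroy_params(struct param *params, unsigned int num)
{
	unsigned int i;

	for (i = 0; i < num; i++)
		if (params[i].perm & PARAM_ALLOCED)
			free(params[i].value);
}

int main(void)
{
	struct param p = { 0644, NULL };

	param_set(&p, "first", 1);
	param_set(&p, "second", 1);	/* frees "first" before replacing it */
	printf("%s\n", p.value);
	destroy_params(&p, 1);
	return 0;
}
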
diff --git a/kernel/pid.c b/kernel/pid.c
index 1b3586fe753a..b2e5f78fd281 100644
--- a/kernel/pid.c
+++ b/kernel/pid.c
@@ -403,6 +403,8 @@ struct pid *get_task_pid(struct task_struct *task, enum pid_type type)
403{ 403{
404 struct pid *pid; 404 struct pid *pid;
405 rcu_read_lock(); 405 rcu_read_lock();
406 if (type != PIDTYPE_PID)
407 task = task->group_leader;
406 pid = get_pid(task->pids[type].pid); 408 pid = get_pid(task->pids[type].pid);
407 rcu_read_unlock(); 409 rcu_read_unlock();
408 return pid; 410 return pid;
@@ -450,11 +452,24 @@ pid_t pid_vnr(struct pid *pid)
450} 452}
451EXPORT_SYMBOL_GPL(pid_vnr); 453EXPORT_SYMBOL_GPL(pid_vnr);
452 454
453pid_t task_pid_nr_ns(struct task_struct *tsk, struct pid_namespace *ns) 455pid_t __task_pid_nr_ns(struct task_struct *task, enum pid_type type,
456 struct pid_namespace *ns)
454{ 457{
455 return pid_nr_ns(task_pid(tsk), ns); 458 pid_t nr = 0;
459
460 rcu_read_lock();
461 if (!ns)
462 ns = current->nsproxy->pid_ns;
463 if (likely(pid_alive(task))) {
464 if (type != PIDTYPE_PID)
465 task = task->group_leader;
466 nr = pid_nr_ns(task->pids[type].pid, ns);
467 }
468 rcu_read_unlock();
469
470 return nr;
456} 471}
457EXPORT_SYMBOL(task_pid_nr_ns); 472EXPORT_SYMBOL(__task_pid_nr_ns);
458 473
459pid_t task_tgid_nr_ns(struct task_struct *tsk, struct pid_namespace *ns) 474pid_t task_tgid_nr_ns(struct task_struct *tsk, struct pid_namespace *ns)
460{ 475{
@@ -462,18 +477,6 @@ pid_t task_tgid_nr_ns(struct task_struct *tsk, struct pid_namespace *ns)
462} 477}
463EXPORT_SYMBOL(task_tgid_nr_ns); 478EXPORT_SYMBOL(task_tgid_nr_ns);
464 479
465pid_t task_pgrp_nr_ns(struct task_struct *tsk, struct pid_namespace *ns)
466{
467 return pid_nr_ns(task_pgrp(tsk), ns);
468}
469EXPORT_SYMBOL(task_pgrp_nr_ns);
470
471pid_t task_session_nr_ns(struct task_struct *tsk, struct pid_namespace *ns)
472{
473 return pid_nr_ns(task_session(tsk), ns);
474}
475EXPORT_SYMBOL(task_session_nr_ns);
476
477struct pid_namespace *task_active_pid_ns(struct task_struct *tsk) 480struct pid_namespace *task_active_pid_ns(struct task_struct *tsk)
478{ 481{
479 return ns_of_pid(task_pid(tsk)); 482 return ns_of_pid(task_pid(tsk));
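
The pid.c hunk folds task_pgrp_nr_ns() and task_session_nr_ns() into a single __task_pid_nr_ns() that redirects any non-PIDTYPE_PID lookup to the thread-group leader. The toy userspace model below (hypothetical struct task, no RCU, no namespaces) shows just that redirection: group-wide IDs come from the leader, only the pid is per-thread.

#include <stdio.h>

enum pid_type { PIDTYPE_PID, PIDTYPE_PGID, PIDTYPE_SID, PIDTYPE_MAX };

struct task {
	int pids[PIDTYPE_MAX];		/* per-type numbers for this task */
	struct task *group_leader;	/* thread-group leader */
};

/* One helper replaces the per-type wrappers: pgid and sid always come
 * from the group leader, only PIDTYPE_PID is per-thread. */
static int task_nr(struct task *t, enum pid_type type)
{
	if (type != PIDTYPE_PID)
		t = t->group_leader;
	return t->pids[type];
}

int main(void)
{
	struct task leader = { { 100, 50, 10 }, &leader };
	struct task thread = { { 101,  0,  0 }, &leader };

	printf("tid=%d pgid=%d sid=%d\n",
	       task_nr(&thread, PIDTYPE_PID),
	       task_nr(&thread, PIDTYPE_PGID),
	       task_nr(&thread, PIDTYPE_SID));
	return 0;
}
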
diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c
index fab8ea86fac3..2d1001b4858d 100644
--- a/kernel/pid_namespace.c
+++ b/kernel/pid_namespace.c
@@ -152,6 +152,7 @@ void zap_pid_ns_processes(struct pid_namespace *pid_ns)
152{ 152{
153 int nr; 153 int nr;
154 int rc; 154 int rc;
155 struct task_struct *task;
155 156
156 /* 157 /*
157 * The last thread in the cgroup-init thread group is terminating. 158 * The last thread in the cgroup-init thread group is terminating.
@@ -169,7 +170,19 @@ void zap_pid_ns_processes(struct pid_namespace *pid_ns)
169 read_lock(&tasklist_lock); 170 read_lock(&tasklist_lock);
170 nr = next_pidmap(pid_ns, 1); 171 nr = next_pidmap(pid_ns, 1);
171 while (nr > 0) { 172 while (nr > 0) {
172 kill_proc_info(SIGKILL, SEND_SIG_PRIV, nr); 173 rcu_read_lock();
174
175 /*
176 * Use force_sig() since it clears SIGNAL_UNKILLABLE ensuring
177 * any nested-container's init processes don't ignore the
178 * signal
179 */
180 task = pid_task(find_vpid(nr), PIDTYPE_PID);
181 if (task)
182 force_sig(SIGKILL, task);
183
184 rcu_read_unlock();
185
173 nr = next_pidmap(pid_ns, nr); 186 nr = next_pidmap(pid_ns, nr);
174 } 187 }
175 read_unlock(&tasklist_lock); 188 read_unlock(&tasklist_lock);
diff --git a/kernel/power/disk.c b/kernel/power/disk.c
index e886d1332a10..5f21ab2bbcdf 100644
--- a/kernel/power/disk.c
+++ b/kernel/power/disk.c
@@ -22,6 +22,7 @@
22#include <linux/console.h> 22#include <linux/console.h>
23#include <linux/cpu.h> 23#include <linux/cpu.h>
24#include <linux/freezer.h> 24#include <linux/freezer.h>
25#include <asm/suspend.h>
25 26
26#include "power.h" 27#include "power.h"
27 28
@@ -288,7 +289,7 @@ static int create_image(int platform_mode)
288 * hibernation_snapshot - quiesce devices and create the hibernation 289 * hibernation_snapshot - quiesce devices and create the hibernation
289 * snapshot image. 290 * snapshot image.
290 * @platform_mode - if set, use the platform driver, if available, to 291 * @platform_mode - if set, use the platform driver, if available, to
291 * prepare the platform frimware for the power transition. 292 * prepare the platform firmware for the power transition.
292 * 293 *
293 * Must be called with pm_mutex held 294 * Must be called with pm_mutex held
294 */ 295 */
@@ -411,7 +412,7 @@ static int resume_target_kernel(bool platform_mode)
411 * hibernation_restore - quiesce devices and restore the hibernation 412 * hibernation_restore - quiesce devices and restore the hibernation
412 * snapshot image. If successful, control returns in hibernation_snaphot() 413 * snapshot image. If successful, control returns in hibernation_snaphot()
413 * @platform_mode - if set, use the platform driver, if available, to 414 * @platform_mode - if set, use the platform driver, if available, to
414 * prepare the platform frimware for the transition. 415 * prepare the platform firmware for the transition.
415 * 416 *
416 * Must be called with pm_mutex held 417 * Must be called with pm_mutex held
417 */ 418 */
diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c
index f5fc2d7680f2..33e2e4a819f9 100644
--- a/kernel/power/snapshot.c
+++ b/kernel/power/snapshot.c
@@ -321,13 +321,10 @@ static int create_mem_extents(struct list_head *list, gfp_t gfp_mask)
321 321
322 INIT_LIST_HEAD(list); 322 INIT_LIST_HEAD(list);
323 323
324 for_each_zone(zone) { 324 for_each_populated_zone(zone) {
325 unsigned long zone_start, zone_end; 325 unsigned long zone_start, zone_end;
326 struct mem_extent *ext, *cur, *aux; 326 struct mem_extent *ext, *cur, *aux;
327 327
328 if (!populated_zone(zone))
329 continue;
330
331 zone_start = zone->zone_start_pfn; 328 zone_start = zone->zone_start_pfn;
332 zone_end = zone->zone_start_pfn + zone->spanned_pages; 329 zone_end = zone->zone_start_pfn + zone->spanned_pages;
333 330
@@ -804,8 +801,8 @@ static unsigned int count_free_highmem_pages(void)
804 struct zone *zone; 801 struct zone *zone;
805 unsigned int cnt = 0; 802 unsigned int cnt = 0;
806 803
807 for_each_zone(zone) 804 for_each_populated_zone(zone)
808 if (populated_zone(zone) && is_highmem(zone)) 805 if (is_highmem(zone))
809 cnt += zone_page_state(zone, NR_FREE_PAGES); 806 cnt += zone_page_state(zone, NR_FREE_PAGES);
810 807
811 return cnt; 808 return cnt;
diff --git a/kernel/power/swsusp.c b/kernel/power/swsusp.c
index a92c91451559..78c35047586d 100644
--- a/kernel/power/swsusp.c
+++ b/kernel/power/swsusp.c
@@ -51,6 +51,7 @@
51#include <linux/highmem.h> 51#include <linux/highmem.h>
52#include <linux/time.h> 52#include <linux/time.h>
53#include <linux/rbtree.h> 53#include <linux/rbtree.h>
54#include <linux/io.h>
54 55
55#include "power.h" 56#include "power.h"
56 57
@@ -229,17 +230,16 @@ int swsusp_shrink_memory(void)
229 size = count_data_pages() + PAGES_FOR_IO + SPARE_PAGES; 230 size = count_data_pages() + PAGES_FOR_IO + SPARE_PAGES;
230 tmp = size; 231 tmp = size;
231 size += highmem_size; 232 size += highmem_size;
232 for_each_zone (zone) 233 for_each_populated_zone(zone) {
233 if (populated_zone(zone)) { 234 tmp += snapshot_additional_pages(zone);
234 tmp += snapshot_additional_pages(zone); 235 if (is_highmem(zone)) {
235 if (is_highmem(zone)) { 236 highmem_size -=
236 highmem_size -=
237 zone_page_state(zone, NR_FREE_PAGES); 237 zone_page_state(zone, NR_FREE_PAGES);
238 } else { 238 } else {
239 tmp -= zone_page_state(zone, NR_FREE_PAGES); 239 tmp -= zone_page_state(zone, NR_FREE_PAGES);
240 tmp += zone->lowmem_reserve[ZONE_NORMAL]; 240 tmp += zone->lowmem_reserve[ZONE_NORMAL];
241 }
242 } 241 }
242 }
243 243
244 if (highmem_size < 0) 244 if (highmem_size < 0)
245 highmem_size = 0; 245 highmem_size = 0;
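
The power/snapshot.c and power/swsusp.c hunks replace the "for_each_zone() plus populated_zone() check" pattern with for_each_populated_zone(), folding the filter into the iterator. A self-contained userspace sketch of that macro trick is below; the zone array and field names are hypothetical, but the "if (!populated) ; else" construction mirrors how such a filtering iterator avoids dangling-else problems.

#include <stdio.h>

struct zone { const char *name; unsigned long present_pages; };

static struct zone zones[] = {
	{ "DMA",     100 },
	{ "Normal",  900 },
	{ "HighMem",   0 },	/* empty zone gets skipped */
};
#define NR_ZONES (sizeof(zones) / sizeof(zones[0]))

/* Iterate all zones (analogue of for_each_zone()). */
#define for_each_zone(z) \
	for ((z) = zones; (z) < zones + NR_ZONES; (z)++)

/* Same loop, but the "is it populated?" filter is folded into the macro. */
#define for_each_populated_zone(z) \
	for_each_zone(z) \
		if (!(z)->present_pages) ; else

int main(void)
{
	struct zone *z;
	unsigned long total = 0;

	for_each_populated_zone(z)
		total += z->present_pages;

	printf("populated pages: %lu\n", total);	/* 1000 */
	return 0;
}
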
diff --git a/kernel/printk.c b/kernel/printk.c
index e3602d0755b0..5052b5497c67 100644
--- a/kernel/printk.c
+++ b/kernel/printk.c
@@ -32,6 +32,7 @@
32#include <linux/security.h> 32#include <linux/security.h>
33#include <linux/bootmem.h> 33#include <linux/bootmem.h>
34#include <linux/syscalls.h> 34#include <linux/syscalls.h>
35#include <linux/kexec.h>
35 36
36#include <asm/uaccess.h> 37#include <asm/uaccess.h>
37 38
@@ -135,6 +136,24 @@ static char *log_buf = __log_buf;
135static int log_buf_len = __LOG_BUF_LEN; 136static int log_buf_len = __LOG_BUF_LEN;
136static unsigned logged_chars; /* Number of chars produced since last read+clear operation */ 137static unsigned logged_chars; /* Number of chars produced since last read+clear operation */
137 138
139#ifdef CONFIG_KEXEC
140/*
141 * This appends the listed symbols to /proc/vmcoreinfo
142 *
 143 * /proc/vmcoreinfo is used by various utilities, like crash and makedumpfile, to
 144 * obtain access to symbols that are otherwise very difficult to locate. These
145 * symbols are specifically used so that utilities can access and extract the
146 * dmesg log from a vmcore file after a crash.
147 */
148void log_buf_kexec_setup(void)
149{
150 VMCOREINFO_SYMBOL(log_buf);
151 VMCOREINFO_SYMBOL(log_end);
152 VMCOREINFO_SYMBOL(log_buf_len);
153 VMCOREINFO_SYMBOL(logged_chars);
154}
155#endif
156
138static int __init log_buf_len_setup(char *str) 157static int __init log_buf_len_setup(char *str)
139{ 158{
140 unsigned size = memparse(str, &str); 159 unsigned size = memparse(str, &str);
@@ -1292,8 +1311,11 @@ EXPORT_SYMBOL(printk_ratelimit);
1292bool printk_timed_ratelimit(unsigned long *caller_jiffies, 1311bool printk_timed_ratelimit(unsigned long *caller_jiffies,
1293 unsigned int interval_msecs) 1312 unsigned int interval_msecs)
1294{ 1313{
1295 if (*caller_jiffies == 0 || time_after(jiffies, *caller_jiffies)) { 1314 if (*caller_jiffies == 0
1296 *caller_jiffies = jiffies + msecs_to_jiffies(interval_msecs); 1315 || !time_in_range(jiffies, *caller_jiffies,
1316 *caller_jiffies
1317 + msecs_to_jiffies(interval_msecs))) {
1318 *caller_jiffies = jiffies;
1297 return true; 1319 return true;
1298 } 1320 }
1299 return false; 1321 return false;
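
The printk_timed_ratelimit() change above prints only when the current time falls outside the quiet window that started at the last print, and it records the time of the print itself rather than a deadline. The sketch below is a userspace analogue using a monotonic millisecond clock in place of jiffies and an explicit range check in place of time_in_range(); names are hypothetical.

#include <stdio.h>
#include <time.h>

/* Monotonic time in milliseconds, standing in for jiffies. */
static unsigned long now_ms(void)
{
	struct timespec ts;

	clock_gettime(CLOCK_MONOTONIC, &ts);
	return (unsigned long)ts.tv_sec * 1000 + ts.tv_nsec / 1000000;
}

/*
 * Allow one message per interval: print only when the current time is NOT
 * inside [last, last + interval]. A zero *last means "never printed",
 * which also falls outside the window.
 */
static int timed_ratelimit(unsigned long *last, unsigned long interval_ms)
{
	unsigned long now = now_ms();

	if (*last == 0 || now < *last || now > *last + interval_ms) {
		*last = now;		/* record the time of this print */
		return 1;
	}
	return 0;
}

int main(void)
{
	unsigned long last = 0;
	int i, printed = 0;

	for (i = 0; i < 5; i++)
		if (timed_ratelimit(&last, 1000))
			printed++;

	printf("printed %d of 5 back-to-back messages\n", printed); /* 1 */
	return 0;
}
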
diff --git a/kernel/ptrace.c b/kernel/ptrace.c
index c9cf48b21f05..aaad0ec34194 100644
--- a/kernel/ptrace.c
+++ b/kernel/ptrace.c
@@ -60,11 +60,15 @@ static void ptrace_untrace(struct task_struct *child)
60{ 60{
61 spin_lock(&child->sighand->siglock); 61 spin_lock(&child->sighand->siglock);
62 if (task_is_traced(child)) { 62 if (task_is_traced(child)) {
63 if (child->signal->flags & SIGNAL_STOP_STOPPED) { 63 /*
64 * If the group stop is completed or in progress,
65 * this thread was already counted as stopped.
66 */
67 if (child->signal->flags & SIGNAL_STOP_STOPPED ||
68 child->signal->group_stop_count)
64 __set_task_state(child, TASK_STOPPED); 69 __set_task_state(child, TASK_STOPPED);
65 } else { 70 else
66 signal_wake_up(child, 1); 71 signal_wake_up(child, 1);
67 }
68 } 72 }
69 spin_unlock(&child->sighand->siglock); 73 spin_unlock(&child->sighand->siglock);
70} 74}
@@ -235,18 +239,58 @@ out:
235 return retval; 239 return retval;
236} 240}
237 241
238static inline void __ptrace_detach(struct task_struct *child, unsigned int data) 242/*
 243 * Called with irqs disabled, returns true if children should reap themselves.
244 */
245static int ignoring_children(struct sighand_struct *sigh)
239{ 246{
240 child->exit_code = data; 247 int ret;
241 /* .. re-parent .. */ 248 spin_lock(&sigh->siglock);
242 __ptrace_unlink(child); 249 ret = (sigh->action[SIGCHLD-1].sa.sa_handler == SIG_IGN) ||
243 /* .. and wake it up. */ 250 (sigh->action[SIGCHLD-1].sa.sa_flags & SA_NOCLDWAIT);
244 if (child->exit_state != EXIT_ZOMBIE) 251 spin_unlock(&sigh->siglock);
245 wake_up_process(child); 252 return ret;
253}
254
255/*
256 * Called with tasklist_lock held for writing.
257 * Unlink a traced task, and clean it up if it was a traced zombie.
258 * Return true if it needs to be reaped with release_task().
259 * (We can't call release_task() here because we already hold tasklist_lock.)
260 *
261 * If it's a zombie, our attachedness prevented normal parent notification
262 * or self-reaping. Do notification now if it would have happened earlier.
263 * If it should reap itself, return true.
264 *
265 * If it's our own child, there is no notification to do.
266 * But if our normal children self-reap, then this child
267 * was prevented by ptrace and we must reap it now.
268 */
269static bool __ptrace_detach(struct task_struct *tracer, struct task_struct *p)
270{
271 __ptrace_unlink(p);
272
273 if (p->exit_state == EXIT_ZOMBIE) {
274 if (!task_detached(p) && thread_group_empty(p)) {
275 if (!same_thread_group(p->real_parent, tracer))
276 do_notify_parent(p, p->exit_signal);
277 else if (ignoring_children(tracer->sighand))
278 p->exit_signal = -1;
279 }
280 if (task_detached(p)) {
281 /* Mark it as in the process of being reaped. */
282 p->exit_state = EXIT_DEAD;
283 return true;
284 }
285 }
286
287 return false;
246} 288}
247 289
248int ptrace_detach(struct task_struct *child, unsigned int data) 290int ptrace_detach(struct task_struct *child, unsigned int data)
249{ 291{
292 bool dead = false;
293
250 if (!valid_signal(data)) 294 if (!valid_signal(data))
251 return -EIO; 295 return -EIO;
252 296
@@ -255,14 +299,45 @@ int ptrace_detach(struct task_struct *child, unsigned int data)
255 clear_tsk_thread_flag(child, TIF_SYSCALL_TRACE); 299 clear_tsk_thread_flag(child, TIF_SYSCALL_TRACE);
256 300
257 write_lock_irq(&tasklist_lock); 301 write_lock_irq(&tasklist_lock);
258 /* protect against de_thread()->release_task() */ 302 /*
259 if (child->ptrace) 303 * This child can be already killed. Make sure de_thread() or
260 __ptrace_detach(child, data); 304 * our sub-thread doing do_wait() didn't do release_task() yet.
305 */
306 if (child->ptrace) {
307 child->exit_code = data;
308 dead = __ptrace_detach(current, child);
309 }
261 write_unlock_irq(&tasklist_lock); 310 write_unlock_irq(&tasklist_lock);
262 311
312 if (unlikely(dead))
313 release_task(child);
314
263 return 0; 315 return 0;
264} 316}
265 317
318/*
319 * Detach all tasks we were using ptrace on.
320 */
321void exit_ptrace(struct task_struct *tracer)
322{
323 struct task_struct *p, *n;
324 LIST_HEAD(ptrace_dead);
325
326 write_lock_irq(&tasklist_lock);
327 list_for_each_entry_safe(p, n, &tracer->ptraced, ptrace_entry) {
328 if (__ptrace_detach(tracer, p))
329 list_add(&p->ptrace_entry, &ptrace_dead);
330 }
331 write_unlock_irq(&tasklist_lock);
332
333 BUG_ON(!list_empty(&tracer->ptraced));
334
335 list_for_each_entry_safe(p, n, &ptrace_dead, ptrace_entry) {
336 list_del_init(&p->ptrace_entry);
337 release_task(p);
338 }
339}
340
266int ptrace_readdata(struct task_struct *tsk, unsigned long src, char __user *dst, int len) 341int ptrace_readdata(struct task_struct *tsk, unsigned long src, char __user *dst, int len)
267{ 342{
268 int copied = 0; 343 int copied = 0;
@@ -612,8 +687,6 @@ SYSCALL_DEFINE4(ptrace, long, request, long, pid, long, addr, long, data)
612 goto out_put_task_struct; 687 goto out_put_task_struct;
613 688
614 ret = arch_ptrace(child, request, addr, data); 689 ret = arch_ptrace(child, request, addr, data);
615 if (ret < 0)
616 goto out_put_task_struct;
617 690
618 out_put_task_struct: 691 out_put_task_struct:
619 put_task_struct(child); 692 put_task_struct(child);
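
exit_ptrace() above collects tasks that need release_task() onto a private list while tasklist_lock is held and only reaps them after the lock is dropped, because release_task() cannot run under that lock. The userspace sketch below shows the same "collect under the lock, clean up after" pattern with a pthread mutex and free() standing in for tasklist_lock and release_task(); the list and names are invented for illustration.

#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>

struct node {
	int id;
	struct node *next;
};

static pthread_mutex_t list_lock = PTHREAD_MUTEX_INITIALIZER;
static struct node *tracked;		/* list protected by list_lock */

/* Detach every tracked node; heavy cleanup happens outside the lock. */
static void detach_all(void)
{
	struct node *dead = NULL, *n, *next;

	pthread_mutex_lock(&list_lock);
	for (n = tracked; n; n = next) {
		next = n->next;
		n->next = dead;		/* move onto a private "to reap" list */
		dead = n;
	}
	tracked = NULL;
	pthread_mutex_unlock(&list_lock);

	/* Now it is safe to do the expensive part without holding the lock. */
	for (n = dead; n; n = next) {
		next = n->next;
		printf("reaping %d\n", n->id);
		free(n);
	}
}

int main(void)
{
	int i;

	for (i = 0; i < 3; i++) {
		struct node *n = malloc(sizeof(*n));

		n->id = i;
		n->next = tracked;
		tracked = n;
	}
	detach_all();
	return 0;
}
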
diff --git a/kernel/rcupdate.c b/kernel/rcupdate.c
index cae8a059cf47..2c7b8457d0d2 100644
--- a/kernel/rcupdate.c
+++ b/kernel/rcupdate.c
@@ -122,6 +122,8 @@ static void rcu_barrier_func(void *type)
122 } 122 }
123} 123}
124 124
125static inline void wait_migrated_callbacks(void);
126
125/* 127/*
126 * Orchestrate the specified type of RCU barrier, waiting for all 128 * Orchestrate the specified type of RCU barrier, waiting for all
127 * RCU callbacks of the specified type to complete. 129 * RCU callbacks of the specified type to complete.
@@ -147,6 +149,7 @@ static void _rcu_barrier(enum rcu_barrier type)
147 complete(&rcu_barrier_completion); 149 complete(&rcu_barrier_completion);
148 wait_for_completion(&rcu_barrier_completion); 150 wait_for_completion(&rcu_barrier_completion);
149 mutex_unlock(&rcu_barrier_mutex); 151 mutex_unlock(&rcu_barrier_mutex);
152 wait_migrated_callbacks();
150} 153}
151 154
152/** 155/**
@@ -176,9 +179,50 @@ void rcu_barrier_sched(void)
176} 179}
177EXPORT_SYMBOL_GPL(rcu_barrier_sched); 180EXPORT_SYMBOL_GPL(rcu_barrier_sched);
178 181
182static atomic_t rcu_migrate_type_count = ATOMIC_INIT(0);
183static struct rcu_head rcu_migrate_head[3];
184static DECLARE_WAIT_QUEUE_HEAD(rcu_migrate_wq);
185
186static void rcu_migrate_callback(struct rcu_head *notused)
187{
188 if (atomic_dec_and_test(&rcu_migrate_type_count))
189 wake_up(&rcu_migrate_wq);
190}
191
192static inline void wait_migrated_callbacks(void)
193{
194 wait_event(rcu_migrate_wq, !atomic_read(&rcu_migrate_type_count));
195}
196
197static int __cpuinit rcu_barrier_cpu_hotplug(struct notifier_block *self,
198 unsigned long action, void *hcpu)
199{
200 if (action == CPU_DYING) {
201 /*
202 * preempt_disable() in on_each_cpu() prevents stop_machine(),
203 * so when "on_each_cpu(rcu_barrier_func, (void *)type, 1);"
204 * returns, all online cpus have queued rcu_barrier_func(),
 205 * and the dead cpu (if it exists) queues rcu_migrate_callback()s.
206 *
207 * These callbacks ensure _rcu_barrier() waits for all
208 * RCU callbacks of the specified type to complete.
209 */
210 atomic_set(&rcu_migrate_type_count, 3);
211 call_rcu_bh(rcu_migrate_head, rcu_migrate_callback);
212 call_rcu_sched(rcu_migrate_head + 1, rcu_migrate_callback);
213 call_rcu(rcu_migrate_head + 2, rcu_migrate_callback);
214 } else if (action == CPU_POST_DEAD) {
215 /* rcu_migrate_head is protected by cpu_add_remove_lock */
216 wait_migrated_callbacks();
217 }
218
219 return NOTIFY_OK;
220}
221
179void __init rcu_init(void) 222void __init rcu_init(void)
180{ 223{
181 __rcu_init(); 224 __rcu_init();
225 hotcpu_notifier(rcu_barrier_cpu_hotplug, 0);
182} 226}
183 227
184void rcu_scheduler_starting(void) 228void rcu_scheduler_starting(void)
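
The rcupdate.c hunk sets an atomic count to three, queues three migrate callbacks that each decrement it, and has wait_migrated_callbacks() sleep until the count reaches zero. The sketch below reproduces that completion pattern in userspace with a mutex and condition variable instead of the atomic counter and wait queue; it is an analogue of the synchronization shape, not of RCU itself.

#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t  done = PTHREAD_COND_INITIALIZER;
static int pending = 3;			/* like rcu_migrate_type_count */

/* Stand-in for rcu_migrate_callback(): last one to finish wakes the waiter. */
static void *callback(void *arg)
{
	pthread_mutex_lock(&lock);
	if (--pending == 0)
		pthread_cond_signal(&done);
	pthread_mutex_unlock(&lock);
	return NULL;
}

/* Stand-in for wait_migrated_callbacks(): sleep until all three ran. */
static void wait_callbacks(void)
{
	pthread_mutex_lock(&lock);
	while (pending)
		pthread_cond_wait(&done, &lock);
	pthread_mutex_unlock(&lock);
}

int main(void)
{
	pthread_t t[3];
	int i;

	for (i = 0; i < 3; i++)
		pthread_create(&t[i], NULL, callback, NULL);
	wait_callbacks();
	printf("all migrate callbacks completed\n");
	for (i = 0; i < 3; i++)
		pthread_join(t[i], NULL);
	return 0;
}
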
diff --git a/kernel/relay.c b/kernel/relay.c
index 8f2179c8056f..e92db8c06acf 100644
--- a/kernel/relay.c
+++ b/kernel/relay.c
@@ -797,13 +797,15 @@ void relay_subbufs_consumed(struct rchan *chan,
797 if (!chan) 797 if (!chan)
798 return; 798 return;
799 799
800 if (cpu >= NR_CPUS || !chan->buf[cpu]) 800 if (cpu >= NR_CPUS || !chan->buf[cpu] ||
801 subbufs_consumed > chan->n_subbufs)
801 return; 802 return;
802 803
803 buf = chan->buf[cpu]; 804 buf = chan->buf[cpu];
804 buf->subbufs_consumed += subbufs_consumed; 805 if (subbufs_consumed > buf->subbufs_produced - buf->subbufs_consumed)
805 if (buf->subbufs_consumed > buf->subbufs_produced)
806 buf->subbufs_consumed = buf->subbufs_produced; 806 buf->subbufs_consumed = buf->subbufs_produced;
807 else
808 buf->subbufs_consumed += subbufs_consumed;
807} 809}
808EXPORT_SYMBOL_GPL(relay_subbufs_consumed); 810EXPORT_SYMBOL_GPL(relay_subbufs_consumed);
809 811
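
The relay_subbufs_consumed() fix ensures the consumed counter can never advance past what has been produced, and rejects obviously bogus requests up front. A tiny userspace sketch of the clamped update (hypothetical struct, no relay buffers):

#include <stdio.h>

struct buf {
	unsigned long produced;
	unsigned long consumed;
};

/* Advance consumed by n, but never beyond produced. */
static void consume(struct buf *b, unsigned long n)
{
	if (n > b->produced - b->consumed)
		b->consumed = b->produced;
	else
		b->consumed += n;
}

int main(void)
{
	struct buf b = { 5, 3 };

	consume(&b, 10);			/* asks for more than exists */
	printf("consumed=%lu produced=%lu\n", b.consumed, b.produced);
	return 0;
}
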
diff --git a/kernel/sched.c b/kernel/sched.c
index 196d48babbef..55a10b8e31bb 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -1110,7 +1110,7 @@ static void hrtick_start(struct rq *rq, u64 delay)
1110 if (rq == this_rq()) { 1110 if (rq == this_rq()) {
1111 hrtimer_restart(timer); 1111 hrtimer_restart(timer);
1112 } else if (!rq->hrtick_csd_pending) { 1112 } else if (!rq->hrtick_csd_pending) {
1113 __smp_call_function_single(cpu_of(rq), &rq->hrtick_csd); 1113 __smp_call_function_single(cpu_of(rq), &rq->hrtick_csd, 0);
1114 rq->hrtick_csd_pending = 1; 1114 rq->hrtick_csd_pending = 1;
1115 } 1115 }
1116} 1116}
@@ -3818,19 +3818,23 @@ find_busiest_queue(struct sched_group *group, enum cpu_idle_type idle,
3818 */ 3818 */
3819#define MAX_PINNED_INTERVAL 512 3819#define MAX_PINNED_INTERVAL 512
3820 3820
3821/* Working cpumask for load_balance and load_balance_newidle. */
3822static DEFINE_PER_CPU(cpumask_var_t, load_balance_tmpmask);
3823
3821/* 3824/*
3822 * Check this_cpu to ensure it is balanced within domain. Attempt to move 3825 * Check this_cpu to ensure it is balanced within domain. Attempt to move
3823 * tasks if there is an imbalance. 3826 * tasks if there is an imbalance.
3824 */ 3827 */
3825static int load_balance(int this_cpu, struct rq *this_rq, 3828static int load_balance(int this_cpu, struct rq *this_rq,
3826 struct sched_domain *sd, enum cpu_idle_type idle, 3829 struct sched_domain *sd, enum cpu_idle_type idle,
3827 int *balance, struct cpumask *cpus) 3830 int *balance)
3828{ 3831{
3829 int ld_moved, all_pinned = 0, active_balance = 0, sd_idle = 0; 3832 int ld_moved, all_pinned = 0, active_balance = 0, sd_idle = 0;
3830 struct sched_group *group; 3833 struct sched_group *group;
3831 unsigned long imbalance; 3834 unsigned long imbalance;
3832 struct rq *busiest; 3835 struct rq *busiest;
3833 unsigned long flags; 3836 unsigned long flags;
3837 struct cpumask *cpus = __get_cpu_var(load_balance_tmpmask);
3834 3838
3835 cpumask_setall(cpus); 3839 cpumask_setall(cpus);
3836 3840
@@ -3985,8 +3989,7 @@ out:
3985 * this_rq is locked. 3989 * this_rq is locked.
3986 */ 3990 */
3987static int 3991static int
3988load_balance_newidle(int this_cpu, struct rq *this_rq, struct sched_domain *sd, 3992load_balance_newidle(int this_cpu, struct rq *this_rq, struct sched_domain *sd)
3989 struct cpumask *cpus)
3990{ 3993{
3991 struct sched_group *group; 3994 struct sched_group *group;
3992 struct rq *busiest = NULL; 3995 struct rq *busiest = NULL;
@@ -3994,6 +3997,7 @@ load_balance_newidle(int this_cpu, struct rq *this_rq, struct sched_domain *sd,
3994 int ld_moved = 0; 3997 int ld_moved = 0;
3995 int sd_idle = 0; 3998 int sd_idle = 0;
3996 int all_pinned = 0; 3999 int all_pinned = 0;
4000 struct cpumask *cpus = __get_cpu_var(load_balance_tmpmask);
3997 4001
3998 cpumask_setall(cpus); 4002 cpumask_setall(cpus);
3999 4003
@@ -4134,10 +4138,6 @@ static void idle_balance(int this_cpu, struct rq *this_rq)
4134 struct sched_domain *sd; 4138 struct sched_domain *sd;
4135 int pulled_task = 0; 4139 int pulled_task = 0;
4136 unsigned long next_balance = jiffies + HZ; 4140 unsigned long next_balance = jiffies + HZ;
4137 cpumask_var_t tmpmask;
4138
4139 if (!alloc_cpumask_var(&tmpmask, GFP_ATOMIC))
4140 return;
4141 4141
4142 for_each_domain(this_cpu, sd) { 4142 for_each_domain(this_cpu, sd) {
4143 unsigned long interval; 4143 unsigned long interval;
@@ -4148,7 +4148,7 @@ static void idle_balance(int this_cpu, struct rq *this_rq)
4148 if (sd->flags & SD_BALANCE_NEWIDLE) 4148 if (sd->flags & SD_BALANCE_NEWIDLE)
4149 /* If we've pulled tasks over stop searching: */ 4149 /* If we've pulled tasks over stop searching: */
4150 pulled_task = load_balance_newidle(this_cpu, this_rq, 4150 pulled_task = load_balance_newidle(this_cpu, this_rq,
4151 sd, tmpmask); 4151 sd);
4152 4152
4153 interval = msecs_to_jiffies(sd->balance_interval); 4153 interval = msecs_to_jiffies(sd->balance_interval);
4154 if (time_after(next_balance, sd->last_balance + interval)) 4154 if (time_after(next_balance, sd->last_balance + interval))
@@ -4163,7 +4163,6 @@ static void idle_balance(int this_cpu, struct rq *this_rq)
4163 */ 4163 */
4164 this_rq->next_balance = next_balance; 4164 this_rq->next_balance = next_balance;
4165 } 4165 }
4166 free_cpumask_var(tmpmask);
4167} 4166}
4168 4167
4169/* 4168/*
@@ -4313,11 +4312,6 @@ static void rebalance_domains(int cpu, enum cpu_idle_type idle)
4313 unsigned long next_balance = jiffies + 60*HZ; 4312 unsigned long next_balance = jiffies + 60*HZ;
4314 int update_next_balance = 0; 4313 int update_next_balance = 0;
4315 int need_serialize; 4314 int need_serialize;
4316 cpumask_var_t tmp;
4317
4318 /* Fails alloc? Rebalancing probably not a priority right now. */
4319 if (!alloc_cpumask_var(&tmp, GFP_ATOMIC))
4320 return;
4321 4315
4322 for_each_domain(cpu, sd) { 4316 for_each_domain(cpu, sd) {
4323 if (!(sd->flags & SD_LOAD_BALANCE)) 4317 if (!(sd->flags & SD_LOAD_BALANCE))
@@ -4342,7 +4336,7 @@ static void rebalance_domains(int cpu, enum cpu_idle_type idle)
4342 } 4336 }
4343 4337
4344 if (time_after_eq(jiffies, sd->last_balance + interval)) { 4338 if (time_after_eq(jiffies, sd->last_balance + interval)) {
4345 if (load_balance(cpu, rq, sd, idle, &balance, tmp)) { 4339 if (load_balance(cpu, rq, sd, idle, &balance)) {
4346 /* 4340 /*
4347 * We've pulled tasks over so either we're no 4341 * We've pulled tasks over so either we're no
4348 * longer idle, or one of our SMT siblings is 4342 * longer idle, or one of our SMT siblings is
@@ -4376,8 +4370,6 @@ out:
4376 */ 4370 */
4377 if (likely(update_next_balance)) 4371 if (likely(update_next_balance))
4378 rq->next_balance = next_balance; 4372 rq->next_balance = next_balance;
4379
4380 free_cpumask_var(tmp);
4381} 4373}
4382 4374
4383/* 4375/*
@@ -5196,11 +5188,17 @@ void __wake_up_locked(wait_queue_head_t *q, unsigned int mode)
5196 __wake_up_common(q, mode, 1, 0, NULL); 5188 __wake_up_common(q, mode, 1, 0, NULL);
5197} 5189}
5198 5190
5191void __wake_up_locked_key(wait_queue_head_t *q, unsigned int mode, void *key)
5192{
5193 __wake_up_common(q, mode, 1, 0, key);
5194}
5195
5199/** 5196/**
5200 * __wake_up_sync - wake up threads blocked on a waitqueue. 5197 * __wake_up_sync_key - wake up threads blocked on a waitqueue.
5201 * @q: the waitqueue 5198 * @q: the waitqueue
5202 * @mode: which threads 5199 * @mode: which threads
5203 * @nr_exclusive: how many wake-one or wake-many threads to wake up 5200 * @nr_exclusive: how many wake-one or wake-many threads to wake up
5201 * @key: opaque value to be passed to wakeup targets
5204 * 5202 *
5205 * The sync wakeup differs that the waker knows that it will schedule 5203 * The sync wakeup differs that the waker knows that it will schedule
5206 * away soon, so while the target thread will be woken up, it will not 5204 * away soon, so while the target thread will be woken up, it will not
@@ -5209,8 +5207,8 @@ void __wake_up_locked(wait_queue_head_t *q, unsigned int mode)
5209 * 5207 *
5210 * On UP it can prevent extra preemption. 5208 * On UP it can prevent extra preemption.
5211 */ 5209 */
5212void 5210void __wake_up_sync_key(wait_queue_head_t *q, unsigned int mode,
5213__wake_up_sync(wait_queue_head_t *q, unsigned int mode, int nr_exclusive) 5211 int nr_exclusive, void *key)
5214{ 5212{
5215 unsigned long flags; 5213 unsigned long flags;
5216 int sync = 1; 5214 int sync = 1;
@@ -5222,9 +5220,18 @@ __wake_up_sync(wait_queue_head_t *q, unsigned int mode, int nr_exclusive)
5222 sync = 0; 5220 sync = 0;
5223 5221
5224 spin_lock_irqsave(&q->lock, flags); 5222 spin_lock_irqsave(&q->lock, flags);
5225 __wake_up_common(q, mode, nr_exclusive, sync, NULL); 5223 __wake_up_common(q, mode, nr_exclusive, sync, key);
5226 spin_unlock_irqrestore(&q->lock, flags); 5224 spin_unlock_irqrestore(&q->lock, flags);
5227} 5225}
5226EXPORT_SYMBOL_GPL(__wake_up_sync_key);
5227
5228/*
5229 * __wake_up_sync - see __wake_up_sync_key()
5230 */
5231void __wake_up_sync(wait_queue_head_t *q, unsigned int mode, int nr_exclusive)
5232{
5233 __wake_up_sync_key(q, mode, nr_exclusive, NULL);
5234}
5228EXPORT_SYMBOL_GPL(__wake_up_sync); /* For internal use only */ 5235EXPORT_SYMBOL_GPL(__wake_up_sync); /* For internal use only */
5229 5236
5230/** 5237/**
@@ -7713,7 +7720,7 @@ cpu_to_core_group(int cpu, const struct cpumask *cpu_map,
7713{ 7720{
7714 int group; 7721 int group;
7715 7722
7716 cpumask_and(mask, &per_cpu(cpu_sibling_map, cpu), cpu_map); 7723 cpumask_and(mask, topology_thread_cpumask(cpu), cpu_map);
7717 group = cpumask_first(mask); 7724 group = cpumask_first(mask);
7718 if (sg) 7725 if (sg)
7719 *sg = &per_cpu(sched_group_core, group).sg; 7726 *sg = &per_cpu(sched_group_core, group).sg;
@@ -7742,7 +7749,7 @@ cpu_to_phys_group(int cpu, const struct cpumask *cpu_map,
7742 cpumask_and(mask, cpu_coregroup_mask(cpu), cpu_map); 7749 cpumask_and(mask, cpu_coregroup_mask(cpu), cpu_map);
7743 group = cpumask_first(mask); 7750 group = cpumask_first(mask);
7744#elif defined(CONFIG_SCHED_SMT) 7751#elif defined(CONFIG_SCHED_SMT)
7745 cpumask_and(mask, &per_cpu(cpu_sibling_map, cpu), cpu_map); 7752 cpumask_and(mask, topology_thread_cpumask(cpu), cpu_map);
7746 group = cpumask_first(mask); 7753 group = cpumask_first(mask);
7747#else 7754#else
7748 group = cpu; 7755 group = cpu;
@@ -8085,7 +8092,7 @@ static int __build_sched_domains(const struct cpumask *cpu_map,
8085 SD_INIT(sd, SIBLING); 8092 SD_INIT(sd, SIBLING);
8086 set_domain_attribute(sd, attr); 8093 set_domain_attribute(sd, attr);
8087 cpumask_and(sched_domain_span(sd), 8094 cpumask_and(sched_domain_span(sd),
8088 &per_cpu(cpu_sibling_map, i), cpu_map); 8095 topology_thread_cpumask(i), cpu_map);
8089 sd->parent = p; 8096 sd->parent = p;
8090 p->child = sd; 8097 p->child = sd;
8091 cpu_to_cpu_group(i, cpu_map, &sd->groups, tmpmask); 8098 cpu_to_cpu_group(i, cpu_map, &sd->groups, tmpmask);
@@ -8096,7 +8103,7 @@ static int __build_sched_domains(const struct cpumask *cpu_map,
8096 /* Set up CPU (sibling) groups */ 8103 /* Set up CPU (sibling) groups */
8097 for_each_cpu(i, cpu_map) { 8104 for_each_cpu(i, cpu_map) {
8098 cpumask_and(this_sibling_map, 8105 cpumask_and(this_sibling_map,
8099 &per_cpu(cpu_sibling_map, i), cpu_map); 8106 topology_thread_cpumask(i), cpu_map);
8100 if (i != cpumask_first(this_sibling_map)) 8107 if (i != cpumask_first(this_sibling_map))
8101 continue; 8108 continue;
8102 8109
@@ -8772,6 +8779,9 @@ void __init sched_init(void)
8772#ifdef CONFIG_USER_SCHED 8779#ifdef CONFIG_USER_SCHED
8773 alloc_size *= 2; 8780 alloc_size *= 2;
8774#endif 8781#endif
8782#ifdef CONFIG_CPUMASK_OFFSTACK
8783 alloc_size += num_possible_cpus() * cpumask_size();
8784#endif
8775 /* 8785 /*
8776 * As sched_init() is called before page_alloc is setup, 8786 * As sched_init() is called before page_alloc is setup,
8777 * we use alloc_bootmem(). 8787 * we use alloc_bootmem().
@@ -8809,6 +8819,12 @@ void __init sched_init(void)
8809 ptr += nr_cpu_ids * sizeof(void **); 8819 ptr += nr_cpu_ids * sizeof(void **);
8810#endif /* CONFIG_USER_SCHED */ 8820#endif /* CONFIG_USER_SCHED */
8811#endif /* CONFIG_RT_GROUP_SCHED */ 8821#endif /* CONFIG_RT_GROUP_SCHED */
8822#ifdef CONFIG_CPUMASK_OFFSTACK
8823 for_each_possible_cpu(i) {
8824 per_cpu(load_balance_tmpmask, i) = (void *)ptr;
8825 ptr += cpumask_size();
8826 }
8827#endif /* CONFIG_CPUMASK_OFFSTACK */
8812 } 8828 }
8813 8829
8814#ifdef CONFIG_SMP 8830#ifdef CONFIG_SMP
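
The sched_init() hunks enlarge a single boot-time allocation and then carve per-CPU cpumask storage out of it with pointer arithmetic, instead of allocating a scratch mask on every load-balance call. The userspace sketch below shows only that carving pattern: one allocation sliced into fixed-size per-CPU regions. NR_CPUS and MASK_SIZE are made-up constants, and calloc() stands in for the bootmem allocator.

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#define NR_CPUS   4
#define MASK_SIZE 32	/* stand-in for cpumask_size() */

static void *per_cpu_mask[NR_CPUS];

int main(void)
{
	/* One allocation sized for every CPU. */
	char *ptr = calloc(NR_CPUS, MASK_SIZE);
	int i;

	if (!ptr)
		return 1;

	/* Carve the block into per-CPU slices with pointer arithmetic. */
	for (i = 0; i < NR_CPUS; i++) {
		per_cpu_mask[i] = ptr;
		ptr += MASK_SIZE;
	}

	memset(per_cpu_mask[2], 0xff, MASK_SIZE);	/* each CPU owns its slice */
	printf("cpu2 mask at %p\n", per_cpu_mask[2]);
	return 0;
}
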
diff --git a/kernel/signal.c b/kernel/signal.c
index 1c8814481a11..d8034737db4c 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -55,10 +55,22 @@ static int sig_handler_ignored(void __user *handler, int sig)
55 (handler == SIG_DFL && sig_kernel_ignore(sig)); 55 (handler == SIG_DFL && sig_kernel_ignore(sig));
56} 56}
57 57
58static int sig_ignored(struct task_struct *t, int sig) 58static int sig_task_ignored(struct task_struct *t, int sig,
59 int from_ancestor_ns)
59{ 60{
60 void __user *handler; 61 void __user *handler;
61 62
63 handler = sig_handler(t, sig);
64
65 if (unlikely(t->signal->flags & SIGNAL_UNKILLABLE) &&
66 handler == SIG_DFL && !from_ancestor_ns)
67 return 1;
68
69 return sig_handler_ignored(handler, sig);
70}
71
72static int sig_ignored(struct task_struct *t, int sig, int from_ancestor_ns)
73{
62 /* 74 /*
63 * Blocked signals are never ignored, since the 75 * Blocked signals are never ignored, since the
64 * signal handler may change by the time it is 76 * signal handler may change by the time it is
@@ -67,14 +79,13 @@ static int sig_ignored(struct task_struct *t, int sig)
67 if (sigismember(&t->blocked, sig) || sigismember(&t->real_blocked, sig)) 79 if (sigismember(&t->blocked, sig) || sigismember(&t->real_blocked, sig))
68 return 0; 80 return 0;
69 81
70 handler = sig_handler(t, sig); 82 if (!sig_task_ignored(t, sig, from_ancestor_ns))
71 if (!sig_handler_ignored(handler, sig))
72 return 0; 83 return 0;
73 84
74 /* 85 /*
75 * Tracers may want to know about even ignored signals. 86 * Tracers may want to know about even ignored signals.
76 */ 87 */
77 return !tracehook_consider_ignored_signal(t, sig, handler); 88 return !tracehook_consider_ignored_signal(t, sig);
78} 89}
79 90
80/* 91/*
@@ -318,7 +329,7 @@ int unhandled_signal(struct task_struct *tsk, int sig)
318 return 1; 329 return 1;
319 if (handler != SIG_IGN && handler != SIG_DFL) 330 if (handler != SIG_IGN && handler != SIG_DFL)
320 return 0; 331 return 0;
321 return !tracehook_consider_fatal_signal(tsk, sig, handler); 332 return !tracehook_consider_fatal_signal(tsk, sig);
322} 333}
323 334
324 335
@@ -624,7 +635,7 @@ static int check_kill_permission(int sig, struct siginfo *info,
624 * Returns true if the signal should be actually delivered, otherwise 635 * Returns true if the signal should be actually delivered, otherwise
625 * it should be dropped. 636 * it should be dropped.
626 */ 637 */
627static int prepare_signal(int sig, struct task_struct *p) 638static int prepare_signal(int sig, struct task_struct *p, int from_ancestor_ns)
628{ 639{
629 struct signal_struct *signal = p->signal; 640 struct signal_struct *signal = p->signal;
630 struct task_struct *t; 641 struct task_struct *t;
@@ -708,7 +719,7 @@ static int prepare_signal(int sig, struct task_struct *p)
708 } 719 }
709 } 720 }
710 721
711 return !sig_ignored(p, sig); 722 return !sig_ignored(p, sig, from_ancestor_ns);
712} 723}
713 724
714/* 725/*
@@ -777,7 +788,7 @@ static void complete_signal(int sig, struct task_struct *p, int group)
777 !(signal->flags & (SIGNAL_UNKILLABLE | SIGNAL_GROUP_EXIT)) && 788 !(signal->flags & (SIGNAL_UNKILLABLE | SIGNAL_GROUP_EXIT)) &&
778 !sigismember(&t->real_blocked, sig) && 789 !sigismember(&t->real_blocked, sig) &&
779 (sig == SIGKILL || 790 (sig == SIGKILL ||
780 !tracehook_consider_fatal_signal(t, sig, SIG_DFL))) { 791 !tracehook_consider_fatal_signal(t, sig))) {
781 /* 792 /*
782 * This signal will be fatal to the whole group. 793 * This signal will be fatal to the whole group.
783 */ 794 */
@@ -813,8 +824,8 @@ static inline int legacy_queue(struct sigpending *signals, int sig)
813 return (sig < SIGRTMIN) && sigismember(&signals->signal, sig); 824 return (sig < SIGRTMIN) && sigismember(&signals->signal, sig);
814} 825}
815 826
816static int send_signal(int sig, struct siginfo *info, struct task_struct *t, 827static int __send_signal(int sig, struct siginfo *info, struct task_struct *t,
817 int group) 828 int group, int from_ancestor_ns)
818{ 829{
819 struct sigpending *pending; 830 struct sigpending *pending;
820 struct sigqueue *q; 831 struct sigqueue *q;
@@ -822,7 +833,8 @@ static int send_signal(int sig, struct siginfo *info, struct task_struct *t,
822 trace_sched_signal_send(sig, t); 833 trace_sched_signal_send(sig, t);
823 834
824 assert_spin_locked(&t->sighand->siglock); 835 assert_spin_locked(&t->sighand->siglock);
825 if (!prepare_signal(sig, t)) 836
837 if (!prepare_signal(sig, t, from_ancestor_ns))
826 return 0; 838 return 0;
827 839
828 pending = group ? &t->signal->shared_pending : &t->pending; 840 pending = group ? &t->signal->shared_pending : &t->pending;
@@ -871,6 +883,8 @@ static int send_signal(int sig, struct siginfo *info, struct task_struct *t,
871 break; 883 break;
872 default: 884 default:
873 copy_siginfo(&q->info, info); 885 copy_siginfo(&q->info, info);
886 if (from_ancestor_ns)
887 q->info.si_pid = 0;
874 break; 888 break;
875 } 889 }
876 } else if (!is_si_special(info)) { 890 } else if (!is_si_special(info)) {
@@ -889,6 +903,20 @@ out_set:
889 return 0; 903 return 0;
890} 904}
891 905
906static int send_signal(int sig, struct siginfo *info, struct task_struct *t,
907 int group)
908{
909 int from_ancestor_ns = 0;
910
911#ifdef CONFIG_PID_NS
912 if (!is_si_special(info) && SI_FROMUSER(info) &&
913 task_pid_nr_ns(current, task_active_pid_ns(t)) <= 0)
914 from_ancestor_ns = 1;
915#endif
916
917 return __send_signal(sig, info, t, group, from_ancestor_ns);
918}
919
892int print_fatal_signals; 920int print_fatal_signals;
893 921
894static void print_fatal_signal(struct pt_regs *regs, int signr) 922static void print_fatal_signal(struct pt_regs *regs, int signr)
@@ -1133,7 +1161,7 @@ int kill_pid_info_as_uid(int sig, struct siginfo *info, struct pid *pid,
1133 if (sig && p->sighand) { 1161 if (sig && p->sighand) {
1134 unsigned long flags; 1162 unsigned long flags;
1135 spin_lock_irqsave(&p->sighand->siglock, flags); 1163 spin_lock_irqsave(&p->sighand->siglock, flags);
1136 ret = __group_send_sig_info(sig, info, p); 1164 ret = __send_signal(sig, info, p, 1, 0);
1137 spin_unlock_irqrestore(&p->sighand->siglock, flags); 1165 spin_unlock_irqrestore(&p->sighand->siglock, flags);
1138 } 1166 }
1139out_unlock: 1167out_unlock:
@@ -1320,7 +1348,7 @@ int send_sigqueue(struct sigqueue *q, struct task_struct *t, int group)
1320 goto ret; 1348 goto ret;
1321 1349
1322 ret = 1; /* the signal is ignored */ 1350 ret = 1; /* the signal is ignored */
1323 if (!prepare_signal(sig, t)) 1351 if (!prepare_signal(sig, t, 0))
1324 goto out; 1352 goto out;
1325 1353
1326 ret = 0; 1354 ret = 0;
@@ -1844,9 +1872,16 @@ relock:
1844 1872
1845 /* 1873 /*
1846 * Global init gets no signals it doesn't want. 1874 * Global init gets no signals it doesn't want.
1875 * Container-init gets no signals it doesn't want from same
1876 * container.
1877 *
1878 * Note that if global/container-init sees a sig_kernel_only()
1879 * signal here, the signal must have been generated internally
1880 * or must have come from an ancestor namespace. In either
1881 * case, the signal cannot be dropped.
1847 */ 1882 */
1848 if (unlikely(signal->flags & SIGNAL_UNKILLABLE) && 1883 if (unlikely(signal->flags & SIGNAL_UNKILLABLE) &&
1849 !signal_group_exit(signal)) 1884 !sig_kernel_only(signr))
1850 continue; 1885 continue;
1851 1886
1852 if (sig_kernel_stop(signr)) { 1887 if (sig_kernel_stop(signr)) {
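
sig_task_ignored() above lets a SIGNAL_UNKILLABLE task (global or container init) drop default-action signals unless they were sent from an ancestor pid namespace. The snippet below flattens just that decision into plain booleans so the rule is easy to test in isolation; the function name and parameters are hypothetical and no real signal delivery is involved.

#include <stdio.h>

/* Hypothetical flattening of the kernel checks into plain booleans. */
static int sig_task_ignored(int unkillable, int handler_is_default,
			    int from_ancestor_ns, int handler_ignores)
{
	/* init ignores default-action signals from its own namespace */
	if (unkillable && handler_is_default && !from_ancestor_ns)
		return 1;

	return handler_ignores;
}

int main(void)
{
	printf("init, SIG_DFL, same ns      -> %d\n",
	       sig_task_ignored(1, 1, 0, 0));	/* 1: dropped */
	printf("init, SIG_DFL, ancestor ns  -> %d\n",
	       sig_task_ignored(1, 1, 1, 0));	/* 0: delivered */
	printf("normal task, SIG_DFL        -> %d\n",
	       sig_task_ignored(0, 1, 0, 0));	/* 0: delivered */
	return 0;
}
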
diff --git a/kernel/slow-work.c b/kernel/slow-work.c
new file mode 100644
index 000000000000..cf2bc01186ef
--- /dev/null
+++ b/kernel/slow-work.c
@@ -0,0 +1,640 @@
1/* Worker thread pool for slow items, such as filesystem lookups or mkdirs
2 *
3 * Copyright (C) 2008 Red Hat, Inc. All Rights Reserved.
4 * Written by David Howells (dhowells@redhat.com)
5 *
6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public Licence
8 * as published by the Free Software Foundation; either version
9 * 2 of the Licence, or (at your option) any later version.
10 *
11 * See Documentation/slow-work.txt
12 */
13
14#include <linux/module.h>
15#include <linux/slow-work.h>
16#include <linux/kthread.h>
17#include <linux/freezer.h>
18#include <linux/wait.h>
19
20#define SLOW_WORK_CULL_TIMEOUT (5 * HZ) /* cull threads 5s after running out of
21 * things to do */
22#define SLOW_WORK_OOM_TIMEOUT (5 * HZ) /* can't start new threads for 5s after
23 * OOM */
24
25static void slow_work_cull_timeout(unsigned long);
26static void slow_work_oom_timeout(unsigned long);
27
28#ifdef CONFIG_SYSCTL
29static int slow_work_min_threads_sysctl(struct ctl_table *, int, struct file *,
30 void __user *, size_t *, loff_t *);
31
32static int slow_work_max_threads_sysctl(struct ctl_table *, int , struct file *,
33 void __user *, size_t *, loff_t *);
34#endif
35
36/*
37 * The pool of threads has at least min threads in it as long as someone is
38 * using the facility, and may have as many as max.
39 *
40 * A portion of the pool may be processing very slow operations.
41 */
42static unsigned slow_work_min_threads = 2;
43static unsigned slow_work_max_threads = 4;
44static unsigned vslow_work_proportion = 50; /* % of threads that may process
45 * very slow work */
46
47#ifdef CONFIG_SYSCTL
48static const int slow_work_min_min_threads = 2;
49static int slow_work_max_max_threads = 255;
50static const int slow_work_min_vslow = 1;
51static const int slow_work_max_vslow = 99;
52
53ctl_table slow_work_sysctls[] = {
54 {
55 .ctl_name = CTL_UNNUMBERED,
56 .procname = "min-threads",
57 .data = &slow_work_min_threads,
58 .maxlen = sizeof(unsigned),
59 .mode = 0644,
60 .proc_handler = slow_work_min_threads_sysctl,
61 .extra1 = (void *) &slow_work_min_min_threads,
62 .extra2 = &slow_work_max_threads,
63 },
64 {
65 .ctl_name = CTL_UNNUMBERED,
66 .procname = "max-threads",
67 .data = &slow_work_max_threads,
68 .maxlen = sizeof(unsigned),
69 .mode = 0644,
70 .proc_handler = slow_work_max_threads_sysctl,
71 .extra1 = &slow_work_min_threads,
72 .extra2 = (void *) &slow_work_max_max_threads,
73 },
74 {
75 .ctl_name = CTL_UNNUMBERED,
76 .procname = "vslow-percentage",
77 .data = &vslow_work_proportion,
78 .maxlen = sizeof(unsigned),
79 .mode = 0644,
80 .proc_handler = &proc_dointvec_minmax,
81 .extra1 = (void *) &slow_work_min_vslow,
82 .extra2 = (void *) &slow_work_max_vslow,
83 },
84 { .ctl_name = 0 }
85};
86#endif
87
88/*
89 * The active state of the thread pool
90 */
91static atomic_t slow_work_thread_count;
92static atomic_t vslow_work_executing_count;
93
94static bool slow_work_may_not_start_new_thread;
95static bool slow_work_cull; /* cull a thread due to lack of activity */
96static DEFINE_TIMER(slow_work_cull_timer, slow_work_cull_timeout, 0, 0);
97static DEFINE_TIMER(slow_work_oom_timer, slow_work_oom_timeout, 0, 0);
98static struct slow_work slow_work_new_thread; /* new thread starter */
99
100/*
101 * The queues of work items and the lock governing access to them. These are
102 * shared between all the CPUs. It doesn't make sense to have per-CPU queues
103 * as the number of threads bears no relation to the number of CPUs.
104 *
105 * There are two queues of work items: one for slow work items, and one for
106 * very slow work items.
107 */
108static LIST_HEAD(slow_work_queue);
109static LIST_HEAD(vslow_work_queue);
110static DEFINE_SPINLOCK(slow_work_queue_lock);
111
112/*
113 * The thread controls. A variable used to signal to the threads that they
114 * should exit when the queue is empty, a waitqueue used by the threads to wait
115 * for signals, and a completion set by the last thread to exit.
116 */
117static bool slow_work_threads_should_exit;
118static DECLARE_WAIT_QUEUE_HEAD(slow_work_thread_wq);
119static DECLARE_COMPLETION(slow_work_last_thread_exited);
120
121/*
122 * The number of users of the thread pool and its lock. Whilst this is zero we
123 * have no threads hanging around, and when this reaches zero, we wait for all
124 * active or queued work items to complete and kill all the threads we do have.
125 */
126static int slow_work_user_count;
127static DEFINE_MUTEX(slow_work_user_lock);
128
129/*
130 * Calculate the maximum number of active threads in the pool that are
131 * permitted to process very slow work items.
132 *
133 * The answer is rounded up to at least 1, but may not equal or exceed the
134 * maximum number of the threads in the pool. This means we always have at
135 * least one thread that can process slow work items, and we always have at
136 * least one thread that won't get tied up doing so.
137 */
138static unsigned slow_work_calc_vsmax(void)
139{
140 unsigned vsmax;
141
142 vsmax = atomic_read(&slow_work_thread_count) * vslow_work_proportion;
143 vsmax /= 100;
144 vsmax = max(vsmax, 1U);
145 return min(vsmax, slow_work_max_threads - 1);
146}
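
slow_work_calc_vsmax() above clamps the "very slow" share of the pool to at least one thread while always leaving at least one thread free for ordinary slow work. A standalone sketch of just that arithmetic, with hypothetical helper macros in place of the kernel's max()/min():

#include <stdio.h>

#define max_u(a, b) ((a) > (b) ? (a) : (b))
#define min_u(a, b) ((a) < (b) ? (a) : (b))

/* Same clamping as slow_work_calc_vsmax(): a proportion of the pool,
 * at least 1, and strictly below the pool maximum. */
static unsigned calc_vsmax(unsigned threads, unsigned proportion,
			   unsigned max_threads)
{
	unsigned vsmax = threads * proportion / 100;

	vsmax = max_u(vsmax, 1u);
	return min_u(vsmax, max_threads - 1);
}

int main(void)
{
	printf("%u\n", calc_vsmax(4, 50, 4));	/* 2 */
	printf("%u\n", calc_vsmax(1, 50, 4));	/* clamped up to 1 */
	printf("%u\n", calc_vsmax(8, 99, 4));	/* clamped down to 3 */
	return 0;
}
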
147
148/*
149 * Attempt to execute stuff queued on a slow thread. Return true if we managed
150 * it, false if there was nothing to do.
151 */
152static bool slow_work_execute(void)
153{
154 struct slow_work *work = NULL;
155 unsigned vsmax;
156 bool very_slow;
157
158 vsmax = slow_work_calc_vsmax();
159
160 /* see if we can schedule a new thread to be started if we're not
161 * keeping up with the work */
162 if (!waitqueue_active(&slow_work_thread_wq) &&
163 (!list_empty(&slow_work_queue) || !list_empty(&vslow_work_queue)) &&
164 atomic_read(&slow_work_thread_count) < slow_work_max_threads &&
165 !slow_work_may_not_start_new_thread)
166 slow_work_enqueue(&slow_work_new_thread);
167
168 /* find something to execute */
169 spin_lock_irq(&slow_work_queue_lock);
170 if (!list_empty(&vslow_work_queue) &&
171 atomic_read(&vslow_work_executing_count) < vsmax) {
172 work = list_entry(vslow_work_queue.next,
173 struct slow_work, link);
174 if (test_and_set_bit_lock(SLOW_WORK_EXECUTING, &work->flags))
175 BUG();
176 list_del_init(&work->link);
177 atomic_inc(&vslow_work_executing_count);
178 very_slow = true;
179 } else if (!list_empty(&slow_work_queue)) {
180 work = list_entry(slow_work_queue.next,
181 struct slow_work, link);
182 if (test_and_set_bit_lock(SLOW_WORK_EXECUTING, &work->flags))
183 BUG();
184 list_del_init(&work->link);
185 very_slow = false;
186 } else {
187 very_slow = false; /* avoid the compiler warning */
188 }
189 spin_unlock_irq(&slow_work_queue_lock);
190
191 if (!work)
192 return false;
193
194 if (!test_and_clear_bit(SLOW_WORK_PENDING, &work->flags))
195 BUG();
196
197 work->ops->execute(work);
198
199 if (very_slow)
200 atomic_dec(&vslow_work_executing_count);
201 clear_bit_unlock(SLOW_WORK_EXECUTING, &work->flags);
202
203 /* if someone tried to enqueue the item whilst we were executing it,
204 * then it'll be left unenqueued to avoid multiple threads trying to
205 * execute it simultaneously
206 *
207 * there is, however, a race between us testing the pending flag and
208 * getting the spinlock, and between the enqueuer setting the pending
209 * flag and getting the spinlock, so we use a deferral bit to tell us
210 * if the enqueuer got there first
211 */
212 if (test_bit(SLOW_WORK_PENDING, &work->flags)) {
213 spin_lock_irq(&slow_work_queue_lock);
214
215 if (!test_bit(SLOW_WORK_EXECUTING, &work->flags) &&
216 test_and_clear_bit(SLOW_WORK_ENQ_DEFERRED, &work->flags))
217 goto auto_requeue;
218
219 spin_unlock_irq(&slow_work_queue_lock);
220 }
221
222 work->ops->put_ref(work);
223 return true;
224
225auto_requeue:
226 /* we must complete the enqueue operation
227 * - we transfer our ref on the item back to the appropriate queue
228 * - don't wake another thread up as we're awake already
229 */
230 if (test_bit(SLOW_WORK_VERY_SLOW, &work->flags))
231 list_add_tail(&work->link, &vslow_work_queue);
232 else
233 list_add_tail(&work->link, &slow_work_queue);
234 spin_unlock_irq(&slow_work_queue_lock);
235 return true;
236}
237
238/**
239 * slow_work_enqueue - Schedule a slow work item for processing
240 * @work: The work item to queue
241 *
242 * Schedule a slow work item for processing. If the item is already undergoing
243 * execution, this guarantees not to re-enter the execution routine until the
244 * first execution finishes.
245 *
246 * The item is pinned by this function as it retains a reference to it, managed
247 * through the item operations. The item is unpinned once it has been
248 * executed.
249 *
250 * An item may hog the thread that is running it for a relatively large amount
251 * of time, sufficient, for example, to perform several lookup, mkdir, create
252 * and setxattr operations. It may sleep on I/O and may sleep to obtain locks.
253 *
254 * Conversely, if a number of items are awaiting processing, it may take some
255 * time before any given item is given attention. The number of threads in the
256 * pool may be increased to deal with demand, but only up to a limit.
257 *
258 * If SLOW_WORK_VERY_SLOW is set on the work item, then it will be placed in
259 * the very slow queue, from which only a portion of the threads will be
260 * allowed to pick items to execute. This ensures that very slow items won't
261 * overly block ones that are just ordinarily slow.
262 *
263 * Returns 0 if successful, -EAGAIN if not.
264 */
265int slow_work_enqueue(struct slow_work *work)
266{
267 unsigned long flags;
268
269 BUG_ON(slow_work_user_count <= 0);
270 BUG_ON(!work);
271 BUG_ON(!work->ops);
272 BUG_ON(!work->ops->get_ref);
273
274 /* when honouring an enqueue request, we only promise that we will run
275 * the work function in the future; we do not promise to run it once
276 * per enqueue request
277 *
278 * we use the PENDING bit to merge together repeat requests without
279 * having to disable IRQs and take the spinlock, whilst still
280 * maintaining our promise
281 */
282 if (!test_and_set_bit_lock(SLOW_WORK_PENDING, &work->flags)) {
283 spin_lock_irqsave(&slow_work_queue_lock, flags);
284
285 /* we promise that we will not attempt to execute the work
286 * function in more than one thread simultaneously
287 *
288 * this, however, leaves us with a problem if we're asked to
289 * enqueue the work whilst someone is executing the work
290 * function as simply queueing the work immediately means that
291 * another thread may try executing it whilst it is already
292 * under execution
293 *
294 * to deal with this, we set the ENQ_DEFERRED bit instead of
295 * enqueueing, and the thread currently executing the work
296 * function will enqueue the work item when the work function
297 * returns and it has cleared the EXECUTING bit
298 */
299 if (test_bit(SLOW_WORK_EXECUTING, &work->flags)) {
300 set_bit(SLOW_WORK_ENQ_DEFERRED, &work->flags);
301 } else {
302 if (work->ops->get_ref(work) < 0)
303 goto cant_get_ref;
304 if (test_bit(SLOW_WORK_VERY_SLOW, &work->flags))
305 list_add_tail(&work->link, &vslow_work_queue);
306 else
307 list_add_tail(&work->link, &slow_work_queue);
308 wake_up(&slow_work_thread_wq);
309 }
310
311 spin_unlock_irqrestore(&slow_work_queue_lock, flags);
312 }
313 return 0;
314
315cant_get_ref:
316 spin_unlock_irqrestore(&slow_work_queue_lock, flags);
317 return -EAGAIN;
318}
319EXPORT_SYMBOL(slow_work_enqueue);
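
slow_work_enqueue() above merges repeated enqueue requests through the PENDING bit and, when the item is already mid-execution, records the request with ENQ_DEFERRED instead of queueing it again. The sketch below is a heavily reduced userspace version of that flag protocol using C11 atomics; the flag names and the printf standing in for the actual queue insertion are illustrative only, and the spinlocked queue handling of the real code is omitted.

#include <stdatomic.h>
#include <stdio.h>

enum { F_PENDING, F_EXECUTING, F_DEFERRED };

struct work {
	atomic_uint flags;
};

/* Request execution: repeated requests merge on PENDING; if the item is
 * currently executing, remember the request with DEFERRED instead. */
static void enqueue(struct work *w)
{
	unsigned old = atomic_fetch_or(&w->flags, 1u << F_PENDING);

	if (old & (1u << F_PENDING))
		return;			/* already queued: nothing to do */

	if (atomic_load(&w->flags) & (1u << F_EXECUTING))
		atomic_fetch_or(&w->flags, 1u << F_DEFERRED);
	else
		printf("queued for execution\n");	/* stand-in for list_add_tail() */
}

int main(void)
{
	static struct work w;

	enqueue(&w);	/* queues the item */
	enqueue(&w);	/* merged: PENDING already set */
	return 0;
}
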
320
321/*
322 * Worker thread culling algorithm
323 */
324static bool slow_work_cull_thread(void)
325{
326 unsigned long flags;
327 bool do_cull = false;
328
329 spin_lock_irqsave(&slow_work_queue_lock, flags);
330
331 if (slow_work_cull) {
332 slow_work_cull = false;
333
334 if (list_empty(&slow_work_queue) &&
335 list_empty(&vslow_work_queue) &&
336 atomic_read(&slow_work_thread_count) >
337 slow_work_min_threads) {
338 mod_timer(&slow_work_cull_timer,
339 jiffies + SLOW_WORK_CULL_TIMEOUT);
340 do_cull = true;
341 }
342 }
343
344 spin_unlock_irqrestore(&slow_work_queue_lock, flags);
345 return do_cull;
346}
347
348/*
349 * Determine if there is slow work available for dispatch
350 */
351static inline bool slow_work_available(int vsmax)
352{
353 return !list_empty(&slow_work_queue) ||
354 (!list_empty(&vslow_work_queue) &&
355 atomic_read(&vslow_work_executing_count) < vsmax);
356}
357
358/*
359 * Worker thread dispatcher
360 */
361static int slow_work_thread(void *_data)
362{
363 int vsmax;
364
365 DEFINE_WAIT(wait);
366
367 set_freezable();
368 set_user_nice(current, -5);
369
370 for (;;) {
371 vsmax = vslow_work_proportion;
372 vsmax *= atomic_read(&slow_work_thread_count);
373 vsmax /= 100;
374
375 prepare_to_wait(&slow_work_thread_wq, &wait,
376 TASK_INTERRUPTIBLE);
377 if (!freezing(current) &&
378 !slow_work_threads_should_exit &&
379 !slow_work_available(vsmax) &&
380 !slow_work_cull)
381 schedule();
382 finish_wait(&slow_work_thread_wq, &wait);
383
384 try_to_freeze();
385
386 vsmax = vslow_work_proportion;
387 vsmax *= atomic_read(&slow_work_thread_count);
388 vsmax /= 100;
389
390 if (slow_work_available(vsmax) && slow_work_execute()) {
391 cond_resched();
392 if (list_empty(&slow_work_queue) &&
393 list_empty(&vslow_work_queue) &&
394 atomic_read(&slow_work_thread_count) >
395 slow_work_min_threads)
396 mod_timer(&slow_work_cull_timer,
397 jiffies + SLOW_WORK_CULL_TIMEOUT);
398 continue;
399 }
400
401 if (slow_work_threads_should_exit)
402 break;
403
404 if (slow_work_cull && slow_work_cull_thread())
405 break;
406 }
407
408 if (atomic_dec_and_test(&slow_work_thread_count))
409 complete_and_exit(&slow_work_last_thread_exited, 0);
410 return 0;
411}
412
413/*
414 * Handle thread cull timer expiration
415 */
416static void slow_work_cull_timeout(unsigned long data)
417{
418 slow_work_cull = true;
419 wake_up(&slow_work_thread_wq);
420}
421
422/*
423 * Get a reference on slow work thread starter
424 */
425static int slow_work_new_thread_get_ref(struct slow_work *work)
426{
427 return 0;
428}
429
430/*
431 * Drop a reference on slow work thread starter
432 */
433static void slow_work_new_thread_put_ref(struct slow_work *work)
434{
435}
436
437/*
438 * Start a new slow work thread
439 */
440static void slow_work_new_thread_execute(struct slow_work *work)
441{
442 struct task_struct *p;
443
444 if (slow_work_threads_should_exit)
445 return;
446
447 if (atomic_read(&slow_work_thread_count) >= slow_work_max_threads)
448 return;
449
450 if (!mutex_trylock(&slow_work_user_lock))
451 return;
452
453 slow_work_may_not_start_new_thread = true;
454 atomic_inc(&slow_work_thread_count);
455 p = kthread_run(slow_work_thread, NULL, "kslowd");
456 if (IS_ERR(p)) {
457 printk(KERN_DEBUG "Slow work thread pool: OOM\n");
458 if (atomic_dec_and_test(&slow_work_thread_count))
459 BUG(); /* we're running on a slow work thread... */
460 mod_timer(&slow_work_oom_timer,
461 jiffies + SLOW_WORK_OOM_TIMEOUT);
462 } else {
463 /* ratelimit the starting of new threads */
464 mod_timer(&slow_work_oom_timer, jiffies + 1);
465 }
466
467 mutex_unlock(&slow_work_user_lock);
468}
469
470static const struct slow_work_ops slow_work_new_thread_ops = {
471 .get_ref = slow_work_new_thread_get_ref,
472 .put_ref = slow_work_new_thread_put_ref,
473 .execute = slow_work_new_thread_execute,
474};
475
476/*
477 * post-OOM new thread start suppression expiration
478 */
479static void slow_work_oom_timeout(unsigned long data)
480{
481 slow_work_may_not_start_new_thread = false;
482}
483
484#ifdef CONFIG_SYSCTL
485/*
486 * Handle adjustment of the minimum number of threads
487 */
488static int slow_work_min_threads_sysctl(struct ctl_table *table, int write,
489 struct file *filp, void __user *buffer,
490 size_t *lenp, loff_t *ppos)
491{
492 int ret = proc_dointvec_minmax(table, write, filp, buffer, lenp, ppos);
493 int n;
494
495 if (ret == 0) {
496 mutex_lock(&slow_work_user_lock);
497 if (slow_work_user_count > 0) {
498 /* see if we need to start or stop threads */
499 n = atomic_read(&slow_work_thread_count) -
500 slow_work_min_threads;
501
502 if (n < 0 && !slow_work_may_not_start_new_thread)
503 slow_work_enqueue(&slow_work_new_thread);
504 else if (n > 0)
505 mod_timer(&slow_work_cull_timer,
506 jiffies + SLOW_WORK_CULL_TIMEOUT);
507 }
508 mutex_unlock(&slow_work_user_lock);
509 }
510
511 return ret;
512}
513
514/*
515 * Handle adjustment of the maximum number of threads
516 */
517static int slow_work_max_threads_sysctl(struct ctl_table *table, int write,
518 struct file *filp, void __user *buffer,
519 size_t *lenp, loff_t *ppos)
520{
521 int ret = proc_dointvec_minmax(table, write, filp, buffer, lenp, ppos);
522 int n;
523
524 if (ret == 0) {
525 mutex_lock(&slow_work_user_lock);
526 if (slow_work_user_count > 0) {
527 /* see if we need to stop threads */
528 n = slow_work_max_threads -
529 atomic_read(&slow_work_thread_count);
530
531 if (n < 0)
532 mod_timer(&slow_work_cull_timer,
533 jiffies + SLOW_WORK_CULL_TIMEOUT);
534 }
535 mutex_unlock(&slow_work_user_lock);
536 }
537
538 return ret;
539}
540#endif /* CONFIG_SYSCTL */
541
542/**
543 * slow_work_register_user - Register a user of the facility
544 *
545 * Register a user of the facility, starting up the initial threads if there
546 * aren't any other users at this point. This will return 0 if successful, or
547 * an error if not.
548 */
549int slow_work_register_user(void)
550{
551 struct task_struct *p;
552 int loop;
553
554 mutex_lock(&slow_work_user_lock);
555
556 if (slow_work_user_count == 0) {
557 printk(KERN_NOTICE "Slow work thread pool: Starting up\n");
558 init_completion(&slow_work_last_thread_exited);
559
560 slow_work_threads_should_exit = false;
561 slow_work_init(&slow_work_new_thread,
562 &slow_work_new_thread_ops);
563 slow_work_may_not_start_new_thread = false;
564 slow_work_cull = false;
565
566 /* start the minimum number of threads */
567 for (loop = 0; loop < slow_work_min_threads; loop++) {
568 atomic_inc(&slow_work_thread_count);
569 p = kthread_run(slow_work_thread, NULL, "kslowd");
570 if (IS_ERR(p))
571 goto error;
572 }
573 printk(KERN_NOTICE "Slow work thread pool: Ready\n");
574 }
575
576 slow_work_user_count++;
577 mutex_unlock(&slow_work_user_lock);
578 return 0;
579
580error:
581 if (atomic_dec_and_test(&slow_work_thread_count))
582 complete(&slow_work_last_thread_exited);
583 if (loop > 0) {
584 printk(KERN_ERR "Slow work thread pool:"
585 " Aborting startup on ENOMEM\n");
586 slow_work_threads_should_exit = true;
587 wake_up_all(&slow_work_thread_wq);
588 wait_for_completion(&slow_work_last_thread_exited);
589 printk(KERN_ERR "Slow work thread pool: Aborted\n");
590 }
591 mutex_unlock(&slow_work_user_lock);
592 return PTR_ERR(p);
593}
594EXPORT_SYMBOL(slow_work_register_user);
595
596/**
597 * slow_work_unregister_user - Unregister a user of the facility
598 *
599 * Unregister a user of the facility, killing all the threads if this was the
600 * last one.
601 */
602void slow_work_unregister_user(void)
603{
604 mutex_lock(&slow_work_user_lock);
605
606 BUG_ON(slow_work_user_count <= 0);
607
608 slow_work_user_count--;
609 if (slow_work_user_count == 0) {
610 printk(KERN_NOTICE "Slow work thread pool: Shutting down\n");
611 slow_work_threads_should_exit = true;
612 wake_up_all(&slow_work_thread_wq);
613 wait_for_completion(&slow_work_last_thread_exited);
614 printk(KERN_NOTICE "Slow work thread pool:"
615 " Shut down complete\n");
616 }
617
618 del_timer_sync(&slow_work_cull_timer);
619
620 mutex_unlock(&slow_work_user_lock);
621}
622EXPORT_SYMBOL(slow_work_unregister_user);
623
624/*
625 * Initialise the slow work facility
626 */
627static int __init init_slow_work(void)
628{
629 unsigned nr_cpus = num_possible_cpus();
630
631 if (slow_work_max_threads < nr_cpus)
632 slow_work_max_threads = nr_cpus;
633#ifdef CONFIG_SYSCTL
634 if (slow_work_max_max_threads < nr_cpus * 2)
635 slow_work_max_max_threads = nr_cpus * 2;
636#endif
637 return 0;
638}
639
640subsys_initcall(init_slow_work);
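
To make the new interface concrete, the sketch below shows how a client of the facility would be expected to plug in, using only the calls visible in this file (slow_work_register_user(), slow_work_init(), slow_work_enqueue(), slow_work_unregister_user() and the get_ref/put_ref/execute ops). It is modelled on the file's own slow_work_new_thread item: a statically allocated work item needs no reference counting, so get_ref/put_ref are no-ops. All my_* names are invented for illustration; this is a hedged usage sketch, not code from the patch.

#include <linux/init.h>
#include <linux/module.h>
#include <linux/slow-work.h>

static struct slow_work my_work;

/* the item is static, so there is no object lifetime to pin */
static int my_work_get_ref(struct slow_work *work)
{
	return 0;
}

static void my_work_put_ref(struct slow_work *work)
{
}

/* runs in a kslowd thread; unlike ordinary work items it may sleep for long periods */
static void my_work_execute(struct slow_work *work)
{
	/* ... long-running, sleepable processing goes here ... */
}

static const struct slow_work_ops my_work_ops = {
	.get_ref	= my_work_get_ref,
	.put_ref	= my_work_put_ref,
	.execute	= my_work_execute,
};

static int __init my_module_init(void)
{
	int ret;

	/* make sure the kslowd pool is running before queueing anything */
	ret = slow_work_register_user();
	if (ret < 0)
		return ret;

	slow_work_init(&my_work, &my_work_ops);
	slow_work_enqueue(&my_work);
	return 0;
}

static void __exit my_module_exit(void)
{
	/* drops the last pool user and stops the threads if nobody else needs them */
	slow_work_unregister_user();
}

module_init(my_module_init);
module_exit(my_module_exit);
MODULE_LICENSE("GPL");
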
diff --git a/kernel/smp.c b/kernel/smp.c
index bbedbb7efe32..858baac568ee 100644
--- a/kernel/smp.c
+++ b/kernel/smp.c
@@ -2,40 +2,82 @@
2 * Generic helpers for smp ipi calls 2 * Generic helpers for smp ipi calls
3 * 3 *
4 * (C) Jens Axboe <jens.axboe@oracle.com> 2008 4 * (C) Jens Axboe <jens.axboe@oracle.com> 2008
5 *
6 */ 5 */
7#include <linux/init.h>
8#include <linux/module.h>
9#include <linux/percpu.h>
10#include <linux/rcupdate.h> 6#include <linux/rcupdate.h>
11#include <linux/rculist.h> 7#include <linux/rculist.h>
8#include <linux/kernel.h>
9#include <linux/module.h>
10#include <linux/percpu.h>
11#include <linux/init.h>
12#include <linux/smp.h> 12#include <linux/smp.h>
13#include <linux/cpu.h>
13 14
14static DEFINE_PER_CPU(struct call_single_queue, call_single_queue); 15static DEFINE_PER_CPU(struct call_single_queue, call_single_queue);
15static LIST_HEAD(call_function_queue); 16
16__cacheline_aligned_in_smp DEFINE_SPINLOCK(call_function_lock); 17static struct {
18 struct list_head queue;
19 spinlock_t lock;
20} call_function __cacheline_aligned_in_smp =
21 {
22 .queue = LIST_HEAD_INIT(call_function.queue),
23 .lock = __SPIN_LOCK_UNLOCKED(call_function.lock),
24 };
17 25
18enum { 26enum {
19 CSD_FLAG_WAIT = 0x01, 27 CSD_FLAG_LOCK = 0x01,
20 CSD_FLAG_ALLOC = 0x02,
21 CSD_FLAG_LOCK = 0x04,
22}; 28};
23 29
24struct call_function_data { 30struct call_function_data {
25 struct call_single_data csd; 31 struct call_single_data csd;
26 spinlock_t lock; 32 spinlock_t lock;
27 unsigned int refs; 33 unsigned int refs;
28 struct rcu_head rcu_head; 34 cpumask_var_t cpumask;
29 unsigned long cpumask_bits[];
30}; 35};
31 36
32struct call_single_queue { 37struct call_single_queue {
33 struct list_head list; 38 struct list_head list;
34 spinlock_t lock; 39 spinlock_t lock;
40};
41
42static DEFINE_PER_CPU(struct call_function_data, cfd_data) = {
43 .lock = __SPIN_LOCK_UNLOCKED(cfd_data.lock),
44};
45
46static int
47hotplug_cfd(struct notifier_block *nfb, unsigned long action, void *hcpu)
48{
49 long cpu = (long)hcpu;
50 struct call_function_data *cfd = &per_cpu(cfd_data, cpu);
51
52 switch (action) {
53 case CPU_UP_PREPARE:
54 case CPU_UP_PREPARE_FROZEN:
55 if (!alloc_cpumask_var_node(&cfd->cpumask, GFP_KERNEL,
56 cpu_to_node(cpu)))
57 return NOTIFY_BAD;
58 break;
59
60#ifdef CONFIG_CPU_HOTPLUG
61 case CPU_UP_CANCELED:
62 case CPU_UP_CANCELED_FROZEN:
63
64 case CPU_DEAD:
65 case CPU_DEAD_FROZEN:
66 free_cpumask_var(cfd->cpumask);
67 break;
68#endif
69 };
70
71 return NOTIFY_OK;
72}
73
74static struct notifier_block __cpuinitdata hotplug_cfd_notifier = {
75 .notifier_call = hotplug_cfd,
35}; 76};
36 77
37static int __cpuinit init_call_single_data(void) 78static int __cpuinit init_call_single_data(void)
38{ 79{
80 void *cpu = (void *)(long)smp_processor_id();
39 int i; 81 int i;
40 82
41 for_each_possible_cpu(i) { 83 for_each_possible_cpu(i) {
@@ -44,29 +86,63 @@ static int __cpuinit init_call_single_data(void)
44 spin_lock_init(&q->lock); 86 spin_lock_init(&q->lock);
45 INIT_LIST_HEAD(&q->list); 87 INIT_LIST_HEAD(&q->list);
46 } 88 }
89
90 hotplug_cfd(&hotplug_cfd_notifier, CPU_UP_PREPARE, cpu);
91 register_cpu_notifier(&hotplug_cfd_notifier);
92
47 return 0; 93 return 0;
48} 94}
49early_initcall(init_call_single_data); 95early_initcall(init_call_single_data);
50 96
51static void csd_flag_wait(struct call_single_data *data) 97/*
98 * csd_lock/csd_unlock used to serialize access to per-cpu csd resources
99 *
100 * For non-synchronous ipi calls the csd can still be in use by the
 101 * previous function call. For multi-cpu calls it's even more interesting
102 * as we'll have to ensure no other cpu is observing our csd.
103 */
104static void csd_lock_wait(struct call_single_data *data)
52{ 105{
53 /* Wait for response */ 106 while (data->flags & CSD_FLAG_LOCK)
54 do {
55 if (!(data->flags & CSD_FLAG_WAIT))
56 break;
57 cpu_relax(); 107 cpu_relax();
58 } while (1); 108}
109
110static void csd_lock(struct call_single_data *data)
111{
112 csd_lock_wait(data);
113 data->flags = CSD_FLAG_LOCK;
114
115 /*
116 * prevent CPU from reordering the above assignment
117 * to ->flags with any subsequent assignments to other
118 * fields of the specified call_single_data structure:
119 */
120 smp_mb();
121}
122
123static void csd_unlock(struct call_single_data *data)
124{
125 WARN_ON(!(data->flags & CSD_FLAG_LOCK));
126
127 /*
128 * ensure we're all done before releasing data:
129 */
130 smp_mb();
131
132 data->flags &= ~CSD_FLAG_LOCK;
59} 133}
60 134
61/* 135/*
62 * Insert a previously allocated call_single_data element for execution 136 * Insert a previously allocated call_single_data element
63 * on the given CPU. data must already have ->func, ->info, and ->flags set. 137 * for execution on the given CPU. data must already have
138 * ->func, ->info, and ->flags set.
64 */ 139 */
65static void generic_exec_single(int cpu, struct call_single_data *data) 140static
141void generic_exec_single(int cpu, struct call_single_data *data, int wait)
66{ 142{
67 struct call_single_queue *dst = &per_cpu(call_single_queue, cpu); 143 struct call_single_queue *dst = &per_cpu(call_single_queue, cpu);
68 int wait = data->flags & CSD_FLAG_WAIT, ipi;
69 unsigned long flags; 144 unsigned long flags;
145 int ipi;
70 146
71 spin_lock_irqsave(&dst->lock, flags); 147 spin_lock_irqsave(&dst->lock, flags);
72 ipi = list_empty(&dst->list); 148 ipi = list_empty(&dst->list);
@@ -74,24 +150,21 @@ static void generic_exec_single(int cpu, struct call_single_data *data)
74 spin_unlock_irqrestore(&dst->lock, flags); 150 spin_unlock_irqrestore(&dst->lock, flags);
75 151
76 /* 152 /*
77 * Make the list addition visible before sending the ipi. 153 * The list addition should be visible before sending the IPI
154 * handler locks the list to pull the entry off it because of
155 * normal cache coherency rules implied by spinlocks.
156 *
157 * If IPIs can go out of order to the cache coherency protocol
158 * in an architecture, sufficient synchronisation should be added
159 * to arch code to make it appear to obey cache coherency WRT
160 * locking and barrier primitives. Generic code isn't really
161 * equipped to do the right thing...
78 */ 162 */
79 smp_mb();
80
81 if (ipi) 163 if (ipi)
82 arch_send_call_function_single_ipi(cpu); 164 arch_send_call_function_single_ipi(cpu);
83 165
84 if (wait) 166 if (wait)
85 csd_flag_wait(data); 167 csd_lock_wait(data);
86}
87
88static void rcu_free_call_data(struct rcu_head *head)
89{
90 struct call_function_data *data;
91
92 data = container_of(head, struct call_function_data, rcu_head);
93
94 kfree(data);
95} 168}
96 169
97/* 170/*
@@ -104,99 +177,83 @@ void generic_smp_call_function_interrupt(void)
104 int cpu = get_cpu(); 177 int cpu = get_cpu();
105 178
106 /* 179 /*
107 * It's ok to use list_for_each_rcu() here even though we may delete 180 * Ensure entry is visible on call_function_queue after we have
108 * 'pos', since list_del_rcu() doesn't clear ->next 181 * entered the IPI. See comment in smp_call_function_many.
182 * If we don't have this, then we may miss an entry on the list
183 * and never get another IPI to process it.
184 */
185 smp_mb();
186
187 /*
188 * It's ok to use list_for_each_rcu() here even though we may
189 * delete 'pos', since list_del_rcu() doesn't clear ->next
109 */ 190 */
110 rcu_read_lock(); 191 list_for_each_entry_rcu(data, &call_function.queue, csd.list) {
111 list_for_each_entry_rcu(data, &call_function_queue, csd.list) {
112 int refs; 192 int refs;
113 193
114 if (!cpumask_test_cpu(cpu, to_cpumask(data->cpumask_bits))) 194 spin_lock(&data->lock);
195 if (!cpumask_test_cpu(cpu, data->cpumask)) {
196 spin_unlock(&data->lock);
115 continue; 197 continue;
198 }
199 cpumask_clear_cpu(cpu, data->cpumask);
200 spin_unlock(&data->lock);
116 201
117 data->csd.func(data->csd.info); 202 data->csd.func(data->csd.info);
118 203
119 spin_lock(&data->lock); 204 spin_lock(&data->lock);
120 cpumask_clear_cpu(cpu, to_cpumask(data->cpumask_bits));
121 WARN_ON(data->refs == 0); 205 WARN_ON(data->refs == 0);
122 data->refs--; 206 refs = --data->refs;
123 refs = data->refs; 207 if (!refs) {
208 spin_lock(&call_function.lock);
209 list_del_rcu(&data->csd.list);
210 spin_unlock(&call_function.lock);
211 }
124 spin_unlock(&data->lock); 212 spin_unlock(&data->lock);
125 213
126 if (refs) 214 if (refs)
127 continue; 215 continue;
128 216
129 spin_lock(&call_function_lock); 217 csd_unlock(&data->csd);
130 list_del_rcu(&data->csd.list);
131 spin_unlock(&call_function_lock);
132
133 if (data->csd.flags & CSD_FLAG_WAIT) {
134 /*
135 * serialize stores to data with the flag clear
136 * and wakeup
137 */
138 smp_wmb();
139 data->csd.flags &= ~CSD_FLAG_WAIT;
140 }
141 if (data->csd.flags & CSD_FLAG_ALLOC)
142 call_rcu(&data->rcu_head, rcu_free_call_data);
143 } 218 }
144 rcu_read_unlock();
145 219
146 put_cpu(); 220 put_cpu();
147} 221}
148 222
149/* 223/*
150 * Invoked by arch to handle an IPI for call function single. Must be called 224 * Invoked by arch to handle an IPI for call function single. Must be
151 * from the arch with interrupts disabled. 225 * called from the arch with interrupts disabled.
152 */ 226 */
153void generic_smp_call_function_single_interrupt(void) 227void generic_smp_call_function_single_interrupt(void)
154{ 228{
155 struct call_single_queue *q = &__get_cpu_var(call_single_queue); 229 struct call_single_queue *q = &__get_cpu_var(call_single_queue);
230 unsigned int data_flags;
156 LIST_HEAD(list); 231 LIST_HEAD(list);
157 232
158 /* 233 spin_lock(&q->lock);
159 * Need to see other stores to list head for checking whether 234 list_replace_init(&q->list, &list);
160 * list is empty without holding q->lock 235 spin_unlock(&q->lock);
161 */ 236
162 smp_read_barrier_depends(); 237 while (!list_empty(&list)) {
163 while (!list_empty(&q->list)) { 238 struct call_single_data *data;
164 unsigned int data_flags; 239
165 240 data = list_entry(list.next, struct call_single_data, list);
166 spin_lock(&q->lock); 241 list_del(&data->list);
167 list_replace_init(&q->list, &list); 242
168 spin_unlock(&q->lock); 243 /*
169 244 * 'data' can be invalid after this call if flags == 0
170 while (!list_empty(&list)) { 245 * (when called through generic_exec_single()),
171 struct call_single_data *data; 246 * so save them away before making the call:
172 247 */
173 data = list_entry(list.next, struct call_single_data, 248 data_flags = data->flags;
174 list); 249
175 list_del(&data->list); 250 data->func(data->info);
176 251
177 /*
178 * 'data' can be invalid after this call if
179 * flags == 0 (when called through
180 * generic_exec_single(), so save them away before
181 * making the call.
182 */
183 data_flags = data->flags;
184
185 data->func(data->info);
186
187 if (data_flags & CSD_FLAG_WAIT) {
188 smp_wmb();
189 data->flags &= ~CSD_FLAG_WAIT;
190 } else if (data_flags & CSD_FLAG_LOCK) {
191 smp_wmb();
192 data->flags &= ~CSD_FLAG_LOCK;
193 } else if (data_flags & CSD_FLAG_ALLOC)
194 kfree(data);
195 }
196 /* 252 /*
197 * See comment on outer loop 253 * Unlocked CSDs are valid through generic_exec_single():
198 */ 254 */
199 smp_read_barrier_depends(); 255 if (data_flags & CSD_FLAG_LOCK)
256 csd_unlock(data);
200 } 257 }
201} 258}
202 259
@@ -215,65 +272,45 @@ static DEFINE_PER_CPU(struct call_single_data, csd_data);
215int smp_call_function_single(int cpu, void (*func) (void *info), void *info, 272int smp_call_function_single(int cpu, void (*func) (void *info), void *info,
216 int wait) 273 int wait)
217{ 274{
218 struct call_single_data d; 275 struct call_single_data d = {
276 .flags = 0,
277 };
219 unsigned long flags; 278 unsigned long flags;
220 /* prevent preemption and reschedule on another processor, 279 int this_cpu;
221 as well as CPU removal */
222 int me = get_cpu();
223 int err = 0; 280 int err = 0;
224 281
282 /*
283 * prevent preemption and reschedule on another processor,
284 * as well as CPU removal
285 */
286 this_cpu = get_cpu();
287
225 /* Can deadlock when called with interrupts disabled */ 288 /* Can deadlock when called with interrupts disabled */
226 WARN_ON(irqs_disabled()); 289 WARN_ON_ONCE(irqs_disabled() && !oops_in_progress);
227 290
228 if (cpu == me) { 291 if (cpu == this_cpu) {
229 local_irq_save(flags); 292 local_irq_save(flags);
230 func(info); 293 func(info);
231 local_irq_restore(flags); 294 local_irq_restore(flags);
232 } else if ((unsigned)cpu < nr_cpu_ids && cpu_online(cpu)) { 295 } else {
233 struct call_single_data *data; 296 if ((unsigned)cpu < nr_cpu_ids && cpu_online(cpu)) {
297 struct call_single_data *data = &d;
298
299 if (!wait)
300 data = &__get_cpu_var(csd_data);
234 301
235 if (!wait) { 302 csd_lock(data);
236 /* 303
237 * We are calling a function on a single CPU 304 data->func = func;
238 * and we are not going to wait for it to finish. 305 data->info = info;
239 * We first try to allocate the data, but if we 306 generic_exec_single(cpu, data, wait);
240 * fail, we fall back to use a per cpu data to pass
241 * the information to that CPU. Since all callers
242 * of this code will use the same data, we must
243 * synchronize the callers to prevent a new caller
244 * from corrupting the data before the callee
245 * can access it.
246 *
247 * The CSD_FLAG_LOCK is used to let us know when
248 * the IPI handler is done with the data.
249 * The first caller will set it, and the callee
250 * will clear it. The next caller must wait for
251 * it to clear before we set it again. This
252 * will make sure the callee is done with the
253 * data before a new caller will use it.
254 */
255 data = kmalloc(sizeof(*data), GFP_ATOMIC);
256 if (data)
257 data->flags = CSD_FLAG_ALLOC;
258 else {
259 data = &per_cpu(csd_data, me);
260 while (data->flags & CSD_FLAG_LOCK)
261 cpu_relax();
262 data->flags = CSD_FLAG_LOCK;
263 }
264 } else { 307 } else {
265 data = &d; 308 err = -ENXIO; /* CPU not online */
266 data->flags = CSD_FLAG_WAIT;
267 } 309 }
268
269 data->func = func;
270 data->info = info;
271 generic_exec_single(cpu, data);
272 } else {
273 err = -ENXIO; /* CPU not online */
274 } 310 }
275 311
276 put_cpu(); 312 put_cpu();
313
277 return err; 314 return err;
278} 315}
279EXPORT_SYMBOL(smp_call_function_single); 316EXPORT_SYMBOL(smp_call_function_single);
@@ -283,23 +320,26 @@ EXPORT_SYMBOL(smp_call_function_single);
283 * @cpu: The CPU to run on. 320 * @cpu: The CPU to run on.
284 * @data: Pre-allocated and setup data structure 321 * @data: Pre-allocated and setup data structure
285 * 322 *
286 * Like smp_call_function_single(), but allow caller to pass in a pre-allocated 323 * Like smp_call_function_single(), but allow caller to pass in a
287 * data structure. Useful for embedding @data inside other structures, for 324 * pre-allocated data structure. Useful for embedding @data inside
288 * instance. 325 * other structures, for instance.
289 *
290 */ 326 */
291void __smp_call_function_single(int cpu, struct call_single_data *data) 327void __smp_call_function_single(int cpu, struct call_single_data *data,
328 int wait)
292{ 329{
330 csd_lock(data);
331
293 /* Can deadlock when called with interrupts disabled */ 332 /* Can deadlock when called with interrupts disabled */
294 WARN_ON((data->flags & CSD_FLAG_WAIT) && irqs_disabled()); 333 WARN_ON_ONCE(wait && irqs_disabled() && !oops_in_progress);
295 334
296 generic_exec_single(cpu, data); 335 generic_exec_single(cpu, data, wait);
297} 336}
298 337
299/* FIXME: Shim for archs using old arch_send_call_function_ipi API. */ 338/* Deprecated: shim for archs using old arch_send_call_function_ipi API. */
339
300#ifndef arch_send_call_function_ipi_mask 340#ifndef arch_send_call_function_ipi_mask
301#define arch_send_call_function_ipi_mask(maskp) \ 341# define arch_send_call_function_ipi_mask(maskp) \
302 arch_send_call_function_ipi(*(maskp)) 342 arch_send_call_function_ipi(*(maskp))
303#endif 343#endif
304 344
305/** 345/**
@@ -307,7 +347,8 @@ void __smp_call_function_single(int cpu, struct call_single_data *data)
307 * @mask: The set of cpus to run on (only runs on online subset). 347 * @mask: The set of cpus to run on (only runs on online subset).
308 * @func: The function to run. This must be fast and non-blocking. 348 * @func: The function to run. This must be fast and non-blocking.
309 * @info: An arbitrary pointer to pass to the function. 349 * @info: An arbitrary pointer to pass to the function.
310 * @wait: If true, wait (atomically) until function has completed on other CPUs. 350 * @wait: If true, wait (atomically) until function has completed
351 * on other CPUs.
311 * 352 *
312 * If @wait is true, then returns once @func has returned. Note that @wait 353 * If @wait is true, then returns once @func has returned. Note that @wait
313 * will be implicitly turned on in case of allocation failures, since 354 * will be implicitly turned on in case of allocation failures, since
@@ -318,27 +359,27 @@ void __smp_call_function_single(int cpu, struct call_single_data *data)
318 * must be disabled when calling this function. 359 * must be disabled when calling this function.
319 */ 360 */
320void smp_call_function_many(const struct cpumask *mask, 361void smp_call_function_many(const struct cpumask *mask,
321 void (*func)(void *), void *info, 362 void (*func)(void *), void *info, bool wait)
322 bool wait)
323{ 363{
324 struct call_function_data *data; 364 struct call_function_data *data;
325 unsigned long flags; 365 unsigned long flags;
326 int cpu, next_cpu; 366 int cpu, next_cpu, this_cpu = smp_processor_id();
327 367
328 /* Can deadlock when called with interrupts disabled */ 368 /* Can deadlock when called with interrupts disabled */
329 WARN_ON(irqs_disabled()); 369 WARN_ON_ONCE(irqs_disabled() && !oops_in_progress);
330 370
331 /* So, what's a CPU they want? Ignoring this one. */ 371 /* So, what's a CPU they want? Ignoring this one. */
332 cpu = cpumask_first_and(mask, cpu_online_mask); 372 cpu = cpumask_first_and(mask, cpu_online_mask);
333 if (cpu == smp_processor_id()) 373 if (cpu == this_cpu)
334 cpu = cpumask_next_and(cpu, mask, cpu_online_mask); 374 cpu = cpumask_next_and(cpu, mask, cpu_online_mask);
375
335 /* No online cpus? We're done. */ 376 /* No online cpus? We're done. */
336 if (cpu >= nr_cpu_ids) 377 if (cpu >= nr_cpu_ids)
337 return; 378 return;
338 379
339 /* Do we have another CPU which isn't us? */ 380 /* Do we have another CPU which isn't us? */
340 next_cpu = cpumask_next_and(cpu, mask, cpu_online_mask); 381 next_cpu = cpumask_next_and(cpu, mask, cpu_online_mask);
341 if (next_cpu == smp_processor_id()) 382 if (next_cpu == this_cpu)
342 next_cpu = cpumask_next_and(next_cpu, mask, cpu_online_mask); 383 next_cpu = cpumask_next_and(next_cpu, mask, cpu_online_mask);
343 384
344 /* Fastpath: do that cpu by itself. */ 385 /* Fastpath: do that cpu by itself. */
@@ -347,43 +388,40 @@ void smp_call_function_many(const struct cpumask *mask,
347 return; 388 return;
348 } 389 }
349 390
350 data = kmalloc(sizeof(*data) + cpumask_size(), GFP_ATOMIC); 391 data = &__get_cpu_var(cfd_data);
351 if (unlikely(!data)) { 392 csd_lock(&data->csd);
352 /* Slow path. */
353 for_each_online_cpu(cpu) {
354 if (cpu == smp_processor_id())
355 continue;
356 if (cpumask_test_cpu(cpu, mask))
357 smp_call_function_single(cpu, func, info, wait);
358 }
359 return;
360 }
361 393
362 spin_lock_init(&data->lock); 394 spin_lock_irqsave(&data->lock, flags);
363 data->csd.flags = CSD_FLAG_ALLOC;
364 if (wait)
365 data->csd.flags |= CSD_FLAG_WAIT;
366 data->csd.func = func; 395 data->csd.func = func;
367 data->csd.info = info; 396 data->csd.info = info;
368 cpumask_and(to_cpumask(data->cpumask_bits), mask, cpu_online_mask); 397 cpumask_and(data->cpumask, mask, cpu_online_mask);
369 cpumask_clear_cpu(smp_processor_id(), to_cpumask(data->cpumask_bits)); 398 cpumask_clear_cpu(this_cpu, data->cpumask);
370 data->refs = cpumask_weight(to_cpumask(data->cpumask_bits)); 399 data->refs = cpumask_weight(data->cpumask);
371 400
372 spin_lock_irqsave(&call_function_lock, flags); 401 spin_lock(&call_function.lock);
373 list_add_tail_rcu(&data->csd.list, &call_function_queue); 402 /*
374 spin_unlock_irqrestore(&call_function_lock, flags); 403 * Place entry at the _HEAD_ of the list, so that any cpu still
404 * observing the entry in generic_smp_call_function_interrupt()
405 * will not miss any other list entries:
406 */
407 list_add_rcu(&data->csd.list, &call_function.queue);
408 spin_unlock(&call_function.lock);
409
410 spin_unlock_irqrestore(&data->lock, flags);
375 411
376 /* 412 /*
377 * Make the list addition visible before sending the ipi. 413 * Make the list addition visible before sending the ipi.
414 * (IPIs must obey or appear to obey normal Linux cache
415 * coherency rules -- see comment in generic_exec_single).
378 */ 416 */
379 smp_mb(); 417 smp_mb();
380 418
381 /* Send a message to all CPUs in the map */ 419 /* Send a message to all CPUs in the map */
382 arch_send_call_function_ipi_mask(to_cpumask(data->cpumask_bits)); 420 arch_send_call_function_ipi_mask(data->cpumask);
383 421
384 /* optionally wait for the CPUs to complete */ 422 /* Optionally wait for the CPUs to complete */
385 if (wait) 423 if (wait)
386 csd_flag_wait(&data->csd); 424 csd_lock_wait(&data->csd);
387} 425}
388EXPORT_SYMBOL(smp_call_function_many); 426EXPORT_SYMBOL(smp_call_function_many);
389 427
@@ -391,7 +429,8 @@ EXPORT_SYMBOL(smp_call_function_many);
391 * smp_call_function(): Run a function on all other CPUs. 429 * smp_call_function(): Run a function on all other CPUs.
392 * @func: The function to run. This must be fast and non-blocking. 430 * @func: The function to run. This must be fast and non-blocking.
393 * @info: An arbitrary pointer to pass to the function. 431 * @info: An arbitrary pointer to pass to the function.
394 * @wait: If true, wait (atomically) until function has completed on other CPUs. 432 * @wait: If true, wait (atomically) until function has completed
433 * on other CPUs.
395 * 434 *
396 * Returns 0. 435 * Returns 0.
397 * 436 *
@@ -407,26 +446,27 @@ int smp_call_function(void (*func)(void *), void *info, int wait)
407 preempt_disable(); 446 preempt_disable();
408 smp_call_function_many(cpu_online_mask, func, info, wait); 447 smp_call_function_many(cpu_online_mask, func, info, wait);
409 preempt_enable(); 448 preempt_enable();
449
410 return 0; 450 return 0;
411} 451}
412EXPORT_SYMBOL(smp_call_function); 452EXPORT_SYMBOL(smp_call_function);
413 453
414void ipi_call_lock(void) 454void ipi_call_lock(void)
415{ 455{
416 spin_lock(&call_function_lock); 456 spin_lock(&call_function.lock);
417} 457}
418 458
419void ipi_call_unlock(void) 459void ipi_call_unlock(void)
420{ 460{
421 spin_unlock(&call_function_lock); 461 spin_unlock(&call_function.lock);
422} 462}
423 463
424void ipi_call_lock_irq(void) 464void ipi_call_lock_irq(void)
425{ 465{
426 spin_lock_irq(&call_function_lock); 466 spin_lock_irq(&call_function.lock);
427} 467}
428 468
429void ipi_call_unlock_irq(void) 469void ipi_call_unlock_irq(void)
430{ 470{
431 spin_unlock_irq(&call_function_lock); 471 spin_unlock_irq(&call_function.lock);
432} 472}
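
For callers, the signatures reworked above keep the same shape; a hypothetical user looks like the sketch below (bump_my_counter, my_counter and poke_other_cpus are invented names). The constraints carried over from the comments in the diff still apply: the function runs in interrupt context on the target CPU, so it must be fast and non-blocking, and smp_call_function_many() is called here with preemption disabled, mirroring what smp_call_function() itself does.

#include <linux/smp.h>
#include <linux/percpu.h>
#include <linux/preempt.h>
#include <linux/kernel.h>

static DEFINE_PER_CPU(unsigned long, my_counter);

/* runs in interrupt context on the target CPU: keep it short and non-blocking */
static void bump_my_counter(void *info)
{
	__get_cpu_var(my_counter) += (unsigned long)info;
}

static void poke_other_cpus(int cpu)
{
	/* run on one specific CPU and wait for it to finish */
	if (smp_call_function_single(cpu, bump_my_counter, (void *)1UL, 1))
		pr_debug("cpu %d is not online\n", cpu);

	/* run on all other online CPUs without waiting for completion */
	preempt_disable();
	smp_call_function_many(cpu_online_mask, bump_my_counter, (void *)2UL, false);
	preempt_enable();
}
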
diff --git a/kernel/softirq.c b/kernel/softirq.c
index 57d3f67f6f38..ea23ec087ee9 100644
--- a/kernel/softirq.c
+++ b/kernel/softirq.c
@@ -180,7 +180,7 @@ asmlinkage void __do_softirq(void)
180 account_system_vtime(current); 180 account_system_vtime(current);
181 181
182 __local_bh_disable((unsigned long)__builtin_return_address(0)); 182 __local_bh_disable((unsigned long)__builtin_return_address(0));
183 trace_softirq_enter(); 183 lockdep_softirq_enter();
184 184
185 cpu = smp_processor_id(); 185 cpu = smp_processor_id();
186restart: 186restart:
@@ -220,7 +220,7 @@ restart:
220 if (pending) 220 if (pending)
221 wakeup_softirqd(); 221 wakeup_softirqd();
222 222
223 trace_softirq_exit(); 223 lockdep_softirq_exit();
224 224
225 account_system_vtime(current); 225 account_system_vtime(current);
226 _local_bh_enable(); 226 _local_bh_enable();
@@ -496,7 +496,7 @@ static int __try_remote_softirq(struct call_single_data *cp, int cpu, int softir
496 cp->flags = 0; 496 cp->flags = 0;
497 cp->priv = softirq; 497 cp->priv = softirq;
498 498
499 __smp_call_function_single(cpu, cp); 499 __smp_call_function_single(cpu, cp, 0);
500 return 0; 500 return 0;
501 } 501 }
502 return 1; 502 return 1;
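
The softirq hunk above also shows the calling convention __smp_call_function_single() now expects: the caller embeds a call_single_data in its own structure, fills in ->func/->info and clears ->flags, and passes wait explicitly rather than encoding it in the flags. A hedged sketch of that embedding pattern follows (my_request, handle_my_request and send_my_request are invented names).

#include <linux/smp.h>
#include <linux/kernel.h>

struct my_request {
	struct call_single_data	csd;	/* must stay valid until the handler has run */
	int			value;
};

/* executed on the target CPU in interrupt context */
static void handle_my_request(void *info)
{
	struct my_request *req = info;

	pr_debug("handling value %d on cpu %d\n", req->value, smp_processor_id());
}

static void send_my_request(struct my_request *req, int cpu, int value)
{
	req->value = value;
	req->csd.flags = 0;
	req->csd.func = handle_my_request;
	req->csd.info = req;

	/*
	 * wait == 0: returns at once; a later call reusing the same csd
	 * will spin in csd_lock() until the handler has finished with it.
	 */
	__smp_call_function_single(cpu, &req->csd, 0);
}
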
diff --git a/kernel/spinlock.c b/kernel/spinlock.c
index 29ab20749dd3..7932653c4ebd 100644
--- a/kernel/spinlock.c
+++ b/kernel/spinlock.c
@@ -121,7 +121,8 @@ unsigned long __lockfunc _read_lock_irqsave(rwlock_t *lock)
121 local_irq_save(flags); 121 local_irq_save(flags);
122 preempt_disable(); 122 preempt_disable();
123 rwlock_acquire_read(&lock->dep_map, 0, 0, _RET_IP_); 123 rwlock_acquire_read(&lock->dep_map, 0, 0, _RET_IP_);
124 LOCK_CONTENDED(lock, _raw_read_trylock, _raw_read_lock); 124 LOCK_CONTENDED_FLAGS(lock, _raw_read_trylock, _raw_read_lock,
125 _raw_read_lock_flags, &flags);
125 return flags; 126 return flags;
126} 127}
127EXPORT_SYMBOL(_read_lock_irqsave); 128EXPORT_SYMBOL(_read_lock_irqsave);
@@ -151,7 +152,8 @@ unsigned long __lockfunc _write_lock_irqsave(rwlock_t *lock)
151 local_irq_save(flags); 152 local_irq_save(flags);
152 preempt_disable(); 153 preempt_disable();
153 rwlock_acquire(&lock->dep_map, 0, 0, _RET_IP_); 154 rwlock_acquire(&lock->dep_map, 0, 0, _RET_IP_);
154 LOCK_CONTENDED(lock, _raw_write_trylock, _raw_write_lock); 155 LOCK_CONTENDED_FLAGS(lock, _raw_write_trylock, _raw_write_lock,
156 _raw_write_lock_flags, &flags);
155 return flags; 157 return flags;
156} 158}
157EXPORT_SYMBOL(_write_lock_irqsave); 159EXPORT_SYMBOL(_write_lock_irqsave);
@@ -299,16 +301,8 @@ unsigned long __lockfunc _spin_lock_irqsave_nested(spinlock_t *lock, int subclas
299 local_irq_save(flags); 301 local_irq_save(flags);
300 preempt_disable(); 302 preempt_disable();
301 spin_acquire(&lock->dep_map, subclass, 0, _RET_IP_); 303 spin_acquire(&lock->dep_map, subclass, 0, _RET_IP_);
302 /* 304 LOCK_CONTENDED_FLAGS(lock, _raw_spin_trylock, _raw_spin_lock,
303 * On lockdep we dont want the hand-coded irq-enable of 305 _raw_spin_lock_flags, &flags);
304 * _raw_spin_lock_flags() code, because lockdep assumes
305 * that interrupts are not re-enabled during lock-acquire:
306 */
307#ifdef CONFIG_LOCKDEP
308 LOCK_CONTENDED(lock, _raw_spin_trylock, _raw_spin_lock);
309#else
310 _raw_spin_lock_flags(lock, &flags);
311#endif
312 return flags; 306 return flags;
313} 307}
314EXPORT_SYMBOL(_spin_lock_irqsave_nested); 308EXPORT_SYMBOL(_spin_lock_irqsave_nested);
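
All three spinlock.c hunks replace the open-coded '#ifdef CONFIG_LOCKDEP' sequence with LOCK_CONTENDED_FLAGS(). Judging only from the code being deleted here, the wrapper is expected to behave roughly as sketched below: with lockdep enabled, acquire via the trylock/lock pair so interrupts are not re-enabled during the acquire; otherwise call the flags-aware primitive. This is an illustrative approximation of the macro's intent, not its actual definition in lockdep.h.

/* Illustrative approximation only -- condenses the #ifdef removed above */
#ifdef CONFIG_LOCKDEP
# define LOCK_CONTENDED_FLAGS(_lock, _try, _lockfn, _lockfl, _flags) \
	LOCK_CONTENDED(_lock, _try, _lockfn)
#else
# define LOCK_CONTENDED_FLAGS(_lock, _try, _lockfn, _lockfl, _flags) \
	_lockfl(_lock, _flags)
#endif
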
diff --git a/kernel/sys.c b/kernel/sys.c
index 37f458e6882a..51dbb55604e8 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -34,6 +34,7 @@
34#include <linux/seccomp.h> 34#include <linux/seccomp.h>
35#include <linux/cpu.h> 35#include <linux/cpu.h>
36#include <linux/ptrace.h> 36#include <linux/ptrace.h>
37#include <linux/fs_struct.h>
37 38
38#include <linux/compat.h> 39#include <linux/compat.h>
39#include <linux/syscalls.h> 40#include <linux/syscalls.h>
@@ -1013,10 +1014,8 @@ SYSCALL_DEFINE2(setpgid, pid_t, pid, pid_t, pgid)
1013 if (err) 1014 if (err)
1014 goto out; 1015 goto out;
1015 1016
1016 if (task_pgrp(p) != pgrp) { 1017 if (task_pgrp(p) != pgrp)
1017 change_pid(p, PIDTYPE_PGID, pgrp); 1018 change_pid(p, PIDTYPE_PGID, pgrp);
1018 set_task_pgrp(p, pid_nr(pgrp));
1019 }
1020 1019
1021 err = 0; 1020 err = 0;
1022out: 1021out:
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index c5ef44ff850f..82350f8f04f6 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -48,6 +48,7 @@
48#include <linux/acpi.h> 48#include <linux/acpi.h>
49#include <linux/reboot.h> 49#include <linux/reboot.h>
50#include <linux/ftrace.h> 50#include <linux/ftrace.h>
51#include <linux/slow-work.h>
51 52
52#include <asm/uaccess.h> 53#include <asm/uaccess.h>
53#include <asm/processor.h> 54#include <asm/processor.h>
@@ -95,12 +96,9 @@ static int sixty = 60;
95static int neg_one = -1; 96static int neg_one = -1;
96#endif 97#endif
97 98
98#if defined(CONFIG_MMU) && defined(CONFIG_FILE_LOCKING)
99static int two = 2;
100#endif
101
102static int zero; 99static int zero;
103static int one = 1; 100static int one = 1;
101static int two = 2;
104static unsigned long one_ul = 1; 102static unsigned long one_ul = 1;
105static int one_hundred = 100; 103static int one_hundred = 100;
106 104
@@ -900,6 +898,14 @@ static struct ctl_table kern_table[] = {
900 .proc_handler = &scan_unevictable_handler, 898 .proc_handler = &scan_unevictable_handler,
901 }, 899 },
902#endif 900#endif
901#ifdef CONFIG_SLOW_WORK
902 {
903 .ctl_name = CTL_UNNUMBERED,
904 .procname = "slow-work",
905 .mode = 0555,
906 .child = slow_work_sysctls,
907 },
908#endif
903/* 909/*
904 * NOTE: do not add new entries to this table unless you have read 910 * NOTE: do not add new entries to this table unless you have read
905 * Documentation/sysctl/ctl_unnumbered.txt 911 * Documentation/sysctl/ctl_unnumbered.txt
@@ -1010,7 +1016,7 @@ static struct ctl_table vm_table[] = {
1010 .data = &dirty_expire_interval, 1016 .data = &dirty_expire_interval,
1011 .maxlen = sizeof(dirty_expire_interval), 1017 .maxlen = sizeof(dirty_expire_interval),
1012 .mode = 0644, 1018 .mode = 0644,
1013 .proc_handler = &proc_dointvec_userhz_jiffies, 1019 .proc_handler = &proc_dointvec,
1014 }, 1020 },
1015 { 1021 {
1016 .ctl_name = VM_NR_PDFLUSH_THREADS, 1022 .ctl_name = VM_NR_PDFLUSH_THREADS,
@@ -1373,10 +1379,7 @@ static struct ctl_table fs_table[] = {
1373 .data = &lease_break_time, 1379 .data = &lease_break_time,
1374 .maxlen = sizeof(int), 1380 .maxlen = sizeof(int),
1375 .mode = 0644, 1381 .mode = 0644,
1376 .proc_handler = &proc_dointvec_minmax, 1382 .proc_handler = &proc_dointvec,
1377 .strategy = &sysctl_intvec,
1378 .extra1 = &zero,
1379 .extra2 = &two,
1380 }, 1383 },
1381#endif 1384#endif
1382#ifdef CONFIG_AIO 1385#ifdef CONFIG_AIO
@@ -1417,7 +1420,10 @@ static struct ctl_table fs_table[] = {
1417 .data = &suid_dumpable, 1420 .data = &suid_dumpable,
1418 .maxlen = sizeof(int), 1421 .maxlen = sizeof(int),
1419 .mode = 0644, 1422 .mode = 0644,
1420 .proc_handler = &proc_dointvec, 1423 .proc_handler = &proc_dointvec_minmax,
1424 .strategy = &sysctl_intvec,
1425 .extra1 = &zero,
1426 .extra2 = &two,
1421 }, 1427 },
1422#if defined(CONFIG_BINFMT_MISC) || defined(CONFIG_BINFMT_MISC_MODULE) 1428#if defined(CONFIG_BINFMT_MISC) || defined(CONFIG_BINFMT_MISC_MODULE)
1423 { 1429 {
diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig
index 34e707e5ab87..504086ab4443 100644
--- a/kernel/trace/Kconfig
+++ b/kernel/trace/Kconfig
@@ -72,11 +72,10 @@ config FUNCTION_GRAPH_TRACER
72 help 72 help
73 Enable the kernel to trace a function at both its return 73 Enable the kernel to trace a function at both its return
74 and its entry. 74 and its entry.
75 It's first purpose is to trace the duration of functions and 75 Its first purpose is to trace the duration of functions and
76 draw a call graph for each thread with some informations like 76 draw a call graph for each thread with some information like
77 the return value. 77 the return value. This is done by setting the current return
78 This is done by setting the current return address on the current 78 address on the current task structure into a stack of calls.
79 task structure into a stack of calls.
80 79
81config IRQSOFF_TRACER 80config IRQSOFF_TRACER
82 bool "Interrupts-off Latency Tracer" 81 bool "Interrupts-off Latency Tracer"
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index fdf913dfc7e8..53e8c8bc0c98 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -1908,7 +1908,7 @@ int register_ftrace_function(struct ftrace_ops *ops)
1908} 1908}
1909 1909
1910/** 1910/**
1911 * unregister_ftrace_function - unresgister a function for profiling. 1911 * unregister_ftrace_function - unregister a function for profiling.
1912 * @ops - ops structure that holds the function to unregister 1912 * @ops - ops structure that holds the function to unregister
1913 * 1913 *
1914 * Unregister a function that was added to be called by ftrace profiling. 1914 * Unregister a function that was added to be called by ftrace profiling.
diff --git a/kernel/utsname_sysctl.c b/kernel/utsname_sysctl.c
index 3b34b3545936..92359cc747a7 100644
--- a/kernel/utsname_sysctl.c
+++ b/kernel/utsname_sysctl.c
@@ -37,7 +37,7 @@ static void put_uts(ctl_table *table, int write, void *which)
37 up_write(&uts_sem); 37 up_write(&uts_sem);
38} 38}
39 39
40#ifdef CONFIG_PROC_FS 40#ifdef CONFIG_PROC_SYSCTL
41/* 41/*
42 * Special case of dostring for the UTS structure. This has locks 42 * Special case of dostring for the UTS structure. This has locks
43 * to observe. Should this be in kernel/sys.c ???? 43 * to observe. Should this be in kernel/sys.c ????
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 9aedd9fd825b..32f8e0d2bf5a 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -48,8 +48,6 @@ struct cpu_workqueue_struct {
48 48
49 struct workqueue_struct *wq; 49 struct workqueue_struct *wq;
50 struct task_struct *thread; 50 struct task_struct *thread;
51
52 int run_depth; /* Detect run_workqueue() recursion depth */
53} ____cacheline_aligned; 51} ____cacheline_aligned;
54 52
55/* 53/*
@@ -262,13 +260,6 @@ EXPORT_SYMBOL_GPL(queue_delayed_work_on);
262static void run_workqueue(struct cpu_workqueue_struct *cwq) 260static void run_workqueue(struct cpu_workqueue_struct *cwq)
263{ 261{
264 spin_lock_irq(&cwq->lock); 262 spin_lock_irq(&cwq->lock);
265 cwq->run_depth++;
266 if (cwq->run_depth > 3) {
267 /* morton gets to eat his hat */
268 printk("%s: recursion depth exceeded: %d\n",
269 __func__, cwq->run_depth);
270 dump_stack();
271 }
272 while (!list_empty(&cwq->worklist)) { 263 while (!list_empty(&cwq->worklist)) {
273 struct work_struct *work = list_entry(cwq->worklist.next, 264 struct work_struct *work = list_entry(cwq->worklist.next,
274 struct work_struct, entry); 265 struct work_struct, entry);
@@ -311,7 +302,6 @@ static void run_workqueue(struct cpu_workqueue_struct *cwq)
311 spin_lock_irq(&cwq->lock); 302 spin_lock_irq(&cwq->lock);
312 cwq->current_work = NULL; 303 cwq->current_work = NULL;
313 } 304 }
314 cwq->run_depth--;
315 spin_unlock_irq(&cwq->lock); 305 spin_unlock_irq(&cwq->lock);
316} 306}
317 307
@@ -368,29 +358,20 @@ static void insert_wq_barrier(struct cpu_workqueue_struct *cwq,
368 358
369static int flush_cpu_workqueue(struct cpu_workqueue_struct *cwq) 359static int flush_cpu_workqueue(struct cpu_workqueue_struct *cwq)
370{ 360{
371 int active; 361 int active = 0;
362 struct wq_barrier barr;
372 363
373 if (cwq->thread == current) { 364 WARN_ON(cwq->thread == current);
374 /*
375 * Probably keventd trying to flush its own queue. So simply run
376 * it by hand rather than deadlocking.
377 */
378 run_workqueue(cwq);
379 active = 1;
380 } else {
381 struct wq_barrier barr;
382 365
383 active = 0; 366 spin_lock_irq(&cwq->lock);
384 spin_lock_irq(&cwq->lock); 367 if (!list_empty(&cwq->worklist) || cwq->current_work != NULL) {
385 if (!list_empty(&cwq->worklist) || cwq->current_work != NULL) { 368 insert_wq_barrier(cwq, &barr, &cwq->worklist);
386 insert_wq_barrier(cwq, &barr, &cwq->worklist); 369 active = 1;
387 active = 1;
388 }
389 spin_unlock_irq(&cwq->lock);
390
391 if (active)
392 wait_for_completion(&barr.done);
393 } 370 }
371 spin_unlock_irq(&cwq->lock);
372
373 if (active)
374 wait_for_completion(&barr.done);
394 375
395 return active; 376 return active;
396} 377}
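
Finally, the workqueue.c hunks remove the run_depth recursion escape hatch: flushing a workqueue from one of its own work items used to be quietly run inline, and is now flagged by the WARN_ON(cwq->thread == current) in flush_cpu_workqueue() (and would then block waiting on itself). The hypothetical sketch below (my_wq, my_work_fn, kick_and_wait are invented names) notes the pattern the warning is meant to catch and shows ordinary, correct usage.

#include <linux/init.h>
#include <linux/module.h>
#include <linux/workqueue.h>
#include <linux/errno.h>

static struct workqueue_struct *my_wq;

static void my_work_fn(struct work_struct *work)
{
	/*
	 * Don't flush my_wq from here: doing so used to be "handled" by
	 * run_workqueue() recursion, but now trips the WARN_ON() in
	 * flush_cpu_workqueue() and then blocks waiting on itself.
	 */
}

static DECLARE_WORK(my_work, my_work_fn);

/* correct usage: queue from ordinary process context, then flush from there */
static void kick_and_wait(void)
{
	queue_work(my_wq, &my_work);
	flush_workqueue(my_wq);
}

static int __init my_init(void)
{
	my_wq = create_workqueue("my_wq");
	if (!my_wq)
		return -ENOMEM;
	kick_and_wait();
	return 0;
}

static void __exit my_exit(void)
{
	destroy_workqueue(my_wq);
}

module_init(my_init);
module_exit(my_exit);
MODULE_LICENSE("GPL");
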