aboutsummaryrefslogtreecommitdiffstats
path: root/kernel
diff options
context:
space:
mode:
Diffstat (limited to 'kernel')
-rw-r--r--kernel/Makefile1
-rw-r--r--kernel/auditsc.c1
-rw-r--r--kernel/cgroup.c430
-rw-r--r--kernel/cgroup_debug.c2
-rw-r--r--kernel/cpu.c6
-rw-r--r--kernel/cpuset.c254
-rw-r--r--kernel/exec_domain.c23
-rw-r--r--kernel/exit.c245
-rw-r--r--kernel/fork.c72
-rw-r--r--kernel/irq/Makefile1
-rw-r--r--kernel/irq/internals.h2
-rw-r--r--kernel/irq/manage.c31
-rw-r--r--kernel/irq/pm.c79
-rw-r--r--kernel/kexec.c22
-rw-r--r--kernel/kmod.c2
-rw-r--r--kernel/kthread.c4
-rw-r--r--kernel/lockdep.c528
-rw-r--r--kernel/lockdep_internals.h45
-rw-r--r--kernel/lockdep_proc.c22
-rw-r--r--kernel/lockdep_states.h9
-rw-r--r--kernel/mutex-debug.c9
-rw-r--r--kernel/mutex-debug.h18
-rw-r--r--kernel/mutex.c121
-rw-r--r--kernel/mutex.h22
-rw-r--r--kernel/ns_cgroup.c14
-rw-r--r--kernel/pid.c33
-rw-r--r--kernel/pid_namespace.c15
-rw-r--r--kernel/power/disk.c139
-rw-r--r--kernel/power/main.c55
-rw-r--r--kernel/power/snapshot.c9
-rw-r--r--kernel/power/swsusp.c18
-rw-r--r--kernel/printk.c19
-rw-r--r--kernel/ptrace.c101
-rw-r--r--kernel/rcutorture.c25
-rw-r--r--kernel/relay.c8
-rw-r--r--kernel/sched.c94
-rw-r--r--kernel/sched_cpupri.h2
-rw-r--r--kernel/sched_features.h1
-rw-r--r--kernel/signal.c63
-rw-r--r--kernel/slow-work.c640
-rw-r--r--kernel/spinlock.c18
-rw-r--r--kernel/stop_machine.c2
-rw-r--r--kernel/sys.c5
-rw-r--r--kernel/sysctl.c26
-rw-r--r--kernel/timer.c68
-rw-r--r--kernel/trace/trace_functions_graph.c75
-rw-r--r--kernel/utsname_sysctl.c2
-rw-r--r--kernel/workqueue.c47
48 files changed, 2462 insertions, 966 deletions
diff --git a/kernel/Makefile b/kernel/Makefile
index e4791b3ba55d..bab1dffe37e9 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -93,6 +93,7 @@ obj-$(CONFIG_HAVE_GENERIC_DMA_COHERENT) += dma-coherent.o
93obj-$(CONFIG_FUNCTION_TRACER) += trace/ 93obj-$(CONFIG_FUNCTION_TRACER) += trace/
94obj-$(CONFIG_TRACING) += trace/ 94obj-$(CONFIG_TRACING) += trace/
95obj-$(CONFIG_SMP) += sched_cpupri.o 95obj-$(CONFIG_SMP) += sched_cpupri.o
96obj-$(CONFIG_SLOW_WORK) += slow-work.o
96 97
97ifneq ($(CONFIG_SCHED_OMIT_FRAME_POINTER),y) 98ifneq ($(CONFIG_SCHED_OMIT_FRAME_POINTER),y)
98# According to Alan Modra <alan@linuxcare.com.au>, the -fno-omit-frame-pointer is 99# According to Alan Modra <alan@linuxcare.com.au>, the -fno-omit-frame-pointer is
diff --git a/kernel/auditsc.c b/kernel/auditsc.c
index 8cbddff6c283..2bfc64786765 100644
--- a/kernel/auditsc.c
+++ b/kernel/auditsc.c
@@ -66,6 +66,7 @@
66#include <linux/syscalls.h> 66#include <linux/syscalls.h>
67#include <linux/inotify.h> 67#include <linux/inotify.h>
68#include <linux/capability.h> 68#include <linux/capability.h>
69#include <linux/fs_struct.h>
69 70
70#include "audit.h" 71#include "audit.h"
71 72
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index c500ca7239b2..382109b5baeb 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -94,7 +94,6 @@ struct cgroupfs_root {
94 char release_agent_path[PATH_MAX]; 94 char release_agent_path[PATH_MAX];
95}; 95};
96 96
97
98/* 97/*
99 * The "rootnode" hierarchy is the "dummy hierarchy", reserved for the 98 * The "rootnode" hierarchy is the "dummy hierarchy", reserved for the
100 * subsystems that are otherwise unattached - it never has more than a 99 * subsystems that are otherwise unattached - it never has more than a
@@ -102,6 +101,39 @@ struct cgroupfs_root {
102 */ 101 */
103static struct cgroupfs_root rootnode; 102static struct cgroupfs_root rootnode;
104 103
104/*
105 * CSS ID -- ID per subsys's Cgroup Subsys State(CSS). used only when
106 * cgroup_subsys->use_id != 0.
107 */
108#define CSS_ID_MAX (65535)
109struct css_id {
110 /*
111 * The css to which this ID points. This pointer is set to valid value
112 * after cgroup is populated. If cgroup is removed, this will be NULL.
113 * This pointer is expected to be RCU-safe because destroy()
114 * is called after synchronize_rcu(). But for safe use, css_is_removed()
115 * css_tryget() should be used for avoiding race.
116 */
117 struct cgroup_subsys_state *css;
118 /*
119 * ID of this css.
120 */
121 unsigned short id;
122 /*
123 * Depth in hierarchy which this ID belongs to.
124 */
125 unsigned short depth;
126 /*
127 * ID is freed by RCU. (and lookup routine is RCU safe.)
128 */
129 struct rcu_head rcu_head;
130 /*
131 * Hierarchy of CSS ID belongs to.
132 */
133 unsigned short stack[0]; /* Array of Length (depth+1) */
134};
135
136
105/* The list of hierarchy roots */ 137/* The list of hierarchy roots */
106 138
107static LIST_HEAD(roots); 139static LIST_HEAD(roots);
@@ -185,6 +217,8 @@ struct cg_cgroup_link {
185static struct css_set init_css_set; 217static struct css_set init_css_set;
186static struct cg_cgroup_link init_css_set_link; 218static struct cg_cgroup_link init_css_set_link;
187 219
220static int cgroup_subsys_init_idr(struct cgroup_subsys *ss);
221
188/* css_set_lock protects the list of css_set objects, and the 222/* css_set_lock protects the list of css_set objects, and the
189 * chain of tasks off each css_set. Nests outside task->alloc_lock 223 * chain of tasks off each css_set. Nests outside task->alloc_lock
190 * due to cgroup_iter_start() */ 224 * due to cgroup_iter_start() */
@@ -567,6 +601,9 @@ static struct backing_dev_info cgroup_backing_dev_info = {
567 .capabilities = BDI_CAP_NO_ACCT_AND_WRITEBACK, 601 .capabilities = BDI_CAP_NO_ACCT_AND_WRITEBACK,
568}; 602};
569 603
604static int alloc_css_id(struct cgroup_subsys *ss,
605 struct cgroup *parent, struct cgroup *child);
606
570static struct inode *cgroup_new_inode(mode_t mode, struct super_block *sb) 607static struct inode *cgroup_new_inode(mode_t mode, struct super_block *sb)
571{ 608{
572 struct inode *inode = new_inode(sb); 609 struct inode *inode = new_inode(sb);
@@ -585,13 +622,18 @@ static struct inode *cgroup_new_inode(mode_t mode, struct super_block *sb)
585 * Call subsys's pre_destroy handler. 622 * Call subsys's pre_destroy handler.
586 * This is called before css refcnt check. 623 * This is called before css refcnt check.
587 */ 624 */
588static void cgroup_call_pre_destroy(struct cgroup *cgrp) 625static int cgroup_call_pre_destroy(struct cgroup *cgrp)
589{ 626{
590 struct cgroup_subsys *ss; 627 struct cgroup_subsys *ss;
628 int ret = 0;
629
591 for_each_subsys(cgrp->root, ss) 630 for_each_subsys(cgrp->root, ss)
592 if (ss->pre_destroy) 631 if (ss->pre_destroy) {
593 ss->pre_destroy(ss, cgrp); 632 ret = ss->pre_destroy(ss, cgrp);
594 return; 633 if (ret)
634 break;
635 }
636 return ret;
595} 637}
596 638
597static void free_cgroup_rcu(struct rcu_head *obj) 639static void free_cgroup_rcu(struct rcu_head *obj)
@@ -685,6 +727,22 @@ static void cgroup_d_remove_dir(struct dentry *dentry)
685 remove_dir(dentry); 727 remove_dir(dentry);
686} 728}
687 729
730/*
731 * A queue for waiters to do rmdir() cgroup. A tasks will sleep when
732 * cgroup->count == 0 && list_empty(&cgroup->children) && subsys has some
733 * reference to css->refcnt. In general, this refcnt is expected to goes down
734 * to zero, soon.
735 *
736 * CGRP_WAIT_ON_RMDIR flag is modified under cgroup's inode->i_mutex;
737 */
738DECLARE_WAIT_QUEUE_HEAD(cgroup_rmdir_waitq);
739
740static void cgroup_wakeup_rmdir_waiters(const struct cgroup *cgrp)
741{
742 if (unlikely(test_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags)))
743 wake_up_all(&cgroup_rmdir_waitq);
744}
745
688static int rebind_subsystems(struct cgroupfs_root *root, 746static int rebind_subsystems(struct cgroupfs_root *root,
689 unsigned long final_bits) 747 unsigned long final_bits)
690{ 748{
@@ -857,16 +915,16 @@ static int cgroup_remount(struct super_block *sb, int *flags, char *data)
857 } 915 }
858 916
859 ret = rebind_subsystems(root, opts.subsys_bits); 917 ret = rebind_subsystems(root, opts.subsys_bits);
918 if (ret)
919 goto out_unlock;
860 920
861 /* (re)populate subsystem files */ 921 /* (re)populate subsystem files */
862 if (!ret) 922 cgroup_populate_dir(cgrp);
863 cgroup_populate_dir(cgrp);
864 923
865 if (opts.release_agent) 924 if (opts.release_agent)
866 strcpy(root->release_agent_path, opts.release_agent); 925 strcpy(root->release_agent_path, opts.release_agent);
867 out_unlock: 926 out_unlock:
868 if (opts.release_agent) 927 kfree(opts.release_agent);
869 kfree(opts.release_agent);
870 mutex_unlock(&cgroup_mutex); 928 mutex_unlock(&cgroup_mutex);
871 mutex_unlock(&cgrp->dentry->d_inode->i_mutex); 929 mutex_unlock(&cgrp->dentry->d_inode->i_mutex);
872 return ret; 930 return ret;
@@ -969,15 +1027,13 @@ static int cgroup_get_sb(struct file_system_type *fs_type,
969 /* First find the desired set of subsystems */ 1027 /* First find the desired set of subsystems */
970 ret = parse_cgroupfs_options(data, &opts); 1028 ret = parse_cgroupfs_options(data, &opts);
971 if (ret) { 1029 if (ret) {
972 if (opts.release_agent) 1030 kfree(opts.release_agent);
973 kfree(opts.release_agent);
974 return ret; 1031 return ret;
975 } 1032 }
976 1033
977 root = kzalloc(sizeof(*root), GFP_KERNEL); 1034 root = kzalloc(sizeof(*root), GFP_KERNEL);
978 if (!root) { 1035 if (!root) {
979 if (opts.release_agent) 1036 kfree(opts.release_agent);
980 kfree(opts.release_agent);
981 return -ENOMEM; 1037 return -ENOMEM;
982 } 1038 }
983 1039
@@ -1280,6 +1336,12 @@ int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk)
1280 set_bit(CGRP_RELEASABLE, &oldcgrp->flags); 1336 set_bit(CGRP_RELEASABLE, &oldcgrp->flags);
1281 synchronize_rcu(); 1337 synchronize_rcu();
1282 put_css_set(cg); 1338 put_css_set(cg);
1339
1340 /*
1341 * wake up rmdir() waiter. the rmdir should fail since the cgroup
1342 * is no longer empty.
1343 */
1344 cgroup_wakeup_rmdir_waiters(cgrp);
1283 return 0; 1345 return 0;
1284} 1346}
1285 1347
@@ -1625,7 +1687,7 @@ static struct inode_operations cgroup_dir_inode_operations = {
1625 .rename = cgroup_rename, 1687 .rename = cgroup_rename,
1626}; 1688};
1627 1689
1628static int cgroup_create_file(struct dentry *dentry, int mode, 1690static int cgroup_create_file(struct dentry *dentry, mode_t mode,
1629 struct super_block *sb) 1691 struct super_block *sb)
1630{ 1692{
1631 static const struct dentry_operations cgroup_dops = { 1693 static const struct dentry_operations cgroup_dops = {
@@ -1671,7 +1733,7 @@ static int cgroup_create_file(struct dentry *dentry, int mode,
1671 * @mode: mode to set on new directory. 1733 * @mode: mode to set on new directory.
1672 */ 1734 */
1673static int cgroup_create_dir(struct cgroup *cgrp, struct dentry *dentry, 1735static int cgroup_create_dir(struct cgroup *cgrp, struct dentry *dentry,
1674 int mode) 1736 mode_t mode)
1675{ 1737{
1676 struct dentry *parent; 1738 struct dentry *parent;
1677 int error = 0; 1739 int error = 0;
@@ -1689,6 +1751,33 @@ static int cgroup_create_dir(struct cgroup *cgrp, struct dentry *dentry,
1689 return error; 1751 return error;
1690} 1752}
1691 1753
1754/**
1755 * cgroup_file_mode - deduce file mode of a control file
1756 * @cft: the control file in question
1757 *
1758 * returns cft->mode if ->mode is not 0
1759 * returns S_IRUGO|S_IWUSR if it has both a read and a write handler
1760 * returns S_IRUGO if it has only a read handler
1761 * returns S_IWUSR if it has only a write hander
1762 */
1763static mode_t cgroup_file_mode(const struct cftype *cft)
1764{
1765 mode_t mode = 0;
1766
1767 if (cft->mode)
1768 return cft->mode;
1769
1770 if (cft->read || cft->read_u64 || cft->read_s64 ||
1771 cft->read_map || cft->read_seq_string)
1772 mode |= S_IRUGO;
1773
1774 if (cft->write || cft->write_u64 || cft->write_s64 ||
1775 cft->write_string || cft->trigger)
1776 mode |= S_IWUSR;
1777
1778 return mode;
1779}
1780
1692int cgroup_add_file(struct cgroup *cgrp, 1781int cgroup_add_file(struct cgroup *cgrp,
1693 struct cgroup_subsys *subsys, 1782 struct cgroup_subsys *subsys,
1694 const struct cftype *cft) 1783 const struct cftype *cft)
@@ -1696,6 +1785,7 @@ int cgroup_add_file(struct cgroup *cgrp,
1696 struct dentry *dir = cgrp->dentry; 1785 struct dentry *dir = cgrp->dentry;
1697 struct dentry *dentry; 1786 struct dentry *dentry;
1698 int error; 1787 int error;
1788 mode_t mode;
1699 1789
1700 char name[MAX_CGROUP_TYPE_NAMELEN + MAX_CFTYPE_NAME + 2] = { 0 }; 1790 char name[MAX_CGROUP_TYPE_NAMELEN + MAX_CFTYPE_NAME + 2] = { 0 };
1701 if (subsys && !test_bit(ROOT_NOPREFIX, &cgrp->root->flags)) { 1791 if (subsys && !test_bit(ROOT_NOPREFIX, &cgrp->root->flags)) {
@@ -1706,7 +1796,8 @@ int cgroup_add_file(struct cgroup *cgrp,
1706 BUG_ON(!mutex_is_locked(&dir->d_inode->i_mutex)); 1796 BUG_ON(!mutex_is_locked(&dir->d_inode->i_mutex));
1707 dentry = lookup_one_len(name, dir, strlen(name)); 1797 dentry = lookup_one_len(name, dir, strlen(name));
1708 if (!IS_ERR(dentry)) { 1798 if (!IS_ERR(dentry)) {
1709 error = cgroup_create_file(dentry, 0644 | S_IFREG, 1799 mode = cgroup_file_mode(cft);
1800 error = cgroup_create_file(dentry, mode | S_IFREG,
1710 cgrp->root->sb); 1801 cgrp->root->sb);
1711 if (!error) 1802 if (!error)
1712 dentry->d_fsdata = (void *)cft; 1803 dentry->d_fsdata = (void *)cft;
@@ -2288,6 +2379,7 @@ static struct cftype files[] = {
2288 .write_u64 = cgroup_tasks_write, 2379 .write_u64 = cgroup_tasks_write,
2289 .release = cgroup_tasks_release, 2380 .release = cgroup_tasks_release,
2290 .private = FILE_TASKLIST, 2381 .private = FILE_TASKLIST,
2382 .mode = S_IRUGO | S_IWUSR,
2291 }, 2383 },
2292 2384
2293 { 2385 {
@@ -2327,6 +2419,17 @@ static int cgroup_populate_dir(struct cgroup *cgrp)
2327 if (ss->populate && (err = ss->populate(ss, cgrp)) < 0) 2419 if (ss->populate && (err = ss->populate(ss, cgrp)) < 0)
2328 return err; 2420 return err;
2329 } 2421 }
2422 /* This cgroup is ready now */
2423 for_each_subsys(cgrp->root, ss) {
2424 struct cgroup_subsys_state *css = cgrp->subsys[ss->subsys_id];
2425 /*
2426 * Update id->css pointer and make this css visible from
2427 * CSS ID functions. This pointer will be dereferened
2428 * from RCU-read-side without locks.
2429 */
2430 if (css->id)
2431 rcu_assign_pointer(css->id->css, css);
2432 }
2330 2433
2331 return 0; 2434 return 0;
2332} 2435}
@@ -2338,6 +2441,7 @@ static void init_cgroup_css(struct cgroup_subsys_state *css,
2338 css->cgroup = cgrp; 2441 css->cgroup = cgrp;
2339 atomic_set(&css->refcnt, 1); 2442 atomic_set(&css->refcnt, 1);
2340 css->flags = 0; 2443 css->flags = 0;
2444 css->id = NULL;
2341 if (cgrp == dummytop) 2445 if (cgrp == dummytop)
2342 set_bit(CSS_ROOT, &css->flags); 2446 set_bit(CSS_ROOT, &css->flags);
2343 BUG_ON(cgrp->subsys[ss->subsys_id]); 2447 BUG_ON(cgrp->subsys[ss->subsys_id]);
@@ -2376,7 +2480,7 @@ static void cgroup_unlock_hierarchy(struct cgroupfs_root *root)
2376 * Must be called with the mutex on the parent inode held 2480 * Must be called with the mutex on the parent inode held
2377 */ 2481 */
2378static long cgroup_create(struct cgroup *parent, struct dentry *dentry, 2482static long cgroup_create(struct cgroup *parent, struct dentry *dentry,
2379 int mode) 2483 mode_t mode)
2380{ 2484{
2381 struct cgroup *cgrp; 2485 struct cgroup *cgrp;
2382 struct cgroupfs_root *root = parent->root; 2486 struct cgroupfs_root *root = parent->root;
@@ -2413,6 +2517,10 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry,
2413 goto err_destroy; 2517 goto err_destroy;
2414 } 2518 }
2415 init_cgroup_css(css, ss, cgrp); 2519 init_cgroup_css(css, ss, cgrp);
2520 if (ss->use_id)
2521 if (alloc_css_id(ss, parent, cgrp))
2522 goto err_destroy;
2523 /* At error, ->destroy() callback has to free assigned ID. */
2416 } 2524 }
2417 2525
2418 cgroup_lock_hierarchy(root); 2526 cgroup_lock_hierarchy(root);
@@ -2555,9 +2663,11 @@ static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry)
2555 struct cgroup *cgrp = dentry->d_fsdata; 2663 struct cgroup *cgrp = dentry->d_fsdata;
2556 struct dentry *d; 2664 struct dentry *d;
2557 struct cgroup *parent; 2665 struct cgroup *parent;
2666 DEFINE_WAIT(wait);
2667 int ret;
2558 2668
2559 /* the vfs holds both inode->i_mutex already */ 2669 /* the vfs holds both inode->i_mutex already */
2560 2670again:
2561 mutex_lock(&cgroup_mutex); 2671 mutex_lock(&cgroup_mutex);
2562 if (atomic_read(&cgrp->count) != 0) { 2672 if (atomic_read(&cgrp->count) != 0) {
2563 mutex_unlock(&cgroup_mutex); 2673 mutex_unlock(&cgroup_mutex);
@@ -2573,17 +2683,39 @@ static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry)
2573 * Call pre_destroy handlers of subsys. Notify subsystems 2683 * Call pre_destroy handlers of subsys. Notify subsystems
2574 * that rmdir() request comes. 2684 * that rmdir() request comes.
2575 */ 2685 */
2576 cgroup_call_pre_destroy(cgrp); 2686 ret = cgroup_call_pre_destroy(cgrp);
2687 if (ret)
2688 return ret;
2577 2689
2578 mutex_lock(&cgroup_mutex); 2690 mutex_lock(&cgroup_mutex);
2579 parent = cgrp->parent; 2691 parent = cgrp->parent;
2580 2692 if (atomic_read(&cgrp->count) || !list_empty(&cgrp->children)) {
2581 if (atomic_read(&cgrp->count)
2582 || !list_empty(&cgrp->children)
2583 || !cgroup_clear_css_refs(cgrp)) {
2584 mutex_unlock(&cgroup_mutex); 2693 mutex_unlock(&cgroup_mutex);
2585 return -EBUSY; 2694 return -EBUSY;
2586 } 2695 }
2696 /*
2697 * css_put/get is provided for subsys to grab refcnt to css. In typical
2698 * case, subsystem has no reference after pre_destroy(). But, under
2699 * hierarchy management, some *temporal* refcnt can be hold.
2700 * To avoid returning -EBUSY to a user, waitqueue is used. If subsys
2701 * is really busy, it should return -EBUSY at pre_destroy(). wake_up
2702 * is called when css_put() is called and refcnt goes down to 0.
2703 */
2704 set_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags);
2705 prepare_to_wait(&cgroup_rmdir_waitq, &wait, TASK_INTERRUPTIBLE);
2706
2707 if (!cgroup_clear_css_refs(cgrp)) {
2708 mutex_unlock(&cgroup_mutex);
2709 schedule();
2710 finish_wait(&cgroup_rmdir_waitq, &wait);
2711 clear_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags);
2712 if (signal_pending(current))
2713 return -EINTR;
2714 goto again;
2715 }
2716 /* NO css_tryget() can success after here. */
2717 finish_wait(&cgroup_rmdir_waitq, &wait);
2718 clear_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags);
2587 2719
2588 spin_lock(&release_list_lock); 2720 spin_lock(&release_list_lock);
2589 set_bit(CGRP_REMOVED, &cgrp->flags); 2721 set_bit(CGRP_REMOVED, &cgrp->flags);
@@ -2708,6 +2840,8 @@ int __init cgroup_init(void)
2708 struct cgroup_subsys *ss = subsys[i]; 2840 struct cgroup_subsys *ss = subsys[i];
2709 if (!ss->early_init) 2841 if (!ss->early_init)
2710 cgroup_init_subsys(ss); 2842 cgroup_init_subsys(ss);
2843 if (ss->use_id)
2844 cgroup_subsys_init_idr(ss);
2711 } 2845 }
2712 2846
2713 /* Add init_css_set to the hash table */ 2847 /* Add init_css_set to the hash table */
@@ -3084,18 +3218,19 @@ int cgroup_clone(struct task_struct *tsk, struct cgroup_subsys *subsys,
3084} 3218}
3085 3219
3086/** 3220/**
3087 * cgroup_is_descendant - see if @cgrp is a descendant of current task's cgrp 3221 * cgroup_is_descendant - see if @cgrp is a descendant of @task's cgrp
3088 * @cgrp: the cgroup in question 3222 * @cgrp: the cgroup in question
3223 * @task: the task in question
3089 * 3224 *
3090 * See if @cgrp is a descendant of the current task's cgroup in 3225 * See if @cgrp is a descendant of @task's cgroup in the appropriate
3091 * the appropriate hierarchy. 3226 * hierarchy.
3092 * 3227 *
3093 * If we are sending in dummytop, then presumably we are creating 3228 * If we are sending in dummytop, then presumably we are creating
3094 * the top cgroup in the subsystem. 3229 * the top cgroup in the subsystem.
3095 * 3230 *
3096 * Called only by the ns (nsproxy) cgroup. 3231 * Called only by the ns (nsproxy) cgroup.
3097 */ 3232 */
3098int cgroup_is_descendant(const struct cgroup *cgrp) 3233int cgroup_is_descendant(const struct cgroup *cgrp, struct task_struct *task)
3099{ 3234{
3100 int ret; 3235 int ret;
3101 struct cgroup *target; 3236 struct cgroup *target;
@@ -3105,7 +3240,7 @@ int cgroup_is_descendant(const struct cgroup *cgrp)
3105 return 1; 3240 return 1;
3106 3241
3107 get_first_subsys(cgrp, NULL, &subsys_id); 3242 get_first_subsys(cgrp, NULL, &subsys_id);
3108 target = task_cgroup(current, subsys_id); 3243 target = task_cgroup(task, subsys_id);
3109 while (cgrp != target && cgrp!= cgrp->top_cgroup) 3244 while (cgrp != target && cgrp!= cgrp->top_cgroup)
3110 cgrp = cgrp->parent; 3245 cgrp = cgrp->parent;
3111 ret = (cgrp == target); 3246 ret = (cgrp == target);
@@ -3138,10 +3273,12 @@ void __css_put(struct cgroup_subsys_state *css)
3138{ 3273{
3139 struct cgroup *cgrp = css->cgroup; 3274 struct cgroup *cgrp = css->cgroup;
3140 rcu_read_lock(); 3275 rcu_read_lock();
3141 if ((atomic_dec_return(&css->refcnt) == 1) && 3276 if (atomic_dec_return(&css->refcnt) == 1) {
3142 notify_on_release(cgrp)) { 3277 if (notify_on_release(cgrp)) {
3143 set_bit(CGRP_RELEASABLE, &cgrp->flags); 3278 set_bit(CGRP_RELEASABLE, &cgrp->flags);
3144 check_for_release(cgrp); 3279 check_for_release(cgrp);
3280 }
3281 cgroup_wakeup_rmdir_waiters(cgrp);
3145 } 3282 }
3146 rcu_read_unlock(); 3283 rcu_read_unlock();
3147} 3284}
@@ -3241,3 +3378,232 @@ static int __init cgroup_disable(char *str)
3241 return 1; 3378 return 1;
3242} 3379}
3243__setup("cgroup_disable=", cgroup_disable); 3380__setup("cgroup_disable=", cgroup_disable);
3381
3382/*
3383 * Functons for CSS ID.
3384 */
3385
3386/*
3387 *To get ID other than 0, this should be called when !cgroup_is_removed().
3388 */
3389unsigned short css_id(struct cgroup_subsys_state *css)
3390{
3391 struct css_id *cssid = rcu_dereference(css->id);
3392
3393 if (cssid)
3394 return cssid->id;
3395 return 0;
3396}
3397
3398unsigned short css_depth(struct cgroup_subsys_state *css)
3399{
3400 struct css_id *cssid = rcu_dereference(css->id);
3401
3402 if (cssid)
3403 return cssid->depth;
3404 return 0;
3405}
3406
3407bool css_is_ancestor(struct cgroup_subsys_state *child,
3408 const struct cgroup_subsys_state *root)
3409{
3410 struct css_id *child_id = rcu_dereference(child->id);
3411 struct css_id *root_id = rcu_dereference(root->id);
3412
3413 if (!child_id || !root_id || (child_id->depth < root_id->depth))
3414 return false;
3415 return child_id->stack[root_id->depth] == root_id->id;
3416}
3417
3418static void __free_css_id_cb(struct rcu_head *head)
3419{
3420 struct css_id *id;
3421
3422 id = container_of(head, struct css_id, rcu_head);
3423 kfree(id);
3424}
3425
3426void free_css_id(struct cgroup_subsys *ss, struct cgroup_subsys_state *css)
3427{
3428 struct css_id *id = css->id;
3429 /* When this is called before css_id initialization, id can be NULL */
3430 if (!id)
3431 return;
3432
3433 BUG_ON(!ss->use_id);
3434
3435 rcu_assign_pointer(id->css, NULL);
3436 rcu_assign_pointer(css->id, NULL);
3437 spin_lock(&ss->id_lock);
3438 idr_remove(&ss->idr, id->id);
3439 spin_unlock(&ss->id_lock);
3440 call_rcu(&id->rcu_head, __free_css_id_cb);
3441}
3442
3443/*
3444 * This is called by init or create(). Then, calls to this function are
3445 * always serialized (By cgroup_mutex() at create()).
3446 */
3447
3448static struct css_id *get_new_cssid(struct cgroup_subsys *ss, int depth)
3449{
3450 struct css_id *newid;
3451 int myid, error, size;
3452
3453 BUG_ON(!ss->use_id);
3454
3455 size = sizeof(*newid) + sizeof(unsigned short) * (depth + 1);
3456 newid = kzalloc(size, GFP_KERNEL);
3457 if (!newid)
3458 return ERR_PTR(-ENOMEM);
3459 /* get id */
3460 if (unlikely(!idr_pre_get(&ss->idr, GFP_KERNEL))) {
3461 error = -ENOMEM;
3462 goto err_out;
3463 }
3464 spin_lock(&ss->id_lock);
3465 /* Don't use 0. allocates an ID of 1-65535 */
3466 error = idr_get_new_above(&ss->idr, newid, 1, &myid);
3467 spin_unlock(&ss->id_lock);
3468
3469 /* Returns error when there are no free spaces for new ID.*/
3470 if (error) {
3471 error = -ENOSPC;
3472 goto err_out;
3473 }
3474 if (myid > CSS_ID_MAX)
3475 goto remove_idr;
3476
3477 newid->id = myid;
3478 newid->depth = depth;
3479 return newid;
3480remove_idr:
3481 error = -ENOSPC;
3482 spin_lock(&ss->id_lock);
3483 idr_remove(&ss->idr, myid);
3484 spin_unlock(&ss->id_lock);
3485err_out:
3486 kfree(newid);
3487 return ERR_PTR(error);
3488
3489}
3490
3491static int __init cgroup_subsys_init_idr(struct cgroup_subsys *ss)
3492{
3493 struct css_id *newid;
3494 struct cgroup_subsys_state *rootcss;
3495
3496 spin_lock_init(&ss->id_lock);
3497 idr_init(&ss->idr);
3498
3499 rootcss = init_css_set.subsys[ss->subsys_id];
3500 newid = get_new_cssid(ss, 0);
3501 if (IS_ERR(newid))
3502 return PTR_ERR(newid);
3503
3504 newid->stack[0] = newid->id;
3505 newid->css = rootcss;
3506 rootcss->id = newid;
3507 return 0;
3508}
3509
3510static int alloc_css_id(struct cgroup_subsys *ss, struct cgroup *parent,
3511 struct cgroup *child)
3512{
3513 int subsys_id, i, depth = 0;
3514 struct cgroup_subsys_state *parent_css, *child_css;
3515 struct css_id *child_id, *parent_id = NULL;
3516
3517 subsys_id = ss->subsys_id;
3518 parent_css = parent->subsys[subsys_id];
3519 child_css = child->subsys[subsys_id];
3520 depth = css_depth(parent_css) + 1;
3521 parent_id = parent_css->id;
3522
3523 child_id = get_new_cssid(ss, depth);
3524 if (IS_ERR(child_id))
3525 return PTR_ERR(child_id);
3526
3527 for (i = 0; i < depth; i++)
3528 child_id->stack[i] = parent_id->stack[i];
3529 child_id->stack[depth] = child_id->id;
3530 /*
3531 * child_id->css pointer will be set after this cgroup is available
3532 * see cgroup_populate_dir()
3533 */
3534 rcu_assign_pointer(child_css->id, child_id);
3535
3536 return 0;
3537}
3538
3539/**
3540 * css_lookup - lookup css by id
3541 * @ss: cgroup subsys to be looked into.
3542 * @id: the id
3543 *
3544 * Returns pointer to cgroup_subsys_state if there is valid one with id.
3545 * NULL if not. Should be called under rcu_read_lock()
3546 */
3547struct cgroup_subsys_state *css_lookup(struct cgroup_subsys *ss, int id)
3548{
3549 struct css_id *cssid = NULL;
3550
3551 BUG_ON(!ss->use_id);
3552 cssid = idr_find(&ss->idr, id);
3553
3554 if (unlikely(!cssid))
3555 return NULL;
3556
3557 return rcu_dereference(cssid->css);
3558}
3559
3560/**
3561 * css_get_next - lookup next cgroup under specified hierarchy.
3562 * @ss: pointer to subsystem
3563 * @id: current position of iteration.
3564 * @root: pointer to css. search tree under this.
3565 * @foundid: position of found object.
3566 *
3567 * Search next css under the specified hierarchy of rootid. Calling under
3568 * rcu_read_lock() is necessary. Returns NULL if it reaches the end.
3569 */
3570struct cgroup_subsys_state *
3571css_get_next(struct cgroup_subsys *ss, int id,
3572 struct cgroup_subsys_state *root, int *foundid)
3573{
3574 struct cgroup_subsys_state *ret = NULL;
3575 struct css_id *tmp;
3576 int tmpid;
3577 int rootid = css_id(root);
3578 int depth = css_depth(root);
3579
3580 if (!rootid)
3581 return NULL;
3582
3583 BUG_ON(!ss->use_id);
3584 /* fill start point for scan */
3585 tmpid = id;
3586 while (1) {
3587 /*
3588 * scan next entry from bitmap(tree), tmpid is updated after
3589 * idr_get_next().
3590 */
3591 spin_lock(&ss->id_lock);
3592 tmp = idr_get_next(&ss->idr, &tmpid);
3593 spin_unlock(&ss->id_lock);
3594
3595 if (!tmp)
3596 break;
3597 if (tmp->depth >= depth && tmp->stack[depth] == rootid) {
3598 ret = rcu_dereference(tmp->css);
3599 if (ret) {
3600 *foundid = tmpid;
3601 break;
3602 }
3603 }
3604 /* continue to scan from next id */
3605 tmpid = tmpid + 1;
3606 }
3607 return ret;
3608}
3609
diff --git a/kernel/cgroup_debug.c b/kernel/cgroup_debug.c
index daca6209202d..0c92d797baa6 100644
--- a/kernel/cgroup_debug.c
+++ b/kernel/cgroup_debug.c
@@ -40,9 +40,7 @@ static u64 taskcount_read(struct cgroup *cont, struct cftype *cft)
40{ 40{
41 u64 count; 41 u64 count;
42 42
43 cgroup_lock();
44 count = cgroup_task_count(cont); 43 count = cgroup_task_count(cont);
45 cgroup_unlock();
46 return count; 44 return count;
47} 45}
48 46
diff --git a/kernel/cpu.c b/kernel/cpu.c
index 79e40f00dcb8..395b6974dc8d 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -281,7 +281,7 @@ int __ref cpu_down(unsigned int cpu)
281 goto out; 281 goto out;
282 } 282 }
283 283
284 cpu_clear(cpu, cpu_active_map); 284 set_cpu_active(cpu, false);
285 285
286 /* 286 /*
287 * Make sure the all cpus did the reschedule and are not 287 * Make sure the all cpus did the reschedule and are not
@@ -296,7 +296,7 @@ int __ref cpu_down(unsigned int cpu)
296 err = _cpu_down(cpu, 0); 296 err = _cpu_down(cpu, 0);
297 297
298 if (cpu_online(cpu)) 298 if (cpu_online(cpu))
299 cpu_set(cpu, cpu_active_map); 299 set_cpu_active(cpu, true);
300 300
301out: 301out:
302 cpu_maps_update_done(); 302 cpu_maps_update_done();
@@ -333,7 +333,7 @@ static int __cpuinit _cpu_up(unsigned int cpu, int tasks_frozen)
333 goto out_notify; 333 goto out_notify;
334 BUG_ON(!cpu_online(cpu)); 334 BUG_ON(!cpu_online(cpu));
335 335
336 cpu_set(cpu, cpu_active_map); 336 set_cpu_active(cpu, true);
337 337
338 /* Now call notifier in preparation. */ 338 /* Now call notifier in preparation. */
339 raw_notifier_call_chain(&cpu_chain, CPU_ONLINE | mod, hcpu); 339 raw_notifier_call_chain(&cpu_chain, CPU_ONLINE | mod, hcpu);
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index f76db9dcaa05..026faccca869 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -128,10 +128,6 @@ static inline struct cpuset *task_cs(struct task_struct *task)
128 return container_of(task_subsys_state(task, cpuset_subsys_id), 128 return container_of(task_subsys_state(task, cpuset_subsys_id),
129 struct cpuset, css); 129 struct cpuset, css);
130} 130}
131struct cpuset_hotplug_scanner {
132 struct cgroup_scanner scan;
133 struct cgroup *to;
134};
135 131
136/* bits in struct cpuset flags field */ 132/* bits in struct cpuset flags field */
137typedef enum { 133typedef enum {
@@ -521,6 +517,7 @@ static int validate_change(const struct cpuset *cur, const struct cpuset *trial)
521 return 0; 517 return 0;
522} 518}
523 519
520#ifdef CONFIG_SMP
524/* 521/*
525 * Helper routine for generate_sched_domains(). 522 * Helper routine for generate_sched_domains().
526 * Do cpusets a, b have overlapping cpus_allowed masks? 523 * Do cpusets a, b have overlapping cpus_allowed masks?
@@ -815,6 +812,18 @@ static void do_rebuild_sched_domains(struct work_struct *unused)
815 812
816 put_online_cpus(); 813 put_online_cpus();
817} 814}
815#else /* !CONFIG_SMP */
816static void do_rebuild_sched_domains(struct work_struct *unused)
817{
818}
819
820static int generate_sched_domains(struct cpumask **domains,
821 struct sched_domain_attr **attributes)
822{
823 *domains = NULL;
824 return 1;
825}
826#endif /* CONFIG_SMP */
818 827
819static DECLARE_WORK(rebuild_sched_domains_work, do_rebuild_sched_domains); 828static DECLARE_WORK(rebuild_sched_domains_work, do_rebuild_sched_domains);
820 829
@@ -1026,101 +1035,70 @@ static void cpuset_migrate_mm(struct mm_struct *mm, const nodemask_t *from,
1026 mutex_unlock(&callback_mutex); 1035 mutex_unlock(&callback_mutex);
1027} 1036}
1028 1037
1038/*
1039 * Rebind task's vmas to cpuset's new mems_allowed, and migrate pages to new
1040 * nodes if memory_migrate flag is set. Called with cgroup_mutex held.
1041 */
1042static void cpuset_change_nodemask(struct task_struct *p,
1043 struct cgroup_scanner *scan)
1044{
1045 struct mm_struct *mm;
1046 struct cpuset *cs;
1047 int migrate;
1048 const nodemask_t *oldmem = scan->data;
1049
1050 mm = get_task_mm(p);
1051 if (!mm)
1052 return;
1053
1054 cs = cgroup_cs(scan->cg);
1055 migrate = is_memory_migrate(cs);
1056
1057 mpol_rebind_mm(mm, &cs->mems_allowed);
1058 if (migrate)
1059 cpuset_migrate_mm(mm, oldmem, &cs->mems_allowed);
1060 mmput(mm);
1061}
1062
1029static void *cpuset_being_rebound; 1063static void *cpuset_being_rebound;
1030 1064
1031/** 1065/**
1032 * update_tasks_nodemask - Update the nodemasks of tasks in the cpuset. 1066 * update_tasks_nodemask - Update the nodemasks of tasks in the cpuset.
1033 * @cs: the cpuset in which each task's mems_allowed mask needs to be changed 1067 * @cs: the cpuset in which each task's mems_allowed mask needs to be changed
1034 * @oldmem: old mems_allowed of cpuset cs 1068 * @oldmem: old mems_allowed of cpuset cs
1069 * @heap: if NULL, defer allocating heap memory to cgroup_scan_tasks()
1035 * 1070 *
1036 * Called with cgroup_mutex held 1071 * Called with cgroup_mutex held
1037 * Return 0 if successful, -errno if not. 1072 * No return value. It's guaranteed that cgroup_scan_tasks() always returns 0
1073 * if @heap != NULL.
1038 */ 1074 */
1039static int update_tasks_nodemask(struct cpuset *cs, const nodemask_t *oldmem) 1075static void update_tasks_nodemask(struct cpuset *cs, const nodemask_t *oldmem,
1076 struct ptr_heap *heap)
1040{ 1077{
1041 struct task_struct *p; 1078 struct cgroup_scanner scan;
1042 struct mm_struct **mmarray;
1043 int i, n, ntasks;
1044 int migrate;
1045 int fudge;
1046 struct cgroup_iter it;
1047 int retval;
1048 1079
1049 cpuset_being_rebound = cs; /* causes mpol_dup() rebind */ 1080 cpuset_being_rebound = cs; /* causes mpol_dup() rebind */
1050 1081
1051 fudge = 10; /* spare mmarray[] slots */ 1082 scan.cg = cs->css.cgroup;
1052 fudge += cpumask_weight(cs->cpus_allowed);/* imagine 1 fork-bomb/cpu */ 1083 scan.test_task = NULL;
1053 retval = -ENOMEM; 1084 scan.process_task = cpuset_change_nodemask;
1054 1085 scan.heap = heap;
1055 /* 1086 scan.data = (nodemask_t *)oldmem;
1056 * Allocate mmarray[] to hold mm reference for each task
1057 * in cpuset cs. Can't kmalloc GFP_KERNEL while holding
1058 * tasklist_lock. We could use GFP_ATOMIC, but with a
1059 * few more lines of code, we can retry until we get a big
1060 * enough mmarray[] w/o using GFP_ATOMIC.
1061 */
1062 while (1) {
1063 ntasks = cgroup_task_count(cs->css.cgroup); /* guess */
1064 ntasks += fudge;
1065 mmarray = kmalloc(ntasks * sizeof(*mmarray), GFP_KERNEL);
1066 if (!mmarray)
1067 goto done;
1068 read_lock(&tasklist_lock); /* block fork */
1069 if (cgroup_task_count(cs->css.cgroup) <= ntasks)
1070 break; /* got enough */
1071 read_unlock(&tasklist_lock); /* try again */
1072 kfree(mmarray);
1073 }
1074
1075 n = 0;
1076
1077 /* Load up mmarray[] with mm reference for each task in cpuset. */
1078 cgroup_iter_start(cs->css.cgroup, &it);
1079 while ((p = cgroup_iter_next(cs->css.cgroup, &it))) {
1080 struct mm_struct *mm;
1081
1082 if (n >= ntasks) {
1083 printk(KERN_WARNING
1084 "Cpuset mempolicy rebind incomplete.\n");
1085 break;
1086 }
1087 mm = get_task_mm(p);
1088 if (!mm)
1089 continue;
1090 mmarray[n++] = mm;
1091 }
1092 cgroup_iter_end(cs->css.cgroup, &it);
1093 read_unlock(&tasklist_lock);
1094 1087
1095 /* 1088 /*
1096 * Now that we've dropped the tasklist spinlock, we can 1089 * The mpol_rebind_mm() call takes mmap_sem, which we couldn't
1097 * rebind the vma mempolicies of each mm in mmarray[] to their 1090 * take while holding tasklist_lock. Forks can happen - the
1098 * new cpuset, and release that mm. The mpol_rebind_mm() 1091 * mpol_dup() cpuset_being_rebound check will catch such forks,
1099 * call takes mmap_sem, which we couldn't take while holding 1092 * and rebind their vma mempolicies too. Because we still hold
1100 * tasklist_lock. Forks can happen again now - the mpol_dup() 1093 * the global cgroup_mutex, we know that no other rebind effort
1101 * cpuset_being_rebound check will catch such forks, and rebind 1094 * will be contending for the global variable cpuset_being_rebound.
1102 * their vma mempolicies too. Because we still hold the global
1103 * cgroup_mutex, we know that no other rebind effort will
1104 * be contending for the global variable cpuset_being_rebound.
1105 * It's ok if we rebind the same mm twice; mpol_rebind_mm() 1095 * It's ok if we rebind the same mm twice; mpol_rebind_mm()
1106 * is idempotent. Also migrate pages in each mm to new nodes. 1096 * is idempotent. Also migrate pages in each mm to new nodes.
1107 */ 1097 */
1108 migrate = is_memory_migrate(cs); 1098 cgroup_scan_tasks(&scan);
1109 for (i = 0; i < n; i++) {
1110 struct mm_struct *mm = mmarray[i];
1111
1112 mpol_rebind_mm(mm, &cs->mems_allowed);
1113 if (migrate)
1114 cpuset_migrate_mm(mm, oldmem, &cs->mems_allowed);
1115 mmput(mm);
1116 }
1117 1099
1118 /* We're done rebinding vmas to this cpuset's new mems_allowed. */ 1100 /* We're done rebinding vmas to this cpuset's new mems_allowed. */
1119 kfree(mmarray);
1120 cpuset_being_rebound = NULL; 1101 cpuset_being_rebound = NULL;
1121 retval = 0;
1122done:
1123 return retval;
1124} 1102}
1125 1103
1126/* 1104/*
@@ -1141,6 +1119,7 @@ static int update_nodemask(struct cpuset *cs, struct cpuset *trialcs,
1141{ 1119{
1142 nodemask_t oldmem; 1120 nodemask_t oldmem;
1143 int retval; 1121 int retval;
1122 struct ptr_heap heap;
1144 1123
1145 /* 1124 /*
1146 * top_cpuset.mems_allowed tracks node_stats[N_HIGH_MEMORY]; 1125 * top_cpuset.mems_allowed tracks node_stats[N_HIGH_MEMORY];
@@ -1175,12 +1154,18 @@ static int update_nodemask(struct cpuset *cs, struct cpuset *trialcs,
1175 if (retval < 0) 1154 if (retval < 0)
1176 goto done; 1155 goto done;
1177 1156
1157 retval = heap_init(&heap, PAGE_SIZE, GFP_KERNEL, NULL);
1158 if (retval < 0)
1159 goto done;
1160
1178 mutex_lock(&callback_mutex); 1161 mutex_lock(&callback_mutex);
1179 cs->mems_allowed = trialcs->mems_allowed; 1162 cs->mems_allowed = trialcs->mems_allowed;
1180 cs->mems_generation = cpuset_mems_generation++; 1163 cs->mems_generation = cpuset_mems_generation++;
1181 mutex_unlock(&callback_mutex); 1164 mutex_unlock(&callback_mutex);
1182 1165
1183 retval = update_tasks_nodemask(cs, &oldmem); 1166 update_tasks_nodemask(cs, &oldmem, &heap);
1167
1168 heap_free(&heap);
1184done: 1169done:
1185 return retval; 1170 return retval;
1186} 1171}
@@ -1192,8 +1177,10 @@ int current_cpuset_is_being_rebound(void)
1192 1177
1193static int update_relax_domain_level(struct cpuset *cs, s64 val) 1178static int update_relax_domain_level(struct cpuset *cs, s64 val)
1194{ 1179{
1180#ifdef CONFIG_SMP
1195 if (val < -1 || val >= SD_LV_MAX) 1181 if (val < -1 || val >= SD_LV_MAX)
1196 return -EINVAL; 1182 return -EINVAL;
1183#endif
1197 1184
1198 if (val != cs->relax_domain_level) { 1185 if (val != cs->relax_domain_level) {
1199 cs->relax_domain_level = val; 1186 cs->relax_domain_level = val;
@@ -1355,19 +1342,22 @@ static int cpuset_can_attach(struct cgroup_subsys *ss,
1355 struct cgroup *cont, struct task_struct *tsk) 1342 struct cgroup *cont, struct task_struct *tsk)
1356{ 1343{
1357 struct cpuset *cs = cgroup_cs(cont); 1344 struct cpuset *cs = cgroup_cs(cont);
1358 int ret = 0;
1359 1345
1360 if (cpumask_empty(cs->cpus_allowed) || nodes_empty(cs->mems_allowed)) 1346 if (cpumask_empty(cs->cpus_allowed) || nodes_empty(cs->mems_allowed))
1361 return -ENOSPC; 1347 return -ENOSPC;
1362 1348
1363 if (tsk->flags & PF_THREAD_BOUND) { 1349 /*
1364 mutex_lock(&callback_mutex); 1350 * Kthreads bound to specific cpus cannot be moved to a new cpuset; we
1365 if (!cpumask_equal(&tsk->cpus_allowed, cs->cpus_allowed)) 1351 * cannot change their cpu affinity and isolating such threads by their
1366 ret = -EINVAL; 1352 * set of allowed nodes is unnecessary. Thus, cpusets are not
1367 mutex_unlock(&callback_mutex); 1353 * applicable for such threads. This prevents checking for success of
1368 } 1354 * set_cpus_allowed_ptr() on all attached tasks before cpus_allowed may
1355 * be changed.
1356 */
1357 if (tsk->flags & PF_THREAD_BOUND)
1358 return -EINVAL;
1369 1359
1370 return ret < 0 ? ret : security_task_setscheduler(tsk, 0, NULL); 1360 return security_task_setscheduler(tsk, 0, NULL);
1371} 1361}
1372 1362
1373static void cpuset_attach(struct cgroup_subsys *ss, 1363static void cpuset_attach(struct cgroup_subsys *ss,
@@ -1706,6 +1696,7 @@ static struct cftype files[] = {
1706 .read_u64 = cpuset_read_u64, 1696 .read_u64 = cpuset_read_u64,
1707 .write_u64 = cpuset_write_u64, 1697 .write_u64 = cpuset_write_u64,
1708 .private = FILE_MEMORY_PRESSURE, 1698 .private = FILE_MEMORY_PRESSURE,
1699 .mode = S_IRUGO,
1709 }, 1700 },
1710 1701
1711 { 1702 {
@@ -1913,10 +1904,9 @@ int __init cpuset_init(void)
1913static void cpuset_do_move_task(struct task_struct *tsk, 1904static void cpuset_do_move_task(struct task_struct *tsk,
1914 struct cgroup_scanner *scan) 1905 struct cgroup_scanner *scan)
1915{ 1906{
1916 struct cpuset_hotplug_scanner *chsp; 1907 struct cgroup *new_cgroup = scan->data;
1917 1908
1918 chsp = container_of(scan, struct cpuset_hotplug_scanner, scan); 1909 cgroup_attach_task(new_cgroup, tsk);
1919 cgroup_attach_task(chsp->to, tsk);
1920} 1910}
1921 1911
1922/** 1912/**
@@ -1932,15 +1922,15 @@ static void cpuset_do_move_task(struct task_struct *tsk,
1932 */ 1922 */
1933static void move_member_tasks_to_cpuset(struct cpuset *from, struct cpuset *to) 1923static void move_member_tasks_to_cpuset(struct cpuset *from, struct cpuset *to)
1934{ 1924{
1935 struct cpuset_hotplug_scanner scan; 1925 struct cgroup_scanner scan;
1936 1926
1937 scan.scan.cg = from->css.cgroup; 1927 scan.cg = from->css.cgroup;
1938 scan.scan.test_task = NULL; /* select all tasks in cgroup */ 1928 scan.test_task = NULL; /* select all tasks in cgroup */
1939 scan.scan.process_task = cpuset_do_move_task; 1929 scan.process_task = cpuset_do_move_task;
1940 scan.scan.heap = NULL; 1930 scan.heap = NULL;
1941 scan.to = to->css.cgroup; 1931 scan.data = to->css.cgroup;
1942 1932
1943 if (cgroup_scan_tasks(&scan.scan)) 1933 if (cgroup_scan_tasks(&scan))
1944 printk(KERN_ERR "move_member_tasks_to_cpuset: " 1934 printk(KERN_ERR "move_member_tasks_to_cpuset: "
1945 "cgroup_scan_tasks failed\n"); 1935 "cgroup_scan_tasks failed\n");
1946} 1936}
@@ -2033,7 +2023,7 @@ static void scan_for_empty_cpusets(struct cpuset *root)
2033 remove_tasks_in_empty_cpuset(cp); 2023 remove_tasks_in_empty_cpuset(cp);
2034 else { 2024 else {
2035 update_tasks_cpumask(cp, NULL); 2025 update_tasks_cpumask(cp, NULL);
2036 update_tasks_nodemask(cp, &oldmems); 2026 update_tasks_nodemask(cp, &oldmems, NULL);
2037 } 2027 }
2038 } 2028 }
2039} 2029}
@@ -2069,7 +2059,9 @@ static int cpuset_track_online_cpus(struct notifier_block *unused_nb,
2069 } 2059 }
2070 2060
2071 cgroup_lock(); 2061 cgroup_lock();
2062 mutex_lock(&callback_mutex);
2072 cpumask_copy(top_cpuset.cpus_allowed, cpu_online_mask); 2063 cpumask_copy(top_cpuset.cpus_allowed, cpu_online_mask);
2064 mutex_unlock(&callback_mutex);
2073 scan_for_empty_cpusets(&top_cpuset); 2065 scan_for_empty_cpusets(&top_cpuset);
2074 ndoms = generate_sched_domains(&doms, &attr); 2066 ndoms = generate_sched_domains(&doms, &attr);
2075 cgroup_unlock(); 2067 cgroup_unlock();
@@ -2092,11 +2084,12 @@ static int cpuset_track_online_nodes(struct notifier_block *self,
2092 cgroup_lock(); 2084 cgroup_lock();
2093 switch (action) { 2085 switch (action) {
2094 case MEM_ONLINE: 2086 case MEM_ONLINE:
2095 top_cpuset.mems_allowed = node_states[N_HIGH_MEMORY];
2096 break;
2097 case MEM_OFFLINE: 2087 case MEM_OFFLINE:
2088 mutex_lock(&callback_mutex);
2098 top_cpuset.mems_allowed = node_states[N_HIGH_MEMORY]; 2089 top_cpuset.mems_allowed = node_states[N_HIGH_MEMORY];
2099 scan_for_empty_cpusets(&top_cpuset); 2090 mutex_unlock(&callback_mutex);
2091 if (action == MEM_OFFLINE)
2092 scan_for_empty_cpusets(&top_cpuset);
2100 break; 2093 break;
2101 default: 2094 default:
2102 break; 2095 break;
@@ -2206,26 +2199,24 @@ static const struct cpuset *nearest_hardwall_ancestor(const struct cpuset *cs)
2206} 2199}
2207 2200
2208/** 2201/**
2209 * cpuset_zone_allowed_softwall - Can we allocate on zone z's memory node? 2202 * cpuset_node_allowed_softwall - Can we allocate on a memory node?
2210 * @z: is this zone on an allowed node? 2203 * @node: is this an allowed node?
2211 * @gfp_mask: memory allocation flags 2204 * @gfp_mask: memory allocation flags
2212 * 2205 *
2213 * If we're in interrupt, yes, we can always allocate. If 2206 * If we're in interrupt, yes, we can always allocate. If __GFP_THISNODE is
2214 * __GFP_THISNODE is set, yes, we can always allocate. If zone 2207 * set, yes, we can always allocate. If node is in our task's mems_allowed,
2215 * z's node is in our tasks mems_allowed, yes. If it's not a 2208 * yes. If it's not a __GFP_HARDWALL request and this node is in the nearest
2216 * __GFP_HARDWALL request and this zone's nodes is in the nearest 2209 * hardwalled cpuset ancestor to this task's cpuset, yes. If the task has been
2217 * hardwalled cpuset ancestor to this tasks cpuset, yes. 2210 * OOM killed and has access to memory reserves as specified by the TIF_MEMDIE
2218 * If the task has been OOM killed and has access to memory reserves 2211 * flag, yes.
2219 * as specified by the TIF_MEMDIE flag, yes.
2220 * Otherwise, no. 2212 * Otherwise, no.
2221 * 2213 *
2222 * If __GFP_HARDWALL is set, cpuset_zone_allowed_softwall() 2214 * If __GFP_HARDWALL is set, cpuset_node_allowed_softwall() reduces to
2223 * reduces to cpuset_zone_allowed_hardwall(). Otherwise, 2215 * cpuset_node_allowed_hardwall(). Otherwise, cpuset_node_allowed_softwall()
2224 * cpuset_zone_allowed_softwall() might sleep, and might allow a zone 2216 * might sleep, and might allow a node from an enclosing cpuset.
2225 * from an enclosing cpuset.
2226 * 2217 *
2227 * cpuset_zone_allowed_hardwall() only handles the simpler case of 2218 * cpuset_node_allowed_hardwall() only handles the simpler case of hardwall
2228 * hardwall cpusets, and never sleeps. 2219 * cpusets, and never sleeps.
2229 * 2220 *
2230 * The __GFP_THISNODE placement logic is really handled elsewhere, 2221 * The __GFP_THISNODE placement logic is really handled elsewhere,
2231 * by forcibly using a zonelist starting at a specified node, and by 2222 * by forcibly using a zonelist starting at a specified node, and by
@@ -2264,20 +2255,17 @@ static const struct cpuset *nearest_hardwall_ancestor(const struct cpuset *cs)
2264 * GFP_USER - only nodes in current tasks mems allowed ok. 2255 * GFP_USER - only nodes in current tasks mems allowed ok.
2265 * 2256 *
2266 * Rule: 2257 * Rule:
2267 * Don't call cpuset_zone_allowed_softwall if you can't sleep, unless you 2258 * Don't call cpuset_node_allowed_softwall if you can't sleep, unless you
2268 * pass in the __GFP_HARDWALL flag set in gfp_flag, which disables 2259 * pass in the __GFP_HARDWALL flag set in gfp_flag, which disables
2269 * the code that might scan up ancestor cpusets and sleep. 2260 * the code that might scan up ancestor cpusets and sleep.
2270 */ 2261 */
2271 2262int __cpuset_node_allowed_softwall(int node, gfp_t gfp_mask)
2272int __cpuset_zone_allowed_softwall(struct zone *z, gfp_t gfp_mask)
2273{ 2263{
2274 int node; /* node that zone z is on */
2275 const struct cpuset *cs; /* current cpuset ancestors */ 2264 const struct cpuset *cs; /* current cpuset ancestors */
2276 int allowed; /* is allocation in zone z allowed? */ 2265 int allowed; /* is allocation in zone z allowed? */
2277 2266
2278 if (in_interrupt() || (gfp_mask & __GFP_THISNODE)) 2267 if (in_interrupt() || (gfp_mask & __GFP_THISNODE))
2279 return 1; 2268 return 1;
2280 node = zone_to_nid(z);
2281 might_sleep_if(!(gfp_mask & __GFP_HARDWALL)); 2269 might_sleep_if(!(gfp_mask & __GFP_HARDWALL));
2282 if (node_isset(node, current->mems_allowed)) 2270 if (node_isset(node, current->mems_allowed))
2283 return 1; 2271 return 1;
@@ -2306,15 +2294,15 @@ int __cpuset_zone_allowed_softwall(struct zone *z, gfp_t gfp_mask)
2306} 2294}
2307 2295
2308/* 2296/*
2309 * cpuset_zone_allowed_hardwall - Can we allocate on zone z's memory node? 2297 * cpuset_node_allowed_hardwall - Can we allocate on a memory node?
2310 * @z: is this zone on an allowed node? 2298 * @node: is this an allowed node?
2311 * @gfp_mask: memory allocation flags 2299 * @gfp_mask: memory allocation flags
2312 * 2300 *
2313 * If we're in interrupt, yes, we can always allocate. 2301 * If we're in interrupt, yes, we can always allocate. If __GFP_THISNODE is
2314 * If __GFP_THISNODE is set, yes, we can always allocate. If zone 2302 * set, yes, we can always allocate. If node is in our task's mems_allowed,
2315 * z's node is in our tasks mems_allowed, yes. If the task has been 2303 * yes. If the task has been OOM killed and has access to memory reserves as
2316 * OOM killed and has access to memory reserves as specified by the 2304 * specified by the TIF_MEMDIE flag, yes.
2317 * TIF_MEMDIE flag, yes. Otherwise, no. 2305 * Otherwise, no.
2318 * 2306 *
2319 * The __GFP_THISNODE placement logic is really handled elsewhere, 2307 * The __GFP_THISNODE placement logic is really handled elsewhere,
2320 * by forcibly using a zonelist starting at a specified node, and by 2308 * by forcibly using a zonelist starting at a specified node, and by
@@ -2322,20 +2310,16 @@ int __cpuset_zone_allowed_softwall(struct zone *z, gfp_t gfp_mask)
2322 * any node on the zonelist except the first. By the time any such 2310 * any node on the zonelist except the first. By the time any such
2323 * calls get to this routine, we should just shut up and say 'yes'. 2311 * calls get to this routine, we should just shut up and say 'yes'.
2324 * 2312 *
2325 * Unlike the cpuset_zone_allowed_softwall() variant, above, 2313 * Unlike the cpuset_node_allowed_softwall() variant, above,
2326 * this variant requires that the zone be in the current tasks 2314 * this variant requires that the node be in the current task's
2327 * mems_allowed or that we're in interrupt. It does not scan up the 2315 * mems_allowed or that we're in interrupt. It does not scan up the
2328 * cpuset hierarchy for the nearest enclosing mem_exclusive cpuset. 2316 * cpuset hierarchy for the nearest enclosing mem_exclusive cpuset.
2329 * It never sleeps. 2317 * It never sleeps.
2330 */ 2318 */
2331 2319int __cpuset_node_allowed_hardwall(int node, gfp_t gfp_mask)
2332int __cpuset_zone_allowed_hardwall(struct zone *z, gfp_t gfp_mask)
2333{ 2320{
2334 int node; /* node that zone z is on */
2335
2336 if (in_interrupt() || (gfp_mask & __GFP_THISNODE)) 2321 if (in_interrupt() || (gfp_mask & __GFP_THISNODE))
2337 return 1; 2322 return 1;
2338 node = zone_to_nid(z);
2339 if (node_isset(node, current->mems_allowed)) 2323 if (node_isset(node, current->mems_allowed))
2340 return 1; 2324 return 1;
2341 /* 2325 /*
diff --git a/kernel/exec_domain.c b/kernel/exec_domain.c
index 667c841c2952..c35452cadded 100644
--- a/kernel/exec_domain.c
+++ b/kernel/exec_domain.c
@@ -18,6 +18,7 @@
18#include <linux/syscalls.h> 18#include <linux/syscalls.h>
19#include <linux/sysctl.h> 19#include <linux/sysctl.h>
20#include <linux/types.h> 20#include <linux/types.h>
21#include <linux/fs_struct.h>
21 22
22 23
23static void default_handler(int, struct pt_regs *); 24static void default_handler(int, struct pt_regs *);
@@ -145,28 +146,6 @@ __set_personality(u_long personality)
145 return 0; 146 return 0;
146 } 147 }
147 148
148 if (atomic_read(&current->fs->count) != 1) {
149 struct fs_struct *fsp, *ofsp;
150
151 fsp = copy_fs_struct(current->fs);
152 if (fsp == NULL) {
153 module_put(ep->module);
154 return -ENOMEM;
155 }
156
157 task_lock(current);
158 ofsp = current->fs;
159 current->fs = fsp;
160 task_unlock(current);
161
162 put_fs_struct(ofsp);
163 }
164
165 /*
166 * At that point we are guaranteed to be the sole owner of
167 * current->fs.
168 */
169
170 current->personality = personality; 149 current->personality = personality;
171 oep = current_thread_info()->exec_domain; 150 oep = current_thread_info()->exec_domain;
172 current_thread_info()->exec_domain = ep; 151 current_thread_info()->exec_domain = ep;
diff --git a/kernel/exit.c b/kernel/exit.c
index 167e1e3ad7c6..6686ed1e4aa3 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -46,6 +46,7 @@
46#include <linux/blkdev.h> 46#include <linux/blkdev.h>
47#include <linux/task_io_accounting_ops.h> 47#include <linux/task_io_accounting_ops.h>
48#include <linux/tracehook.h> 48#include <linux/tracehook.h>
49#include <linux/fs_struct.h>
49#include <linux/init_task.h> 50#include <linux/init_task.h>
50#include <trace/sched.h> 51#include <trace/sched.h>
51 52
@@ -61,11 +62,6 @@ DEFINE_TRACE(sched_process_wait);
61 62
62static void exit_mm(struct task_struct * tsk); 63static void exit_mm(struct task_struct * tsk);
63 64
64static inline int task_detached(struct task_struct *p)
65{
66 return p->exit_signal == -1;
67}
68
69static void __unhash_process(struct task_struct *p) 65static void __unhash_process(struct task_struct *p)
70{ 66{
71 nr_threads--; 67 nr_threads--;
@@ -362,16 +358,12 @@ static void reparent_to_kthreadd(void)
362void __set_special_pids(struct pid *pid) 358void __set_special_pids(struct pid *pid)
363{ 359{
364 struct task_struct *curr = current->group_leader; 360 struct task_struct *curr = current->group_leader;
365 pid_t nr = pid_nr(pid);
366 361
367 if (task_session(curr) != pid) { 362 if (task_session(curr) != pid)
368 change_pid(curr, PIDTYPE_SID, pid); 363 change_pid(curr, PIDTYPE_SID, pid);
369 set_task_session(curr, nr); 364
370 } 365 if (task_pgrp(curr) != pid)
371 if (task_pgrp(curr) != pid) {
372 change_pid(curr, PIDTYPE_PGID, pid); 366 change_pid(curr, PIDTYPE_PGID, pid);
373 set_task_pgrp(curr, nr);
374 }
375} 367}
376 368
377static void set_special_pids(struct pid *pid) 369static void set_special_pids(struct pid *pid)
@@ -429,7 +421,6 @@ EXPORT_SYMBOL(disallow_signal);
429void daemonize(const char *name, ...) 421void daemonize(const char *name, ...)
430{ 422{
431 va_list args; 423 va_list args;
432 struct fs_struct *fs;
433 sigset_t blocked; 424 sigset_t blocked;
434 425
435 va_start(args, name); 426 va_start(args, name);
@@ -462,11 +453,7 @@ void daemonize(const char *name, ...)
462 453
463 /* Become as one with the init task */ 454 /* Become as one with the init task */
464 455
465 exit_fs(current); /* current->fs->count--; */ 456 daemonize_fs_struct();
466 fs = init_task.fs;
467 current->fs = fs;
468 atomic_inc(&fs->count);
469
470 exit_files(current); 457 exit_files(current);
471 current->files = init_task.files; 458 current->files = init_task.files;
472 atomic_inc(&current->files->count); 459 atomic_inc(&current->files->count);
@@ -565,30 +552,6 @@ void exit_files(struct task_struct *tsk)
565 } 552 }
566} 553}
567 554
568void put_fs_struct(struct fs_struct *fs)
569{
570 /* No need to hold fs->lock if we are killing it */
571 if (atomic_dec_and_test(&fs->count)) {
572 path_put(&fs->root);
573 path_put(&fs->pwd);
574 kmem_cache_free(fs_cachep, fs);
575 }
576}
577
578void exit_fs(struct task_struct *tsk)
579{
580 struct fs_struct * fs = tsk->fs;
581
582 if (fs) {
583 task_lock(tsk);
584 tsk->fs = NULL;
585 task_unlock(tsk);
586 put_fs_struct(fs);
587 }
588}
589
590EXPORT_SYMBOL_GPL(exit_fs);
591
592#ifdef CONFIG_MM_OWNER 555#ifdef CONFIG_MM_OWNER
593/* 556/*
594 * Task p is exiting and it owned mm, lets find a new owner for it 557 * Task p is exiting and it owned mm, lets find a new owner for it
@@ -732,119 +695,6 @@ static void exit_mm(struct task_struct * tsk)
732} 695}
733 696
734/* 697/*
735 * Return nonzero if @parent's children should reap themselves.
736 *
737 * Called with write_lock_irq(&tasklist_lock) held.
738 */
739static int ignoring_children(struct task_struct *parent)
740{
741 int ret;
742 struct sighand_struct *psig = parent->sighand;
743 unsigned long flags;
744 spin_lock_irqsave(&psig->siglock, flags);
745 ret = (psig->action[SIGCHLD-1].sa.sa_handler == SIG_IGN ||
746 (psig->action[SIGCHLD-1].sa.sa_flags & SA_NOCLDWAIT));
747 spin_unlock_irqrestore(&psig->siglock, flags);
748 return ret;
749}
750
751/*
752 * Detach all tasks we were using ptrace on.
753 * Any that need to be release_task'd are put on the @dead list.
754 *
755 * Called with write_lock(&tasklist_lock) held.
756 */
757static void ptrace_exit(struct task_struct *parent, struct list_head *dead)
758{
759 struct task_struct *p, *n;
760 int ign = -1;
761
762 list_for_each_entry_safe(p, n, &parent->ptraced, ptrace_entry) {
763 __ptrace_unlink(p);
764
765 if (p->exit_state != EXIT_ZOMBIE)
766 continue;
767
768 /*
769 * If it's a zombie, our attachedness prevented normal
770 * parent notification or self-reaping. Do notification
771 * now if it would have happened earlier. If it should
772 * reap itself, add it to the @dead list. We can't call
773 * release_task() here because we already hold tasklist_lock.
774 *
775 * If it's our own child, there is no notification to do.
776 * But if our normal children self-reap, then this child
777 * was prevented by ptrace and we must reap it now.
778 */
779 if (!task_detached(p) && thread_group_empty(p)) {
780 if (!same_thread_group(p->real_parent, parent))
781 do_notify_parent(p, p->exit_signal);
782 else {
783 if (ign < 0)
784 ign = ignoring_children(parent);
785 if (ign)
786 p->exit_signal = -1;
787 }
788 }
789
790 if (task_detached(p)) {
791 /*
792 * Mark it as in the process of being reaped.
793 */
794 p->exit_state = EXIT_DEAD;
795 list_add(&p->ptrace_entry, dead);
796 }
797 }
798}
799
800/*
801 * Finish up exit-time ptrace cleanup.
802 *
803 * Called without locks.
804 */
805static void ptrace_exit_finish(struct task_struct *parent,
806 struct list_head *dead)
807{
808 struct task_struct *p, *n;
809
810 BUG_ON(!list_empty(&parent->ptraced));
811
812 list_for_each_entry_safe(p, n, dead, ptrace_entry) {
813 list_del_init(&p->ptrace_entry);
814 release_task(p);
815 }
816}
817
818static void reparent_thread(struct task_struct *p, struct task_struct *father)
819{
820 if (p->pdeath_signal)
821 /* We already hold the tasklist_lock here. */
822 group_send_sig_info(p->pdeath_signal, SEND_SIG_NOINFO, p);
823
824 list_move_tail(&p->sibling, &p->real_parent->children);
825
826 /* If this is a threaded reparent there is no need to
827 * notify anyone anything has happened.
828 */
829 if (same_thread_group(p->real_parent, father))
830 return;
831
832 /* We don't want people slaying init. */
833 if (!task_detached(p))
834 p->exit_signal = SIGCHLD;
835
836 /* If we'd notified the old parent about this child's death,
837 * also notify the new parent.
838 */
839 if (!ptrace_reparented(p) &&
840 p->exit_state == EXIT_ZOMBIE &&
841 !task_detached(p) && thread_group_empty(p))
842 do_notify_parent(p, p->exit_signal);
843
844 kill_orphaned_pgrp(p, father);
845}
846
847/*
848 * When we die, we re-parent all our children. 698 * When we die, we re-parent all our children.
849 * Try to give them to another thread in our thread 699 * Try to give them to another thread in our thread
850 * group, and if no such member exists, give it to 700 * group, and if no such member exists, give it to
@@ -883,17 +733,51 @@ static struct task_struct *find_new_reaper(struct task_struct *father)
883 return pid_ns->child_reaper; 733 return pid_ns->child_reaper;
884} 734}
885 735
736/*
737* Any that need to be release_task'd are put on the @dead list.
738 */
739static void reparent_thread(struct task_struct *father, struct task_struct *p,
740 struct list_head *dead)
741{
742 if (p->pdeath_signal)
743 group_send_sig_info(p->pdeath_signal, SEND_SIG_NOINFO, p);
744
745 list_move_tail(&p->sibling, &p->real_parent->children);
746
747 if (task_detached(p))
748 return;
749 /*
750 * If this is a threaded reparent there is no need to
751 * notify anyone anything has happened.
752 */
753 if (same_thread_group(p->real_parent, father))
754 return;
755
756 /* We don't want people slaying init. */
757 p->exit_signal = SIGCHLD;
758
759 /* If it has exited notify the new parent about this child's death. */
760 if (!p->ptrace &&
761 p->exit_state == EXIT_ZOMBIE && thread_group_empty(p)) {
762 do_notify_parent(p, p->exit_signal);
763 if (task_detached(p)) {
764 p->exit_state = EXIT_DEAD;
765 list_move_tail(&p->sibling, dead);
766 }
767 }
768
769 kill_orphaned_pgrp(p, father);
770}
771
886static void forget_original_parent(struct task_struct *father) 772static void forget_original_parent(struct task_struct *father)
887{ 773{
888 struct task_struct *p, *n, *reaper; 774 struct task_struct *p, *n, *reaper;
889 LIST_HEAD(ptrace_dead); 775 LIST_HEAD(dead_children);
776
777 exit_ptrace(father);
890 778
891 write_lock_irq(&tasklist_lock); 779 write_lock_irq(&tasklist_lock);
892 reaper = find_new_reaper(father); 780 reaper = find_new_reaper(father);
893 /*
894 * First clean up ptrace if we were using it.
895 */
896 ptrace_exit(father, &ptrace_dead);
897 781
898 list_for_each_entry_safe(p, n, &father->children, sibling) { 782 list_for_each_entry_safe(p, n, &father->children, sibling) {
899 p->real_parent = reaper; 783 p->real_parent = reaper;
@@ -901,13 +785,16 @@ static void forget_original_parent(struct task_struct *father)
901 BUG_ON(p->ptrace); 785 BUG_ON(p->ptrace);
902 p->parent = p->real_parent; 786 p->parent = p->real_parent;
903 } 787 }
904 reparent_thread(p, father); 788 reparent_thread(father, p, &dead_children);
905 } 789 }
906
907 write_unlock_irq(&tasklist_lock); 790 write_unlock_irq(&tasklist_lock);
791
908 BUG_ON(!list_empty(&father->children)); 792 BUG_ON(!list_empty(&father->children));
909 793
910 ptrace_exit_finish(father, &ptrace_dead); 794 list_for_each_entry_safe(p, n, &dead_children, sibling) {
795 list_del_init(&p->sibling);
796 release_task(p);
797 }
911} 798}
912 799
913/* 800/*
@@ -1417,6 +1304,18 @@ static int wait_task_zombie(struct task_struct *p, int options,
1417 return retval; 1304 return retval;
1418} 1305}
1419 1306
1307static int *task_stopped_code(struct task_struct *p, bool ptrace)
1308{
1309 if (ptrace) {
1310 if (task_is_stopped_or_traced(p))
1311 return &p->exit_code;
1312 } else {
1313 if (p->signal->flags & SIGNAL_STOP_STOPPED)
1314 return &p->signal->group_exit_code;
1315 }
1316 return NULL;
1317}
1318
1420/* 1319/*
1421 * Handle sys_wait4 work for one task in state TASK_STOPPED. We hold 1320 * Handle sys_wait4 work for one task in state TASK_STOPPED. We hold
1422 * read_lock(&tasklist_lock) on entry. If we return zero, we still hold 1321 * read_lock(&tasklist_lock) on entry. If we return zero, we still hold
@@ -1427,7 +1326,7 @@ static int wait_task_stopped(int ptrace, struct task_struct *p,
1427 int options, struct siginfo __user *infop, 1326 int options, struct siginfo __user *infop,
1428 int __user *stat_addr, struct rusage __user *ru) 1327 int __user *stat_addr, struct rusage __user *ru)
1429{ 1328{
1430 int retval, exit_code, why; 1329 int retval, exit_code, *p_code, why;
1431 uid_t uid = 0; /* unneeded, required by compiler */ 1330 uid_t uid = 0; /* unneeded, required by compiler */
1432 pid_t pid; 1331 pid_t pid;
1433 1332
@@ -1437,22 +1336,16 @@ static int wait_task_stopped(int ptrace, struct task_struct *p,
1437 exit_code = 0; 1336 exit_code = 0;
1438 spin_lock_irq(&p->sighand->siglock); 1337 spin_lock_irq(&p->sighand->siglock);
1439 1338
1440 if (unlikely(!task_is_stopped_or_traced(p))) 1339 p_code = task_stopped_code(p, ptrace);
1441 goto unlock_sig; 1340 if (unlikely(!p_code))
1442
1443 if (!ptrace && p->signal->group_stop_count > 0)
1444 /*
1445 * A group stop is in progress and this is the group leader.
1446 * We won't report until all threads have stopped.
1447 */
1448 goto unlock_sig; 1341 goto unlock_sig;
1449 1342
1450 exit_code = p->exit_code; 1343 exit_code = *p_code;
1451 if (!exit_code) 1344 if (!exit_code)
1452 goto unlock_sig; 1345 goto unlock_sig;
1453 1346
1454 if (!unlikely(options & WNOWAIT)) 1347 if (!unlikely(options & WNOWAIT))
1455 p->exit_code = 0; 1348 *p_code = 0;
1456 1349
1457 /* don't need the RCU readlock here as we're holding a spinlock */ 1350 /* don't need the RCU readlock here as we're holding a spinlock */
1458 uid = __task_cred(p)->uid; 1351 uid = __task_cred(p)->uid;
@@ -1608,7 +1501,7 @@ static int wait_consider_task(struct task_struct *parent, int ptrace,
1608 */ 1501 */
1609 *notask_error = 0; 1502 *notask_error = 0;
1610 1503
1611 if (task_is_stopped_or_traced(p)) 1504 if (task_stopped_code(p, ptrace))
1612 return wait_task_stopped(ptrace, p, options, 1505 return wait_task_stopped(ptrace, p, options,
1613 infop, stat_addr, ru); 1506 infop, stat_addr, ru);
1614 1507
@@ -1812,7 +1705,7 @@ SYSCALL_DEFINE4(wait4, pid_t, upid, int __user *, stat_addr,
1812 pid = find_get_pid(-upid); 1705 pid = find_get_pid(-upid);
1813 } else if (upid == 0) { 1706 } else if (upid == 0) {
1814 type = PIDTYPE_PGID; 1707 type = PIDTYPE_PGID;
1815 pid = get_pid(task_pgrp(current)); 1708 pid = get_task_pid(current, PIDTYPE_PGID);
1816 } else /* upid > 0 */ { 1709 } else /* upid > 0 */ {
1817 type = PIDTYPE_PID; 1710 type = PIDTYPE_PID;
1818 pid = find_get_pid(upid); 1711 pid = find_get_pid(upid);
diff --git a/kernel/fork.c b/kernel/fork.c
index 6715ebc3761d..660c2b8765bc 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -60,6 +60,7 @@
60#include <linux/tty.h> 60#include <linux/tty.h>
61#include <linux/proc_fs.h> 61#include <linux/proc_fs.h>
62#include <linux/blkdev.h> 62#include <linux/blkdev.h>
63#include <linux/fs_struct.h>
63#include <trace/sched.h> 64#include <trace/sched.h>
64#include <linux/magic.h> 65#include <linux/magic.h>
65 66
@@ -284,7 +285,7 @@ static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm)
284 mm->free_area_cache = oldmm->mmap_base; 285 mm->free_area_cache = oldmm->mmap_base;
285 mm->cached_hole_size = ~0UL; 286 mm->cached_hole_size = ~0UL;
286 mm->map_count = 0; 287 mm->map_count = 0;
287 cpus_clear(mm->cpu_vm_mask); 288 cpumask_clear(mm_cpumask(mm));
288 mm->mm_rb = RB_ROOT; 289 mm->mm_rb = RB_ROOT;
289 rb_link = &mm->mm_rb.rb_node; 290 rb_link = &mm->mm_rb.rb_node;
290 rb_parent = NULL; 291 rb_parent = NULL;
@@ -681,38 +682,21 @@ fail_nomem:
681 return retval; 682 return retval;
682} 683}
683 684
684static struct fs_struct *__copy_fs_struct(struct fs_struct *old)
685{
686 struct fs_struct *fs = kmem_cache_alloc(fs_cachep, GFP_KERNEL);
687 /* We don't need to lock fs - think why ;-) */
688 if (fs) {
689 atomic_set(&fs->count, 1);
690 rwlock_init(&fs->lock);
691 fs->umask = old->umask;
692 read_lock(&old->lock);
693 fs->root = old->root;
694 path_get(&old->root);
695 fs->pwd = old->pwd;
696 path_get(&old->pwd);
697 read_unlock(&old->lock);
698 }
699 return fs;
700}
701
702struct fs_struct *copy_fs_struct(struct fs_struct *old)
703{
704 return __copy_fs_struct(old);
705}
706
707EXPORT_SYMBOL_GPL(copy_fs_struct);
708
709static int copy_fs(unsigned long clone_flags, struct task_struct *tsk) 685static int copy_fs(unsigned long clone_flags, struct task_struct *tsk)
710{ 686{
687 struct fs_struct *fs = current->fs;
711 if (clone_flags & CLONE_FS) { 688 if (clone_flags & CLONE_FS) {
712 atomic_inc(&current->fs->count); 689 /* tsk->fs is already what we want */
690 write_lock(&fs->lock);
691 if (fs->in_exec) {
692 write_unlock(&fs->lock);
693 return -EAGAIN;
694 }
695 fs->users++;
696 write_unlock(&fs->lock);
713 return 0; 697 return 0;
714 } 698 }
715 tsk->fs = __copy_fs_struct(current->fs); 699 tsk->fs = copy_fs_struct(fs);
716 if (!tsk->fs) 700 if (!tsk->fs)
717 return -ENOMEM; 701 return -ENOMEM;
718 return 0; 702 return 0;
@@ -841,6 +825,8 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk)
841 atomic_set(&sig->live, 1); 825 atomic_set(&sig->live, 1);
842 init_waitqueue_head(&sig->wait_chldexit); 826 init_waitqueue_head(&sig->wait_chldexit);
843 sig->flags = 0; 827 sig->flags = 0;
828 if (clone_flags & CLONE_NEWPID)
829 sig->flags |= SIGNAL_UNKILLABLE;
844 sig->group_exit_code = 0; 830 sig->group_exit_code = 0;
845 sig->group_exit_task = NULL; 831 sig->group_exit_task = NULL;
846 sig->group_stop_count = 0; 832 sig->group_stop_count = 0;
@@ -1125,7 +1111,7 @@ static struct task_struct *copy_process(unsigned long clone_flags,
1125 goto bad_fork_cleanup_mm; 1111 goto bad_fork_cleanup_mm;
1126 if ((retval = copy_io(clone_flags, p))) 1112 if ((retval = copy_io(clone_flags, p)))
1127 goto bad_fork_cleanup_namespaces; 1113 goto bad_fork_cleanup_namespaces;
1128 retval = copy_thread(0, clone_flags, stack_start, stack_size, p, regs); 1114 retval = copy_thread(clone_flags, stack_start, stack_size, p, regs);
1129 if (retval) 1115 if (retval)
1130 goto bad_fork_cleanup_io; 1116 goto bad_fork_cleanup_io;
1131 1117
@@ -1263,8 +1249,6 @@ static struct task_struct *copy_process(unsigned long clone_flags,
1263 p->signal->leader_pid = pid; 1249 p->signal->leader_pid = pid;
1264 tty_kref_put(p->signal->tty); 1250 tty_kref_put(p->signal->tty);
1265 p->signal->tty = tty_kref_get(current->signal->tty); 1251 p->signal->tty = tty_kref_get(current->signal->tty);
1266 set_task_pgrp(p, task_pgrp_nr(current));
1267 set_task_session(p, task_session_nr(current));
1268 attach_pid(p, PIDTYPE_PGID, task_pgrp(current)); 1252 attach_pid(p, PIDTYPE_PGID, task_pgrp(current));
1269 attach_pid(p, PIDTYPE_SID, task_session(current)); 1253 attach_pid(p, PIDTYPE_SID, task_session(current));
1270 list_add_tail_rcu(&p->tasks, &init_task.tasks); 1254 list_add_tail_rcu(&p->tasks, &init_task.tasks);
@@ -1488,6 +1472,7 @@ void __init proc_caches_init(void)
1488 mm_cachep = kmem_cache_create("mm_struct", 1472 mm_cachep = kmem_cache_create("mm_struct",
1489 sizeof(struct mm_struct), ARCH_MIN_MMSTRUCT_ALIGN, 1473 sizeof(struct mm_struct), ARCH_MIN_MMSTRUCT_ALIGN,
1490 SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL); 1474 SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);
1475 vm_area_cachep = KMEM_CACHE(vm_area_struct, SLAB_PANIC);
1491 mmap_init(); 1476 mmap_init();
1492} 1477}
1493 1478
@@ -1543,12 +1528,16 @@ static int unshare_fs(unsigned long unshare_flags, struct fs_struct **new_fsp)
1543{ 1528{
1544 struct fs_struct *fs = current->fs; 1529 struct fs_struct *fs = current->fs;
1545 1530
1546 if ((unshare_flags & CLONE_FS) && 1531 if (!(unshare_flags & CLONE_FS) || !fs)
1547 (fs && atomic_read(&fs->count) > 1)) { 1532 return 0;
1548 *new_fsp = __copy_fs_struct(current->fs); 1533
1549 if (!*new_fsp) 1534 /* don't need lock here; in the worst case we'll do useless copy */
1550 return -ENOMEM; 1535 if (fs->users == 1)
1551 } 1536 return 0;
1537
1538 *new_fsp = copy_fs_struct(fs);
1539 if (!*new_fsp)
1540 return -ENOMEM;
1552 1541
1553 return 0; 1542 return 0;
1554} 1543}
@@ -1664,8 +1653,13 @@ SYSCALL_DEFINE1(unshare, unsigned long, unshare_flags)
1664 1653
1665 if (new_fs) { 1654 if (new_fs) {
1666 fs = current->fs; 1655 fs = current->fs;
1656 write_lock(&fs->lock);
1667 current->fs = new_fs; 1657 current->fs = new_fs;
1668 new_fs = fs; 1658 if (--fs->users)
1659 new_fs = NULL;
1660 else
1661 new_fs = fs;
1662 write_unlock(&fs->lock);
1669 } 1663 }
1670 1664
1671 if (new_mm) { 1665 if (new_mm) {
@@ -1704,7 +1698,7 @@ bad_unshare_cleanup_sigh:
1704 1698
1705bad_unshare_cleanup_fs: 1699bad_unshare_cleanup_fs:
1706 if (new_fs) 1700 if (new_fs)
1707 put_fs_struct(new_fs); 1701 free_fs_struct(new_fs);
1708 1702
1709bad_unshare_cleanup_thread: 1703bad_unshare_cleanup_thread:
1710bad_unshare_out: 1704bad_unshare_out:
diff --git a/kernel/irq/Makefile b/kernel/irq/Makefile
index 4dd5b1edac98..3394f8f52964 100644
--- a/kernel/irq/Makefile
+++ b/kernel/irq/Makefile
@@ -4,3 +4,4 @@ obj-$(CONFIG_GENERIC_IRQ_PROBE) += autoprobe.o
4obj-$(CONFIG_PROC_FS) += proc.o 4obj-$(CONFIG_PROC_FS) += proc.o
5obj-$(CONFIG_GENERIC_PENDING_IRQ) += migration.o 5obj-$(CONFIG_GENERIC_PENDING_IRQ) += migration.o
6obj-$(CONFIG_NUMA_MIGRATE_IRQ_DESC) += numa_migrate.o 6obj-$(CONFIG_NUMA_MIGRATE_IRQ_DESC) += numa_migrate.o
7obj-$(CONFIG_PM_SLEEP) += pm.o
diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h
index ee1aa9f8e8b9..01ce20eab38f 100644
--- a/kernel/irq/internals.h
+++ b/kernel/irq/internals.h
@@ -12,6 +12,8 @@ extern void compat_irq_chip_set_default_handler(struct irq_desc *desc);
12 12
13extern int __irq_set_trigger(struct irq_desc *desc, unsigned int irq, 13extern int __irq_set_trigger(struct irq_desc *desc, unsigned int irq,
14 unsigned long flags); 14 unsigned long flags);
15extern void __disable_irq(struct irq_desc *desc, unsigned int irq, bool susp);
16extern void __enable_irq(struct irq_desc *desc, unsigned int irq, bool resume);
15 17
16extern struct lock_class_key irq_desc_lock_class; 18extern struct lock_class_key irq_desc_lock_class;
17extern void init_kstat_irqs(struct irq_desc *desc, int cpu, int nr); 19extern void init_kstat_irqs(struct irq_desc *desc, int cpu, int nr);
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index 6458e99984c0..1516ab77355c 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -162,6 +162,20 @@ static inline int setup_affinity(unsigned int irq, struct irq_desc *desc)
162} 162}
163#endif 163#endif
164 164
165void __disable_irq(struct irq_desc *desc, unsigned int irq, bool suspend)
166{
167 if (suspend) {
168 if (!desc->action || (desc->action->flags & IRQF_TIMER))
169 return;
170 desc->status |= IRQ_SUSPENDED;
171 }
172
173 if (!desc->depth++) {
174 desc->status |= IRQ_DISABLED;
175 desc->chip->disable(irq);
176 }
177}
178
165/** 179/**
166 * disable_irq_nosync - disable an irq without waiting 180 * disable_irq_nosync - disable an irq without waiting
167 * @irq: Interrupt to disable 181 * @irq: Interrupt to disable
@@ -182,10 +196,7 @@ void disable_irq_nosync(unsigned int irq)
182 return; 196 return;
183 197
184 spin_lock_irqsave(&desc->lock, flags); 198 spin_lock_irqsave(&desc->lock, flags);
185 if (!desc->depth++) { 199 __disable_irq(desc, irq, false);
186 desc->status |= IRQ_DISABLED;
187 desc->chip->disable(irq);
188 }
189 spin_unlock_irqrestore(&desc->lock, flags); 200 spin_unlock_irqrestore(&desc->lock, flags);
190} 201}
191EXPORT_SYMBOL(disable_irq_nosync); 202EXPORT_SYMBOL(disable_irq_nosync);
@@ -215,15 +226,21 @@ void disable_irq(unsigned int irq)
215} 226}
216EXPORT_SYMBOL(disable_irq); 227EXPORT_SYMBOL(disable_irq);
217 228
218static void __enable_irq(struct irq_desc *desc, unsigned int irq) 229void __enable_irq(struct irq_desc *desc, unsigned int irq, bool resume)
219{ 230{
231 if (resume)
232 desc->status &= ~IRQ_SUSPENDED;
233
220 switch (desc->depth) { 234 switch (desc->depth) {
221 case 0: 235 case 0:
236 err_out:
222 WARN(1, KERN_WARNING "Unbalanced enable for IRQ %d\n", irq); 237 WARN(1, KERN_WARNING "Unbalanced enable for IRQ %d\n", irq);
223 break; 238 break;
224 case 1: { 239 case 1: {
225 unsigned int status = desc->status & ~IRQ_DISABLED; 240 unsigned int status = desc->status & ~IRQ_DISABLED;
226 241
242 if (desc->status & IRQ_SUSPENDED)
243 goto err_out;
227 /* Prevent probing on this irq: */ 244 /* Prevent probing on this irq: */
228 desc->status = status | IRQ_NOPROBE; 245 desc->status = status | IRQ_NOPROBE;
229 check_irq_resend(desc, irq); 246 check_irq_resend(desc, irq);
@@ -253,7 +270,7 @@ void enable_irq(unsigned int irq)
253 return; 270 return;
254 271
255 spin_lock_irqsave(&desc->lock, flags); 272 spin_lock_irqsave(&desc->lock, flags);
256 __enable_irq(desc, irq); 273 __enable_irq(desc, irq, false);
257 spin_unlock_irqrestore(&desc->lock, flags); 274 spin_unlock_irqrestore(&desc->lock, flags);
258} 275}
259EXPORT_SYMBOL(enable_irq); 276EXPORT_SYMBOL(enable_irq);
@@ -511,7 +528,7 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new)
511 */ 528 */
512 if (shared && (desc->status & IRQ_SPURIOUS_DISABLED)) { 529 if (shared && (desc->status & IRQ_SPURIOUS_DISABLED)) {
513 desc->status &= ~IRQ_SPURIOUS_DISABLED; 530 desc->status &= ~IRQ_SPURIOUS_DISABLED;
514 __enable_irq(desc, irq); 531 __enable_irq(desc, irq, false);
515 } 532 }
516 533
517 spin_unlock_irqrestore(&desc->lock, flags); 534 spin_unlock_irqrestore(&desc->lock, flags);
diff --git a/kernel/irq/pm.c b/kernel/irq/pm.c
new file mode 100644
index 000000000000..638d8bedec14
--- /dev/null
+++ b/kernel/irq/pm.c
@@ -0,0 +1,79 @@
1/*
2 * linux/kernel/irq/pm.c
3 *
4 * Copyright (C) 2009 Rafael J. Wysocki <rjw@sisk.pl>, Novell Inc.
5 *
6 * This file contains power management functions related to interrupts.
7 */
8
9#include <linux/irq.h>
10#include <linux/module.h>
11#include <linux/interrupt.h>
12
13#include "internals.h"
14
15/**
16 * suspend_device_irqs - disable all currently enabled interrupt lines
17 *
18 * During system-wide suspend or hibernation device interrupts need to be
19 * disabled at the chip level and this function is provided for this purpose.
20 * It disables all interrupt lines that are enabled at the moment and sets the
21 * IRQ_SUSPENDED flag for them.
22 */
23void suspend_device_irqs(void)
24{
25 struct irq_desc *desc;
26 int irq;
27
28 for_each_irq_desc(irq, desc) {
29 unsigned long flags;
30
31 spin_lock_irqsave(&desc->lock, flags);
32 __disable_irq(desc, irq, true);
33 spin_unlock_irqrestore(&desc->lock, flags);
34 }
35
36 for_each_irq_desc(irq, desc)
37 if (desc->status & IRQ_SUSPENDED)
38 synchronize_irq(irq);
39}
40EXPORT_SYMBOL_GPL(suspend_device_irqs);
41
42/**
43 * resume_device_irqs - enable interrupt lines disabled by suspend_device_irqs()
44 *
45 * Enable all interrupt lines previously disabled by suspend_device_irqs() that
46 * have the IRQ_SUSPENDED flag set.
47 */
48void resume_device_irqs(void)
49{
50 struct irq_desc *desc;
51 int irq;
52
53 for_each_irq_desc(irq, desc) {
54 unsigned long flags;
55
56 if (!(desc->status & IRQ_SUSPENDED))
57 continue;
58
59 spin_lock_irqsave(&desc->lock, flags);
60 __enable_irq(desc, irq, true);
61 spin_unlock_irqrestore(&desc->lock, flags);
62 }
63}
64EXPORT_SYMBOL_GPL(resume_device_irqs);
65
66/**
67 * check_wakeup_irqs - check if any wake-up interrupts are pending
68 */
69int check_wakeup_irqs(void)
70{
71 struct irq_desc *desc;
72 int irq;
73
74 for_each_irq_desc(irq, desc)
75 if ((desc->status & IRQ_WAKEUP) && (desc->status & IRQ_PENDING))
76 return -EBUSY;
77
78 return 0;
79}
diff --git a/kernel/kexec.c b/kernel/kexec.c
index c7fd6692939d..5a758c6e4950 100644
--- a/kernel/kexec.c
+++ b/kernel/kexec.c
@@ -42,7 +42,7 @@
42note_buf_t* crash_notes; 42note_buf_t* crash_notes;
43 43
44/* vmcoreinfo stuff */ 44/* vmcoreinfo stuff */
45unsigned char vmcoreinfo_data[VMCOREINFO_BYTES]; 45static unsigned char vmcoreinfo_data[VMCOREINFO_BYTES];
46u32 vmcoreinfo_note[VMCOREINFO_NOTE_SIZE/4]; 46u32 vmcoreinfo_note[VMCOREINFO_NOTE_SIZE/4];
47size_t vmcoreinfo_size; 47size_t vmcoreinfo_size;
48size_t vmcoreinfo_max_size = sizeof(vmcoreinfo_data); 48size_t vmcoreinfo_max_size = sizeof(vmcoreinfo_data);
@@ -1409,6 +1409,7 @@ static int __init crash_save_vmcoreinfo_init(void)
1409 VMCOREINFO_OFFSET(list_head, prev); 1409 VMCOREINFO_OFFSET(list_head, prev);
1410 VMCOREINFO_OFFSET(vm_struct, addr); 1410 VMCOREINFO_OFFSET(vm_struct, addr);
1411 VMCOREINFO_LENGTH(zone.free_area, MAX_ORDER); 1411 VMCOREINFO_LENGTH(zone.free_area, MAX_ORDER);
1412 log_buf_kexec_setup();
1412 VMCOREINFO_LENGTH(free_area.free_list, MIGRATE_TYPES); 1413 VMCOREINFO_LENGTH(free_area.free_list, MIGRATE_TYPES);
1413 VMCOREINFO_NUMBER(NR_FREE_PAGES); 1414 VMCOREINFO_NUMBER(NR_FREE_PAGES);
1414 VMCOREINFO_NUMBER(PG_lru); 1415 VMCOREINFO_NUMBER(PG_lru);
@@ -1450,11 +1451,7 @@ int kernel_kexec(void)
1450 error = device_suspend(PMSG_FREEZE); 1451 error = device_suspend(PMSG_FREEZE);
1451 if (error) 1452 if (error)
1452 goto Resume_console; 1453 goto Resume_console;
1453 error = disable_nonboot_cpus();
1454 if (error)
1455 goto Resume_devices;
1456 device_pm_lock(); 1454 device_pm_lock();
1457 local_irq_disable();
1458 /* At this point, device_suspend() has been called, 1455 /* At this point, device_suspend() has been called,
1459 * but *not* device_power_down(). We *must* 1456 * but *not* device_power_down(). We *must*
1460 * device_power_down() now. Otherwise, drivers for 1457 * device_power_down() now. Otherwise, drivers for
@@ -1464,12 +1461,15 @@ int kernel_kexec(void)
1464 */ 1461 */
1465 error = device_power_down(PMSG_FREEZE); 1462 error = device_power_down(PMSG_FREEZE);
1466 if (error) 1463 if (error)
1467 goto Enable_irqs; 1464 goto Resume_devices;
1468 1465 error = disable_nonboot_cpus();
1466 if (error)
1467 goto Enable_cpus;
1468 local_irq_disable();
1469 /* Suspend system devices */ 1469 /* Suspend system devices */
1470 error = sysdev_suspend(PMSG_FREEZE); 1470 error = sysdev_suspend(PMSG_FREEZE);
1471 if (error) 1471 if (error)
1472 goto Power_up_devices; 1472 goto Enable_irqs;
1473 } else 1473 } else
1474#endif 1474#endif
1475 { 1475 {
@@ -1483,13 +1483,13 @@ int kernel_kexec(void)
1483#ifdef CONFIG_KEXEC_JUMP 1483#ifdef CONFIG_KEXEC_JUMP
1484 if (kexec_image->preserve_context) { 1484 if (kexec_image->preserve_context) {
1485 sysdev_resume(); 1485 sysdev_resume();
1486 Power_up_devices:
1487 device_power_up(PMSG_RESTORE);
1488 Enable_irqs: 1486 Enable_irqs:
1489 local_irq_enable(); 1487 local_irq_enable();
1490 device_pm_unlock(); 1488 Enable_cpus:
1491 enable_nonboot_cpus(); 1489 enable_nonboot_cpus();
1490 device_power_up(PMSG_RESTORE);
1492 Resume_devices: 1491 Resume_devices:
1492 device_pm_unlock();
1493 device_resume(PMSG_RESTORE); 1493 device_resume(PMSG_RESTORE);
1494 Resume_console: 1494 Resume_console:
1495 resume_console(); 1495 resume_console();
diff --git a/kernel/kmod.c b/kernel/kmod.c
index a27a5f64443d..f0c8f545180d 100644
--- a/kernel/kmod.c
+++ b/kernel/kmod.c
@@ -167,7 +167,7 @@ static int ____call_usermodehelper(void *data)
167 } 167 }
168 168
169 /* We can run anywhere, unlike our parent keventd(). */ 169 /* We can run anywhere, unlike our parent keventd(). */
170 set_cpus_allowed_ptr(current, CPU_MASK_ALL_PTR); 170 set_cpus_allowed_ptr(current, cpu_all_mask);
171 171
172 /* 172 /*
173 * Our parent is keventd, which runs with elevated scheduling priority. 173 * Our parent is keventd, which runs with elevated scheduling priority.
diff --git a/kernel/kthread.c b/kernel/kthread.c
index 4fbc456f393d..84bbadd4d021 100644
--- a/kernel/kthread.c
+++ b/kernel/kthread.c
@@ -110,7 +110,7 @@ static void create_kthread(struct kthread_create_info *create)
110 */ 110 */
111 sched_setscheduler(create->result, SCHED_NORMAL, &param); 111 sched_setscheduler(create->result, SCHED_NORMAL, &param);
112 set_user_nice(create->result, KTHREAD_NICE_LEVEL); 112 set_user_nice(create->result, KTHREAD_NICE_LEVEL);
113 set_cpus_allowed_ptr(create->result, CPU_MASK_ALL_PTR); 113 set_cpus_allowed_ptr(create->result, cpu_all_mask);
114 } 114 }
115 complete(&create->done); 115 complete(&create->done);
116} 116}
@@ -240,7 +240,7 @@ int kthreadd(void *unused)
240 set_task_comm(tsk, "kthreadd"); 240 set_task_comm(tsk, "kthreadd");
241 ignore_signals(tsk); 241 ignore_signals(tsk);
242 set_user_nice(tsk, KTHREAD_NICE_LEVEL); 242 set_user_nice(tsk, KTHREAD_NICE_LEVEL);
243 set_cpus_allowed_ptr(tsk, CPU_MASK_ALL_PTR); 243 set_cpus_allowed_ptr(tsk, cpu_all_mask);
244 244
245 current->flags |= PF_NOFREEZE | PF_FREEZER_NOSIG; 245 current->flags |= PF_NOFREEZE | PF_FREEZER_NOSIG;
246 246
diff --git a/kernel/lockdep.c b/kernel/lockdep.c
index 06b0c3568f0b..3673a3f44d9d 100644
--- a/kernel/lockdep.c
+++ b/kernel/lockdep.c
@@ -41,6 +41,7 @@
41#include <linux/utsname.h> 41#include <linux/utsname.h>
42#include <linux/hash.h> 42#include <linux/hash.h>
43#include <linux/ftrace.h> 43#include <linux/ftrace.h>
44#include <linux/stringify.h>
44 45
45#include <asm/sections.h> 46#include <asm/sections.h>
46 47
@@ -310,12 +311,14 @@ EXPORT_SYMBOL(lockdep_on);
310#if VERBOSE 311#if VERBOSE
311# define HARDIRQ_VERBOSE 1 312# define HARDIRQ_VERBOSE 1
312# define SOFTIRQ_VERBOSE 1 313# define SOFTIRQ_VERBOSE 1
314# define RECLAIM_VERBOSE 1
313#else 315#else
314# define HARDIRQ_VERBOSE 0 316# define HARDIRQ_VERBOSE 0
315# define SOFTIRQ_VERBOSE 0 317# define SOFTIRQ_VERBOSE 0
318# define RECLAIM_VERBOSE 0
316#endif 319#endif
317 320
318#if VERBOSE || HARDIRQ_VERBOSE || SOFTIRQ_VERBOSE 321#if VERBOSE || HARDIRQ_VERBOSE || SOFTIRQ_VERBOSE || RECLAIM_VERBOSE
319/* 322/*
320 * Quick filtering for interesting events: 323 * Quick filtering for interesting events:
321 */ 324 */
@@ -443,17 +446,18 @@ atomic_t nr_find_usage_backwards_recursions;
443 * Locking printouts: 446 * Locking printouts:
444 */ 447 */
445 448
449#define __USAGE(__STATE) \
450 [LOCK_USED_IN_##__STATE] = "IN-"__stringify(__STATE)"-W", \
451 [LOCK_ENABLED_##__STATE] = __stringify(__STATE)"-ON-W", \
452 [LOCK_USED_IN_##__STATE##_READ] = "IN-"__stringify(__STATE)"-R",\
453 [LOCK_ENABLED_##__STATE##_READ] = __stringify(__STATE)"-ON-R",
454
446static const char *usage_str[] = 455static const char *usage_str[] =
447{ 456{
448 [LOCK_USED] = "initial-use ", 457#define LOCKDEP_STATE(__STATE) __USAGE(__STATE)
449 [LOCK_USED_IN_HARDIRQ] = "in-hardirq-W", 458#include "lockdep_states.h"
450 [LOCK_USED_IN_SOFTIRQ] = "in-softirq-W", 459#undef LOCKDEP_STATE
451 [LOCK_ENABLED_SOFTIRQS] = "softirq-on-W", 460 [LOCK_USED] = "INITIAL USE",
452 [LOCK_ENABLED_HARDIRQS] = "hardirq-on-W",
453 [LOCK_USED_IN_HARDIRQ_READ] = "in-hardirq-R",
454 [LOCK_USED_IN_SOFTIRQ_READ] = "in-softirq-R",
455 [LOCK_ENABLED_SOFTIRQS_READ] = "softirq-on-R",
456 [LOCK_ENABLED_HARDIRQS_READ] = "hardirq-on-R",
457}; 461};
458 462
459const char * __get_key_name(struct lockdep_subclass_key *key, char *str) 463const char * __get_key_name(struct lockdep_subclass_key *key, char *str)
@@ -461,46 +465,45 @@ const char * __get_key_name(struct lockdep_subclass_key *key, char *str)
461 return kallsyms_lookup((unsigned long)key, NULL, NULL, NULL, str); 465 return kallsyms_lookup((unsigned long)key, NULL, NULL, NULL, str);
462} 466}
463 467
464void 468static inline unsigned long lock_flag(enum lock_usage_bit bit)
465get_usage_chars(struct lock_class *class, char *c1, char *c2, char *c3, char *c4)
466{ 469{
467 *c1 = '.', *c2 = '.', *c3 = '.', *c4 = '.'; 470 return 1UL << bit;
468 471}
469 if (class->usage_mask & LOCKF_USED_IN_HARDIRQ)
470 *c1 = '+';
471 else
472 if (class->usage_mask & LOCKF_ENABLED_HARDIRQS)
473 *c1 = '-';
474 472
475 if (class->usage_mask & LOCKF_USED_IN_SOFTIRQ) 473static char get_usage_char(struct lock_class *class, enum lock_usage_bit bit)
476 *c2 = '+'; 474{
477 else 475 char c = '.';
478 if (class->usage_mask & LOCKF_ENABLED_SOFTIRQS)
479 *c2 = '-';
480 476
481 if (class->usage_mask & LOCKF_ENABLED_HARDIRQS_READ) 477 if (class->usage_mask & lock_flag(bit + 2))
482 *c3 = '-'; 478 c = '+';
483 if (class->usage_mask & LOCKF_USED_IN_HARDIRQ_READ) { 479 if (class->usage_mask & lock_flag(bit)) {
484 *c3 = '+'; 480 c = '-';
485 if (class->usage_mask & LOCKF_ENABLED_HARDIRQS_READ) 481 if (class->usage_mask & lock_flag(bit + 2))
486 *c3 = '?'; 482 c = '?';
487 } 483 }
488 484
489 if (class->usage_mask & LOCKF_ENABLED_SOFTIRQS_READ) 485 return c;
490 *c4 = '-'; 486}
491 if (class->usage_mask & LOCKF_USED_IN_SOFTIRQ_READ) { 487
492 *c4 = '+'; 488void get_usage_chars(struct lock_class *class, char usage[LOCK_USAGE_CHARS])
493 if (class->usage_mask & LOCKF_ENABLED_SOFTIRQS_READ) 489{
494 *c4 = '?'; 490 int i = 0;
495 } 491
492#define LOCKDEP_STATE(__STATE) \
493 usage[i++] = get_usage_char(class, LOCK_USED_IN_##__STATE); \
494 usage[i++] = get_usage_char(class, LOCK_USED_IN_##__STATE##_READ);
495#include "lockdep_states.h"
496#undef LOCKDEP_STATE
497
498 usage[i] = '\0';
496} 499}
497 500
498static void print_lock_name(struct lock_class *class) 501static void print_lock_name(struct lock_class *class)
499{ 502{
500 char str[KSYM_NAME_LEN], c1, c2, c3, c4; 503 char str[KSYM_NAME_LEN], usage[LOCK_USAGE_CHARS];
501 const char *name; 504 const char *name;
502 505
503 get_usage_chars(class, &c1, &c2, &c3, &c4); 506 get_usage_chars(class, usage);
504 507
505 name = class->name; 508 name = class->name;
506 if (!name) { 509 if (!name) {
@@ -513,7 +516,7 @@ static void print_lock_name(struct lock_class *class)
513 if (class->subclass) 516 if (class->subclass)
514 printk("/%d", class->subclass); 517 printk("/%d", class->subclass);
515 } 518 }
516 printk("){%c%c%c%c}", c1, c2, c3, c4); 519 printk("){%s}", usage);
517} 520}
518 521
519static void print_lockdep_cache(struct lockdep_map *lock) 522static void print_lockdep_cache(struct lockdep_map *lock)
@@ -1263,9 +1266,49 @@ check_usage(struct task_struct *curr, struct held_lock *prev,
1263 bit_backwards, bit_forwards, irqclass); 1266 bit_backwards, bit_forwards, irqclass);
1264} 1267}
1265 1268
1266static int 1269static const char *state_names[] = {
1267check_prev_add_irq(struct task_struct *curr, struct held_lock *prev, 1270#define LOCKDEP_STATE(__STATE) \
1268 struct held_lock *next) 1271 __stringify(__STATE),
1272#include "lockdep_states.h"
1273#undef LOCKDEP_STATE
1274};
1275
1276static const char *state_rnames[] = {
1277#define LOCKDEP_STATE(__STATE) \
1278 __stringify(__STATE)"-READ",
1279#include "lockdep_states.h"
1280#undef LOCKDEP_STATE
1281};
1282
1283static inline const char *state_name(enum lock_usage_bit bit)
1284{
1285 return (bit & 1) ? state_rnames[bit >> 2] : state_names[bit >> 2];
1286}
1287
1288static int exclusive_bit(int new_bit)
1289{
1290 /*
1291 * USED_IN
1292 * USED_IN_READ
1293 * ENABLED
1294 * ENABLED_READ
1295 *
1296 * bit 0 - write/read
1297 * bit 1 - used_in/enabled
1298 * bit 2+ state
1299 */
1300
1301 int state = new_bit & ~3;
1302 int dir = new_bit & 2;
1303
1304 /*
1305 * keep state, bit flip the direction and strip read.
1306 */
1307 return state | (dir ^ 2);
1308}
1309
1310static int check_irq_usage(struct task_struct *curr, struct held_lock *prev,
1311 struct held_lock *next, enum lock_usage_bit bit)
1269{ 1312{
1270 /* 1313 /*
1271 * Prove that the new dependency does not connect a hardirq-safe 1314 * Prove that the new dependency does not connect a hardirq-safe
@@ -1273,38 +1316,34 @@ check_prev_add_irq(struct task_struct *curr, struct held_lock *prev,
1273 * the backwards-subgraph starting at <prev>, and the 1316 * the backwards-subgraph starting at <prev>, and the
1274 * forwards-subgraph starting at <next>: 1317 * forwards-subgraph starting at <next>:
1275 */ 1318 */
1276 if (!check_usage(curr, prev, next, LOCK_USED_IN_HARDIRQ, 1319 if (!check_usage(curr, prev, next, bit,
1277 LOCK_ENABLED_HARDIRQS, "hard")) 1320 exclusive_bit(bit), state_name(bit)))
1278 return 0; 1321 return 0;
1279 1322
1323 bit++; /* _READ */
1324
1280 /* 1325 /*
1281 * Prove that the new dependency does not connect a hardirq-safe-read 1326 * Prove that the new dependency does not connect a hardirq-safe-read
1282 * lock with a hardirq-unsafe lock - to achieve this we search 1327 * lock with a hardirq-unsafe lock - to achieve this we search
1283 * the backwards-subgraph starting at <prev>, and the 1328 * the backwards-subgraph starting at <prev>, and the
1284 * forwards-subgraph starting at <next>: 1329 * forwards-subgraph starting at <next>:
1285 */ 1330 */
1286 if (!check_usage(curr, prev, next, LOCK_USED_IN_HARDIRQ_READ, 1331 if (!check_usage(curr, prev, next, bit,
1287 LOCK_ENABLED_HARDIRQS, "hard-read")) 1332 exclusive_bit(bit), state_name(bit)))
1288 return 0; 1333 return 0;
1289 1334
1290 /* 1335 return 1;
1291 * Prove that the new dependency does not connect a softirq-safe 1336}
1292 * lock with a softirq-unsafe lock - to achieve this we search 1337
1293 * the backwards-subgraph starting at <prev>, and the 1338static int
1294 * forwards-subgraph starting at <next>: 1339check_prev_add_irq(struct task_struct *curr, struct held_lock *prev,
1295 */ 1340 struct held_lock *next)
1296 if (!check_usage(curr, prev, next, LOCK_USED_IN_SOFTIRQ, 1341{
1297 LOCK_ENABLED_SOFTIRQS, "soft")) 1342#define LOCKDEP_STATE(__STATE) \
1298 return 0; 1343 if (!check_irq_usage(curr, prev, next, LOCK_USED_IN_##__STATE)) \
1299 /*
1300 * Prove that the new dependency does not connect a softirq-safe-read
1301 * lock with a softirq-unsafe lock - to achieve this we search
1302 * the backwards-subgraph starting at <prev>, and the
1303 * forwards-subgraph starting at <next>:
1304 */
1305 if (!check_usage(curr, prev, next, LOCK_USED_IN_SOFTIRQ_READ,
1306 LOCK_ENABLED_SOFTIRQS, "soft"))
1307 return 0; 1344 return 0;
1345#include "lockdep_states.h"
1346#undef LOCKDEP_STATE
1308 1347
1309 return 1; 1348 return 1;
1310} 1349}
@@ -1933,7 +1972,7 @@ void print_irqtrace_events(struct task_struct *curr)
1933 print_ip_sym(curr->softirq_disable_ip); 1972 print_ip_sym(curr->softirq_disable_ip);
1934} 1973}
1935 1974
1936static int hardirq_verbose(struct lock_class *class) 1975static int HARDIRQ_verbose(struct lock_class *class)
1937{ 1976{
1938#if HARDIRQ_VERBOSE 1977#if HARDIRQ_VERBOSE
1939 return class_filter(class); 1978 return class_filter(class);
@@ -1941,7 +1980,7 @@ static int hardirq_verbose(struct lock_class *class)
1941 return 0; 1980 return 0;
1942} 1981}
1943 1982
1944static int softirq_verbose(struct lock_class *class) 1983static int SOFTIRQ_verbose(struct lock_class *class)
1945{ 1984{
1946#if SOFTIRQ_VERBOSE 1985#if SOFTIRQ_VERBOSE
1947 return class_filter(class); 1986 return class_filter(class);
@@ -1949,185 +1988,94 @@ static int softirq_verbose(struct lock_class *class)
1949 return 0; 1988 return 0;
1950} 1989}
1951 1990
1991static int RECLAIM_FS_verbose(struct lock_class *class)
1992{
1993#if RECLAIM_VERBOSE
1994 return class_filter(class);
1995#endif
1996 return 0;
1997}
1998
1952#define STRICT_READ_CHECKS 1 1999#define STRICT_READ_CHECKS 1
1953 2000
1954static int mark_lock_irq(struct task_struct *curr, struct held_lock *this, 2001static int (*state_verbose_f[])(struct lock_class *class) = {
1955 enum lock_usage_bit new_bit) 2002#define LOCKDEP_STATE(__STATE) \
2003 __STATE##_verbose,
2004#include "lockdep_states.h"
2005#undef LOCKDEP_STATE
2006};
2007
2008static inline int state_verbose(enum lock_usage_bit bit,
2009 struct lock_class *class)
1956{ 2010{
1957 int ret = 1; 2011 return state_verbose_f[bit >> 2](class);
2012}
1958 2013
1959 switch(new_bit) { 2014typedef int (*check_usage_f)(struct task_struct *, struct held_lock *,
1960 case LOCK_USED_IN_HARDIRQ: 2015 enum lock_usage_bit bit, const char *name);
1961 if (!valid_state(curr, this, new_bit, LOCK_ENABLED_HARDIRQS)) 2016
1962 return 0; 2017static int
1963 if (!valid_state(curr, this, new_bit, 2018mark_lock_irq(struct task_struct *curr, struct held_lock *this, int new_bit)
1964 LOCK_ENABLED_HARDIRQS_READ)) 2019{
1965 return 0; 2020 int excl_bit = exclusive_bit(new_bit);
1966 /* 2021 int read = new_bit & 1;
1967 * just marked it hardirq-safe, check that this lock 2022 int dir = new_bit & 2;
1968 * took no hardirq-unsafe lock in the past: 2023
1969 */ 2024 /*
1970 if (!check_usage_forwards(curr, this, 2025 * mark USED_IN has to look forwards -- to ensure no dependency
1971 LOCK_ENABLED_HARDIRQS, "hard")) 2026 * has ENABLED state, which would allow recursion deadlocks.
1972 return 0; 2027 *
1973#if STRICT_READ_CHECKS 2028 * mark ENABLED has to look backwards -- to ensure no dependee
1974 /* 2029 * has USED_IN state, which, again, would allow recursion deadlocks.
1975 * just marked it hardirq-safe, check that this lock 2030 */
1976 * took no hardirq-unsafe-read lock in the past: 2031 check_usage_f usage = dir ?
1977 */ 2032 check_usage_backwards : check_usage_forwards;
1978 if (!check_usage_forwards(curr, this, 2033
1979 LOCK_ENABLED_HARDIRQS_READ, "hard-read")) 2034 /*
1980 return 0; 2035 * Validate that this particular lock does not have conflicting
1981#endif 2036 * usage states.
1982 if (hardirq_verbose(hlock_class(this))) 2037 */
1983 ret = 2; 2038 if (!valid_state(curr, this, new_bit, excl_bit))
1984 break; 2039 return 0;
1985 case LOCK_USED_IN_SOFTIRQ: 2040
1986 if (!valid_state(curr, this, new_bit, LOCK_ENABLED_SOFTIRQS)) 2041 /*
1987 return 0; 2042 * Validate that the lock dependencies don't have conflicting usage
1988 if (!valid_state(curr, this, new_bit, 2043 * states.
1989 LOCK_ENABLED_SOFTIRQS_READ)) 2044 */
1990 return 0; 2045 if ((!read || !dir || STRICT_READ_CHECKS) &&
1991 /* 2046 !usage(curr, this, excl_bit, state_name(new_bit)))
1992 * just marked it softirq-safe, check that this lock 2047 return 0;
1993 * took no softirq-unsafe lock in the past: 2048
1994 */ 2049 /*
1995 if (!check_usage_forwards(curr, this, 2050 * Check for read in write conflicts
1996 LOCK_ENABLED_SOFTIRQS, "soft")) 2051 */
1997 return 0; 2052 if (!read) {
1998#if STRICT_READ_CHECKS 2053 if (!valid_state(curr, this, new_bit, excl_bit + 1))
1999 /*
2000 * just marked it softirq-safe, check that this lock
2001 * took no softirq-unsafe-read lock in the past:
2002 */
2003 if (!check_usage_forwards(curr, this,
2004 LOCK_ENABLED_SOFTIRQS_READ, "soft-read"))
2005 return 0;
2006#endif
2007 if (softirq_verbose(hlock_class(this)))
2008 ret = 2;
2009 break;
2010 case LOCK_USED_IN_HARDIRQ_READ:
2011 if (!valid_state(curr, this, new_bit, LOCK_ENABLED_HARDIRQS))
2012 return 0;
2013 /*
2014 * just marked it hardirq-read-safe, check that this lock
2015 * took no hardirq-unsafe lock in the past:
2016 */
2017 if (!check_usage_forwards(curr, this,
2018 LOCK_ENABLED_HARDIRQS, "hard"))
2019 return 0;
2020 if (hardirq_verbose(hlock_class(this)))
2021 ret = 2;
2022 break;
2023 case LOCK_USED_IN_SOFTIRQ_READ:
2024 if (!valid_state(curr, this, new_bit, LOCK_ENABLED_SOFTIRQS))
2025 return 0;
2026 /*
2027 * just marked it softirq-read-safe, check that this lock
2028 * took no softirq-unsafe lock in the past:
2029 */
2030 if (!check_usage_forwards(curr, this,
2031 LOCK_ENABLED_SOFTIRQS, "soft"))
2032 return 0;
2033 if (softirq_verbose(hlock_class(this)))
2034 ret = 2;
2035 break;
2036 case LOCK_ENABLED_HARDIRQS:
2037 if (!valid_state(curr, this, new_bit, LOCK_USED_IN_HARDIRQ))
2038 return 0;
2039 if (!valid_state(curr, this, new_bit,
2040 LOCK_USED_IN_HARDIRQ_READ))
2041 return 0;
2042 /*
2043 * just marked it hardirq-unsafe, check that no hardirq-safe
2044 * lock in the system ever took it in the past:
2045 */
2046 if (!check_usage_backwards(curr, this,
2047 LOCK_USED_IN_HARDIRQ, "hard"))
2048 return 0;
2049#if STRICT_READ_CHECKS
2050 /*
2051 * just marked it hardirq-unsafe, check that no
2052 * hardirq-safe-read lock in the system ever took
2053 * it in the past:
2054 */
2055 if (!check_usage_backwards(curr, this,
2056 LOCK_USED_IN_HARDIRQ_READ, "hard-read"))
2057 return 0;
2058#endif
2059 if (hardirq_verbose(hlock_class(this)))
2060 ret = 2;
2061 break;
2062 case LOCK_ENABLED_SOFTIRQS:
2063 if (!valid_state(curr, this, new_bit, LOCK_USED_IN_SOFTIRQ))
2064 return 0;
2065 if (!valid_state(curr, this, new_bit,
2066 LOCK_USED_IN_SOFTIRQ_READ))
2067 return 0;
2068 /*
2069 * just marked it softirq-unsafe, check that no softirq-safe
2070 * lock in the system ever took it in the past:
2071 */
2072 if (!check_usage_backwards(curr, this,
2073 LOCK_USED_IN_SOFTIRQ, "soft"))
2074 return 0;
2075#if STRICT_READ_CHECKS
2076 /*
2077 * just marked it softirq-unsafe, check that no
2078 * softirq-safe-read lock in the system ever took
2079 * it in the past:
2080 */
2081 if (!check_usage_backwards(curr, this,
2082 LOCK_USED_IN_SOFTIRQ_READ, "soft-read"))
2083 return 0;
2084#endif
2085 if (softirq_verbose(hlock_class(this)))
2086 ret = 2;
2087 break;
2088 case LOCK_ENABLED_HARDIRQS_READ:
2089 if (!valid_state(curr, this, new_bit, LOCK_USED_IN_HARDIRQ))
2090 return 0;
2091#if STRICT_READ_CHECKS
2092 /*
2093 * just marked it hardirq-read-unsafe, check that no
2094 * hardirq-safe lock in the system ever took it in the past:
2095 */
2096 if (!check_usage_backwards(curr, this,
2097 LOCK_USED_IN_HARDIRQ, "hard"))
2098 return 0;
2099#endif
2100 if (hardirq_verbose(hlock_class(this)))
2101 ret = 2;
2102 break;
2103 case LOCK_ENABLED_SOFTIRQS_READ:
2104 if (!valid_state(curr, this, new_bit, LOCK_USED_IN_SOFTIRQ))
2105 return 0; 2054 return 0;
2106#if STRICT_READ_CHECKS 2055
2107 /* 2056 if (STRICT_READ_CHECKS &&
2108 * just marked it softirq-read-unsafe, check that no 2057 !usage(curr, this, excl_bit + 1,
2109 * softirq-safe lock in the system ever took it in the past: 2058 state_name(new_bit + 1)))
2110 */
2111 if (!check_usage_backwards(curr, this,
2112 LOCK_USED_IN_SOFTIRQ, "soft"))
2113 return 0; 2059 return 0;
2114#endif
2115 if (softirq_verbose(hlock_class(this)))
2116 ret = 2;
2117 break;
2118 default:
2119 WARN_ON(1);
2120 break;
2121 } 2060 }
2122 2061
2123 return ret; 2062 if (state_verbose(new_bit, hlock_class(this)))
2063 return 2;
2064
2065 return 1;
2124} 2066}
2125 2067
2068enum mark_type {
2069#define LOCKDEP_STATE(__STATE) __STATE,
2070#include "lockdep_states.h"
2071#undef LOCKDEP_STATE
2072};
2073
2126/* 2074/*
2127 * Mark all held locks with a usage bit: 2075 * Mark all held locks with a usage bit:
2128 */ 2076 */
2129static int 2077static int
2130mark_held_locks(struct task_struct *curr, int hardirq) 2078mark_held_locks(struct task_struct *curr, enum mark_type mark)
2131{ 2079{
2132 enum lock_usage_bit usage_bit; 2080 enum lock_usage_bit usage_bit;
2133 struct held_lock *hlock; 2081 struct held_lock *hlock;
@@ -2136,17 +2084,12 @@ mark_held_locks(struct task_struct *curr, int hardirq)
2136 for (i = 0; i < curr->lockdep_depth; i++) { 2084 for (i = 0; i < curr->lockdep_depth; i++) {
2137 hlock = curr->held_locks + i; 2085 hlock = curr->held_locks + i;
2138 2086
2139 if (hardirq) { 2087 usage_bit = 2 + (mark << 2); /* ENABLED */
2140 if (hlock->read) 2088 if (hlock->read)
2141 usage_bit = LOCK_ENABLED_HARDIRQS_READ; 2089 usage_bit += 1; /* READ */
2142 else 2090
2143 usage_bit = LOCK_ENABLED_HARDIRQS; 2091 BUG_ON(usage_bit >= LOCK_USAGE_STATES);
2144 } else { 2092
2145 if (hlock->read)
2146 usage_bit = LOCK_ENABLED_SOFTIRQS_READ;
2147 else
2148 usage_bit = LOCK_ENABLED_SOFTIRQS;
2149 }
2150 if (!mark_lock(curr, hlock, usage_bit)) 2093 if (!mark_lock(curr, hlock, usage_bit))
2151 return 0; 2094 return 0;
2152 } 2095 }
@@ -2200,7 +2143,7 @@ void trace_hardirqs_on_caller(unsigned long ip)
2200 * We are going to turn hardirqs on, so set the 2143 * We are going to turn hardirqs on, so set the
2201 * usage bit for all held locks: 2144 * usage bit for all held locks:
2202 */ 2145 */
2203 if (!mark_held_locks(curr, 1)) 2146 if (!mark_held_locks(curr, HARDIRQ))
2204 return; 2147 return;
2205 /* 2148 /*
2206 * If we have softirqs enabled, then set the usage 2149 * If we have softirqs enabled, then set the usage
@@ -2208,7 +2151,7 @@ void trace_hardirqs_on_caller(unsigned long ip)
2208 * this bit from being set before) 2151 * this bit from being set before)
2209 */ 2152 */
2210 if (curr->softirqs_enabled) 2153 if (curr->softirqs_enabled)
2211 if (!mark_held_locks(curr, 0)) 2154 if (!mark_held_locks(curr, SOFTIRQ))
2212 return; 2155 return;
2213 2156
2214 curr->hardirq_enable_ip = ip; 2157 curr->hardirq_enable_ip = ip;
@@ -2288,7 +2231,7 @@ void trace_softirqs_on(unsigned long ip)
2288 * enabled too: 2231 * enabled too:
2289 */ 2232 */
2290 if (curr->hardirqs_enabled) 2233 if (curr->hardirqs_enabled)
2291 mark_held_locks(curr, 0); 2234 mark_held_locks(curr, SOFTIRQ);
2292} 2235}
2293 2236
2294/* 2237/*
@@ -2317,6 +2260,48 @@ void trace_softirqs_off(unsigned long ip)
2317 debug_atomic_inc(&redundant_softirqs_off); 2260 debug_atomic_inc(&redundant_softirqs_off);
2318} 2261}
2319 2262
2263static void __lockdep_trace_alloc(gfp_t gfp_mask, unsigned long flags)
2264{
2265 struct task_struct *curr = current;
2266
2267 if (unlikely(!debug_locks))
2268 return;
2269
2270 /* no reclaim without waiting on it */
2271 if (!(gfp_mask & __GFP_WAIT))
2272 return;
2273
2274 /* this guy won't enter reclaim */
2275 if ((curr->flags & PF_MEMALLOC) && !(gfp_mask & __GFP_NOMEMALLOC))
2276 return;
2277
2278 /* We're only interested __GFP_FS allocations for now */
2279 if (!(gfp_mask & __GFP_FS))
2280 return;
2281
2282 if (DEBUG_LOCKS_WARN_ON(irqs_disabled_flags(flags)))
2283 return;
2284
2285 mark_held_locks(curr, RECLAIM_FS);
2286}
2287
2288static void check_flags(unsigned long flags);
2289
2290void lockdep_trace_alloc(gfp_t gfp_mask)
2291{
2292 unsigned long flags;
2293
2294 if (unlikely(current->lockdep_recursion))
2295 return;
2296
2297 raw_local_irq_save(flags);
2298 check_flags(flags);
2299 current->lockdep_recursion = 1;
2300 __lockdep_trace_alloc(gfp_mask, flags);
2301 current->lockdep_recursion = 0;
2302 raw_local_irq_restore(flags);
2303}
2304
2320static int mark_irqflags(struct task_struct *curr, struct held_lock *hlock) 2305static int mark_irqflags(struct task_struct *curr, struct held_lock *hlock)
2321{ 2306{
2322 /* 2307 /*
@@ -2345,19 +2330,35 @@ static int mark_irqflags(struct task_struct *curr, struct held_lock *hlock)
2345 if (!hlock->hardirqs_off) { 2330 if (!hlock->hardirqs_off) {
2346 if (hlock->read) { 2331 if (hlock->read) {
2347 if (!mark_lock(curr, hlock, 2332 if (!mark_lock(curr, hlock,
2348 LOCK_ENABLED_HARDIRQS_READ)) 2333 LOCK_ENABLED_HARDIRQ_READ))
2349 return 0; 2334 return 0;
2350 if (curr->softirqs_enabled) 2335 if (curr->softirqs_enabled)
2351 if (!mark_lock(curr, hlock, 2336 if (!mark_lock(curr, hlock,
2352 LOCK_ENABLED_SOFTIRQS_READ)) 2337 LOCK_ENABLED_SOFTIRQ_READ))
2353 return 0; 2338 return 0;
2354 } else { 2339 } else {
2355 if (!mark_lock(curr, hlock, 2340 if (!mark_lock(curr, hlock,
2356 LOCK_ENABLED_HARDIRQS)) 2341 LOCK_ENABLED_HARDIRQ))
2357 return 0; 2342 return 0;
2358 if (curr->softirqs_enabled) 2343 if (curr->softirqs_enabled)
2359 if (!mark_lock(curr, hlock, 2344 if (!mark_lock(curr, hlock,
2360 LOCK_ENABLED_SOFTIRQS)) 2345 LOCK_ENABLED_SOFTIRQ))
2346 return 0;
2347 }
2348 }
2349
2350 /*
2351 * We reuse the irq context infrastructure more broadly as a general
2352 * context checking code. This tests GFP_FS recursion (a lock taken
2353 * during reclaim for a GFP_FS allocation is held over a GFP_FS
2354 * allocation).
2355 */
2356 if (!hlock->trylock && (curr->lockdep_reclaim_gfp & __GFP_FS)) {
2357 if (hlock->read) {
2358 if (!mark_lock(curr, hlock, LOCK_USED_IN_RECLAIM_FS_READ))
2359 return 0;
2360 } else {
2361 if (!mark_lock(curr, hlock, LOCK_USED_IN_RECLAIM_FS))
2361 return 0; 2362 return 0;
2362 } 2363 }
2363 } 2364 }
@@ -2412,6 +2413,10 @@ static inline int separate_irq_context(struct task_struct *curr,
2412 return 0; 2413 return 0;
2413} 2414}
2414 2415
2416void lockdep_trace_alloc(gfp_t gfp_mask)
2417{
2418}
2419
2415#endif 2420#endif
2416 2421
2417/* 2422/*
@@ -2445,14 +2450,13 @@ static int mark_lock(struct task_struct *curr, struct held_lock *this,
2445 return 0; 2450 return 0;
2446 2451
2447 switch (new_bit) { 2452 switch (new_bit) {
2448 case LOCK_USED_IN_HARDIRQ: 2453#define LOCKDEP_STATE(__STATE) \
2449 case LOCK_USED_IN_SOFTIRQ: 2454 case LOCK_USED_IN_##__STATE: \
2450 case LOCK_USED_IN_HARDIRQ_READ: 2455 case LOCK_USED_IN_##__STATE##_READ: \
2451 case LOCK_USED_IN_SOFTIRQ_READ: 2456 case LOCK_ENABLED_##__STATE: \
2452 case LOCK_ENABLED_HARDIRQS: 2457 case LOCK_ENABLED_##__STATE##_READ:
2453 case LOCK_ENABLED_SOFTIRQS: 2458#include "lockdep_states.h"
2454 case LOCK_ENABLED_HARDIRQS_READ: 2459#undef LOCKDEP_STATE
2455 case LOCK_ENABLED_SOFTIRQS_READ:
2456 ret = mark_lock_irq(curr, this, new_bit); 2460 ret = mark_lock_irq(curr, this, new_bit);
2457 if (!ret) 2461 if (!ret)
2458 return 0; 2462 return 0;
@@ -2966,6 +2970,16 @@ void lock_release(struct lockdep_map *lock, int nested,
2966} 2970}
2967EXPORT_SYMBOL_GPL(lock_release); 2971EXPORT_SYMBOL_GPL(lock_release);
2968 2972
2973void lockdep_set_current_reclaim_state(gfp_t gfp_mask)
2974{
2975 current->lockdep_reclaim_gfp = gfp_mask;
2976}
2977
2978void lockdep_clear_current_reclaim_state(void)
2979{
2980 current->lockdep_reclaim_gfp = 0;
2981}
2982
2969#ifdef CONFIG_LOCK_STAT 2983#ifdef CONFIG_LOCK_STAT
2970static int 2984static int
2971print_lock_contention_bug(struct task_struct *curr, struct lockdep_map *lock, 2985print_lock_contention_bug(struct task_struct *curr, struct lockdep_map *lock,
diff --git a/kernel/lockdep_internals.h b/kernel/lockdep_internals.h
index 56b196932c08..a2cc7e9a6e84 100644
--- a/kernel/lockdep_internals.h
+++ b/kernel/lockdep_internals.h
@@ -7,6 +7,45 @@
7 */ 7 */
8 8
9/* 9/*
10 * Lock-class usage-state bits:
11 */
12enum lock_usage_bit {
13#define LOCKDEP_STATE(__STATE) \
14 LOCK_USED_IN_##__STATE, \
15 LOCK_USED_IN_##__STATE##_READ, \
16 LOCK_ENABLED_##__STATE, \
17 LOCK_ENABLED_##__STATE##_READ,
18#include "lockdep_states.h"
19#undef LOCKDEP_STATE
20 LOCK_USED,
21 LOCK_USAGE_STATES
22};
23
24/*
25 * Usage-state bitmasks:
26 */
27#define __LOCKF(__STATE) LOCKF_##__STATE = (1 << LOCK_##__STATE),
28
29enum {
30#define LOCKDEP_STATE(__STATE) \
31 __LOCKF(USED_IN_##__STATE) \
32 __LOCKF(USED_IN_##__STATE##_READ) \
33 __LOCKF(ENABLED_##__STATE) \
34 __LOCKF(ENABLED_##__STATE##_READ)
35#include "lockdep_states.h"
36#undef LOCKDEP_STATE
37 __LOCKF(USED)
38};
39
40#define LOCKF_ENABLED_IRQ (LOCKF_ENABLED_HARDIRQ | LOCKF_ENABLED_SOFTIRQ)
41#define LOCKF_USED_IN_IRQ (LOCKF_USED_IN_HARDIRQ | LOCKF_USED_IN_SOFTIRQ)
42
43#define LOCKF_ENABLED_IRQ_READ \
44 (LOCKF_ENABLED_HARDIRQ_READ | LOCKF_ENABLED_SOFTIRQ_READ)
45#define LOCKF_USED_IN_IRQ_READ \
46 (LOCKF_USED_IN_HARDIRQ_READ | LOCKF_USED_IN_SOFTIRQ_READ)
47
48/*
10 * MAX_LOCKDEP_ENTRIES is the maximum number of lock dependencies 49 * MAX_LOCKDEP_ENTRIES is the maximum number of lock dependencies
11 * we track. 50 * we track.
12 * 51 *
@@ -31,8 +70,10 @@
31extern struct list_head all_lock_classes; 70extern struct list_head all_lock_classes;
32extern struct lock_chain lock_chains[]; 71extern struct lock_chain lock_chains[];
33 72
34extern void 73#define LOCK_USAGE_CHARS (1+LOCK_USAGE_STATES/2)
35get_usage_chars(struct lock_class *class, char *c1, char *c2, char *c3, char *c4); 74
75extern void get_usage_chars(struct lock_class *class,
76 char usage[LOCK_USAGE_CHARS]);
36 77
37extern const char * __get_key_name(struct lockdep_subclass_key *key, char *str); 78extern const char * __get_key_name(struct lockdep_subclass_key *key, char *str);
38 79
diff --git a/kernel/lockdep_proc.c b/kernel/lockdep_proc.c
index 13716b813896..d7135aa2d2c4 100644
--- a/kernel/lockdep_proc.c
+++ b/kernel/lockdep_proc.c
@@ -84,7 +84,7 @@ static int l_show(struct seq_file *m, void *v)
84{ 84{
85 struct lock_class *class = v; 85 struct lock_class *class = v;
86 struct lock_list *entry; 86 struct lock_list *entry;
87 char c1, c2, c3, c4; 87 char usage[LOCK_USAGE_CHARS];
88 88
89 if (v == SEQ_START_TOKEN) { 89 if (v == SEQ_START_TOKEN) {
90 seq_printf(m, "all lock classes:\n"); 90 seq_printf(m, "all lock classes:\n");
@@ -100,8 +100,8 @@ static int l_show(struct seq_file *m, void *v)
100 seq_printf(m, " BD:%5ld", lockdep_count_backward_deps(class)); 100 seq_printf(m, " BD:%5ld", lockdep_count_backward_deps(class));
101#endif 101#endif
102 102
103 get_usage_chars(class, &c1, &c2, &c3, &c4); 103 get_usage_chars(class, usage);
104 seq_printf(m, " %c%c%c%c", c1, c2, c3, c4); 104 seq_printf(m, " %s", usage);
105 105
106 seq_printf(m, ": "); 106 seq_printf(m, ": ");
107 print_name(m, class); 107 print_name(m, class);
@@ -300,27 +300,27 @@ static int lockdep_stats_show(struct seq_file *m, void *v)
300 nr_uncategorized++; 300 nr_uncategorized++;
301 if (class->usage_mask & LOCKF_USED_IN_IRQ) 301 if (class->usage_mask & LOCKF_USED_IN_IRQ)
302 nr_irq_safe++; 302 nr_irq_safe++;
303 if (class->usage_mask & LOCKF_ENABLED_IRQS) 303 if (class->usage_mask & LOCKF_ENABLED_IRQ)
304 nr_irq_unsafe++; 304 nr_irq_unsafe++;
305 if (class->usage_mask & LOCKF_USED_IN_SOFTIRQ) 305 if (class->usage_mask & LOCKF_USED_IN_SOFTIRQ)
306 nr_softirq_safe++; 306 nr_softirq_safe++;
307 if (class->usage_mask & LOCKF_ENABLED_SOFTIRQS) 307 if (class->usage_mask & LOCKF_ENABLED_SOFTIRQ)
308 nr_softirq_unsafe++; 308 nr_softirq_unsafe++;
309 if (class->usage_mask & LOCKF_USED_IN_HARDIRQ) 309 if (class->usage_mask & LOCKF_USED_IN_HARDIRQ)
310 nr_hardirq_safe++; 310 nr_hardirq_safe++;
311 if (class->usage_mask & LOCKF_ENABLED_HARDIRQS) 311 if (class->usage_mask & LOCKF_ENABLED_HARDIRQ)
312 nr_hardirq_unsafe++; 312 nr_hardirq_unsafe++;
313 if (class->usage_mask & LOCKF_USED_IN_IRQ_READ) 313 if (class->usage_mask & LOCKF_USED_IN_IRQ_READ)
314 nr_irq_read_safe++; 314 nr_irq_read_safe++;
315 if (class->usage_mask & LOCKF_ENABLED_IRQS_READ) 315 if (class->usage_mask & LOCKF_ENABLED_IRQ_READ)
316 nr_irq_read_unsafe++; 316 nr_irq_read_unsafe++;
317 if (class->usage_mask & LOCKF_USED_IN_SOFTIRQ_READ) 317 if (class->usage_mask & LOCKF_USED_IN_SOFTIRQ_READ)
318 nr_softirq_read_safe++; 318 nr_softirq_read_safe++;
319 if (class->usage_mask & LOCKF_ENABLED_SOFTIRQS_READ) 319 if (class->usage_mask & LOCKF_ENABLED_SOFTIRQ_READ)
320 nr_softirq_read_unsafe++; 320 nr_softirq_read_unsafe++;
321 if (class->usage_mask & LOCKF_USED_IN_HARDIRQ_READ) 321 if (class->usage_mask & LOCKF_USED_IN_HARDIRQ_READ)
322 nr_hardirq_read_safe++; 322 nr_hardirq_read_safe++;
323 if (class->usage_mask & LOCKF_ENABLED_HARDIRQS_READ) 323 if (class->usage_mask & LOCKF_ENABLED_HARDIRQ_READ)
324 nr_hardirq_read_unsafe++; 324 nr_hardirq_read_unsafe++;
325 325
326#ifdef CONFIG_PROVE_LOCKING 326#ifdef CONFIG_PROVE_LOCKING
@@ -601,6 +601,10 @@ static void seq_stats(struct seq_file *m, struct lock_stat_data *data)
601static void seq_header(struct seq_file *m) 601static void seq_header(struct seq_file *m)
602{ 602{
603 seq_printf(m, "lock_stat version 0.3\n"); 603 seq_printf(m, "lock_stat version 0.3\n");
604
605 if (unlikely(!debug_locks))
606 seq_printf(m, "*WARNING* lock debugging disabled!! - possibly due to a lockdep warning\n");
607
604 seq_line(m, '-', 0, 40 + 1 + 10 * (14 + 1)); 608 seq_line(m, '-', 0, 40 + 1 + 10 * (14 + 1));
605 seq_printf(m, "%40s %14s %14s %14s %14s %14s %14s %14s %14s " 609 seq_printf(m, "%40s %14s %14s %14s %14s %14s %14s %14s %14s "
606 "%14s %14s\n", 610 "%14s %14s\n",
diff --git a/kernel/lockdep_states.h b/kernel/lockdep_states.h
new file mode 100644
index 000000000000..995b0cc2b84c
--- /dev/null
+++ b/kernel/lockdep_states.h
@@ -0,0 +1,9 @@
1/*
2 * Lockdep states,
3 *
4 * please update XXX_LOCK_USAGE_STATES in include/linux/lockdep.h whenever
5 * you add one, or come up with a nice dynamic solution.
6 */
7LOCKDEP_STATE(HARDIRQ)
8LOCKDEP_STATE(SOFTIRQ)
9LOCKDEP_STATE(RECLAIM_FS)
diff --git a/kernel/mutex-debug.c b/kernel/mutex-debug.c
index 1d94160eb532..50d022e5a560 100644
--- a/kernel/mutex-debug.c
+++ b/kernel/mutex-debug.c
@@ -26,11 +26,6 @@
26/* 26/*
27 * Must be called with lock->wait_lock held. 27 * Must be called with lock->wait_lock held.
28 */ 28 */
29void debug_mutex_set_owner(struct mutex *lock, struct thread_info *new_owner)
30{
31 lock->owner = new_owner;
32}
33
34void debug_mutex_lock_common(struct mutex *lock, struct mutex_waiter *waiter) 29void debug_mutex_lock_common(struct mutex *lock, struct mutex_waiter *waiter)
35{ 30{
36 memset(waiter, MUTEX_DEBUG_INIT, sizeof(*waiter)); 31 memset(waiter, MUTEX_DEBUG_INIT, sizeof(*waiter));
@@ -59,7 +54,6 @@ void debug_mutex_add_waiter(struct mutex *lock, struct mutex_waiter *waiter,
59 54
60 /* Mark the current thread as blocked on the lock: */ 55 /* Mark the current thread as blocked on the lock: */
61 ti->task->blocked_on = waiter; 56 ti->task->blocked_on = waiter;
62 waiter->lock = lock;
63} 57}
64 58
65void mutex_remove_waiter(struct mutex *lock, struct mutex_waiter *waiter, 59void mutex_remove_waiter(struct mutex *lock, struct mutex_waiter *waiter,
@@ -82,7 +76,7 @@ void debug_mutex_unlock(struct mutex *lock)
82 DEBUG_LOCKS_WARN_ON(lock->magic != lock); 76 DEBUG_LOCKS_WARN_ON(lock->magic != lock);
83 DEBUG_LOCKS_WARN_ON(lock->owner != current_thread_info()); 77 DEBUG_LOCKS_WARN_ON(lock->owner != current_thread_info());
84 DEBUG_LOCKS_WARN_ON(!lock->wait_list.prev && !lock->wait_list.next); 78 DEBUG_LOCKS_WARN_ON(!lock->wait_list.prev && !lock->wait_list.next);
85 DEBUG_LOCKS_WARN_ON(lock->owner != current_thread_info()); 79 mutex_clear_owner(lock);
86} 80}
87 81
88void debug_mutex_init(struct mutex *lock, const char *name, 82void debug_mutex_init(struct mutex *lock, const char *name,
@@ -95,7 +89,6 @@ void debug_mutex_init(struct mutex *lock, const char *name,
95 debug_check_no_locks_freed((void *)lock, sizeof(*lock)); 89 debug_check_no_locks_freed((void *)lock, sizeof(*lock));
96 lockdep_init_map(&lock->dep_map, name, key, 0); 90 lockdep_init_map(&lock->dep_map, name, key, 0);
97#endif 91#endif
98 lock->owner = NULL;
99 lock->magic = lock; 92 lock->magic = lock;
100} 93}
101 94
diff --git a/kernel/mutex-debug.h b/kernel/mutex-debug.h
index babfbdfc534b..6b2d735846a5 100644
--- a/kernel/mutex-debug.h
+++ b/kernel/mutex-debug.h
@@ -13,14 +13,6 @@
13/* 13/*
14 * This must be called with lock->wait_lock held. 14 * This must be called with lock->wait_lock held.
15 */ 15 */
16extern void
17debug_mutex_set_owner(struct mutex *lock, struct thread_info *new_owner);
18
19static inline void debug_mutex_clear_owner(struct mutex *lock)
20{
21 lock->owner = NULL;
22}
23
24extern void debug_mutex_lock_common(struct mutex *lock, 16extern void debug_mutex_lock_common(struct mutex *lock,
25 struct mutex_waiter *waiter); 17 struct mutex_waiter *waiter);
26extern void debug_mutex_wake_waiter(struct mutex *lock, 18extern void debug_mutex_wake_waiter(struct mutex *lock,
@@ -35,6 +27,16 @@ extern void debug_mutex_unlock(struct mutex *lock);
35extern void debug_mutex_init(struct mutex *lock, const char *name, 27extern void debug_mutex_init(struct mutex *lock, const char *name,
36 struct lock_class_key *key); 28 struct lock_class_key *key);
37 29
30static inline void mutex_set_owner(struct mutex *lock)
31{
32 lock->owner = current_thread_info();
33}
34
35static inline void mutex_clear_owner(struct mutex *lock)
36{
37 lock->owner = NULL;
38}
39
38#define spin_lock_mutex(lock, flags) \ 40#define spin_lock_mutex(lock, flags) \
39 do { \ 41 do { \
40 struct mutex *l = container_of(lock, struct mutex, wait_lock); \ 42 struct mutex *l = container_of(lock, struct mutex, wait_lock); \
diff --git a/kernel/mutex.c b/kernel/mutex.c
index 4f45d4b658ef..5d79781394a3 100644
--- a/kernel/mutex.c
+++ b/kernel/mutex.c
@@ -10,6 +10,11 @@
10 * Many thanks to Arjan van de Ven, Thomas Gleixner, Steven Rostedt and 10 * Many thanks to Arjan van de Ven, Thomas Gleixner, Steven Rostedt and
11 * David Howells for suggestions and improvements. 11 * David Howells for suggestions and improvements.
12 * 12 *
13 * - Adaptive spinning for mutexes by Peter Zijlstra. (Ported to mainline
14 * from the -rt tree, where it was originally implemented for rtmutexes
15 * by Steven Rostedt, based on work by Gregory Haskins, Peter Morreale
16 * and Sven Dietrich.
17 *
13 * Also see Documentation/mutex-design.txt. 18 * Also see Documentation/mutex-design.txt.
14 */ 19 */
15#include <linux/mutex.h> 20#include <linux/mutex.h>
@@ -46,6 +51,7 @@ __mutex_init(struct mutex *lock, const char *name, struct lock_class_key *key)
46 atomic_set(&lock->count, 1); 51 atomic_set(&lock->count, 1);
47 spin_lock_init(&lock->wait_lock); 52 spin_lock_init(&lock->wait_lock);
48 INIT_LIST_HEAD(&lock->wait_list); 53 INIT_LIST_HEAD(&lock->wait_list);
54 mutex_clear_owner(lock);
49 55
50 debug_mutex_init(lock, name, key); 56 debug_mutex_init(lock, name, key);
51} 57}
@@ -91,6 +97,7 @@ void inline __sched mutex_lock(struct mutex *lock)
91 * 'unlocked' into 'locked' state. 97 * 'unlocked' into 'locked' state.
92 */ 98 */
93 __mutex_fastpath_lock(&lock->count, __mutex_lock_slowpath); 99 __mutex_fastpath_lock(&lock->count, __mutex_lock_slowpath);
100 mutex_set_owner(lock);
94} 101}
95 102
96EXPORT_SYMBOL(mutex_lock); 103EXPORT_SYMBOL(mutex_lock);
@@ -115,6 +122,14 @@ void __sched mutex_unlock(struct mutex *lock)
115 * The unlocking fastpath is the 0->1 transition from 'locked' 122 * The unlocking fastpath is the 0->1 transition from 'locked'
116 * into 'unlocked' state: 123 * into 'unlocked' state:
117 */ 124 */
125#ifndef CONFIG_DEBUG_MUTEXES
126 /*
127 * When debugging is enabled we must not clear the owner before time,
128 * the slow path will always be taken, and that clears the owner field
129 * after verifying that it was indeed current.
130 */
131 mutex_clear_owner(lock);
132#endif
118 __mutex_fastpath_unlock(&lock->count, __mutex_unlock_slowpath); 133 __mutex_fastpath_unlock(&lock->count, __mutex_unlock_slowpath);
119} 134}
120 135
@@ -129,21 +144,75 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass,
129{ 144{
130 struct task_struct *task = current; 145 struct task_struct *task = current;
131 struct mutex_waiter waiter; 146 struct mutex_waiter waiter;
132 unsigned int old_val;
133 unsigned long flags; 147 unsigned long flags;
134 148
149 preempt_disable();
150 mutex_acquire(&lock->dep_map, subclass, 0, ip);
151#if defined(CONFIG_SMP) && !defined(CONFIG_DEBUG_MUTEXES)
152 /*
153 * Optimistic spinning.
154 *
155 * We try to spin for acquisition when we find that there are no
156 * pending waiters and the lock owner is currently running on a
157 * (different) CPU.
158 *
159 * The rationale is that if the lock owner is running, it is likely to
160 * release the lock soon.
161 *
162 * Since this needs the lock owner, and this mutex implementation
163 * doesn't track the owner atomically in the lock field, we need to
164 * track it non-atomically.
165 *
166 * We can't do this for DEBUG_MUTEXES because that relies on wait_lock
167 * to serialize everything.
168 */
169
170 for (;;) {
171 struct thread_info *owner;
172
173 /*
174 * If there's an owner, wait for it to either
175 * release the lock or go to sleep.
176 */
177 owner = ACCESS_ONCE(lock->owner);
178 if (owner && !mutex_spin_on_owner(lock, owner))
179 break;
180
181 if (atomic_cmpxchg(&lock->count, 1, 0) == 1) {
182 lock_acquired(&lock->dep_map, ip);
183 mutex_set_owner(lock);
184 preempt_enable();
185 return 0;
186 }
187
188 /*
189 * When there's no owner, we might have preempted between the
190 * owner acquiring the lock and setting the owner field. If
191 * we're an RT task that will live-lock because we won't let
192 * the owner complete.
193 */
194 if (!owner && (need_resched() || rt_task(task)))
195 break;
196
197 /*
198 * The cpu_relax() call is a compiler barrier which forces
199 * everything in this loop to be re-loaded. We don't need
200 * memory barriers as we'll eventually observe the right
201 * values at the cost of a few extra spins.
202 */
203 cpu_relax();
204 }
205#endif
135 spin_lock_mutex(&lock->wait_lock, flags); 206 spin_lock_mutex(&lock->wait_lock, flags);
136 207
137 debug_mutex_lock_common(lock, &waiter); 208 debug_mutex_lock_common(lock, &waiter);
138 mutex_acquire(&lock->dep_map, subclass, 0, ip);
139 debug_mutex_add_waiter(lock, &waiter, task_thread_info(task)); 209 debug_mutex_add_waiter(lock, &waiter, task_thread_info(task));
140 210
141 /* add waiting tasks to the end of the waitqueue (FIFO): */ 211 /* add waiting tasks to the end of the waitqueue (FIFO): */
142 list_add_tail(&waiter.list, &lock->wait_list); 212 list_add_tail(&waiter.list, &lock->wait_list);
143 waiter.task = task; 213 waiter.task = task;
144 214
145 old_val = atomic_xchg(&lock->count, -1); 215 if (atomic_xchg(&lock->count, -1) == 1)
146 if (old_val == 1)
147 goto done; 216 goto done;
148 217
149 lock_contended(&lock->dep_map, ip); 218 lock_contended(&lock->dep_map, ip);
@@ -158,8 +227,7 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass,
158 * that when we release the lock, we properly wake up the 227 * that when we release the lock, we properly wake up the
159 * other waiters: 228 * other waiters:
160 */ 229 */
161 old_val = atomic_xchg(&lock->count, -1); 230 if (atomic_xchg(&lock->count, -1) == 1)
162 if (old_val == 1)
163 break; 231 break;
164 232
165 /* 233 /*
@@ -173,21 +241,22 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass,
173 spin_unlock_mutex(&lock->wait_lock, flags); 241 spin_unlock_mutex(&lock->wait_lock, flags);
174 242
175 debug_mutex_free_waiter(&waiter); 243 debug_mutex_free_waiter(&waiter);
244 preempt_enable();
176 return -EINTR; 245 return -EINTR;
177 } 246 }
178 __set_task_state(task, state); 247 __set_task_state(task, state);
179 248
180 /* didnt get the lock, go to sleep: */ 249 /* didnt get the lock, go to sleep: */
181 spin_unlock_mutex(&lock->wait_lock, flags); 250 spin_unlock_mutex(&lock->wait_lock, flags);
182 schedule(); 251 __schedule();
183 spin_lock_mutex(&lock->wait_lock, flags); 252 spin_lock_mutex(&lock->wait_lock, flags);
184 } 253 }
185 254
186done: 255done:
187 lock_acquired(&lock->dep_map, ip); 256 lock_acquired(&lock->dep_map, ip);
188 /* got the lock - rejoice! */ 257 /* got the lock - rejoice! */
189 mutex_remove_waiter(lock, &waiter, task_thread_info(task)); 258 mutex_remove_waiter(lock, &waiter, current_thread_info());
190 debug_mutex_set_owner(lock, task_thread_info(task)); 259 mutex_set_owner(lock);
191 260
192 /* set it to 0 if there are no waiters left: */ 261 /* set it to 0 if there are no waiters left: */
193 if (likely(list_empty(&lock->wait_list))) 262 if (likely(list_empty(&lock->wait_list)))
@@ -196,6 +265,7 @@ done:
196 spin_unlock_mutex(&lock->wait_lock, flags); 265 spin_unlock_mutex(&lock->wait_lock, flags);
197 266
198 debug_mutex_free_waiter(&waiter); 267 debug_mutex_free_waiter(&waiter);
268 preempt_enable();
199 269
200 return 0; 270 return 0;
201} 271}
@@ -222,7 +292,8 @@ int __sched
222mutex_lock_interruptible_nested(struct mutex *lock, unsigned int subclass) 292mutex_lock_interruptible_nested(struct mutex *lock, unsigned int subclass)
223{ 293{
224 might_sleep(); 294 might_sleep();
225 return __mutex_lock_common(lock, TASK_INTERRUPTIBLE, subclass, _RET_IP_); 295 return __mutex_lock_common(lock, TASK_INTERRUPTIBLE,
296 subclass, _RET_IP_);
226} 297}
227 298
228EXPORT_SYMBOL_GPL(mutex_lock_interruptible_nested); 299EXPORT_SYMBOL_GPL(mutex_lock_interruptible_nested);
@@ -260,8 +331,6 @@ __mutex_unlock_common_slowpath(atomic_t *lock_count, int nested)
260 wake_up_process(waiter->task); 331 wake_up_process(waiter->task);
261 } 332 }
262 333
263 debug_mutex_clear_owner(lock);
264
265 spin_unlock_mutex(&lock->wait_lock, flags); 334 spin_unlock_mutex(&lock->wait_lock, flags);
266} 335}
267 336
@@ -298,18 +367,30 @@ __mutex_lock_interruptible_slowpath(atomic_t *lock_count);
298 */ 367 */
299int __sched mutex_lock_interruptible(struct mutex *lock) 368int __sched mutex_lock_interruptible(struct mutex *lock)
300{ 369{
370 int ret;
371
301 might_sleep(); 372 might_sleep();
302 return __mutex_fastpath_lock_retval 373 ret = __mutex_fastpath_lock_retval
303 (&lock->count, __mutex_lock_interruptible_slowpath); 374 (&lock->count, __mutex_lock_interruptible_slowpath);
375 if (!ret)
376 mutex_set_owner(lock);
377
378 return ret;
304} 379}
305 380
306EXPORT_SYMBOL(mutex_lock_interruptible); 381EXPORT_SYMBOL(mutex_lock_interruptible);
307 382
308int __sched mutex_lock_killable(struct mutex *lock) 383int __sched mutex_lock_killable(struct mutex *lock)
309{ 384{
385 int ret;
386
310 might_sleep(); 387 might_sleep();
311 return __mutex_fastpath_lock_retval 388 ret = __mutex_fastpath_lock_retval
312 (&lock->count, __mutex_lock_killable_slowpath); 389 (&lock->count, __mutex_lock_killable_slowpath);
390 if (!ret)
391 mutex_set_owner(lock);
392
393 return ret;
313} 394}
314EXPORT_SYMBOL(mutex_lock_killable); 395EXPORT_SYMBOL(mutex_lock_killable);
315 396
@@ -352,9 +433,10 @@ static inline int __mutex_trylock_slowpath(atomic_t *lock_count)
352 433
353 prev = atomic_xchg(&lock->count, -1); 434 prev = atomic_xchg(&lock->count, -1);
354 if (likely(prev == 1)) { 435 if (likely(prev == 1)) {
355 debug_mutex_set_owner(lock, current_thread_info()); 436 mutex_set_owner(lock);
356 mutex_acquire(&lock->dep_map, 0, 1, _RET_IP_); 437 mutex_acquire(&lock->dep_map, 0, 1, _RET_IP_);
357 } 438 }
439
358 /* Set it back to 0 if there are no waiters: */ 440 /* Set it back to 0 if there are no waiters: */
359 if (likely(list_empty(&lock->wait_list))) 441 if (likely(list_empty(&lock->wait_list)))
360 atomic_set(&lock->count, 0); 442 atomic_set(&lock->count, 0);
@@ -380,8 +462,13 @@ static inline int __mutex_trylock_slowpath(atomic_t *lock_count)
380 */ 462 */
381int __sched mutex_trylock(struct mutex *lock) 463int __sched mutex_trylock(struct mutex *lock)
382{ 464{
383 return __mutex_fastpath_trylock(&lock->count, 465 int ret;
384 __mutex_trylock_slowpath); 466
467 ret = __mutex_fastpath_trylock(&lock->count, __mutex_trylock_slowpath);
468 if (ret)
469 mutex_set_owner(lock);
470
471 return ret;
385} 472}
386 473
387EXPORT_SYMBOL(mutex_trylock); 474EXPORT_SYMBOL(mutex_trylock);
diff --git a/kernel/mutex.h b/kernel/mutex.h
index a075dafbb290..67578ca48f94 100644
--- a/kernel/mutex.h
+++ b/kernel/mutex.h
@@ -16,8 +16,26 @@
16#define mutex_remove_waiter(lock, waiter, ti) \ 16#define mutex_remove_waiter(lock, waiter, ti) \
17 __list_del((waiter)->list.prev, (waiter)->list.next) 17 __list_del((waiter)->list.prev, (waiter)->list.next)
18 18
19#define debug_mutex_set_owner(lock, new_owner) do { } while (0) 19#ifdef CONFIG_SMP
20#define debug_mutex_clear_owner(lock) do { } while (0) 20static inline void mutex_set_owner(struct mutex *lock)
21{
22 lock->owner = current_thread_info();
23}
24
25static inline void mutex_clear_owner(struct mutex *lock)
26{
27 lock->owner = NULL;
28}
29#else
30static inline void mutex_set_owner(struct mutex *lock)
31{
32}
33
34static inline void mutex_clear_owner(struct mutex *lock)
35{
36}
37#endif
38
21#define debug_mutex_wake_waiter(lock, waiter) do { } while (0) 39#define debug_mutex_wake_waiter(lock, waiter) do { } while (0)
22#define debug_mutex_free_waiter(waiter) do { } while (0) 40#define debug_mutex_free_waiter(waiter) do { } while (0)
23#define debug_mutex_add_waiter(lock, waiter, ti) do { } while (0) 41#define debug_mutex_add_waiter(lock, waiter, ti) do { } while (0)
diff --git a/kernel/ns_cgroup.c b/kernel/ns_cgroup.c
index 78bc3fdac0d2..5aa854f9e5ae 100644
--- a/kernel/ns_cgroup.c
+++ b/kernel/ns_cgroup.c
@@ -34,7 +34,7 @@ int ns_cgroup_clone(struct task_struct *task, struct pid *pid)
34 34
35/* 35/*
36 * Rules: 36 * Rules:
37 * 1. you can only enter a cgroup which is a child of your current 37 * 1. you can only enter a cgroup which is a descendant of your current
38 * cgroup 38 * cgroup
39 * 2. you can only place another process into a cgroup if 39 * 2. you can only place another process into a cgroup if
40 * a. you have CAP_SYS_ADMIN 40 * a. you have CAP_SYS_ADMIN
@@ -45,21 +45,15 @@ int ns_cgroup_clone(struct task_struct *task, struct pid *pid)
45static int ns_can_attach(struct cgroup_subsys *ss, 45static int ns_can_attach(struct cgroup_subsys *ss,
46 struct cgroup *new_cgroup, struct task_struct *task) 46 struct cgroup *new_cgroup, struct task_struct *task)
47{ 47{
48 struct cgroup *orig;
49
50 if (current != task) { 48 if (current != task) {
51 if (!capable(CAP_SYS_ADMIN)) 49 if (!capable(CAP_SYS_ADMIN))
52 return -EPERM; 50 return -EPERM;
53 51
54 if (!cgroup_is_descendant(new_cgroup)) 52 if (!cgroup_is_descendant(new_cgroup, current))
55 return -EPERM; 53 return -EPERM;
56 } 54 }
57 55
58 if (atomic_read(&new_cgroup->count) != 0) 56 if (!cgroup_is_descendant(new_cgroup, task))
59 return -EPERM;
60
61 orig = task_cgroup(task, ns_subsys_id);
62 if (orig && orig != new_cgroup->parent)
63 return -EPERM; 57 return -EPERM;
64 58
65 return 0; 59 return 0;
@@ -77,7 +71,7 @@ static struct cgroup_subsys_state *ns_create(struct cgroup_subsys *ss,
77 71
78 if (!capable(CAP_SYS_ADMIN)) 72 if (!capable(CAP_SYS_ADMIN))
79 return ERR_PTR(-EPERM); 73 return ERR_PTR(-EPERM);
80 if (!cgroup_is_descendant(cgroup)) 74 if (!cgroup_is_descendant(cgroup, current))
81 return ERR_PTR(-EPERM); 75 return ERR_PTR(-EPERM);
82 76
83 ns_cgroup = kzalloc(sizeof(*ns_cgroup), GFP_KERNEL); 77 ns_cgroup = kzalloc(sizeof(*ns_cgroup), GFP_KERNEL);
diff --git a/kernel/pid.c b/kernel/pid.c
index 1b3586fe753a..b2e5f78fd281 100644
--- a/kernel/pid.c
+++ b/kernel/pid.c
@@ -403,6 +403,8 @@ struct pid *get_task_pid(struct task_struct *task, enum pid_type type)
403{ 403{
404 struct pid *pid; 404 struct pid *pid;
405 rcu_read_lock(); 405 rcu_read_lock();
406 if (type != PIDTYPE_PID)
407 task = task->group_leader;
406 pid = get_pid(task->pids[type].pid); 408 pid = get_pid(task->pids[type].pid);
407 rcu_read_unlock(); 409 rcu_read_unlock();
408 return pid; 410 return pid;
@@ -450,11 +452,24 @@ pid_t pid_vnr(struct pid *pid)
450} 452}
451EXPORT_SYMBOL_GPL(pid_vnr); 453EXPORT_SYMBOL_GPL(pid_vnr);
452 454
453pid_t task_pid_nr_ns(struct task_struct *tsk, struct pid_namespace *ns) 455pid_t __task_pid_nr_ns(struct task_struct *task, enum pid_type type,
456 struct pid_namespace *ns)
454{ 457{
455 return pid_nr_ns(task_pid(tsk), ns); 458 pid_t nr = 0;
459
460 rcu_read_lock();
461 if (!ns)
462 ns = current->nsproxy->pid_ns;
463 if (likely(pid_alive(task))) {
464 if (type != PIDTYPE_PID)
465 task = task->group_leader;
466 nr = pid_nr_ns(task->pids[type].pid, ns);
467 }
468 rcu_read_unlock();
469
470 return nr;
456} 471}
457EXPORT_SYMBOL(task_pid_nr_ns); 472EXPORT_SYMBOL(__task_pid_nr_ns);
458 473
459pid_t task_tgid_nr_ns(struct task_struct *tsk, struct pid_namespace *ns) 474pid_t task_tgid_nr_ns(struct task_struct *tsk, struct pid_namespace *ns)
460{ 475{
@@ -462,18 +477,6 @@ pid_t task_tgid_nr_ns(struct task_struct *tsk, struct pid_namespace *ns)
462} 477}
463EXPORT_SYMBOL(task_tgid_nr_ns); 478EXPORT_SYMBOL(task_tgid_nr_ns);
464 479
465pid_t task_pgrp_nr_ns(struct task_struct *tsk, struct pid_namespace *ns)
466{
467 return pid_nr_ns(task_pgrp(tsk), ns);
468}
469EXPORT_SYMBOL(task_pgrp_nr_ns);
470
471pid_t task_session_nr_ns(struct task_struct *tsk, struct pid_namespace *ns)
472{
473 return pid_nr_ns(task_session(tsk), ns);
474}
475EXPORT_SYMBOL(task_session_nr_ns);
476
477struct pid_namespace *task_active_pid_ns(struct task_struct *tsk) 480struct pid_namespace *task_active_pid_ns(struct task_struct *tsk)
478{ 481{
479 return ns_of_pid(task_pid(tsk)); 482 return ns_of_pid(task_pid(tsk));
diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c
index fab8ea86fac3..2d1001b4858d 100644
--- a/kernel/pid_namespace.c
+++ b/kernel/pid_namespace.c
@@ -152,6 +152,7 @@ void zap_pid_ns_processes(struct pid_namespace *pid_ns)
152{ 152{
153 int nr; 153 int nr;
154 int rc; 154 int rc;
155 struct task_struct *task;
155 156
156 /* 157 /*
157 * The last thread in the cgroup-init thread group is terminating. 158 * The last thread in the cgroup-init thread group is terminating.
@@ -169,7 +170,19 @@ void zap_pid_ns_processes(struct pid_namespace *pid_ns)
169 read_lock(&tasklist_lock); 170 read_lock(&tasklist_lock);
170 nr = next_pidmap(pid_ns, 1); 171 nr = next_pidmap(pid_ns, 1);
171 while (nr > 0) { 172 while (nr > 0) {
172 kill_proc_info(SIGKILL, SEND_SIG_PRIV, nr); 173 rcu_read_lock();
174
175 /*
176 * Use force_sig() since it clears SIGNAL_UNKILLABLE ensuring
177 * any nested-container's init processes don't ignore the
178 * signal
179 */
180 task = pid_task(find_vpid(nr), PIDTYPE_PID);
181 if (task)
182 force_sig(SIGKILL, task);
183
184 rcu_read_unlock();
185
173 nr = next_pidmap(pid_ns, nr); 186 nr = next_pidmap(pid_ns, nr);
174 } 187 }
175 read_unlock(&tasklist_lock); 188 read_unlock(&tasklist_lock);
diff --git a/kernel/power/disk.c b/kernel/power/disk.c
index 9d1c1a0de350..5f21ab2bbcdf 100644
--- a/kernel/power/disk.c
+++ b/kernel/power/disk.c
@@ -22,6 +22,7 @@
22#include <linux/console.h> 22#include <linux/console.h>
23#include <linux/cpu.h> 23#include <linux/cpu.h>
24#include <linux/freezer.h> 24#include <linux/freezer.h>
25#include <asm/suspend.h>
25 26
26#include "power.h" 27#include "power.h"
27 28
@@ -214,7 +215,7 @@ static int create_image(int platform_mode)
214 return error; 215 return error;
215 216
216 device_pm_lock(); 217 device_pm_lock();
217 local_irq_disable(); 218
218 /* At this point, device_suspend() has been called, but *not* 219 /* At this point, device_suspend() has been called, but *not*
219 * device_power_down(). We *must* call device_power_down() now. 220 * device_power_down(). We *must* call device_power_down() now.
220 * Otherwise, drivers for some devices (e.g. interrupt controllers) 221 * Otherwise, drivers for some devices (e.g. interrupt controllers)
@@ -225,13 +226,25 @@ static int create_image(int platform_mode)
225 if (error) { 226 if (error) {
226 printk(KERN_ERR "PM: Some devices failed to power down, " 227 printk(KERN_ERR "PM: Some devices failed to power down, "
227 "aborting hibernation\n"); 228 "aborting hibernation\n");
228 goto Enable_irqs; 229 goto Unlock;
229 } 230 }
231
232 error = platform_pre_snapshot(platform_mode);
233 if (error || hibernation_test(TEST_PLATFORM))
234 goto Platform_finish;
235
236 error = disable_nonboot_cpus();
237 if (error || hibernation_test(TEST_CPUS)
238 || hibernation_testmode(HIBERNATION_TEST))
239 goto Enable_cpus;
240
241 local_irq_disable();
242
230 sysdev_suspend(PMSG_FREEZE); 243 sysdev_suspend(PMSG_FREEZE);
231 if (error) { 244 if (error) {
232 printk(KERN_ERR "PM: Some devices failed to power down, " 245 printk(KERN_ERR "PM: Some devices failed to power down, "
233 "aborting hibernation\n"); 246 "aborting hibernation\n");
234 goto Power_up_devices; 247 goto Enable_irqs;
235 } 248 }
236 249
237 if (hibernation_test(TEST_CORE)) 250 if (hibernation_test(TEST_CORE))
@@ -247,17 +260,28 @@ static int create_image(int platform_mode)
247 restore_processor_state(); 260 restore_processor_state();
248 if (!in_suspend) 261 if (!in_suspend)
249 platform_leave(platform_mode); 262 platform_leave(platform_mode);
263
250 Power_up: 264 Power_up:
251 sysdev_resume(); 265 sysdev_resume();
252 /* NOTE: device_power_up() is just a resume() for devices 266 /* NOTE: device_power_up() is just a resume() for devices
253 * that suspended with irqs off ... no overall powerup. 267 * that suspended with irqs off ... no overall powerup.
254 */ 268 */
255 Power_up_devices: 269
256 device_power_up(in_suspend ?
257 (error ? PMSG_RECOVER : PMSG_THAW) : PMSG_RESTORE);
258 Enable_irqs: 270 Enable_irqs:
259 local_irq_enable(); 271 local_irq_enable();
272
273 Enable_cpus:
274 enable_nonboot_cpus();
275
276 Platform_finish:
277 platform_finish(platform_mode);
278
279 device_power_up(in_suspend ?
280 (error ? PMSG_RECOVER : PMSG_THAW) : PMSG_RESTORE);
281
282 Unlock:
260 device_pm_unlock(); 283 device_pm_unlock();
284
261 return error; 285 return error;
262} 286}
263 287
@@ -291,25 +315,9 @@ int hibernation_snapshot(int platform_mode)
291 if (hibernation_test(TEST_DEVICES)) 315 if (hibernation_test(TEST_DEVICES))
292 goto Recover_platform; 316 goto Recover_platform;
293 317
294 error = platform_pre_snapshot(platform_mode); 318 error = create_image(platform_mode);
295 if (error || hibernation_test(TEST_PLATFORM)) 319 /* Control returns here after successful restore */
296 goto Finish;
297
298 error = disable_nonboot_cpus();
299 if (!error) {
300 if (hibernation_test(TEST_CPUS))
301 goto Enable_cpus;
302
303 if (hibernation_testmode(HIBERNATION_TEST))
304 goto Enable_cpus;
305 320
306 error = create_image(platform_mode);
307 /* Control returns here after successful restore */
308 }
309 Enable_cpus:
310 enable_nonboot_cpus();
311 Finish:
312 platform_finish(platform_mode);
313 Resume_devices: 321 Resume_devices:
314 device_resume(in_suspend ? 322 device_resume(in_suspend ?
315 (error ? PMSG_RECOVER : PMSG_THAW) : PMSG_RESTORE); 323 (error ? PMSG_RECOVER : PMSG_THAW) : PMSG_RESTORE);
@@ -331,19 +339,33 @@ int hibernation_snapshot(int platform_mode)
331 * kernel. 339 * kernel.
332 */ 340 */
333 341
334static int resume_target_kernel(void) 342static int resume_target_kernel(bool platform_mode)
335{ 343{
336 int error; 344 int error;
337 345
338 device_pm_lock(); 346 device_pm_lock();
339 local_irq_disable(); 347
340 error = device_power_down(PMSG_QUIESCE); 348 error = device_power_down(PMSG_QUIESCE);
341 if (error) { 349 if (error) {
342 printk(KERN_ERR "PM: Some devices failed to power down, " 350 printk(KERN_ERR "PM: Some devices failed to power down, "
343 "aborting resume\n"); 351 "aborting resume\n");
344 goto Enable_irqs; 352 goto Unlock;
345 } 353 }
346 sysdev_suspend(PMSG_QUIESCE); 354
355 error = platform_pre_restore(platform_mode);
356 if (error)
357 goto Cleanup;
358
359 error = disable_nonboot_cpus();
360 if (error)
361 goto Enable_cpus;
362
363 local_irq_disable();
364
365 error = sysdev_suspend(PMSG_QUIESCE);
366 if (error)
367 goto Enable_irqs;
368
347 /* We'll ignore saved state, but this gets preempt count (etc) right */ 369 /* We'll ignore saved state, but this gets preempt count (etc) right */
348 save_processor_state(); 370 save_processor_state();
349 error = restore_highmem(); 371 error = restore_highmem();
@@ -366,11 +388,23 @@ static int resume_target_kernel(void)
366 swsusp_free(); 388 swsusp_free();
367 restore_processor_state(); 389 restore_processor_state();
368 touch_softlockup_watchdog(); 390 touch_softlockup_watchdog();
391
369 sysdev_resume(); 392 sysdev_resume();
370 device_power_up(PMSG_RECOVER); 393
371 Enable_irqs: 394 Enable_irqs:
372 local_irq_enable(); 395 local_irq_enable();
396
397 Enable_cpus:
398 enable_nonboot_cpus();
399
400 Cleanup:
401 platform_restore_cleanup(platform_mode);
402
403 device_power_up(PMSG_RECOVER);
404
405 Unlock:
373 device_pm_unlock(); 406 device_pm_unlock();
407
374 return error; 408 return error;
375} 409}
376 410
@@ -390,19 +424,10 @@ int hibernation_restore(int platform_mode)
390 pm_prepare_console(); 424 pm_prepare_console();
391 suspend_console(); 425 suspend_console();
392 error = device_suspend(PMSG_QUIESCE); 426 error = device_suspend(PMSG_QUIESCE);
393 if (error)
394 goto Finish;
395
396 error = platform_pre_restore(platform_mode);
397 if (!error) { 427 if (!error) {
398 error = disable_nonboot_cpus(); 428 error = resume_target_kernel(platform_mode);
399 if (!error) 429 device_resume(PMSG_RECOVER);
400 error = resume_target_kernel();
401 enable_nonboot_cpus();
402 } 430 }
403 platform_restore_cleanup(platform_mode);
404 device_resume(PMSG_RECOVER);
405 Finish:
406 resume_console(); 431 resume_console();
407 pm_restore_console(); 432 pm_restore_console();
408 return error; 433 return error;
@@ -438,38 +463,46 @@ int hibernation_platform_enter(void)
438 goto Resume_devices; 463 goto Resume_devices;
439 } 464 }
440 465
466 device_pm_lock();
467
468 error = device_power_down(PMSG_HIBERNATE);
469 if (error)
470 goto Unlock;
471
441 error = hibernation_ops->prepare(); 472 error = hibernation_ops->prepare();
442 if (error) 473 if (error)
443 goto Resume_devices; 474 goto Platofrm_finish;
444 475
445 error = disable_nonboot_cpus(); 476 error = disable_nonboot_cpus();
446 if (error) 477 if (error)
447 goto Finish; 478 goto Platofrm_finish;
448 479
449 device_pm_lock();
450 local_irq_disable(); 480 local_irq_disable();
451 error = device_power_down(PMSG_HIBERNATE); 481 sysdev_suspend(PMSG_HIBERNATE);
452 if (!error) { 482 hibernation_ops->enter();
453 sysdev_suspend(PMSG_HIBERNATE); 483 /* We should never get here */
454 hibernation_ops->enter(); 484 while (1);
455 /* We should never get here */
456 while (1);
457 }
458 local_irq_enable();
459 device_pm_unlock();
460 485
461 /* 486 /*
462 * We don't need to reenable the nonboot CPUs or resume consoles, since 487 * We don't need to reenable the nonboot CPUs or resume consoles, since
463 * the system is going to be halted anyway. 488 * the system is going to be halted anyway.
464 */ 489 */
465 Finish: 490 Platofrm_finish:
466 hibernation_ops->finish(); 491 hibernation_ops->finish();
492
493 device_power_up(PMSG_RESTORE);
494
495 Unlock:
496 device_pm_unlock();
497
467 Resume_devices: 498 Resume_devices:
468 entering_platform_hibernation = false; 499 entering_platform_hibernation = false;
469 device_resume(PMSG_RESTORE); 500 device_resume(PMSG_RESTORE);
470 resume_console(); 501 resume_console();
502
471 Close: 503 Close:
472 hibernation_ops->end(); 504 hibernation_ops->end();
505
473 return error; 506 return error;
474} 507}
475 508
diff --git a/kernel/power/main.c b/kernel/power/main.c
index c9632f841f64..f172f41858bb 100644
--- a/kernel/power/main.c
+++ b/kernel/power/main.c
@@ -287,17 +287,32 @@ void __attribute__ ((weak)) arch_suspend_enable_irqs(void)
287 */ 287 */
288static int suspend_enter(suspend_state_t state) 288static int suspend_enter(suspend_state_t state)
289{ 289{
290 int error = 0; 290 int error;
291 291
292 device_pm_lock(); 292 device_pm_lock();
293 arch_suspend_disable_irqs();
294 BUG_ON(!irqs_disabled());
295 293
296 if ((error = device_power_down(PMSG_SUSPEND))) { 294 error = device_power_down(PMSG_SUSPEND);
295 if (error) {
297 printk(KERN_ERR "PM: Some devices failed to power down\n"); 296 printk(KERN_ERR "PM: Some devices failed to power down\n");
298 goto Done; 297 goto Done;
299 } 298 }
300 299
300 if (suspend_ops->prepare) {
301 error = suspend_ops->prepare();
302 if (error)
303 goto Power_up_devices;
304 }
305
306 if (suspend_test(TEST_PLATFORM))
307 goto Platfrom_finish;
308
309 error = disable_nonboot_cpus();
310 if (error || suspend_test(TEST_CPUS))
311 goto Enable_cpus;
312
313 arch_suspend_disable_irqs();
314 BUG_ON(!irqs_disabled());
315
301 error = sysdev_suspend(PMSG_SUSPEND); 316 error = sysdev_suspend(PMSG_SUSPEND);
302 if (!error) { 317 if (!error) {
303 if (!suspend_test(TEST_CORE)) 318 if (!suspend_test(TEST_CORE))
@@ -305,11 +320,22 @@ static int suspend_enter(suspend_state_t state)
305 sysdev_resume(); 320 sysdev_resume();
306 } 321 }
307 322
308 device_power_up(PMSG_RESUME);
309 Done:
310 arch_suspend_enable_irqs(); 323 arch_suspend_enable_irqs();
311 BUG_ON(irqs_disabled()); 324 BUG_ON(irqs_disabled());
325
326 Enable_cpus:
327 enable_nonboot_cpus();
328
329 Platfrom_finish:
330 if (suspend_ops->finish)
331 suspend_ops->finish();
332
333 Power_up_devices:
334 device_power_up(PMSG_RESUME);
335
336 Done:
312 device_pm_unlock(); 337 device_pm_unlock();
338
313 return error; 339 return error;
314} 340}
315 341
@@ -341,23 +367,8 @@ int suspend_devices_and_enter(suspend_state_t state)
341 if (suspend_test(TEST_DEVICES)) 367 if (suspend_test(TEST_DEVICES))
342 goto Recover_platform; 368 goto Recover_platform;
343 369
344 if (suspend_ops->prepare) { 370 suspend_enter(state);
345 error = suspend_ops->prepare();
346 if (error)
347 goto Resume_devices;
348 }
349
350 if (suspend_test(TEST_PLATFORM))
351 goto Finish;
352
353 error = disable_nonboot_cpus();
354 if (!error && !suspend_test(TEST_CPUS))
355 suspend_enter(state);
356 371
357 enable_nonboot_cpus();
358 Finish:
359 if (suspend_ops->finish)
360 suspend_ops->finish();
361 Resume_devices: 372 Resume_devices:
362 suspend_test_start(); 373 suspend_test_start();
363 device_resume(PMSG_RESUME); 374 device_resume(PMSG_RESUME);
diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c
index f5fc2d7680f2..33e2e4a819f9 100644
--- a/kernel/power/snapshot.c
+++ b/kernel/power/snapshot.c
@@ -321,13 +321,10 @@ static int create_mem_extents(struct list_head *list, gfp_t gfp_mask)
321 321
322 INIT_LIST_HEAD(list); 322 INIT_LIST_HEAD(list);
323 323
324 for_each_zone(zone) { 324 for_each_populated_zone(zone) {
325 unsigned long zone_start, zone_end; 325 unsigned long zone_start, zone_end;
326 struct mem_extent *ext, *cur, *aux; 326 struct mem_extent *ext, *cur, *aux;
327 327
328 if (!populated_zone(zone))
329 continue;
330
331 zone_start = zone->zone_start_pfn; 328 zone_start = zone->zone_start_pfn;
332 zone_end = zone->zone_start_pfn + zone->spanned_pages; 329 zone_end = zone->zone_start_pfn + zone->spanned_pages;
333 330
@@ -804,8 +801,8 @@ static unsigned int count_free_highmem_pages(void)
804 struct zone *zone; 801 struct zone *zone;
805 unsigned int cnt = 0; 802 unsigned int cnt = 0;
806 803
807 for_each_zone(zone) 804 for_each_populated_zone(zone)
808 if (populated_zone(zone) && is_highmem(zone)) 805 if (is_highmem(zone))
809 cnt += zone_page_state(zone, NR_FREE_PAGES); 806 cnt += zone_page_state(zone, NR_FREE_PAGES);
810 807
811 return cnt; 808 return cnt;
diff --git a/kernel/power/swsusp.c b/kernel/power/swsusp.c
index a92c91451559..78c35047586d 100644
--- a/kernel/power/swsusp.c
+++ b/kernel/power/swsusp.c
@@ -51,6 +51,7 @@
51#include <linux/highmem.h> 51#include <linux/highmem.h>
52#include <linux/time.h> 52#include <linux/time.h>
53#include <linux/rbtree.h> 53#include <linux/rbtree.h>
54#include <linux/io.h>
54 55
55#include "power.h" 56#include "power.h"
56 57
@@ -229,17 +230,16 @@ int swsusp_shrink_memory(void)
229 size = count_data_pages() + PAGES_FOR_IO + SPARE_PAGES; 230 size = count_data_pages() + PAGES_FOR_IO + SPARE_PAGES;
230 tmp = size; 231 tmp = size;
231 size += highmem_size; 232 size += highmem_size;
232 for_each_zone (zone) 233 for_each_populated_zone(zone) {
233 if (populated_zone(zone)) { 234 tmp += snapshot_additional_pages(zone);
234 tmp += snapshot_additional_pages(zone); 235 if (is_highmem(zone)) {
235 if (is_highmem(zone)) { 236 highmem_size -=
236 highmem_size -=
237 zone_page_state(zone, NR_FREE_PAGES); 237 zone_page_state(zone, NR_FREE_PAGES);
238 } else { 238 } else {
239 tmp -= zone_page_state(zone, NR_FREE_PAGES); 239 tmp -= zone_page_state(zone, NR_FREE_PAGES);
240 tmp += zone->lowmem_reserve[ZONE_NORMAL]; 240 tmp += zone->lowmem_reserve[ZONE_NORMAL];
241 }
242 } 241 }
242 }
243 243
244 if (highmem_size < 0) 244 if (highmem_size < 0)
245 highmem_size = 0; 245 highmem_size = 0;
diff --git a/kernel/printk.c b/kernel/printk.c
index e3602d0755b0..a5f61a9acedb 100644
--- a/kernel/printk.c
+++ b/kernel/printk.c
@@ -32,6 +32,7 @@
32#include <linux/security.h> 32#include <linux/security.h>
33#include <linux/bootmem.h> 33#include <linux/bootmem.h>
34#include <linux/syscalls.h> 34#include <linux/syscalls.h>
35#include <linux/kexec.h>
35 36
36#include <asm/uaccess.h> 37#include <asm/uaccess.h>
37 38
@@ -135,6 +136,24 @@ static char *log_buf = __log_buf;
135static int log_buf_len = __LOG_BUF_LEN; 136static int log_buf_len = __LOG_BUF_LEN;
136static unsigned logged_chars; /* Number of chars produced since last read+clear operation */ 137static unsigned logged_chars; /* Number of chars produced since last read+clear operation */
137 138
139#ifdef CONFIG_KEXEC
140/*
141 * This appends the listed symbols to /proc/vmcoreinfo
142 *
143 * /proc/vmcoreinfo is used by various utiilties, like crash and makedumpfile to
144 * obtain access to symbols that are otherwise very difficult to locate. These
145 * symbols are specifically used so that utilities can access and extract the
146 * dmesg log from a vmcore file after a crash.
147 */
148void log_buf_kexec_setup(void)
149{
150 VMCOREINFO_SYMBOL(log_buf);
151 VMCOREINFO_SYMBOL(log_end);
152 VMCOREINFO_SYMBOL(log_buf_len);
153 VMCOREINFO_SYMBOL(logged_chars);
154}
155#endif
156
138static int __init log_buf_len_setup(char *str) 157static int __init log_buf_len_setup(char *str)
139{ 158{
140 unsigned size = memparse(str, &str); 159 unsigned size = memparse(str, &str);
diff --git a/kernel/ptrace.c b/kernel/ptrace.c
index c9cf48b21f05..5105f5a6a2ce 100644
--- a/kernel/ptrace.c
+++ b/kernel/ptrace.c
@@ -60,11 +60,15 @@ static void ptrace_untrace(struct task_struct *child)
60{ 60{
61 spin_lock(&child->sighand->siglock); 61 spin_lock(&child->sighand->siglock);
62 if (task_is_traced(child)) { 62 if (task_is_traced(child)) {
63 if (child->signal->flags & SIGNAL_STOP_STOPPED) { 63 /*
64 * If the group stop is completed or in progress,
65 * this thread was already counted as stopped.
66 */
67 if (child->signal->flags & SIGNAL_STOP_STOPPED ||
68 child->signal->group_stop_count)
64 __set_task_state(child, TASK_STOPPED); 69 __set_task_state(child, TASK_STOPPED);
65 } else { 70 else
66 signal_wake_up(child, 1); 71 signal_wake_up(child, 1);
67 }
68 } 72 }
69 spin_unlock(&child->sighand->siglock); 73 spin_unlock(&child->sighand->siglock);
70} 74}
@@ -235,18 +239,58 @@ out:
235 return retval; 239 return retval;
236} 240}
237 241
238static inline void __ptrace_detach(struct task_struct *child, unsigned int data) 242/*
243 * Called with irqs disabled, returns true if childs should reap themselves.
244 */
245static int ignoring_children(struct sighand_struct *sigh)
239{ 246{
240 child->exit_code = data; 247 int ret;
241 /* .. re-parent .. */ 248 spin_lock(&sigh->siglock);
242 __ptrace_unlink(child); 249 ret = (sigh->action[SIGCHLD-1].sa.sa_handler == SIG_IGN) ||
243 /* .. and wake it up. */ 250 (sigh->action[SIGCHLD-1].sa.sa_flags & SA_NOCLDWAIT);
244 if (child->exit_state != EXIT_ZOMBIE) 251 spin_unlock(&sigh->siglock);
245 wake_up_process(child); 252 return ret;
253}
254
255/*
256 * Called with tasklist_lock held for writing.
257 * Unlink a traced task, and clean it up if it was a traced zombie.
258 * Return true if it needs to be reaped with release_task().
259 * (We can't call release_task() here because we already hold tasklist_lock.)
260 *
261 * If it's a zombie, our attachedness prevented normal parent notification
262 * or self-reaping. Do notification now if it would have happened earlier.
263 * If it should reap itself, return true.
264 *
265 * If it's our own child, there is no notification to do.
266 * But if our normal children self-reap, then this child
267 * was prevented by ptrace and we must reap it now.
268 */
269static bool __ptrace_detach(struct task_struct *tracer, struct task_struct *p)
270{
271 __ptrace_unlink(p);
272
273 if (p->exit_state == EXIT_ZOMBIE) {
274 if (!task_detached(p) && thread_group_empty(p)) {
275 if (!same_thread_group(p->real_parent, tracer))
276 do_notify_parent(p, p->exit_signal);
277 else if (ignoring_children(tracer->sighand))
278 p->exit_signal = -1;
279 }
280 if (task_detached(p)) {
281 /* Mark it as in the process of being reaped. */
282 p->exit_state = EXIT_DEAD;
283 return true;
284 }
285 }
286
287 return false;
246} 288}
247 289
248int ptrace_detach(struct task_struct *child, unsigned int data) 290int ptrace_detach(struct task_struct *child, unsigned int data)
249{ 291{
292 bool dead = false;
293
250 if (!valid_signal(data)) 294 if (!valid_signal(data))
251 return -EIO; 295 return -EIO;
252 296
@@ -255,14 +299,45 @@ int ptrace_detach(struct task_struct *child, unsigned int data)
255 clear_tsk_thread_flag(child, TIF_SYSCALL_TRACE); 299 clear_tsk_thread_flag(child, TIF_SYSCALL_TRACE);
256 300
257 write_lock_irq(&tasklist_lock); 301 write_lock_irq(&tasklist_lock);
258 /* protect against de_thread()->release_task() */ 302 /*
259 if (child->ptrace) 303 * This child can be already killed. Make sure de_thread() or
260 __ptrace_detach(child, data); 304 * our sub-thread doing do_wait() didn't do release_task() yet.
305 */
306 if (child->ptrace) {
307 child->exit_code = data;
308 dead = __ptrace_detach(current, child);
309 }
261 write_unlock_irq(&tasklist_lock); 310 write_unlock_irq(&tasklist_lock);
262 311
312 if (unlikely(dead))
313 release_task(child);
314
263 return 0; 315 return 0;
264} 316}
265 317
318/*
319 * Detach all tasks we were using ptrace on.
320 */
321void exit_ptrace(struct task_struct *tracer)
322{
323 struct task_struct *p, *n;
324 LIST_HEAD(ptrace_dead);
325
326 write_lock_irq(&tasklist_lock);
327 list_for_each_entry_safe(p, n, &tracer->ptraced, ptrace_entry) {
328 if (__ptrace_detach(tracer, p))
329 list_add(&p->ptrace_entry, &ptrace_dead);
330 }
331 write_unlock_irq(&tasklist_lock);
332
333 BUG_ON(!list_empty(&tracer->ptraced));
334
335 list_for_each_entry_safe(p, n, &ptrace_dead, ptrace_entry) {
336 list_del_init(&p->ptrace_entry);
337 release_task(p);
338 }
339}
340
266int ptrace_readdata(struct task_struct *tsk, unsigned long src, char __user *dst, int len) 341int ptrace_readdata(struct task_struct *tsk, unsigned long src, char __user *dst, int len)
267{ 342{
268 int copied = 0; 343 int copied = 0;
diff --git a/kernel/rcutorture.c b/kernel/rcutorture.c
index 7c4142a79f0a..9b4a975a4b4a 100644
--- a/kernel/rcutorture.c
+++ b/kernel/rcutorture.c
@@ -126,6 +126,7 @@ static atomic_t n_rcu_torture_mberror;
126static atomic_t n_rcu_torture_error; 126static atomic_t n_rcu_torture_error;
127static long n_rcu_torture_timers = 0; 127static long n_rcu_torture_timers = 0;
128static struct list_head rcu_torture_removed; 128static struct list_head rcu_torture_removed;
129static cpumask_var_t shuffle_tmp_mask;
129 130
130static int stutter_pause_test = 0; 131static int stutter_pause_test = 0;
131 132
@@ -889,10 +890,9 @@ static int rcu_idle_cpu; /* Force all torture tasks off this CPU */
889 */ 890 */
890static void rcu_torture_shuffle_tasks(void) 891static void rcu_torture_shuffle_tasks(void)
891{ 892{
892 cpumask_t tmp_mask;
893 int i; 893 int i;
894 894
895 cpus_setall(tmp_mask); 895 cpumask_setall(shuffle_tmp_mask);
896 get_online_cpus(); 896 get_online_cpus();
897 897
898 /* No point in shuffling if there is only one online CPU (ex: UP) */ 898 /* No point in shuffling if there is only one online CPU (ex: UP) */
@@ -902,29 +902,29 @@ static void rcu_torture_shuffle_tasks(void)
902 } 902 }
903 903
904 if (rcu_idle_cpu != -1) 904 if (rcu_idle_cpu != -1)
905 cpu_clear(rcu_idle_cpu, tmp_mask); 905 cpumask_clear_cpu(rcu_idle_cpu, shuffle_tmp_mask);
906 906
907 set_cpus_allowed_ptr(current, &tmp_mask); 907 set_cpus_allowed_ptr(current, shuffle_tmp_mask);
908 908
909 if (reader_tasks) { 909 if (reader_tasks) {
910 for (i = 0; i < nrealreaders; i++) 910 for (i = 0; i < nrealreaders; i++)
911 if (reader_tasks[i]) 911 if (reader_tasks[i])
912 set_cpus_allowed_ptr(reader_tasks[i], 912 set_cpus_allowed_ptr(reader_tasks[i],
913 &tmp_mask); 913 shuffle_tmp_mask);
914 } 914 }
915 915
916 if (fakewriter_tasks) { 916 if (fakewriter_tasks) {
917 for (i = 0; i < nfakewriters; i++) 917 for (i = 0; i < nfakewriters; i++)
918 if (fakewriter_tasks[i]) 918 if (fakewriter_tasks[i])
919 set_cpus_allowed_ptr(fakewriter_tasks[i], 919 set_cpus_allowed_ptr(fakewriter_tasks[i],
920 &tmp_mask); 920 shuffle_tmp_mask);
921 } 921 }
922 922
923 if (writer_task) 923 if (writer_task)
924 set_cpus_allowed_ptr(writer_task, &tmp_mask); 924 set_cpus_allowed_ptr(writer_task, shuffle_tmp_mask);
925 925
926 if (stats_task) 926 if (stats_task)
927 set_cpus_allowed_ptr(stats_task, &tmp_mask); 927 set_cpus_allowed_ptr(stats_task, shuffle_tmp_mask);
928 928
929 if (rcu_idle_cpu == -1) 929 if (rcu_idle_cpu == -1)
930 rcu_idle_cpu = num_online_cpus() - 1; 930 rcu_idle_cpu = num_online_cpus() - 1;
@@ -1012,6 +1012,7 @@ rcu_torture_cleanup(void)
1012 if (shuffler_task) { 1012 if (shuffler_task) {
1013 VERBOSE_PRINTK_STRING("Stopping rcu_torture_shuffle task"); 1013 VERBOSE_PRINTK_STRING("Stopping rcu_torture_shuffle task");
1014 kthread_stop(shuffler_task); 1014 kthread_stop(shuffler_task);
1015 free_cpumask_var(shuffle_tmp_mask);
1015 } 1016 }
1016 shuffler_task = NULL; 1017 shuffler_task = NULL;
1017 1018
@@ -1190,10 +1191,18 @@ rcu_torture_init(void)
1190 } 1191 }
1191 if (test_no_idle_hz) { 1192 if (test_no_idle_hz) {
1192 rcu_idle_cpu = num_online_cpus() - 1; 1193 rcu_idle_cpu = num_online_cpus() - 1;
1194
1195 if (!alloc_cpumask_var(&shuffle_tmp_mask, GFP_KERNEL)) {
1196 firsterr = -ENOMEM;
1197 VERBOSE_PRINTK_ERRSTRING("Failed to alloc mask");
1198 goto unwind;
1199 }
1200
1193 /* Create the shuffler thread */ 1201 /* Create the shuffler thread */
1194 shuffler_task = kthread_run(rcu_torture_shuffle, NULL, 1202 shuffler_task = kthread_run(rcu_torture_shuffle, NULL,
1195 "rcu_torture_shuffle"); 1203 "rcu_torture_shuffle");
1196 if (IS_ERR(shuffler_task)) { 1204 if (IS_ERR(shuffler_task)) {
1205 free_cpumask_var(shuffle_tmp_mask);
1197 firsterr = PTR_ERR(shuffler_task); 1206 firsterr = PTR_ERR(shuffler_task);
1198 VERBOSE_PRINTK_ERRSTRING("Failed to create shuffler"); 1207 VERBOSE_PRINTK_ERRSTRING("Failed to create shuffler");
1199 shuffler_task = NULL; 1208 shuffler_task = NULL;
diff --git a/kernel/relay.c b/kernel/relay.c
index 8f2179c8056f..e92db8c06acf 100644
--- a/kernel/relay.c
+++ b/kernel/relay.c
@@ -797,13 +797,15 @@ void relay_subbufs_consumed(struct rchan *chan,
797 if (!chan) 797 if (!chan)
798 return; 798 return;
799 799
800 if (cpu >= NR_CPUS || !chan->buf[cpu]) 800 if (cpu >= NR_CPUS || !chan->buf[cpu] ||
801 subbufs_consumed > chan->n_subbufs)
801 return; 802 return;
802 803
803 buf = chan->buf[cpu]; 804 buf = chan->buf[cpu];
804 buf->subbufs_consumed += subbufs_consumed; 805 if (subbufs_consumed > buf->subbufs_produced - buf->subbufs_consumed)
805 if (buf->subbufs_consumed > buf->subbufs_produced)
806 buf->subbufs_consumed = buf->subbufs_produced; 806 buf->subbufs_consumed = buf->subbufs_produced;
807 else
808 buf->subbufs_consumed += subbufs_consumed;
807} 809}
808EXPORT_SYMBOL_GPL(relay_subbufs_consumed); 810EXPORT_SYMBOL_GPL(relay_subbufs_consumed);
809 811
diff --git a/kernel/sched.c b/kernel/sched.c
index 5757e03cfac0..73513f4e19df 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -4942,15 +4942,13 @@ pick_next_task(struct rq *rq)
4942/* 4942/*
4943 * schedule() is the main scheduler function. 4943 * schedule() is the main scheduler function.
4944 */ 4944 */
4945asmlinkage void __sched schedule(void) 4945asmlinkage void __sched __schedule(void)
4946{ 4946{
4947 struct task_struct *prev, *next; 4947 struct task_struct *prev, *next;
4948 unsigned long *switch_count; 4948 unsigned long *switch_count;
4949 struct rq *rq; 4949 struct rq *rq;
4950 int cpu; 4950 int cpu;
4951 4951
4952need_resched:
4953 preempt_disable();
4954 cpu = smp_processor_id(); 4952 cpu = smp_processor_id();
4955 rq = cpu_rq(cpu); 4953 rq = cpu_rq(cpu);
4956 rcu_qsctr_inc(cpu); 4954 rcu_qsctr_inc(cpu);
@@ -5007,13 +5005,80 @@ need_resched_nonpreemptible:
5007 5005
5008 if (unlikely(reacquire_kernel_lock(current) < 0)) 5006 if (unlikely(reacquire_kernel_lock(current) < 0))
5009 goto need_resched_nonpreemptible; 5007 goto need_resched_nonpreemptible;
5008}
5010 5009
5010asmlinkage void __sched schedule(void)
5011{
5012need_resched:
5013 preempt_disable();
5014 __schedule();
5011 preempt_enable_no_resched(); 5015 preempt_enable_no_resched();
5012 if (unlikely(test_thread_flag(TIF_NEED_RESCHED))) 5016 if (unlikely(test_thread_flag(TIF_NEED_RESCHED)))
5013 goto need_resched; 5017 goto need_resched;
5014} 5018}
5015EXPORT_SYMBOL(schedule); 5019EXPORT_SYMBOL(schedule);
5016 5020
5021#ifdef CONFIG_SMP
5022/*
5023 * Look out! "owner" is an entirely speculative pointer
5024 * access and not reliable.
5025 */
5026int mutex_spin_on_owner(struct mutex *lock, struct thread_info *owner)
5027{
5028 unsigned int cpu;
5029 struct rq *rq;
5030
5031 if (!sched_feat(OWNER_SPIN))
5032 return 0;
5033
5034#ifdef CONFIG_DEBUG_PAGEALLOC
5035 /*
5036 * Need to access the cpu field knowing that
5037 * DEBUG_PAGEALLOC could have unmapped it if
5038 * the mutex owner just released it and exited.
5039 */
5040 if (probe_kernel_address(&owner->cpu, cpu))
5041 goto out;
5042#else
5043 cpu = owner->cpu;
5044#endif
5045
5046 /*
5047 * Even if the access succeeded (likely case),
5048 * the cpu field may no longer be valid.
5049 */
5050 if (cpu >= nr_cpumask_bits)
5051 goto out;
5052
5053 /*
5054 * We need to validate that we can do a
5055 * get_cpu() and that we have the percpu area.
5056 */
5057 if (!cpu_online(cpu))
5058 goto out;
5059
5060 rq = cpu_rq(cpu);
5061
5062 for (;;) {
5063 /*
5064 * Owner changed, break to re-assess state.
5065 */
5066 if (lock->owner != owner)
5067 break;
5068
5069 /*
5070 * Is that owner really running on that cpu?
5071 */
5072 if (task_thread_info(rq->curr) != owner || need_resched())
5073 return 0;
5074
5075 cpu_relax();
5076 }
5077out:
5078 return 1;
5079}
5080#endif
5081
5017#ifdef CONFIG_PREEMPT 5082#ifdef CONFIG_PREEMPT
5018/* 5083/*
5019 * this is the entry point to schedule() from in-kernel preemption 5084 * this is the entry point to schedule() from in-kernel preemption
@@ -5131,11 +5196,17 @@ void __wake_up_locked(wait_queue_head_t *q, unsigned int mode)
5131 __wake_up_common(q, mode, 1, 0, NULL); 5196 __wake_up_common(q, mode, 1, 0, NULL);
5132} 5197}
5133 5198
5199void __wake_up_locked_key(wait_queue_head_t *q, unsigned int mode, void *key)
5200{
5201 __wake_up_common(q, mode, 1, 0, key);
5202}
5203
5134/** 5204/**
5135 * __wake_up_sync - wake up threads blocked on a waitqueue. 5205 * __wake_up_sync_key - wake up threads blocked on a waitqueue.
5136 * @q: the waitqueue 5206 * @q: the waitqueue
5137 * @mode: which threads 5207 * @mode: which threads
5138 * @nr_exclusive: how many wake-one or wake-many threads to wake up 5208 * @nr_exclusive: how many wake-one or wake-many threads to wake up
5209 * @key: opaque value to be passed to wakeup targets
5139 * 5210 *
5140 * The sync wakeup differs that the waker knows that it will schedule 5211 * The sync wakeup differs that the waker knows that it will schedule
5141 * away soon, so while the target thread will be woken up, it will not 5212 * away soon, so while the target thread will be woken up, it will not
@@ -5144,8 +5215,8 @@ void __wake_up_locked(wait_queue_head_t *q, unsigned int mode)
5144 * 5215 *
5145 * On UP it can prevent extra preemption. 5216 * On UP it can prevent extra preemption.
5146 */ 5217 */
5147void 5218void __wake_up_sync_key(wait_queue_head_t *q, unsigned int mode,
5148__wake_up_sync(wait_queue_head_t *q, unsigned int mode, int nr_exclusive) 5219 int nr_exclusive, void *key)
5149{ 5220{
5150 unsigned long flags; 5221 unsigned long flags;
5151 int sync = 1; 5222 int sync = 1;
@@ -5157,9 +5228,18 @@ __wake_up_sync(wait_queue_head_t *q, unsigned int mode, int nr_exclusive)
5157 sync = 0; 5228 sync = 0;
5158 5229
5159 spin_lock_irqsave(&q->lock, flags); 5230 spin_lock_irqsave(&q->lock, flags);
5160 __wake_up_common(q, mode, nr_exclusive, sync, NULL); 5231 __wake_up_common(q, mode, nr_exclusive, sync, key);
5161 spin_unlock_irqrestore(&q->lock, flags); 5232 spin_unlock_irqrestore(&q->lock, flags);
5162} 5233}
5234EXPORT_SYMBOL_GPL(__wake_up_sync_key);
5235
5236/*
5237 * __wake_up_sync - see __wake_up_sync_key()
5238 */
5239void __wake_up_sync(wait_queue_head_t *q, unsigned int mode, int nr_exclusive)
5240{
5241 __wake_up_sync_key(q, mode, nr_exclusive, NULL);
5242}
5163EXPORT_SYMBOL_GPL(__wake_up_sync); /* For internal use only */ 5243EXPORT_SYMBOL_GPL(__wake_up_sync); /* For internal use only */
5164 5244
5165/** 5245/**
diff --git a/kernel/sched_cpupri.h b/kernel/sched_cpupri.h
index 642a94ef8a0a..9a7e859b8fbf 100644
--- a/kernel/sched_cpupri.h
+++ b/kernel/sched_cpupri.h
@@ -25,7 +25,7 @@ struct cpupri {
25 25
26#ifdef CONFIG_SMP 26#ifdef CONFIG_SMP
27int cpupri_find(struct cpupri *cp, 27int cpupri_find(struct cpupri *cp,
28 struct task_struct *p, cpumask_t *lowest_mask); 28 struct task_struct *p, struct cpumask *lowest_mask);
29void cpupri_set(struct cpupri *cp, int cpu, int pri); 29void cpupri_set(struct cpupri *cp, int cpu, int pri);
30int cpupri_init(struct cpupri *cp, bool bootmem); 30int cpupri_init(struct cpupri *cp, bool bootmem);
31void cpupri_cleanup(struct cpupri *cp); 31void cpupri_cleanup(struct cpupri *cp);
diff --git a/kernel/sched_features.h b/kernel/sched_features.h
index 76f61756e677..4569bfa7df9b 100644
--- a/kernel/sched_features.h
+++ b/kernel/sched_features.h
@@ -14,3 +14,4 @@ SCHED_FEAT(LB_WAKEUP_UPDATE, 1)
14SCHED_FEAT(ASYM_EFF_LOAD, 1) 14SCHED_FEAT(ASYM_EFF_LOAD, 1)
15SCHED_FEAT(WAKEUP_OVERLAP, 0) 15SCHED_FEAT(WAKEUP_OVERLAP, 0)
16SCHED_FEAT(LAST_BUDDY, 1) 16SCHED_FEAT(LAST_BUDDY, 1)
17SCHED_FEAT(OWNER_SPIN, 1)
diff --git a/kernel/signal.c b/kernel/signal.c
index 1c8814481a11..d8034737db4c 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -55,10 +55,22 @@ static int sig_handler_ignored(void __user *handler, int sig)
55 (handler == SIG_DFL && sig_kernel_ignore(sig)); 55 (handler == SIG_DFL && sig_kernel_ignore(sig));
56} 56}
57 57
58static int sig_ignored(struct task_struct *t, int sig) 58static int sig_task_ignored(struct task_struct *t, int sig,
59 int from_ancestor_ns)
59{ 60{
60 void __user *handler; 61 void __user *handler;
61 62
63 handler = sig_handler(t, sig);
64
65 if (unlikely(t->signal->flags & SIGNAL_UNKILLABLE) &&
66 handler == SIG_DFL && !from_ancestor_ns)
67 return 1;
68
69 return sig_handler_ignored(handler, sig);
70}
71
72static int sig_ignored(struct task_struct *t, int sig, int from_ancestor_ns)
73{
62 /* 74 /*
63 * Blocked signals are never ignored, since the 75 * Blocked signals are never ignored, since the
64 * signal handler may change by the time it is 76 * signal handler may change by the time it is
@@ -67,14 +79,13 @@ static int sig_ignored(struct task_struct *t, int sig)
67 if (sigismember(&t->blocked, sig) || sigismember(&t->real_blocked, sig)) 79 if (sigismember(&t->blocked, sig) || sigismember(&t->real_blocked, sig))
68 return 0; 80 return 0;
69 81
70 handler = sig_handler(t, sig); 82 if (!sig_task_ignored(t, sig, from_ancestor_ns))
71 if (!sig_handler_ignored(handler, sig))
72 return 0; 83 return 0;
73 84
74 /* 85 /*
75 * Tracers may want to know about even ignored signals. 86 * Tracers may want to know about even ignored signals.
76 */ 87 */
77 return !tracehook_consider_ignored_signal(t, sig, handler); 88 return !tracehook_consider_ignored_signal(t, sig);
78} 89}
79 90
80/* 91/*
@@ -318,7 +329,7 @@ int unhandled_signal(struct task_struct *tsk, int sig)
318 return 1; 329 return 1;
319 if (handler != SIG_IGN && handler != SIG_DFL) 330 if (handler != SIG_IGN && handler != SIG_DFL)
320 return 0; 331 return 0;
321 return !tracehook_consider_fatal_signal(tsk, sig, handler); 332 return !tracehook_consider_fatal_signal(tsk, sig);
322} 333}
323 334
324 335
@@ -624,7 +635,7 @@ static int check_kill_permission(int sig, struct siginfo *info,
624 * Returns true if the signal should be actually delivered, otherwise 635 * Returns true if the signal should be actually delivered, otherwise
625 * it should be dropped. 636 * it should be dropped.
626 */ 637 */
627static int prepare_signal(int sig, struct task_struct *p) 638static int prepare_signal(int sig, struct task_struct *p, int from_ancestor_ns)
628{ 639{
629 struct signal_struct *signal = p->signal; 640 struct signal_struct *signal = p->signal;
630 struct task_struct *t; 641 struct task_struct *t;
@@ -708,7 +719,7 @@ static int prepare_signal(int sig, struct task_struct *p)
708 } 719 }
709 } 720 }
710 721
711 return !sig_ignored(p, sig); 722 return !sig_ignored(p, sig, from_ancestor_ns);
712} 723}
713 724
714/* 725/*
@@ -777,7 +788,7 @@ static void complete_signal(int sig, struct task_struct *p, int group)
777 !(signal->flags & (SIGNAL_UNKILLABLE | SIGNAL_GROUP_EXIT)) && 788 !(signal->flags & (SIGNAL_UNKILLABLE | SIGNAL_GROUP_EXIT)) &&
778 !sigismember(&t->real_blocked, sig) && 789 !sigismember(&t->real_blocked, sig) &&
779 (sig == SIGKILL || 790 (sig == SIGKILL ||
780 !tracehook_consider_fatal_signal(t, sig, SIG_DFL))) { 791 !tracehook_consider_fatal_signal(t, sig))) {
781 /* 792 /*
782 * This signal will be fatal to the whole group. 793 * This signal will be fatal to the whole group.
783 */ 794 */
@@ -813,8 +824,8 @@ static inline int legacy_queue(struct sigpending *signals, int sig)
813 return (sig < SIGRTMIN) && sigismember(&signals->signal, sig); 824 return (sig < SIGRTMIN) && sigismember(&signals->signal, sig);
814} 825}
815 826
816static int send_signal(int sig, struct siginfo *info, struct task_struct *t, 827static int __send_signal(int sig, struct siginfo *info, struct task_struct *t,
817 int group) 828 int group, int from_ancestor_ns)
818{ 829{
819 struct sigpending *pending; 830 struct sigpending *pending;
820 struct sigqueue *q; 831 struct sigqueue *q;
@@ -822,7 +833,8 @@ static int send_signal(int sig, struct siginfo *info, struct task_struct *t,
822 trace_sched_signal_send(sig, t); 833 trace_sched_signal_send(sig, t);
823 834
824 assert_spin_locked(&t->sighand->siglock); 835 assert_spin_locked(&t->sighand->siglock);
825 if (!prepare_signal(sig, t)) 836
837 if (!prepare_signal(sig, t, from_ancestor_ns))
826 return 0; 838 return 0;
827 839
828 pending = group ? &t->signal->shared_pending : &t->pending; 840 pending = group ? &t->signal->shared_pending : &t->pending;
@@ -871,6 +883,8 @@ static int send_signal(int sig, struct siginfo *info, struct task_struct *t,
871 break; 883 break;
872 default: 884 default:
873 copy_siginfo(&q->info, info); 885 copy_siginfo(&q->info, info);
886 if (from_ancestor_ns)
887 q->info.si_pid = 0;
874 break; 888 break;
875 } 889 }
876 } else if (!is_si_special(info)) { 890 } else if (!is_si_special(info)) {
@@ -889,6 +903,20 @@ out_set:
889 return 0; 903 return 0;
890} 904}
891 905
906static int send_signal(int sig, struct siginfo *info, struct task_struct *t,
907 int group)
908{
909 int from_ancestor_ns = 0;
910
911#ifdef CONFIG_PID_NS
912 if (!is_si_special(info) && SI_FROMUSER(info) &&
913 task_pid_nr_ns(current, task_active_pid_ns(t)) <= 0)
914 from_ancestor_ns = 1;
915#endif
916
917 return __send_signal(sig, info, t, group, from_ancestor_ns);
918}
919
892int print_fatal_signals; 920int print_fatal_signals;
893 921
894static void print_fatal_signal(struct pt_regs *regs, int signr) 922static void print_fatal_signal(struct pt_regs *regs, int signr)
@@ -1133,7 +1161,7 @@ int kill_pid_info_as_uid(int sig, struct siginfo *info, struct pid *pid,
1133 if (sig && p->sighand) { 1161 if (sig && p->sighand) {
1134 unsigned long flags; 1162 unsigned long flags;
1135 spin_lock_irqsave(&p->sighand->siglock, flags); 1163 spin_lock_irqsave(&p->sighand->siglock, flags);
1136 ret = __group_send_sig_info(sig, info, p); 1164 ret = __send_signal(sig, info, p, 1, 0);
1137 spin_unlock_irqrestore(&p->sighand->siglock, flags); 1165 spin_unlock_irqrestore(&p->sighand->siglock, flags);
1138 } 1166 }
1139out_unlock: 1167out_unlock:
@@ -1320,7 +1348,7 @@ int send_sigqueue(struct sigqueue *q, struct task_struct *t, int group)
1320 goto ret; 1348 goto ret;
1321 1349
1322 ret = 1; /* the signal is ignored */ 1350 ret = 1; /* the signal is ignored */
1323 if (!prepare_signal(sig, t)) 1351 if (!prepare_signal(sig, t, 0))
1324 goto out; 1352 goto out;
1325 1353
1326 ret = 0; 1354 ret = 0;
@@ -1844,9 +1872,16 @@ relock:
1844 1872
1845 /* 1873 /*
1846 * Global init gets no signals it doesn't want. 1874 * Global init gets no signals it doesn't want.
1875 * Container-init gets no signals it doesn't want from same
1876 * container.
1877 *
1878 * Note that if global/container-init sees a sig_kernel_only()
1879 * signal here, the signal must have been generated internally
1880 * or must have come from an ancestor namespace. In either
1881 * case, the signal cannot be dropped.
1847 */ 1882 */
1848 if (unlikely(signal->flags & SIGNAL_UNKILLABLE) && 1883 if (unlikely(signal->flags & SIGNAL_UNKILLABLE) &&
1849 !signal_group_exit(signal)) 1884 !sig_kernel_only(signr))
1850 continue; 1885 continue;
1851 1886
1852 if (sig_kernel_stop(signr)) { 1887 if (sig_kernel_stop(signr)) {
diff --git a/kernel/slow-work.c b/kernel/slow-work.c
new file mode 100644
index 000000000000..cf2bc01186ef
--- /dev/null
+++ b/kernel/slow-work.c
@@ -0,0 +1,640 @@
1/* Worker thread pool for slow items, such as filesystem lookups or mkdirs
2 *
3 * Copyright (C) 2008 Red Hat, Inc. All Rights Reserved.
4 * Written by David Howells (dhowells@redhat.com)
5 *
6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public Licence
8 * as published by the Free Software Foundation; either version
9 * 2 of the Licence, or (at your option) any later version.
10 *
11 * See Documentation/slow-work.txt
12 */
13
14#include <linux/module.h>
15#include <linux/slow-work.h>
16#include <linux/kthread.h>
17#include <linux/freezer.h>
18#include <linux/wait.h>
19
20#define SLOW_WORK_CULL_TIMEOUT (5 * HZ) /* cull threads 5s after running out of
21 * things to do */
22#define SLOW_WORK_OOM_TIMEOUT (5 * HZ) /* can't start new threads for 5s after
23 * OOM */
24
25static void slow_work_cull_timeout(unsigned long);
26static void slow_work_oom_timeout(unsigned long);
27
28#ifdef CONFIG_SYSCTL
29static int slow_work_min_threads_sysctl(struct ctl_table *, int, struct file *,
30 void __user *, size_t *, loff_t *);
31
32static int slow_work_max_threads_sysctl(struct ctl_table *, int , struct file *,
33 void __user *, size_t *, loff_t *);
34#endif
35
36/*
37 * The pool of threads has at least min threads in it as long as someone is
38 * using the facility, and may have as many as max.
39 *
40 * A portion of the pool may be processing very slow operations.
41 */
42static unsigned slow_work_min_threads = 2;
43static unsigned slow_work_max_threads = 4;
44static unsigned vslow_work_proportion = 50; /* % of threads that may process
45 * very slow work */
46
47#ifdef CONFIG_SYSCTL
48static const int slow_work_min_min_threads = 2;
49static int slow_work_max_max_threads = 255;
50static const int slow_work_min_vslow = 1;
51static const int slow_work_max_vslow = 99;
52
53ctl_table slow_work_sysctls[] = {
54 {
55 .ctl_name = CTL_UNNUMBERED,
56 .procname = "min-threads",
57 .data = &slow_work_min_threads,
58 .maxlen = sizeof(unsigned),
59 .mode = 0644,
60 .proc_handler = slow_work_min_threads_sysctl,
61 .extra1 = (void *) &slow_work_min_min_threads,
62 .extra2 = &slow_work_max_threads,
63 },
64 {
65 .ctl_name = CTL_UNNUMBERED,
66 .procname = "max-threads",
67 .data = &slow_work_max_threads,
68 .maxlen = sizeof(unsigned),
69 .mode = 0644,
70 .proc_handler = slow_work_max_threads_sysctl,
71 .extra1 = &slow_work_min_threads,
72 .extra2 = (void *) &slow_work_max_max_threads,
73 },
74 {
75 .ctl_name = CTL_UNNUMBERED,
76 .procname = "vslow-percentage",
77 .data = &vslow_work_proportion,
78 .maxlen = sizeof(unsigned),
79 .mode = 0644,
80 .proc_handler = &proc_dointvec_minmax,
81 .extra1 = (void *) &slow_work_min_vslow,
82 .extra2 = (void *) &slow_work_max_vslow,
83 },
84 { .ctl_name = 0 }
85};
86#endif
87
88/*
89 * The active state of the thread pool
90 */
91static atomic_t slow_work_thread_count;
92static atomic_t vslow_work_executing_count;
93
94static bool slow_work_may_not_start_new_thread;
95static bool slow_work_cull; /* cull a thread due to lack of activity */
96static DEFINE_TIMER(slow_work_cull_timer, slow_work_cull_timeout, 0, 0);
97static DEFINE_TIMER(slow_work_oom_timer, slow_work_oom_timeout, 0, 0);
98static struct slow_work slow_work_new_thread; /* new thread starter */
99
100/*
101 * The queues of work items and the lock governing access to them. These are
102 * shared between all the CPUs. It doesn't make sense to have per-CPU queues
103 * as the number of threads bears no relation to the number of CPUs.
104 *
105 * There are two queues of work items: one for slow work items, and one for
106 * very slow work items.
107 */
108static LIST_HEAD(slow_work_queue);
109static LIST_HEAD(vslow_work_queue);
110static DEFINE_SPINLOCK(slow_work_queue_lock);
111
112/*
113 * The thread controls. A variable used to signal to the threads that they
114 * should exit when the queue is empty, a waitqueue used by the threads to wait
115 * for signals, and a completion set by the last thread to exit.
116 */
117static bool slow_work_threads_should_exit;
118static DECLARE_WAIT_QUEUE_HEAD(slow_work_thread_wq);
119static DECLARE_COMPLETION(slow_work_last_thread_exited);
120
121/*
122 * The number of users of the thread pool and its lock. Whilst this is zero we
123 * have no threads hanging around, and when this reaches zero, we wait for all
124 * active or queued work items to complete and kill all the threads we do have.
125 */
126static int slow_work_user_count;
127static DEFINE_MUTEX(slow_work_user_lock);
128
129/*
130 * Calculate the maximum number of active threads in the pool that are
131 * permitted to process very slow work items.
132 *
133 * The answer is rounded up to at least 1, but may not equal or exceed the
134 * maximum number of the threads in the pool. This means we always have at
135 * least one thread that can process slow work items, and we always have at
136 * least one thread that won't get tied up doing so.
137 */
138static unsigned slow_work_calc_vsmax(void)
139{
140 unsigned vsmax;
141
142 vsmax = atomic_read(&slow_work_thread_count) * vslow_work_proportion;
143 vsmax /= 100;
144 vsmax = max(vsmax, 1U);
145 return min(vsmax, slow_work_max_threads - 1);
146}
147
148/*
149 * Attempt to execute stuff queued on a slow thread. Return true if we managed
150 * it, false if there was nothing to do.
151 */
152static bool slow_work_execute(void)
153{
154 struct slow_work *work = NULL;
155 unsigned vsmax;
156 bool very_slow;
157
158 vsmax = slow_work_calc_vsmax();
159
160 /* see if we can schedule a new thread to be started if we're not
161 * keeping up with the work */
162 if (!waitqueue_active(&slow_work_thread_wq) &&
163 (!list_empty(&slow_work_queue) || !list_empty(&vslow_work_queue)) &&
164 atomic_read(&slow_work_thread_count) < slow_work_max_threads &&
165 !slow_work_may_not_start_new_thread)
166 slow_work_enqueue(&slow_work_new_thread);
167
168 /* find something to execute */
169 spin_lock_irq(&slow_work_queue_lock);
170 if (!list_empty(&vslow_work_queue) &&
171 atomic_read(&vslow_work_executing_count) < vsmax) {
172 work = list_entry(vslow_work_queue.next,
173 struct slow_work, link);
174 if (test_and_set_bit_lock(SLOW_WORK_EXECUTING, &work->flags))
175 BUG();
176 list_del_init(&work->link);
177 atomic_inc(&vslow_work_executing_count);
178 very_slow = true;
179 } else if (!list_empty(&slow_work_queue)) {
180 work = list_entry(slow_work_queue.next,
181 struct slow_work, link);
182 if (test_and_set_bit_lock(SLOW_WORK_EXECUTING, &work->flags))
183 BUG();
184 list_del_init(&work->link);
185 very_slow = false;
186 } else {
187 very_slow = false; /* avoid the compiler warning */
188 }
189 spin_unlock_irq(&slow_work_queue_lock);
190
191 if (!work)
192 return false;
193
194 if (!test_and_clear_bit(SLOW_WORK_PENDING, &work->flags))
195 BUG();
196
197 work->ops->execute(work);
198
199 if (very_slow)
200 atomic_dec(&vslow_work_executing_count);
201 clear_bit_unlock(SLOW_WORK_EXECUTING, &work->flags);
202
203 /* if someone tried to enqueue the item whilst we were executing it,
204 * then it'll be left unenqueued to avoid multiple threads trying to
205 * execute it simultaneously
206 *
207 * there is, however, a race between us testing the pending flag and
208 * getting the spinlock, and between the enqueuer setting the pending
209 * flag and getting the spinlock, so we use a deferral bit to tell us
210 * if the enqueuer got there first
211 */
212 if (test_bit(SLOW_WORK_PENDING, &work->flags)) {
213 spin_lock_irq(&slow_work_queue_lock);
214
215 if (!test_bit(SLOW_WORK_EXECUTING, &work->flags) &&
216 test_and_clear_bit(SLOW_WORK_ENQ_DEFERRED, &work->flags))
217 goto auto_requeue;
218
219 spin_unlock_irq(&slow_work_queue_lock);
220 }
221
222 work->ops->put_ref(work);
223 return true;
224
225auto_requeue:
226 /* we must complete the enqueue operation
227 * - we transfer our ref on the item back to the appropriate queue
228 * - don't wake another thread up as we're awake already
229 */
230 if (test_bit(SLOW_WORK_VERY_SLOW, &work->flags))
231 list_add_tail(&work->link, &vslow_work_queue);
232 else
233 list_add_tail(&work->link, &slow_work_queue);
234 spin_unlock_irq(&slow_work_queue_lock);
235 return true;
236}
237
238/**
239 * slow_work_enqueue - Schedule a slow work item for processing
240 * @work: The work item to queue
241 *
242 * Schedule a slow work item for processing. If the item is already undergoing
243 * execution, this guarantees not to re-enter the execution routine until the
244 * first execution finishes.
245 *
246 * The item is pinned by this function as it retains a reference to it, managed
247 * through the item operations. The item is unpinned once it has been
248 * executed.
249 *
250 * An item may hog the thread that is running it for a relatively large amount
251 * of time, sufficient, for example, to perform several lookup, mkdir, create
252 * and setxattr operations. It may sleep on I/O and may sleep to obtain locks.
253 *
254 * Conversely, if a number of items are awaiting processing, it may take some
255 * time before any given item is given attention. The number of threads in the
256 * pool may be increased to deal with demand, but only up to a limit.
257 *
258 * If SLOW_WORK_VERY_SLOW is set on the work item, then it will be placed in
259 * the very slow queue, from which only a portion of the threads will be
260 * allowed to pick items to execute. This ensures that very slow items won't
261 * overly block ones that are just ordinarily slow.
262 *
263 * Returns 0 if successful, -EAGAIN if not.
264 */
265int slow_work_enqueue(struct slow_work *work)
266{
267 unsigned long flags;
268
269 BUG_ON(slow_work_user_count <= 0);
270 BUG_ON(!work);
271 BUG_ON(!work->ops);
272 BUG_ON(!work->ops->get_ref);
273
274 /* when honouring an enqueue request, we only promise that we will run
275 * the work function in the future; we do not promise to run it once
276 * per enqueue request
277 *
278 * we use the PENDING bit to merge together repeat requests without
279 * having to disable IRQs and take the spinlock, whilst still
280 * maintaining our promise
281 */
282 if (!test_and_set_bit_lock(SLOW_WORK_PENDING, &work->flags)) {
283 spin_lock_irqsave(&slow_work_queue_lock, flags);
284
285 /* we promise that we will not attempt to execute the work
286 * function in more than one thread simultaneously
287 *
288 * this, however, leaves us with a problem if we're asked to
289 * enqueue the work whilst someone is executing the work
290 * function as simply queueing the work immediately means that
291 * another thread may try executing it whilst it is already
292 * under execution
293 *
294 * to deal with this, we set the ENQ_DEFERRED bit instead of
295 * enqueueing, and the thread currently executing the work
296 * function will enqueue the work item when the work function
297 * returns and it has cleared the EXECUTING bit
298 */
299 if (test_bit(SLOW_WORK_EXECUTING, &work->flags)) {
300 set_bit(SLOW_WORK_ENQ_DEFERRED, &work->flags);
301 } else {
302 if (work->ops->get_ref(work) < 0)
303 goto cant_get_ref;
304 if (test_bit(SLOW_WORK_VERY_SLOW, &work->flags))
305 list_add_tail(&work->link, &vslow_work_queue);
306 else
307 list_add_tail(&work->link, &slow_work_queue);
308 wake_up(&slow_work_thread_wq);
309 }
310
311 spin_unlock_irqrestore(&slow_work_queue_lock, flags);
312 }
313 return 0;
314
315cant_get_ref:
316 spin_unlock_irqrestore(&slow_work_queue_lock, flags);
317 return -EAGAIN;
318}
319EXPORT_SYMBOL(slow_work_enqueue);
320
321/*
322 * Worker thread culling algorithm
323 */
324static bool slow_work_cull_thread(void)
325{
326 unsigned long flags;
327 bool do_cull = false;
328
329 spin_lock_irqsave(&slow_work_queue_lock, flags);
330
331 if (slow_work_cull) {
332 slow_work_cull = false;
333
334 if (list_empty(&slow_work_queue) &&
335 list_empty(&vslow_work_queue) &&
336 atomic_read(&slow_work_thread_count) >
337 slow_work_min_threads) {
338 mod_timer(&slow_work_cull_timer,
339 jiffies + SLOW_WORK_CULL_TIMEOUT);
340 do_cull = true;
341 }
342 }
343
344 spin_unlock_irqrestore(&slow_work_queue_lock, flags);
345 return do_cull;
346}
347
348/*
349 * Determine if there is slow work available for dispatch
350 */
351static inline bool slow_work_available(int vsmax)
352{
353 return !list_empty(&slow_work_queue) ||
354 (!list_empty(&vslow_work_queue) &&
355 atomic_read(&vslow_work_executing_count) < vsmax);
356}
357
358/*
359 * Worker thread dispatcher
360 */
361static int slow_work_thread(void *_data)
362{
363 int vsmax;
364
365 DEFINE_WAIT(wait);
366
367 set_freezable();
368 set_user_nice(current, -5);
369
370 for (;;) {
371 vsmax = vslow_work_proportion;
372 vsmax *= atomic_read(&slow_work_thread_count);
373 vsmax /= 100;
374
375 prepare_to_wait(&slow_work_thread_wq, &wait,
376 TASK_INTERRUPTIBLE);
377 if (!freezing(current) &&
378 !slow_work_threads_should_exit &&
379 !slow_work_available(vsmax) &&
380 !slow_work_cull)
381 schedule();
382 finish_wait(&slow_work_thread_wq, &wait);
383
384 try_to_freeze();
385
386 vsmax = vslow_work_proportion;
387 vsmax *= atomic_read(&slow_work_thread_count);
388 vsmax /= 100;
389
390 if (slow_work_available(vsmax) && slow_work_execute()) {
391 cond_resched();
392 if (list_empty(&slow_work_queue) &&
393 list_empty(&vslow_work_queue) &&
394 atomic_read(&slow_work_thread_count) >
395 slow_work_min_threads)
396 mod_timer(&slow_work_cull_timer,
397 jiffies + SLOW_WORK_CULL_TIMEOUT);
398 continue;
399 }
400
401 if (slow_work_threads_should_exit)
402 break;
403
404 if (slow_work_cull && slow_work_cull_thread())
405 break;
406 }
407
408 if (atomic_dec_and_test(&slow_work_thread_count))
409 complete_and_exit(&slow_work_last_thread_exited, 0);
410 return 0;
411}
412
413/*
414 * Handle thread cull timer expiration
415 */
416static void slow_work_cull_timeout(unsigned long data)
417{
418 slow_work_cull = true;
419 wake_up(&slow_work_thread_wq);
420}
421
422/*
423 * Get a reference on slow work thread starter
424 */
425static int slow_work_new_thread_get_ref(struct slow_work *work)
426{
427 return 0;
428}
429
430/*
431 * Drop a reference on slow work thread starter
432 */
433static void slow_work_new_thread_put_ref(struct slow_work *work)
434{
435}
436
437/*
438 * Start a new slow work thread
439 */
440static void slow_work_new_thread_execute(struct slow_work *work)
441{
442 struct task_struct *p;
443
444 if (slow_work_threads_should_exit)
445 return;
446
447 if (atomic_read(&slow_work_thread_count) >= slow_work_max_threads)
448 return;
449
450 if (!mutex_trylock(&slow_work_user_lock))
451 return;
452
453 slow_work_may_not_start_new_thread = true;
454 atomic_inc(&slow_work_thread_count);
455 p = kthread_run(slow_work_thread, NULL, "kslowd");
456 if (IS_ERR(p)) {
457 printk(KERN_DEBUG "Slow work thread pool: OOM\n");
458 if (atomic_dec_and_test(&slow_work_thread_count))
459 BUG(); /* we're running on a slow work thread... */
460 mod_timer(&slow_work_oom_timer,
461 jiffies + SLOW_WORK_OOM_TIMEOUT);
462 } else {
463 /* ratelimit the starting of new threads */
464 mod_timer(&slow_work_oom_timer, jiffies + 1);
465 }
466
467 mutex_unlock(&slow_work_user_lock);
468}
469
470static const struct slow_work_ops slow_work_new_thread_ops = {
471 .get_ref = slow_work_new_thread_get_ref,
472 .put_ref = slow_work_new_thread_put_ref,
473 .execute = slow_work_new_thread_execute,
474};
475
476/*
477 * post-OOM new thread start suppression expiration
478 */
479static void slow_work_oom_timeout(unsigned long data)
480{
481 slow_work_may_not_start_new_thread = false;
482}
483
484#ifdef CONFIG_SYSCTL
485/*
486 * Handle adjustment of the minimum number of threads
487 */
488static int slow_work_min_threads_sysctl(struct ctl_table *table, int write,
489 struct file *filp, void __user *buffer,
490 size_t *lenp, loff_t *ppos)
491{
492 int ret = proc_dointvec_minmax(table, write, filp, buffer, lenp, ppos);
493 int n;
494
495 if (ret == 0) {
496 mutex_lock(&slow_work_user_lock);
497 if (slow_work_user_count > 0) {
498 /* see if we need to start or stop threads */
499 n = atomic_read(&slow_work_thread_count) -
500 slow_work_min_threads;
501
502 if (n < 0 && !slow_work_may_not_start_new_thread)
503 slow_work_enqueue(&slow_work_new_thread);
504 else if (n > 0)
505 mod_timer(&slow_work_cull_timer,
506 jiffies + SLOW_WORK_CULL_TIMEOUT);
507 }
508 mutex_unlock(&slow_work_user_lock);
509 }
510
511 return ret;
512}
513
514/*
515 * Handle adjustment of the maximum number of threads
516 */
517static int slow_work_max_threads_sysctl(struct ctl_table *table, int write,
518 struct file *filp, void __user *buffer,
519 size_t *lenp, loff_t *ppos)
520{
521 int ret = proc_dointvec_minmax(table, write, filp, buffer, lenp, ppos);
522 int n;
523
524 if (ret == 0) {
525 mutex_lock(&slow_work_user_lock);
526 if (slow_work_user_count > 0) {
527 /* see if we need to stop threads */
528 n = slow_work_max_threads -
529 atomic_read(&slow_work_thread_count);
530
531 if (n < 0)
532 mod_timer(&slow_work_cull_timer,
533 jiffies + SLOW_WORK_CULL_TIMEOUT);
534 }
535 mutex_unlock(&slow_work_user_lock);
536 }
537
538 return ret;
539}
540#endif /* CONFIG_SYSCTL */
541
542/**
543 * slow_work_register_user - Register a user of the facility
544 *
545 * Register a user of the facility, starting up the initial threads if there
546 * aren't any other users at this point. This will return 0 if successful, or
547 * an error if not.
548 */
549int slow_work_register_user(void)
550{
551 struct task_struct *p;
552 int loop;
553
554 mutex_lock(&slow_work_user_lock);
555
556 if (slow_work_user_count == 0) {
557 printk(KERN_NOTICE "Slow work thread pool: Starting up\n");
558 init_completion(&slow_work_last_thread_exited);
559
560 slow_work_threads_should_exit = false;
561 slow_work_init(&slow_work_new_thread,
562 &slow_work_new_thread_ops);
563 slow_work_may_not_start_new_thread = false;
564 slow_work_cull = false;
565
566 /* start the minimum number of threads */
567 for (loop = 0; loop < slow_work_min_threads; loop++) {
568 atomic_inc(&slow_work_thread_count);
569 p = kthread_run(slow_work_thread, NULL, "kslowd");
570 if (IS_ERR(p))
571 goto error;
572 }
573 printk(KERN_NOTICE "Slow work thread pool: Ready\n");
574 }
575
576 slow_work_user_count++;
577 mutex_unlock(&slow_work_user_lock);
578 return 0;
579
580error:
581 if (atomic_dec_and_test(&slow_work_thread_count))
582 complete(&slow_work_last_thread_exited);
583 if (loop > 0) {
584 printk(KERN_ERR "Slow work thread pool:"
585 " Aborting startup on ENOMEM\n");
586 slow_work_threads_should_exit = true;
587 wake_up_all(&slow_work_thread_wq);
588 wait_for_completion(&slow_work_last_thread_exited);
589 printk(KERN_ERR "Slow work thread pool: Aborted\n");
590 }
591 mutex_unlock(&slow_work_user_lock);
592 return PTR_ERR(p);
593}
594EXPORT_SYMBOL(slow_work_register_user);
595
596/**
597 * slow_work_unregister_user - Unregister a user of the facility
598 *
599 * Unregister a user of the facility, killing all the threads if this was the
600 * last one.
601 */
602void slow_work_unregister_user(void)
603{
604 mutex_lock(&slow_work_user_lock);
605
606 BUG_ON(slow_work_user_count <= 0);
607
608 slow_work_user_count--;
609 if (slow_work_user_count == 0) {
610 printk(KERN_NOTICE "Slow work thread pool: Shutting down\n");
611 slow_work_threads_should_exit = true;
612 wake_up_all(&slow_work_thread_wq);
613 wait_for_completion(&slow_work_last_thread_exited);
614 printk(KERN_NOTICE "Slow work thread pool:"
615 " Shut down complete\n");
616 }
617
618 del_timer_sync(&slow_work_cull_timer);
619
620 mutex_unlock(&slow_work_user_lock);
621}
622EXPORT_SYMBOL(slow_work_unregister_user);
623
624/*
625 * Initialise the slow work facility
626 */
627static int __init init_slow_work(void)
628{
629 unsigned nr_cpus = num_possible_cpus();
630
631 if (slow_work_max_threads < nr_cpus)
632 slow_work_max_threads = nr_cpus;
633#ifdef CONFIG_SYSCTL
634 if (slow_work_max_max_threads < nr_cpus * 2)
635 slow_work_max_max_threads = nr_cpus * 2;
636#endif
637 return 0;
638}
639
640subsys_initcall(init_slow_work);
diff --git a/kernel/spinlock.c b/kernel/spinlock.c
index 29ab20749dd3..7932653c4ebd 100644
--- a/kernel/spinlock.c
+++ b/kernel/spinlock.c
@@ -121,7 +121,8 @@ unsigned long __lockfunc _read_lock_irqsave(rwlock_t *lock)
121 local_irq_save(flags); 121 local_irq_save(flags);
122 preempt_disable(); 122 preempt_disable();
123 rwlock_acquire_read(&lock->dep_map, 0, 0, _RET_IP_); 123 rwlock_acquire_read(&lock->dep_map, 0, 0, _RET_IP_);
124 LOCK_CONTENDED(lock, _raw_read_trylock, _raw_read_lock); 124 LOCK_CONTENDED_FLAGS(lock, _raw_read_trylock, _raw_read_lock,
125 _raw_read_lock_flags, &flags);
125 return flags; 126 return flags;
126} 127}
127EXPORT_SYMBOL(_read_lock_irqsave); 128EXPORT_SYMBOL(_read_lock_irqsave);
@@ -151,7 +152,8 @@ unsigned long __lockfunc _write_lock_irqsave(rwlock_t *lock)
151 local_irq_save(flags); 152 local_irq_save(flags);
152 preempt_disable(); 153 preempt_disable();
153 rwlock_acquire(&lock->dep_map, 0, 0, _RET_IP_); 154 rwlock_acquire(&lock->dep_map, 0, 0, _RET_IP_);
154 LOCK_CONTENDED(lock, _raw_write_trylock, _raw_write_lock); 155 LOCK_CONTENDED_FLAGS(lock, _raw_write_trylock, _raw_write_lock,
156 _raw_write_lock_flags, &flags);
155 return flags; 157 return flags;
156} 158}
157EXPORT_SYMBOL(_write_lock_irqsave); 159EXPORT_SYMBOL(_write_lock_irqsave);
@@ -299,16 +301,8 @@ unsigned long __lockfunc _spin_lock_irqsave_nested(spinlock_t *lock, int subclas
299 local_irq_save(flags); 301 local_irq_save(flags);
300 preempt_disable(); 302 preempt_disable();
301 spin_acquire(&lock->dep_map, subclass, 0, _RET_IP_); 303 spin_acquire(&lock->dep_map, subclass, 0, _RET_IP_);
302 /* 304 LOCK_CONTENDED_FLAGS(lock, _raw_spin_trylock, _raw_spin_lock,
303 * On lockdep we dont want the hand-coded irq-enable of 305 _raw_spin_lock_flags, &flags);
304 * _raw_spin_lock_flags() code, because lockdep assumes
305 * that interrupts are not re-enabled during lock-acquire:
306 */
307#ifdef CONFIG_LOCKDEP
308 LOCK_CONTENDED(lock, _raw_spin_trylock, _raw_spin_lock);
309#else
310 _raw_spin_lock_flags(lock, &flags);
311#endif
312 return flags; 306 return flags;
313} 307}
314EXPORT_SYMBOL(_spin_lock_irqsave_nested); 308EXPORT_SYMBOL(_spin_lock_irqsave_nested);
diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c
index 74541ca49536..912823e2a11b 100644
--- a/kernel/stop_machine.c
+++ b/kernel/stop_machine.c
@@ -44,7 +44,7 @@ static DEFINE_MUTEX(setup_lock);
44static int refcount; 44static int refcount;
45static struct workqueue_struct *stop_machine_wq; 45static struct workqueue_struct *stop_machine_wq;
46static struct stop_machine_data active, idle; 46static struct stop_machine_data active, idle;
47static const cpumask_t *active_cpus; 47static const struct cpumask *active_cpus;
48static void *stop_machine_work; 48static void *stop_machine_work;
49 49
50static void set_state(enum stopmachine_state newstate) 50static void set_state(enum stopmachine_state newstate)
diff --git a/kernel/sys.c b/kernel/sys.c
index 37f458e6882a..51dbb55604e8 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -34,6 +34,7 @@
34#include <linux/seccomp.h> 34#include <linux/seccomp.h>
35#include <linux/cpu.h> 35#include <linux/cpu.h>
36#include <linux/ptrace.h> 36#include <linux/ptrace.h>
37#include <linux/fs_struct.h>
37 38
38#include <linux/compat.h> 39#include <linux/compat.h>
39#include <linux/syscalls.h> 40#include <linux/syscalls.h>
@@ -1013,10 +1014,8 @@ SYSCALL_DEFINE2(setpgid, pid_t, pid, pid_t, pgid)
1013 if (err) 1014 if (err)
1014 goto out; 1015 goto out;
1015 1016
1016 if (task_pgrp(p) != pgrp) { 1017 if (task_pgrp(p) != pgrp)
1017 change_pid(p, PIDTYPE_PGID, pgrp); 1018 change_pid(p, PIDTYPE_PGID, pgrp);
1018 set_task_pgrp(p, pid_nr(pgrp));
1019 }
1020 1019
1021 err = 0; 1020 err = 0;
1022out: 1021out:
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index c5ef44ff850f..82350f8f04f6 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -48,6 +48,7 @@
48#include <linux/acpi.h> 48#include <linux/acpi.h>
49#include <linux/reboot.h> 49#include <linux/reboot.h>
50#include <linux/ftrace.h> 50#include <linux/ftrace.h>
51#include <linux/slow-work.h>
51 52
52#include <asm/uaccess.h> 53#include <asm/uaccess.h>
53#include <asm/processor.h> 54#include <asm/processor.h>
@@ -95,12 +96,9 @@ static int sixty = 60;
95static int neg_one = -1; 96static int neg_one = -1;
96#endif 97#endif
97 98
98#if defined(CONFIG_MMU) && defined(CONFIG_FILE_LOCKING)
99static int two = 2;
100#endif
101
102static int zero; 99static int zero;
103static int one = 1; 100static int one = 1;
101static int two = 2;
104static unsigned long one_ul = 1; 102static unsigned long one_ul = 1;
105static int one_hundred = 100; 103static int one_hundred = 100;
106 104
@@ -900,6 +898,14 @@ static struct ctl_table kern_table[] = {
900 .proc_handler = &scan_unevictable_handler, 898 .proc_handler = &scan_unevictable_handler,
901 }, 899 },
902#endif 900#endif
901#ifdef CONFIG_SLOW_WORK
902 {
903 .ctl_name = CTL_UNNUMBERED,
904 .procname = "slow-work",
905 .mode = 0555,
906 .child = slow_work_sysctls,
907 },
908#endif
903/* 909/*
904 * NOTE: do not add new entries to this table unless you have read 910 * NOTE: do not add new entries to this table unless you have read
905 * Documentation/sysctl/ctl_unnumbered.txt 911 * Documentation/sysctl/ctl_unnumbered.txt
@@ -1010,7 +1016,7 @@ static struct ctl_table vm_table[] = {
1010 .data = &dirty_expire_interval, 1016 .data = &dirty_expire_interval,
1011 .maxlen = sizeof(dirty_expire_interval), 1017 .maxlen = sizeof(dirty_expire_interval),
1012 .mode = 0644, 1018 .mode = 0644,
1013 .proc_handler = &proc_dointvec_userhz_jiffies, 1019 .proc_handler = &proc_dointvec,
1014 }, 1020 },
1015 { 1021 {
1016 .ctl_name = VM_NR_PDFLUSH_THREADS, 1022 .ctl_name = VM_NR_PDFLUSH_THREADS,
@@ -1373,10 +1379,7 @@ static struct ctl_table fs_table[] = {
1373 .data = &lease_break_time, 1379 .data = &lease_break_time,
1374 .maxlen = sizeof(int), 1380 .maxlen = sizeof(int),
1375 .mode = 0644, 1381 .mode = 0644,
1376 .proc_handler = &proc_dointvec_minmax, 1382 .proc_handler = &proc_dointvec,
1377 .strategy = &sysctl_intvec,
1378 .extra1 = &zero,
1379 .extra2 = &two,
1380 }, 1383 },
1381#endif 1384#endif
1382#ifdef CONFIG_AIO 1385#ifdef CONFIG_AIO
@@ -1417,7 +1420,10 @@ static struct ctl_table fs_table[] = {
1417 .data = &suid_dumpable, 1420 .data = &suid_dumpable,
1418 .maxlen = sizeof(int), 1421 .maxlen = sizeof(int),
1419 .mode = 0644, 1422 .mode = 0644,
1420 .proc_handler = &proc_dointvec, 1423 .proc_handler = &proc_dointvec_minmax,
1424 .strategy = &sysctl_intvec,
1425 .extra1 = &zero,
1426 .extra2 = &two,
1421 }, 1427 },
1422#if defined(CONFIG_BINFMT_MISC) || defined(CONFIG_BINFMT_MISC_MODULE) 1428#if defined(CONFIG_BINFMT_MISC) || defined(CONFIG_BINFMT_MISC_MODULE)
1423 { 1429 {
diff --git a/kernel/timer.c b/kernel/timer.c
index 9b77fc9a9ac8..b4555568b4e4 100644
--- a/kernel/timer.c
+++ b/kernel/timer.c
@@ -491,14 +491,18 @@ static inline void debug_timer_free(struct timer_list *timer)
491 debug_object_free(timer, &timer_debug_descr); 491 debug_object_free(timer, &timer_debug_descr);
492} 492}
493 493
494static void __init_timer(struct timer_list *timer); 494static void __init_timer(struct timer_list *timer,
495 const char *name,
496 struct lock_class_key *key);
495 497
496void init_timer_on_stack(struct timer_list *timer) 498void init_timer_on_stack_key(struct timer_list *timer,
499 const char *name,
500 struct lock_class_key *key)
497{ 501{
498 debug_object_init_on_stack(timer, &timer_debug_descr); 502 debug_object_init_on_stack(timer, &timer_debug_descr);
499 __init_timer(timer); 503 __init_timer(timer, name, key);
500} 504}
501EXPORT_SYMBOL_GPL(init_timer_on_stack); 505EXPORT_SYMBOL_GPL(init_timer_on_stack_key);
502 506
503void destroy_timer_on_stack(struct timer_list *timer) 507void destroy_timer_on_stack(struct timer_list *timer)
504{ 508{
@@ -512,7 +516,9 @@ static inline void debug_timer_activate(struct timer_list *timer) { }
512static inline void debug_timer_deactivate(struct timer_list *timer) { } 516static inline void debug_timer_deactivate(struct timer_list *timer) { }
513#endif 517#endif
514 518
515static void __init_timer(struct timer_list *timer) 519static void __init_timer(struct timer_list *timer,
520 const char *name,
521 struct lock_class_key *key)
516{ 522{
517 timer->entry.next = NULL; 523 timer->entry.next = NULL;
518 timer->base = __raw_get_cpu_var(tvec_bases); 524 timer->base = __raw_get_cpu_var(tvec_bases);
@@ -521,6 +527,7 @@ static void __init_timer(struct timer_list *timer)
521 timer->start_pid = -1; 527 timer->start_pid = -1;
522 memset(timer->start_comm, 0, TASK_COMM_LEN); 528 memset(timer->start_comm, 0, TASK_COMM_LEN);
523#endif 529#endif
530 lockdep_init_map(&timer->lockdep_map, name, key, 0);
524} 531}
525 532
526/** 533/**
@@ -530,19 +537,23 @@ static void __init_timer(struct timer_list *timer)
530 * init_timer() must be done to a timer prior calling *any* of the 537 * init_timer() must be done to a timer prior calling *any* of the
531 * other timer functions. 538 * other timer functions.
532 */ 539 */
533void init_timer(struct timer_list *timer) 540void init_timer_key(struct timer_list *timer,
541 const char *name,
542 struct lock_class_key *key)
534{ 543{
535 debug_timer_init(timer); 544 debug_timer_init(timer);
536 __init_timer(timer); 545 __init_timer(timer, name, key);
537} 546}
538EXPORT_SYMBOL(init_timer); 547EXPORT_SYMBOL(init_timer_key);
539 548
540void init_timer_deferrable(struct timer_list *timer) 549void init_timer_deferrable_key(struct timer_list *timer,
550 const char *name,
551 struct lock_class_key *key)
541{ 552{
542 init_timer(timer); 553 init_timer_key(timer, name, key);
543 timer_set_deferrable(timer); 554 timer_set_deferrable(timer);
544} 555}
545EXPORT_SYMBOL(init_timer_deferrable); 556EXPORT_SYMBOL(init_timer_deferrable_key);
546 557
547static inline void detach_timer(struct timer_list *timer, 558static inline void detach_timer(struct timer_list *timer,
548 int clear_pending) 559 int clear_pending)
@@ -826,6 +837,15 @@ EXPORT_SYMBOL(try_to_del_timer_sync);
826 */ 837 */
827int del_timer_sync(struct timer_list *timer) 838int del_timer_sync(struct timer_list *timer)
828{ 839{
840#ifdef CONFIG_LOCKDEP
841 unsigned long flags;
842
843 local_irq_save(flags);
844 lock_map_acquire(&timer->lockdep_map);
845 lock_map_release(&timer->lockdep_map);
846 local_irq_restore(flags);
847#endif
848
829 for (;;) { 849 for (;;) {
830 int ret = try_to_del_timer_sync(timer); 850 int ret = try_to_del_timer_sync(timer);
831 if (ret >= 0) 851 if (ret >= 0)
@@ -897,10 +917,36 @@ static inline void __run_timers(struct tvec_base *base)
897 917
898 set_running_timer(base, timer); 918 set_running_timer(base, timer);
899 detach_timer(timer, 1); 919 detach_timer(timer, 1);
920
900 spin_unlock_irq(&base->lock); 921 spin_unlock_irq(&base->lock);
901 { 922 {
902 int preempt_count = preempt_count(); 923 int preempt_count = preempt_count();
924
925#ifdef CONFIG_LOCKDEP
926 /*
927 * It is permissible to free the timer from
928 * inside the function that is called from
929 * it, this we need to take into account for
930 * lockdep too. To avoid bogus "held lock
931 * freed" warnings as well as problems when
932 * looking into timer->lockdep_map, make a
933 * copy and use that here.
934 */
935 struct lockdep_map lockdep_map =
936 timer->lockdep_map;
937#endif
938 /*
939 * Couple the lock chain with the lock chain at
940 * del_timer_sync() by acquiring the lock_map
941 * around the fn() call here and in
942 * del_timer_sync().
943 */
944 lock_map_acquire(&lockdep_map);
945
903 fn(data); 946 fn(data);
947
948 lock_map_release(&lockdep_map);
949
904 if (preempt_count != preempt_count()) { 950 if (preempt_count != preempt_count()) {
905 printk(KERN_ERR "huh, entered %p " 951 printk(KERN_ERR "huh, entered %p "
906 "with preempt_count %08x, exited" 952 "with preempt_count %08x, exited"
diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c
index 930c08e5b38e..dce71a5b51bc 100644
--- a/kernel/trace/trace_functions_graph.c
+++ b/kernel/trace/trace_functions_graph.c
@@ -42,6 +42,81 @@ static struct tracer_flags tracer_flags = {
42/* pid on the last trace processed */ 42/* pid on the last trace processed */
43static pid_t last_pid[NR_CPUS] = { [0 ... NR_CPUS-1] = -1 }; 43static pid_t last_pid[NR_CPUS] = { [0 ... NR_CPUS-1] = -1 };
44 44
45/* Add a function return address to the trace stack on thread info.*/
46int
47ftrace_push_return_trace(unsigned long ret, unsigned long long time,
48 unsigned long func, int *depth)
49{
50 int index;
51
52 if (!current->ret_stack)
53 return -EBUSY;
54
55 /* The return trace stack is full */
56 if (current->curr_ret_stack == FTRACE_RETFUNC_DEPTH - 1) {
57 atomic_inc(&current->trace_overrun);
58 return -EBUSY;
59 }
60
61 index = ++current->curr_ret_stack;
62 barrier();
63 current->ret_stack[index].ret = ret;
64 current->ret_stack[index].func = func;
65 current->ret_stack[index].calltime = time;
66 *depth = index;
67
68 return 0;
69}
70
71/* Retrieve a function return address to the trace stack on thread info.*/
72void
73ftrace_pop_return_trace(struct ftrace_graph_ret *trace, unsigned long *ret)
74{
75 int index;
76
77 index = current->curr_ret_stack;
78
79 if (unlikely(index < 0)) {
80 ftrace_graph_stop();
81 WARN_ON(1);
82 /* Might as well panic, otherwise we have no where to go */
83 *ret = (unsigned long)panic;
84 return;
85 }
86
87 *ret = current->ret_stack[index].ret;
88 trace->func = current->ret_stack[index].func;
89 trace->calltime = current->ret_stack[index].calltime;
90 trace->overrun = atomic_read(&current->trace_overrun);
91 trace->depth = index;
92 barrier();
93 current->curr_ret_stack--;
94
95}
96
97/*
98 * Send the trace to the ring-buffer.
99 * @return the original return address.
100 */
101unsigned long ftrace_return_to_handler(void)
102{
103 struct ftrace_graph_ret trace;
104 unsigned long ret;
105
106 ftrace_pop_return_trace(&trace, &ret);
107 trace.rettime = cpu_clock(raw_smp_processor_id());
108 ftrace_graph_return(&trace);
109
110 if (unlikely(!ret)) {
111 ftrace_graph_stop();
112 WARN_ON(1);
113 /* Might as well panic. What else to do? */
114 ret = (unsigned long)panic;
115 }
116
117 return ret;
118}
119
45static int graph_trace_init(struct trace_array *tr) 120static int graph_trace_init(struct trace_array *tr)
46{ 121{
47 int cpu, ret; 122 int cpu, ret;
diff --git a/kernel/utsname_sysctl.c b/kernel/utsname_sysctl.c
index 3b34b3545936..92359cc747a7 100644
--- a/kernel/utsname_sysctl.c
+++ b/kernel/utsname_sysctl.c
@@ -37,7 +37,7 @@ static void put_uts(ctl_table *table, int write, void *which)
37 up_write(&uts_sem); 37 up_write(&uts_sem);
38} 38}
39 39
40#ifdef CONFIG_PROC_FS 40#ifdef CONFIG_PROC_SYSCTL
41/* 41/*
42 * Special case of dostring for the UTS structure. This has locks 42 * Special case of dostring for the UTS structure. This has locks
43 * to observe. Should this be in kernel/sys.c ???? 43 * to observe. Should this be in kernel/sys.c ????
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 1f0c509b40d3..32f8e0d2bf5a 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -48,8 +48,6 @@ struct cpu_workqueue_struct {
48 48
49 struct workqueue_struct *wq; 49 struct workqueue_struct *wq;
50 struct task_struct *thread; 50 struct task_struct *thread;
51
52 int run_depth; /* Detect run_workqueue() recursion depth */
53} ____cacheline_aligned; 51} ____cacheline_aligned;
54 52
55/* 53/*
@@ -262,13 +260,6 @@ EXPORT_SYMBOL_GPL(queue_delayed_work_on);
262static void run_workqueue(struct cpu_workqueue_struct *cwq) 260static void run_workqueue(struct cpu_workqueue_struct *cwq)
263{ 261{
264 spin_lock_irq(&cwq->lock); 262 spin_lock_irq(&cwq->lock);
265 cwq->run_depth++;
266 if (cwq->run_depth > 3) {
267 /* morton gets to eat his hat */
268 printk("%s: recursion depth exceeded: %d\n",
269 __func__, cwq->run_depth);
270 dump_stack();
271 }
272 while (!list_empty(&cwq->worklist)) { 263 while (!list_empty(&cwq->worklist)) {
273 struct work_struct *work = list_entry(cwq->worklist.next, 264 struct work_struct *work = list_entry(cwq->worklist.next,
274 struct work_struct, entry); 265 struct work_struct, entry);
@@ -311,7 +302,6 @@ static void run_workqueue(struct cpu_workqueue_struct *cwq)
311 spin_lock_irq(&cwq->lock); 302 spin_lock_irq(&cwq->lock);
312 cwq->current_work = NULL; 303 cwq->current_work = NULL;
313 } 304 }
314 cwq->run_depth--;
315 spin_unlock_irq(&cwq->lock); 305 spin_unlock_irq(&cwq->lock);
316} 306}
317 307
@@ -368,29 +358,20 @@ static void insert_wq_barrier(struct cpu_workqueue_struct *cwq,
368 358
369static int flush_cpu_workqueue(struct cpu_workqueue_struct *cwq) 359static int flush_cpu_workqueue(struct cpu_workqueue_struct *cwq)
370{ 360{
371 int active; 361 int active = 0;
362 struct wq_barrier barr;
372 363
373 if (cwq->thread == current) { 364 WARN_ON(cwq->thread == current);
374 /*
375 * Probably keventd trying to flush its own queue. So simply run
376 * it by hand rather than deadlocking.
377 */
378 run_workqueue(cwq);
379 active = 1;
380 } else {
381 struct wq_barrier barr;
382 365
383 active = 0; 366 spin_lock_irq(&cwq->lock);
384 spin_lock_irq(&cwq->lock); 367 if (!list_empty(&cwq->worklist) || cwq->current_work != NULL) {
385 if (!list_empty(&cwq->worklist) || cwq->current_work != NULL) { 368 insert_wq_barrier(cwq, &barr, &cwq->worklist);
386 insert_wq_barrier(cwq, &barr, &cwq->worklist); 369 active = 1;
387 active = 1;
388 }
389 spin_unlock_irq(&cwq->lock);
390
391 if (active)
392 wait_for_completion(&barr.done);
393 } 370 }
371 spin_unlock_irq(&cwq->lock);
372
373 if (active)
374 wait_for_completion(&barr.done);
394 375
395 return active; 376 return active;
396} 377}
@@ -416,7 +397,7 @@ void flush_workqueue(struct workqueue_struct *wq)
416 might_sleep(); 397 might_sleep();
417 lock_map_acquire(&wq->lockdep_map); 398 lock_map_acquire(&wq->lockdep_map);
418 lock_map_release(&wq->lockdep_map); 399 lock_map_release(&wq->lockdep_map);
419 for_each_cpu_mask_nr(cpu, *cpu_map) 400 for_each_cpu(cpu, cpu_map)
420 flush_cpu_workqueue(per_cpu_ptr(wq->cpu_wq, cpu)); 401 flush_cpu_workqueue(per_cpu_ptr(wq->cpu_wq, cpu));
421} 402}
422EXPORT_SYMBOL_GPL(flush_workqueue); 403EXPORT_SYMBOL_GPL(flush_workqueue);
@@ -547,7 +528,7 @@ static void wait_on_work(struct work_struct *work)
547 wq = cwq->wq; 528 wq = cwq->wq;
548 cpu_map = wq_cpu_map(wq); 529 cpu_map = wq_cpu_map(wq);
549 530
550 for_each_cpu_mask_nr(cpu, *cpu_map) 531 for_each_cpu(cpu, cpu_map)
551 wait_on_cpu_work(per_cpu_ptr(wq->cpu_wq, cpu), work); 532 wait_on_cpu_work(per_cpu_ptr(wq->cpu_wq, cpu), work);
552} 533}
553 534
@@ -911,7 +892,7 @@ void destroy_workqueue(struct workqueue_struct *wq)
911 list_del(&wq->list); 892 list_del(&wq->list);
912 spin_unlock(&workqueue_lock); 893 spin_unlock(&workqueue_lock);
913 894
914 for_each_cpu_mask_nr(cpu, *cpu_map) 895 for_each_cpu(cpu, cpu_map)
915 cleanup_workqueue_thread(per_cpu_ptr(wq->cpu_wq, cpu)); 896 cleanup_workqueue_thread(per_cpu_ptr(wq->cpu_wq, cpu));
916 cpu_maps_update_done(); 897 cpu_maps_update_done();
917 898