author     Ingo Molnar <mingo@elte.hu>  2009-04-07 07:47:33 -0400
committer  Ingo Molnar <mingo@elte.hu>  2009-04-07 07:47:45 -0400
commit     93776a8ec746cf9d32c36e5a5b23d28d8be28826 (patch)
tree       6c472ae9f709246ee5268e1d71559d07839fb965 /kernel
parent     34886c8bc590f078d4c0b88f50d061326639198d (diff)
parent     d508afb437daee7cf07da085b635c44a4ebf9b38 (diff)
Merge branch 'linus' into tracing/core
Merge reason: update to upstream tracing facilities

Signed-off-by: Ingo Molnar <mingo@elte.hu>
Diffstat (limited to 'kernel')
-rw-r--r--  kernel/Makefile | 1
-rw-r--r--  kernel/async.c | 18
-rw-r--r--  kernel/audit.c | 9
-rw-r--r--  kernel/audit_tree.c | 2
-rw-r--r--  kernel/auditfilter.c | 16
-rw-r--r--  kernel/auditsc.c | 34
-rw-r--r--  kernel/cgroup.c | 435
-rw-r--r--  kernel/cgroup_debug.c | 2
-rw-r--r--  kernel/cpu.c | 6
-rw-r--r--  kernel/cpuset.c | 254
-rw-r--r--  kernel/exec_domain.c | 23
-rw-r--r--  kernel/exit.c | 248
-rw-r--r--  kernel/extable.c | 30
-rw-r--r--  kernel/fork.c | 72
-rw-r--r--  kernel/futex.c | 201
-rw-r--r--  kernel/hrtimer.c | 55
-rw-r--r--  kernel/irq/Makefile | 1
-rw-r--r--  kernel/irq/chip.c | 7
-rw-r--r--  kernel/irq/handle.c | 36
-rw-r--r--  kernel/irq/internals.h | 3
-rw-r--r--  kernel/irq/manage.c | 223
-rw-r--r--  kernel/irq/numa_migrate.c | 11
-rw-r--r--  kernel/irq/pm.c | 79
-rw-r--r--  kernel/irq/spurious.c | 14
-rw-r--r--  kernel/kallsyms.c | 19
-rw-r--r--  kernel/kexec.c | 22
-rw-r--r--  kernel/kmod.c | 12
-rw-r--r--  kernel/kthread.c | 4
-rw-r--r--  kernel/latencytop.c | 83
-rw-r--r--  kernel/lockdep.c | 26
-rw-r--r--  kernel/module.c | 299
-rw-r--r--  kernel/ns_cgroup.c | 14
-rw-r--r--  kernel/panic.c | 115
-rw-r--r--  kernel/params.c | 26
-rw-r--r--  kernel/pid.c | 33
-rw-r--r--  kernel/pid_namespace.c | 15
-rw-r--r--  kernel/posix-cpu-timers.c | 3
-rw-r--r--  kernel/power/disk.c | 143
-rw-r--r--  kernel/power/main.c | 55
-rw-r--r--  kernel/power/snapshot.c | 9
-rw-r--r--  kernel/power/swsusp.c | 18
-rw-r--r--  kernel/printk.c | 26
-rw-r--r--  kernel/ptrace.c | 103
-rw-r--r--  kernel/rcuclassic.c | 23
-rw-r--r--  kernel/rcupdate.c | 44
-rw-r--r--  kernel/rcupreempt.c | 48
-rw-r--r--  kernel/rcutorture.c | 25
-rw-r--r--  kernel/rcutree.c | 20
-rw-r--r--  kernel/rcutree.h | 10
-rw-r--r--  kernel/rcutree_trace.c | 2
-rw-r--r--  kernel/relay.c | 10
-rw-r--r--  kernel/sched.c | 1063
-rw-r--r--  kernel/sched_clock.c | 31
-rw-r--r--  kernel/sched_cpupri.h | 2
-rw-r--r--  kernel/sched_debug.c | 8
-rw-r--r--  kernel/sched_fair.c | 59
-rw-r--r--  kernel/sched_features.h | 3
-rw-r--r--  kernel/sched_rt.c | 537
-rw-r--r--  kernel/sched_stats.h | 7
-rw-r--r--  kernel/signal.c | 71
-rw-r--r--  kernel/slow-work.c | 640
-rw-r--r--  kernel/smp.c | 432
-rw-r--r--  kernel/softirq.c | 4
-rw-r--r--  kernel/spinlock.c | 18
-rw-r--r--  kernel/stop_machine.c | 2
-rw-r--r--  kernel/sys.c | 5
-rw-r--r--  kernel/sysctl.c | 28
-rw-r--r--  kernel/sysctl_check.c | 1
-rw-r--r--  kernel/time/Makefile | 2
-rw-r--r--  kernel/time/clockevents.c | 20
-rw-r--r--  kernel/time/clocksource.c | 76
-rw-r--r--  kernel/time/ntp.c | 444
-rw-r--r--  kernel/time/timecompare.c | 191
-rw-r--r--  kernel/timer.c | 110
-rw-r--r--  kernel/trace/Kconfig | 9
-rw-r--r--  kernel/trace/Makefile | 1
-rw-r--r--  kernel/trace/blktrace.c | 473
-rw-r--r--  kernel/trace/ftrace.c | 13
-rw-r--r--  kernel/trace/kmemtrace.c | 319
-rw-r--r--  kernel/trace/ring_buffer.c | 118
-rw-r--r--  kernel/trace/trace.c | 42
-rw-r--r--  kernel/trace/trace.h | 82
-rw-r--r--  kernel/trace/trace_events.c | 203
-rw-r--r--  kernel/trace/trace_events_filter.c | 427
-rw-r--r--  kernel/trace/trace_events_stage_2.h | 45
-rw-r--r--  kernel/trace/trace_events_stage_3.h | 9
-rw-r--r--  kernel/trace/trace_nop.c | 1
-rw-r--r--  kernel/trace/trace_output.c | 19
-rw-r--r--  kernel/trace/trace_output.h | 33
-rw-r--r--  kernel/trace/trace_stat.c | 26
-rw-r--r--  kernel/trace/trace_workqueue.c | 12
-rw-r--r--  kernel/user.c | 2
-rw-r--r--  kernel/utsname_sysctl.c | 2
-rw-r--r--  kernel/workqueue.c | 47
94 files changed, 6113 insertions, 2511 deletions
diff --git a/kernel/Makefile b/kernel/Makefile
index e4791b3ba55d..bab1dffe37e9 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -93,6 +93,7 @@ obj-$(CONFIG_HAVE_GENERIC_DMA_COHERENT) += dma-coherent.o
 obj-$(CONFIG_FUNCTION_TRACER) += trace/
 obj-$(CONFIG_TRACING) += trace/
 obj-$(CONFIG_SMP) += sched_cpupri.o
+obj-$(CONFIG_SLOW_WORK) += slow-work.o
 
 ifneq ($(CONFIG_SCHED_OMIT_FRAME_POINTER),y)
 # According to Alan Modra <alan@linuxcare.com.au>, the -fno-omit-frame-pointer is
diff --git a/kernel/async.c b/kernel/async.c
index f565891f2c9b..968ef9457d4e 100644
--- a/kernel/async.c
+++ b/kernel/async.c
@@ -49,6 +49,7 @@ asynchronous and synchronous parts of the kernel.
  */
 
 #include <linux/async.h>
+#include <linux/bug.h>
 #include <linux/module.h>
 #include <linux/wait.h>
 #include <linux/sched.h>
@@ -387,20 +388,11 @@ static int async_manager_thread(void *unused)
 
 static int __init async_init(void)
 {
-        if (async_enabled)
-                if (IS_ERR(kthread_run(async_manager_thread, NULL,
-                                       "async/mgr")))
-                        async_enabled = 0;
-        return 0;
-}
+        async_enabled =
+                !IS_ERR(kthread_run(async_manager_thread, NULL, "async/mgr"));
 
-static int __init setup_async(char *str)
-{
-        async_enabled = 1;
-        return 1;
-}
+        WARN_ON(!async_enabled);
+        return 0;
+}
 
-__setup("fastboot", setup_async);
-
-
 core_initcall(async_init);
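The rewritten async_init() folds the old nested conditional and the separate "fastboot" setup hook into one boolean assignment. A standalone C sketch (not kernel code) of the ERR_PTR/IS_ERR idiom it depends on, with fake_kthread_run() as a hypothetical stand-in for the real kthread API:

    /*
     * A pointer return carries either a valid object or a negative errno
     * encoded into its top range, so "did the thread start" collapses
     * to !IS_ERR(...).
     */
    #include <stdio.h>
    #include <errno.h>

    #define MAX_ERRNO 4095

    static inline void *ERR_PTR(long error) { return (void *)error; }
    static inline int IS_ERR(const void *ptr)
    {
            return (unsigned long)ptr >= (unsigned long)-MAX_ERRNO;
    }

    /* hypothetical stand-in for kthread_run(async_manager_thread, ...) */
    static void *fake_kthread_run(int fail)
    {
            static int thread_obj;
            return fail ? ERR_PTR(-ENOMEM) : &thread_obj;
    }

    int main(void)
    {
            int async_enabled = !IS_ERR(fake_kthread_run(0));
            printf("async_enabled = %d\n", async_enabled);  /* prints 1 */
            return 0;
    }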
diff --git a/kernel/audit.c b/kernel/audit.c
index ce6d8ea3131e..9442c3533ba9 100644
--- a/kernel/audit.c
+++ b/kernel/audit.c
@@ -766,6 +766,9 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
 
                 audit_log_format(ab, " msg=");
                 size = nlmsg_len(nlh);
+                if (size > 0 &&
+                    ((unsigned char *)data)[size - 1] == '\0')
+                        size--;
                 audit_log_n_untrustedstring(ab, data, size);
         }
         audit_set_pid(ab, pid);
@@ -1382,7 +1385,7 @@ void audit_log_n_string(struct audit_buffer *ab, const char *string,
 int audit_string_contains_control(const char *string, size_t len)
 {
         const unsigned char *p;
-        for (p = string; p < (const unsigned char *)string + len && *p; p++) {
+        for (p = string; p < (const unsigned char *)string + len; p++) {
                 if (*p == '"' || *p < 0x21 || *p > 0x7e)
                         return 1;
         }
@@ -1437,13 +1440,13 @@ void audit_log_d_path(struct audit_buffer *ab, const char *prefix,
         /* We will allow 11 spaces for ' (deleted)' to be appended */
         pathname = kmalloc(PATH_MAX+11, ab->gfp_mask);
         if (!pathname) {
-                audit_log_format(ab, "<no memory>");
+                audit_log_string(ab, "<no_memory>");
                 return;
         }
         p = d_path(path, pathname, PATH_MAX+11);
         if (IS_ERR(p)) { /* Should never happen since we send PATH_MAX */
                 /* FIXME: can we save some information here? */
-                audit_log_format(ab, "<too long>");
+                audit_log_string(ab, "<too_long>");
         } else
                 audit_log_untrustedstring(ab, p);
         kfree(pathname);
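The audit_receive_msg() change above trims one trailing NUL from the netlink payload before logging, since userspace frequently sends the message text as a C string and the terminator showed up in the record. A standalone sketch of the same length adjustment (trim_trailing_nul() is illustrative, not a kernel helper):

    #include <stdio.h>

    static size_t trim_trailing_nul(const unsigned char *data, size_t size)
    {
            if (size > 0 && data[size - 1] == '\0')
                    size--;
            return size;
    }

    int main(void)
    {
            const char msg[] = "halt";      /* sizeof includes the NUL */
            size_t size = trim_trailing_nul((const unsigned char *)msg,
                                            sizeof(msg));
            printf("logging %zu of %zu bytes\n", size, sizeof(msg)); /* 4 of 5 */
            return 0;
    }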
diff --git a/kernel/audit_tree.c b/kernel/audit_tree.c
index 8ad9545b8db9..917ab9525568 100644
--- a/kernel/audit_tree.c
+++ b/kernel/audit_tree.c
@@ -385,6 +385,7 @@ static int tag_chunk(struct inode *inode, struct audit_tree *tree)
                 mutex_lock(&inode->inotify_mutex);
                 if (inotify_clone_watch(&old->watch, &chunk->watch) < 0) {
                         mutex_unlock(&inode->inotify_mutex);
+                        put_inotify_watch(&old->watch);
                         free_chunk(chunk);
                         return -ENOSPC;
                 }
@@ -394,6 +395,7 @@ static int tag_chunk(struct inode *inode, struct audit_tree *tree)
                         chunk->dead = 1;
                         inotify_evict_watch(&chunk->watch);
                         mutex_unlock(&inode->inotify_mutex);
+                        put_inotify_watch(&old->watch);
                         put_inotify_watch(&chunk->watch);
                         return 0;
                 }
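Both tag_chunk() hunks add the put_inotify_watch() that balances the reference taken on the old watch earlier in the function; the clone-failure and already-dead paths previously leaked it. A minimal standalone sketch of the get/put invariant the fix restores (struct watch and the helpers are illustrative):

    #include <stdio.h>

    struct watch { int refcount; };

    static void get_watch(struct watch *w) { w->refcount++; }
    static void put_watch(struct watch *w) { w->refcount--; }

    static int tag_chunk(struct watch *old, int clone_fails)
    {
            get_watch(old);                 /* reference taken during lookup */
            if (clone_fails) {
                    put_watch(old);         /* the put the patch adds */
                    return -28;             /* -ENOSPC */
            }
            put_watch(old);                 /* success path drops it too */
            return 0;
    }

    int main(void)
    {
            struct watch w = { .refcount = 1 };
            tag_chunk(&w, 1);
            printf("refcount after failure: %d\n", w.refcount); /* still 1 */
            return 0;
    }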
diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c
index fbf24d121d97..a6fe71fd5d1b 100644
--- a/kernel/auditfilter.c
+++ b/kernel/auditfilter.c
@@ -135,18 +135,18 @@ static void audit_remove_watch(struct audit_watch *watch)
 static inline void audit_free_rule(struct audit_entry *e)
 {
         int i;
-
+        struct audit_krule *erule = &e->rule;
         /* some rules don't have associated watches */
-        if (e->rule.watch)
-                audit_put_watch(e->rule.watch);
-        if (e->rule.fields)
-                for (i = 0; i < e->rule.field_count; i++) {
-                        struct audit_field *f = &e->rule.fields[i];
+        if (erule->watch)
+                audit_put_watch(erule->watch);
+        if (erule->fields)
+                for (i = 0; i < erule->field_count; i++) {
+                        struct audit_field *f = &erule->fields[i];
                         kfree(f->lsm_str);
                         security_audit_rule_free(f->lsm_rule);
                 }
-        kfree(e->rule.fields);
-        kfree(e->rule.filterkey);
+        kfree(erule->fields);
+        kfree(erule->filterkey);
         kfree(e);
 }
 
diff --git a/kernel/auditsc.c b/kernel/auditsc.c
index 8cbddff6c283..7d6ac7c1f414 100644
--- a/kernel/auditsc.c
+++ b/kernel/auditsc.c
@@ -66,6 +66,7 @@
 #include <linux/syscalls.h>
 #include <linux/inotify.h>
 #include <linux/capability.h>
+#include <linux/fs_struct.h>
 
 #include "audit.h"
 
@@ -328,6 +329,14 @@ static int audit_match_filetype(struct audit_context *ctx, int which)
  */
 
 #ifdef CONFIG_AUDIT_TREE
+static void audit_set_auditable(struct audit_context *ctx)
+{
+        if (!ctx->prio) {
+                ctx->prio = 1;
+                ctx->current_state = AUDIT_RECORD_CONTEXT;
+        }
+}
+
 static int put_tree_ref(struct audit_context *ctx, struct audit_chunk *chunk)
 {
         struct audit_tree_refs *p = ctx->trees;
@@ -741,17 +750,9 @@ void audit_filter_inodes(struct task_struct *tsk, struct audit_context *ctx)
         rcu_read_unlock();
 }
 
-static void audit_set_auditable(struct audit_context *ctx)
-{
-        if (!ctx->prio) {
-                ctx->prio = 1;
-                ctx->current_state = AUDIT_RECORD_CONTEXT;
-        }
-}
-
 static inline struct audit_context *audit_get_context(struct task_struct *tsk,
                                                       int return_valid,
-                                                      int return_code)
+                                                      long return_code)
 {
         struct audit_context *context = tsk->audit_context;
 
@@ -1023,7 +1024,7 @@ static int audit_log_single_execve_arg(struct audit_context *context,
 {
         char arg_num_len_buf[12];
         const char __user *tmp_p = p;
-        /* how many digits are in arg_num? 3 is the length of a=\n */
+        /* how many digits are in arg_num? 3 is the length of " a=" */
         size_t arg_num_len = snprintf(arg_num_len_buf, 12, "%d", arg_num) + 3;
         size_t len, len_left, to_send;
         size_t max_execve_audit_len = MAX_EXECVE_AUDIT_LEN;
@@ -1109,7 +1110,7 @@ static int audit_log_single_execve_arg(struct audit_context *context,
          * so we can be sure nothing was lost.
          */
         if ((i == 0) && (too_long))
-                audit_log_format(*ab, "a%d_len=%zu ", arg_num,
+                audit_log_format(*ab, " a%d_len=%zu", arg_num,
                                  has_cntl ? 2*len : len);
 
         /*
@@ -1129,7 +1130,7 @@ static int audit_log_single_execve_arg(struct audit_context *context,
                 buf[to_send] = '\0';
 
                 /* actually log it */
-                audit_log_format(*ab, "a%d", arg_num);
+                audit_log_format(*ab, " a%d", arg_num);
                 if (too_long)
                         audit_log_format(*ab, "[%d]", i);
                 audit_log_format(*ab, "=");
@@ -1137,7 +1138,6 @@ static int audit_log_single_execve_arg(struct audit_context *context,
                         audit_log_n_hex(*ab, buf, to_send);
                 else
                         audit_log_format(*ab, "\"%s\"", buf);
-                audit_log_format(*ab, "\n");
 
                 p += to_send;
                 len_left -= to_send;
@@ -1165,7 +1165,7 @@ static void audit_log_execve_info(struct audit_context *context,
 
         p = (const char __user *)axi->mm->arg_start;
 
-        audit_log_format(*ab, "argc=%d ", axi->argc);
+        audit_log_format(*ab, "argc=%d", axi->argc);
 
         /*
          * we need some kernel buffer to hold the userspace args. Just
@@ -1478,7 +1478,7 @@ static void audit_log_exit(struct audit_context *context, struct task_struct *ts
                 case 0:
                         /* name was specified as a relative path and the
                          * directory component is the cwd */
-                        audit_log_d_path(ab, " name=", &context->pwd);
+                        audit_log_d_path(ab, "name=", &context->pwd);
                         break;
                 default:
                         /* log the name's directory component */
@@ -2149,7 +2149,7 @@ int audit_set_loginuid(struct task_struct *task, uid_t loginuid)
  * __audit_mq_open - record audit data for a POSIX MQ open
  * @oflag: open flag
  * @mode: mode bits
- * @u_attr: queue attributes
+ * @attr: queue attributes
  *
  */
 void __audit_mq_open(int oflag, mode_t mode, struct mq_attr *attr)
@@ -2196,7 +2196,7 @@ void __audit_mq_sendrecv(mqd_t mqdes, size_t msg_len, unsigned int msg_prio,
 /**
  * __audit_mq_notify - record audit data for a POSIX MQ notify
  * @mqdes: MQ descriptor
- * @u_notification: Notification event
+ * @notification: Notification event
  *
  */
 
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 9edb5c4b79b4..382109b5baeb 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -94,7 +94,6 @@ struct cgroupfs_root {
         char release_agent_path[PATH_MAX];
 };
 
-
 /*
  * The "rootnode" hierarchy is the "dummy hierarchy", reserved for the
  * subsystems that are otherwise unattached - it never has more than a
@@ -102,6 +101,39 @@ struct cgroupfs_root {
  */
 static struct cgroupfs_root rootnode;
 
+/*
+ * CSS ID -- ID per subsys's Cgroup Subsys State(CSS). used only when
+ * cgroup_subsys->use_id != 0.
+ */
+#define CSS_ID_MAX      (65535)
+struct css_id {
+        /*
+         * The css to which this ID points. This pointer is set to valid value
+         * after cgroup is populated. If cgroup is removed, this will be NULL.
+         * This pointer is expected to be RCU-safe because destroy()
+         * is called after synchronize_rcu(). But for safe use, css_is_removed()
+         * css_tryget() should be used for avoiding race.
+         */
+        struct cgroup_subsys_state *css;
+        /*
+         * ID of this css.
+         */
+        unsigned short id;
+        /*
+         * Depth in hierarchy which this ID belongs to.
+         */
+        unsigned short depth;
+        /*
+         * ID is freed by RCU. (and lookup routine is RCU safe.)
+         */
+        struct rcu_head rcu_head;
+        /*
+         * Hierarchy of CSS ID belongs to.
+         */
+        unsigned short stack[0]; /* Array of Length (depth+1) */
+};
+
+
 /* The list of hierarchy roots */
 
 static LIST_HEAD(roots);
@@ -185,6 +217,8 @@ struct cg_cgroup_link {
 static struct css_set init_css_set;
 static struct cg_cgroup_link init_css_set_link;
 
+static int cgroup_subsys_init_idr(struct cgroup_subsys *ss);
+
 /* css_set_lock protects the list of css_set objects, and the
  * chain of tasks off each css_set. Nests outside task->alloc_lock
  * due to cgroup_iter_start() */
@@ -567,6 +601,9 @@ static struct backing_dev_info cgroup_backing_dev_info = {
         .capabilities   = BDI_CAP_NO_ACCT_AND_WRITEBACK,
 };
 
+static int alloc_css_id(struct cgroup_subsys *ss,
+                        struct cgroup *parent, struct cgroup *child);
+
 static struct inode *cgroup_new_inode(mode_t mode, struct super_block *sb)
 {
         struct inode *inode = new_inode(sb);
@@ -585,13 +622,18 @@ static struct inode *cgroup_new_inode(mode_t mode, struct super_block *sb)
  * Call subsys's pre_destroy handler.
  * This is called before css refcnt check.
  */
-static void cgroup_call_pre_destroy(struct cgroup *cgrp)
+static int cgroup_call_pre_destroy(struct cgroup *cgrp)
 {
         struct cgroup_subsys *ss;
+        int ret = 0;
+
         for_each_subsys(cgrp->root, ss)
-                if (ss->pre_destroy)
-                        ss->pre_destroy(ss, cgrp);
-        return;
+                if (ss->pre_destroy) {
+                        ret = ss->pre_destroy(ss, cgrp);
+                        if (ret)
+                                break;
+                }
+        return ret;
 }
 
 static void free_cgroup_rcu(struct rcu_head *obj)
@@ -685,6 +727,22 @@ static void cgroup_d_remove_dir(struct dentry *dentry)
         remove_dir(dentry);
 }
 
+/*
+ * A queue for waiters to do rmdir() cgroup. A tasks will sleep when
+ * cgroup->count == 0 && list_empty(&cgroup->children) && subsys has some
+ * reference to css->refcnt. In general, this refcnt is expected to goes down
+ * to zero, soon.
+ *
+ * CGRP_WAIT_ON_RMDIR flag is modified under cgroup's inode->i_mutex;
+ */
+DECLARE_WAIT_QUEUE_HEAD(cgroup_rmdir_waitq);
+
+static void cgroup_wakeup_rmdir_waiters(const struct cgroup *cgrp)
+{
+        if (unlikely(test_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags)))
+                wake_up_all(&cgroup_rmdir_waitq);
+}
+
 static int rebind_subsystems(struct cgroupfs_root *root,
                              unsigned long final_bits)
 {
@@ -857,16 +915,16 @@ static int cgroup_remount(struct super_block *sb, int *flags, char *data)
         }
 
         ret = rebind_subsystems(root, opts.subsys_bits);
+        if (ret)
+                goto out_unlock;
 
         /* (re)populate subsystem files */
-        if (!ret)
-                cgroup_populate_dir(cgrp);
+        cgroup_populate_dir(cgrp);
 
         if (opts.release_agent)
                 strcpy(root->release_agent_path, opts.release_agent);
  out_unlock:
-        if (opts.release_agent)
-                kfree(opts.release_agent);
+        kfree(opts.release_agent);
         mutex_unlock(&cgroup_mutex);
         mutex_unlock(&cgrp->dentry->d_inode->i_mutex);
         return ret;
@@ -969,15 +1027,13 @@ static int cgroup_get_sb(struct file_system_type *fs_type,
         /* First find the desired set of subsystems */
         ret = parse_cgroupfs_options(data, &opts);
         if (ret) {
-                if (opts.release_agent)
-                        kfree(opts.release_agent);
+                kfree(opts.release_agent);
                 return ret;
         }
 
         root = kzalloc(sizeof(*root), GFP_KERNEL);
         if (!root) {
-                if (opts.release_agent)
-                        kfree(opts.release_agent);
+                kfree(opts.release_agent);
                 return -ENOMEM;
         }
 
@@ -1071,7 +1127,8 @@ static int cgroup_get_sb(struct file_system_type *fs_type,
                 mutex_unlock(&cgroup_mutex);
         }
 
-        return simple_set_mnt(mnt, sb);
+        simple_set_mnt(mnt, sb);
+        return 0;
 
  free_cg_links:
         free_cg_links(&tmp_cg_links);
@@ -1279,6 +1336,12 @@ int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk)
         set_bit(CGRP_RELEASABLE, &oldcgrp->flags);
         synchronize_rcu();
         put_css_set(cg);
+
+        /*
+         * wake up rmdir() waiter. the rmdir should fail since the cgroup
+         * is no longer empty.
+         */
+        cgroup_wakeup_rmdir_waiters(cgrp);
         return 0;
 }
 
@@ -1624,10 +1687,10 @@ static struct inode_operations cgroup_dir_inode_operations = {
         .rename = cgroup_rename,
 };
 
-static int cgroup_create_file(struct dentry *dentry, int mode,
+static int cgroup_create_file(struct dentry *dentry, mode_t mode,
                               struct super_block *sb)
 {
-        static struct dentry_operations cgroup_dops = {
+        static const struct dentry_operations cgroup_dops = {
                 .d_iput = cgroup_diput,
         };
 
@@ -1670,7 +1733,7 @@ static int cgroup_create_file(struct dentry *dentry, int mode,
  * @mode: mode to set on new directory.
  */
 static int cgroup_create_dir(struct cgroup *cgrp, struct dentry *dentry,
-                             int mode)
+                             mode_t mode)
 {
         struct dentry *parent;
         int error = 0;
@@ -1688,6 +1751,33 @@ static int cgroup_create_dir(struct cgroup *cgrp, struct dentry *dentry,
         return error;
 }
 
+/**
+ * cgroup_file_mode - deduce file mode of a control file
+ * @cft: the control file in question
+ *
+ * returns cft->mode if ->mode is not 0
+ * returns S_IRUGO|S_IWUSR if it has both a read and a write handler
+ * returns S_IRUGO if it has only a read handler
+ * returns S_IWUSR if it has only a write hander
+ */
+static mode_t cgroup_file_mode(const struct cftype *cft)
+{
+        mode_t mode = 0;
+
+        if (cft->mode)
+                return cft->mode;
+
+        if (cft->read || cft->read_u64 || cft->read_s64 ||
+            cft->read_map || cft->read_seq_string)
+                mode |= S_IRUGO;
+
+        if (cft->write || cft->write_u64 || cft->write_s64 ||
+            cft->write_string || cft->trigger)
+                mode |= S_IWUSR;
+
+        return mode;
+}
+
 int cgroup_add_file(struct cgroup *cgrp,
                     struct cgroup_subsys *subsys,
                     const struct cftype *cft)
@@ -1695,6 +1785,7 @@ int cgroup_add_file(struct cgroup *cgrp,
         struct dentry *dir = cgrp->dentry;
         struct dentry *dentry;
         int error;
+        mode_t mode;
 
         char name[MAX_CGROUP_TYPE_NAMELEN + MAX_CFTYPE_NAME + 2] = { 0 };
         if (subsys && !test_bit(ROOT_NOPREFIX, &cgrp->root->flags)) {
@@ -1705,7 +1796,8 @@ int cgroup_add_file(struct cgroup *cgrp,
         BUG_ON(!mutex_is_locked(&dir->d_inode->i_mutex));
         dentry = lookup_one_len(name, dir, strlen(name));
         if (!IS_ERR(dentry)) {
-                error = cgroup_create_file(dentry, 0644 | S_IFREG,
+                mode = cgroup_file_mode(cft);
+                error = cgroup_create_file(dentry, mode | S_IFREG,
                                            cgrp->root->sb);
                 if (!error)
                         dentry->d_fsdata = (void *)cft;
@@ -2287,6 +2379,7 @@ static struct cftype files[] = {
                 .write_u64 = cgroup_tasks_write,
                 .release = cgroup_tasks_release,
                 .private = FILE_TASKLIST,
+                .mode = S_IRUGO | S_IWUSR,
         },
 
         {
@@ -2326,6 +2419,17 @@ static int cgroup_populate_dir(struct cgroup *cgrp)
                 if (ss->populate && (err = ss->populate(ss, cgrp)) < 0)
                         return err;
         }
+        /* This cgroup is ready now */
+        for_each_subsys(cgrp->root, ss) {
+                struct cgroup_subsys_state *css = cgrp->subsys[ss->subsys_id];
+                /*
+                 * Update id->css pointer and make this css visible from
+                 * CSS ID functions. This pointer will be dereferened
+                 * from RCU-read-side without locks.
+                 */
+                if (css->id)
+                        rcu_assign_pointer(css->id->css, css);
+        }
 
         return 0;
 }
@@ -2337,6 +2441,7 @@ static void init_cgroup_css(struct cgroup_subsys_state *css,
         css->cgroup = cgrp;
         atomic_set(&css->refcnt, 1);
         css->flags = 0;
+        css->id = NULL;
         if (cgrp == dummytop)
                 set_bit(CSS_ROOT, &css->flags);
         BUG_ON(cgrp->subsys[ss->subsys_id]);
@@ -2375,7 +2480,7 @@ static void cgroup_unlock_hierarchy(struct cgroupfs_root *root)
  * Must be called with the mutex on the parent inode held
  */
 static long cgroup_create(struct cgroup *parent, struct dentry *dentry,
-                          int mode)
+                          mode_t mode)
 {
         struct cgroup *cgrp;
         struct cgroupfs_root *root = parent->root;
@@ -2412,6 +2517,10 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry,
                         goto err_destroy;
                 }
                 init_cgroup_css(css, ss, cgrp);
+                if (ss->use_id)
+                        if (alloc_css_id(ss, parent, cgrp))
+                                goto err_destroy;
+                /* At error, ->destroy() callback has to free assigned ID. */
         }
 
         cgroup_lock_hierarchy(root);
@@ -2554,9 +2663,11 @@ static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry)
         struct cgroup *cgrp = dentry->d_fsdata;
         struct dentry *d;
         struct cgroup *parent;
+        DEFINE_WAIT(wait);
+        int ret;
 
         /* the vfs holds both inode->i_mutex already */
-
+again:
         mutex_lock(&cgroup_mutex);
         if (atomic_read(&cgrp->count) != 0) {
                 mutex_unlock(&cgroup_mutex);
@@ -2572,17 +2683,39 @@ static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry)
          * Call pre_destroy handlers of subsys. Notify subsystems
          * that rmdir() request comes.
          */
-        cgroup_call_pre_destroy(cgrp);
+        ret = cgroup_call_pre_destroy(cgrp);
+        if (ret)
+                return ret;
 
         mutex_lock(&cgroup_mutex);
         parent = cgrp->parent;
-
-        if (atomic_read(&cgrp->count)
-            || !list_empty(&cgrp->children)
-            || !cgroup_clear_css_refs(cgrp)) {
+        if (atomic_read(&cgrp->count) || !list_empty(&cgrp->children)) {
                 mutex_unlock(&cgroup_mutex);
                 return -EBUSY;
         }
+        /*
+         * css_put/get is provided for subsys to grab refcnt to css. In typical
+         * case, subsystem has no reference after pre_destroy(). But, under
+         * hierarchy management, some *temporal* refcnt can be hold.
+         * To avoid returning -EBUSY to a user, waitqueue is used. If subsys
+         * is really busy, it should return -EBUSY at pre_destroy(). wake_up
+         * is called when css_put() is called and refcnt goes down to 0.
+         */
+        set_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags);
+        prepare_to_wait(&cgroup_rmdir_waitq, &wait, TASK_INTERRUPTIBLE);
+
+        if (!cgroup_clear_css_refs(cgrp)) {
+                mutex_unlock(&cgroup_mutex);
+                schedule();
+                finish_wait(&cgroup_rmdir_waitq, &wait);
+                clear_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags);
+                if (signal_pending(current))
+                        return -EINTR;
+                goto again;
+        }
+        /* NO css_tryget() can success after here. */
+        finish_wait(&cgroup_rmdir_waitq, &wait);
+        clear_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags);
 
         spin_lock(&release_list_lock);
         set_bit(CGRP_REMOVED, &cgrp->flags);
@@ -2707,6 +2840,8 @@ int __init cgroup_init(void)
                 struct cgroup_subsys *ss = subsys[i];
                 if (!ss->early_init)
                         cgroup_init_subsys(ss);
+                if (ss->use_id)
+                        cgroup_subsys_init_idr(ss);
         }
 
         /* Add init_css_set to the hash table */
@@ -3083,18 +3218,19 @@ int cgroup_clone(struct task_struct *tsk, struct cgroup_subsys *subsys,
 }
 
 /**
- * cgroup_is_descendant - see if @cgrp is a descendant of current task's cgrp
+ * cgroup_is_descendant - see if @cgrp is a descendant of @task's cgrp
  * @cgrp: the cgroup in question
+ * @task: the task in question
  *
- * See if @cgrp is a descendant of the current task's cgroup in
- * the appropriate hierarchy.
+ * See if @cgrp is a descendant of @task's cgroup in the appropriate
+ * hierarchy.
  *
  * If we are sending in dummytop, then presumably we are creating
  * the top cgroup in the subsystem.
  *
  * Called only by the ns (nsproxy) cgroup.
  */
-int cgroup_is_descendant(const struct cgroup *cgrp)
+int cgroup_is_descendant(const struct cgroup *cgrp, struct task_struct *task)
 {
         int ret;
         struct cgroup *target;
@@ -3104,7 +3240,7 @@ int cgroup_is_descendant(const struct cgroup *cgrp)
                 return 1;
 
         get_first_subsys(cgrp, NULL, &subsys_id);
-        target = task_cgroup(current, subsys_id);
+        target = task_cgroup(task, subsys_id);
         while (cgrp != target && cgrp!= cgrp->top_cgroup)
                 cgrp = cgrp->parent;
         ret = (cgrp == target);
@@ -3137,10 +3273,12 @@ void __css_put(struct cgroup_subsys_state *css)
 {
         struct cgroup *cgrp = css->cgroup;
         rcu_read_lock();
-        if ((atomic_dec_return(&css->refcnt) == 1) &&
-            notify_on_release(cgrp)) {
+        if (atomic_dec_return(&css->refcnt) == 1) {
+                if (notify_on_release(cgrp)) {
                 set_bit(CGRP_RELEASABLE, &cgrp->flags);
                 check_for_release(cgrp);
+                }
+                cgroup_wakeup_rmdir_waiters(cgrp);
         }
         rcu_read_unlock();
 }
@@ -3240,3 +3378,232 @@ static int __init cgroup_disable(char *str)
         return 1;
 }
 __setup("cgroup_disable=", cgroup_disable);
+
+/*
+ * Functons for CSS ID.
+ */
+
+/*
+ *To get ID other than 0, this should be called when !cgroup_is_removed().
+ */
+unsigned short css_id(struct cgroup_subsys_state *css)
+{
+        struct css_id *cssid = rcu_dereference(css->id);
+
+        if (cssid)
+                return cssid->id;
+        return 0;
+}
+
+unsigned short css_depth(struct cgroup_subsys_state *css)
+{
+        struct css_id *cssid = rcu_dereference(css->id);
+
+        if (cssid)
+                return cssid->depth;
+        return 0;
+}
+
+bool css_is_ancestor(struct cgroup_subsys_state *child,
+                     const struct cgroup_subsys_state *root)
+{
+        struct css_id *child_id = rcu_dereference(child->id);
+        struct css_id *root_id = rcu_dereference(root->id);
+
+        if (!child_id || !root_id || (child_id->depth < root_id->depth))
+                return false;
+        return child_id->stack[root_id->depth] == root_id->id;
+}
+
+static void __free_css_id_cb(struct rcu_head *head)
+{
+        struct css_id *id;
+
+        id = container_of(head, struct css_id, rcu_head);
+        kfree(id);
+}
+
+void free_css_id(struct cgroup_subsys *ss, struct cgroup_subsys_state *css)
+{
+        struct css_id *id = css->id;
+        /* When this is called before css_id initialization, id can be NULL */
+        if (!id)
+                return;
+
+        BUG_ON(!ss->use_id);
+
+        rcu_assign_pointer(id->css, NULL);
+        rcu_assign_pointer(css->id, NULL);
+        spin_lock(&ss->id_lock);
+        idr_remove(&ss->idr, id->id);
+        spin_unlock(&ss->id_lock);
+        call_rcu(&id->rcu_head, __free_css_id_cb);
+}
+
+/*
+ * This is called by init or create(). Then, calls to this function are
+ * always serialized (By cgroup_mutex() at create()).
+ */
+
+static struct css_id *get_new_cssid(struct cgroup_subsys *ss, int depth)
+{
+        struct css_id *newid;
+        int myid, error, size;
+
+        BUG_ON(!ss->use_id);
+
+        size = sizeof(*newid) + sizeof(unsigned short) * (depth + 1);
+        newid = kzalloc(size, GFP_KERNEL);
+        if (!newid)
+                return ERR_PTR(-ENOMEM);
+        /* get id */
+        if (unlikely(!idr_pre_get(&ss->idr, GFP_KERNEL))) {
+                error = -ENOMEM;
+                goto err_out;
+        }
+        spin_lock(&ss->id_lock);
+        /* Don't use 0. allocates an ID of 1-65535 */
+        error = idr_get_new_above(&ss->idr, newid, 1, &myid);
+        spin_unlock(&ss->id_lock);
+
+        /* Returns error when there are no free spaces for new ID.*/
+        if (error) {
+                error = -ENOSPC;
+                goto err_out;
+        }
+        if (myid > CSS_ID_MAX)
+                goto remove_idr;
+
+        newid->id = myid;
+        newid->depth = depth;
+        return newid;
+remove_idr:
+        error = -ENOSPC;
+        spin_lock(&ss->id_lock);
+        idr_remove(&ss->idr, myid);
+        spin_unlock(&ss->id_lock);
+err_out:
+        kfree(newid);
+        return ERR_PTR(error);
+
+}
+
+static int __init cgroup_subsys_init_idr(struct cgroup_subsys *ss)
+{
+        struct css_id *newid;
+        struct cgroup_subsys_state *rootcss;
+
+        spin_lock_init(&ss->id_lock);
+        idr_init(&ss->idr);
+
+        rootcss = init_css_set.subsys[ss->subsys_id];
+        newid = get_new_cssid(ss, 0);
+        if (IS_ERR(newid))
+                return PTR_ERR(newid);
+
+        newid->stack[0] = newid->id;
+        newid->css = rootcss;
+        rootcss->id = newid;
+        return 0;
+}
+
+static int alloc_css_id(struct cgroup_subsys *ss, struct cgroup *parent,
+                        struct cgroup *child)
+{
+        int subsys_id, i, depth = 0;
+        struct cgroup_subsys_state *parent_css, *child_css;
+        struct css_id *child_id, *parent_id = NULL;
+
+        subsys_id = ss->subsys_id;
+        parent_css = parent->subsys[subsys_id];
+        child_css = child->subsys[subsys_id];
+        depth = css_depth(parent_css) + 1;
+        parent_id = parent_css->id;
+
+        child_id = get_new_cssid(ss, depth);
+        if (IS_ERR(child_id))
+                return PTR_ERR(child_id);
+
+        for (i = 0; i < depth; i++)
+                child_id->stack[i] = parent_id->stack[i];
+        child_id->stack[depth] = child_id->id;
+        /*
+         * child_id->css pointer will be set after this cgroup is available
+         * see cgroup_populate_dir()
+         */
+        rcu_assign_pointer(child_css->id, child_id);
+
+        return 0;
+}
+
+/**
+ * css_lookup - lookup css by id
+ * @ss: cgroup subsys to be looked into.
+ * @id: the id
+ *
+ * Returns pointer to cgroup_subsys_state if there is valid one with id.
+ * NULL if not. Should be called under rcu_read_lock()
+ */
+struct cgroup_subsys_state *css_lookup(struct cgroup_subsys *ss, int id)
+{
+        struct css_id *cssid = NULL;
+
+        BUG_ON(!ss->use_id);
+        cssid = idr_find(&ss->idr, id);
+
+        if (unlikely(!cssid))
+                return NULL;
+
+        return rcu_dereference(cssid->css);
+}
+
+/**
+ * css_get_next - lookup next cgroup under specified hierarchy.
+ * @ss: pointer to subsystem
+ * @id: current position of iteration.
+ * @root: pointer to css. search tree under this.
+ * @foundid: position of found object.
+ *
+ * Search next css under the specified hierarchy of rootid. Calling under
+ * rcu_read_lock() is necessary. Returns NULL if it reaches the end.
+ */
+struct cgroup_subsys_state *
+css_get_next(struct cgroup_subsys *ss, int id,
+             struct cgroup_subsys_state *root, int *foundid)
+{
+        struct cgroup_subsys_state *ret = NULL;
+        struct css_id *tmp;
+        int tmpid;
+        int rootid = css_id(root);
+        int depth = css_depth(root);
+
+        if (!rootid)
+                return NULL;
+
+        BUG_ON(!ss->use_id);
+        /* fill start point for scan */
+        tmpid = id;
+        while (1) {
+                /*
+                 * scan next entry from bitmap(tree), tmpid is updated after
+                 * idr_get_next().
+                 */
+                spin_lock(&ss->id_lock);
+                tmp = idr_get_next(&ss->idr, &tmpid);
+                spin_unlock(&ss->id_lock);
+
+                if (!tmp)
+                        break;
+                if (tmp->depth >= depth && tmp->stack[depth] == rootid) {
+                        ret = rcu_dereference(tmp->css);
+                        if (ret) {
+                                *foundid = tmpid;
+                                break;
+                        }
+                }
+                /* continue to scan from next id */
+                tmpid = tmpid + 1;
+        }
+        return ret;
+}
+
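The new CSS ID machinery records, in each ID's stack[], the chain of ancestor IDs from the root down, so css_is_ancestor() becomes a constant-time array probe instead of a parent-pointer walk. A standalone sketch of that test (a fixed-size stack[] here, where the kernel sizes it by depth at allocation):

    #include <stdio.h>
    #include <stdbool.h>

    struct css_id_sketch {
            unsigned short id;
            unsigned short depth;
            unsigned short stack[8];        /* stack[0..depth] = ancestor IDs */
    };

    static bool is_ancestor(const struct css_id_sketch *child,
                            const struct css_id_sketch *root)
    {
            if (child->depth < root->depth)
                    return false;
            return child->stack[root->depth] == root->id;
    }

    int main(void)
    {
            struct css_id_sketch root  = { .id = 1, .depth = 0, .stack = {1} };
            struct css_id_sketch child = { .id = 5, .depth = 2,
                                           .stack = {1, 3, 5} };
            printf("root is ancestor of child: %d\n",
                   is_ancestor(&child, &root));     /* prints 1 */
            return 0;
    }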
diff --git a/kernel/cgroup_debug.c b/kernel/cgroup_debug.c
index daca6209202d..0c92d797baa6 100644
--- a/kernel/cgroup_debug.c
+++ b/kernel/cgroup_debug.c
@@ -40,9 +40,7 @@ static u64 taskcount_read(struct cgroup *cont, struct cftype *cft)
 {
         u64 count;
 
-        cgroup_lock();
         count = cgroup_task_count(cont);
-        cgroup_unlock();
         return count;
 }
 
diff --git a/kernel/cpu.c b/kernel/cpu.c
index 79e40f00dcb8..395b6974dc8d 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -281,7 +281,7 @@ int __ref cpu_down(unsigned int cpu)
                 goto out;
         }
 
-        cpu_clear(cpu, cpu_active_map);
+        set_cpu_active(cpu, false);
 
         /*
          * Make sure the all cpus did the reschedule and are not
@@ -296,7 +296,7 @@ int __ref cpu_down(unsigned int cpu)
         err = _cpu_down(cpu, 0);
 
         if (cpu_online(cpu))
-                cpu_set(cpu, cpu_active_map);
+                set_cpu_active(cpu, true);
 
 out:
         cpu_maps_update_done();
@@ -333,7 +333,7 @@ static int __cpuinit _cpu_up(unsigned int cpu, int tasks_frozen)
                 goto out_notify;
         BUG_ON(!cpu_online(cpu));
 
-        cpu_set(cpu, cpu_active_map);
+        set_cpu_active(cpu, true);
 
         /* Now call notifier in preparation. */
         raw_notifier_call_chain(&cpu_chain, CPU_ONLINE | mod, hcpu);
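The kernel/cpu.c hunks replace open-coded cpu_set()/cpu_clear() on cpu_active_map with the set_cpu_active() accessor, so every writer of the mask goes through one helper. A standalone sketch of the accessor pattern (a plain unsigned long stands in for the real cpumask type):

    #include <stdio.h>
    #include <stdbool.h>

    static unsigned long cpu_active_bits;  /* stands in for cpu_active_map */

    static void set_cpu_active(unsigned int cpu, bool active)
    {
            if (active)
                    cpu_active_bits |= 1UL << cpu;
            else
                    cpu_active_bits &= ~(1UL << cpu);
    }

    int main(void)
    {
            set_cpu_active(2, true);        /* _cpu_up() path */
            set_cpu_active(2, false);       /* cpu_down() path */
            printf("active mask: %#lx\n", cpu_active_bits); /* 0x0 */
            return 0;
    }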
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index f76db9dcaa05..026faccca869 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -128,10 +128,6 @@ static inline struct cpuset *task_cs(struct task_struct *task)
         return container_of(task_subsys_state(task, cpuset_subsys_id),
                             struct cpuset, css);
 }
-struct cpuset_hotplug_scanner {
-        struct cgroup_scanner scan;
-        struct cgroup *to;
-};
 
 /* bits in struct cpuset flags field */
 typedef enum {
@@ -521,6 +517,7 @@ static int validate_change(const struct cpuset *cur, const struct cpuset *trial)
         return 0;
 }
 
+#ifdef CONFIG_SMP
 /*
  * Helper routine for generate_sched_domains().
  * Do cpusets a, b have overlapping cpus_allowed masks?
@@ -815,6 +812,18 @@ static void do_rebuild_sched_domains(struct work_struct *unused)
 
         put_online_cpus();
 }
+#else /* !CONFIG_SMP */
+static void do_rebuild_sched_domains(struct work_struct *unused)
+{
+}
+
+static int generate_sched_domains(struct cpumask **domains,
+                                  struct sched_domain_attr **attributes)
+{
+        *domains = NULL;
+        return 1;
+}
+#endif /* CONFIG_SMP */
 
 static DECLARE_WORK(rebuild_sched_domains_work, do_rebuild_sched_domains);
 
@@ -1026,101 +1035,70 @@ static void cpuset_migrate_mm(struct mm_struct *mm, const nodemask_t *from,
         mutex_unlock(&callback_mutex);
 }
 
+/*
+ * Rebind task's vmas to cpuset's new mems_allowed, and migrate pages to new
+ * nodes if memory_migrate flag is set. Called with cgroup_mutex held.
+ */
+static void cpuset_change_nodemask(struct task_struct *p,
+                                   struct cgroup_scanner *scan)
+{
+        struct mm_struct *mm;
+        struct cpuset *cs;
+        int migrate;
+        const nodemask_t *oldmem = scan->data;
+
+        mm = get_task_mm(p);
+        if (!mm)
+                return;
+
+        cs = cgroup_cs(scan->cg);
+        migrate = is_memory_migrate(cs);
+
+        mpol_rebind_mm(mm, &cs->mems_allowed);
+        if (migrate)
+                cpuset_migrate_mm(mm, oldmem, &cs->mems_allowed);
+        mmput(mm);
+}
+
 static void *cpuset_being_rebound;
 
 /**
  * update_tasks_nodemask - Update the nodemasks of tasks in the cpuset.
  * @cs: the cpuset in which each task's mems_allowed mask needs to be changed
  * @oldmem: old mems_allowed of cpuset cs
+ * @heap: if NULL, defer allocating heap memory to cgroup_scan_tasks()
  *
  * Called with cgroup_mutex held
- * Return 0 if successful, -errno if not.
+ * No return value. It's guaranteed that cgroup_scan_tasks() always returns 0
+ * if @heap != NULL.
  */
-static int update_tasks_nodemask(struct cpuset *cs, const nodemask_t *oldmem)
+static void update_tasks_nodemask(struct cpuset *cs, const nodemask_t *oldmem,
+                                  struct ptr_heap *heap)
 {
-        struct task_struct *p;
-        struct mm_struct **mmarray;
-        int i, n, ntasks;
-        int migrate;
-        int fudge;
-        struct cgroup_iter it;
-        int retval;
+        struct cgroup_scanner scan;
 
         cpuset_being_rebound = cs;      /* causes mpol_dup() rebind */
 
-        fudge = 10;                     /* spare mmarray[] slots */
-        fudge += cpumask_weight(cs->cpus_allowed);/* imagine 1 fork-bomb/cpu */
-        retval = -ENOMEM;
-
-        /*
-         * Allocate mmarray[] to hold mm reference for each task
-         * in cpuset cs.  Can't kmalloc GFP_KERNEL while holding
-         * tasklist_lock.  We could use GFP_ATOMIC, but with a
-         * few more lines of code, we can retry until we get a big
-         * enough mmarray[] w/o using GFP_ATOMIC.
-         */
-        while (1) {
-                ntasks = cgroup_task_count(cs->css.cgroup);  /* guess */
-                ntasks += fudge;
-                mmarray = kmalloc(ntasks * sizeof(*mmarray), GFP_KERNEL);
-                if (!mmarray)
-                        goto done;
-                read_lock(&tasklist_lock);              /* block fork */
-                if (cgroup_task_count(cs->css.cgroup) <= ntasks)
-                        break;                          /* got enough */
-                read_unlock(&tasklist_lock);            /* try again */
-                kfree(mmarray);
-        }
-
-        n = 0;
-
-        /* Load up mmarray[] with mm reference for each task in cpuset. */
-        cgroup_iter_start(cs->css.cgroup, &it);
-        while ((p = cgroup_iter_next(cs->css.cgroup, &it))) {
-                struct mm_struct *mm;
-
-                if (n >= ntasks) {
-                        printk(KERN_WARNING
-                                "Cpuset mempolicy rebind incomplete.\n");
-                        break;
-                }
-                mm = get_task_mm(p);
-                if (!mm)
-                        continue;
-                mmarray[n++] = mm;
-        }
-        cgroup_iter_end(cs->css.cgroup, &it);
-        read_unlock(&tasklist_lock);
+        scan.cg = cs->css.cgroup;
+        scan.test_task = NULL;
+        scan.process_task = cpuset_change_nodemask;
+        scan.heap = heap;
+        scan.data = (nodemask_t *)oldmem;
 
         /*
-         * Now that we've dropped the tasklist spinlock, we can
-         * rebind the vma mempolicies of each mm in mmarray[] to their
-         * new cpuset, and release that mm.  The mpol_rebind_mm()
-         * call takes mmap_sem, which we couldn't take while holding
-         * tasklist_lock.  Forks can happen again now - the mpol_dup()
-         * cpuset_being_rebound check will catch such forks, and rebind
-         * their vma mempolicies too.  Because we still hold the global
-         * cgroup_mutex, we know that no other rebind effort will
-         * be contending for the global variable cpuset_being_rebound.
+         * The mpol_rebind_mm() call takes mmap_sem, which we couldn't
+         * take while holding tasklist_lock.  Forks can happen - the
+         * mpol_dup() cpuset_being_rebound check will catch such forks,
+         * and rebind their vma mempolicies too.  Because we still hold
+         * the global cgroup_mutex, we know that no other rebind effort
+         * will be contending for the global variable cpuset_being_rebound.
          * It's ok if we rebind the same mm twice; mpol_rebind_mm()
          * is idempotent.  Also migrate pages in each mm to new nodes.
          */
-        migrate = is_memory_migrate(cs);
-        for (i = 0; i < n; i++) {
-                struct mm_struct *mm = mmarray[i];
-
-                mpol_rebind_mm(mm, &cs->mems_allowed);
-                if (migrate)
-                        cpuset_migrate_mm(mm, oldmem, &cs->mems_allowed);
-                mmput(mm);
-        }
+        cgroup_scan_tasks(&scan);
 
         /* We're done rebinding vmas to this cpuset's new mems_allowed. */
-        kfree(mmarray);
         cpuset_being_rebound = NULL;
-        retval = 0;
-done:
-        return retval;
 }
 
 /*
@@ -1141,6 +1119,7 @@ static int update_nodemask(struct cpuset *cs, struct cpuset *trialcs,
 {
         nodemask_t oldmem;
         int retval;
+        struct ptr_heap heap;
 
         /*
          * top_cpuset.mems_allowed tracks node_stats[N_HIGH_MEMORY];
@@ -1175,12 +1154,18 @@ static int update_nodemask(struct cpuset *cs, struct cpuset *trialcs,
         if (retval < 0)
                 goto done;
 
+        retval = heap_init(&heap, PAGE_SIZE, GFP_KERNEL, NULL);
+        if (retval < 0)
+                goto done;
+
         mutex_lock(&callback_mutex);
         cs->mems_allowed = trialcs->mems_allowed;
         cs->mems_generation = cpuset_mems_generation++;
         mutex_unlock(&callback_mutex);
 
-        retval = update_tasks_nodemask(cs, &oldmem);
+        update_tasks_nodemask(cs, &oldmem, &heap);
+
+        heap_free(&heap);
 done:
         return retval;
 }
@@ -1192,8 +1177,10 @@ int current_cpuset_is_being_rebound(void)
 
 static int update_relax_domain_level(struct cpuset *cs, s64 val)
 {
+#ifdef CONFIG_SMP
         if (val < -1 || val >= SD_LV_MAX)
                 return -EINVAL;
+#endif
 
         if (val != cs->relax_domain_level) {
                 cs->relax_domain_level = val;
@@ -1355,19 +1342,22 @@ static int cpuset_can_attach(struct cgroup_subsys *ss,
                              struct cgroup *cont, struct task_struct *tsk)
 {
         struct cpuset *cs = cgroup_cs(cont);
-        int ret = 0;
 
         if (cpumask_empty(cs->cpus_allowed) || nodes_empty(cs->mems_allowed))
                 return -ENOSPC;
 
-        if (tsk->flags & PF_THREAD_BOUND) {
-                mutex_lock(&callback_mutex);
-                if (!cpumask_equal(&tsk->cpus_allowed, cs->cpus_allowed))
-                        ret = -EINVAL;
-                mutex_unlock(&callback_mutex);
-        }
+        /*
+         * Kthreads bound to specific cpus cannot be moved to a new cpuset; we
+         * cannot change their cpu affinity and isolating such threads by their
+         * set of allowed nodes is unnecessary.  Thus, cpusets are not
+         * applicable for such threads.  This prevents checking for success of
+         * set_cpus_allowed_ptr() on all attached tasks before cpus_allowed may
+         * be changed.
+         */
+        if (tsk->flags & PF_THREAD_BOUND)
+                return -EINVAL;
 
-        return ret < 0 ? ret : security_task_setscheduler(tsk, 0, NULL);
+        return security_task_setscheduler(tsk, 0, NULL);
 }
 
 static void cpuset_attach(struct cgroup_subsys *ss,
@@ -1706,6 +1696,7 @@ static struct cftype files[] = {
                 .read_u64 = cpuset_read_u64,
                 .write_u64 = cpuset_write_u64,
                 .private = FILE_MEMORY_PRESSURE,
+                .mode = S_IRUGO,
         },
 
         {
@@ -1913,10 +1904,9 @@ int __init cpuset_init(void)
 static void cpuset_do_move_task(struct task_struct *tsk,
                                 struct cgroup_scanner *scan)
 {
-        struct cpuset_hotplug_scanner *chsp;
+        struct cgroup *new_cgroup = scan->data;
 
-        chsp = container_of(scan, struct cpuset_hotplug_scanner, scan);
-        cgroup_attach_task(chsp->to, tsk);
+        cgroup_attach_task(new_cgroup, tsk);
 }
 
 /**
@@ -1932,15 +1922,15 @@ static void cpuset_do_move_task(struct task_struct *tsk,
  */
 static void move_member_tasks_to_cpuset(struct cpuset *from, struct cpuset *to)
 {
-        struct cpuset_hotplug_scanner scan;
+        struct cgroup_scanner scan;
 
-        scan.scan.cg = from->css.cgroup;
-        scan.scan.test_task = NULL; /* select all tasks in cgroup */
-        scan.scan.process_task = cpuset_do_move_task;
-        scan.scan.heap = NULL;
-        scan.to = to->css.cgroup;
+        scan.cg = from->css.cgroup;
+        scan.test_task = NULL; /* select all tasks in cgroup */
+        scan.process_task = cpuset_do_move_task;
+        scan.heap = NULL;
+        scan.data = to->css.cgroup;
 
-        if (cgroup_scan_tasks(&scan.scan))
+        if (cgroup_scan_tasks(&scan))
                 printk(KERN_ERR "move_member_tasks_to_cpuset: "
                                 "cgroup_scan_tasks failed\n");
 }
@@ -2033,7 +2023,7 @@ static void scan_for_empty_cpusets(struct cpuset *root)
                         remove_tasks_in_empty_cpuset(cp);
                 else {
                         update_tasks_cpumask(cp, NULL);
-                        update_tasks_nodemask(cp, &oldmems);
+                        update_tasks_nodemask(cp, &oldmems, NULL);
                 }
         }
 }
@@ -2069,7 +2059,9 @@ static int cpuset_track_online_cpus(struct notifier_block *unused_nb,
         }
 
         cgroup_lock();
+        mutex_lock(&callback_mutex);
         cpumask_copy(top_cpuset.cpus_allowed, cpu_online_mask);
+        mutex_unlock(&callback_mutex);
         scan_for_empty_cpusets(&top_cpuset);
         ndoms = generate_sched_domains(&doms, &attr);
         cgroup_unlock();
@@ -2092,11 +2084,12 @@ static int cpuset_track_online_nodes(struct notifier_block *self,
         cgroup_lock();
         switch (action) {
         case MEM_ONLINE:
-                top_cpuset.mems_allowed = node_states[N_HIGH_MEMORY];
-                break;
         case MEM_OFFLINE:
+                mutex_lock(&callback_mutex);
                 top_cpuset.mems_allowed = node_states[N_HIGH_MEMORY];
-                scan_for_empty_cpusets(&top_cpuset);
+                mutex_unlock(&callback_mutex);
+                if (action == MEM_OFFLINE)
+                        scan_for_empty_cpusets(&top_cpuset);
                 break;
         default:
                 break;
@@ -2206,26 +2199,24 @@ static const struct cpuset *nearest_hardwall_ancestor(const struct cpuset *cs)
 }
 
 /**
- * cpuset_zone_allowed_softwall - Can we allocate on zone z's memory node?
- * @z: is this zone on an allowed node?
+ * cpuset_node_allowed_softwall - Can we allocate on a memory node?
+ * @node: is this an allowed node?
  * @gfp_mask: memory allocation flags
  *
- * If we're in interrupt, yes, we can always allocate.  If
- * __GFP_THISNODE is set, yes, we can always allocate.  If zone
- * z's node is in our tasks mems_allowed, yes.  If it's not a
- * __GFP_HARDWALL request and this zone's nodes is in the nearest
- * hardwalled cpuset ancestor to this tasks cpuset, yes.
- * If the task has been OOM killed and has access to memory reserves
- * as specified by the TIF_MEMDIE flag, yes.
+ * If we're in interrupt, yes, we can always allocate.  If __GFP_THISNODE is
+ * set, yes, we can always allocate.  If node is in our task's mems_allowed,
+ * yes.  If it's not a __GFP_HARDWALL request and this node is in the nearest
+ * hardwalled cpuset ancestor to this task's cpuset, yes.  If the task has been
+ * OOM killed and has access to memory reserves as specified by the TIF_MEMDIE
+ * flag, yes.
  * Otherwise, no.
  *
- * If __GFP_HARDWALL is set, cpuset_zone_allowed_softwall()
- * reduces to cpuset_zone_allowed_hardwall().  Otherwise,
- * cpuset_zone_allowed_softwall() might sleep, and might allow a zone
- * from an enclosing cpuset.
+ * If __GFP_HARDWALL is set, cpuset_node_allowed_softwall() reduces to
+ * cpuset_node_allowed_hardwall().  Otherwise, cpuset_node_allowed_softwall()
+ * might sleep, and might allow a node from an enclosing cpuset.
  *
- * cpuset_zone_allowed_hardwall() only handles the simpler case of
- * hardwall cpusets, and never sleeps.
+ * cpuset_node_allowed_hardwall() only handles the simpler case of hardwall
+ * cpusets, and never sleeps.
  *
  * The __GFP_THISNODE placement logic is really handled elsewhere,
  * by forcibly using a zonelist starting at a specified node, and by
@@ -2264,20 +2255,17 @@ static const struct cpuset *nearest_hardwall_ancestor(const struct cpuset *cs)
2264 * GFP_USER - only nodes in current tasks mems allowed ok. 2255 * GFP_USER - only nodes in current tasks mems allowed ok.
2265 * 2256 *
2266 * Rule: 2257 * Rule:
2267 * Don't call cpuset_zone_allowed_softwall if you can't sleep, unless you 2258 * Don't call cpuset_node_allowed_softwall if you can't sleep, unless you
2268 * pass in the __GFP_HARDWALL flag set in gfp_flag, which disables 2259 * pass in the __GFP_HARDWALL flag set in gfp_flag, which disables
2269 * the code that might scan up ancestor cpusets and sleep. 2260 * the code that might scan up ancestor cpusets and sleep.
2270 */ 2261 */
2271 2262int __cpuset_node_allowed_softwall(int node, gfp_t gfp_mask)
2272int __cpuset_zone_allowed_softwall(struct zone *z, gfp_t gfp_mask)
2273{ 2263{
2274 int node; /* node that zone z is on */
2275 const struct cpuset *cs; /* current cpuset ancestors */ 2264 const struct cpuset *cs; /* current cpuset ancestors */
2276 int allowed; /* is allocation in zone z allowed? */ 2265 int allowed; /* is allocation in zone z allowed? */
2277 2266
2278 if (in_interrupt() || (gfp_mask & __GFP_THISNODE)) 2267 if (in_interrupt() || (gfp_mask & __GFP_THISNODE))
2279 return 1; 2268 return 1;
2280 node = zone_to_nid(z);
2281 might_sleep_if(!(gfp_mask & __GFP_HARDWALL)); 2269 might_sleep_if(!(gfp_mask & __GFP_HARDWALL));
2282 if (node_isset(node, current->mems_allowed)) 2270 if (node_isset(node, current->mems_allowed))
2283 return 1; 2271 return 1;
@@ -2306,15 +2294,15 @@ int __cpuset_zone_allowed_softwall(struct zone *z, gfp_t gfp_mask)
2306} 2294}
2307 2295
2308/* 2296/*
2309 * cpuset_zone_allowed_hardwall - Can we allocate on zone z's memory node? 2297 * cpuset_node_allowed_hardwall - Can we allocate on a memory node?
2310 * @z: is this zone on an allowed node? 2298 * @node: is this an allowed node?
2311 * @gfp_mask: memory allocation flags 2299 * @gfp_mask: memory allocation flags
2312 * 2300 *
2313 * If we're in interrupt, yes, we can always allocate. 2301 * If we're in interrupt, yes, we can always allocate. If __GFP_THISNODE is
2314 * If __GFP_THISNODE is set, yes, we can always allocate. If zone 2302 * set, yes, we can always allocate. If node is in our task's mems_allowed,
2315 * z's node is in our tasks mems_allowed, yes. If the task has been 2303 * yes. If the task has been OOM killed and has access to memory reserves as
2316 * OOM killed and has access to memory reserves as specified by the 2304 * specified by the TIF_MEMDIE flag, yes.
2317 * TIF_MEMDIE flag, yes. Otherwise, no. 2305 * Otherwise, no.
2318 * 2306 *
2319 * The __GFP_THISNODE placement logic is really handled elsewhere, 2307 * The __GFP_THISNODE placement logic is really handled elsewhere,
2320 * by forcibly using a zonelist starting at a specified node, and by 2308 * by forcibly using a zonelist starting at a specified node, and by
@@ -2322,20 +2310,16 @@ int __cpuset_zone_allowed_softwall(struct zone *z, gfp_t gfp_mask)
2322 * any node on the zonelist except the first. By the time any such 2310 * any node on the zonelist except the first. By the time any such
2323 * calls get to this routine, we should just shut up and say 'yes'. 2311 * calls get to this routine, we should just shut up and say 'yes'.
2324 * 2312 *
2325 * Unlike the cpuset_zone_allowed_softwall() variant, above, 2313 * Unlike the cpuset_node_allowed_softwall() variant, above,
2326 * this variant requires that the zone be in the current tasks 2314 * this variant requires that the node be in the current task's
2327 * mems_allowed or that we're in interrupt. It does not scan up the 2315 * mems_allowed or that we're in interrupt. It does not scan up the
2328 * cpuset hierarchy for the nearest enclosing mem_exclusive cpuset. 2316 * cpuset hierarchy for the nearest enclosing mem_exclusive cpuset.
2329 * It never sleeps. 2317 * It never sleeps.
2330 */ 2318 */
2331 2319int __cpuset_node_allowed_hardwall(int node, gfp_t gfp_mask)
2332int __cpuset_zone_allowed_hardwall(struct zone *z, gfp_t gfp_mask)
2333{ 2320{
2334 int node; /* node that zone z is on */
2335
2336 if (in_interrupt() || (gfp_mask & __GFP_THISNODE)) 2321 if (in_interrupt() || (gfp_mask & __GFP_THISNODE))
2337 return 1; 2322 return 1;
2338 node = zone_to_nid(z);
2339 if (node_isset(node, current->mems_allowed)) 2323 if (node_isset(node, current->mems_allowed))
2340 return 1; 2324 return 1;
2341 /* 2325 /*
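
With the zone-based variants gone, callers that still hold a struct zone resolve the node themselves before asking. A before/after sketch of the required caller update (the surrounding context is hypothetical):

	/* before this patch */
	if (!cpuset_zone_allowed_softwall(zone, gfp_mask))
		return NULL;

	/* after: the node id is passed explicitly */
	if (!cpuset_node_allowed_softwall(zone_to_nid(zone), gfp_mask))
		return NULL;
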
diff --git a/kernel/exec_domain.c b/kernel/exec_domain.c
index 667c841c2952..c35452cadded 100644
--- a/kernel/exec_domain.c
+++ b/kernel/exec_domain.c
@@ -18,6 +18,7 @@
18#include <linux/syscalls.h> 18#include <linux/syscalls.h>
19#include <linux/sysctl.h> 19#include <linux/sysctl.h>
20#include <linux/types.h> 20#include <linux/types.h>
21#include <linux/fs_struct.h>
21 22
22 23
23static void default_handler(int, struct pt_regs *); 24static void default_handler(int, struct pt_regs *);
@@ -145,28 +146,6 @@ __set_personality(u_long personality)
145 return 0; 146 return 0;
146 } 147 }
147 148
148 if (atomic_read(&current->fs->count) != 1) {
149 struct fs_struct *fsp, *ofsp;
150
151 fsp = copy_fs_struct(current->fs);
152 if (fsp == NULL) {
153 module_put(ep->module);
154 return -ENOMEM;
155 }
156
157 task_lock(current);
158 ofsp = current->fs;
159 current->fs = fsp;
160 task_unlock(current);
161
162 put_fs_struct(ofsp);
163 }
164
165 /*
166 * At that point we are guaranteed to be the sole owner of
167 * current->fs.
168 */
169
170 current->personality = personality; 149 current->personality = personality;
171 oep = current_thread_info()->exec_domain; 150 oep = current_thread_info()->exec_domain;
172 current_thread_info()->exec_domain = ep; 151 current_thread_info()->exec_domain = ep;
diff --git a/kernel/exit.c b/kernel/exit.c
index 167e1e3ad7c6..32cbf2607cb0 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -46,6 +46,7 @@
46#include <linux/blkdev.h> 46#include <linux/blkdev.h>
47#include <linux/task_io_accounting_ops.h> 47#include <linux/task_io_accounting_ops.h>
48#include <linux/tracehook.h> 48#include <linux/tracehook.h>
49#include <linux/fs_struct.h>
49#include <linux/init_task.h> 50#include <linux/init_task.h>
50#include <trace/sched.h> 51#include <trace/sched.h>
51 52
@@ -61,11 +62,6 @@ DEFINE_TRACE(sched_process_wait);
61 62
62static void exit_mm(struct task_struct * tsk); 63static void exit_mm(struct task_struct * tsk);
63 64
64static inline int task_detached(struct task_struct *p)
65{
66 return p->exit_signal == -1;
67}
68
69static void __unhash_process(struct task_struct *p) 65static void __unhash_process(struct task_struct *p)
70{ 66{
71 nr_threads--; 67 nr_threads--;
@@ -362,16 +358,12 @@ static void reparent_to_kthreadd(void)
362void __set_special_pids(struct pid *pid) 358void __set_special_pids(struct pid *pid)
363{ 359{
364 struct task_struct *curr = current->group_leader; 360 struct task_struct *curr = current->group_leader;
365 pid_t nr = pid_nr(pid);
366 361
367 if (task_session(curr) != pid) { 362 if (task_session(curr) != pid)
368 change_pid(curr, PIDTYPE_SID, pid); 363 change_pid(curr, PIDTYPE_SID, pid);
369 set_task_session(curr, nr); 364
370 } 365 if (task_pgrp(curr) != pid)
371 if (task_pgrp(curr) != pid) {
372 change_pid(curr, PIDTYPE_PGID, pid); 366 change_pid(curr, PIDTYPE_PGID, pid);
373 set_task_pgrp(curr, nr);
374 }
375} 367}
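
With the cached numeric ids removed, change_pid() alone keeps the pid links consistent, and the session/process-group numbers are derived from the struct pid on demand. Roughly (a sketch using the standard pid accessors):

	struct task_struct *curr = current->group_leader;
	pid_t session_nr = pid_nr(task_session(curr));
	pid_t pgrp_nr    = pid_nr(task_pgrp(curr));
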
376 368
377static void set_special_pids(struct pid *pid) 369static void set_special_pids(struct pid *pid)
@@ -429,7 +421,6 @@ EXPORT_SYMBOL(disallow_signal);
429void daemonize(const char *name, ...) 421void daemonize(const char *name, ...)
430{ 422{
431 va_list args; 423 va_list args;
432 struct fs_struct *fs;
433 sigset_t blocked; 424 sigset_t blocked;
434 425
435 va_start(args, name); 426 va_start(args, name);
@@ -462,11 +453,7 @@ void daemonize(const char *name, ...)
462 453
463 /* Become as one with the init task */ 454 /* Become as one with the init task */
464 455
465 exit_fs(current); /* current->fs->count--; */ 456 daemonize_fs_struct();
466 fs = init_task.fs;
467 current->fs = fs;
468 atomic_inc(&fs->count);
469
470 exit_files(current); 457 exit_files(current);
471 current->files = init_task.files; 458 current->files = init_task.files;
472 atomic_inc(&current->files->count); 459 atomic_inc(&current->files->count);
@@ -565,30 +552,6 @@ void exit_files(struct task_struct *tsk)
565 } 552 }
566} 553}
567 554
568void put_fs_struct(struct fs_struct *fs)
569{
570 /* No need to hold fs->lock if we are killing it */
571 if (atomic_dec_and_test(&fs->count)) {
572 path_put(&fs->root);
573 path_put(&fs->pwd);
574 kmem_cache_free(fs_cachep, fs);
575 }
576}
577
578void exit_fs(struct task_struct *tsk)
579{
580 struct fs_struct * fs = tsk->fs;
581
582 if (fs) {
583 task_lock(tsk);
584 tsk->fs = NULL;
585 task_unlock(tsk);
586 put_fs_struct(fs);
587 }
588}
589
590EXPORT_SYMBOL_GPL(exit_fs);
591
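
put_fs_struct() and exit_fs() move out of kernel/exit.c; in the fs/fs_struct.c counterpart introduced by this series the atomic refcount becomes a plain users counter guarded by fs->lock. An approximate sketch of the relocated exit path (an assumption about the new file, which is not shown in this diff):

void exit_fs(struct task_struct *tsk)
{
	struct fs_struct *fs = tsk->fs;

	if (fs) {
		int kill;

		task_lock(tsk);
		write_lock(&fs->lock);
		tsk->fs = NULL;
		kill = !--fs->users;	/* plain counter, not atomic_t */
		write_unlock(&fs->lock);
		task_unlock(tsk);
		if (kill)
			free_fs_struct(fs);
	}
}
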
592#ifdef CONFIG_MM_OWNER 555#ifdef CONFIG_MM_OWNER
593/* 556/*
594 * Task p is exiting and it owned mm, lets find a new owner for it 557 * Task p is exiting and it owned mm, lets find a new owner for it
@@ -732,119 +695,6 @@ static void exit_mm(struct task_struct * tsk)
732} 695}
733 696
734/* 697/*
735 * Return nonzero if @parent's children should reap themselves.
736 *
737 * Called with write_lock_irq(&tasklist_lock) held.
738 */
739static int ignoring_children(struct task_struct *parent)
740{
741 int ret;
742 struct sighand_struct *psig = parent->sighand;
743 unsigned long flags;
744 spin_lock_irqsave(&psig->siglock, flags);
745 ret = (psig->action[SIGCHLD-1].sa.sa_handler == SIG_IGN ||
746 (psig->action[SIGCHLD-1].sa.sa_flags & SA_NOCLDWAIT));
747 spin_unlock_irqrestore(&psig->siglock, flags);
748 return ret;
749}
750
751/*
752 * Detach all tasks we were using ptrace on.
753 * Any that need to be release_task'd are put on the @dead list.
754 *
755 * Called with write_lock(&tasklist_lock) held.
756 */
757static void ptrace_exit(struct task_struct *parent, struct list_head *dead)
758{
759 struct task_struct *p, *n;
760 int ign = -1;
761
762 list_for_each_entry_safe(p, n, &parent->ptraced, ptrace_entry) {
763 __ptrace_unlink(p);
764
765 if (p->exit_state != EXIT_ZOMBIE)
766 continue;
767
768 /*
769 * If it's a zombie, our attachedness prevented normal
770 * parent notification or self-reaping. Do notification
771 * now if it would have happened earlier. If it should
772 * reap itself, add it to the @dead list. We can't call
773 * release_task() here because we already hold tasklist_lock.
774 *
775 * If it's our own child, there is no notification to do.
776 * But if our normal children self-reap, then this child
777 * was prevented by ptrace and we must reap it now.
778 */
779 if (!task_detached(p) && thread_group_empty(p)) {
780 if (!same_thread_group(p->real_parent, parent))
781 do_notify_parent(p, p->exit_signal);
782 else {
783 if (ign < 0)
784 ign = ignoring_children(parent);
785 if (ign)
786 p->exit_signal = -1;
787 }
788 }
789
790 if (task_detached(p)) {
791 /*
792 * Mark it as in the process of being reaped.
793 */
794 p->exit_state = EXIT_DEAD;
795 list_add(&p->ptrace_entry, dead);
796 }
797 }
798}
799
800/*
801 * Finish up exit-time ptrace cleanup.
802 *
803 * Called without locks.
804 */
805static void ptrace_exit_finish(struct task_struct *parent,
806 struct list_head *dead)
807{
808 struct task_struct *p, *n;
809
810 BUG_ON(!list_empty(&parent->ptraced));
811
812 list_for_each_entry_safe(p, n, dead, ptrace_entry) {
813 list_del_init(&p->ptrace_entry);
814 release_task(p);
815 }
816}
817
818static void reparent_thread(struct task_struct *p, struct task_struct *father)
819{
820 if (p->pdeath_signal)
821 /* We already hold the tasklist_lock here. */
822 group_send_sig_info(p->pdeath_signal, SEND_SIG_NOINFO, p);
823
824 list_move_tail(&p->sibling, &p->real_parent->children);
825
826 /* If this is a threaded reparent there is no need to
827 * notify anyone anything has happened.
828 */
829 if (same_thread_group(p->real_parent, father))
830 return;
831
832 /* We don't want people slaying init. */
833 if (!task_detached(p))
834 p->exit_signal = SIGCHLD;
835
836 /* If we'd notified the old parent about this child's death,
837 * also notify the new parent.
838 */
839 if (!ptrace_reparented(p) &&
840 p->exit_state == EXIT_ZOMBIE &&
841 !task_detached(p) && thread_group_empty(p))
842 do_notify_parent(p, p->exit_signal);
843
844 kill_orphaned_pgrp(p, father);
845}
846
847/*
848 * When we die, we re-parent all our children. 698 * When we die, we re-parent all our children.
849 * Try to give them to another thread in our thread 699 * Try to give them to another thread in our thread
850 * group, and if no such member exists, give it to 700 * group, and if no such member exists, give it to
@@ -883,17 +733,51 @@ static struct task_struct *find_new_reaper(struct task_struct *father)
883 return pid_ns->child_reaper; 733 return pid_ns->child_reaper;
884} 734}
885 735
736/*
 737 * Any that need to be release_task'd are put on the @dead list.
738 */
739static void reparent_thread(struct task_struct *father, struct task_struct *p,
740 struct list_head *dead)
741{
742 if (p->pdeath_signal)
743 group_send_sig_info(p->pdeath_signal, SEND_SIG_NOINFO, p);
744
745 list_move_tail(&p->sibling, &p->real_parent->children);
746
747 if (task_detached(p))
748 return;
749 /*
750 * If this is a threaded reparent there is no need to
751 * notify anyone anything has happened.
752 */
753 if (same_thread_group(p->real_parent, father))
754 return;
755
756 /* We don't want people slaying init. */
757 p->exit_signal = SIGCHLD;
758
759 /* If it has exited notify the new parent about this child's death. */
760 if (!p->ptrace &&
761 p->exit_state == EXIT_ZOMBIE && thread_group_empty(p)) {
762 do_notify_parent(p, p->exit_signal);
763 if (task_detached(p)) {
764 p->exit_state = EXIT_DEAD;
765 list_move_tail(&p->sibling, dead);
766 }
767 }
768
769 kill_orphaned_pgrp(p, father);
770}
771
886static void forget_original_parent(struct task_struct *father) 772static void forget_original_parent(struct task_struct *father)
887{ 773{
888 struct task_struct *p, *n, *reaper; 774 struct task_struct *p, *n, *reaper;
889 LIST_HEAD(ptrace_dead); 775 LIST_HEAD(dead_children);
776
777 exit_ptrace(father);
890 778
891 write_lock_irq(&tasklist_lock); 779 write_lock_irq(&tasklist_lock);
892 reaper = find_new_reaper(father); 780 reaper = find_new_reaper(father);
893 /*
894 * First clean up ptrace if we were using it.
895 */
896 ptrace_exit(father, &ptrace_dead);
897 781
898 list_for_each_entry_safe(p, n, &father->children, sibling) { 782 list_for_each_entry_safe(p, n, &father->children, sibling) {
899 p->real_parent = reaper; 783 p->real_parent = reaper;
@@ -901,13 +785,16 @@ static void forget_original_parent(struct task_struct *father)
901 BUG_ON(p->ptrace); 785 BUG_ON(p->ptrace);
902 p->parent = p->real_parent; 786 p->parent = p->real_parent;
903 } 787 }
904 reparent_thread(p, father); 788 reparent_thread(father, p, &dead_children);
905 } 789 }
906
907 write_unlock_irq(&tasklist_lock); 790 write_unlock_irq(&tasklist_lock);
791
908 BUG_ON(!list_empty(&father->children)); 792 BUG_ON(!list_empty(&father->children));
909 793
910 ptrace_exit_finish(father, &ptrace_dead); 794 list_for_each_entry_safe(p, n, &dead_children, sibling) {
795 list_del_init(&p->sibling);
796 release_task(p);
797 }
911} 798}
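
The rewritten forget_original_parent() keeps the established two-phase shape: candidates are collected on a local list under tasklist_lock, and release_task() — which itself takes tasklist_lock — runs only after the lock is dropped. In isolation (a sketch):

	LIST_HEAD(dead);
	struct task_struct *p, *n;

	write_lock_irq(&tasklist_lock);
	/* mark p EXIT_DEAD, then: list_move_tail(&p->sibling, &dead); */
	write_unlock_irq(&tasklist_lock);

	list_for_each_entry_safe(p, n, &dead, sibling) {
		list_del_init(&p->sibling);
		release_task(p);
	}
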
912 799
913/* 800/*
@@ -950,8 +837,7 @@ static void exit_notify(struct task_struct *tsk, int group_dead)
950 */ 837 */
951 if (tsk->exit_signal != SIGCHLD && !task_detached(tsk) && 838 if (tsk->exit_signal != SIGCHLD && !task_detached(tsk) &&
952 (tsk->parent_exec_id != tsk->real_parent->self_exec_id || 839 (tsk->parent_exec_id != tsk->real_parent->self_exec_id ||
953 tsk->self_exec_id != tsk->parent_exec_id) && 840 tsk->self_exec_id != tsk->parent_exec_id))
954 !capable(CAP_KILL))
955 tsk->exit_signal = SIGCHLD; 841 tsk->exit_signal = SIGCHLD;
956 842
957 signal = tracehook_notify_death(tsk, &cookie, group_dead); 843 signal = tracehook_notify_death(tsk, &cookie, group_dead);
@@ -1417,6 +1303,18 @@ static int wait_task_zombie(struct task_struct *p, int options,
1417 return retval; 1303 return retval;
1418} 1304}
1419 1305
1306static int *task_stopped_code(struct task_struct *p, bool ptrace)
1307{
1308 if (ptrace) {
1309 if (task_is_stopped_or_traced(p))
1310 return &p->exit_code;
1311 } else {
1312 if (p->signal->flags & SIGNAL_STOP_STOPPED)
1313 return &p->signal->group_exit_code;
1314 }
1315 return NULL;
1316}
1317
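
task_stopped_code() gives wait_task_stopped() and wait_consider_task() a single predicate for "is there a stop code to report, and where does it live" — &p->exit_code for ptrace stops, &p->signal->group_exit_code for group stops. Its use, condensed from the hunks below (sketch):

	int *p_code = task_stopped_code(p, ptrace);

	if (!p_code)			/* nothing reportable */
		goto unlock_sig;
	exit_code = *p_code;
	if (!unlikely(options & WNOWAIT))
		*p_code = 0;		/* consume the stop code */
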
1420/* 1318/*
1421 * Handle sys_wait4 work for one task in state TASK_STOPPED. We hold 1319 * Handle sys_wait4 work for one task in state TASK_STOPPED. We hold
1422 * read_lock(&tasklist_lock) on entry. If we return zero, we still hold 1320 * read_lock(&tasklist_lock) on entry. If we return zero, we still hold
@@ -1427,7 +1325,7 @@ static int wait_task_stopped(int ptrace, struct task_struct *p,
1427 int options, struct siginfo __user *infop, 1325 int options, struct siginfo __user *infop,
1428 int __user *stat_addr, struct rusage __user *ru) 1326 int __user *stat_addr, struct rusage __user *ru)
1429{ 1327{
1430 int retval, exit_code, why; 1328 int retval, exit_code, *p_code, why;
1431 uid_t uid = 0; /* unneeded, required by compiler */ 1329 uid_t uid = 0; /* unneeded, required by compiler */
1432 pid_t pid; 1330 pid_t pid;
1433 1331
@@ -1437,22 +1335,16 @@ static int wait_task_stopped(int ptrace, struct task_struct *p,
1437 exit_code = 0; 1335 exit_code = 0;
1438 spin_lock_irq(&p->sighand->siglock); 1336 spin_lock_irq(&p->sighand->siglock);
1439 1337
1440 if (unlikely(!task_is_stopped_or_traced(p))) 1338 p_code = task_stopped_code(p, ptrace);
1441 goto unlock_sig; 1339 if (unlikely(!p_code))
1442
1443 if (!ptrace && p->signal->group_stop_count > 0)
1444 /*
1445 * A group stop is in progress and this is the group leader.
1446 * We won't report until all threads have stopped.
1447 */
1448 goto unlock_sig; 1340 goto unlock_sig;
1449 1341
1450 exit_code = p->exit_code; 1342 exit_code = *p_code;
1451 if (!exit_code) 1343 if (!exit_code)
1452 goto unlock_sig; 1344 goto unlock_sig;
1453 1345
1454 if (!unlikely(options & WNOWAIT)) 1346 if (!unlikely(options & WNOWAIT))
1455 p->exit_code = 0; 1347 *p_code = 0;
1456 1348
1457 /* don't need the RCU readlock here as we're holding a spinlock */ 1349 /* don't need the RCU readlock here as we're holding a spinlock */
1458 uid = __task_cred(p)->uid; 1350 uid = __task_cred(p)->uid;
@@ -1608,7 +1500,7 @@ static int wait_consider_task(struct task_struct *parent, int ptrace,
1608 */ 1500 */
1609 *notask_error = 0; 1501 *notask_error = 0;
1610 1502
1611 if (task_is_stopped_or_traced(p)) 1503 if (task_stopped_code(p, ptrace))
1612 return wait_task_stopped(ptrace, p, options, 1504 return wait_task_stopped(ptrace, p, options,
1613 infop, stat_addr, ru); 1505 infop, stat_addr, ru);
1614 1506
@@ -1812,7 +1704,7 @@ SYSCALL_DEFINE4(wait4, pid_t, upid, int __user *, stat_addr,
1812 pid = find_get_pid(-upid); 1704 pid = find_get_pid(-upid);
1813 } else if (upid == 0) { 1705 } else if (upid == 0) {
1814 type = PIDTYPE_PGID; 1706 type = PIDTYPE_PGID;
1815 pid = get_pid(task_pgrp(current)); 1707 pid = get_task_pid(current, PIDTYPE_PGID);
1816 } else /* upid > 0 */ { 1708 } else /* upid > 0 */ {
1817 type = PIDTYPE_PID; 1709 type = PIDTYPE_PID;
1818 pid = find_get_pid(upid); 1710 pid = find_get_pid(upid);
diff --git a/kernel/extable.c b/kernel/extable.c
index 25d39b0c3a1b..7f8f263f8524 100644
--- a/kernel/extable.c
+++ b/kernel/extable.c
@@ -16,6 +16,7 @@
16 Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA 16 Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
17*/ 17*/
18#include <linux/ftrace.h> 18#include <linux/ftrace.h>
19#include <linux/memory.h>
19#include <linux/module.h> 20#include <linux/module.h>
20#include <linux/mutex.h> 21#include <linux/mutex.h>
21#include <linux/init.h> 22#include <linux/init.h>
@@ -51,6 +52,14 @@ const struct exception_table_entry *search_exception_tables(unsigned long addr)
51 return e; 52 return e;
52} 53}
53 54
55static inline int init_kernel_text(unsigned long addr)
56{
57 if (addr >= (unsigned long)_sinittext &&
58 addr <= (unsigned long)_einittext)
59 return 1;
60 return 0;
61}
62
54int core_kernel_text(unsigned long addr) 63int core_kernel_text(unsigned long addr)
55{ 64{
56 if (addr >= (unsigned long)_stext && 65 if (addr >= (unsigned long)_stext &&
@@ -58,8 +67,7 @@ int core_kernel_text(unsigned long addr)
58 return 1; 67 return 1;
59 68
60 if (system_state == SYSTEM_BOOTING && 69 if (system_state == SYSTEM_BOOTING &&
61 addr >= (unsigned long)_sinittext && 70 init_kernel_text(addr))
62 addr <= (unsigned long)_einittext)
63 return 1; 71 return 1;
64 return 0; 72 return 0;
65} 73}
@@ -68,14 +76,26 @@ int __kernel_text_address(unsigned long addr)
68{ 76{
69 if (core_kernel_text(addr)) 77 if (core_kernel_text(addr))
70 return 1; 78 return 1;
71 return __module_text_address(addr) != NULL; 79 if (is_module_text_address(addr))
80 return 1;
81 /*
82 * There might be init symbols in saved stacktraces.
83 * Give those symbols a chance to be printed in
84 * backtraces (such as lockdep traces).
85 *
86 * Since we are after the module-symbols check, there's
87 * no danger of address overlap:
88 */
89 if (init_kernel_text(addr))
90 return 1;
91 return 0;
72} 92}
73 93
74int kernel_text_address(unsigned long addr) 94int kernel_text_address(unsigned long addr)
75{ 95{
76 if (core_kernel_text(addr)) 96 if (core_kernel_text(addr))
77 return 1; 97 return 1;
78 return module_text_address(addr) != NULL; 98 return is_module_text_address(addr);
79} 99}
80 100
81/* 101/*
@@ -91,5 +111,5 @@ int func_ptr_is_kernel_text(void *ptr)
91 addr = (unsigned long) dereference_function_descriptor(ptr); 111 addr = (unsigned long) dereference_function_descriptor(ptr);
92 if (core_kernel_text(addr)) 112 if (core_kernel_text(addr))
93 return 1; 113 return 1;
94 return module_text_address(addr) != NULL; 114 return is_module_text_address(addr);
95} 115}
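
All three call sites now use the boolean is_module_text_address() instead of comparing a module pointer against NULL, and init text is checked only after the module check so there is no address-overlap ambiguity. The conversion at each site is mechanical:

	/* before */
	return __module_text_address(addr) != NULL;

	/* after: same answer, no module pointer escapes the helper */
	return is_module_text_address(addr);
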
diff --git a/kernel/fork.c b/kernel/fork.c
index 6715ebc3761d..660c2b8765bc 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -60,6 +60,7 @@
60#include <linux/tty.h> 60#include <linux/tty.h>
61#include <linux/proc_fs.h> 61#include <linux/proc_fs.h>
62#include <linux/blkdev.h> 62#include <linux/blkdev.h>
63#include <linux/fs_struct.h>
63#include <trace/sched.h> 64#include <trace/sched.h>
64#include <linux/magic.h> 65#include <linux/magic.h>
65 66
@@ -284,7 +285,7 @@ static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm)
284 mm->free_area_cache = oldmm->mmap_base; 285 mm->free_area_cache = oldmm->mmap_base;
285 mm->cached_hole_size = ~0UL; 286 mm->cached_hole_size = ~0UL;
286 mm->map_count = 0; 287 mm->map_count = 0;
287 cpus_clear(mm->cpu_vm_mask); 288 cpumask_clear(mm_cpumask(mm));
288 mm->mm_rb = RB_ROOT; 289 mm->mm_rb = RB_ROOT;
289 rb_link = &mm->mm_rb.rb_node; 290 rb_link = &mm->mm_rb.rb_node;
290 rb_parent = NULL; 291 rb_parent = NULL;
@@ -681,38 +682,21 @@ fail_nomem:
681 return retval; 682 return retval;
682} 683}
683 684
684static struct fs_struct *__copy_fs_struct(struct fs_struct *old)
685{
686 struct fs_struct *fs = kmem_cache_alloc(fs_cachep, GFP_KERNEL);
687 /* We don't need to lock fs - think why ;-) */
688 if (fs) {
689 atomic_set(&fs->count, 1);
690 rwlock_init(&fs->lock);
691 fs->umask = old->umask;
692 read_lock(&old->lock);
693 fs->root = old->root;
694 path_get(&old->root);
695 fs->pwd = old->pwd;
696 path_get(&old->pwd);
697 read_unlock(&old->lock);
698 }
699 return fs;
700}
701
702struct fs_struct *copy_fs_struct(struct fs_struct *old)
703{
704 return __copy_fs_struct(old);
705}
706
707EXPORT_SYMBOL_GPL(copy_fs_struct);
708
709static int copy_fs(unsigned long clone_flags, struct task_struct *tsk) 685static int copy_fs(unsigned long clone_flags, struct task_struct *tsk)
710{ 686{
687 struct fs_struct *fs = current->fs;
711 if (clone_flags & CLONE_FS) { 688 if (clone_flags & CLONE_FS) {
712 atomic_inc(&current->fs->count); 689 /* tsk->fs is already what we want */
690 write_lock(&fs->lock);
691 if (fs->in_exec) {
692 write_unlock(&fs->lock);
693 return -EAGAIN;
694 }
695 fs->users++;
696 write_unlock(&fs->lock);
713 return 0; 697 return 0;
714 } 698 }
715 tsk->fs = __copy_fs_struct(current->fs); 699 tsk->fs = copy_fs_struct(fs);
716 if (!tsk->fs) 700 if (!tsk->fs)
717 return -ENOMEM; 701 return -ENOMEM;
718 return 0; 702 return 0;
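
copy_fs() now bumps a plain users count under fs->lock and refuses to share an fs_struct whose in_exec flag is set. The flag is raised on the exec side; a rough sketch of that counterpart (an assumption about the fs/exec.c change in the same series, not shown in this excerpt):

	write_lock(&fs->lock);
	if (fs->users > n_fs)		/* shared outside this thread group? */
		bprm->unsafe |= LSM_UNSAFE_SHARE;
	else
		fs->in_exec = 1;	/* a racing CLONE_FS now gets -EAGAIN */
	write_unlock(&fs->lock);
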
@@ -841,6 +825,8 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk)
841 atomic_set(&sig->live, 1); 825 atomic_set(&sig->live, 1);
842 init_waitqueue_head(&sig->wait_chldexit); 826 init_waitqueue_head(&sig->wait_chldexit);
843 sig->flags = 0; 827 sig->flags = 0;
828 if (clone_flags & CLONE_NEWPID)
829 sig->flags |= SIGNAL_UNKILLABLE;
844 sig->group_exit_code = 0; 830 sig->group_exit_code = 0;
845 sig->group_exit_task = NULL; 831 sig->group_exit_task = NULL;
846 sig->group_stop_count = 0; 832 sig->group_stop_count = 0;
@@ -1125,7 +1111,7 @@ static struct task_struct *copy_process(unsigned long clone_flags,
1125 goto bad_fork_cleanup_mm; 1111 goto bad_fork_cleanup_mm;
1126 if ((retval = copy_io(clone_flags, p))) 1112 if ((retval = copy_io(clone_flags, p)))
1127 goto bad_fork_cleanup_namespaces; 1113 goto bad_fork_cleanup_namespaces;
1128 retval = copy_thread(0, clone_flags, stack_start, stack_size, p, regs); 1114 retval = copy_thread(clone_flags, stack_start, stack_size, p, regs);
1129 if (retval) 1115 if (retval)
1130 goto bad_fork_cleanup_io; 1116 goto bad_fork_cleanup_io;
1131 1117
@@ -1263,8 +1249,6 @@ static struct task_struct *copy_process(unsigned long clone_flags,
1263 p->signal->leader_pid = pid; 1249 p->signal->leader_pid = pid;
1264 tty_kref_put(p->signal->tty); 1250 tty_kref_put(p->signal->tty);
1265 p->signal->tty = tty_kref_get(current->signal->tty); 1251 p->signal->tty = tty_kref_get(current->signal->tty);
1266 set_task_pgrp(p, task_pgrp_nr(current));
1267 set_task_session(p, task_session_nr(current));
1268 attach_pid(p, PIDTYPE_PGID, task_pgrp(current)); 1252 attach_pid(p, PIDTYPE_PGID, task_pgrp(current));
1269 attach_pid(p, PIDTYPE_SID, task_session(current)); 1253 attach_pid(p, PIDTYPE_SID, task_session(current));
1270 list_add_tail_rcu(&p->tasks, &init_task.tasks); 1254 list_add_tail_rcu(&p->tasks, &init_task.tasks);
@@ -1488,6 +1472,7 @@ void __init proc_caches_init(void)
1488 mm_cachep = kmem_cache_create("mm_struct", 1472 mm_cachep = kmem_cache_create("mm_struct",
1489 sizeof(struct mm_struct), ARCH_MIN_MMSTRUCT_ALIGN, 1473 sizeof(struct mm_struct), ARCH_MIN_MMSTRUCT_ALIGN,
1490 SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL); 1474 SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);
1475 vm_area_cachep = KMEM_CACHE(vm_area_struct, SLAB_PANIC);
1491 mmap_init(); 1476 mmap_init();
1492} 1477}
1493 1478
@@ -1543,12 +1528,16 @@ static int unshare_fs(unsigned long unshare_flags, struct fs_struct **new_fsp)
1543{ 1528{
1544 struct fs_struct *fs = current->fs; 1529 struct fs_struct *fs = current->fs;
1545 1530
1546 if ((unshare_flags & CLONE_FS) && 1531 if (!(unshare_flags & CLONE_FS) || !fs)
1547 (fs && atomic_read(&fs->count) > 1)) { 1532 return 0;
1548 *new_fsp = __copy_fs_struct(current->fs); 1533
1549 if (!*new_fsp) 1534 /* don't need lock here; in the worst case we'll do useless copy */
1550 return -ENOMEM; 1535 if (fs->users == 1)
1551 } 1536 return 0;
1537
1538 *new_fsp = copy_fs_struct(fs);
1539 if (!*new_fsp)
1540 return -ENOMEM;
1552 1541
1553 return 0; 1542 return 0;
1554} 1543}
@@ -1664,8 +1653,13 @@ SYSCALL_DEFINE1(unshare, unsigned long, unshare_flags)
1664 1653
1665 if (new_fs) { 1654 if (new_fs) {
1666 fs = current->fs; 1655 fs = current->fs;
1656 write_lock(&fs->lock);
1667 current->fs = new_fs; 1657 current->fs = new_fs;
1668 new_fs = fs; 1658 if (--fs->users)
1659 new_fs = NULL;
1660 else
1661 new_fs = fs;
1662 write_unlock(&fs->lock);
1669 } 1663 }
1670 1664
1671 if (new_mm) { 1665 if (new_mm) {
@@ -1704,7 +1698,7 @@ bad_unshare_cleanup_sigh:
1704 1698
1705bad_unshare_cleanup_fs: 1699bad_unshare_cleanup_fs:
1706 if (new_fs) 1700 if (new_fs)
1707 put_fs_struct(new_fs); 1701 free_fs_struct(new_fs);
1708 1702
1709bad_unshare_cleanup_thread: 1703bad_unshare_cleanup_thread:
1710bad_unshare_out: 1704bad_unshare_out:
diff --git a/kernel/futex.c b/kernel/futex.c
index 438701adce23..6b50a024bca2 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -114,7 +114,9 @@ struct futex_q {
114}; 114};
115 115
116/* 116/*
117 * Split the global futex_lock into every hash list lock. 117 * Hash buckets are shared by all the futex_keys that hash to the same
118 * location. Each key may have multiple futex_q structures, one for each task
119 * waiting on a futex.
118 */ 120 */
119struct futex_hash_bucket { 121struct futex_hash_bucket {
120 spinlock_t lock; 122 spinlock_t lock;
@@ -189,8 +191,7 @@ static void drop_futex_key_refs(union futex_key *key)
189/** 191/**
190 * get_futex_key - Get parameters which are the keys for a futex. 192 * get_futex_key - Get parameters which are the keys for a futex.
191 * @uaddr: virtual address of the futex 193 * @uaddr: virtual address of the futex
192 * @shared: NULL for a PROCESS_PRIVATE futex, 194 * @fshared: 0 for a PROCESS_PRIVATE futex, 1 for PROCESS_SHARED
193 * &current->mm->mmap_sem for a PROCESS_SHARED futex
194 * @key: address where result is stored. 195 * @key: address where result is stored.
195 * 196 *
196 * Returns a negative error code or 0 197 * Returns a negative error code or 0
@@ -200,9 +201,7 @@ static void drop_futex_key_refs(union futex_key *key)
200 * offset_within_page). For private mappings, it's (uaddr, current->mm). 201 * offset_within_page). For private mappings, it's (uaddr, current->mm).
201 * We can usually work out the index without swapping in the page. 202 * We can usually work out the index without swapping in the page.
202 * 203 *
203 * fshared is NULL for PROCESS_PRIVATE futexes 204 * lock_page() might sleep, the caller should not hold a spinlock.
204 * For other futexes, it points to &current->mm->mmap_sem and
205 * caller must have taken the reader lock. but NOT any spinlocks.
206 */ 205 */
207static int get_futex_key(u32 __user *uaddr, int fshared, union futex_key *key) 206static int get_futex_key(u32 __user *uaddr, int fshared, union futex_key *key)
208{ 207{
@@ -299,41 +298,6 @@ static int get_futex_value_locked(u32 *dest, u32 __user *from)
299 return ret ? -EFAULT : 0; 298 return ret ? -EFAULT : 0;
300} 299}
301 300
302/*
303 * Fault handling.
304 */
305static int futex_handle_fault(unsigned long address, int attempt)
306{
307 struct vm_area_struct * vma;
308 struct mm_struct *mm = current->mm;
309 int ret = -EFAULT;
310
311 if (attempt > 2)
312 return ret;
313
314 down_read(&mm->mmap_sem);
315 vma = find_vma(mm, address);
316 if (vma && address >= vma->vm_start &&
317 (vma->vm_flags & VM_WRITE)) {
318 int fault;
319 fault = handle_mm_fault(mm, vma, address, 1);
320 if (unlikely((fault & VM_FAULT_ERROR))) {
321#if 0
322 /* XXX: let's do this when we verify it is OK */
323 if (ret & VM_FAULT_OOM)
324 ret = -ENOMEM;
325#endif
326 } else {
327 ret = 0;
328 if (fault & VM_FAULT_MAJOR)
329 current->maj_flt++;
330 else
331 current->min_flt++;
332 }
333 }
334 up_read(&mm->mmap_sem);
335 return ret;
336}
337 301
338/* 302/*
339 * PI code: 303 * PI code:
@@ -589,10 +553,9 @@ static void wake_futex(struct futex_q *q)
589 * The waiting task can free the futex_q as soon as this is written, 553 * The waiting task can free the futex_q as soon as this is written,
590 * without taking any locks. This must come last. 554 * without taking any locks. This must come last.
591 * 555 *
592 * A memory barrier is required here to prevent the following store 556 * A memory barrier is required here to prevent the following store to
593 * to lock_ptr from getting ahead of the wakeup. Clearing the lock 557 * lock_ptr from getting ahead of the wakeup. Clearing the lock at the
594 * at the end of wake_up_all() does not prevent this store from 558 * end of wake_up() does not prevent this store from moving.
595 * moving.
596 */ 559 */
597 smp_wmb(); 560 smp_wmb();
598 q->lock_ptr = NULL; 561 q->lock_ptr = NULL;
@@ -692,9 +655,16 @@ double_lock_hb(struct futex_hash_bucket *hb1, struct futex_hash_bucket *hb2)
692 } 655 }
693} 656}
694 657
658static inline void
659double_unlock_hb(struct futex_hash_bucket *hb1, struct futex_hash_bucket *hb2)
660{
661 spin_unlock(&hb1->lock);
662 if (hb1 != hb2)
663 spin_unlock(&hb2->lock);
664}
665
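
double_unlock_hb() mirrors double_lock_hb() above and replaces three open-coded unlock sequences later in this file; it touches hb2 only when the two buckets differ. Usage (sketch):

	double_lock_hb(hb1, hb2);
	/* ... operate on both hash chains ... */
	double_unlock_hb(hb1, hb2);	/* second unlock skipped when hb1 == hb2 */
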
695/* 666/*
696 * Wake up all waiters hashed on the physical page that is mapped 667 * Wake up waiters matching bitset queued on this futex (uaddr).
697 * to this virtual address:
698 */ 668 */
699static int futex_wake(u32 __user *uaddr, int fshared, int nr_wake, u32 bitset) 669static int futex_wake(u32 __user *uaddr, int fshared, int nr_wake, u32 bitset)
700{ 670{
@@ -750,9 +720,9 @@ futex_wake_op(u32 __user *uaddr1, int fshared, u32 __user *uaddr2,
750 struct futex_hash_bucket *hb1, *hb2; 720 struct futex_hash_bucket *hb1, *hb2;
751 struct plist_head *head; 721 struct plist_head *head;
752 struct futex_q *this, *next; 722 struct futex_q *this, *next;
753 int ret, op_ret, attempt = 0; 723 int ret, op_ret;
754 724
755retryfull: 725retry:
756 ret = get_futex_key(uaddr1, fshared, &key1); 726 ret = get_futex_key(uaddr1, fshared, &key1);
757 if (unlikely(ret != 0)) 727 if (unlikely(ret != 0))
758 goto out; 728 goto out;
@@ -763,16 +733,13 @@ retryfull:
763 hb1 = hash_futex(&key1); 733 hb1 = hash_futex(&key1);
764 hb2 = hash_futex(&key2); 734 hb2 = hash_futex(&key2);
765 735
766retry:
767 double_lock_hb(hb1, hb2); 736 double_lock_hb(hb1, hb2);
768 737retry_private:
769 op_ret = futex_atomic_op_inuser(op, uaddr2); 738 op_ret = futex_atomic_op_inuser(op, uaddr2);
770 if (unlikely(op_ret < 0)) { 739 if (unlikely(op_ret < 0)) {
771 u32 dummy; 740 u32 dummy;
772 741
773 spin_unlock(&hb1->lock); 742 double_unlock_hb(hb1, hb2);
774 if (hb1 != hb2)
775 spin_unlock(&hb2->lock);
776 743
777#ifndef CONFIG_MMU 744#ifndef CONFIG_MMU
778 /* 745 /*
@@ -788,26 +755,16 @@ retry:
788 goto out_put_keys; 755 goto out_put_keys;
789 } 756 }
790 757
791 /*
792 * futex_atomic_op_inuser needs to both read and write
793 * *(int __user *)uaddr2, but we can't modify it
794 * non-atomically. Therefore, if get_user below is not
795 * enough, we need to handle the fault ourselves, while
796 * still holding the mmap_sem.
797 */
798 if (attempt++) {
799 ret = futex_handle_fault((unsigned long)uaddr2,
800 attempt);
801 if (ret)
802 goto out_put_keys;
803 goto retry;
804 }
805
806 ret = get_user(dummy, uaddr2); 758 ret = get_user(dummy, uaddr2);
807 if (ret) 759 if (ret)
808 return ret; 760 goto out_put_keys;
761
762 if (!fshared)
763 goto retry_private;
809 764
810 goto retryfull; 765 put_futex_key(fshared, &key2);
766 put_futex_key(fshared, &key1);
767 goto retry;
811 } 768 }
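
This is the new fault-recovery shape used throughout the file: fault the page in with get_user(), then retry at retry_private for private futexes (whose key cannot change) but drop and re-resolve the keys for shared ones. Schematically (a condensed sketch, not verbatim):

retry:
	ret = get_futex_key(uaddr, fshared, &key);
retry_private:
	/* locked fast path; on -EFAULT: */
	ret = get_user(dummy, uaddr);	/* fault the page in */
	if (ret)
		goto out_put_key;
	if (!fshared)
		goto retry_private;	/* private key is still valid */
	put_futex_key(fshared, &key);
	goto retry;			/* shared mapping may have changed */
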
812 769
813 head = &hb1->chain; 770 head = &hb1->chain;
@@ -834,9 +791,7 @@ retry:
834 ret += op_ret; 791 ret += op_ret;
835 } 792 }
836 793
837 spin_unlock(&hb1->lock); 794 double_unlock_hb(hb1, hb2);
838 if (hb1 != hb2)
839 spin_unlock(&hb2->lock);
840out_put_keys: 795out_put_keys:
841 put_futex_key(fshared, &key2); 796 put_futex_key(fshared, &key2);
842out_put_key1: 797out_put_key1:
@@ -869,6 +824,7 @@ retry:
869 hb1 = hash_futex(&key1); 824 hb1 = hash_futex(&key1);
870 hb2 = hash_futex(&key2); 825 hb2 = hash_futex(&key2);
871 826
827retry_private:
872 double_lock_hb(hb1, hb2); 828 double_lock_hb(hb1, hb2);
873 829
874 if (likely(cmpval != NULL)) { 830 if (likely(cmpval != NULL)) {
@@ -877,16 +833,18 @@ retry:
877 ret = get_futex_value_locked(&curval, uaddr1); 833 ret = get_futex_value_locked(&curval, uaddr1);
878 834
879 if (unlikely(ret)) { 835 if (unlikely(ret)) {
880 spin_unlock(&hb1->lock); 836 double_unlock_hb(hb1, hb2);
881 if (hb1 != hb2)
882 spin_unlock(&hb2->lock);
883 837
884 ret = get_user(curval, uaddr1); 838 ret = get_user(curval, uaddr1);
839 if (ret)
840 goto out_put_keys;
885 841
886 if (!ret) 842 if (!fshared)
887 goto retry; 843 goto retry_private;
888 844
889 goto out_put_keys; 845 put_futex_key(fshared, &key2);
846 put_futex_key(fshared, &key1);
847 goto retry;
890 } 848 }
891 if (curval != *cmpval) { 849 if (curval != *cmpval) {
892 ret = -EAGAIN; 850 ret = -EAGAIN;
@@ -923,9 +881,7 @@ retry:
923 } 881 }
924 882
925out_unlock: 883out_unlock:
926 spin_unlock(&hb1->lock); 884 double_unlock_hb(hb1, hb2);
927 if (hb1 != hb2)
928 spin_unlock(&hb2->lock);
929 885
930 /* drop_futex_key_refs() must be called outside the spinlocks. */ 886 /* drop_futex_key_refs() must be called outside the spinlocks. */
931 while (--drop_count >= 0) 887 while (--drop_count >= 0)
@@ -1063,7 +1019,7 @@ static int fixup_pi_state_owner(u32 __user *uaddr, struct futex_q *q,
1063 struct futex_pi_state *pi_state = q->pi_state; 1019 struct futex_pi_state *pi_state = q->pi_state;
1064 struct task_struct *oldowner = pi_state->owner; 1020 struct task_struct *oldowner = pi_state->owner;
1065 u32 uval, curval, newval; 1021 u32 uval, curval, newval;
1066 int ret, attempt = 0; 1022 int ret;
1067 1023
1068 /* Owner died? */ 1024 /* Owner died? */
1069 if (!pi_state->owner) 1025 if (!pi_state->owner)
@@ -1076,11 +1032,9 @@ static int fixup_pi_state_owner(u32 __user *uaddr, struct futex_q *q,
1076 * in the user space variable. This must be atomic as we have 1032 * in the user space variable. This must be atomic as we have
1077 * to preserve the owner died bit here. 1033 * to preserve the owner died bit here.
1078 * 1034 *
1079 * Note: We write the user space value _before_ changing the 1035 * Note: We write the user space value _before_ changing the pi_state
1080 * pi_state because we can fault here. Imagine swapped out 1036 * because we can fault here. Imagine swapped out pages or a fork
1081 * pages or a fork, which was running right before we acquired 1037 * that marked all the anonymous memory readonly for cow.
1082 * mmap_sem, that marked all the anonymous memory readonly for
1083 * cow.
1084 * 1038 *
1085 * Modifying pi_state _before_ the user space value would 1039 * Modifying pi_state _before_ the user space value would
1086 * leave the pi_state in an inconsistent state when we fault 1040 * leave the pi_state in an inconsistent state when we fault
@@ -1136,7 +1090,7 @@ retry:
1136handle_fault: 1090handle_fault:
1137 spin_unlock(q->lock_ptr); 1091 spin_unlock(q->lock_ptr);
1138 1092
1139 ret = futex_handle_fault((unsigned long)uaddr, attempt++); 1093 ret = get_user(uval, uaddr);
1140 1094
1141 spin_lock(q->lock_ptr); 1095 spin_lock(q->lock_ptr);
1142 1096
@@ -1185,10 +1139,11 @@ retry:
1185 if (unlikely(ret != 0)) 1139 if (unlikely(ret != 0))
1186 goto out; 1140 goto out;
1187 1141
1142retry_private:
1188 hb = queue_lock(&q); 1143 hb = queue_lock(&q);
1189 1144
1190 /* 1145 /*
1191 * Access the page AFTER the futex is queued. 1146 * Access the page AFTER the hash-bucket is locked.
1192 * Order is important: 1147 * Order is important:
1193 * 1148 *
1194 * Userspace waiter: val = var; if (cond(val)) futex_wait(&var, val); 1149 * Userspace waiter: val = var; if (cond(val)) futex_wait(&var, val);
@@ -1204,20 +1159,23 @@ retry:
1204 * a wakeup when *uaddr != val on entry to the syscall. This is 1159 * a wakeup when *uaddr != val on entry to the syscall. This is
1205 * rare, but normal. 1160 * rare, but normal.
1206 * 1161 *
1207 * for shared futexes, we hold the mmap semaphore, so the mapping 1162 * For shared futexes, we hold the mmap semaphore, so the mapping
1208 * cannot have changed since we looked it up in get_futex_key. 1163 * cannot have changed since we looked it up in get_futex_key.
1209 */ 1164 */
1210 ret = get_futex_value_locked(&uval, uaddr); 1165 ret = get_futex_value_locked(&uval, uaddr);
1211 1166
1212 if (unlikely(ret)) { 1167 if (unlikely(ret)) {
1213 queue_unlock(&q, hb); 1168 queue_unlock(&q, hb);
1214 put_futex_key(fshared, &q.key);
1215 1169
1216 ret = get_user(uval, uaddr); 1170 ret = get_user(uval, uaddr);
1171 if (ret)
1172 goto out_put_key;
1217 1173
1218 if (!ret) 1174 if (!fshared)
1219 goto retry; 1175 goto retry_private;
1220 goto out; 1176
1177 put_futex_key(fshared, &q.key);
1178 goto retry;
1221 } 1179 }
1222 ret = -EWOULDBLOCK; 1180 ret = -EWOULDBLOCK;
1223 if (unlikely(uval != val)) { 1181 if (unlikely(uval != val)) {
@@ -1248,16 +1206,13 @@ retry:
1248 if (!abs_time) 1206 if (!abs_time)
1249 schedule(); 1207 schedule();
1250 else { 1208 else {
1251 unsigned long slack;
1252 slack = current->timer_slack_ns;
1253 if (rt_task(current))
1254 slack = 0;
1255 hrtimer_init_on_stack(&t.timer, 1209 hrtimer_init_on_stack(&t.timer,
1256 clockrt ? CLOCK_REALTIME : 1210 clockrt ? CLOCK_REALTIME :
1257 CLOCK_MONOTONIC, 1211 CLOCK_MONOTONIC,
1258 HRTIMER_MODE_ABS); 1212 HRTIMER_MODE_ABS);
1259 hrtimer_init_sleeper(&t, current); 1213 hrtimer_init_sleeper(&t, current);
1260 hrtimer_set_expires_range_ns(&t.timer, *abs_time, slack); 1214 hrtimer_set_expires_range_ns(&t.timer, *abs_time,
1215 current->timer_slack_ns);
1261 1216
1262 hrtimer_start_expires(&t.timer, HRTIMER_MODE_ABS); 1217 hrtimer_start_expires(&t.timer, HRTIMER_MODE_ABS);
1263 if (!hrtimer_active(&t.timer)) 1218 if (!hrtimer_active(&t.timer))
@@ -1354,7 +1309,7 @@ static int futex_lock_pi(u32 __user *uaddr, int fshared,
1354 struct futex_hash_bucket *hb; 1309 struct futex_hash_bucket *hb;
1355 u32 uval, newval, curval; 1310 u32 uval, newval, curval;
1356 struct futex_q q; 1311 struct futex_q q;
1357 int ret, lock_taken, ownerdied = 0, attempt = 0; 1312 int ret, lock_taken, ownerdied = 0;
1358 1313
1359 if (refill_pi_state_cache()) 1314 if (refill_pi_state_cache())
1360 return -ENOMEM; 1315 return -ENOMEM;
@@ -1374,7 +1329,7 @@ retry:
1374 if (unlikely(ret != 0)) 1329 if (unlikely(ret != 0))
1375 goto out; 1330 goto out;
1376 1331
1377retry_unlocked: 1332retry_private:
1378 hb = queue_lock(&q); 1333 hb = queue_lock(&q);
1379 1334
1380retry_locked: 1335retry_locked:
@@ -1458,6 +1413,7 @@ retry_locked:
1458 * exit to complete. 1413 * exit to complete.
1459 */ 1414 */
1460 queue_unlock(&q, hb); 1415 queue_unlock(&q, hb);
1416 put_futex_key(fshared, &q.key);
1461 cond_resched(); 1417 cond_resched();
1462 goto retry; 1418 goto retry;
1463 1419
@@ -1564,6 +1520,13 @@ retry_locked:
1564 } 1520 }
1565 } 1521 }
1566 1522
1523 /*
1524 * If fixup_pi_state_owner() faulted and was unable to handle the
1525 * fault, unlock it and return the fault to userspace.
1526 */
1527 if (ret && (rt_mutex_owner(&q.pi_state->pi_mutex) == current))
1528 rt_mutex_unlock(&q.pi_state->pi_mutex);
1529
1567 /* Unqueue and drop the lock */ 1530 /* Unqueue and drop the lock */
1568 unqueue_me_pi(&q); 1531 unqueue_me_pi(&q);
1569 1532
@@ -1591,22 +1554,18 @@ uaddr_faulted:
1591 */ 1554 */
1592 queue_unlock(&q, hb); 1555 queue_unlock(&q, hb);
1593 1556
1594 if (attempt++) {
1595 ret = futex_handle_fault((unsigned long)uaddr, attempt);
1596 if (ret)
1597 goto out_put_key;
1598 goto retry_unlocked;
1599 }
1600
1601 ret = get_user(uval, uaddr); 1557 ret = get_user(uval, uaddr);
1602 if (!ret) 1558 if (ret)
1603 goto retry; 1559 goto out_put_key;
1604 1560
1605 if (to) 1561 if (!fshared)
1606 destroy_hrtimer_on_stack(&to->timer); 1562 goto retry_private;
1607 return ret; 1563
1564 put_futex_key(fshared, &q.key);
1565 goto retry;
1608} 1566}
1609 1567
1568
1610/* 1569/*
1611 * Userspace attempted a TID -> 0 atomic transition, and failed. 1570 * Userspace attempted a TID -> 0 atomic transition, and failed.
1612 * This is the in-kernel slowpath: we look up the PI state (if any), 1571 * This is the in-kernel slowpath: we look up the PI state (if any),
@@ -1619,7 +1578,7 @@ static int futex_unlock_pi(u32 __user *uaddr, int fshared)
1619 u32 uval; 1578 u32 uval;
1620 struct plist_head *head; 1579 struct plist_head *head;
1621 union futex_key key = FUTEX_KEY_INIT; 1580 union futex_key key = FUTEX_KEY_INIT;
1622 int ret, attempt = 0; 1581 int ret;
1623 1582
1624retry: 1583retry:
1625 if (get_user(uval, uaddr)) 1584 if (get_user(uval, uaddr))
@@ -1635,7 +1594,6 @@ retry:
1635 goto out; 1594 goto out;
1636 1595
1637 hb = hash_futex(&key); 1596 hb = hash_futex(&key);
1638retry_unlocked:
1639 spin_lock(&hb->lock); 1597 spin_lock(&hb->lock);
1640 1598
1641 /* 1599 /*
@@ -1700,14 +1658,7 @@ pi_faulted:
1700 * we have to drop the mmap_sem in order to call get_user(). 1658 * we have to drop the mmap_sem in order to call get_user().
1701 */ 1659 */
1702 spin_unlock(&hb->lock); 1660 spin_unlock(&hb->lock);
1703 1661 put_futex_key(fshared, &key);
1704 if (attempt++) {
1705 ret = futex_handle_fault((unsigned long)uaddr, attempt);
1706 if (ret)
1707 goto out;
1708 uval = 0;
1709 goto retry_unlocked;
1710 }
1711 1662
1712 ret = get_user(uval, uaddr); 1663 ret = get_user(uval, uaddr);
1713 if (!ret) 1664 if (!ret)
diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c
index f394d2a42ca3..cb8a15c19583 100644
--- a/kernel/hrtimer.c
+++ b/kernel/hrtimer.c
@@ -651,14 +651,20 @@ static inline void hrtimer_init_timer_hres(struct hrtimer *timer)
651 * and expiry check is done in the hrtimer_interrupt or in the softirq. 651 * and expiry check is done in the hrtimer_interrupt or in the softirq.
652 */ 652 */
653static inline int hrtimer_enqueue_reprogram(struct hrtimer *timer, 653static inline int hrtimer_enqueue_reprogram(struct hrtimer *timer,
654 struct hrtimer_clock_base *base) 654 struct hrtimer_clock_base *base,
655 int wakeup)
655{ 656{
656 if (base->cpu_base->hres_active && hrtimer_reprogram(timer, base)) { 657 if (base->cpu_base->hres_active && hrtimer_reprogram(timer, base)) {
657 spin_unlock(&base->cpu_base->lock); 658 if (wakeup) {
658 raise_softirq_irqoff(HRTIMER_SOFTIRQ); 659 spin_unlock(&base->cpu_base->lock);
659 spin_lock(&base->cpu_base->lock); 660 raise_softirq_irqoff(HRTIMER_SOFTIRQ);
661 spin_lock(&base->cpu_base->lock);
662 } else
663 __raise_softirq_irqoff(HRTIMER_SOFTIRQ);
664
660 return 1; 665 return 1;
661 } 666 }
667
662 return 0; 668 return 0;
663} 669}
664 670
@@ -703,7 +709,8 @@ static inline int hrtimer_is_hres_enabled(void) { return 0; }
703static inline int hrtimer_switch_to_hres(void) { return 0; } 709static inline int hrtimer_switch_to_hres(void) { return 0; }
704static inline void hrtimer_force_reprogram(struct hrtimer_cpu_base *base) { } 710static inline void hrtimer_force_reprogram(struct hrtimer_cpu_base *base) { }
705static inline int hrtimer_enqueue_reprogram(struct hrtimer *timer, 711static inline int hrtimer_enqueue_reprogram(struct hrtimer *timer,
706 struct hrtimer_clock_base *base) 712 struct hrtimer_clock_base *base,
713 int wakeup)
707{ 714{
708 return 0; 715 return 0;
709} 716}
@@ -886,20 +893,9 @@ remove_hrtimer(struct hrtimer *timer, struct hrtimer_clock_base *base)
886 return 0; 893 return 0;
887} 894}
888 895
889/** 896int __hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim,
890 * hrtimer_start_range_ns - (re)start an hrtimer on the current CPU 897 unsigned long delta_ns, const enum hrtimer_mode mode,
891 * @timer: the timer to be added 898 int wakeup)
892 * @tim: expiry time
893 * @delta_ns: "slack" range for the timer
894 * @mode: expiry mode: absolute (HRTIMER_ABS) or relative (HRTIMER_REL)
895 *
896 * Returns:
897 * 0 on success
898 * 1 when the timer was active
899 */
900int
901hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim, unsigned long delta_ns,
902 const enum hrtimer_mode mode)
903{ 899{
904 struct hrtimer_clock_base *base, *new_base; 900 struct hrtimer_clock_base *base, *new_base;
905 unsigned long flags; 901 unsigned long flags;
@@ -940,12 +936,29 @@ hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim, unsigned long delta_n
940 * XXX send_remote_softirq() ? 936 * XXX send_remote_softirq() ?
941 */ 937 */
942 if (leftmost && new_base->cpu_base == &__get_cpu_var(hrtimer_bases)) 938 if (leftmost && new_base->cpu_base == &__get_cpu_var(hrtimer_bases))
943 hrtimer_enqueue_reprogram(timer, new_base); 939 hrtimer_enqueue_reprogram(timer, new_base, wakeup);
944 940
945 unlock_hrtimer_base(timer, &flags); 941 unlock_hrtimer_base(timer, &flags);
946 942
947 return ret; 943 return ret;
948} 944}
945
946/**
947 * hrtimer_start_range_ns - (re)start an hrtimer on the current CPU
948 * @timer: the timer to be added
949 * @tim: expiry time
950 * @delta_ns: "slack" range for the timer
951 * @mode: expiry mode: absolute (HRTIMER_ABS) or relative (HRTIMER_REL)
952 *
953 * Returns:
954 * 0 on success
955 * 1 when the timer was active
956 */
957int hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim,
958 unsigned long delta_ns, const enum hrtimer_mode mode)
959{
960 return __hrtimer_start_range_ns(timer, tim, delta_ns, mode, 1);
961}
949EXPORT_SYMBOL_GPL(hrtimer_start_range_ns); 962EXPORT_SYMBOL_GPL(hrtimer_start_range_ns);
950 963
951/** 964/**
@@ -961,7 +974,7 @@ EXPORT_SYMBOL_GPL(hrtimer_start_range_ns);
961int 974int
962hrtimer_start(struct hrtimer *timer, ktime_t tim, const enum hrtimer_mode mode) 975hrtimer_start(struct hrtimer *timer, ktime_t tim, const enum hrtimer_mode mode)
963{ 976{
964 return hrtimer_start_range_ns(timer, tim, 0, mode); 977 return __hrtimer_start_range_ns(timer, tim, 0, mode, 1);
965} 978}
966EXPORT_SYMBOL_GPL(hrtimer_start); 979EXPORT_SYMBOL_GPL(hrtimer_start);
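
The wakeup argument exists so that a caller which cannot tolerate the unlock/raise_softirq/lock dance — for instance one already holding a runqueue lock in hard-irq context — can pass 0 and have the softirq merely flagged via __raise_softirq_irqoff(). The exported entry points keep the old behaviour by passing 1. A hedged example of the new 0 case (my_timer and expires are placeholders):

	/* from hard-irq context; softirq is flagged, not raised synchronously */
	__hrtimer_start_range_ns(&my_timer, expires, 0, HRTIMER_MODE_ABS, 0);
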
967 980
diff --git a/kernel/irq/Makefile b/kernel/irq/Makefile
index 4dd5b1edac98..3394f8f52964 100644
--- a/kernel/irq/Makefile
+++ b/kernel/irq/Makefile
@@ -4,3 +4,4 @@ obj-$(CONFIG_GENERIC_IRQ_PROBE) += autoprobe.o
4obj-$(CONFIG_PROC_FS) += proc.o 4obj-$(CONFIG_PROC_FS) += proc.o
5obj-$(CONFIG_GENERIC_PENDING_IRQ) += migration.o 5obj-$(CONFIG_GENERIC_PENDING_IRQ) += migration.o
6obj-$(CONFIG_NUMA_MIGRATE_IRQ_DESC) += numa_migrate.o 6obj-$(CONFIG_NUMA_MIGRATE_IRQ_DESC) += numa_migrate.o
7obj-$(CONFIG_PM_SLEEP) += pm.o
diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c
index 122fef4b0bd3..c687ba4363f2 100644
--- a/kernel/irq/chip.c
+++ b/kernel/irq/chip.c
@@ -81,6 +81,7 @@ void dynamic_irq_cleanup(unsigned int irq)
81 desc->handle_irq = handle_bad_irq; 81 desc->handle_irq = handle_bad_irq;
82 desc->chip = &no_irq_chip; 82 desc->chip = &no_irq_chip;
83 desc->name = NULL; 83 desc->name = NULL;
84 clear_kstat_irqs(desc);
84 spin_unlock_irqrestore(&desc->lock, flags); 85 spin_unlock_irqrestore(&desc->lock, flags);
85} 86}
86 87
@@ -293,7 +294,8 @@ static inline void mask_ack_irq(struct irq_desc *desc, int irq)
293 desc->chip->mask_ack(irq); 294 desc->chip->mask_ack(irq);
294 else { 295 else {
295 desc->chip->mask(irq); 296 desc->chip->mask(irq);
296 desc->chip->ack(irq); 297 if (desc->chip->ack)
298 desc->chip->ack(irq);
297 } 299 }
298} 300}
299 301
@@ -479,7 +481,8 @@ handle_edge_irq(unsigned int irq, struct irq_desc *desc)
479 kstat_incr_irqs_this_cpu(irq, desc); 481 kstat_incr_irqs_this_cpu(irq, desc);
480 482
481 /* Start handling the irq */ 483 /* Start handling the irq */
482 desc->chip->ack(irq); 484 if (desc->chip->ack)
485 desc->chip->ack(irq);
483 desc = irq_remap_to_desc(irq, desc); 486 desc = irq_remap_to_desc(irq, desc);
484 487
485 /* Mark the IRQ currently in progress.*/ 488 /* Mark the IRQ currently in progress.*/
diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c
index 412370ab9a34..343acecae629 100644
--- a/kernel/irq/handle.c
+++ b/kernel/irq/handle.c
@@ -83,19 +83,21 @@ static struct irq_desc irq_desc_init = {
83 83
84void init_kstat_irqs(struct irq_desc *desc, int cpu, int nr) 84void init_kstat_irqs(struct irq_desc *desc, int cpu, int nr)
85{ 85{
86 unsigned long bytes;
87 char *ptr;
88 int node; 86 int node;
89 87 void *ptr;
90 /* Compute how many bytes we need per irq and allocate them */
91 bytes = nr * sizeof(unsigned int);
92 88
93 node = cpu_to_node(cpu); 89 node = cpu_to_node(cpu);
94 ptr = kzalloc_node(bytes, GFP_ATOMIC, node); 90 ptr = kzalloc_node(nr * sizeof(*desc->kstat_irqs), GFP_ATOMIC, node);
95 printk(KERN_DEBUG " alloc kstat_irqs on cpu %d node %d\n", cpu, node);
96 91
97 if (ptr) 92 /*
 98 	desc->kstat_irqs = (unsigned int *)ptr; 93	 * don't overwrite if we could not get a new one;
 94	 * init_copy_kstat_irqs() may still use the old one
95 */
96 if (ptr) {
97 printk(KERN_DEBUG " alloc kstat_irqs on cpu %d node %d\n",
98 cpu, node);
99 desc->kstat_irqs = ptr;
100 }
99} 101}
100 102
101static void init_one_irq_desc(int irq, struct irq_desc *desc, int cpu) 103static void init_one_irq_desc(int irq, struct irq_desc *desc, int cpu)
@@ -238,6 +240,7 @@ struct irq_desc irq_desc[NR_IRQS] __cacheline_aligned_in_smp = {
238 } 240 }
239}; 241};
240 242
243static unsigned int kstat_irqs_all[NR_IRQS][NR_CPUS];
241int __init early_irq_init(void) 244int __init early_irq_init(void)
242{ 245{
243 struct irq_desc *desc; 246 struct irq_desc *desc;
@@ -254,6 +257,7 @@ int __init early_irq_init(void)
254 for (i = 0; i < count; i++) { 257 for (i = 0; i < count; i++) {
255 desc[i].irq = i; 258 desc[i].irq = i;
256 init_alloc_desc_masks(&desc[i], 0, true); 259 init_alloc_desc_masks(&desc[i], 0, true);
260 desc[i].kstat_irqs = kstat_irqs_all[i];
257 } 261 }
258 return arch_early_irq_init(); 262 return arch_early_irq_init();
259} 263}
@@ -269,6 +273,11 @@ struct irq_desc *irq_to_desc_alloc_cpu(unsigned int irq, int cpu)
269} 273}
270#endif /* !CONFIG_SPARSE_IRQ */ 274#endif /* !CONFIG_SPARSE_IRQ */
271 275
276void clear_kstat_irqs(struct irq_desc *desc)
277{
278 memset(desc->kstat_irqs, 0, nr_cpu_ids * sizeof(*(desc->kstat_irqs)));
279}
280
272/* 281/*
273 * What should we do if we get a hw irq event on an illegal vector? 282 * What should we do if we get a hw irq event on an illegal vector?
274 * Each architecture has to answer this themself. 283 * Each architecture has to answer this themself.
@@ -345,6 +354,8 @@ irqreturn_t handle_IRQ_event(unsigned int irq, struct irqaction *action)
345 irqreturn_t ret, retval = IRQ_NONE; 354 irqreturn_t ret, retval = IRQ_NONE;
346 unsigned int status = 0; 355 unsigned int status = 0;
347 356
357 WARN_ONCE(!in_irq(), "BUG: IRQ handler called from non-hardirq context!");
358
348 if (!(action->flags & IRQF_DISABLED)) 359 if (!(action->flags & IRQF_DISABLED))
349 local_irq_enable_in_hardirq(); 360 local_irq_enable_in_hardirq();
350 361
@@ -366,6 +377,11 @@ irqreturn_t handle_IRQ_event(unsigned int irq, struct irqaction *action)
366} 377}
367 378
368#ifndef CONFIG_GENERIC_HARDIRQS_NO__DO_IRQ 379#ifndef CONFIG_GENERIC_HARDIRQS_NO__DO_IRQ
380
381#ifdef CONFIG_ENABLE_WARN_DEPRECATED
382# warning __do_IRQ is deprecated. Please convert to proper flow handlers
383#endif
384
369/** 385/**
370 * __do_IRQ - original all in one highlevel IRQ handler 386 * __do_IRQ - original all in one highlevel IRQ handler
371 * @irq: the interrupt number 387 * @irq: the interrupt number
@@ -486,12 +502,10 @@ void early_init_irq_lock_class(void)
486 } 502 }
487} 503}
488 504
489#ifdef CONFIG_SPARSE_IRQ
490unsigned int kstat_irqs_cpu(unsigned int irq, int cpu) 505unsigned int kstat_irqs_cpu(unsigned int irq, int cpu)
491{ 506{
492 struct irq_desc *desc = irq_to_desc(irq); 507 struct irq_desc *desc = irq_to_desc(irq);
493 return desc ? desc->kstat_irqs[cpu] : 0; 508 return desc ? desc->kstat_irqs[cpu] : 0;
494} 509}
495#endif
496EXPORT_SYMBOL(kstat_irqs_cpu); 510EXPORT_SYMBOL(kstat_irqs_cpu);
497 511
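
With the #ifdef CONFIG_SPARSE_IRQ guard dropped above, kstat_irqs_cpu() is available in both sparse and non-sparse configurations. A minimal sketch of a caller summing the per-CPU counters for one interrupt follows; total_irq_count() is a hypothetical helper, not part of this patch:

	#include <linux/kernel_stat.h>
	#include <linux/cpumask.h>

	/* Hypothetical helper: times @irq has fired, summed over all CPUs. */
	static unsigned int total_irq_count(unsigned int irq)
	{
		unsigned int sum = 0;
		int cpu;

		for_each_possible_cpu(cpu)
			sum += kstat_irqs_cpu(irq, cpu);
		return sum;
	}
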
diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h
index 40416a81a0f5..01ce20eab38f 100644
--- a/kernel/irq/internals.h
+++ b/kernel/irq/internals.h
@@ -12,9 +12,12 @@ extern void compat_irq_chip_set_default_handler(struct irq_desc *desc);
12 12
13extern int __irq_set_trigger(struct irq_desc *desc, unsigned int irq, 13extern int __irq_set_trigger(struct irq_desc *desc, unsigned int irq,
14 unsigned long flags); 14 unsigned long flags);
15extern void __disable_irq(struct irq_desc *desc, unsigned int irq, bool susp);
16extern void __enable_irq(struct irq_desc *desc, unsigned int irq, bool resume);
15 17
16extern struct lock_class_key irq_desc_lock_class; 18extern struct lock_class_key irq_desc_lock_class;
17extern void init_kstat_irqs(struct irq_desc *desc, int cpu, int nr); 19extern void init_kstat_irqs(struct irq_desc *desc, int cpu, int nr);
20extern void clear_kstat_irqs(struct irq_desc *desc);
18extern spinlock_t sparse_irq_lock; 21extern spinlock_t sparse_irq_lock;
19 22
20#ifdef CONFIG_SPARSE_IRQ 23#ifdef CONFIG_SPARSE_IRQ
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index a3a5dc9ef346..1516ab77355c 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -109,7 +109,7 @@ int irq_set_affinity(unsigned int irq, const struct cpumask *cpumask)
109/* 109/*
110 * Generic version of the affinity autoselector. 110 * Generic version of the affinity autoselector.
111 */ 111 */
112int do_irq_select_affinity(unsigned int irq, struct irq_desc *desc) 112static int setup_affinity(unsigned int irq, struct irq_desc *desc)
113{ 113{
114 if (!irq_can_set_affinity(irq)) 114 if (!irq_can_set_affinity(irq))
115 return 0; 115 return 0;
@@ -133,7 +133,7 @@ set_affinity:
133 return 0; 133 return 0;
134} 134}
135#else 135#else
136static inline int do_irq_select_affinity(unsigned int irq, struct irq_desc *d) 136static inline int setup_affinity(unsigned int irq, struct irq_desc *d)
137{ 137{
138 return irq_select_affinity(irq); 138 return irq_select_affinity(irq);
139} 139}
@@ -149,19 +149,33 @@ int irq_select_affinity_usr(unsigned int irq)
149 int ret; 149 int ret;
150 150
151 spin_lock_irqsave(&desc->lock, flags); 151 spin_lock_irqsave(&desc->lock, flags);
152 ret = do_irq_select_affinity(irq, desc); 152 ret = setup_affinity(irq, desc);
153 spin_unlock_irqrestore(&desc->lock, flags); 153 spin_unlock_irqrestore(&desc->lock, flags);
154 154
155 return ret; 155 return ret;
156} 156}
157 157
158#else 158#else
159static inline int do_irq_select_affinity(int irq, struct irq_desc *desc) 159static inline int setup_affinity(unsigned int irq, struct irq_desc *desc)
160{ 160{
161 return 0; 161 return 0;
162} 162}
163#endif 163#endif
164 164
165void __disable_irq(struct irq_desc *desc, unsigned int irq, bool suspend)
166{
167 if (suspend) {
168 if (!desc->action || (desc->action->flags & IRQF_TIMER))
169 return;
170 desc->status |= IRQ_SUSPENDED;
171 }
172
173 if (!desc->depth++) {
174 desc->status |= IRQ_DISABLED;
175 desc->chip->disable(irq);
176 }
177}
178
165/** 179/**
166 * disable_irq_nosync - disable an irq without waiting 180 * disable_irq_nosync - disable an irq without waiting
167 * @irq: Interrupt to disable 181 * @irq: Interrupt to disable
@@ -182,10 +196,7 @@ void disable_irq_nosync(unsigned int irq)
182 return; 196 return;
183 197
184 spin_lock_irqsave(&desc->lock, flags); 198 spin_lock_irqsave(&desc->lock, flags);
185 if (!desc->depth++) { 199 __disable_irq(desc, irq, false);
186 desc->status |= IRQ_DISABLED;
187 desc->chip->disable(irq);
188 }
189 spin_unlock_irqrestore(&desc->lock, flags); 200 spin_unlock_irqrestore(&desc->lock, flags);
190} 201}
191EXPORT_SYMBOL(disable_irq_nosync); 202EXPORT_SYMBOL(disable_irq_nosync);
@@ -215,15 +226,21 @@ void disable_irq(unsigned int irq)
215} 226}
216EXPORT_SYMBOL(disable_irq); 227EXPORT_SYMBOL(disable_irq);
217 228
218static void __enable_irq(struct irq_desc *desc, unsigned int irq) 229void __enable_irq(struct irq_desc *desc, unsigned int irq, bool resume)
219{ 230{
231 if (resume)
232 desc->status &= ~IRQ_SUSPENDED;
233
220 switch (desc->depth) { 234 switch (desc->depth) {
221 case 0: 235 case 0:
236 err_out:
222 WARN(1, KERN_WARNING "Unbalanced enable for IRQ %d\n", irq); 237 WARN(1, KERN_WARNING "Unbalanced enable for IRQ %d\n", irq);
223 break; 238 break;
224 case 1: { 239 case 1: {
225 unsigned int status = desc->status & ~IRQ_DISABLED; 240 unsigned int status = desc->status & ~IRQ_DISABLED;
226 241
242 if (desc->status & IRQ_SUSPENDED)
243 goto err_out;
227 /* Prevent probing on this irq: */ 244 /* Prevent probing on this irq: */
228 desc->status = status | IRQ_NOPROBE; 245 desc->status = status | IRQ_NOPROBE;
229 check_irq_resend(desc, irq); 246 check_irq_resend(desc, irq);
@@ -253,7 +270,7 @@ void enable_irq(unsigned int irq)
253 return; 270 return;
254 271
255 spin_lock_irqsave(&desc->lock, flags); 272 spin_lock_irqsave(&desc->lock, flags);
256 __enable_irq(desc, irq); 273 __enable_irq(desc, irq, false);
257 spin_unlock_irqrestore(&desc->lock, flags); 274 spin_unlock_irqrestore(&desc->lock, flags);
258} 275}
259EXPORT_SYMBOL(enable_irq); 276EXPORT_SYMBOL(enable_irq);
@@ -389,9 +406,9 @@ int __irq_set_trigger(struct irq_desc *desc, unsigned int irq,
389 * allocate special interrupts that are part of the architecture. 406 * allocate special interrupts that are part of the architecture.
390 */ 407 */
391static int 408static int
392__setup_irq(unsigned int irq, struct irq_desc * desc, struct irqaction *new) 409__setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new)
393{ 410{
394 struct irqaction *old, **p; 411 struct irqaction *old, **old_ptr;
395 const char *old_name = NULL; 412 const char *old_name = NULL;
396 unsigned long flags; 413 unsigned long flags;
397 int shared = 0; 414 int shared = 0;
@@ -423,8 +440,8 @@ __setup_irq(unsigned int irq, struct irq_desc * desc, struct irqaction *new)
423 * The following block of code has to be executed atomically 440 * The following block of code has to be executed atomically
424 */ 441 */
425 spin_lock_irqsave(&desc->lock, flags); 442 spin_lock_irqsave(&desc->lock, flags);
426 p = &desc->action; 443 old_ptr = &desc->action;
427 old = *p; 444 old = *old_ptr;
428 if (old) { 445 if (old) {
429 /* 446 /*
430 * Can't share interrupts unless both agree to and are 447 * Can't share interrupts unless both agree to and are
@@ -447,8 +464,8 @@ __setup_irq(unsigned int irq, struct irq_desc * desc, struct irqaction *new)
447 464
448 /* add new interrupt at end of irq queue */ 465 /* add new interrupt at end of irq queue */
449 do { 466 do {
450 p = &old->next; 467 old_ptr = &old->next;
451 old = *p; 468 old = *old_ptr;
452 } while (old); 469 } while (old);
453 shared = 1; 470 shared = 1;
454 } 471 }
@@ -488,7 +505,7 @@ __setup_irq(unsigned int irq, struct irq_desc * desc, struct irqaction *new)
488 desc->status |= IRQ_NO_BALANCING; 505 desc->status |= IRQ_NO_BALANCING;
489 506
490 /* Set default affinity mask once everything is setup */ 507 /* Set default affinity mask once everything is setup */
491 do_irq_select_affinity(irq, desc); 508 setup_affinity(irq, desc);
492 509
493 } else if ((new->flags & IRQF_TRIGGER_MASK) 510 } else if ((new->flags & IRQF_TRIGGER_MASK)
494 && (new->flags & IRQF_TRIGGER_MASK) 511 && (new->flags & IRQF_TRIGGER_MASK)
@@ -499,7 +516,7 @@ __setup_irq(unsigned int irq, struct irq_desc * desc, struct irqaction *new)
499 (int)(new->flags & IRQF_TRIGGER_MASK)); 516 (int)(new->flags & IRQF_TRIGGER_MASK));
500 } 517 }
501 518
502 *p = new; 519 *old_ptr = new;
503 520
504 /* Reset broken irq detection when installing new handler */ 521 /* Reset broken irq detection when installing new handler */
505 desc->irq_count = 0; 522 desc->irq_count = 0;
@@ -511,7 +528,7 @@ __setup_irq(unsigned int irq, struct irq_desc * desc, struct irqaction *new)
511 */ 528 */
512 if (shared && (desc->status & IRQ_SPURIOUS_DISABLED)) { 529 if (shared && (desc->status & IRQ_SPURIOUS_DISABLED)) {
513 desc->status &= ~IRQ_SPURIOUS_DISABLED; 530 desc->status &= ~IRQ_SPURIOUS_DISABLED;
514 __enable_irq(desc, irq); 531 __enable_irq(desc, irq, false);
515 } 532 }
516 533
517 spin_unlock_irqrestore(&desc->lock, flags); 534 spin_unlock_irqrestore(&desc->lock, flags);
@@ -549,90 +566,117 @@ int setup_irq(unsigned int irq, struct irqaction *act)
549 566
550 return __setup_irq(irq, desc, act); 567 return __setup_irq(irq, desc, act);
551} 568}
569EXPORT_SYMBOL_GPL(setup_irq);
552 570
553/** 571 /*
554 * free_irq - free an interrupt 572 * Internal function to unregister an irqaction - used to free
555 * @irq: Interrupt line to free 573 * regular and special interrupts that are part of the architecture.
556 * @dev_id: Device identity to free
557 *
558 * Remove an interrupt handler. The handler is removed and if the
559 * interrupt line is no longer in use by any driver it is disabled.
560 * On a shared IRQ the caller must ensure the interrupt is disabled
561 * on the card it drives before calling this function. The function
562 * does not return until any executing interrupts for this IRQ
563 * have completed.
564 *
565 * This function must not be called from interrupt context.
566 */ 574 */
567void free_irq(unsigned int irq, void *dev_id) 575static struct irqaction *__free_irq(unsigned int irq, void *dev_id)
568{ 576{
569 struct irq_desc *desc = irq_to_desc(irq); 577 struct irq_desc *desc = irq_to_desc(irq);
570 struct irqaction **p; 578 struct irqaction *action, **action_ptr;
571 unsigned long flags; 579 unsigned long flags;
572 580
573 WARN_ON(in_interrupt()); 581 WARN(in_interrupt(), "Trying to free IRQ %d from IRQ context!\n", irq);
574 582
575 if (!desc) 583 if (!desc)
576 return; 584 return NULL;
577 585
578 spin_lock_irqsave(&desc->lock, flags); 586 spin_lock_irqsave(&desc->lock, flags);
579 p = &desc->action; 587
588 /*
589 * There can be multiple actions per IRQ descriptor, find the right
590 * one based on the dev_id:
591 */
592 action_ptr = &desc->action;
580 for (;;) { 593 for (;;) {
581 struct irqaction *action = *p; 594 action = *action_ptr;
582 595
583 if (action) { 596 if (!action) {
584 struct irqaction **pp = p; 597 WARN(1, "Trying to free already-free IRQ %d\n", irq);
598 spin_unlock_irqrestore(&desc->lock, flags);
585 599
586 p = &action->next; 600 return NULL;
587 if (action->dev_id != dev_id) 601 }
588 continue;
589 602
590 /* Found it - now remove it from the list of entries */ 603 if (action->dev_id == dev_id)
591 *pp = action->next; 604 break;
605 action_ptr = &action->next;
606 }
592 607
593 /* Currently used only by UML, might disappear one day.*/ 608 /* Found it - now remove it from the list of entries: */
609 *action_ptr = action->next;
610
611 /* Currently used only by UML, might disappear one day: */
594#ifdef CONFIG_IRQ_RELEASE_METHOD 612#ifdef CONFIG_IRQ_RELEASE_METHOD
595 if (desc->chip->release) 613 if (desc->chip->release)
596 desc->chip->release(irq, dev_id); 614 desc->chip->release(irq, dev_id);
597#endif 615#endif
598 616
599 if (!desc->action) { 617 /* If this was the last handler, shut down the IRQ line: */
600 desc->status |= IRQ_DISABLED; 618 if (!desc->action) {
601 if (desc->chip->shutdown) 619 desc->status |= IRQ_DISABLED;
602 desc->chip->shutdown(irq); 620 if (desc->chip->shutdown)
603 else 621 desc->chip->shutdown(irq);
604 desc->chip->disable(irq); 622 else
605 } 623 desc->chip->disable(irq);
606 spin_unlock_irqrestore(&desc->lock, flags); 624 }
607 unregister_handler_proc(irq, action); 625 spin_unlock_irqrestore(&desc->lock, flags);
626
627 unregister_handler_proc(irq, action);
628
629 /* Make sure it's not being used on another CPU: */
630 synchronize_irq(irq);
608 631
609 /* Make sure it's not being used on another CPU */
610 synchronize_irq(irq);
611#ifdef CONFIG_DEBUG_SHIRQ
612 /*
613 * It's a shared IRQ -- the driver ought to be
614 * prepared for it to happen even now it's
615 * being freed, so let's make sure.... We do
616 * this after actually deregistering it, to
617 * make sure that a 'real' IRQ doesn't run in
618 * parallel with our fake
619 */
620 if (action->flags & IRQF_SHARED) {
621 local_irq_save(flags);
622 action->handler(irq, dev_id);
623 local_irq_restore(flags);
624 }
625#endif
626 kfree(action);
627 return;
628 }
629 printk(KERN_ERR "Trying to free already-free IRQ %d\n", irq);
630#ifdef CONFIG_DEBUG_SHIRQ 632#ifdef CONFIG_DEBUG_SHIRQ
631 dump_stack(); 633 /*
632#endif 634 * It's a shared IRQ -- the driver ought to be prepared for an IRQ
633 spin_unlock_irqrestore(&desc->lock, flags); 635 * event to happen even now it's being freed, so let's make sure that
 634	return; 636	 * is so by doing an extra call to the handler ...
637 *
638 * ( We do this after actually deregistering it, to make sure that a
 639	 * 'real' IRQ doesn't run in parallel with our fake. )
640 */
641 if (action->flags & IRQF_SHARED) {
642 local_irq_save(flags);
643 action->handler(irq, dev_id);
644 local_irq_restore(flags);
635 } 645 }
646#endif
647 return action;
648}
649
650/**
651 * remove_irq - free an interrupt
652 * @irq: Interrupt line to free
653 * @act: irqaction for the interrupt
654 *
 655 * Used to remove interrupts statically set up by the early boot process.
656 */
657void remove_irq(unsigned int irq, struct irqaction *act)
658{
659 __free_irq(irq, act->dev_id);
660}
661EXPORT_SYMBOL_GPL(remove_irq);
662
663/**
664 * free_irq - free an interrupt allocated with request_irq
665 * @irq: Interrupt line to free
666 * @dev_id: Device identity to free
667 *
668 * Remove an interrupt handler. The handler is removed and if the
669 * interrupt line is no longer in use by any driver it is disabled.
670 * On a shared IRQ the caller must ensure the interrupt is disabled
671 * on the card it drives before calling this function. The function
672 * does not return until any executing interrupts for this IRQ
673 * have completed.
674 *
675 * This function must not be called from interrupt context.
676 */
677void free_irq(unsigned int irq, void *dev_id)
678{
679 kfree(__free_irq(irq, dev_id));
636} 680}
637EXPORT_SYMBOL(free_irq); 681EXPORT_SYMBOL(free_irq);
638 682
@@ -679,11 +723,12 @@ int request_irq(unsigned int irq, irq_handler_t handler,
679 * the behavior is classified as "will not fix" so we need to 723 * the behavior is classified as "will not fix" so we need to
680 * start nudging drivers away from using that idiom. 724 * start nudging drivers away from using that idiom.
681 */ 725 */
682 if ((irqflags & (IRQF_SHARED|IRQF_DISABLED)) 726 if ((irqflags & (IRQF_SHARED|IRQF_DISABLED)) ==
683 == (IRQF_SHARED|IRQF_DISABLED)) 727 (IRQF_SHARED|IRQF_DISABLED)) {
684 pr_warning("IRQ %d/%s: IRQF_DISABLED is not " 728 pr_warning(
685 "guaranteed on shared IRQs\n", 729 "IRQ %d/%s: IRQF_DISABLED is not guaranteed on shared IRQs\n",
686 irq, devname); 730 irq, devname);
731 }
687 732
688#ifdef CONFIG_LOCKDEP 733#ifdef CONFIG_LOCKDEP
689 /* 734 /*
@@ -709,15 +754,13 @@ int request_irq(unsigned int irq, irq_handler_t handler,
709 if (!handler) 754 if (!handler)
710 return -EINVAL; 755 return -EINVAL;
711 756
712 action = kmalloc(sizeof(struct irqaction), GFP_ATOMIC); 757 action = kzalloc(sizeof(struct irqaction), GFP_KERNEL);
713 if (!action) 758 if (!action)
714 return -ENOMEM; 759 return -ENOMEM;
715 760
716 action->handler = handler; 761 action->handler = handler;
717 action->flags = irqflags; 762 action->flags = irqflags;
718 cpus_clear(action->mask);
719 action->name = devname; 763 action->name = devname;
720 action->next = NULL;
721 action->dev_id = dev_id; 764 action->dev_id = dev_id;
722 765
723 retval = __setup_irq(irq, desc, action); 766 retval = __setup_irq(irq, desc, action);
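
The __free_irq() factoring above does not change the driver-visible contract: the dev_id cookie passed to request_irq() is what selects the action that free_irq() removes from a shared line, and since the action is now kzalloc()ed, the explicit clearing of ->mask and ->next could be dropped. A minimal usage sketch, where foo_dev, foo_isr, foo_attach and foo_detach are all hypothetical:

	#include <linux/interrupt.h>

	struct foo_dev { void __iomem *regs; };

	static irqreturn_t foo_isr(int irq, void *dev_id)
	{
		struct foo_dev *fd = dev_id;

		(void)fd;	/* a real driver would ack hardware via fd->regs */
		return IRQ_HANDLED;
	}

	static int foo_attach(struct foo_dev *fd, unsigned int irq)
	{
		/* the irqaction behind this is zero-initialized by kzalloc() now */
		return request_irq(irq, foo_isr, IRQF_SHARED, "foo", fd);
	}

	static void foo_detach(struct foo_dev *fd, unsigned int irq)
	{
		free_irq(irq, fd);	/* dev_id must match the request_irq() cookie */
	}
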
diff --git a/kernel/irq/numa_migrate.c b/kernel/irq/numa_migrate.c
index 7f9b80434e32..243d6121e50e 100644
--- a/kernel/irq/numa_migrate.c
+++ b/kernel/irq/numa_migrate.c
@@ -17,16 +17,11 @@ static void init_copy_kstat_irqs(struct irq_desc *old_desc,
17 struct irq_desc *desc, 17 struct irq_desc *desc,
18 int cpu, int nr) 18 int cpu, int nr)
19{ 19{
20 unsigned long bytes;
21
22 init_kstat_irqs(desc, cpu, nr); 20 init_kstat_irqs(desc, cpu, nr);
23 21
24 if (desc->kstat_irqs != old_desc->kstat_irqs) { 22 if (desc->kstat_irqs != old_desc->kstat_irqs)
25 /* Compute how many bytes we need per irq and allocate them */ 23 memcpy(desc->kstat_irqs, old_desc->kstat_irqs,
26 bytes = nr * sizeof(unsigned int); 24 nr * sizeof(*desc->kstat_irqs));
27
28 memcpy(desc->kstat_irqs, old_desc->kstat_irqs, bytes);
29 }
30} 25}
31 26
32static void free_kstat_irqs(struct irq_desc *old_desc, struct irq_desc *desc) 27static void free_kstat_irqs(struct irq_desc *old_desc, struct irq_desc *desc)
diff --git a/kernel/irq/pm.c b/kernel/irq/pm.c
new file mode 100644
index 000000000000..638d8bedec14
--- /dev/null
+++ b/kernel/irq/pm.c
@@ -0,0 +1,79 @@
1/*
2 * linux/kernel/irq/pm.c
3 *
4 * Copyright (C) 2009 Rafael J. Wysocki <rjw@sisk.pl>, Novell Inc.
5 *
6 * This file contains power management functions related to interrupts.
7 */
8
9#include <linux/irq.h>
10#include <linux/module.h>
11#include <linux/interrupt.h>
12
13#include "internals.h"
14
15/**
16 * suspend_device_irqs - disable all currently enabled interrupt lines
17 *
18 * During system-wide suspend or hibernation, device interrupts need to be
19 * disabled at the chip level, and this function is provided for that purpose.
20 * It disables all interrupt lines that are enabled at the moment and sets the
21 * IRQ_SUSPENDED flag for them.
22 */
23void suspend_device_irqs(void)
24{
25 struct irq_desc *desc;
26 int irq;
27
28 for_each_irq_desc(irq, desc) {
29 unsigned long flags;
30
31 spin_lock_irqsave(&desc->lock, flags);
32 __disable_irq(desc, irq, true);
33 spin_unlock_irqrestore(&desc->lock, flags);
34 }
35
36 for_each_irq_desc(irq, desc)
37 if (desc->status & IRQ_SUSPENDED)
38 synchronize_irq(irq);
39}
40EXPORT_SYMBOL_GPL(suspend_device_irqs);
41
42/**
43 * resume_device_irqs - enable interrupt lines disabled by suspend_device_irqs()
44 *
45 * Enable all interrupt lines previously disabled by suspend_device_irqs() that
46 * have the IRQ_SUSPENDED flag set.
47 */
48void resume_device_irqs(void)
49{
50 struct irq_desc *desc;
51 int irq;
52
53 for_each_irq_desc(irq, desc) {
54 unsigned long flags;
55
56 if (!(desc->status & IRQ_SUSPENDED))
57 continue;
58
59 spin_lock_irqsave(&desc->lock, flags);
60 __enable_irq(desc, irq, true);
61 spin_unlock_irqrestore(&desc->lock, flags);
62 }
63}
64EXPORT_SYMBOL_GPL(resume_device_irqs);
65
66/**
67 * check_wakeup_irqs - check if any wake-up interrupts are pending
68 */
69int check_wakeup_irqs(void)
70{
71 struct irq_desc *desc;
72 int irq;
73
74 for_each_irq_desc(irq, desc)
75 if ((desc->status & IRQ_WAKEUP) && (desc->status & IRQ_PENDING))
76 return -EBUSY;
77
78 return 0;
79}
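
For orientation, this is roughly the order in which the new pm.c helpers are meant to be used when entering a sleep state. The sketch below is illustrative only; do_platform_sleep() is a hypothetical stand-in for the real low-level suspend hook:

	#include <linux/interrupt.h>

	static int do_platform_sleep(void);	/* hypothetical low-level hook */

	static int example_enter_sleep(void)
	{
		int error;

		/* mask every enabled line and mark it IRQ_SUSPENDED */
		suspend_device_irqs();

		/* bail out if a wake-up interrupt is already pending (-EBUSY) */
		error = check_wakeup_irqs();
		if (!error)
			error = do_platform_sleep();

		/* unmask only the lines that suspend_device_irqs() touched */
		resume_device_irqs();
		return error;
	}
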
diff --git a/kernel/irq/spurious.c b/kernel/irq/spurious.c
index dd364c11e56e..4d568294de3e 100644
--- a/kernel/irq/spurious.c
+++ b/kernel/irq/spurious.c
@@ -104,7 +104,7 @@ static int misrouted_irq(int irq)
104 return ok; 104 return ok;
105} 105}
106 106
107static void poll_spurious_irqs(unsigned long dummy) 107static void poll_all_shared_irqs(void)
108{ 108{
109 struct irq_desc *desc; 109 struct irq_desc *desc;
110 int i; 110 int i;
@@ -123,11 +123,23 @@ static void poll_spurious_irqs(unsigned long dummy)
123 123
124 try_one_irq(i, desc); 124 try_one_irq(i, desc);
125 } 125 }
126}
127
128static void poll_spurious_irqs(unsigned long dummy)
129{
130 poll_all_shared_irqs();
126 131
127 mod_timer(&poll_spurious_irq_timer, 132 mod_timer(&poll_spurious_irq_timer,
128 jiffies + POLL_SPURIOUS_IRQ_INTERVAL); 133 jiffies + POLL_SPURIOUS_IRQ_INTERVAL);
129} 134}
130 135
136#ifdef CONFIG_DEBUG_SHIRQ
137void debug_poll_all_shared_irqs(void)
138{
139 poll_all_shared_irqs();
140}
141#endif
142
131/* 143/*
132 * If 99,900 of the previous 100,000 interrupts have not been handled 144 * If 99,900 of the previous 100,000 interrupts have not been handled
133 * then assume that the IRQ is stuck in some manner. Drop a diagnostic 145 * then assume that the IRQ is stuck in some manner. Drop a diagnostic
diff --git a/kernel/kallsyms.c b/kernel/kallsyms.c
index 7b8b0f21a5b1..374faf9bfdc7 100644
--- a/kernel/kallsyms.c
+++ b/kernel/kallsyms.c
@@ -161,6 +161,25 @@ unsigned long kallsyms_lookup_name(const char *name)
161 return module_kallsyms_lookup_name(name); 161 return module_kallsyms_lookup_name(name);
162} 162}
163 163
164int kallsyms_on_each_symbol(int (*fn)(void *, const char *, struct module *,
165 unsigned long),
166 void *data)
167{
168 char namebuf[KSYM_NAME_LEN];
169 unsigned long i;
170 unsigned int off;
171 int ret;
172
173 for (i = 0, off = 0; i < kallsyms_num_syms; i++) {
174 off = kallsyms_expand_symbol(off, namebuf);
175 ret = fn(data, namebuf, NULL, kallsyms_addresses[i]);
176 if (ret != 0)
177 return ret;
178 }
179 return module_kallsyms_on_each_symbol(fn, data);
180}
181EXPORT_SYMBOL_GPL(kallsyms_on_each_symbol);
182
164static unsigned long get_symbol_pos(unsigned long addr, 183static unsigned long get_symbol_pos(unsigned long addr,
165 unsigned long *symbolsize, 184 unsigned long *symbolsize,
166 unsigned long *offset) 185 unsigned long *offset)
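
The new kallsyms_on_each_symbol() walks the core symbol table (passing a NULL module) and then delegates to module_kallsyms_on_each_symbol(); a non-zero return from the callback stops the walk. A small caller sketch, with sym_match and find_named_symbol as hypothetical names:

	#include <linux/kallsyms.h>
	#include <linux/module.h>
	#include <linux/string.h>

	struct sym_match {
		const char *name;
		unsigned long addr;
	};

	/* Returning non-zero here terminates the walk early. */
	static int match_symbol(void *data, const char *name, struct module *mod,
				unsigned long addr)
	{
		struct sym_match *m = data;

		if (strcmp(name, m->name))
			return 0;
		m->addr = addr;
		return 1;
	}

	static unsigned long find_named_symbol(const char *name)
	{
		struct sym_match m = { .name = name, .addr = 0 };

		kallsyms_on_each_symbol(match_symbol, &m);
		return m.addr;
	}
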
diff --git a/kernel/kexec.c b/kernel/kexec.c
index c7fd6692939d..5a758c6e4950 100644
--- a/kernel/kexec.c
+++ b/kernel/kexec.c
@@ -42,7 +42,7 @@
42note_buf_t* crash_notes; 42note_buf_t* crash_notes;
43 43
44/* vmcoreinfo stuff */ 44/* vmcoreinfo stuff */
45unsigned char vmcoreinfo_data[VMCOREINFO_BYTES]; 45static unsigned char vmcoreinfo_data[VMCOREINFO_BYTES];
46u32 vmcoreinfo_note[VMCOREINFO_NOTE_SIZE/4]; 46u32 vmcoreinfo_note[VMCOREINFO_NOTE_SIZE/4];
47size_t vmcoreinfo_size; 47size_t vmcoreinfo_size;
48size_t vmcoreinfo_max_size = sizeof(vmcoreinfo_data); 48size_t vmcoreinfo_max_size = sizeof(vmcoreinfo_data);
@@ -1409,6 +1409,7 @@ static int __init crash_save_vmcoreinfo_init(void)
1409 VMCOREINFO_OFFSET(list_head, prev); 1409 VMCOREINFO_OFFSET(list_head, prev);
1410 VMCOREINFO_OFFSET(vm_struct, addr); 1410 VMCOREINFO_OFFSET(vm_struct, addr);
1411 VMCOREINFO_LENGTH(zone.free_area, MAX_ORDER); 1411 VMCOREINFO_LENGTH(zone.free_area, MAX_ORDER);
1412 log_buf_kexec_setup();
1412 VMCOREINFO_LENGTH(free_area.free_list, MIGRATE_TYPES); 1413 VMCOREINFO_LENGTH(free_area.free_list, MIGRATE_TYPES);
1413 VMCOREINFO_NUMBER(NR_FREE_PAGES); 1414 VMCOREINFO_NUMBER(NR_FREE_PAGES);
1414 VMCOREINFO_NUMBER(PG_lru); 1415 VMCOREINFO_NUMBER(PG_lru);
@@ -1450,11 +1451,7 @@ int kernel_kexec(void)
1450 error = device_suspend(PMSG_FREEZE); 1451 error = device_suspend(PMSG_FREEZE);
1451 if (error) 1452 if (error)
1452 goto Resume_console; 1453 goto Resume_console;
1453 error = disable_nonboot_cpus();
1454 if (error)
1455 goto Resume_devices;
1456 device_pm_lock(); 1454 device_pm_lock();
1457 local_irq_disable();
1458 /* At this point, device_suspend() has been called, 1455 /* At this point, device_suspend() has been called,
1459 * but *not* device_power_down(). We *must* 1456 * but *not* device_power_down(). We *must*
1460 * device_power_down() now. Otherwise, drivers for 1457 * device_power_down() now. Otherwise, drivers for
@@ -1464,12 +1461,15 @@ int kernel_kexec(void)
1464 */ 1461 */
1465 error = device_power_down(PMSG_FREEZE); 1462 error = device_power_down(PMSG_FREEZE);
1466 if (error) 1463 if (error)
1467 goto Enable_irqs; 1464 goto Resume_devices;
1468 1465 error = disable_nonboot_cpus();
1466 if (error)
1467 goto Enable_cpus;
1468 local_irq_disable();
1469 /* Suspend system devices */ 1469 /* Suspend system devices */
1470 error = sysdev_suspend(PMSG_FREEZE); 1470 error = sysdev_suspend(PMSG_FREEZE);
1471 if (error) 1471 if (error)
1472 goto Power_up_devices; 1472 goto Enable_irqs;
1473 } else 1473 } else
1474#endif 1474#endif
1475 { 1475 {
@@ -1483,13 +1483,13 @@ int kernel_kexec(void)
1483#ifdef CONFIG_KEXEC_JUMP 1483#ifdef CONFIG_KEXEC_JUMP
1484 if (kexec_image->preserve_context) { 1484 if (kexec_image->preserve_context) {
1485 sysdev_resume(); 1485 sysdev_resume();
1486 Power_up_devices:
1487 device_power_up(PMSG_RESTORE);
1488 Enable_irqs: 1486 Enable_irqs:
1489 local_irq_enable(); 1487 local_irq_enable();
1490 device_pm_unlock(); 1488 Enable_cpus:
1491 enable_nonboot_cpus(); 1489 enable_nonboot_cpus();
1490 device_power_up(PMSG_RESTORE);
1492 Resume_devices: 1491 Resume_devices:
1492 device_pm_unlock();
1493 device_resume(PMSG_RESTORE); 1493 device_resume(PMSG_RESTORE);
1494 Resume_console: 1494 Resume_console:
1495 resume_console(); 1495 resume_console();
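
The hunks above mainly reshuffle the error labels so that kernel_kexec() unwinds in strict reverse order of setup: CPUs are now disabled after device_power_down() and re-enabled before device_power_up(). The general shape of this LIFO goto-unwind idiom, with placeholder steps standing in for the real calls, is:

	/* Placeholders for device_power_down(), disable_nonboot_cpus(), etc. */
	static int step_a(void) { return 0; }
	static int step_b(void) { return 0; }
	static int step_c(void) { return 0; }
	static void undo_step_a(void) { }
	static void undo_step_b(void) { }

	static int example_transaction(void)
	{
		int error;

		error = step_a();
		if (error)
			goto out;
		error = step_b();
		if (error)
			goto undo_a;
		error = step_c();
		if (error)
			goto undo_b;
		return 0;	/* success: nothing is unwound */

	undo_b:
		undo_step_b();	/* undo in reverse order of acquisition */
	undo_a:
		undo_step_a();
	out:
		return error;
	}
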
diff --git a/kernel/kmod.c b/kernel/kmod.c
index a27a5f64443d..b750675251e5 100644
--- a/kernel/kmod.c
+++ b/kernel/kmod.c
@@ -50,7 +50,8 @@ static struct workqueue_struct *khelper_wq;
50char modprobe_path[KMOD_PATH_LEN] = "/sbin/modprobe"; 50char modprobe_path[KMOD_PATH_LEN] = "/sbin/modprobe";
51 51
52/** 52/**
53 * request_module - try to load a kernel module 53 * __request_module - try to load a kernel module
54 * @wait: wait (or not) for the operation to complete
54 * @fmt: printf style format string for the name of the module 55 * @fmt: printf style format string for the name of the module
55 * @...: arguments as specified in the format string 56 * @...: arguments as specified in the format string
56 * 57 *
@@ -63,7 +64,7 @@ char modprobe_path[KMOD_PATH_LEN] = "/sbin/modprobe";
63 * If module auto-loading support is disabled then this function 64 * If module auto-loading support is disabled then this function
64 * becomes a no-operation. 65 * becomes a no-operation.
65 */ 66 */
66int request_module(const char *fmt, ...) 67int __request_module(bool wait, const char *fmt, ...)
67{ 68{
68 va_list args; 69 va_list args;
69 char module_name[MODULE_NAME_LEN]; 70 char module_name[MODULE_NAME_LEN];
@@ -108,11 +109,12 @@ int request_module(const char *fmt, ...)
108 return -ENOMEM; 109 return -ENOMEM;
109 } 110 }
110 111
111 ret = call_usermodehelper(modprobe_path, argv, envp, 1); 112 ret = call_usermodehelper(modprobe_path, argv, envp,
113 wait ? UMH_WAIT_PROC : UMH_WAIT_EXEC);
112 atomic_dec(&kmod_concurrent); 114 atomic_dec(&kmod_concurrent);
113 return ret; 115 return ret;
114} 116}
115EXPORT_SYMBOL(request_module); 117EXPORT_SYMBOL(__request_module);
116#endif /* CONFIG_MODULES */ 118#endif /* CONFIG_MODULES */
117 119
118struct subprocess_info { 120struct subprocess_info {
@@ -167,7 +169,7 @@ static int ____call_usermodehelper(void *data)
167 } 169 }
168 170
169 /* We can run anywhere, unlike our parent keventd(). */ 171 /* We can run anywhere, unlike our parent keventd(). */
170 set_cpus_allowed_ptr(current, CPU_MASK_ALL_PTR); 172 set_cpus_allowed_ptr(current, cpu_all_mask);
171 173
172 /* 174 /*
173 * Our parent is keventd, which runs with elevated scheduling priority. 175 * Our parent is keventd, which runs with elevated scheduling priority.
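
With the kmod.c change, the old synchronous request_module() behaviour corresponds to __request_module(true, ...), which blocks until modprobe exits (UMH_WAIT_PROC), while passing false only waits for modprobe to be spawned (UMH_WAIT_EXEC). The header side of the patch is not shown here; a rough usage sketch, with hypothetical callers and the existing "fs-%s"/"crypto-%s" alias conventions:

	#include <linux/kmod.h>

	/* Hypothetical caller: block until the filesystem module is loaded. */
	static int load_fs_module(const char *fstype)
	{
		return __request_module(true, "fs-%s", fstype);
	}

	/* Hypothetical caller: fire-and-forget load of a crypto module. */
	static void kick_crypto_module(const char *alg)
	{
		__request_module(false, "crypto-%s", alg);
	}
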
diff --git a/kernel/kthread.c b/kernel/kthread.c
index 4fbc456f393d..84bbadd4d021 100644
--- a/kernel/kthread.c
+++ b/kernel/kthread.c
@@ -110,7 +110,7 @@ static void create_kthread(struct kthread_create_info *create)
110 */ 110 */
111 sched_setscheduler(create->result, SCHED_NORMAL, &param); 111 sched_setscheduler(create->result, SCHED_NORMAL, &param);
112 set_user_nice(create->result, KTHREAD_NICE_LEVEL); 112 set_user_nice(create->result, KTHREAD_NICE_LEVEL);
113 set_cpus_allowed_ptr(create->result, CPU_MASK_ALL_PTR); 113 set_cpus_allowed_ptr(create->result, cpu_all_mask);
114 } 114 }
115 complete(&create->done); 115 complete(&create->done);
116} 116}
@@ -240,7 +240,7 @@ int kthreadd(void *unused)
240 set_task_comm(tsk, "kthreadd"); 240 set_task_comm(tsk, "kthreadd");
241 ignore_signals(tsk); 241 ignore_signals(tsk);
242 set_user_nice(tsk, KTHREAD_NICE_LEVEL); 242 set_user_nice(tsk, KTHREAD_NICE_LEVEL);
243 set_cpus_allowed_ptr(tsk, CPU_MASK_ALL_PTR); 243 set_cpus_allowed_ptr(tsk, cpu_all_mask);
244 244
245 current->flags |= PF_NOFREEZE | PF_FREEZER_NOSIG; 245 current->flags |= PF_NOFREEZE | PF_FREEZER_NOSIG;
246 246
diff --git a/kernel/latencytop.c b/kernel/latencytop.c
index 449db466bdbc..ca07c5c0c914 100644
--- a/kernel/latencytop.c
+++ b/kernel/latencytop.c
@@ -9,6 +9,44 @@
9 * as published by the Free Software Foundation; version 2 9 * as published by the Free Software Foundation; version 2
10 * of the License. 10 * of the License.
11 */ 11 */
12
13/*
14 * CONFIG_LATENCYTOP enables a kernel latency tracking infrastructure that is
15 * used by the "latencytop" userspace tool. The latency that is tracked is not
16 * the 'traditional' interrupt latency (which is primarily caused by something
17 * else consuming CPU), but instead, it is the latency an application encounters
18 * because the kernel sleeps on its behalf for various reasons.
19 *
20 * This code tracks 2 levels of statistics:
21 * 1) System level latency
22 * 2) Per process latency
23 *
24 * The latency is stored in fixed sized data structures in an accumulated form;
25 * if the "same" latency cause is hit twice, this will be tracked as one entry
 26 * in the data structure. The count, the total accumulated latency and the
 27 * maximum latency are all tracked. When the fixed-size structure is
28 * full, no new causes are tracked until the buffer is flushed by writing to
29 * the /proc file; the userspace tool does this on a regular basis.
30 *
31 * A latency cause is identified by a stringified backtrace at the point that
32 * the scheduler gets invoked. The userland tool will use this string to
33 * identify the cause of the latency in human readable form.
34 *
35 * The information is exported via /proc/latency_stats and /proc/<pid>/latency.
36 * These files look like this:
37 *
38 * Latency Top version : v0.1
39 * 70 59433 4897 i915_irq_wait drm_ioctl vfs_ioctl do_vfs_ioctl sys_ioctl
 40 * |    |    |    |
 41 * |    |    |    +----> the stringified backtrace
 42 * |    |    +---------> The maximum latency for this entry in microseconds
 43 * |    +--------------> The accumulated latency for this entry (microseconds)
 44 * +-------------------> The number of times this entry is hit
45 *
 46 * (note: the average latency is the accumulated latency divided by the number
 47 * of times the entry was hit)
48 */
49
12#include <linux/latencytop.h> 50#include <linux/latencytop.h>
13#include <linux/kallsyms.h> 51#include <linux/kallsyms.h>
14#include <linux/seq_file.h> 52#include <linux/seq_file.h>
@@ -72,7 +110,7 @@ account_global_scheduler_latency(struct task_struct *tsk, struct latency_record
72 firstnonnull = i; 110 firstnonnull = i;
73 continue; 111 continue;
74 } 112 }
75 for (q = 0 ; q < LT_BACKTRACEDEPTH ; q++) { 113 for (q = 0; q < LT_BACKTRACEDEPTH; q++) {
76 unsigned long record = lat->backtrace[q]; 114 unsigned long record = lat->backtrace[q];
77 115
78 if (latency_record[i].backtrace[q] != record) { 116 if (latency_record[i].backtrace[q] != record) {
@@ -101,31 +139,52 @@ account_global_scheduler_latency(struct task_struct *tsk, struct latency_record
101 memcpy(&latency_record[i], lat, sizeof(struct latency_record)); 139 memcpy(&latency_record[i], lat, sizeof(struct latency_record));
102} 140}
103 141
104static inline void store_stacktrace(struct task_struct *tsk, struct latency_record *lat) 142/*
143 * Iterator to store a backtrace into a latency record entry
144 */
145static inline void store_stacktrace(struct task_struct *tsk,
146 struct latency_record *lat)
105{ 147{
106 struct stack_trace trace; 148 struct stack_trace trace;
107 149
108 memset(&trace, 0, sizeof(trace)); 150 memset(&trace, 0, sizeof(trace));
109 trace.max_entries = LT_BACKTRACEDEPTH; 151 trace.max_entries = LT_BACKTRACEDEPTH;
110 trace.entries = &lat->backtrace[0]; 152 trace.entries = &lat->backtrace[0];
111 trace.skip = 0;
112 save_stack_trace_tsk(tsk, &trace); 153 save_stack_trace_tsk(tsk, &trace);
113} 154}
114 155
156/**
 157 * __account_scheduler_latency - record an observed latency
 158 * @tsk: the task struct of the task hitting the latency
 159 * @usecs: the duration of the latency in microseconds
 160 * @inter: 1 if the sleep was interruptible, 0 if uninterruptible
161 *
162 * This function is the main entry point for recording latency entries
163 * as called by the scheduler.
164 *
165 * This function has a few special cases to deal with normal 'non-latency'
166 * sleeps: specifically, interruptible sleep longer than 5 msec is skipped
 167 * since this is usually caused by waiting for events via select() and co.
168 *
169 * Negative latencies (caused by time going backwards) are also explicitly
170 * skipped.
171 */
115void __sched 172void __sched
116account_scheduler_latency(struct task_struct *tsk, int usecs, int inter) 173__account_scheduler_latency(struct task_struct *tsk, int usecs, int inter)
117{ 174{
118 unsigned long flags; 175 unsigned long flags;
119 int i, q; 176 int i, q;
120 struct latency_record lat; 177 struct latency_record lat;
121 178
122 if (!latencytop_enabled)
123 return;
124
125 /* Long interruptible waits are generally user requested... */ 179 /* Long interruptible waits are generally user requested... */
126 if (inter && usecs > 5000) 180 if (inter && usecs > 5000)
127 return; 181 return;
128 182
183 /* Negative sleeps are time going backwards */
184 /* Zero-time sleeps are non-interesting */
185 if (usecs <= 0)
186 return;
187
129 memset(&lat, 0, sizeof(lat)); 188 memset(&lat, 0, sizeof(lat));
130 lat.count = 1; 189 lat.count = 1;
131 lat.time = usecs; 190 lat.time = usecs;
@@ -143,12 +202,12 @@ account_scheduler_latency(struct task_struct *tsk, int usecs, int inter)
143 if (tsk->latency_record_count >= LT_SAVECOUNT) 202 if (tsk->latency_record_count >= LT_SAVECOUNT)
144 goto out_unlock; 203 goto out_unlock;
145 204
146 for (i = 0; i < LT_SAVECOUNT ; i++) { 205 for (i = 0; i < LT_SAVECOUNT; i++) {
147 struct latency_record *mylat; 206 struct latency_record *mylat;
148 int same = 1; 207 int same = 1;
149 208
150 mylat = &tsk->latency_record[i]; 209 mylat = &tsk->latency_record[i];
151 for (q = 0 ; q < LT_BACKTRACEDEPTH ; q++) { 210 for (q = 0; q < LT_BACKTRACEDEPTH; q++) {
152 unsigned long record = lat.backtrace[q]; 211 unsigned long record = lat.backtrace[q];
153 212
154 if (mylat->backtrace[q] != record) { 213 if (mylat->backtrace[q] != record) {
@@ -186,7 +245,7 @@ static int lstats_show(struct seq_file *m, void *v)
186 for (i = 0; i < MAXLR; i++) { 245 for (i = 0; i < MAXLR; i++) {
187 if (latency_record[i].backtrace[0]) { 246 if (latency_record[i].backtrace[0]) {
188 int q; 247 int q;
189 seq_printf(m, "%i %li %li ", 248 seq_printf(m, "%i %lu %lu ",
190 latency_record[i].count, 249 latency_record[i].count,
191 latency_record[i].time, 250 latency_record[i].time,
192 latency_record[i].max); 251 latency_record[i].max);
@@ -223,7 +282,7 @@ static int lstats_open(struct inode *inode, struct file *filp)
223 return single_open(filp, lstats_show, NULL); 282 return single_open(filp, lstats_show, NULL);
224} 283}
225 284
226static struct file_operations lstats_fops = { 285static const struct file_operations lstats_fops = {
227 .open = lstats_open, 286 .open = lstats_open,
228 .read = seq_read, 287 .read = seq_read,
229 .write = lstats_write, 288 .write = lstats_write,
@@ -236,4 +295,4 @@ static int __init init_lstats_procfs(void)
236 proc_create("latency_stats", 0644, NULL, &lstats_fops); 295 proc_create("latency_stats", 0644, NULL, &lstats_fops);
237 return 0; 296 return 0;
238} 297}
239__initcall(init_lstats_procfs); 298device_initcall(init_lstats_procfs);
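
Given the record format documented in the comment block above (count, accumulated usecs, max usecs, backtrace), a userspace consumer can derive the average latency per entry. A minimal sketch of such a reader — plain C, not part of the kernel patch, and assuming the /proc format shown above:

	#include <stdio.h>

	int main(void)
	{
		unsigned long count, total, max;
		char trace[512];
		FILE *f = fopen("/proc/latency_stats", "r");

		if (!f)
			return 1;
		/* skip the "Latency Top version : ..." header line */
		fgets(trace, sizeof(trace), f);
		while (fscanf(f, "%lu %lu %lu %511[^\n]",
			      &count, &total, &max, trace) == 4)
			printf("avg %lu us: %s\n",
			       count ? total / count : 0, trace);
		fclose(f);
		return 0;
	}
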
diff --git a/kernel/lockdep.c b/kernel/lockdep.c
index 71b567f52813..b0f011866969 100644
--- a/kernel/lockdep.c
+++ b/kernel/lockdep.c
@@ -793,6 +793,7 @@ register_lock_class(struct lockdep_map *lock, unsigned int subclass, int force)
793 793
794 printk("BUG: MAX_LOCKDEP_KEYS too low!\n"); 794 printk("BUG: MAX_LOCKDEP_KEYS too low!\n");
795 printk("turning off the locking correctness validator.\n"); 795 printk("turning off the locking correctness validator.\n");
796 dump_stack();
796 return NULL; 797 return NULL;
797 } 798 }
798 class = lock_classes + nr_lock_classes++; 799 class = lock_classes + nr_lock_classes++;
@@ -856,6 +857,7 @@ static struct lock_list *alloc_list_entry(void)
856 857
857 printk("BUG: MAX_LOCKDEP_ENTRIES too low!\n"); 858 printk("BUG: MAX_LOCKDEP_ENTRIES too low!\n");
858 printk("turning off the locking correctness validator.\n"); 859 printk("turning off the locking correctness validator.\n");
860 dump_stack();
859 return NULL; 861 return NULL;
860 } 862 }
861 return list_entries + nr_list_entries++; 863 return list_entries + nr_list_entries++;
@@ -1682,6 +1684,7 @@ cache_hit:
1682 1684
1683 printk("BUG: MAX_LOCKDEP_CHAINS too low!\n"); 1685 printk("BUG: MAX_LOCKDEP_CHAINS too low!\n");
1684 printk("turning off the locking correctness validator.\n"); 1686 printk("turning off the locking correctness validator.\n");
1687 dump_stack();
1685 return 0; 1688 return 0;
1686 } 1689 }
1687 chain = lock_chains + nr_lock_chains++; 1690 chain = lock_chains + nr_lock_chains++;
@@ -2255,7 +2258,7 @@ void trace_softirqs_off(unsigned long ip)
2255 debug_atomic_inc(&redundant_softirqs_off); 2258 debug_atomic_inc(&redundant_softirqs_off);
2256} 2259}
2257 2260
2258void lockdep_trace_alloc(gfp_t gfp_mask) 2261static void __lockdep_trace_alloc(gfp_t gfp_mask, unsigned long flags)
2259{ 2262{
2260 struct task_struct *curr = current; 2263 struct task_struct *curr = current;
2261 2264
@@ -2274,12 +2277,29 @@ void lockdep_trace_alloc(gfp_t gfp_mask)
2274 if (!(gfp_mask & __GFP_FS)) 2277 if (!(gfp_mask & __GFP_FS))
2275 return; 2278 return;
2276 2279
2277 if (DEBUG_LOCKS_WARN_ON(irqs_disabled())) 2280 if (DEBUG_LOCKS_WARN_ON(irqs_disabled_flags(flags)))
2278 return; 2281 return;
2279 2282
2280 mark_held_locks(curr, RECLAIM_FS); 2283 mark_held_locks(curr, RECLAIM_FS);
2281} 2284}
2282 2285
2286static void check_flags(unsigned long flags);
2287
2288void lockdep_trace_alloc(gfp_t gfp_mask)
2289{
2290 unsigned long flags;
2291
2292 if (unlikely(current->lockdep_recursion))
2293 return;
2294
2295 raw_local_irq_save(flags);
2296 check_flags(flags);
2297 current->lockdep_recursion = 1;
2298 __lockdep_trace_alloc(gfp_mask, flags);
2299 current->lockdep_recursion = 0;
2300 raw_local_irq_restore(flags);
2301}
2302
2283static int mark_irqflags(struct task_struct *curr, struct held_lock *hlock) 2303static int mark_irqflags(struct task_struct *curr, struct held_lock *hlock)
2284{ 2304{
2285 /* 2305 /*
@@ -2524,6 +2544,7 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass,
2524 debug_locks_off(); 2544 debug_locks_off();
2525 printk("BUG: MAX_LOCKDEP_SUBCLASSES too low!\n"); 2545 printk("BUG: MAX_LOCKDEP_SUBCLASSES too low!\n");
2526 printk("turning off the locking correctness validator.\n"); 2546 printk("turning off the locking correctness validator.\n");
2547 dump_stack();
2527 return 0; 2548 return 0;
2528 } 2549 }
2529 2550
@@ -2620,6 +2641,7 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass,
2620 debug_locks_off(); 2641 debug_locks_off();
2621 printk("BUG: MAX_LOCK_DEPTH too low!\n"); 2642 printk("BUG: MAX_LOCK_DEPTH too low!\n");
2622 printk("turning off the locking correctness validator.\n"); 2643 printk("turning off the locking correctness validator.\n");
2644 dump_stack();
2623 return 0; 2645 return 0;
2624 } 2646 }
2625 2647
diff --git a/kernel/module.c b/kernel/module.c
index 7fa134e0cc24..c268a771595c 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -68,7 +68,8 @@
68 68
69/* List of modules, protected by module_mutex or preempt_disable 69/* List of modules, protected by module_mutex or preempt_disable
70 * (delete uses stop_machine/add uses RCU list operations). */ 70 * (delete uses stop_machine/add uses RCU list operations). */
71static DEFINE_MUTEX(module_mutex); 71DEFINE_MUTEX(module_mutex);
72EXPORT_SYMBOL_GPL(module_mutex);
72static LIST_HEAD(modules); 73static LIST_HEAD(modules);
73 74
74/* Waiting for a module to finish initializing? */ 75/* Waiting for a module to finish initializing? */
@@ -76,7 +77,7 @@ static DECLARE_WAIT_QUEUE_HEAD(module_wq);
76 77
77static BLOCKING_NOTIFIER_HEAD(module_notify_list); 78static BLOCKING_NOTIFIER_HEAD(module_notify_list);
78 79
79/* Bounds of module allocation, for speeding __module_text_address */ 80/* Bounds of module allocation, for speeding __module_address */
80static unsigned long module_addr_min = -1UL, module_addr_max = 0; 81static unsigned long module_addr_min = -1UL, module_addr_max = 0;
81 82
82int register_module_notifier(struct notifier_block * nb) 83int register_module_notifier(struct notifier_block * nb)
@@ -186,17 +187,6 @@ extern const unsigned long __start___kcrctab_unused_gpl[];
186#define symversion(base, idx) ((base != NULL) ? ((base) + (idx)) : NULL) 187#define symversion(base, idx) ((base != NULL) ? ((base) + (idx)) : NULL)
187#endif 188#endif
188 189
189struct symsearch {
190 const struct kernel_symbol *start, *stop;
191 const unsigned long *crcs;
192 enum {
193 NOT_GPL_ONLY,
194 GPL_ONLY,
195 WILL_BE_GPL_ONLY,
196 } licence;
197 bool unused;
198};
199
200static bool each_symbol_in_section(const struct symsearch *arr, 190static bool each_symbol_in_section(const struct symsearch *arr,
201 unsigned int arrsize, 191 unsigned int arrsize,
202 struct module *owner, 192 struct module *owner,
@@ -217,10 +207,8 @@ static bool each_symbol_in_section(const struct symsearch *arr,
217} 207}
218 208
219/* Returns true as soon as fn returns true, otherwise false. */ 209/* Returns true as soon as fn returns true, otherwise false. */
220static bool each_symbol(bool (*fn)(const struct symsearch *arr, 210bool each_symbol(bool (*fn)(const struct symsearch *arr, struct module *owner,
221 struct module *owner, 211 unsigned int symnum, void *data), void *data)
222 unsigned int symnum, void *data),
223 void *data)
224{ 212{
225 struct module *mod; 213 struct module *mod;
226 const struct symsearch arr[] = { 214 const struct symsearch arr[] = {
@@ -273,6 +261,7 @@ static bool each_symbol(bool (*fn)(const struct symsearch *arr,
273 } 261 }
274 return false; 262 return false;
275} 263}
264EXPORT_SYMBOL_GPL(each_symbol);
276 265
277struct find_symbol_arg { 266struct find_symbol_arg {
278 /* Input */ 267 /* Input */
@@ -283,7 +272,7 @@ struct find_symbol_arg {
283 /* Output */ 272 /* Output */
284 struct module *owner; 273 struct module *owner;
285 const unsigned long *crc; 274 const unsigned long *crc;
286 unsigned long value; 275 const struct kernel_symbol *sym;
287}; 276};
288 277
289static bool find_symbol_in_section(const struct symsearch *syms, 278static bool find_symbol_in_section(const struct symsearch *syms,
@@ -324,17 +313,17 @@ static bool find_symbol_in_section(const struct symsearch *syms,
324 313
325 fsa->owner = owner; 314 fsa->owner = owner;
326 fsa->crc = symversion(syms->crcs, symnum); 315 fsa->crc = symversion(syms->crcs, symnum);
327 fsa->value = syms->start[symnum].value; 316 fsa->sym = &syms->start[symnum];
328 return true; 317 return true;
329} 318}
330 319
 331/* Find a symbol, return value, (optional) crc and (optional) module 320/* Find a symbol and return it, along with (optional) crc and
332 * which owns it */ 321 * (optional) module which owns it */
333static unsigned long find_symbol(const char *name, 322const struct kernel_symbol *find_symbol(const char *name,
334 struct module **owner, 323 struct module **owner,
335 const unsigned long **crc, 324 const unsigned long **crc,
336 bool gplok, 325 bool gplok,
337 bool warn) 326 bool warn)
338{ 327{
339 struct find_symbol_arg fsa; 328 struct find_symbol_arg fsa;
340 329
@@ -347,15 +336,16 @@ static unsigned long find_symbol(const char *name,
347 *owner = fsa.owner; 336 *owner = fsa.owner;
348 if (crc) 337 if (crc)
349 *crc = fsa.crc; 338 *crc = fsa.crc;
350 return fsa.value; 339 return fsa.sym;
351 } 340 }
352 341
353 DEBUGP("Failed to find symbol %s\n", name); 342 DEBUGP("Failed to find symbol %s\n", name);
354 return -ENOENT; 343 return NULL;
355} 344}
345EXPORT_SYMBOL_GPL(find_symbol);
356 346
357/* Search for module by name: must hold module_mutex. */ 347/* Search for module by name: must hold module_mutex. */
358static struct module *find_module(const char *name) 348struct module *find_module(const char *name)
359{ 349{
360 struct module *mod; 350 struct module *mod;
361 351
@@ -365,6 +355,7 @@ static struct module *find_module(const char *name)
365 } 355 }
366 return NULL; 356 return NULL;
367} 357}
358EXPORT_SYMBOL_GPL(find_module);
368 359
369#ifdef CONFIG_SMP 360#ifdef CONFIG_SMP
370 361
@@ -641,7 +632,7 @@ static int already_uses(struct module *a, struct module *b)
641} 632}
642 633
643/* Module a uses b */ 634/* Module a uses b */
644static int use_module(struct module *a, struct module *b) 635int use_module(struct module *a, struct module *b)
645{ 636{
646 struct module_use *use; 637 struct module_use *use;
647 int no_warn, err; 638 int no_warn, err;
@@ -674,6 +665,7 @@ static int use_module(struct module *a, struct module *b)
674 no_warn = sysfs_create_link(b->holders_dir, &a->mkobj.kobj, a->name); 665 no_warn = sysfs_create_link(b->holders_dir, &a->mkobj.kobj, a->name);
675 return 1; 666 return 1;
676} 667}
668EXPORT_SYMBOL_GPL(use_module);
677 669
678/* Clear the unload stuff of the module. */ 670/* Clear the unload stuff of the module. */
679static void module_unload_free(struct module *mod) 671static void module_unload_free(struct module *mod)
@@ -856,7 +848,7 @@ SYSCALL_DEFINE2(delete_module, const char __user *, name_user,
856 mutex_lock(&module_mutex); 848 mutex_lock(&module_mutex);
857 /* Store the name of the last unloaded module for diagnostic purposes */ 849 /* Store the name of the last unloaded module for diagnostic purposes */
858 strlcpy(last_unloaded_module, mod->name, sizeof(last_unloaded_module)); 850 strlcpy(last_unloaded_module, mod->name, sizeof(last_unloaded_module));
859 unregister_dynamic_debug_module(mod->name); 851 ddebug_remove_module(mod->name);
860 free_module(mod); 852 free_module(mod);
861 853
862 out: 854 out:
@@ -894,7 +886,7 @@ void __symbol_put(const char *symbol)
894 struct module *owner; 886 struct module *owner;
895 887
896 preempt_disable(); 888 preempt_disable();
897 if (IS_ERR_VALUE(find_symbol(symbol, &owner, NULL, true, false))) 889 if (!find_symbol(symbol, &owner, NULL, true, false))
898 BUG(); 890 BUG();
899 module_put(owner); 891 module_put(owner);
900 preempt_enable(); 892 preempt_enable();
@@ -908,8 +900,10 @@ void symbol_put_addr(void *addr)
908 if (core_kernel_text((unsigned long)addr)) 900 if (core_kernel_text((unsigned long)addr))
909 return; 901 return;
910 902
 911	if (!(modaddr = module_text_address((unsigned long)addr))) 903	/* module_text_address is safe here: we're supposed to have a reference
 912	 BUG(); 904	 * to the module from symbol_get, so it can't go away. */
905 modaddr = __module_text_address((unsigned long)addr);
906 BUG_ON(!modaddr);
913 module_put(modaddr); 907 module_put(modaddr);
914} 908}
915EXPORT_SYMBOL_GPL(symbol_put_addr); 909EXPORT_SYMBOL_GPL(symbol_put_addr);
@@ -949,10 +943,11 @@ static inline void module_unload_free(struct module *mod)
949{ 943{
950} 944}
951 945
952static inline int use_module(struct module *a, struct module *b) 946int use_module(struct module *a, struct module *b)
953{ 947{
954 return strong_try_module_get(b) == 0; 948 return strong_try_module_get(b) == 0;
955} 949}
950EXPORT_SYMBOL_GPL(use_module);
956 951
957static inline void module_unload_init(struct module *mod) 952static inline void module_unload_init(struct module *mod)
958{ 953{
@@ -995,12 +990,12 @@ static struct module_attribute *modinfo_attrs[] = {
995 990
996static const char vermagic[] = VERMAGIC_STRING; 991static const char vermagic[] = VERMAGIC_STRING;
997 992
998static int try_to_force_load(struct module *mod, const char *symname) 993static int try_to_force_load(struct module *mod, const char *reason)
999{ 994{
1000#ifdef CONFIG_MODULE_FORCE_LOAD 995#ifdef CONFIG_MODULE_FORCE_LOAD
1001 if (!test_taint(TAINT_FORCED_MODULE)) 996 if (!test_taint(TAINT_FORCED_MODULE))
1002 printk("%s: no version for \"%s\" found: kernel tainted.\n", 997 printk(KERN_WARNING "%s: %s: kernel tainted.\n",
1003 mod->name, symname); 998 mod->name, reason);
1004 add_taint_module(mod, TAINT_FORCED_MODULE); 999 add_taint_module(mod, TAINT_FORCED_MODULE);
1005 return 0; 1000 return 0;
1006#else 1001#else
@@ -1057,9 +1052,9 @@ static inline int check_modstruct_version(Elf_Shdr *sechdrs,
1057{ 1052{
1058 const unsigned long *crc; 1053 const unsigned long *crc;
1059 1054
1060 if (IS_ERR_VALUE(find_symbol("struct_module", NULL, &crc, true, false))) 1055 if (!find_symbol("module_layout", NULL, &crc, true, false))
1061 BUG(); 1056 BUG();
1062 return check_version(sechdrs, versindex, "struct_module", mod, crc); 1057 return check_version(sechdrs, versindex, "module_layout", mod, crc);
1063} 1058}
1064 1059
1065/* First part is kernel version, which we ignore if module has crcs. */ 1060/* First part is kernel version, which we ignore if module has crcs. */
@@ -1098,25 +1093,25 @@ static inline int same_magic(const char *amagic, const char *bmagic,
1098 1093
1099/* Resolve a symbol for this module. I.e. if we find one, record usage. 1094/* Resolve a symbol for this module. I.e. if we find one, record usage.
1100 Must be holding module_mutex. */ 1095 Must be holding module_mutex. */
1101static unsigned long resolve_symbol(Elf_Shdr *sechdrs, 1096static const struct kernel_symbol *resolve_symbol(Elf_Shdr *sechdrs,
1102 unsigned int versindex, 1097 unsigned int versindex,
1103 const char *name, 1098 const char *name,
1104 struct module *mod) 1099 struct module *mod)
1105{ 1100{
1106 struct module *owner; 1101 struct module *owner;
1107 unsigned long ret; 1102 const struct kernel_symbol *sym;
1108 const unsigned long *crc; 1103 const unsigned long *crc;
1109 1104
1110 ret = find_symbol(name, &owner, &crc, 1105 sym = find_symbol(name, &owner, &crc,
1111 !(mod->taints & (1 << TAINT_PROPRIETARY_MODULE)), true); 1106 !(mod->taints & (1 << TAINT_PROPRIETARY_MODULE)), true);
1112 if (!IS_ERR_VALUE(ret)) { 1107 /* use_module can fail due to OOM,
1113 /* use_module can fail due to OOM, 1108 or module initialization or unloading */
1114 or module initialization or unloading */ 1109 if (sym) {
1115 if (!check_version(sechdrs, versindex, name, mod, crc) || 1110 if (!check_version(sechdrs, versindex, name, mod, crc) ||
1116 !use_module(mod, owner)) 1111 !use_module(mod, owner))
1117 ret = -EINVAL; 1112 sym = NULL;
1118 } 1113 }
1119 return ret; 1114 return sym;
1120} 1115}
1121 1116
1122/* 1117/*
@@ -1491,6 +1486,9 @@ static void free_module(struct module *mod)
1491 /* Module unload stuff */ 1486 /* Module unload stuff */
1492 module_unload_free(mod); 1487 module_unload_free(mod);
1493 1488
1489 /* Free any allocated parameters. */
1490 destroy_params(mod->kp, mod->num_kp);
1491
1494 /* release any pointers to mcount in this module */ 1492 /* release any pointers to mcount in this module */
1495 ftrace_release(mod->module_core, mod->core_size); 1493 ftrace_release(mod->module_core, mod->core_size);
1496 1494
@@ -1513,17 +1511,15 @@ static void free_module(struct module *mod)
1513void *__symbol_get(const char *symbol) 1511void *__symbol_get(const char *symbol)
1514{ 1512{
1515 struct module *owner; 1513 struct module *owner;
1516 unsigned long value; 1514 const struct kernel_symbol *sym;
1517 1515
1518 preempt_disable(); 1516 preempt_disable();
1519 value = find_symbol(symbol, &owner, NULL, true, true); 1517 sym = find_symbol(symbol, &owner, NULL, true, true);
1520 if (IS_ERR_VALUE(value)) 1518 if (sym && strong_try_module_get(owner))
1521 value = 0; 1519 sym = NULL;
1522 else if (strong_try_module_get(owner))
1523 value = 0;
1524 preempt_enable(); 1520 preempt_enable();
1525 1521
1526 return (void *)value; 1522 return sym ? (void *)sym->value : NULL;
1527} 1523}
1528EXPORT_SYMBOL_GPL(__symbol_get); 1524EXPORT_SYMBOL_GPL(__symbol_get);
1529 1525
@@ -1551,8 +1547,7 @@ static int verify_export_symbols(struct module *mod)
1551 1547
1552 for (i = 0; i < ARRAY_SIZE(arr); i++) { 1548 for (i = 0; i < ARRAY_SIZE(arr); i++) {
1553 for (s = arr[i].sym; s < arr[i].sym + arr[i].num; s++) { 1549 for (s = arr[i].sym; s < arr[i].sym + arr[i].num; s++) {
1554 if (!IS_ERR_VALUE(find_symbol(s->name, &owner, 1550 if (find_symbol(s->name, &owner, NULL, true, false)) {
1555 NULL, true, false))) {
1556 printk(KERN_ERR 1551 printk(KERN_ERR
1557 "%s: exports duplicate symbol %s" 1552 "%s: exports duplicate symbol %s"
1558 " (owned by %s)\n", 1553 " (owned by %s)\n",
@@ -1576,6 +1571,7 @@ static int simplify_symbols(Elf_Shdr *sechdrs,
1576 unsigned long secbase; 1571 unsigned long secbase;
1577 unsigned int i, n = sechdrs[symindex].sh_size / sizeof(Elf_Sym); 1572 unsigned int i, n = sechdrs[symindex].sh_size / sizeof(Elf_Sym);
1578 int ret = 0; 1573 int ret = 0;
1574 const struct kernel_symbol *ksym;
1579 1575
1580 for (i = 1; i < n; i++) { 1576 for (i = 1; i < n; i++) {
1581 switch (sym[i].st_shndx) { 1577 switch (sym[i].st_shndx) {
@@ -1595,13 +1591,14 @@ static int simplify_symbols(Elf_Shdr *sechdrs,
1595 break; 1591 break;
1596 1592
1597 case SHN_UNDEF: 1593 case SHN_UNDEF:
1598 sym[i].st_value 1594 ksym = resolve_symbol(sechdrs, versindex,
1599 = resolve_symbol(sechdrs, versindex, 1595 strtab + sym[i].st_name, mod);
1600 strtab + sym[i].st_name, mod);
1601
1602 /* Ok if resolved. */ 1596 /* Ok if resolved. */
1603 if (!IS_ERR_VALUE(sym[i].st_value)) 1597 if (ksym) {
1598 sym[i].st_value = ksym->value;
1604 break; 1599 break;
1600 }
1601
1605 /* Ok if weak. */ 1602 /* Ok if weak. */
1606 if (ELF_ST_BIND(sym[i].st_info) == STB_WEAK) 1603 if (ELF_ST_BIND(sym[i].st_info) == STB_WEAK)
1607 break; 1604 break;
@@ -1676,8 +1673,7 @@ static void layout_sections(struct module *mod,
1676 if ((s->sh_flags & masks[m][0]) != masks[m][0] 1673 if ((s->sh_flags & masks[m][0]) != masks[m][0]
1677 || (s->sh_flags & masks[m][1]) 1674 || (s->sh_flags & masks[m][1])
1678 || s->sh_entsize != ~0UL 1675 || s->sh_entsize != ~0UL
1679 || strncmp(secstrings + s->sh_name, 1676 || strstarts(secstrings + s->sh_name, ".init"))
1680 ".init", 5) == 0)
1681 continue; 1677 continue;
1682 s->sh_entsize = get_offset(mod, &mod->core_size, s, i); 1678 s->sh_entsize = get_offset(mod, &mod->core_size, s, i);
1683 DEBUGP("\t%s\n", secstrings + s->sh_name); 1679 DEBUGP("\t%s\n", secstrings + s->sh_name);
@@ -1694,8 +1690,7 @@ static void layout_sections(struct module *mod,
1694 if ((s->sh_flags & masks[m][0]) != masks[m][0] 1690 if ((s->sh_flags & masks[m][0]) != masks[m][0]
1695 || (s->sh_flags & masks[m][1]) 1691 || (s->sh_flags & masks[m][1])
1696 || s->sh_entsize != ~0UL 1692 || s->sh_entsize != ~0UL
1697 || strncmp(secstrings + s->sh_name, 1693 || !strstarts(secstrings + s->sh_name, ".init"))
1698 ".init", 5) != 0)
1699 continue; 1694 continue;
1700 s->sh_entsize = (get_offset(mod, &mod->init_size, s, i) 1695 s->sh_entsize = (get_offset(mod, &mod->init_size, s, i)
1701 | INIT_OFFSET_MASK); 1696 | INIT_OFFSET_MASK);
@@ -1828,8 +1823,7 @@ static char elf_type(const Elf_Sym *sym,
1828 else 1823 else
1829 return 'b'; 1824 return 'b';
1830 } 1825 }
1831 if (strncmp(secstrings + sechdrs[sym->st_shndx].sh_name, 1826 if (strstarts(secstrings + sechdrs[sym->st_shndx].sh_name, ".debug"))
1832 ".debug", strlen(".debug")) == 0)
1833 return 'n'; 1827 return 'n';
1834 return '?'; 1828 return '?';
1835} 1829}
@@ -1861,19 +1855,13 @@ static inline void add_kallsyms(struct module *mod,
1861} 1855}
1862#endif /* CONFIG_KALLSYMS */ 1856#endif /* CONFIG_KALLSYMS */
1863 1857
1864static void dynamic_printk_setup(struct mod_debug *debug, unsigned int num) 1858static void dynamic_debug_setup(struct _ddebug *debug, unsigned int num)
1865{ 1859{
1866#ifdef CONFIG_DYNAMIC_PRINTK_DEBUG 1860#ifdef CONFIG_DYNAMIC_DEBUG
1867 unsigned int i; 1861 if (ddebug_add_module(debug, num, debug->modname))
1868 1862 printk(KERN_ERR "dynamic debug error adding module: %s\n",
1869 for (i = 0; i < num; i++) { 1863 debug->modname);
1870 register_dynamic_debug_module(debug[i].modname, 1864#endif
1871 debug[i].type,
1872 debug[i].logical_modname,
1873 debug[i].flag_names,
1874 debug[i].hash, debug[i].hash2);
1875 }
1876#endif /* CONFIG_DYNAMIC_PRINTK_DEBUG */
1877} 1865}
1878 1866
1879static void *module_alloc_update_bounds(unsigned long size) 1867static void *module_alloc_update_bounds(unsigned long size)
@@ -1904,8 +1892,7 @@ static noinline struct module *load_module(void __user *umod,
1904 unsigned int symindex = 0; 1892 unsigned int symindex = 0;
1905 unsigned int strindex = 0; 1893 unsigned int strindex = 0;
1906 unsigned int modindex, versindex, infoindex, pcpuindex; 1894 unsigned int modindex, versindex, infoindex, pcpuindex;
1907 unsigned int num_kp, num_mcount; 1895 unsigned int num_mcount;
1908 struct kernel_param *kp;
1909 struct module *mod; 1896 struct module *mod;
1910 long err = 0; 1897 long err = 0;
1911 void *percpu = NULL, *ptr = NULL; /* Stops spurious gcc warning */ 1898 void *percpu = NULL, *ptr = NULL; /* Stops spurious gcc warning */
@@ -1922,12 +1909,6 @@ static noinline struct module *load_module(void __user *umod,
1922 if (len > 64 * 1024 * 1024 || (hdr = vmalloc(len)) == NULL) 1909 if (len > 64 * 1024 * 1024 || (hdr = vmalloc(len)) == NULL)
1923 return ERR_PTR(-ENOMEM); 1910 return ERR_PTR(-ENOMEM);
1924 1911
1925 /* Create stop_machine threads since the error path relies on
1926 * a non-failing stop_machine call. */
1927 err = stop_machine_create();
1928 if (err)
1929 goto free_hdr;
1930
1931 if (copy_from_user(hdr, umod, len) != 0) { 1912 if (copy_from_user(hdr, umod, len) != 0) {
1932 err = -EFAULT; 1913 err = -EFAULT;
1933 goto free_hdr; 1914 goto free_hdr;
@@ -1968,9 +1949,12 @@ static noinline struct module *load_module(void __user *umod,
1968 } 1949 }
1969#ifndef CONFIG_MODULE_UNLOAD 1950#ifndef CONFIG_MODULE_UNLOAD
1970 /* Don't load .exit sections */ 1951 /* Don't load .exit sections */
1971 if (strncmp(secstrings+sechdrs[i].sh_name, ".exit", 5) == 0) 1952 if (strstarts(secstrings+sechdrs[i].sh_name, ".exit"))
1972 sechdrs[i].sh_flags &= ~(unsigned long)SHF_ALLOC; 1953 sechdrs[i].sh_flags &= ~(unsigned long)SHF_ALLOC;
1973#endif 1954#endif
1955 /* Don't keep __versions around; it's just for loading. */
1956 if (strcmp(secstrings + sechdrs[i].sh_name, "__versions") == 0)
1957 sechdrs[i].sh_flags &= ~(unsigned long)SHF_ALLOC;
1974 } 1958 }
1975 1959
1976 modindex = find_sec(hdr, sechdrs, secstrings, 1960 modindex = find_sec(hdr, sechdrs, secstrings,
@@ -2012,7 +1996,7 @@ static noinline struct module *load_module(void __user *umod,
2012 modmagic = get_modinfo(sechdrs, infoindex, "vermagic"); 1996 modmagic = get_modinfo(sechdrs, infoindex, "vermagic");
2013 /* This is allowed: modprobe --force will invalidate it. */ 1997 /* This is allowed: modprobe --force will invalidate it. */
2014 if (!modmagic) { 1998 if (!modmagic) {
2015 err = try_to_force_load(mod, "magic"); 1999 err = try_to_force_load(mod, "bad vermagic");
2016 if (err) 2000 if (err)
2017 goto free_hdr; 2001 goto free_hdr;
2018 } else if (!same_magic(modmagic, vermagic, versindex)) { 2002 } else if (!same_magic(modmagic, vermagic, versindex)) {
@@ -2150,8 +2134,8 @@ static noinline struct module *load_module(void __user *umod,
2150 2134
2151 /* Now we've got everything in the final locations, we can 2135 /* Now we've got everything in the final locations, we can
2152 * find optional sections. */ 2136 * find optional sections. */
2153 kp = section_objs(hdr, sechdrs, secstrings, "__param", sizeof(*kp), 2137 mod->kp = section_objs(hdr, sechdrs, secstrings, "__param",
2154 &num_kp); 2138 sizeof(*mod->kp), &mod->num_kp);
2155 mod->syms = section_objs(hdr, sechdrs, secstrings, "__ksymtab", 2139 mod->syms = section_objs(hdr, sechdrs, secstrings, "__ksymtab",
2156 sizeof(*mod->syms), &mod->num_syms); 2140 sizeof(*mod->syms), &mod->num_syms);
2157 mod->crcs = section_addr(hdr, sechdrs, secstrings, "__kcrctab"); 2141 mod->crcs = section_addr(hdr, sechdrs, secstrings, "__kcrctab");
@@ -2201,8 +2185,8 @@ static noinline struct module *load_module(void __user *umod,
2201 || (mod->num_unused_gpl_syms && !mod->unused_gpl_crcs) 2185 || (mod->num_unused_gpl_syms && !mod->unused_gpl_crcs)
2202#endif 2186#endif
2203 ) { 2187 ) {
2204 printk(KERN_WARNING "%s: No versions for exported symbols.\n", mod->name); 2188 err = try_to_force_load(mod,
2205 err = try_to_force_load(mod, "nocrc"); 2189 "no versions for exported symbols");
2206 if (err) 2190 if (err)
2207 goto cleanup; 2191 goto cleanup;
2208 } 2192 }
@@ -2247,12 +2231,13 @@ static noinline struct module *load_module(void __user *umod,
2247 add_kallsyms(mod, sechdrs, symindex, strindex, secstrings); 2231 add_kallsyms(mod, sechdrs, symindex, strindex, secstrings);
2248 2232
2249 if (!mod->taints) { 2233 if (!mod->taints) {
2250 struct mod_debug *debug; 2234 struct _ddebug *debug;
2251 unsigned int num_debug; 2235 unsigned int num_debug;
2252 2236
2253 debug = section_objs(hdr, sechdrs, secstrings, "__verbose", 2237 debug = section_objs(hdr, sechdrs, secstrings, "__verbose",
2254 sizeof(*debug), &num_debug); 2238 sizeof(*debug), &num_debug);
2255 dynamic_printk_setup(debug, num_debug); 2239 if (debug)
2240 dynamic_debug_setup(debug, num_debug);
2256 } 2241 }
2257 2242
2258 /* sechdrs[0].sh_size is always zero */ 2243 /* sechdrs[0].sh_size is always zero */
@@ -2296,11 +2281,11 @@ static noinline struct module *load_module(void __user *umod,
2296 */ 2281 */
2297 list_add_rcu(&mod->list, &modules); 2282 list_add_rcu(&mod->list, &modules);
2298 2283
2299 err = parse_args(mod->name, mod->args, kp, num_kp, NULL); 2284 err = parse_args(mod->name, mod->args, mod->kp, mod->num_kp, NULL);
2300 if (err < 0) 2285 if (err < 0)
2301 goto unlink; 2286 goto unlink;
2302 2287
2303 err = mod_sysfs_setup(mod, kp, num_kp); 2288 err = mod_sysfs_setup(mod, mod->kp, mod->num_kp);
2304 if (err < 0) 2289 if (err < 0)
2305 goto unlink; 2290 goto unlink;
2306 add_sect_attrs(mod, hdr->e_shnum, secstrings, sechdrs); 2291 add_sect_attrs(mod, hdr->e_shnum, secstrings, sechdrs);
@@ -2309,12 +2294,13 @@ static noinline struct module *load_module(void __user *umod,
2309 /* Get rid of temporary copy */ 2294 /* Get rid of temporary copy */
2310 vfree(hdr); 2295 vfree(hdr);
2311 2296
2312 stop_machine_destroy();
2313 /* Done! */ 2297 /* Done! */
2314 return mod; 2298 return mod;
2315 2299
2316 unlink: 2300 unlink:
2317 stop_machine(__unlink_module, mod, NULL); 2301 /* Unlink carefully: kallsyms could be walking list. */
2302 list_del_rcu(&mod->list);
2303 synchronize_sched();
2318 module_arch_cleanup(mod); 2304 module_arch_cleanup(mod);
2319 cleanup: 2305 cleanup:
2320 kobject_del(&mod->mkobj.kobj); 2306 kobject_del(&mod->mkobj.kobj);
@@ -2322,8 +2308,8 @@ static noinline struct module *load_module(void __user *umod,
2322 ftrace_release(mod->module_core, mod->core_size); 2308 ftrace_release(mod->module_core, mod->core_size);
2323 free_unload: 2309 free_unload:
2324 module_unload_free(mod); 2310 module_unload_free(mod);
2325 free_init:
2326#if defined(CONFIG_MODULE_UNLOAD) && defined(CONFIG_SMP) 2311#if defined(CONFIG_MODULE_UNLOAD) && defined(CONFIG_SMP)
2312 free_init:
2327 percpu_modfree(mod->refptr); 2313 percpu_modfree(mod->refptr);
2328#endif 2314#endif
2329 module_free(mod, mod->module_init); 2315 module_free(mod, mod->module_init);
@@ -2337,7 +2323,6 @@ static noinline struct module *load_module(void __user *umod,
2337 kfree(args); 2323 kfree(args);
2338 free_hdr: 2324 free_hdr:
2339 vfree(hdr); 2325 vfree(hdr);
2340 stop_machine_destroy();
2341 return ERR_PTR(err); 2326 return ERR_PTR(err);
2342 2327
2343 truncated: 2328 truncated:
@@ -2614,6 +2599,25 @@ unsigned long module_kallsyms_lookup_name(const char *name)
2614 preempt_enable(); 2599 preempt_enable();
2615 return ret; 2600 return ret;
2616} 2601}
2602
2603int module_kallsyms_on_each_symbol(int (*fn)(void *, const char *,
2604 struct module *, unsigned long),
2605 void *data)
2606{
2607 struct module *mod;
2608 unsigned int i;
2609 int ret;
2610
2611 list_for_each_entry(mod, &modules, list) {
2612 for (i = 0; i < mod->num_symtab; i++) {
2613 ret = fn(data, mod->strtab + mod->symtab[i].st_name,
2614 mod, mod->symtab[i].st_value);
2615 if (ret != 0)
2616 return ret;
2617 }
2618 }
2619 return 0;
2620}
2617#endif /* CONFIG_KALLSYMS */ 2621#endif /* CONFIG_KALLSYMS */
2618 2622
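The new module_kallsyms_on_each_symbol() walks every symbol of every loaded module and aborts as soon as the callback returns non-zero. A hypothetical caller (names are illustrative, and the caller is assumed to already hold module_mutex, since the iterator itself takes no lock):

	struct sym_query {
		const char *name;
		unsigned long addr;
	};

	static int match_symbol(void *data, const char *name,
				struct module *mod, unsigned long addr)
	{
		struct sym_query *q = data;

		if (strcmp(name, q->name) == 0) {
			q->addr = addr;
			return 1;	/* non-zero stops the walk */
		}
		return 0;
	}

	/* usage:
	 *	struct sym_query q = { .name = "some_module_symbol" };
	 *	if (module_kallsyms_on_each_symbol(match_symbol, &q))
	 *		... q.addr holds the symbol's address ...
	 */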
2619static char *module_flags(struct module *mod, char *buf) 2623static char *module_flags(struct module *mod, char *buf)
@@ -2749,29 +2753,31 @@ const struct exception_table_entry *search_module_extables(unsigned long addr)
2749} 2753}
2750 2754
2751/* 2755/*
2752 * Is this a valid module address? 2756 * is_module_address - is this address inside a module?
2757 * @addr: the address to check.
2758 *
2759 * See is_module_text_address() if you simply want to see if the address
2760 * is code (not data).
2753 */ 2761 */
2754int is_module_address(unsigned long addr) 2762bool is_module_address(unsigned long addr)
2755{ 2763{
2756 struct module *mod; 2764 bool ret;
2757 2765
2758 preempt_disable(); 2766 preempt_disable();
2759 2767 ret = __module_address(addr) != NULL;
2760 list_for_each_entry_rcu(mod, &modules, list) {
2761 if (within_module_core(addr, mod)) {
2762 preempt_enable();
2763 return 1;
2764 }
2765 }
2766
2767 preempt_enable(); 2768 preempt_enable();
2768 2769
2769 return 0; 2770 return ret;
2770} 2771}
2771 2772
2772 2773/*
2773/* Is this a valid kernel address? */ 2774 * __module_address - get the module which contains an address.
2774struct module *__module_text_address(unsigned long addr) 2775 * @addr: the address.
2776 *
2777 * Must be called with preempt disabled or module mutex held so that
2778 * module doesn't get freed during this.
2779 */
2780struct module *__module_address(unsigned long addr)
2775{ 2781{
2776 struct module *mod; 2782 struct module *mod;
2777 2783
@@ -2779,22 +2785,51 @@ struct module *__module_text_address(unsigned long addr)
2779 return NULL; 2785 return NULL;
2780 2786
2781 list_for_each_entry_rcu(mod, &modules, list) 2787 list_for_each_entry_rcu(mod, &modules, list)
2782 if (within(addr, mod->module_init, mod->init_text_size) 2788 if (within_module_core(addr, mod)
2783 || within(addr, mod->module_core, mod->core_text_size)) 2789 || within_module_init(addr, mod))
2784 return mod; 2790 return mod;
2785 return NULL; 2791 return NULL;
2786} 2792}
2793EXPORT_SYMBOL_GPL(__module_address);
2787 2794
2788struct module *module_text_address(unsigned long addr) 2795/*
2796 * is_module_text_address - is this address inside module code?
2797 * @addr: the address to check.
2798 *
2799 * See is_module_address() if you simply want to see if the address is
2800 * anywhere in a module. See kernel_text_address() for testing if an
2801 * address corresponds to kernel or module code.
2802 */
2803bool is_module_text_address(unsigned long addr)
2789{ 2804{
2790 struct module *mod; 2805 bool ret;
2791 2806
2792 preempt_disable(); 2807 preempt_disable();
2793 mod = __module_text_address(addr); 2808 ret = __module_text_address(addr) != NULL;
2794 preempt_enable(); 2809 preempt_enable();
2795 2810
2811 return ret;
2812}
2813
2814/*
2815 * __module_text_address - get the module whose code contains an address.
2816 * @addr: the address.
2817 *
2818 * Must be called with preempt disabled or module mutex held so that
2819 * module doesn't get freed during this.
2820 */
2821struct module *__module_text_address(unsigned long addr)
2822{
2823 struct module *mod = __module_address(addr);
2824 if (mod) {
2825 /* Make sure it's within the text section. */
2826 if (!within(addr, mod->module_init, mod->init_text_size)
2827 && !within(addr, mod->module_core, mod->core_text_size))
2828 mod = NULL;
2829 }
2796 return mod; 2830 return mod;
2797} 2831}
2832EXPORT_SYMBOL_GPL(__module_text_address);
2798 2833
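The lookups are now layered: the bool wrappers (is_module_address(), is_module_text_address()) handle preemption themselves, while the __-prefixed variants require the caller to pin the module list, per the comments above. A minimal sketch of the calling convention for the bare variant (illustrative, not part of the patch):

	static const char *module_name_of(unsigned long addr)
	{
		struct module *mod;
		const char *name = NULL;

		preempt_disable();	/* keeps the RCU list walk safe */
		mod = __module_address(addr);
		if (mod)
			name = mod->name;
		preempt_enable();

		/* informational only: the module may be unloaded by now */
		return name;
	}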
2799/* Don't grab lock, we're oopsing. */ 2834/* Don't grab lock, we're oopsing. */
2800void print_modules(void) 2835void print_modules(void)
@@ -2814,9 +2849,17 @@ void print_modules(void)
2814} 2849}
2815 2850
2816#ifdef CONFIG_MODVERSIONS 2851#ifdef CONFIG_MODVERSIONS
2817/* Generate the signature for struct module here, too, for modversions. */ 2852/* Generate the signature for all relevant module structures here.
2818void struct_module(struct module *mod) { return; } 2853 * If these change, we don't want to try to parse the module. */
2819EXPORT_SYMBOL(struct_module); 2854void module_layout(struct module *mod,
2855 struct modversion_info *ver,
2856 struct kernel_param *kp,
2857 struct kernel_symbol *ks,
2858 struct marker *marker,
2859 struct tracepoint *tp)
2860{
2861}
2862EXPORT_SYMBOL(module_layout);
2820#endif 2863#endif
2821 2864
2822#ifdef CONFIG_MARKERS 2865#ifdef CONFIG_MARKERS
diff --git a/kernel/ns_cgroup.c b/kernel/ns_cgroup.c
index 78bc3fdac0d2..5aa854f9e5ae 100644
--- a/kernel/ns_cgroup.c
+++ b/kernel/ns_cgroup.c
@@ -34,7 +34,7 @@ int ns_cgroup_clone(struct task_struct *task, struct pid *pid)
34 34
35/* 35/*
36 * Rules: 36 * Rules:
37 * 1. you can only enter a cgroup which is a child of your current 37 * 1. you can only enter a cgroup which is a descendant of your current
38 * cgroup 38 * cgroup
39 * 2. you can only place another process into a cgroup if 39 * 2. you can only place another process into a cgroup if
40 * a. you have CAP_SYS_ADMIN 40 * a. you have CAP_SYS_ADMIN
@@ -45,21 +45,15 @@ int ns_cgroup_clone(struct task_struct *task, struct pid *pid)
45static int ns_can_attach(struct cgroup_subsys *ss, 45static int ns_can_attach(struct cgroup_subsys *ss,
46 struct cgroup *new_cgroup, struct task_struct *task) 46 struct cgroup *new_cgroup, struct task_struct *task)
47{ 47{
48 struct cgroup *orig;
49
50 if (current != task) { 48 if (current != task) {
51 if (!capable(CAP_SYS_ADMIN)) 49 if (!capable(CAP_SYS_ADMIN))
52 return -EPERM; 50 return -EPERM;
53 51
54 if (!cgroup_is_descendant(new_cgroup)) 52 if (!cgroup_is_descendant(new_cgroup, current))
55 return -EPERM; 53 return -EPERM;
56 } 54 }
57 55
58 if (atomic_read(&new_cgroup->count) != 0) 56 if (!cgroup_is_descendant(new_cgroup, task))
59 return -EPERM;
60
61 orig = task_cgroup(task, ns_subsys_id);
62 if (orig && orig != new_cgroup->parent)
63 return -EPERM; 57 return -EPERM;
64 58
65 return 0; 59 return 0;
@@ -77,7 +71,7 @@ static struct cgroup_subsys_state *ns_create(struct cgroup_subsys *ss,
77 71
78 if (!capable(CAP_SYS_ADMIN)) 72 if (!capable(CAP_SYS_ADMIN))
79 return ERR_PTR(-EPERM); 73 return ERR_PTR(-EPERM);
80 if (!cgroup_is_descendant(cgroup)) 74 if (!cgroup_is_descendant(cgroup, current))
81 return ERR_PTR(-EPERM); 75 return ERR_PTR(-EPERM);
82 76
83 ns_cgroup = kzalloc(sizeof(*ns_cgroup), GFP_KERNEL); 77 ns_cgroup = kzalloc(sizeof(*ns_cgroup), GFP_KERNEL);
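The descendant test now takes the task explicitly: cgroup_is_descendant(cgrp, task) asks whether cgrp sits at or below task's current cgroup in this hierarchy. Conceptually it is a parent-pointer walk; a rough sketch only (the real kernel/cgroup.c helper additionally has to look up the task's cgroup under proper locking):

	static bool at_or_below(struct cgroup *cgrp, struct cgroup *ancestor)
	{
		while (cgrp) {
			if (cgrp == ancestor)
				return true;
			cgrp = cgrp->parent;
		}
		return false;
	}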
diff --git a/kernel/panic.c b/kernel/panic.c
index 32fe4eff1b89..3fd8c5bf8b39 100644
--- a/kernel/panic.c
+++ b/kernel/panic.c
@@ -8,19 +8,19 @@
8 * This function is used throughout the kernel (including mm and fs) 8 * This function is used throughout the kernel (including mm and fs)
9 * to indicate a major problem. 9 * to indicate a major problem.
10 */ 10 */
11#include <linux/debug_locks.h>
12#include <linux/interrupt.h>
13#include <linux/kallsyms.h>
14#include <linux/notifier.h>
11#include <linux/module.h> 15#include <linux/module.h>
12#include <linux/sched.h> 16#include <linux/random.h>
13#include <linux/delay.h>
14#include <linux/reboot.h> 17#include <linux/reboot.h>
15#include <linux/notifier.h> 18#include <linux/delay.h>
16#include <linux/init.h> 19#include <linux/kexec.h>
20#include <linux/sched.h>
17#include <linux/sysrq.h> 21#include <linux/sysrq.h>
18#include <linux/interrupt.h> 22#include <linux/init.h>
19#include <linux/nmi.h> 23#include <linux/nmi.h>
20#include <linux/kexec.h>
21#include <linux/debug_locks.h>
22#include <linux/random.h>
23#include <linux/kallsyms.h>
24#include <linux/dmi.h> 24#include <linux/dmi.h>
25 25
26int panic_on_oops; 26int panic_on_oops;
@@ -52,19 +52,15 @@ EXPORT_SYMBOL(panic_blink);
52 * 52 *
53 * This function never returns. 53 * This function never returns.
54 */ 54 */
55
56NORET_TYPE void panic(const char * fmt, ...) 55NORET_TYPE void panic(const char * fmt, ...)
57{ 56{
58 long i;
59 static char buf[1024]; 57 static char buf[1024];
60 va_list args; 58 va_list args;
61#if defined(CONFIG_S390) 59 long i;
62 unsigned long caller = (unsigned long) __builtin_return_address(0);
63#endif
64 60
65 /* 61 /*
66 * It's possible to come here directly from a panic-assertion and not 62 * It's possible to come here directly from a panic-assertion and
67 * have preempt disabled. Some functions called from here want 63 * not have preempt disabled. Some functions called from here want
68 * preempt to be disabled. No point enabling it later though... 64 * preempt to be disabled. No point enabling it later though...
69 */ 65 */
70 preempt_disable(); 66 preempt_disable();
@@ -77,7 +73,6 @@ NORET_TYPE void panic(const char * fmt, ...)
77#ifdef CONFIG_DEBUG_BUGVERBOSE 73#ifdef CONFIG_DEBUG_BUGVERBOSE
78 dump_stack(); 74 dump_stack();
79#endif 75#endif
80 bust_spinlocks(0);
81 76
82 /* 77 /*
83 * If we have crashed and we have a crash kernel loaded let it handle 78 * If we have crashed and we have a crash kernel loaded let it handle
@@ -86,14 +81,12 @@ NORET_TYPE void panic(const char * fmt, ...)
86 */ 81 */
87 crash_kexec(NULL); 82 crash_kexec(NULL);
88 83
89#ifdef CONFIG_SMP
90 /* 84 /*
91 * Note smp_send_stop is the usual smp shutdown function, which 85 * Note smp_send_stop is the usual smp shutdown function, which
92 * unfortunately means it may not be hardened to work in a panic 86 * unfortunately means it may not be hardened to work in a panic
93 * situation. 87 * situation.
94 */ 88 */
95 smp_send_stop(); 89 smp_send_stop();
96#endif
97 90
98 atomic_notifier_call_chain(&panic_notifier_list, 0, buf); 91 atomic_notifier_call_chain(&panic_notifier_list, 0, buf);
99 92
@@ -102,19 +95,21 @@ NORET_TYPE void panic(const char * fmt, ...)
102 95
103 if (panic_timeout > 0) { 96 if (panic_timeout > 0) {
104 /* 97 /*
105 * Delay timeout seconds before rebooting the machine. 98 * Delay timeout seconds before rebooting the machine.
106 * We can't use the "normal" timers since we just panicked.. 99 * We can't use the "normal" timers since we just panicked.
107 */ 100 */
108 printk(KERN_EMERG "Rebooting in %d seconds..",panic_timeout); 101 printk(KERN_EMERG "Rebooting in %d seconds..", panic_timeout);
102
109 for (i = 0; i < panic_timeout*1000; ) { 103 for (i = 0; i < panic_timeout*1000; ) {
110 touch_nmi_watchdog(); 104 touch_nmi_watchdog();
111 i += panic_blink(i); 105 i += panic_blink(i);
112 mdelay(1); 106 mdelay(1);
113 i++; 107 i++;
114 } 108 }
115 /* This will not be a clean reboot, with everything 109 /*
116 * shutting down. But if there is a chance of 110 * This will not be a clean reboot, with everything
117 * rebooting the system it will be rebooted. 111 * shutting down. But if there is a chance of
112 * rebooting the system it will be rebooted.
118 */ 113 */
119 emergency_restart(); 114 emergency_restart();
120 } 115 }
@@ -127,38 +122,44 @@ NORET_TYPE void panic(const char * fmt, ...)
127 } 122 }
128#endif 123#endif
129#if defined(CONFIG_S390) 124#if defined(CONFIG_S390)
130 disabled_wait(caller); 125 {
126 unsigned long caller;
127
128 caller = (unsigned long)__builtin_return_address(0);
129 disabled_wait(caller);
130 }
131#endif 131#endif
132 local_irq_enable(); 132 local_irq_enable();
133 for (i = 0;;) { 133 for (i = 0; ; ) {
134 touch_softlockup_watchdog(); 134 touch_softlockup_watchdog();
135 i += panic_blink(i); 135 i += panic_blink(i);
136 mdelay(1); 136 mdelay(1);
137 i++; 137 i++;
138 } 138 }
139 bust_spinlocks(0);
139} 140}
140 141
141EXPORT_SYMBOL(panic); 142EXPORT_SYMBOL(panic);
142 143
143 144
144struct tnt { 145struct tnt {
145 u8 bit; 146 u8 bit;
146 char true; 147 char true;
147 char false; 148 char false;
148}; 149};
149 150
150static const struct tnt tnts[] = { 151static const struct tnt tnts[] = {
151 { TAINT_PROPRIETARY_MODULE, 'P', 'G' }, 152 { TAINT_PROPRIETARY_MODULE, 'P', 'G' },
152 { TAINT_FORCED_MODULE, 'F', ' ' }, 153 { TAINT_FORCED_MODULE, 'F', ' ' },
153 { TAINT_UNSAFE_SMP, 'S', ' ' }, 154 { TAINT_UNSAFE_SMP, 'S', ' ' },
154 { TAINT_FORCED_RMMOD, 'R', ' ' }, 155 { TAINT_FORCED_RMMOD, 'R', ' ' },
155 { TAINT_MACHINE_CHECK, 'M', ' ' }, 156 { TAINT_MACHINE_CHECK, 'M', ' ' },
156 { TAINT_BAD_PAGE, 'B', ' ' }, 157 { TAINT_BAD_PAGE, 'B', ' ' },
157 { TAINT_USER, 'U', ' ' }, 158 { TAINT_USER, 'U', ' ' },
158 { TAINT_DIE, 'D', ' ' }, 159 { TAINT_DIE, 'D', ' ' },
159 { TAINT_OVERRIDDEN_ACPI_TABLE, 'A', ' ' }, 160 { TAINT_OVERRIDDEN_ACPI_TABLE, 'A', ' ' },
160 { TAINT_WARN, 'W', ' ' }, 161 { TAINT_WARN, 'W', ' ' },
161 { TAINT_CRAP, 'C', ' ' }, 162 { TAINT_CRAP, 'C', ' ' },
162}; 163};
163 164
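The tnts[] table above drives print_tainted(): each entry emits its first character when the taint bit is set and its second otherwise. A small standalone sketch of the same mapping, with an abbreviated, illustrative table:

	#include <stdio.h>

	struct tnt { unsigned bit; char yes, no; };

	static const struct tnt tnts[] = {
		{ 0, 'P', 'G' },	/* TAINT_PROPRIETARY_MODULE */
		{ 1, 'F', ' ' },	/* TAINT_FORCED_MODULE */
		{ 7, 'D', ' ' },	/* TAINT_DIE */
	};

	int main(void)
	{
		unsigned long mask = (1UL << 1) | (1UL << 7);	/* F and D set */
		char buf[sizeof(tnts) / sizeof(tnts[0]) + 1], *s = buf;
		size_t i;

		for (i = 0; i < sizeof(tnts) / sizeof(tnts[0]); i++)
			*s++ = (mask & (1UL << tnts[i].bit))
				? tnts[i].yes : tnts[i].no;
		*s = '\0';
		printf("Tainted: %s\n", buf);	/* prints "Tainted: GFD" */
		return 0;
	}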
164/** 165/**
@@ -195,7 +196,8 @@ const char *print_tainted(void)
195 *s = 0; 196 *s = 0;
196 } else 197 } else
197 snprintf(buf, sizeof(buf), "Not tainted"); 198 snprintf(buf, sizeof(buf), "Not tainted");
198 return(buf); 199
200 return buf;
199} 201}
200 202
201int test_taint(unsigned flag) 203int test_taint(unsigned flag)
@@ -211,7 +213,8 @@ unsigned long get_taint(void)
211 213
212void add_taint(unsigned flag) 214void add_taint(unsigned flag)
213{ 215{
214 debug_locks = 0; /* can't trust the integrity of the kernel anymore */ 216 /* can't trust the integrity of the kernel anymore: */
217 debug_locks = 0;
215 set_bit(flag, &tainted_mask); 218 set_bit(flag, &tainted_mask);
216} 219}
217EXPORT_SYMBOL(add_taint); 220EXPORT_SYMBOL(add_taint);
@@ -266,8 +269,8 @@ static void do_oops_enter_exit(void)
266} 269}
267 270
268/* 271/*
269 * Return true if the calling CPU is allowed to print oops-related info. This 272 * Return true if the calling CPU is allowed to print oops-related info.
270 * is a bit racy.. 273 * This is a bit racy..
271 */ 274 */
272int oops_may_print(void) 275int oops_may_print(void)
273{ 276{
@@ -276,20 +279,22 @@ int oops_may_print(void)
276 279
277/* 280/*
278 * Called when the architecture enters its oops handler, before it prints 281 * Called when the architecture enters its oops handler, before it prints
279 * anything. If this is the first CPU to oops, and it's oopsing the first time 282 * anything. If this is the first CPU to oops, and it's oopsing the first
280 * then let it proceed. 283 * time then let it proceed.
281 * 284 *
282 * This is all enabled by the pause_on_oops kernel boot option. We do all this 285 * This is all enabled by the pause_on_oops kernel boot option. We do all
283 * to ensure that oopses don't scroll off the screen. It has the side-effect 286 * this to ensure that oopses don't scroll off the screen. It has the
284 * of preventing later-oopsing CPUs from mucking up the display, too. 287 * side-effect of preventing later-oopsing CPUs from mucking up the display,
288 * too.
285 * 289 *
286 * It turns out that the CPU which is allowed to print ends up pausing for the 290 * It turns out that the CPU which is allowed to print ends up pausing for
287 * right duration, whereas all the other CPUs pause for twice as long: once in 291 * the right duration, whereas all the other CPUs pause for twice as long:
288 * oops_enter(), once in oops_exit(). 292 * once in oops_enter(), once in oops_exit().
289 */ 293 */
290void oops_enter(void) 294void oops_enter(void)
291{ 295{
292 debug_locks_off(); /* can't trust the integrity of the kernel anymore */ 296 /* can't trust the integrity of the kernel anymore: */
297 debug_locks_off();
293 do_oops_enter_exit(); 298 do_oops_enter_exit();
294} 299}
295 300
diff --git a/kernel/params.c b/kernel/params.c
index a1e3025b19a9..de273ec85bd2 100644
--- a/kernel/params.c
+++ b/kernel/params.c
@@ -24,6 +24,9 @@
24#include <linux/err.h> 24#include <linux/err.h>
25#include <linux/slab.h> 25#include <linux/slab.h>
26 26
27/* We abuse the high bits of "perm" to record whether we kmalloc'ed. */
28#define KPARAM_KMALLOCED 0x80000000
29
27#if 0 30#if 0
28#define DEBUGP printk 31#define DEBUGP printk
29#else 32#else
@@ -217,7 +220,19 @@ int param_set_charp(const char *val, struct kernel_param *kp)
217 return -ENOSPC; 220 return -ENOSPC;
218 } 221 }
219 222
220 *(char **)kp->arg = (char *)val; 223 if (kp->perm & KPARAM_KMALLOCED)
224 kfree(*(char **)kp->arg);
225
226 /* This is a hack. We can't strdup in early boot, and we
227 * don't need to; this mangled commandline is preserved. */
228 if (slab_is_available()) {
229 kp->perm |= KPARAM_KMALLOCED;
230 *(char **)kp->arg = kstrdup(val, GFP_KERNEL);
231 if (!*(char **)kp->arg)
232 return -ENOMEM;
233 } else
234 *(const char **)kp->arg = val;
235
221 return 0; 236 return 0;
222} 237}
223 238
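Because perm otherwise only holds mode bits such as 0644, the top bit is free to mark ownership of the string. A hypothetical helper showing how the flag and kfree() are meant to pair up (not part of the patch):

	static void param_free_charp(struct kernel_param *kp)
	{
		if (kp->perm & KPARAM_KMALLOCED) {
			kfree(*(char **)kp->arg);
			*(char **)kp->arg = NULL;
			kp->perm &= ~KPARAM_KMALLOCED;
		}
	}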
@@ -571,6 +586,15 @@ void module_param_sysfs_remove(struct module *mod)
571} 586}
572#endif 587#endif
573 588
589void destroy_params(const struct kernel_param *params, unsigned num)
590{
591 unsigned int i;
592
593 for (i = 0; i < num; i++)
594 if (params[i].perm & KPARAM_KMALLOCED)
595 kfree(*(char **)params[i].arg);
596}
597
574static void __init kernel_add_sysfs_param(const char *name, 598static void __init kernel_add_sysfs_param(const char *name,
575 struct kernel_param *kparam, 599 struct kernel_param *kparam,
576 unsigned int name_skip) 600 unsigned int name_skip)
diff --git a/kernel/pid.c b/kernel/pid.c
index 1b3586fe753a..b2e5f78fd281 100644
--- a/kernel/pid.c
+++ b/kernel/pid.c
@@ -403,6 +403,8 @@ struct pid *get_task_pid(struct task_struct *task, enum pid_type type)
403{ 403{
404 struct pid *pid; 404 struct pid *pid;
405 rcu_read_lock(); 405 rcu_read_lock();
406 if (type != PIDTYPE_PID)
407 task = task->group_leader;
406 pid = get_pid(task->pids[type].pid); 408 pid = get_pid(task->pids[type].pid);
407 rcu_read_unlock(); 409 rcu_read_unlock();
408 return pid; 410 return pid;
@@ -450,11 +452,24 @@ pid_t pid_vnr(struct pid *pid)
450} 452}
451EXPORT_SYMBOL_GPL(pid_vnr); 453EXPORT_SYMBOL_GPL(pid_vnr);
452 454
453pid_t task_pid_nr_ns(struct task_struct *tsk, struct pid_namespace *ns) 455pid_t __task_pid_nr_ns(struct task_struct *task, enum pid_type type,
456 struct pid_namespace *ns)
454{ 457{
455 return pid_nr_ns(task_pid(tsk), ns); 458 pid_t nr = 0;
459
460 rcu_read_lock();
461 if (!ns)
462 ns = current->nsproxy->pid_ns;
463 if (likely(pid_alive(task))) {
464 if (type != PIDTYPE_PID)
465 task = task->group_leader;
466 nr = pid_nr_ns(task->pids[type].pid, ns);
467 }
468 rcu_read_unlock();
469
470 return nr;
456} 471}
457EXPORT_SYMBOL(task_pid_nr_ns); 472EXPORT_SYMBOL(__task_pid_nr_ns);
458 473
459pid_t task_tgid_nr_ns(struct task_struct *tsk, struct pid_namespace *ns) 474pid_t task_tgid_nr_ns(struct task_struct *tsk, struct pid_namespace *ns)
460{ 475{
@@ -462,18 +477,6 @@ pid_t task_tgid_nr_ns(struct task_struct *tsk, struct pid_namespace *ns)
462} 477}
463EXPORT_SYMBOL(task_tgid_nr_ns); 478EXPORT_SYMBOL(task_tgid_nr_ns);
464 479
465pid_t task_pgrp_nr_ns(struct task_struct *tsk, struct pid_namespace *ns)
466{
467 return pid_nr_ns(task_pgrp(tsk), ns);
468}
469EXPORT_SYMBOL(task_pgrp_nr_ns);
470
471pid_t task_session_nr_ns(struct task_struct *tsk, struct pid_namespace *ns)
472{
473 return pid_nr_ns(task_session(tsk), ns);
474}
475EXPORT_SYMBOL(task_session_nr_ns);
476
477struct pid_namespace *task_active_pid_ns(struct task_struct *tsk) 480struct pid_namespace *task_active_pid_ns(struct task_struct *tsk)
478{ 481{
479 return ns_of_pid(task_pid(tsk)); 482 return ns_of_pid(task_pid(tsk));
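The removed task_pgrp_nr_ns()/task_session_nr_ns() bodies are not lost: with __task_pid_nr_ns() carrying the shared (and now pid_alive-safe) logic, the per-type entry points survive as trivial wrappers, roughly as follows in <linux/sched.h>:

	static inline pid_t task_pgrp_nr_ns(struct task_struct *tsk,
					    struct pid_namespace *ns)
	{
		return __task_pid_nr_ns(tsk, PIDTYPE_PGID, ns);
	}

	static inline pid_t task_session_nr_ns(struct task_struct *tsk,
					       struct pid_namespace *ns)
	{
		return __task_pid_nr_ns(tsk, PIDTYPE_SID, ns);
	}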
diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c
index fab8ea86fac3..2d1001b4858d 100644
--- a/kernel/pid_namespace.c
+++ b/kernel/pid_namespace.c
@@ -152,6 +152,7 @@ void zap_pid_ns_processes(struct pid_namespace *pid_ns)
152{ 152{
153 int nr; 153 int nr;
154 int rc; 154 int rc;
155 struct task_struct *task;
155 156
156 /* 157 /*
157 * The last thread in the cgroup-init thread group is terminating. 158 * The last thread in the cgroup-init thread group is terminating.
@@ -169,7 +170,19 @@ void zap_pid_ns_processes(struct pid_namespace *pid_ns)
169 read_lock(&tasklist_lock); 170 read_lock(&tasklist_lock);
170 nr = next_pidmap(pid_ns, 1); 171 nr = next_pidmap(pid_ns, 1);
171 while (nr > 0) { 172 while (nr > 0) {
172 kill_proc_info(SIGKILL, SEND_SIG_PRIV, nr); 173 rcu_read_lock();
174
175 /*
176 * Use force_sig() since it clears SIGNAL_UNKILLABLE, ensuring
177 * any nested container's init processes don't ignore the
178 * signal.
179 */
180 task = pid_task(find_vpid(nr), PIDTYPE_PID);
181 if (task)
182 force_sig(SIGKILL, task);
183
184 rcu_read_unlock();
185
173 nr = next_pidmap(pid_ns, nr); 186 nr = next_pidmap(pid_ns, nr);
174 } 187 }
175 read_unlock(&tasklist_lock); 188 read_unlock(&tasklist_lock);
diff --git a/kernel/posix-cpu-timers.c b/kernel/posix-cpu-timers.c
index e976e505648d..8e5d9a68b022 100644
--- a/kernel/posix-cpu-timers.c
+++ b/kernel/posix-cpu-timers.c
@@ -1370,7 +1370,8 @@ static inline int fastpath_timer_check(struct task_struct *tsk)
1370 if (task_cputime_expired(&group_sample, &sig->cputime_expires)) 1370 if (task_cputime_expired(&group_sample, &sig->cputime_expires))
1371 return 1; 1371 return 1;
1372 } 1372 }
1373 return 0; 1373
1374 return sig->rlim[RLIMIT_CPU].rlim_cur != RLIM_INFINITY;
1374} 1375}
1375 1376
1376/* 1377/*
diff --git a/kernel/power/disk.c b/kernel/power/disk.c
index 4a4a206b1979..5f21ab2bbcdf 100644
--- a/kernel/power/disk.c
+++ b/kernel/power/disk.c
@@ -22,6 +22,7 @@
22#include <linux/console.h> 22#include <linux/console.h>
23#include <linux/cpu.h> 23#include <linux/cpu.h>
24#include <linux/freezer.h> 24#include <linux/freezer.h>
25#include <asm/suspend.h>
25 26
26#include "power.h" 27#include "power.h"
27 28
@@ -214,7 +215,7 @@ static int create_image(int platform_mode)
214 return error; 215 return error;
215 216
216 device_pm_lock(); 217 device_pm_lock();
217 local_irq_disable(); 218
218 /* At this point, device_suspend() has been called, but *not* 219 /* At this point, device_suspend() has been called, but *not*
219 * device_power_down(). We *must* call device_power_down() now. 220 * device_power_down(). We *must* call device_power_down() now.
220 * Otherwise, drivers for some devices (e.g. interrupt controllers) 221 * Otherwise, drivers for some devices (e.g. interrupt controllers)
@@ -225,13 +226,25 @@ static int create_image(int platform_mode)
225 if (error) { 226 if (error) {
226 printk(KERN_ERR "PM: Some devices failed to power down, " 227 printk(KERN_ERR "PM: Some devices failed to power down, "
227 "aborting hibernation\n"); 228 "aborting hibernation\n");
228 goto Enable_irqs; 229 goto Unlock;
229 } 230 }
231
232 error = platform_pre_snapshot(platform_mode);
233 if (error || hibernation_test(TEST_PLATFORM))
234 goto Platform_finish;
235
236 error = disable_nonboot_cpus();
237 if (error || hibernation_test(TEST_CPUS)
238 || hibernation_testmode(HIBERNATION_TEST))
239 goto Enable_cpus;
240
241 local_irq_disable();
242
230 sysdev_suspend(PMSG_FREEZE); 243 sysdev_suspend(PMSG_FREEZE);
231 if (error) { 244 if (error) {
232 printk(KERN_ERR "PM: Some devices failed to power down, " 245 printk(KERN_ERR "PM: Some devices failed to power down, "
233 "aborting hibernation\n"); 246 "aborting hibernation\n");
234 goto Power_up_devices; 247 goto Enable_irqs;
235 } 248 }
236 249
237 if (hibernation_test(TEST_CORE)) 250 if (hibernation_test(TEST_CORE))
@@ -247,17 +260,28 @@ static int create_image(int platform_mode)
247 restore_processor_state(); 260 restore_processor_state();
248 if (!in_suspend) 261 if (!in_suspend)
249 platform_leave(platform_mode); 262 platform_leave(platform_mode);
263
250 Power_up: 264 Power_up:
251 sysdev_resume(); 265 sysdev_resume();
252 /* NOTE: device_power_up() is just a resume() for devices 266 /* NOTE: device_power_up() is just a resume() for devices
253 * that suspended with irqs off ... no overall powerup. 267 * that suspended with irqs off ... no overall powerup.
254 */ 268 */
255 Power_up_devices: 269
256 device_power_up(in_suspend ?
257 (error ? PMSG_RECOVER : PMSG_THAW) : PMSG_RESTORE);
258 Enable_irqs: 270 Enable_irqs:
259 local_irq_enable(); 271 local_irq_enable();
272
273 Enable_cpus:
274 enable_nonboot_cpus();
275
276 Platform_finish:
277 platform_finish(platform_mode);
278
279 device_power_up(in_suspend ?
280 (error ? PMSG_RECOVER : PMSG_THAW) : PMSG_RESTORE);
281
282 Unlock:
260 device_pm_unlock(); 283 device_pm_unlock();
284
261 return error; 285 return error;
262} 286}
263 287
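The reordered create_image() follows the kernel's label-ladder idiom: each failing setup step jumps to the label that undoes only what has already been done, and the success path falls through the same labels so teardown always runs in reverse order. A compilable toy of the shape (stub names are illustrative stand-ins for the PM calls):

	static int quiesce_devices(void)     { return 0; } /* device_power_down()    */
	static int stop_other_cpus(void)     { return 0; } /* disable_nonboot_cpus() */
	static void restart_other_cpus(void) { }           /* enable_nonboot_cpus()  */
	static void resume_devices(void)     { }           /* device_power_up()      */

	static int do_transition(void)
	{
		int error;

		error = quiesce_devices();
		if (error)
			goto Unlock;

		error = stop_other_cpus();
		if (error)	/* re-enable any CPUs taken down before the failure */
			goto Restart_cpus;

		/* ...the snapshot itself would run here, fully quiesced... */

	 Restart_cpus:
		restart_other_cpus();
		resume_devices();
	 Unlock:
		return error;
	}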
@@ -265,7 +289,7 @@ static int create_image(int platform_mode)
265 * hibernation_snapshot - quiesce devices and create the hibernation 289 * hibernation_snapshot - quiesce devices and create the hibernation
266 * snapshot image. 290 * snapshot image.
267 * @platform_mode - if set, use the platform driver, if available, to 291 * @platform_mode - if set, use the platform driver, if available, to
268 * prepare the platform frimware for the power transition. 292 * prepare the platform firmware for the power transition.
269 * 293 *
270 * Must be called with pm_mutex held 294 * Must be called with pm_mutex held
271 */ 295 */
@@ -291,25 +315,9 @@ int hibernation_snapshot(int platform_mode)
291 if (hibernation_test(TEST_DEVICES)) 315 if (hibernation_test(TEST_DEVICES))
292 goto Recover_platform; 316 goto Recover_platform;
293 317
294 error = platform_pre_snapshot(platform_mode); 318 error = create_image(platform_mode);
295 if (error || hibernation_test(TEST_PLATFORM)) 319 /* Control returns here after successful restore */
296 goto Finish;
297
298 error = disable_nonboot_cpus();
299 if (!error) {
300 if (hibernation_test(TEST_CPUS))
301 goto Enable_cpus;
302
303 if (hibernation_testmode(HIBERNATION_TEST))
304 goto Enable_cpus;
305 320
306 error = create_image(platform_mode);
307 /* Control returns here after successful restore */
308 }
309 Enable_cpus:
310 enable_nonboot_cpus();
311 Finish:
312 platform_finish(platform_mode);
313 Resume_devices: 321 Resume_devices:
314 device_resume(in_suspend ? 322 device_resume(in_suspend ?
315 (error ? PMSG_RECOVER : PMSG_THAW) : PMSG_RESTORE); 323 (error ? PMSG_RECOVER : PMSG_THAW) : PMSG_RESTORE);
@@ -331,19 +339,33 @@ int hibernation_snapshot(int platform_mode)
331 * kernel. 339 * kernel.
332 */ 340 */
333 341
334static int resume_target_kernel(void) 342static int resume_target_kernel(bool platform_mode)
335{ 343{
336 int error; 344 int error;
337 345
338 device_pm_lock(); 346 device_pm_lock();
339 local_irq_disable(); 347
340 error = device_power_down(PMSG_QUIESCE); 348 error = device_power_down(PMSG_QUIESCE);
341 if (error) { 349 if (error) {
342 printk(KERN_ERR "PM: Some devices failed to power down, " 350 printk(KERN_ERR "PM: Some devices failed to power down, "
343 "aborting resume\n"); 351 "aborting resume\n");
344 goto Enable_irqs; 352 goto Unlock;
345 } 353 }
346 sysdev_suspend(PMSG_QUIESCE); 354
355 error = platform_pre_restore(platform_mode);
356 if (error)
357 goto Cleanup;
358
359 error = disable_nonboot_cpus();
360 if (error)
361 goto Enable_cpus;
362
363 local_irq_disable();
364
365 error = sysdev_suspend(PMSG_QUIESCE);
366 if (error)
367 goto Enable_irqs;
368
347 /* We'll ignore saved state, but this gets preempt count (etc) right */ 369 /* We'll ignore saved state, but this gets preempt count (etc) right */
348 save_processor_state(); 370 save_processor_state();
349 error = restore_highmem(); 371 error = restore_highmem();
@@ -366,11 +388,23 @@ static int resume_target_kernel(void)
366 swsusp_free(); 388 swsusp_free();
367 restore_processor_state(); 389 restore_processor_state();
368 touch_softlockup_watchdog(); 390 touch_softlockup_watchdog();
391
369 sysdev_resume(); 392 sysdev_resume();
370 device_power_up(PMSG_RECOVER); 393
371 Enable_irqs: 394 Enable_irqs:
372 local_irq_enable(); 395 local_irq_enable();
396
397 Enable_cpus:
398 enable_nonboot_cpus();
399
400 Cleanup:
401 platform_restore_cleanup(platform_mode);
402
403 device_power_up(PMSG_RECOVER);
404
405 Unlock:
373 device_pm_unlock(); 406 device_pm_unlock();
407
374 return error; 408 return error;
375} 409}
376 410
@@ -378,7 +412,7 @@ static int resume_target_kernel(void)
378 * hibernation_restore - quiesce devices and restore the hibernation 412 * hibernation_restore - quiesce devices and restore the hibernation
379 * snapshot image. If successful, control returns in hibernation_snapshot() 413
380 * @platform_mode - if set, use the platform driver, if available, to 414 * @platform_mode - if set, use the platform driver, if available, to
381 * prepare the platform frimware for the transition. 415 * prepare the platform firmware for the transition.
382 * 416 *
383 * Must be called with pm_mutex held 417 * Must be called with pm_mutex held
384 */ 418 */
@@ -390,19 +424,10 @@ int hibernation_restore(int platform_mode)
390 pm_prepare_console(); 424 pm_prepare_console();
391 suspend_console(); 425 suspend_console();
392 error = device_suspend(PMSG_QUIESCE); 426 error = device_suspend(PMSG_QUIESCE);
393 if (error)
394 goto Finish;
395
396 error = platform_pre_restore(platform_mode);
397 if (!error) { 427 if (!error) {
398 error = disable_nonboot_cpus(); 428 error = resume_target_kernel(platform_mode);
399 if (!error) 429 device_resume(PMSG_RECOVER);
400 error = resume_target_kernel();
401 enable_nonboot_cpus();
402 } 430 }
403 platform_restore_cleanup(platform_mode);
404 device_resume(PMSG_RECOVER);
405 Finish:
406 resume_console(); 431 resume_console();
407 pm_restore_console(); 432 pm_restore_console();
408 return error; 433 return error;
@@ -438,38 +463,46 @@ int hibernation_platform_enter(void)
438 goto Resume_devices; 463 goto Resume_devices;
439 } 464 }
440 465
466 device_pm_lock();
467
468 error = device_power_down(PMSG_HIBERNATE);
469 if (error)
470 goto Unlock;
471
441 error = hibernation_ops->prepare(); 472 error = hibernation_ops->prepare();
442 if (error) 473 if (error)
443 goto Resume_devices; 474 goto Platform_finish;
444 475
445 error = disable_nonboot_cpus(); 476 error = disable_nonboot_cpus();
446 if (error) 477 if (error)
447 goto Finish; 478 goto Platform_finish;
448 479
449 device_pm_lock();
450 local_irq_disable(); 480 local_irq_disable();
451 error = device_power_down(PMSG_HIBERNATE); 481 sysdev_suspend(PMSG_HIBERNATE);
452 if (!error) { 482 hibernation_ops->enter();
453 sysdev_suspend(PMSG_HIBERNATE); 483 /* We should never get here */
454 hibernation_ops->enter(); 484 while (1);
455 /* We should never get here */
456 while (1);
457 }
458 local_irq_enable();
459 device_pm_unlock();
460 485
461 /* 486 /*
462 * We don't need to reenable the nonboot CPUs or resume consoles, since 487 * We don't need to reenable the nonboot CPUs or resume consoles, since
463 * the system is going to be halted anyway. 488 * the system is going to be halted anyway.
464 */ 489 */
465 Finish: 490 Platform_finish:
466 hibernation_ops->finish(); 491 hibernation_ops->finish();
492
493 device_power_up(PMSG_RESTORE);
494
495 Unlock:
496 device_pm_unlock();
497
467 Resume_devices: 498 Resume_devices:
468 entering_platform_hibernation = false; 499 entering_platform_hibernation = false;
469 device_resume(PMSG_RESTORE); 500 device_resume(PMSG_RESTORE);
470 resume_console(); 501 resume_console();
502
471 Close: 503 Close:
472 hibernation_ops->end(); 504 hibernation_ops->end();
505
473 return error; 506 return error;
474} 507}
475 508
diff --git a/kernel/power/main.c b/kernel/power/main.c
index c9632f841f64..f172f41858bb 100644
--- a/kernel/power/main.c
+++ b/kernel/power/main.c
@@ -287,17 +287,32 @@ void __attribute__ ((weak)) arch_suspend_enable_irqs(void)
287 */ 287 */
288static int suspend_enter(suspend_state_t state) 288static int suspend_enter(suspend_state_t state)
289{ 289{
290 int error = 0; 290 int error;
291 291
292 device_pm_lock(); 292 device_pm_lock();
293 arch_suspend_disable_irqs();
294 BUG_ON(!irqs_disabled());
295 293
296 if ((error = device_power_down(PMSG_SUSPEND))) { 294 error = device_power_down(PMSG_SUSPEND);
295 if (error) {
297 printk(KERN_ERR "PM: Some devices failed to power down\n"); 296 printk(KERN_ERR "PM: Some devices failed to power down\n");
298 goto Done; 297 goto Done;
299 } 298 }
300 299
300 if (suspend_ops->prepare) {
301 error = suspend_ops->prepare();
302 if (error)
303 goto Power_up_devices;
304 }
305
306 if (suspend_test(TEST_PLATFORM))
307 goto Platform_finish;
308
309 error = disable_nonboot_cpus();
310 if (error || suspend_test(TEST_CPUS))
311 goto Enable_cpus;
312
313 arch_suspend_disable_irqs();
314 BUG_ON(!irqs_disabled());
315
301 error = sysdev_suspend(PMSG_SUSPEND); 316 error = sysdev_suspend(PMSG_SUSPEND);
302 if (!error) { 317 if (!error) {
303 if (!suspend_test(TEST_CORE)) 318 if (!suspend_test(TEST_CORE))
@@ -305,11 +320,22 @@ static int suspend_enter(suspend_state_t state)
305 sysdev_resume(); 320 sysdev_resume();
306 } 321 }
307 322
308 device_power_up(PMSG_RESUME);
309 Done:
310 arch_suspend_enable_irqs(); 323 arch_suspend_enable_irqs();
311 BUG_ON(irqs_disabled()); 324 BUG_ON(irqs_disabled());
325
326 Enable_cpus:
327 enable_nonboot_cpus();
328
329 Platform_finish:
330 if (suspend_ops->finish)
331 suspend_ops->finish();
332
333 Power_up_devices:
334 device_power_up(PMSG_RESUME);
335
336 Done:
312 device_pm_unlock(); 337 device_pm_unlock();
338
313 return error; 339 return error;
314} 340}
315 341
@@ -341,23 +367,8 @@ int suspend_devices_and_enter(suspend_state_t state)
341 if (suspend_test(TEST_DEVICES)) 367 if (suspend_test(TEST_DEVICES))
342 goto Recover_platform; 368 goto Recover_platform;
343 369
344 if (suspend_ops->prepare) { 370 suspend_enter(state);
345 error = suspend_ops->prepare();
346 if (error)
347 goto Resume_devices;
348 }
349
350 if (suspend_test(TEST_PLATFORM))
351 goto Finish;
352
353 error = disable_nonboot_cpus();
354 if (!error && !suspend_test(TEST_CPUS))
355 suspend_enter(state);
356 371
357 enable_nonboot_cpus();
358 Finish:
359 if (suspend_ops->finish)
360 suspend_ops->finish();
361 Resume_devices: 372 Resume_devices:
362 suspend_test_start(); 373 suspend_test_start();
363 device_resume(PMSG_RESUME); 374 device_resume(PMSG_RESUME);
diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c
index f5fc2d7680f2..33e2e4a819f9 100644
--- a/kernel/power/snapshot.c
+++ b/kernel/power/snapshot.c
@@ -321,13 +321,10 @@ static int create_mem_extents(struct list_head *list, gfp_t gfp_mask)
321 321
322 INIT_LIST_HEAD(list); 322 INIT_LIST_HEAD(list);
323 323
324 for_each_zone(zone) { 324 for_each_populated_zone(zone) {
325 unsigned long zone_start, zone_end; 325 unsigned long zone_start, zone_end;
326 struct mem_extent *ext, *cur, *aux; 326 struct mem_extent *ext, *cur, *aux;
327 327
328 if (!populated_zone(zone))
329 continue;
330
331 zone_start = zone->zone_start_pfn; 328 zone_start = zone->zone_start_pfn;
332 zone_end = zone->zone_start_pfn + zone->spanned_pages; 329 zone_end = zone->zone_start_pfn + zone->spanned_pages;
333 330
@@ -804,8 +801,8 @@ static unsigned int count_free_highmem_pages(void)
804 struct zone *zone; 801 struct zone *zone;
805 unsigned int cnt = 0; 802 unsigned int cnt = 0;
806 803
807 for_each_zone(zone) 804 for_each_populated_zone(zone)
808 if (populated_zone(zone) && is_highmem(zone)) 805 if (is_highmem(zone))
809 cnt += zone_page_state(zone, NR_FREE_PAGES); 806 cnt += zone_page_state(zone, NR_FREE_PAGES);
810 807
811 return cnt; 808 return cnt;
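for_each_populated_zone() folds the populated_zone() filter into the iterator itself, which is what lets both hunks drop a level of indentation. The macro is roughly the following dangling-else construction (paraphrased from <linux/mmzone.h> of this era):

	#define for_each_populated_zone(zone)			\
		for (zone = (first_online_pgdat())->node_zones;	\
		     zone;					\
		     zone = next_zone(zone))			\
			if (!populated_zone(zone))		\
				; /* skip empty zones */	\
			else

The trailing else attaches the caller's loop body to the populated case while the whole thing still parses as a single statement.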
diff --git a/kernel/power/swsusp.c b/kernel/power/swsusp.c
index a92c91451559..78c35047586d 100644
--- a/kernel/power/swsusp.c
+++ b/kernel/power/swsusp.c
@@ -51,6 +51,7 @@
51#include <linux/highmem.h> 51#include <linux/highmem.h>
52#include <linux/time.h> 52#include <linux/time.h>
53#include <linux/rbtree.h> 53#include <linux/rbtree.h>
54#include <linux/io.h>
54 55
55#include "power.h" 56#include "power.h"
56 57
@@ -229,17 +230,16 @@ int swsusp_shrink_memory(void)
229 size = count_data_pages() + PAGES_FOR_IO + SPARE_PAGES; 230 size = count_data_pages() + PAGES_FOR_IO + SPARE_PAGES;
230 tmp = size; 231 tmp = size;
231 size += highmem_size; 232 size += highmem_size;
232 for_each_zone (zone) 233 for_each_populated_zone(zone) {
233 if (populated_zone(zone)) { 234 tmp += snapshot_additional_pages(zone);
234 tmp += snapshot_additional_pages(zone); 235 if (is_highmem(zone)) {
235 if (is_highmem(zone)) { 236 highmem_size -=
236 highmem_size -=
237 zone_page_state(zone, NR_FREE_PAGES); 237 zone_page_state(zone, NR_FREE_PAGES);
238 } else { 238 } else {
239 tmp -= zone_page_state(zone, NR_FREE_PAGES); 239 tmp -= zone_page_state(zone, NR_FREE_PAGES);
240 tmp += zone->lowmem_reserve[ZONE_NORMAL]; 240 tmp += zone->lowmem_reserve[ZONE_NORMAL];
241 }
242 } 241 }
242 }
243 243
244 if (highmem_size < 0) 244 if (highmem_size < 0)
245 highmem_size = 0; 245 highmem_size = 0;
diff --git a/kernel/printk.c b/kernel/printk.c
index e3602d0755b0..5052b5497c67 100644
--- a/kernel/printk.c
+++ b/kernel/printk.c
@@ -32,6 +32,7 @@
32#include <linux/security.h> 32#include <linux/security.h>
33#include <linux/bootmem.h> 33#include <linux/bootmem.h>
34#include <linux/syscalls.h> 34#include <linux/syscalls.h>
35#include <linux/kexec.h>
35 36
36#include <asm/uaccess.h> 37#include <asm/uaccess.h>
37 38
@@ -135,6 +136,24 @@ static char *log_buf = __log_buf;
135static int log_buf_len = __LOG_BUF_LEN; 136static int log_buf_len = __LOG_BUF_LEN;
136static unsigned logged_chars; /* Number of chars produced since last read+clear operation */ 137static unsigned logged_chars; /* Number of chars produced since last read+clear operation */
137 138
139#ifdef CONFIG_KEXEC
140/*
141 * This appends the listed symbols to /proc/vmcoreinfo
142 *
143 * /proc/vmcoreinfo is used by various utilities, like crash and makedumpfile, to
144 * obtain access to symbols that are otherwise very difficult to locate. These
145 * symbols are specifically used so that utilities can access and extract the
146 * dmesg log from a vmcore file after a crash.
147 */
148void log_buf_kexec_setup(void)
149{
150 VMCOREINFO_SYMBOL(log_buf);
151 VMCOREINFO_SYMBOL(log_end);
152 VMCOREINFO_SYMBOL(log_buf_len);
153 VMCOREINFO_SYMBOL(logged_chars);
154}
155#endif
156
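VMCOREINFO_SYMBOL() records a symbol's name and address in the vmcoreinfo ELF note, which is how dump tools can find log_buf and friends in a vmcore without debuginfo. It expands to roughly this (paraphrased from <linux/kexec.h>):

	#define VMCOREINFO_SYMBOL(name)					\
		vmcoreinfo_append_str("SYMBOL(%s)=%lx\n", #name,	\
				      (unsigned long)&name)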
138static int __init log_buf_len_setup(char *str) 157static int __init log_buf_len_setup(char *str)
139{ 158{
140 unsigned size = memparse(str, &str); 159 unsigned size = memparse(str, &str);
@@ -1292,8 +1311,11 @@ EXPORT_SYMBOL(printk_ratelimit);
1292bool printk_timed_ratelimit(unsigned long *caller_jiffies, 1311bool printk_timed_ratelimit(unsigned long *caller_jiffies,
1293 unsigned int interval_msecs) 1312 unsigned int interval_msecs)
1294{ 1313{
1295 if (*caller_jiffies == 0 || time_after(jiffies, *caller_jiffies)) { 1314 if (*caller_jiffies == 0
1296 *caller_jiffies = jiffies + msecs_to_jiffies(interval_msecs); 1315 || !time_in_range(jiffies, *caller_jiffies,
1316 *caller_jiffies
1317 + msecs_to_jiffies(interval_msecs))) {
1318 *caller_jiffies = jiffies;
1297 return true; 1319 return true;
1298 } 1320 }
1299 return false; 1321 return false;
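The rewritten condition is wrap-safe: time_in_range() is built from the overflow-aware jiffies comparisons in <linux/jiffies.h>,

	#define time_in_range(a, b, c) \
		(time_after_eq(a, b) && \
		 time_before_eq(a, c))

so the function now fires (and restarts the window at the current jiffies) whenever jiffies has left [*caller_jiffies, *caller_jiffies + interval], which also behaves sanely if *caller_jiffies holds a stale value from long ago.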
diff --git a/kernel/ptrace.c b/kernel/ptrace.c
index c9cf48b21f05..aaad0ec34194 100644
--- a/kernel/ptrace.c
+++ b/kernel/ptrace.c
@@ -60,11 +60,15 @@ static void ptrace_untrace(struct task_struct *child)
60{ 60{
61 spin_lock(&child->sighand->siglock); 61 spin_lock(&child->sighand->siglock);
62 if (task_is_traced(child)) { 62 if (task_is_traced(child)) {
63 if (child->signal->flags & SIGNAL_STOP_STOPPED) { 63 /*
64 * If the group stop is completed or in progress,
65 * this thread was already counted as stopped.
66 */
67 if (child->signal->flags & SIGNAL_STOP_STOPPED ||
68 child->signal->group_stop_count)
64 __set_task_state(child, TASK_STOPPED); 69 __set_task_state(child, TASK_STOPPED);
65 } else { 70 else
66 signal_wake_up(child, 1); 71 signal_wake_up(child, 1);
67 }
68 } 72 }
69 spin_unlock(&child->sighand->siglock); 73 spin_unlock(&child->sighand->siglock);
70} 74}
@@ -235,18 +239,58 @@ out:
235 return retval; 239 return retval;
236} 240}
237 241
238static inline void __ptrace_detach(struct task_struct *child, unsigned int data) 242/*
243 * Called with irqs disabled, returns true if children should reap themselves.
244 */
245static int ignoring_children(struct sighand_struct *sigh)
239{ 246{
240 child->exit_code = data; 247 int ret;
241 /* .. re-parent .. */ 248 spin_lock(&sigh->siglock);
242 __ptrace_unlink(child); 249 ret = (sigh->action[SIGCHLD-1].sa.sa_handler == SIG_IGN) ||
243 /* .. and wake it up. */ 250 (sigh->action[SIGCHLD-1].sa.sa_flags & SA_NOCLDWAIT);
244 if (child->exit_state != EXIT_ZOMBIE) 251 spin_unlock(&sigh->siglock);
245 wake_up_process(child); 252 return ret;
253}
254
255/*
256 * Called with tasklist_lock held for writing.
257 * Unlink a traced task, and clean it up if it was a traced zombie.
258 * Return true if it needs to be reaped with release_task().
259 * (We can't call release_task() here because we already hold tasklist_lock.)
260 *
261 * If it's a zombie, our attachedness prevented normal parent notification
262 * or self-reaping. Do notification now if it would have happened earlier.
263 * If it should reap itself, return true.
264 *
265 * If it's our own child, there is no notification to do.
266 * But if our normal children self-reap, then this child
267 * was prevented by ptrace and we must reap it now.
268 */
269static bool __ptrace_detach(struct task_struct *tracer, struct task_struct *p)
270{
271 __ptrace_unlink(p);
272
273 if (p->exit_state == EXIT_ZOMBIE) {
274 if (!task_detached(p) && thread_group_empty(p)) {
275 if (!same_thread_group(p->real_parent, tracer))
276 do_notify_parent(p, p->exit_signal);
277 else if (ignoring_children(tracer->sighand))
278 p->exit_signal = -1;
279 }
280 if (task_detached(p)) {
281 /* Mark it as in the process of being reaped. */
282 p->exit_state = EXIT_DEAD;
283 return true;
284 }
285 }
286
287 return false;
246} 288}
247 289
248int ptrace_detach(struct task_struct *child, unsigned int data) 290int ptrace_detach(struct task_struct *child, unsigned int data)
249{ 291{
292 bool dead = false;
293
250 if (!valid_signal(data)) 294 if (!valid_signal(data))
251 return -EIO; 295 return -EIO;
252 296
@@ -255,14 +299,45 @@ int ptrace_detach(struct task_struct *child, unsigned int data)
255 clear_tsk_thread_flag(child, TIF_SYSCALL_TRACE); 299 clear_tsk_thread_flag(child, TIF_SYSCALL_TRACE);
256 300
257 write_lock_irq(&tasklist_lock); 301 write_lock_irq(&tasklist_lock);
258 /* protect against de_thread()->release_task() */ 302 /*
259 if (child->ptrace) 303 * This child can be already killed. Make sure de_thread() or
260 __ptrace_detach(child, data); 304 * our sub-thread doing do_wait() didn't do release_task() yet.
305 */
306 if (child->ptrace) {
307 child->exit_code = data;
308 dead = __ptrace_detach(current, child);
309 }
261 write_unlock_irq(&tasklist_lock); 310 write_unlock_irq(&tasklist_lock);
262 311
312 if (unlikely(dead))
313 release_task(child);
314
263 return 0; 315 return 0;
264} 316}
265 317
318/*
319 * Detach all tasks we were using ptrace on.
320 */
321void exit_ptrace(struct task_struct *tracer)
322{
323 struct task_struct *p, *n;
324 LIST_HEAD(ptrace_dead);
325
326 write_lock_irq(&tasklist_lock);
327 list_for_each_entry_safe(p, n, &tracer->ptraced, ptrace_entry) {
328 if (__ptrace_detach(tracer, p))
329 list_add(&p->ptrace_entry, &ptrace_dead);
330 }
331 write_unlock_irq(&tasklist_lock);
332
333 BUG_ON(!list_empty(&tracer->ptraced));
334
335 list_for_each_entry_safe(p, n, &ptrace_dead, ptrace_entry) {
336 list_del_init(&p->ptrace_entry);
337 release_task(p);
338 }
339}
340
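The SIGCHLD rule that ignoring_children() checks has a familiar userspace counterpart: a parent that sets the handler to SIG_IGN (or uses SA_NOCLDWAIT) gets its children auto-reaped, which is exactly the self-reaping that a tracer's attachment suppresses and __ptrace_detach() has to make up for. An illustrative demo, not from the patch:

	#include <errno.h>
	#include <signal.h>
	#include <stdio.h>
	#include <sys/wait.h>
	#include <unistd.h>

	int main(void)
	{
		signal(SIGCHLD, SIG_IGN);	/* opt in to auto-reaping */

		if (fork() == 0)
			_exit(0);		/* child leaves no zombie */

		sleep(1);
		if (wait(NULL) < 0 && errno == ECHILD)
			printf("child was auto-reaped\n");
		return 0;
	}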
266int ptrace_readdata(struct task_struct *tsk, unsigned long src, char __user *dst, int len) 341int ptrace_readdata(struct task_struct *tsk, unsigned long src, char __user *dst, int len)
267{ 342{
268 int copied = 0; 343 int copied = 0;
@@ -612,8 +687,6 @@ SYSCALL_DEFINE4(ptrace, long, request, long, pid, long, addr, long, data)
612 goto out_put_task_struct; 687 goto out_put_task_struct;
613 688
614 ret = arch_ptrace(child, request, addr, data); 689 ret = arch_ptrace(child, request, addr, data);
615 if (ret < 0)
616 goto out_put_task_struct;
617 690
618 out_put_task_struct: 691 out_put_task_struct:
619 put_task_struct(child); 692 put_task_struct(child);
diff --git a/kernel/rcuclassic.c b/kernel/rcuclassic.c
index 654c640a6b9c..0f2b0b311304 100644
--- a/kernel/rcuclassic.c
+++ b/kernel/rcuclassic.c
@@ -65,6 +65,7 @@ static struct rcu_ctrlblk rcu_ctrlblk = {
65 .lock = __SPIN_LOCK_UNLOCKED(&rcu_ctrlblk.lock), 65 .lock = __SPIN_LOCK_UNLOCKED(&rcu_ctrlblk.lock),
66 .cpumask = CPU_BITS_NONE, 66 .cpumask = CPU_BITS_NONE,
67}; 67};
68
68static struct rcu_ctrlblk rcu_bh_ctrlblk = { 69static struct rcu_ctrlblk rcu_bh_ctrlblk = {
69 .cur = -300, 70 .cur = -300,
70 .completed = -300, 71 .completed = -300,
@@ -73,8 +74,26 @@ static struct rcu_ctrlblk rcu_bh_ctrlblk = {
73 .cpumask = CPU_BITS_NONE, 74 .cpumask = CPU_BITS_NONE,
74}; 75};
75 76
76DEFINE_PER_CPU(struct rcu_data, rcu_data) = { 0L }; 77static DEFINE_PER_CPU(struct rcu_data, rcu_data);
77DEFINE_PER_CPU(struct rcu_data, rcu_bh_data) = { 0L }; 78static DEFINE_PER_CPU(struct rcu_data, rcu_bh_data);
79
80/*
81 * Increment the quiescent state counter.
82 * The counter is a bit degenerated: We do not need to know
83 * how many quiescent states passed, just if there was at least
84 * one since the start of the grace period. Thus just a flag.
85 */
86void rcu_qsctr_inc(int cpu)
87{
88 struct rcu_data *rdp = &per_cpu(rcu_data, cpu);
89 rdp->passed_quiesc = 1;
90}
91
92void rcu_bh_qsctr_inc(int cpu)
93{
94 struct rcu_data *rdp = &per_cpu(rcu_bh_data, cpu);
95 rdp->passed_quiesc = 1;
96}
78 97
79static int blimit = 10; 98static int blimit = 10;
80static int qhimark = 10000; 99static int qhimark = 10000;
diff --git a/kernel/rcupdate.c b/kernel/rcupdate.c
index cae8a059cf47..2c7b8457d0d2 100644
--- a/kernel/rcupdate.c
+++ b/kernel/rcupdate.c
@@ -122,6 +122,8 @@ static void rcu_barrier_func(void *type)
122 } 122 }
123} 123}
124 124
125static inline void wait_migrated_callbacks(void);
126
125/* 127/*
126 * Orchestrate the specified type of RCU barrier, waiting for all 128 * Orchestrate the specified type of RCU barrier, waiting for all
127 * RCU callbacks of the specified type to complete. 129 * RCU callbacks of the specified type to complete.
@@ -147,6 +149,7 @@ static void _rcu_barrier(enum rcu_barrier type)
147 complete(&rcu_barrier_completion); 149 complete(&rcu_barrier_completion);
148 wait_for_completion(&rcu_barrier_completion); 150 wait_for_completion(&rcu_barrier_completion);
149 mutex_unlock(&rcu_barrier_mutex); 151 mutex_unlock(&rcu_barrier_mutex);
152 wait_migrated_callbacks();
150} 153}
151 154
152/** 155/**
@@ -176,9 +179,50 @@ void rcu_barrier_sched(void)
176} 179}
177EXPORT_SYMBOL_GPL(rcu_barrier_sched); 180EXPORT_SYMBOL_GPL(rcu_barrier_sched);
178 181
182static atomic_t rcu_migrate_type_count = ATOMIC_INIT(0);
183static struct rcu_head rcu_migrate_head[3];
184static DECLARE_WAIT_QUEUE_HEAD(rcu_migrate_wq);
185
186static void rcu_migrate_callback(struct rcu_head *notused)
187{
188 if (atomic_dec_and_test(&rcu_migrate_type_count))
189 wake_up(&rcu_migrate_wq);
190}
191
192static inline void wait_migrated_callbacks(void)
193{
194 wait_event(rcu_migrate_wq, !atomic_read(&rcu_migrate_type_count));
195}
196
197static int __cpuinit rcu_barrier_cpu_hotplug(struct notifier_block *self,
198 unsigned long action, void *hcpu)
199{
200 if (action == CPU_DYING) {
201 /*
202 * preempt_disable() in on_each_cpu() prevents stop_machine(),
203 * so when "on_each_cpu(rcu_barrier_func, (void *)type, 1);"
204 * returns, all online cpus have queued rcu_barrier_func(),
205 * and the dead cpu (if it exists) queues rcu_migrate_callback()s.
206 *
207 * These callbacks ensure _rcu_barrier() waits for all
208 * RCU callbacks of the specified type to complete.
209 */
210 atomic_set(&rcu_migrate_type_count, 3);
211 call_rcu_bh(rcu_migrate_head, rcu_migrate_callback);
212 call_rcu_sched(rcu_migrate_head + 1, rcu_migrate_callback);
213 call_rcu(rcu_migrate_head + 2, rcu_migrate_callback);
214 } else if (action == CPU_POST_DEAD) {
215 /* rcu_migrate_head is protected by cpu_add_remove_lock */
216 wait_migrated_callbacks();
217 }
218
219 return NOTIFY_OK;
220}
221
179void __init rcu_init(void) 222void __init rcu_init(void)
180{ 223{
181 __rcu_init(); 224 __rcu_init();
225 hotcpu_notifier(rcu_barrier_cpu_hotplug, 0);
182} 226}
183 227
184void rcu_scheduler_starting(void) 228void rcu_scheduler_starting(void)
diff --git a/kernel/rcupreempt.c b/kernel/rcupreempt.c
index 5d59e850fb71..ce97a4df64d3 100644
--- a/kernel/rcupreempt.c
+++ b/kernel/rcupreempt.c
@@ -147,7 +147,51 @@ struct rcu_ctrlblk {
147 wait_queue_head_t sched_wq; /* Place for rcu_sched to sleep. */ 147 wait_queue_head_t sched_wq; /* Place for rcu_sched to sleep. */
148}; 148};
149 149
150struct rcu_dyntick_sched {
151 int dynticks;
152 int dynticks_snap;
153 int sched_qs;
154 int sched_qs_snap;
155 int sched_dynticks_snap;
156};
157
158static DEFINE_PER_CPU_SHARED_ALIGNED(struct rcu_dyntick_sched, rcu_dyntick_sched) = {
159 .dynticks = 1,
160};
161
162void rcu_qsctr_inc(int cpu)
163{
164 struct rcu_dyntick_sched *rdssp = &per_cpu(rcu_dyntick_sched, cpu);
165
166 rdssp->sched_qs++;
167}
168
169#ifdef CONFIG_NO_HZ
170
171void rcu_enter_nohz(void)
172{
173 static DEFINE_RATELIMIT_STATE(rs, 10 * HZ, 1);
174
175 smp_mb(); /* CPUs seeing ++ must see prior RCU read-side crit sects */
176 __get_cpu_var(rcu_dyntick_sched).dynticks++;
177 WARN_ON_RATELIMIT(__get_cpu_var(rcu_dyntick_sched).dynticks & 0x1, &rs);
178}
179
180void rcu_exit_nohz(void)
181{
182 static DEFINE_RATELIMIT_STATE(rs, 10 * HZ, 1);
183
184 __get_cpu_var(rcu_dyntick_sched).dynticks++;
185 smp_mb(); /* CPUs seeing ++ must see later RCU read-side crit sects */
186 WARN_ON_RATELIMIT(!(__get_cpu_var(rcu_dyntick_sched).dynticks & 0x1),
187 &rs);
188}
189
190#endif /* CONFIG_NO_HZ */
191
192
150static DEFINE_PER_CPU(struct rcu_data, rcu_data); 193static DEFINE_PER_CPU(struct rcu_data, rcu_data);
194
151static struct rcu_ctrlblk rcu_ctrlblk = { 195static struct rcu_ctrlblk rcu_ctrlblk = {
152 .fliplock = __SPIN_LOCK_UNLOCKED(rcu_ctrlblk.fliplock), 196 .fliplock = __SPIN_LOCK_UNLOCKED(rcu_ctrlblk.fliplock),
153 .completed = 0, 197 .completed = 0,
@@ -427,10 +471,6 @@ static void __rcu_advance_callbacks(struct rcu_data *rdp)
427 } 471 }
428} 472}
429 473
430DEFINE_PER_CPU_SHARED_ALIGNED(struct rcu_dyntick_sched, rcu_dyntick_sched) = {
431 .dynticks = 1,
432};
433
434#ifdef CONFIG_NO_HZ 474#ifdef CONFIG_NO_HZ
435static DEFINE_PER_CPU(int, rcu_update_flag); 475static DEFINE_PER_CPU(int, rcu_update_flag);
436 476
diff --git a/kernel/rcutorture.c b/kernel/rcutorture.c
index 7c4142a79f0a..9b4a975a4b4a 100644
--- a/kernel/rcutorture.c
+++ b/kernel/rcutorture.c
@@ -126,6 +126,7 @@ static atomic_t n_rcu_torture_mberror;
126static atomic_t n_rcu_torture_error; 126static atomic_t n_rcu_torture_error;
127static long n_rcu_torture_timers = 0; 127static long n_rcu_torture_timers = 0;
128static struct list_head rcu_torture_removed; 128static struct list_head rcu_torture_removed;
129static cpumask_var_t shuffle_tmp_mask;
129 130
130static int stutter_pause_test = 0; 131static int stutter_pause_test = 0;
131 132
@@ -889,10 +890,9 @@ static int rcu_idle_cpu; /* Force all torture tasks off this CPU */
889 */ 890 */
890static void rcu_torture_shuffle_tasks(void) 891static void rcu_torture_shuffle_tasks(void)
891{ 892{
892 cpumask_t tmp_mask;
893 int i; 893 int i;
894 894
895 cpus_setall(tmp_mask); 895 cpumask_setall(shuffle_tmp_mask);
896 get_online_cpus(); 896 get_online_cpus();
897 897
898 /* No point in shuffling if there is only one online CPU (ex: UP) */ 898 /* No point in shuffling if there is only one online CPU (ex: UP) */
@@ -902,29 +902,29 @@ static void rcu_torture_shuffle_tasks(void)
902 } 902 }
903 903
904 if (rcu_idle_cpu != -1) 904 if (rcu_idle_cpu != -1)
905 cpu_clear(rcu_idle_cpu, tmp_mask); 905 cpumask_clear_cpu(rcu_idle_cpu, shuffle_tmp_mask);
906 906
907 set_cpus_allowed_ptr(current, &tmp_mask); 907 set_cpus_allowed_ptr(current, shuffle_tmp_mask);
908 908
909 if (reader_tasks) { 909 if (reader_tasks) {
910 for (i = 0; i < nrealreaders; i++) 910 for (i = 0; i < nrealreaders; i++)
911 if (reader_tasks[i]) 911 if (reader_tasks[i])
912 set_cpus_allowed_ptr(reader_tasks[i], 912 set_cpus_allowed_ptr(reader_tasks[i],
913 &tmp_mask); 913 shuffle_tmp_mask);
914 } 914 }
915 915
916 if (fakewriter_tasks) { 916 if (fakewriter_tasks) {
917 for (i = 0; i < nfakewriters; i++) 917 for (i = 0; i < nfakewriters; i++)
918 if (fakewriter_tasks[i]) 918 if (fakewriter_tasks[i])
919 set_cpus_allowed_ptr(fakewriter_tasks[i], 919 set_cpus_allowed_ptr(fakewriter_tasks[i],
920 &tmp_mask); 920 shuffle_tmp_mask);
921 } 921 }
922 922
923 if (writer_task) 923 if (writer_task)
924 set_cpus_allowed_ptr(writer_task, &tmp_mask); 924 set_cpus_allowed_ptr(writer_task, shuffle_tmp_mask);
925 925
926 if (stats_task) 926 if (stats_task)
927 set_cpus_allowed_ptr(stats_task, &tmp_mask); 927 set_cpus_allowed_ptr(stats_task, shuffle_tmp_mask);
928 928
929 if (rcu_idle_cpu == -1) 929 if (rcu_idle_cpu == -1)
930 rcu_idle_cpu = num_online_cpus() - 1; 930 rcu_idle_cpu = num_online_cpus() - 1;
@@ -1012,6 +1012,7 @@ rcu_torture_cleanup(void)
1012 if (shuffler_task) { 1012 if (shuffler_task) {
1013 VERBOSE_PRINTK_STRING("Stopping rcu_torture_shuffle task"); 1013 VERBOSE_PRINTK_STRING("Stopping rcu_torture_shuffle task");
1014 kthread_stop(shuffler_task); 1014 kthread_stop(shuffler_task);
1015 free_cpumask_var(shuffle_tmp_mask);
1015 } 1016 }
1016 shuffler_task = NULL; 1017 shuffler_task = NULL;
1017 1018
@@ -1190,10 +1191,18 @@ rcu_torture_init(void)
1190 } 1191 }
1191 if (test_no_idle_hz) { 1192 if (test_no_idle_hz) {
1192 rcu_idle_cpu = num_online_cpus() - 1; 1193 rcu_idle_cpu = num_online_cpus() - 1;
1194
1195 if (!alloc_cpumask_var(&shuffle_tmp_mask, GFP_KERNEL)) {
1196 firsterr = -ENOMEM;
1197 VERBOSE_PRINTK_ERRSTRING("Failed to alloc mask");
1198 goto unwind;
1199 }
1200
1193 /* Create the shuffler thread */ 1201 /* Create the shuffler thread */
1194 shuffler_task = kthread_run(rcu_torture_shuffle, NULL, 1202 shuffler_task = kthread_run(rcu_torture_shuffle, NULL,
1195 "rcu_torture_shuffle"); 1203 "rcu_torture_shuffle");
1196 if (IS_ERR(shuffler_task)) { 1204 if (IS_ERR(shuffler_task)) {
1205 free_cpumask_var(shuffle_tmp_mask);
1197 firsterr = PTR_ERR(shuffler_task); 1206 firsterr = PTR_ERR(shuffler_task);
1198 VERBOSE_PRINTK_ERRSTRING("Failed to create shuffler"); 1207 VERBOSE_PRINTK_ERRSTRING("Failed to create shuffler");
1199 shuffler_task = NULL; 1208 shuffler_task = NULL;
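Note: the rcutorture change above is a standard cpumask_var_t conversion — instead of placing a cpumask_t on the stack (NR_CPUS bits, which can be large), allocate the mask once, reuse it, and free it on teardown. With CONFIG_CPUMASK_OFFSTACK the mask lives on the heap; otherwise alloc_cpumask_var()/free_cpumask_var() compile down to no-ops around an embedded array. The shape of the pattern (demo_* names illustrative):

static cpumask_var_t demo_mask;

static int demo_init(void)
{
        if (!alloc_cpumask_var(&demo_mask, GFP_KERNEL))
                return -ENOMEM;                 /* only fails off-stack */
        cpumask_setall(demo_mask);
        cpumask_clear_cpu(0, demo_mask);        /* e.g. keep CPU 0 out */
        return 0;
}

static void demo_exit(void)
{
        free_cpumask_var(demo_mask);
}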
diff --git a/kernel/rcutree.c b/kernel/rcutree.c
index 97ce31579ec0..7f3266922572 100644
--- a/kernel/rcutree.c
+++ b/kernel/rcutree.c
@@ -78,6 +78,26 @@ DEFINE_PER_CPU(struct rcu_data, rcu_data);
78struct rcu_state rcu_bh_state = RCU_STATE_INITIALIZER(rcu_bh_state); 78struct rcu_state rcu_bh_state = RCU_STATE_INITIALIZER(rcu_bh_state);
79DEFINE_PER_CPU(struct rcu_data, rcu_bh_data); 79DEFINE_PER_CPU(struct rcu_data, rcu_bh_data);
80 80
81/*
82 * Increment the quiescent state counter.
83 * The counter is a bit degenerate: we do not need to know
84 * how many quiescent states passed, only whether there was at least
85 * one since the start of the grace period. Thus just a flag.
86 */
87void rcu_qsctr_inc(int cpu)
88{
89 struct rcu_data *rdp = &per_cpu(rcu_data, cpu);
90 rdp->passed_quiesc = 1;
91 rdp->passed_quiesc_completed = rdp->completed;
92}
93
94void rcu_bh_qsctr_inc(int cpu)
95{
96 struct rcu_data *rdp = &per_cpu(rcu_bh_data, cpu);
97 rdp->passed_quiesc = 1;
98 rdp->passed_quiesc_completed = rdp->completed;
99}
100
81#ifdef CONFIG_NO_HZ 101#ifdef CONFIG_NO_HZ
82DEFINE_PER_CPU(struct rcu_dynticks, rcu_dynticks) = { 102DEFINE_PER_CPU(struct rcu_dynticks, rcu_dynticks) = {
83 .dynticks_nesting = 1, 103 .dynticks_nesting = 1,
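Note: the two helpers added above are consumed once per grace period and are invoked from the timer tick. Condensed from rcu_check_callbacks() in this file (simplified sketch; the real idle test also inspects the hardirq nesting count):

void demo_tick(int cpu, int user)
{
        if (user || idle_cpu(cpu)) {
                /* Tick landed in user mode or the idle loop: a
                 * quiescent state for both rcu and rcu_bh. */
                rcu_qsctr_inc(cpu);
                rcu_bh_qsctr_inc(cpu);
        } else if (!in_softirq()) {
                /* No softirq in flight: rcu_bh quiescent state only. */
                rcu_bh_qsctr_inc(cpu);
        }
}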
diff --git a/kernel/rcutree.h b/kernel/rcutree.h
new file mode 100644
index 000000000000..5e872bbf07f5
--- /dev/null
+++ b/kernel/rcutree.h
@@ -0,0 +1,10 @@
1
2/*
3 * RCU implementation internal declarations:
4 */
5extern struct rcu_state rcu_state;
6DECLARE_PER_CPU(struct rcu_data, rcu_data);
7
8extern struct rcu_state rcu_bh_state;
9DECLARE_PER_CPU(struct rcu_data, rcu_bh_data);
10
diff --git a/kernel/rcutree_trace.c b/kernel/rcutree_trace.c
index d6db3e837826..4ee954f6a8d5 100644
--- a/kernel/rcutree_trace.c
+++ b/kernel/rcutree_trace.c
@@ -43,6 +43,8 @@
43#include <linux/debugfs.h> 43#include <linux/debugfs.h>
44#include <linux/seq_file.h> 44#include <linux/seq_file.h>
45 45
46#include "rcutree.h"
47
46static void print_one_rcu_data(struct seq_file *m, struct rcu_data *rdp) 48static void print_one_rcu_data(struct seq_file *m, struct rcu_data *rdp)
47{ 49{
48 if (!rdp->beenonline) 50 if (!rdp->beenonline)
diff --git a/kernel/relay.c b/kernel/relay.c
index edc0ba6d8160..bc188549788f 100644
--- a/kernel/relay.c
+++ b/kernel/relay.c
@@ -748,7 +748,7 @@ size_t relay_switch_subbuf(struct rchan_buf *buf, size_t length)
748 * from the scheduler (trying to re-grab 748 * from the scheduler (trying to re-grab
749 * rq->lock), so defer it. 749 * rq->lock), so defer it.
750 */ 750 */
751 __mod_timer(&buf->timer, jiffies + 1); 751 mod_timer(&buf->timer, jiffies + 1);
752 } 752 }
753 753
754 old = buf->data; 754 old = buf->data;
@@ -795,13 +795,15 @@ void relay_subbufs_consumed(struct rchan *chan,
795 if (!chan) 795 if (!chan)
796 return; 796 return;
797 797
798 if (cpu >= NR_CPUS || !chan->buf[cpu]) 798 if (cpu >= NR_CPUS || !chan->buf[cpu] ||
799 subbufs_consumed > chan->n_subbufs)
799 return; 800 return;
800 801
801 buf = chan->buf[cpu]; 802 buf = chan->buf[cpu];
802 buf->subbufs_consumed += subbufs_consumed; 803 if (subbufs_consumed > buf->subbufs_produced - buf->subbufs_consumed)
803 if (buf->subbufs_consumed > buf->subbufs_produced)
804 buf->subbufs_consumed = buf->subbufs_produced; 804 buf->subbufs_consumed = buf->subbufs_produced;
805 else
806 buf->subbufs_consumed += subbufs_consumed;
805} 807}
806EXPORT_SYMBOL_GPL(relay_subbufs_consumed); 808EXPORT_SYMBOL_GPL(relay_subbufs_consumed);
807 809
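Note: the relay_subbufs_consumed() change reorders a bounds check so it happens before the addition. Previously a caller passing a huge subbufs_consumed could wrap buf->subbufs_consumed (an unsigned counter) past subbufs_produced, defeating the clamp that followed. The shape of the fix in isolation (assumes the invariant produced >= *consumed, which relay maintains):

static void demo_consume(size_t *consumed, size_t produced, size_t n)
{
        if (n > produced - *consumed)   /* compare before adding ... */
                *consumed = produced;   /* ... so the sum cannot wrap */
        else
                *consumed += n;
}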
diff --git a/kernel/sched.c b/kernel/sched.c
index 7299083e69e7..6cc1fd5d5072 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -231,13 +231,20 @@ static void start_rt_bandwidth(struct rt_bandwidth *rt_b)
231 231
232 spin_lock(&rt_b->rt_runtime_lock); 232 spin_lock(&rt_b->rt_runtime_lock);
233 for (;;) { 233 for (;;) {
234 unsigned long delta;
235 ktime_t soft, hard;
236
234 if (hrtimer_active(&rt_b->rt_period_timer)) 237 if (hrtimer_active(&rt_b->rt_period_timer))
235 break; 238 break;
236 239
237 now = hrtimer_cb_get_time(&rt_b->rt_period_timer); 240 now = hrtimer_cb_get_time(&rt_b->rt_period_timer);
238 hrtimer_forward(&rt_b->rt_period_timer, now, rt_b->rt_period); 241 hrtimer_forward(&rt_b->rt_period_timer, now, rt_b->rt_period);
239 hrtimer_start_expires(&rt_b->rt_period_timer, 242
240 HRTIMER_MODE_ABS); 243 soft = hrtimer_get_softexpires(&rt_b->rt_period_timer);
244 hard = hrtimer_get_expires(&rt_b->rt_period_timer);
245 delta = ktime_to_ns(ktime_sub(hard, soft));
246 __hrtimer_start_range_ns(&rt_b->rt_period_timer, soft, delta,
247 HRTIMER_MODE_ABS, 0);
241 } 248 }
242 spin_unlock(&rt_b->rt_runtime_lock); 249 spin_unlock(&rt_b->rt_runtime_lock);
243} 250}
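Note: the re-arm above preserves the timer's soft/hard expiry window instead of collapsing it to a single hard expiry, and the trailing 0 passed to __hrtimer_start_range_ns() is the "wakeup" flag — both converted call sites in this patch (here and in hrtick_start() below) run under rq->lock, and kicking the hrtimer softirq from there could re-enter the scheduler on the same lock. For comparison, plain hrtimer_start() in this era reduces to (sketch):

int hrtimer_start(struct hrtimer *timer, ktime_t tim, const enum hrtimer_mode mode)
{
        /* zero slack; wakeup == 1 is what makes it unsafe under rq->lock */
        return __hrtimer_start_range_ns(timer, tim, 0, mode, 1);
}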
@@ -331,6 +338,13 @@ static DEFINE_PER_CPU(struct rt_rq, init_rt_rq) ____cacheline_aligned_in_smp;
331 */ 338 */
332static DEFINE_SPINLOCK(task_group_lock); 339static DEFINE_SPINLOCK(task_group_lock);
333 340
341#ifdef CONFIG_SMP
342static int root_task_group_empty(void)
343{
344 return list_empty(&root_task_group.children);
345}
346#endif
347
334#ifdef CONFIG_FAIR_GROUP_SCHED 348#ifdef CONFIG_FAIR_GROUP_SCHED
335#ifdef CONFIG_USER_SCHED 349#ifdef CONFIG_USER_SCHED
336# define INIT_TASK_GROUP_LOAD (2*NICE_0_LOAD) 350# define INIT_TASK_GROUP_LOAD (2*NICE_0_LOAD)
@@ -391,6 +405,13 @@ static inline void set_task_rq(struct task_struct *p, unsigned int cpu)
391 405
392#else 406#else
393 407
408#ifdef CONFIG_SMP
409static int root_task_group_empty(void)
410{
411 return 1;
412}
413#endif
414
394static inline void set_task_rq(struct task_struct *p, unsigned int cpu) { } 415static inline void set_task_rq(struct task_struct *p, unsigned int cpu) { }
395static inline struct task_group *task_group(struct task_struct *p) 416static inline struct task_group *task_group(struct task_struct *p)
396{ 417{
@@ -467,11 +488,17 @@ struct rt_rq {
467 struct rt_prio_array active; 488 struct rt_prio_array active;
468 unsigned long rt_nr_running; 489 unsigned long rt_nr_running;
469#if defined CONFIG_SMP || defined CONFIG_RT_GROUP_SCHED 490#if defined CONFIG_SMP || defined CONFIG_RT_GROUP_SCHED
470 int highest_prio; /* highest queued rt task prio */ 491 struct {
492 int curr; /* highest queued rt task prio */
493#ifdef CONFIG_SMP
494 int next; /* next highest */
495#endif
496 } highest_prio;
471#endif 497#endif
472#ifdef CONFIG_SMP 498#ifdef CONFIG_SMP
473 unsigned long rt_nr_migratory; 499 unsigned long rt_nr_migratory;
474 int overloaded; 500 int overloaded;
501 struct plist_head pushable_tasks;
475#endif 502#endif
476 int rt_throttled; 503 int rt_throttled;
477 u64 rt_time; 504 u64 rt_time;
@@ -549,7 +576,6 @@ struct rq {
549 unsigned long nr_running; 576 unsigned long nr_running;
550 #define CPU_LOAD_IDX_MAX 5 577 #define CPU_LOAD_IDX_MAX 5
551 unsigned long cpu_load[CPU_LOAD_IDX_MAX]; 578 unsigned long cpu_load[CPU_LOAD_IDX_MAX];
552 unsigned char idle_at_tick;
553#ifdef CONFIG_NO_HZ 579#ifdef CONFIG_NO_HZ
554 unsigned long last_tick_seen; 580 unsigned long last_tick_seen;
555 unsigned char in_nohz_recently; 581 unsigned char in_nohz_recently;
@@ -590,6 +616,7 @@ struct rq {
590 struct root_domain *rd; 616 struct root_domain *rd;
591 struct sched_domain *sd; 617 struct sched_domain *sd;
592 618
619 unsigned char idle_at_tick;
593 /* For active balancing */ 620 /* For active balancing */
594 int active_balance; 621 int active_balance;
595 int push_cpu; 622 int push_cpu;
@@ -618,9 +645,6 @@ struct rq {
618 /* could above be rq->cfs_rq.exec_clock + rq->rt_rq.rt_runtime ? */ 645 /* could above be rq->cfs_rq.exec_clock + rq->rt_rq.rt_runtime ? */
619 646
620 /* sys_sched_yield() stats */ 647 /* sys_sched_yield() stats */
621 unsigned int yld_exp_empty;
622 unsigned int yld_act_empty;
623 unsigned int yld_both_empty;
624 unsigned int yld_count; 648 unsigned int yld_count;
625 649
626 /* schedule() stats */ 650 /* schedule() stats */
@@ -1093,7 +1117,7 @@ static void hrtick_start(struct rq *rq, u64 delay)
1093 if (rq == this_rq()) { 1117 if (rq == this_rq()) {
1094 hrtimer_restart(timer); 1118 hrtimer_restart(timer);
1095 } else if (!rq->hrtick_csd_pending) { 1119 } else if (!rq->hrtick_csd_pending) {
1096 __smp_call_function_single(cpu_of(rq), &rq->hrtick_csd); 1120 __smp_call_function_single(cpu_of(rq), &rq->hrtick_csd, 0);
1097 rq->hrtick_csd_pending = 1; 1121 rq->hrtick_csd_pending = 1;
1098 } 1122 }
1099} 1123}
@@ -1129,7 +1153,8 @@ static __init void init_hrtick(void)
1129 */ 1153 */
1130static void hrtick_start(struct rq *rq, u64 delay) 1154static void hrtick_start(struct rq *rq, u64 delay)
1131{ 1155{
1132 hrtimer_start(&rq->hrtick_timer, ns_to_ktime(delay), HRTIMER_MODE_REL); 1156 __hrtimer_start_range_ns(&rq->hrtick_timer, ns_to_ktime(delay), 0,
1157 HRTIMER_MODE_REL, 0);
1133} 1158}
1134 1159
1135static inline void init_hrtick(void) 1160static inline void init_hrtick(void)
@@ -1183,10 +1208,10 @@ static void resched_task(struct task_struct *p)
1183 1208
1184 assert_spin_locked(&task_rq(p)->lock); 1209 assert_spin_locked(&task_rq(p)->lock);
1185 1210
1186 if (unlikely(test_tsk_thread_flag(p, TIF_NEED_RESCHED))) 1211 if (test_tsk_need_resched(p))
1187 return; 1212 return;
1188 1213
1189 set_tsk_thread_flag(p, TIF_NEED_RESCHED); 1214 set_tsk_need_resched(p);
1190 1215
1191 cpu = task_cpu(p); 1216 cpu = task_cpu(p);
1192 if (cpu == smp_processor_id()) 1217 if (cpu == smp_processor_id())
@@ -1242,7 +1267,7 @@ void wake_up_idle_cpu(int cpu)
1242 * lockless. The worst case is that the other CPU runs the 1267 * lockless. The worst case is that the other CPU runs the
1243 * idle task through an additional NOOP schedule() 1268 * idle task through an additional NOOP schedule()
1244 */ 1269 */
1245 set_tsk_thread_flag(rq->idle, TIF_NEED_RESCHED); 1270 set_tsk_need_resched(rq->idle);
1246 1271
1247 /* NEED_RESCHED must be visible before we test polling */ 1272 /* NEED_RESCHED must be visible before we test polling */
1248 smp_mb(); 1273 smp_mb();
@@ -1610,21 +1635,42 @@ static inline void update_shares_locked(struct rq *rq, struct sched_domain *sd)
1610 1635
1611#endif 1636#endif
1612 1637
1638#ifdef CONFIG_PREEMPT
1639
1613/* 1640/*
1614 * double_lock_balance - lock the busiest runqueue, this_rq is locked already. 1641 * fair double_lock_balance: Safely acquires both rq->locks in a fair
1642 * way at the expense of forcing extra atomic operations in all
1643 * invocations. This ensures that the double_lock is acquired using the
1644 * same underlying policy as the spinlock_t on this architecture, which
1645 * reduces latency compared to the unfair variant below. However, it
1646 * also adds more overhead and therefore may reduce throughput.
1615 */ 1647 */
1616static int double_lock_balance(struct rq *this_rq, struct rq *busiest) 1648static inline int _double_lock_balance(struct rq *this_rq, struct rq *busiest)
1649 __releases(this_rq->lock)
1650 __acquires(busiest->lock)
1651 __acquires(this_rq->lock)
1652{
1653 spin_unlock(&this_rq->lock);
1654 double_rq_lock(this_rq, busiest);
1655
1656 return 1;
1657}
1658
1659#else
1660/*
1661 * Unfair double_lock_balance: Optimizes throughput at the expense of
1662 * latency by eliminating extra atomic operations when the locks are
1663 * already in proper order on entry. This favors lower cpu-ids and will
1664 * grant the double lock to lower cpus over higher ids under contention,
1665 * regardless of entry order into the function.
1666 */
1667static int _double_lock_balance(struct rq *this_rq, struct rq *busiest)
1617 __releases(this_rq->lock) 1668 __releases(this_rq->lock)
1618 __acquires(busiest->lock) 1669 __acquires(busiest->lock)
1619 __acquires(this_rq->lock) 1670 __acquires(this_rq->lock)
1620{ 1671{
1621 int ret = 0; 1672 int ret = 0;
1622 1673
1623 if (unlikely(!irqs_disabled())) {
1624 /* printk() doesn't work good under rq->lock */
1625 spin_unlock(&this_rq->lock);
1626 BUG_ON(1);
1627 }
1628 if (unlikely(!spin_trylock(&busiest->lock))) { 1674 if (unlikely(!spin_trylock(&busiest->lock))) {
1629 if (busiest < this_rq) { 1675 if (busiest < this_rq) {
1630 spin_unlock(&this_rq->lock); 1676 spin_unlock(&this_rq->lock);
@@ -1637,6 +1683,22 @@ static int double_lock_balance(struct rq *this_rq, struct rq *busiest)
1637 return ret; 1683 return ret;
1638} 1684}
1639 1685
1686#endif /* CONFIG_PREEMPT */
1687
1688/*
1689 * double_lock_balance - lock the busiest runqueue, this_rq is locked already.
1690 */
1691static int double_lock_balance(struct rq *this_rq, struct rq *busiest)
1692{
1693 if (unlikely(!irqs_disabled())) {
1694 /* printk() doesn't work well under rq->lock */
1695 spin_unlock(&this_rq->lock);
1696 BUG_ON(1);
1697 }
1698
1699 return _double_lock_balance(this_rq, busiest);
1700}
1701
1640static inline void double_unlock_balance(struct rq *this_rq, struct rq *busiest) 1702static inline void double_unlock_balance(struct rq *this_rq, struct rq *busiest)
1641 __releases(busiest->lock) 1703 __releases(busiest->lock)
1642{ 1704{
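Note: the unfair variant above avoids the classic A/B–B/A deadlock by imposing a global lock order — when the trylock fails and the locks are "out of order", it drops this_rq->lock and re-acquires both with the lower-addressed runqueue first. Stripped to the idiom (illustrative; the scheduler's real version also reports whether this_rq->lock was dropped):

static void demo_double_lock(spinlock_t *a, spinlock_t *b)
{
        if (a < b) {                    /* lower address first */
                spin_lock(a);
                spin_lock_nested(b, SINGLE_DEPTH_NESTING);
        } else {
                spin_lock(b);
                spin_lock_nested(a, SINGLE_DEPTH_NESTING);
        }
}

The fair CONFIG_PREEMPT variant instead always drops this_rq->lock and takes both via double_rq_lock(), trading extra atomic operations for bounded latency under contention.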
@@ -1705,6 +1767,9 @@ static void update_avg(u64 *avg, u64 sample)
1705 1767
1706static void enqueue_task(struct rq *rq, struct task_struct *p, int wakeup) 1768static void enqueue_task(struct rq *rq, struct task_struct *p, int wakeup)
1707{ 1769{
1770 if (wakeup)
1771 p->se.start_runtime = p->se.sum_exec_runtime;
1772
1708 sched_info_queued(p); 1773 sched_info_queued(p);
1709 p->sched_class->enqueue_task(rq, p, wakeup); 1774 p->sched_class->enqueue_task(rq, p, wakeup);
1710 p->se.on_rq = 1; 1775 p->se.on_rq = 1;
@@ -1712,10 +1777,15 @@ static void enqueue_task(struct rq *rq, struct task_struct *p, int wakeup)
1712 1777
1713static void dequeue_task(struct rq *rq, struct task_struct *p, int sleep) 1778static void dequeue_task(struct rq *rq, struct task_struct *p, int sleep)
1714{ 1779{
1715 if (sleep && p->se.last_wakeup) { 1780 if (sleep) {
1716 update_avg(&p->se.avg_overlap, 1781 if (p->se.last_wakeup) {
1717 p->se.sum_exec_runtime - p->se.last_wakeup); 1782 update_avg(&p->se.avg_overlap,
1718 p->se.last_wakeup = 0; 1783 p->se.sum_exec_runtime - p->se.last_wakeup);
1784 p->se.last_wakeup = 0;
1785 } else {
1786 update_avg(&p->se.avg_wakeup,
1787 sysctl_sched_wakeup_granularity);
1788 }
1719 } 1789 }
1720 1790
1721 sched_info_dequeued(p); 1791 sched_info_dequeued(p);
@@ -2017,7 +2087,7 @@ unsigned long wait_task_inactive(struct task_struct *p, long match_state)
2017 * it must be off the runqueue _entirely_, and not 2087 * it must be off the runqueue _entirely_, and not
2018 * preempted! 2088 * preempted!
2019 * 2089 *
2020 * So if it wa still runnable (but just not actively 2090 * So if it was still runnable (but just not actively
2021 * running right now), it's preempted, and we should 2091 * running right now), it's preempted, and we should
2022 * yield - it could be a while. 2092 * yield - it could be a while.
2023 */ 2093 */
@@ -2267,7 +2337,7 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state, int sync)
2267 sync = 0; 2337 sync = 0;
2268 2338
2269#ifdef CONFIG_SMP 2339#ifdef CONFIG_SMP
2270 if (sched_feat(LB_WAKEUP_UPDATE)) { 2340 if (sched_feat(LB_WAKEUP_UPDATE) && !root_task_group_empty()) {
2271 struct sched_domain *sd; 2341 struct sched_domain *sd;
2272 2342
2273 this_cpu = raw_smp_processor_id(); 2343 this_cpu = raw_smp_processor_id();
@@ -2345,6 +2415,22 @@ out_activate:
2345 activate_task(rq, p, 1); 2415 activate_task(rq, p, 1);
2346 success = 1; 2416 success = 1;
2347 2417
2418 /*
2419 * Only attribute actual wakeups done by this task.
2420 */
2421 if (!in_interrupt()) {
2422 struct sched_entity *se = &current->se;
2423 u64 sample = se->sum_exec_runtime;
2424
2425 if (se->last_wakeup)
2426 sample -= se->last_wakeup;
2427 else
2428 sample -= se->start_runtime;
2429 update_avg(&se->avg_wakeup, sample);
2430
2431 se->last_wakeup = se->sum_exec_runtime;
2432 }
2433
2348out_running: 2434out_running:
2349 trace_sched_wakeup(rq, p, success); 2435 trace_sched_wakeup(rq, p, success);
2350 check_preempt_curr(rq, p, sync); 2436 check_preempt_curr(rq, p, sync);
@@ -2355,8 +2441,6 @@ out_running:
2355 p->sched_class->task_wake_up(rq, p); 2441 p->sched_class->task_wake_up(rq, p);
2356#endif 2442#endif
2357out: 2443out:
2358 current->se.last_wakeup = current->se.sum_exec_runtime;
2359
2360 task_rq_unlock(rq, &flags); 2444 task_rq_unlock(rq, &flags);
2361 2445
2362 return success; 2446 return success;
@@ -2386,6 +2470,8 @@ static void __sched_fork(struct task_struct *p)
2386 p->se.prev_sum_exec_runtime = 0; 2470 p->se.prev_sum_exec_runtime = 0;
2387 p->se.last_wakeup = 0; 2471 p->se.last_wakeup = 0;
2388 p->se.avg_overlap = 0; 2472 p->se.avg_overlap = 0;
2473 p->se.start_runtime = 0;
2474 p->se.avg_wakeup = sysctl_sched_wakeup_granularity;
2389 2475
2390#ifdef CONFIG_SCHEDSTATS 2476#ifdef CONFIG_SCHEDSTATS
2391 p->se.wait_start = 0; 2477 p->se.wait_start = 0;
@@ -2448,6 +2534,8 @@ void sched_fork(struct task_struct *p, int clone_flags)
2448 /* Want to start with kernel preemption disabled. */ 2534 /* Want to start with kernel preemption disabled. */
2449 task_thread_info(p)->preempt_count = 1; 2535 task_thread_info(p)->preempt_count = 1;
2450#endif 2536#endif
2537 plist_node_init(&p->pushable_tasks, MAX_PRIO);
2538
2451 put_cpu(); 2539 put_cpu();
2452} 2540}
2453 2541
@@ -2491,7 +2579,7 @@ void wake_up_new_task(struct task_struct *p, unsigned long clone_flags)
2491#ifdef CONFIG_PREEMPT_NOTIFIERS 2579#ifdef CONFIG_PREEMPT_NOTIFIERS
2492 2580
2493/** 2581/**
2494 * preempt_notifier_register - tell me when current is being being preempted & rescheduled 2582 * preempt_notifier_register - tell me when current is being preempted & rescheduled
2495 * @notifier: notifier struct to register 2583 * @notifier: notifier struct to register
2496 */ 2584 */
2497void preempt_notifier_register(struct preempt_notifier *notifier) 2585void preempt_notifier_register(struct preempt_notifier *notifier)
@@ -2588,6 +2676,12 @@ static void finish_task_switch(struct rq *rq, struct task_struct *prev)
2588{ 2676{
2589 struct mm_struct *mm = rq->prev_mm; 2677 struct mm_struct *mm = rq->prev_mm;
2590 long prev_state; 2678 long prev_state;
2679#ifdef CONFIG_SMP
2680 int post_schedule = 0;
2681
2682 if (current->sched_class->needs_post_schedule)
2683 post_schedule = current->sched_class->needs_post_schedule(rq);
2684#endif
2591 2685
2592 rq->prev_mm = NULL; 2686 rq->prev_mm = NULL;
2593 2687
@@ -2606,7 +2700,7 @@ static void finish_task_switch(struct rq *rq, struct task_struct *prev)
2606 finish_arch_switch(prev); 2700 finish_arch_switch(prev);
2607 finish_lock_switch(rq, prev); 2701 finish_lock_switch(rq, prev);
2608#ifdef CONFIG_SMP 2702#ifdef CONFIG_SMP
2609 if (current->sched_class->post_schedule) 2703 if (post_schedule)
2610 current->sched_class->post_schedule(rq); 2704 current->sched_class->post_schedule(rq);
2611#endif 2705#endif
2612 2706
@@ -2913,6 +3007,7 @@ int can_migrate_task(struct task_struct *p, struct rq *rq, int this_cpu,
2913 struct sched_domain *sd, enum cpu_idle_type idle, 3007 struct sched_domain *sd, enum cpu_idle_type idle,
2914 int *all_pinned) 3008 int *all_pinned)
2915{ 3009{
3010 int tsk_cache_hot = 0;
2916 /* 3011 /*
2917 * We do not migrate tasks that are: 3012 * We do not migrate tasks that are:
2918 * 1) running (obviously), or 3013 * 1) running (obviously), or
@@ -2936,10 +3031,11 @@ int can_migrate_task(struct task_struct *p, struct rq *rq, int this_cpu,
2936 * 2) too many balance attempts have failed. 3031 * 2) too many balance attempts have failed.
2937 */ 3032 */
2938 3033
2939 if (!task_hot(p, rq->clock, sd) || 3034 tsk_cache_hot = task_hot(p, rq->clock, sd);
2940 sd->nr_balance_failed > sd->cache_nice_tries) { 3035 if (!tsk_cache_hot ||
3036 sd->nr_balance_failed > sd->cache_nice_tries) {
2941#ifdef CONFIG_SCHEDSTATS 3037#ifdef CONFIG_SCHEDSTATS
2942 if (task_hot(p, rq->clock, sd)) { 3038 if (tsk_cache_hot) {
2943 schedstat_inc(sd, lb_hot_gained[idle]); 3039 schedstat_inc(sd, lb_hot_gained[idle]);
2944 schedstat_inc(p, se.nr_forced_migrations); 3040 schedstat_inc(p, se.nr_forced_migrations);
2945 } 3041 }
@@ -2947,7 +3043,7 @@ int can_migrate_task(struct task_struct *p, struct rq *rq, int this_cpu,
2947 return 1; 3043 return 1;
2948 } 3044 }
2949 3045
2950 if (task_hot(p, rq->clock, sd)) { 3046 if (tsk_cache_hot) {
2951 schedstat_inc(p, se.nr_failed_migrations_hot); 3047 schedstat_inc(p, se.nr_failed_migrations_hot);
2952 return 0; 3048 return 0;
2953 } 3049 }
@@ -2987,6 +3083,16 @@ next:
2987 pulled++; 3083 pulled++;
2988 rem_load_move -= p->se.load.weight; 3084 rem_load_move -= p->se.load.weight;
2989 3085
3086#ifdef CONFIG_PREEMPT
3087 /*
3088 * NEWIDLE balancing is a source of latency, so preemptible kernels
3089 * will stop after the first task is pulled to minimize the critical
3090 * section.
3091 */
3092 if (idle == CPU_NEWLY_IDLE)
3093 goto out;
3094#endif
3095
2990 /* 3096 /*
2991 * We only want to steal up to the prescribed amount of weighted load. 3097 * We only want to steal up to the prescribed amount of weighted load.
2992 */ 3098 */
@@ -3033,9 +3139,15 @@ static int move_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest,
3033 sd, idle, all_pinned, &this_best_prio); 3139 sd, idle, all_pinned, &this_best_prio);
3034 class = class->next; 3140 class = class->next;
3035 3141
3142#ifdef CONFIG_PREEMPT
3143 /*
3144 * NEWIDLE balancing is a source of latency, so preemptible
3145 * kernels will stop after the first task is pulled to minimize
3146 * the critical section.
3147 */
3036 if (idle == CPU_NEWLY_IDLE && this_rq->nr_running) 3148 if (idle == CPU_NEWLY_IDLE && this_rq->nr_running)
3037 break; 3149 break;
3038 3150#endif
3039 } while (class && max_load_move > total_load_moved); 3151 } while (class && max_load_move > total_load_moved);
3040 3152
3041 return total_load_moved > 0; 3153 return total_load_moved > 0;
@@ -3085,246 +3197,480 @@ static int move_one_task(struct rq *this_rq, int this_cpu, struct rq *busiest,
3085 3197
3086 return 0; 3198 return 0;
3087} 3199}
3088 3200/********** Helpers for find_busiest_group ************************/
3089/* 3201/*
3090 * find_busiest_group finds and returns the busiest CPU group within the 3202 * sd_lb_stats - Structure to store the statistics of a sched_domain
3091 * domain. It calculates and returns the amount of weighted load which 3203 * during load balancing.
3092 * should be moved to restore balance via the imbalance parameter.
3093 */ 3204 */
3094static struct sched_group * 3205struct sd_lb_stats {
3095find_busiest_group(struct sched_domain *sd, int this_cpu, 3206 struct sched_group *busiest; /* Busiest group in this sd */
3096 unsigned long *imbalance, enum cpu_idle_type idle, 3207 struct sched_group *this; /* Local group in this sd */
3097 int *sd_idle, const struct cpumask *cpus, int *balance) 3208 unsigned long total_load; /* Total load of all groups in sd */
3098{ 3209 unsigned long total_pwr; /* Total power of all groups in sd */
3099 struct sched_group *busiest = NULL, *this = NULL, *group = sd->groups; 3210 unsigned long avg_load; /* Average load across all groups in sd */
3100 unsigned long max_load, avg_load, total_load, this_load, total_pwr; 3211
3101 unsigned long max_pull; 3212 /** Statistics of this group */
3102 unsigned long busiest_load_per_task, busiest_nr_running; 3213 unsigned long this_load;
3103 unsigned long this_load_per_task, this_nr_running; 3214 unsigned long this_load_per_task;
3104 int load_idx, group_imb = 0; 3215 unsigned long this_nr_running;
3216
3217 /* Statistics of the busiest group */
3218 unsigned long max_load;
3219 unsigned long busiest_load_per_task;
3220 unsigned long busiest_nr_running;
3221
3222 int group_imb; /* Is there an imbalance in this sd? */
3105#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT) 3223#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
3106 int power_savings_balance = 1; 3224 int power_savings_balance; /* Is powersave balance needed for this sd */
3107 unsigned long leader_nr_running = 0, min_load_per_task = 0; 3225 struct sched_group *group_min; /* Least loaded group in sd */
3108 unsigned long min_nr_running = ULONG_MAX; 3226 struct sched_group *group_leader; /* Group which relieves group_min */
3109 struct sched_group *group_min = NULL, *group_leader = NULL; 3227 unsigned long min_load_per_task; /* load_per_task in group_min */
3228 unsigned long leader_nr_running; /* Nr running of group_leader */
3229 unsigned long min_nr_running; /* Nr running of group_min */
3110#endif 3230#endif
3231};
3232
3233/*
3234 * sg_lb_stats - stats of a sched_group required for load_balancing
3235 */
3236struct sg_lb_stats {
3237 unsigned long avg_load; /*Avg load across the CPUs of the group */
3238 unsigned long group_load; /* Total load over the CPUs of the group */
3239 unsigned long sum_nr_running; /* Nr tasks running in the group */
3240 unsigned long sum_weighted_load; /* Weighted load of group's tasks */
3241 unsigned long group_capacity;
3242 int group_imb; /* Is there an imbalance in the group ? */
3243};
3244
3245/**
3246 * group_first_cpu - Returns the first cpu in the cpumask of a sched_group.
3247 * @group: The group whose first cpu is to be returned.
3248 */
3249static inline unsigned int group_first_cpu(struct sched_group *group)
3250{
3251 return cpumask_first(sched_group_cpus(group));
3252}
3111 3253
3112 max_load = this_load = total_load = total_pwr = 0; 3254/**
3113 busiest_load_per_task = busiest_nr_running = 0; 3255 * get_sd_load_idx - Obtain the load index for a given sched domain.
3114 this_load_per_task = this_nr_running = 0; 3256 * @sd: The sched_domain whose load_idx is to be obtained.
3257 * @idle: The idle status of the CPU for whose sd load_idx is obtained.
3258 */
3259static inline int get_sd_load_idx(struct sched_domain *sd,
3260 enum cpu_idle_type idle)
3261{
3262 int load_idx;
3115 3263
3116 if (idle == CPU_NOT_IDLE) 3264 switch (idle) {
3265 case CPU_NOT_IDLE:
3117 load_idx = sd->busy_idx; 3266 load_idx = sd->busy_idx;
3118 else if (idle == CPU_NEWLY_IDLE) 3267 break;
3268
3269 case CPU_NEWLY_IDLE:
3119 load_idx = sd->newidle_idx; 3270 load_idx = sd->newidle_idx;
3120 else 3271 break;
3272 default:
3121 load_idx = sd->idle_idx; 3273 load_idx = sd->idle_idx;
3274 break;
3275 }
3122 3276
3123 do { 3277 return load_idx;
3124 unsigned long load, group_capacity, max_cpu_load, min_cpu_load; 3278}
3125 int local_group;
3126 int i;
3127 int __group_imb = 0;
3128 unsigned int balance_cpu = -1, first_idle_cpu = 0;
3129 unsigned long sum_nr_running, sum_weighted_load;
3130 unsigned long sum_avg_load_per_task;
3131 unsigned long avg_load_per_task;
3132 3279
3133 local_group = cpumask_test_cpu(this_cpu,
3134 sched_group_cpus(group));
3135 3280
3136 if (local_group) 3281#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
3137 balance_cpu = cpumask_first(sched_group_cpus(group)); 3282/**
3283 * init_sd_power_savings_stats - Initialize power savings statistics for
3284 * the given sched_domain, during load balancing.
3285 *
3286 * @sd: Sched domain whose power-savings statistics are to be initialized.
3287 * @sds: Variable containing the statistics for sd.
3288 * @idle: Idle status of the CPU at which we're performing load-balancing.
3289 */
3290static inline void init_sd_power_savings_stats(struct sched_domain *sd,
3291 struct sd_lb_stats *sds, enum cpu_idle_type idle)
3292{
3293 /*
3294 * Busy processors will not participate in power savings
3295 * balance.
3296 */
3297 if (idle == CPU_NOT_IDLE || !(sd->flags & SD_POWERSAVINGS_BALANCE))
3298 sds->power_savings_balance = 0;
3299 else {
3300 sds->power_savings_balance = 1;
3301 sds->min_nr_running = ULONG_MAX;
3302 sds->leader_nr_running = 0;
3303 }
3304}
3305
3306/**
3307 * update_sd_power_savings_stats - Update the power saving stats for a
3308 * sched_domain while performing load balancing.
3309 *
3310 * @group: sched_group belonging to the sched_domain under consideration.
3311 * @sds: Variable containing the statistics of the sched_domain
3312 * @local_group: Does group contain the CPU for which we're performing
3313 * load balancing ?
3314 * @sgs: Variable containing the statistics of the group.
3315 */
3316static inline void update_sd_power_savings_stats(struct sched_group *group,
3317 struct sd_lb_stats *sds, int local_group, struct sg_lb_stats *sgs)
3318{
3138 3319
3139 /* Tally up the load of all CPUs in the group */ 3320 if (!sds->power_savings_balance)
3140 sum_weighted_load = sum_nr_running = avg_load = 0; 3321 return;
3141 sum_avg_load_per_task = avg_load_per_task = 0;
3142 3322
3143 max_cpu_load = 0; 3323 /*
3144 min_cpu_load = ~0UL; 3324 * If the local group is idle or completely loaded
3325 * no need to do power savings balance at this domain
3326 */
3327 if (local_group && (sds->this_nr_running >= sgs->group_capacity ||
3328 !sds->this_nr_running))
3329 sds->power_savings_balance = 0;
3145 3330
3146 for_each_cpu_and(i, sched_group_cpus(group), cpus) { 3331 /*
3147 struct rq *rq = cpu_rq(i); 3332 * If a group is already running at full capacity or idle,
3333 * don't include that group in power savings calculations
3334 */
3335 if (!sds->power_savings_balance ||
3336 sgs->sum_nr_running >= sgs->group_capacity ||
3337 !sgs->sum_nr_running)
3338 return;
3148 3339
3149 if (*sd_idle && rq->nr_running) 3340 /*
3150 *sd_idle = 0; 3341 * Calculate the group which has the least non-idle load.
3342 * This is the group from where we need to pick up the load
3343 * for saving power
3344 */
3345 if ((sgs->sum_nr_running < sds->min_nr_running) ||
3346 (sgs->sum_nr_running == sds->min_nr_running &&
3347 group_first_cpu(group) > group_first_cpu(sds->group_min))) {
3348 sds->group_min = group;
3349 sds->min_nr_running = sgs->sum_nr_running;
3350 sds->min_load_per_task = sgs->sum_weighted_load /
3351 sgs->sum_nr_running;
3352 }
3151 3353
3152 /* Bias balancing toward cpus of our domain */ 3354 /*
3153 if (local_group) { 3355 * Calculate the group which is nearly at its
3154 if (idle_cpu(i) && !first_idle_cpu) { 3356 * capacity but still has some space to pick up some load
3155 first_idle_cpu = 1; 3357 * from other group and save more power
3156 balance_cpu = i; 3358 */
3157 } 3359 if (sgs->sum_nr_running > sgs->group_capacity - 1)
3360 return;
3158 3361
3159 load = target_load(i, load_idx); 3362 if (sgs->sum_nr_running > sds->leader_nr_running ||
3160 } else { 3363 (sgs->sum_nr_running == sds->leader_nr_running &&
3161 load = source_load(i, load_idx); 3364 group_first_cpu(group) < group_first_cpu(sds->group_leader))) {
3162 if (load > max_cpu_load) 3365 sds->group_leader = group;
3163 max_cpu_load = load; 3366 sds->leader_nr_running = sgs->sum_nr_running;
3164 if (min_cpu_load > load) 3367 }
3165 min_cpu_load = load; 3368}
3166 }
3167 3369
3168 avg_load += load; 3370/**
3169 sum_nr_running += rq->nr_running; 3371 * check_power_save_busiest_group - see if there is potential for some power-savings balance
3170 sum_weighted_load += weighted_cpuload(i); 3372 * @sds: Variable containing the statistics of the sched_domain
3373 * under consideration.
3374 * @this_cpu: Cpu at which we're currently performing load-balancing.
3375 * @imbalance: Variable to store the imbalance.
3376 *
3377 * Description:
3378 * Check if we have potential to perform some power-savings balance.
3379 * If yes, set the busiest group to be the least loaded group in the
3380 * sched_domain, so that its CPUs can be put to idle.
3381 *
3382 * Returns 1 if there is potential to perform power-savings balance.
3383 * Else returns 0.
3384 */
3385static inline int check_power_save_busiest_group(struct sd_lb_stats *sds,
3386 int this_cpu, unsigned long *imbalance)
3387{
3388 if (!sds->power_savings_balance)
3389 return 0;
3171 3390
3172 sum_avg_load_per_task += cpu_avg_load_per_task(i); 3391 if (sds->this != sds->group_leader ||
3173 } 3392 sds->group_leader == sds->group_min)
3393 return 0;
3174 3394
3175 /* 3395 *imbalance = sds->min_load_per_task;
3176 * First idle cpu or the first cpu(busiest) in this sched group 3396 sds->busiest = sds->group_min;
3177 * is eligible for doing load balancing at this and above
3178 * domains. In the newly idle case, we will allow all the cpu's
3179 * to do the newly idle load balance.
3180 */
3181 if (idle != CPU_NEWLY_IDLE && local_group &&
3182 balance_cpu != this_cpu && balance) {
3183 *balance = 0;
3184 goto ret;
3185 }
3186 3397
3187 total_load += avg_load; 3398 if (sched_mc_power_savings >= POWERSAVINGS_BALANCE_WAKEUP) {
3188 total_pwr += group->__cpu_power; 3399 cpu_rq(this_cpu)->rd->sched_mc_preferred_wakeup_cpu =
3400 group_first_cpu(sds->group_leader);
3401 }
3189 3402
3190 /* Adjust by relative CPU power of the group */ 3403 return 1;
3191 avg_load = sg_div_cpu_power(group,
3192 avg_load * SCHED_LOAD_SCALE);
3193 3404
3405}
3406#else /* CONFIG_SCHED_MC || CONFIG_SCHED_SMT */
3407static inline void init_sd_power_savings_stats(struct sched_domain *sd,
3408 struct sd_lb_stats *sds, enum cpu_idle_type idle)
3409{
3410 return;
3411}
3194 3412
3195 /* 3413static inline void update_sd_power_savings_stats(struct sched_group *group,
3196 * Consider the group unbalanced when the imbalance is larger 3414 struct sd_lb_stats *sds, int local_group, struct sg_lb_stats *sgs)
3197 * than the average weight of two tasks. 3415{
3198 * 3416 return;
3199 * APZ: with cgroup the avg task weight can vary wildly and 3417}
3200 * might not be a suitable number - should we keep a 3418
3201 * normalized nr_running number somewhere that negates 3419static inline int check_power_save_busiest_group(struct sd_lb_stats *sds,
3202 * the hierarchy? 3420 int this_cpu, unsigned long *imbalance)
3203 */ 3421{
3204 avg_load_per_task = sg_div_cpu_power(group, 3422 return 0;
3205 sum_avg_load_per_task * SCHED_LOAD_SCALE); 3423}
3424#endif /* CONFIG_SCHED_MC || CONFIG_SCHED_SMT */
3425
3426
3427/**
3428 * update_sg_lb_stats - Update sched_group's statistics for load balancing.
3429 * @group: sched_group whose statistics are to be updated.
3430 * @this_cpu: Cpu for which load balance is currently performed.
3431 * @idle: Idle status of this_cpu
3432 * @load_idx: Load index of sched_domain of this_cpu for load calc.
3433 * @sd_idle: Idle status of the sched_domain containing group.
3434 * @local_group: Does group contain this_cpu.
3435 * @cpus: Set of cpus considered for load balancing.
3436 * @balance: Should we balance.
3437 * @sgs: variable to hold the statistics for this group.
3438 */
3439static inline void update_sg_lb_stats(struct sched_group *group, int this_cpu,
3440 enum cpu_idle_type idle, int load_idx, int *sd_idle,
3441 int local_group, const struct cpumask *cpus,
3442 int *balance, struct sg_lb_stats *sgs)
3443{
3444 unsigned long load, max_cpu_load, min_cpu_load;
3445 int i;
3446 unsigned int balance_cpu = -1, first_idle_cpu = 0;
3447 unsigned long sum_avg_load_per_task;
3448 unsigned long avg_load_per_task;
3449
3450 if (local_group)
3451 balance_cpu = group_first_cpu(group);
3452
3453 /* Tally up the load of all CPUs in the group */
3454 sum_avg_load_per_task = avg_load_per_task = 0;
3455 max_cpu_load = 0;
3456 min_cpu_load = ~0UL;
3206 3457
3207 if ((max_cpu_load - min_cpu_load) > 2*avg_load_per_task) 3458 for_each_cpu_and(i, sched_group_cpus(group), cpus) {
3208 __group_imb = 1; 3459 struct rq *rq = cpu_rq(i);
3209 3460
3210 group_capacity = group->__cpu_power / SCHED_LOAD_SCALE; 3461 if (*sd_idle && rq->nr_running)
3462 *sd_idle = 0;
3211 3463
3464 /* Bias balancing toward cpus of our domain */
3212 if (local_group) { 3465 if (local_group) {
3213 this_load = avg_load; 3466 if (idle_cpu(i) && !first_idle_cpu) {
3214 this = group; 3467 first_idle_cpu = 1;
3215 this_nr_running = sum_nr_running; 3468 balance_cpu = i;
3216 this_load_per_task = sum_weighted_load; 3469 }
3217 } else if (avg_load > max_load && 3470
3218 (sum_nr_running > group_capacity || __group_imb)) { 3471 load = target_load(i, load_idx);
3219 max_load = avg_load; 3472 } else {
3220 busiest = group; 3473 load = source_load(i, load_idx);
3221 busiest_nr_running = sum_nr_running; 3474 if (load > max_cpu_load)
3222 busiest_load_per_task = sum_weighted_load; 3475 max_cpu_load = load;
3223 group_imb = __group_imb; 3476 if (min_cpu_load > load)
3477 min_cpu_load = load;
3224 } 3478 }
3225 3479
3226#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT) 3480 sgs->group_load += load;
3227 /* 3481 sgs->sum_nr_running += rq->nr_running;
3228 * Busy processors will not participate in power savings 3482 sgs->sum_weighted_load += weighted_cpuload(i);
3229 * balance.
3230 */
3231 if (idle == CPU_NOT_IDLE ||
3232 !(sd->flags & SD_POWERSAVINGS_BALANCE))
3233 goto group_next;
3234 3483
3235 /* 3484 sum_avg_load_per_task += cpu_avg_load_per_task(i);
3236 * If the local group is idle or completely loaded 3485 }
3237 * no need to do power savings balance at this domain
3238 */
3239 if (local_group && (this_nr_running >= group_capacity ||
3240 !this_nr_running))
3241 power_savings_balance = 0;
3242 3486
3243 /* 3487 /*
3244 * If a group is already running at full capacity or idle, 3488 * First idle cpu or the first cpu(busiest) in this sched group
3245 * don't include that group in power savings calculations 3489 * is eligible for doing load balancing at this and above
3246 */ 3490 * domains. In the newly idle case, we will allow all the cpu's
3247 if (!power_savings_balance || sum_nr_running >= group_capacity 3491 * to do the newly idle load balance.
3248 || !sum_nr_running) 3492 */
3249 goto group_next; 3493 if (idle != CPU_NEWLY_IDLE && local_group &&
3494 balance_cpu != this_cpu && balance) {
3495 *balance = 0;
3496 return;
3497 }
3250 3498
3251 /* 3499 /* Adjust by relative CPU power of the group */
3252 * Calculate the group which has the least non-idle load. 3500 sgs->avg_load = sg_div_cpu_power(group,
3253 * This is the group from where we need to pick up the load 3501 sgs->group_load * SCHED_LOAD_SCALE);
3254 * for saving power
3255 */
3256 if ((sum_nr_running < min_nr_running) ||
3257 (sum_nr_running == min_nr_running &&
3258 cpumask_first(sched_group_cpus(group)) >
3259 cpumask_first(sched_group_cpus(group_min)))) {
3260 group_min = group;
3261 min_nr_running = sum_nr_running;
3262 min_load_per_task = sum_weighted_load /
3263 sum_nr_running;
3264 }
3265 3502
3266 /* 3503
3267 * Calculate the group which is almost near its 3504 /*
3268 * capacity but still has some space to pick up some load 3505 * Consider the group unbalanced when the imbalance is larger
3269 * from other group and save more power 3506 * than the average weight of two tasks.
3270 */ 3507 *
3271 if (sum_nr_running <= group_capacity - 1) { 3508 * APZ: with cgroup the avg task weight can vary wildly and
3272 if (sum_nr_running > leader_nr_running || 3509 * might not be a suitable number - should we keep a
3273 (sum_nr_running == leader_nr_running && 3510 * normalized nr_running number somewhere that negates
3274 cpumask_first(sched_group_cpus(group)) < 3511 * the hierarchy?
3275 cpumask_first(sched_group_cpus(group_leader)))) { 3512 */
3276 group_leader = group; 3513 avg_load_per_task = sg_div_cpu_power(group,
3277 leader_nr_running = sum_nr_running; 3514 sum_avg_load_per_task * SCHED_LOAD_SCALE);
3278 } 3515
3516 if ((max_cpu_load - min_cpu_load) > 2*avg_load_per_task)
3517 sgs->group_imb = 1;
3518
3519 sgs->group_capacity = group->__cpu_power / SCHED_LOAD_SCALE;
3520
3521}
3522
3523/**
3524 * update_sd_lb_stats - Update sched_group's statistics for load balancing.
3525 * @sd: sched_domain whose statistics are to be updated.
3526 * @this_cpu: Cpu for which load balance is currently performed.
3527 * @idle: Idle status of this_cpu
3528 * @sd_idle: Idle status of the sched_domain containing group.
3529 * @cpus: Set of cpus considered for load balancing.
3530 * @balance: Should we balance.
3531 * @sds: variable to hold the statistics for this sched_domain.
3532 */
3533static inline void update_sd_lb_stats(struct sched_domain *sd, int this_cpu,
3534 enum cpu_idle_type idle, int *sd_idle,
3535 const struct cpumask *cpus, int *balance,
3536 struct sd_lb_stats *sds)
3537{
3538 struct sched_group *group = sd->groups;
3539 struct sg_lb_stats sgs;
3540 int load_idx;
3541
3542 init_sd_power_savings_stats(sd, sds, idle);
3543 load_idx = get_sd_load_idx(sd, idle);
3544
3545 do {
3546 int local_group;
3547
3548 local_group = cpumask_test_cpu(this_cpu,
3549 sched_group_cpus(group));
3550 memset(&sgs, 0, sizeof(sgs));
3551 update_sg_lb_stats(group, this_cpu, idle, load_idx, sd_idle,
3552 local_group, cpus, balance, &sgs);
3553
3554 if (local_group && balance && !(*balance))
3555 return;
3556
3557 sds->total_load += sgs.group_load;
3558 sds->total_pwr += group->__cpu_power;
3559
3560 if (local_group) {
3561 sds->this_load = sgs.avg_load;
3562 sds->this = group;
3563 sds->this_nr_running = sgs.sum_nr_running;
3564 sds->this_load_per_task = sgs.sum_weighted_load;
3565 } else if (sgs.avg_load > sds->max_load &&
3566 (sgs.sum_nr_running > sgs.group_capacity ||
3567 sgs.group_imb)) {
3568 sds->max_load = sgs.avg_load;
3569 sds->busiest = group;
3570 sds->busiest_nr_running = sgs.sum_nr_running;
3571 sds->busiest_load_per_task = sgs.sum_weighted_load;
3572 sds->group_imb = sgs.group_imb;
3279 } 3573 }
3280group_next: 3574
3281#endif 3575 update_sd_power_savings_stats(group, sds, local_group, &sgs);
3282 group = group->next; 3576 group = group->next;
3283 } while (group != sd->groups); 3577 } while (group != sd->groups);
3284 3578
3285 if (!busiest || this_load >= max_load || busiest_nr_running == 0) 3579}
3286 goto out_balanced;
3287
3288 avg_load = (SCHED_LOAD_SCALE * total_load) / total_pwr;
3289 3580
3290 if (this_load >= avg_load || 3581/**
3291 100*max_load <= sd->imbalance_pct*this_load) 3582 * fix_small_imbalance - Calculate the minor imbalance that exists
3292 goto out_balanced; 3583 * amongst the groups of a sched_domain, during
3584 * load balancing.
3585 * @sds: Statistics of the sched_domain whose imbalance is to be calculated.
3586 * @this_cpu: The cpu at whose sched_domain we're performing load-balance.
3587 * @imbalance: Variable to store the imbalance.
3588 */
3589static inline void fix_small_imbalance(struct sd_lb_stats *sds,
3590 int this_cpu, unsigned long *imbalance)
3591{
3592 unsigned long tmp, pwr_now = 0, pwr_move = 0;
3593 unsigned int imbn = 2;
3594
3595 if (sds->this_nr_running) {
3596 sds->this_load_per_task /= sds->this_nr_running;
3597 if (sds->busiest_load_per_task >
3598 sds->this_load_per_task)
3599 imbn = 1;
3600 } else
3601 sds->this_load_per_task =
3602 cpu_avg_load_per_task(this_cpu);
3293 3603
3294 busiest_load_per_task /= busiest_nr_running; 3604 if (sds->max_load - sds->this_load + sds->busiest_load_per_task >=
3295 if (group_imb) 3605 sds->busiest_load_per_task * imbn) {
3296 busiest_load_per_task = min(busiest_load_per_task, avg_load); 3606 *imbalance = sds->busiest_load_per_task;
3607 return;
3608 }
3297 3609
3298 /* 3610 /*
3299 * We're trying to get all the cpus to the average_load, so we don't 3611 * OK, we don't have enough imbalance to justify moving tasks,
3300 * want to push ourselves above the average load, nor do we wish to 3612 * however we may be able to increase total CPU power used by
3301 * reduce the max loaded cpu below the average load, as either of these 3613 * moving them.
3302 * actions would just result in more rebalancing later, and ping-pong
3303 * tasks around. Thus we look for the minimum possible imbalance.
3304 * Negative imbalances (*we* are more loaded than anyone else) will
3305 * be counted as no imbalance for these purposes -- we can't fix that
3306 * by pulling tasks to us. Be careful of negative numbers as they'll
3307 * appear as very large values with unsigned longs.
3308 */ 3614 */
3309 if (max_load <= busiest_load_per_task)
3310 goto out_balanced;
3311 3615
3616 pwr_now += sds->busiest->__cpu_power *
3617 min(sds->busiest_load_per_task, sds->max_load);
3618 pwr_now += sds->this->__cpu_power *
3619 min(sds->this_load_per_task, sds->this_load);
3620 pwr_now /= SCHED_LOAD_SCALE;
3621
3622 /* Amount of load we'd subtract */
3623 tmp = sg_div_cpu_power(sds->busiest,
3624 sds->busiest_load_per_task * SCHED_LOAD_SCALE);
3625 if (sds->max_load > tmp)
3626 pwr_move += sds->busiest->__cpu_power *
3627 min(sds->busiest_load_per_task, sds->max_load - tmp);
3628
3629 /* Amount of load we'd add */
3630 if (sds->max_load * sds->busiest->__cpu_power <
3631 sds->busiest_load_per_task * SCHED_LOAD_SCALE)
3632 tmp = sg_div_cpu_power(sds->this,
3633 sds->max_load * sds->busiest->__cpu_power);
3634 else
3635 tmp = sg_div_cpu_power(sds->this,
3636 sds->busiest_load_per_task * SCHED_LOAD_SCALE);
3637 pwr_move += sds->this->__cpu_power *
3638 min(sds->this_load_per_task, sds->this_load + tmp);
3639 pwr_move /= SCHED_LOAD_SCALE;
3640
3641 /* Move if we gain throughput */
3642 if (pwr_move > pwr_now)
3643 *imbalance = sds->busiest_load_per_task;
3644}
3645
3646/**
3647 * calculate_imbalance - Calculate the amount of imbalance present within the
3648 * groups of a given sched_domain during load balance.
3649 * @sds: statistics of the sched_domain whose imbalance is to be calculated.
3650 * @this_cpu: Cpu for which currently load balance is being performed.
3651 * @imbalance: The variable to store the imbalance.
3652 */
3653static inline void calculate_imbalance(struct sd_lb_stats *sds, int this_cpu,
3654 unsigned long *imbalance)
3655{
3656 unsigned long max_pull;
3312 /* 3657 /*
3313 * In the presence of smp nice balancing, certain scenarios can have 3658 * In the presence of smp nice balancing, certain scenarios can have
3314 * max load less than avg load(as we skip the groups at or below 3659 * max load less than avg load(as we skip the groups at or below
3315 * its cpu_power, while calculating max_load..) 3660 * its cpu_power, while calculating max_load..)
3316 */ 3661 */
3317 if (max_load < avg_load) { 3662 if (sds->max_load < sds->avg_load) {
3318 *imbalance = 0; 3663 *imbalance = 0;
3319 goto small_imbalance; 3664 return fix_small_imbalance(sds, this_cpu, imbalance);
3320 } 3665 }
3321 3666
3322 /* Don't want to pull so many tasks that a group would go idle */ 3667 /* Don't want to pull so many tasks that a group would go idle */
3323 max_pull = min(max_load - avg_load, max_load - busiest_load_per_task); 3668 max_pull = min(sds->max_load - sds->avg_load,
3669 sds->max_load - sds->busiest_load_per_task);
3324 3670
3325 /* How much load to actually move to equalise the imbalance */ 3671 /* How much load to actually move to equalise the imbalance */
3326 *imbalance = min(max_pull * busiest->__cpu_power, 3672 *imbalance = min(max_pull * sds->busiest->__cpu_power,
3327 (avg_load - this_load) * this->__cpu_power) 3673 (sds->avg_load - sds->this_load) * sds->this->__cpu_power)
3328 / SCHED_LOAD_SCALE; 3674 / SCHED_LOAD_SCALE;
3329 3675
3330 /* 3676 /*
@@ -3333,78 +3679,110 @@ group_next:
3333 * a think about bumping its value to force at least one task to be 3679 * a think about bumping its value to force at least one task to be
3334 * moved 3680 * moved
3335 */ 3681 */
3336 if (*imbalance < busiest_load_per_task) { 3682 if (*imbalance < sds->busiest_load_per_task)
3337 unsigned long tmp, pwr_now, pwr_move; 3683 return fix_small_imbalance(sds, this_cpu, imbalance);
3338 unsigned int imbn;
3339
3340small_imbalance:
3341 pwr_move = pwr_now = 0;
3342 imbn = 2;
3343 if (this_nr_running) {
3344 this_load_per_task /= this_nr_running;
3345 if (busiest_load_per_task > this_load_per_task)
3346 imbn = 1;
3347 } else
3348 this_load_per_task = cpu_avg_load_per_task(this_cpu);
3349 3684
3350 if (max_load - this_load + busiest_load_per_task >= 3685}
3351 busiest_load_per_task * imbn) { 3686/******* find_busiest_group() helpers end here *********************/
3352 *imbalance = busiest_load_per_task;
3353 return busiest;
3354 }
3355 3687
3356 /* 3688/**
3357 * OK, we don't have enough imbalance to justify moving tasks, 3689 * find_busiest_group - Returns the busiest group within the sched_domain
3358 * however we may be able to increase total CPU power used by 3690 * if there is an imbalance. If there isn't an imbalance, and
3359 * moving them. 3691 * the user has opted for power-savings, it returns a group whose
3360 */ 3692 * CPUs can be put to idle by rebalancing those tasks elsewhere, if
3693 * such a group exists.
3694 *
3695 * Also calculates the amount of weighted load which should be moved
3696 * to restore balance.
3697 *
3698 * @sd: The sched_domain whose busiest group is to be returned.
3699 * @this_cpu: The cpu for which load balancing is currently being performed.
3700 * @imbalance: Variable which stores amount of weighted load which should
3701 * @imbalance: Variable which stores the amount of weighted load that should
3702 * @idle: The idle status of this_cpu.
3703 * @sd_idle: The idleness of sd
3704 * @cpus: The set of CPUs under consideration for load-balancing.
3705 * @balance: Pointer to a variable indicating if this_cpu
3706 * is the appropriate cpu to perform load balancing at this_level.
3707 *
3708 * Returns: - the busiest group if an imbalance exists.
3709 * - If there is no imbalance and the user has opted for power-savings balance,
3710 * return the least loaded group whose CPUs can be
3711 * put to idle by rebalancing its tasks onto our group.
3712 */
3713static struct sched_group *
3714find_busiest_group(struct sched_domain *sd, int this_cpu,
3715 unsigned long *imbalance, enum cpu_idle_type idle,
3716 int *sd_idle, const struct cpumask *cpus, int *balance)
3717{
3718 struct sd_lb_stats sds;
3361 3719
3362 pwr_now += busiest->__cpu_power * 3720 memset(&sds, 0, sizeof(sds));
3363 min(busiest_load_per_task, max_load);
3364 pwr_now += this->__cpu_power *
3365 min(this_load_per_task, this_load);
3366 pwr_now /= SCHED_LOAD_SCALE;
3367
3368 /* Amount of load we'd subtract */
3369 tmp = sg_div_cpu_power(busiest,
3370 busiest_load_per_task * SCHED_LOAD_SCALE);
3371 if (max_load > tmp)
3372 pwr_move += busiest->__cpu_power *
3373 min(busiest_load_per_task, max_load - tmp);
3374
3375 /* Amount of load we'd add */
3376 if (max_load * busiest->__cpu_power <
3377 busiest_load_per_task * SCHED_LOAD_SCALE)
3378 tmp = sg_div_cpu_power(this,
3379 max_load * busiest->__cpu_power);
3380 else
3381 tmp = sg_div_cpu_power(this,
3382 busiest_load_per_task * SCHED_LOAD_SCALE);
3383 pwr_move += this->__cpu_power *
3384 min(this_load_per_task, this_load + tmp);
3385 pwr_move /= SCHED_LOAD_SCALE;
3386 3721
3387 /* Move if we gain throughput */ 3722 /*
3388 if (pwr_move > pwr_now) 3723 * Compute the various statistics relavent for load balancing at
3389 *imbalance = busiest_load_per_task; 3724 * this level.
3390 } 3725 */
3726 update_sd_lb_stats(sd, this_cpu, idle, sd_idle, cpus,
3727 balance, &sds);
3728
3729 /* Cases where imbalance does not exist from POV of this_cpu */
3730 /* 1) this_cpu is not the appropriate cpu to perform load balancing
3731 * at this level.
3732 * 2) There is no busy sibling group to pull from.
3733 * 3) This group is the busiest group.
3734 * 4) This group is busier than the avg busyness at this
3735 * sched_domain.
3736 * 5) The imbalance is within the specified limit.
3737 * 6) Any rebalance would lead to ping-pong
3738 */
3739 if (balance && !(*balance))
3740 goto ret;
3391 3741
3392 return busiest; 3742 if (!sds.busiest || sds.busiest_nr_running == 0)
3743 goto out_balanced;
3393 3744
3394out_balanced: 3745 if (sds.this_load >= sds.max_load)
3395#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT) 3746 goto out_balanced;
3396 if (idle == CPU_NOT_IDLE || !(sd->flags & SD_POWERSAVINGS_BALANCE))
3397 goto ret;
3398 3747
3399 if (this == group_leader && group_leader != group_min) { 3748 sds.avg_load = (SCHED_LOAD_SCALE * sds.total_load) / sds.total_pwr;
3400 *imbalance = min_load_per_task; 3749
3401 if (sched_mc_power_savings >= POWERSAVINGS_BALANCE_WAKEUP) { 3750 if (sds.this_load >= sds.avg_load)
3402 cpu_rq(this_cpu)->rd->sched_mc_preferred_wakeup_cpu = 3751 goto out_balanced;
3403 cpumask_first(sched_group_cpus(group_leader)); 3752
3404 } 3753 if (100 * sds.max_load <= sd->imbalance_pct * sds.this_load)
3405 return group_min; 3754 goto out_balanced;
3406 } 3755
3407#endif 3756 sds.busiest_load_per_task /= sds.busiest_nr_running;
3757 if (sds.group_imb)
3758 sds.busiest_load_per_task =
3759 min(sds.busiest_load_per_task, sds.avg_load);
3760
3761 /*
3762 * We're trying to get all the cpus to the average_load, so we don't
3763 * want to push ourselves above the average load, nor do we wish to
3764 * reduce the max loaded cpu below the average load, as either of these
3765 * actions would just result in more rebalancing later, and ping-pong
3766 * tasks around. Thus we look for the minimum possible imbalance.
3767 * Negative imbalances (*we* are more loaded than anyone else) will
3768 * be counted as no imbalance for these purposes -- we can't fix that
3769 * by pulling tasks to us. Be careful of negative numbers as they'll
3770 * appear as very large values with unsigned longs.
3771 */
3772 if (sds.max_load <= sds.busiest_load_per_task)
3773 goto out_balanced;
3774
3775 /* Looks like there is an imbalance. Compute it */
3776 calculate_imbalance(&sds, this_cpu, imbalance);
3777 return sds.busiest;
3778
3779out_balanced:
3780 /*
3781 * There is no obvious imbalance. But check if we can do some balancing
3782 * to save power.
3783 */
3784 if (check_power_save_busiest_group(&sds, this_cpu, imbalance))
3785 return sds.busiest;
3408ret: 3786ret:
3409 *imbalance = 0; 3787 *imbalance = 0;
3410 return NULL; 3788 return NULL;
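
The rewritten find_busiest_group() is now a straight decision cascade over the precomputed sd_lb_stats. A minimal userspace model of that cascade, with invented example numbers (the field names and SCHED_LOAD_SCALE mirror the kernel, but this is an illustration, not kernel code):

    #include <stdio.h>

    #define SCHED_LOAD_SCALE 1024UL

    struct lb_stats {
            unsigned long total_load;   /* sum of all group loads */
            unsigned long total_pwr;    /* sum of all group cpu_power */
            unsigned long this_load;    /* load of the local group */
            unsigned long max_load;     /* load of the busiest group */
    };

    /* Returns 1 when the domain looks balanced from this_cpu's POV,
     * following the goto out_balanced chain above. */
    static int looks_balanced(const struct lb_stats *s,
                              unsigned long imbalance_pct)
    {
            unsigned long avg_load;

            if (s->this_load >= s->max_load)
                    return 1;

            avg_load = (SCHED_LOAD_SCALE * s->total_load) / s->total_pwr;
            if (s->this_load >= avg_load)
                    return 1;

            /* within tolerance: imbalance_pct of 125 means 25% slack */
            return 100 * s->max_load <= imbalance_pct * s->this_load;
    }

    int main(void)
    {
            struct lb_stats s = {
                    .total_load = 3072, .total_pwr = 2048,
                    .this_load = 1024, .max_load = 2048,
            };

            /* avg_load = 1024 * 3072 / 2048 = 1536; max_load is well
             * past 125% of this_load, so an imbalance exists (0). */
            printf("balanced: %d\n", looks_balanced(&s, 125));
            return 0;
    }

With these numbers the kernel path would fall through to calculate_imbalance() and return sds.busiest.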
@@ -3448,19 +3826,23 @@ find_busiest_queue(struct sched_group *group, enum cpu_idle_type idle,
3448 */ 3826 */
3449#define MAX_PINNED_INTERVAL 512 3827#define MAX_PINNED_INTERVAL 512
3450 3828
3829/* Working cpumask for load_balance and load_balance_newidle. */
3830static DEFINE_PER_CPU(cpumask_var_t, load_balance_tmpmask);
3831
3451/* 3832/*
3452 * Check this_cpu to ensure it is balanced within domain. Attempt to move 3833 * Check this_cpu to ensure it is balanced within domain. Attempt to move
3453 * tasks if there is an imbalance. 3834 * tasks if there is an imbalance.
3454 */ 3835 */
3455static int load_balance(int this_cpu, struct rq *this_rq, 3836static int load_balance(int this_cpu, struct rq *this_rq,
3456 struct sched_domain *sd, enum cpu_idle_type idle, 3837 struct sched_domain *sd, enum cpu_idle_type idle,
3457 int *balance, struct cpumask *cpus) 3838 int *balance)
3458{ 3839{
3459 int ld_moved, all_pinned = 0, active_balance = 0, sd_idle = 0; 3840 int ld_moved, all_pinned = 0, active_balance = 0, sd_idle = 0;
3460 struct sched_group *group; 3841 struct sched_group *group;
3461 unsigned long imbalance; 3842 unsigned long imbalance;
3462 struct rq *busiest; 3843 struct rq *busiest;
3463 unsigned long flags; 3844 unsigned long flags;
3845 struct cpumask *cpus = __get_cpu_var(load_balance_tmpmask);
3464 3846
3465 cpumask_setall(cpus); 3847 cpumask_setall(cpus);
3466 3848
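
The other half of this hunk is allocation strategy: the cpumask argument threaded through load_balance() is gone, replaced by the statically reserved per-CPU load_balance_tmpmask. A rough userspace analogue of the pattern, with a thread-local buffer standing in for per-CPU data (names and size invented):

    #include <string.h>

    #define MASK_BYTES 128  /* room for 1024 CPUs, chosen arbitrarily */

    /* Stand-in for DEFINE_PER_CPU(cpumask_var_t, load_balance_tmpmask):
     * reserved once, reused on every call. */
    static _Thread_local unsigned char lb_scratch[MASK_BYTES];

    static unsigned char *get_lb_scratch(void)
    {
            memset(lb_scratch, 0xff, sizeof(lb_scratch)); /* cpumask_setall() */
            return lb_scratch;
    }

The hot path neither allocates nor needs an error path for allocation failure, which is exactly what the idle_balance() and rebalance_domains() hunks further down delete.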
@@ -3615,8 +3997,7 @@ out:
3615 * this_rq is locked. 3997 * this_rq is locked.
3616 */ 3998 */
3617static int 3999static int
3618load_balance_newidle(int this_cpu, struct rq *this_rq, struct sched_domain *sd, 4000load_balance_newidle(int this_cpu, struct rq *this_rq, struct sched_domain *sd)
3619 struct cpumask *cpus)
3620{ 4001{
3621 struct sched_group *group; 4002 struct sched_group *group;
3622 struct rq *busiest = NULL; 4003 struct rq *busiest = NULL;
@@ -3624,6 +4005,7 @@ load_balance_newidle(int this_cpu, struct rq *this_rq, struct sched_domain *sd,
3624 int ld_moved = 0; 4005 int ld_moved = 0;
3625 int sd_idle = 0; 4006 int sd_idle = 0;
3626 int all_pinned = 0; 4007 int all_pinned = 0;
4008 struct cpumask *cpus = __get_cpu_var(load_balance_tmpmask);
3627 4009
3628 cpumask_setall(cpus); 4010 cpumask_setall(cpus);
3629 4011
@@ -3764,10 +4146,6 @@ static void idle_balance(int this_cpu, struct rq *this_rq)
3764 struct sched_domain *sd; 4146 struct sched_domain *sd;
3765 int pulled_task = 0; 4147 int pulled_task = 0;
3766 unsigned long next_balance = jiffies + HZ; 4148 unsigned long next_balance = jiffies + HZ;
3767 cpumask_var_t tmpmask;
3768
3769 if (!alloc_cpumask_var(&tmpmask, GFP_ATOMIC))
3770 return;
3771 4149
3772 for_each_domain(this_cpu, sd) { 4150 for_each_domain(this_cpu, sd) {
3773 unsigned long interval; 4151 unsigned long interval;
@@ -3778,7 +4156,7 @@ static void idle_balance(int this_cpu, struct rq *this_rq)
3778 if (sd->flags & SD_BALANCE_NEWIDLE) 4156 if (sd->flags & SD_BALANCE_NEWIDLE)
3779 /* If we've pulled tasks over stop searching: */ 4157 /* If we've pulled tasks over stop searching: */
3780 pulled_task = load_balance_newidle(this_cpu, this_rq, 4158 pulled_task = load_balance_newidle(this_cpu, this_rq,
3781 sd, tmpmask); 4159 sd);
3782 4160
3783 interval = msecs_to_jiffies(sd->balance_interval); 4161 interval = msecs_to_jiffies(sd->balance_interval);
3784 if (time_after(next_balance, sd->last_balance + interval)) 4162 if (time_after(next_balance, sd->last_balance + interval))
@@ -3793,7 +4171,6 @@ static void idle_balance(int this_cpu, struct rq *this_rq)
3793 */ 4171 */
3794 this_rq->next_balance = next_balance; 4172 this_rq->next_balance = next_balance;
3795 } 4173 }
3796 free_cpumask_var(tmpmask);
3797} 4174}
3798 4175
3799/* 4176/*
@@ -3943,11 +4320,6 @@ static void rebalance_domains(int cpu, enum cpu_idle_type idle)
3943 unsigned long next_balance = jiffies + 60*HZ; 4320 unsigned long next_balance = jiffies + 60*HZ;
3944 int update_next_balance = 0; 4321 int update_next_balance = 0;
3945 int need_serialize; 4322 int need_serialize;
3946 cpumask_var_t tmp;
3947
3948 /* Fails alloc? Rebalancing probably not a priority right now. */
3949 if (!alloc_cpumask_var(&tmp, GFP_ATOMIC))
3950 return;
3951 4323
3952 for_each_domain(cpu, sd) { 4324 for_each_domain(cpu, sd) {
3953 if (!(sd->flags & SD_LOAD_BALANCE)) 4325 if (!(sd->flags & SD_LOAD_BALANCE))
@@ -3972,7 +4344,7 @@ static void rebalance_domains(int cpu, enum cpu_idle_type idle)
3972 } 4344 }
3973 4345
3974 if (time_after_eq(jiffies, sd->last_balance + interval)) { 4346 if (time_after_eq(jiffies, sd->last_balance + interval)) {
3975 if (load_balance(cpu, rq, sd, idle, &balance, tmp)) { 4347 if (load_balance(cpu, rq, sd, idle, &balance)) {
3976 /* 4348 /*
3977 * We've pulled tasks over so either we're no 4349 * We've pulled tasks over so either we're no
3978 * longer idle, or one of our SMT siblings is 4350 * longer idle, or one of our SMT siblings is
@@ -4006,8 +4378,6 @@ out:
4006 */ 4378 */
4007 if (likely(update_next_balance)) 4379 if (likely(update_next_balance))
4008 rq->next_balance = next_balance; 4380 rq->next_balance = next_balance;
4009
4010 free_cpumask_var(tmp);
4011} 4381}
4012 4382
4013/* 4383/*
@@ -4057,6 +4427,11 @@ static void run_rebalance_domains(struct softirq_action *h)
4057#endif 4427#endif
4058} 4428}
4059 4429
4430static inline int on_null_domain(int cpu)
4431{
4432 return !rcu_dereference(cpu_rq(cpu)->sd);
4433}
4434
4060/* 4435/*
4061 * Trigger the SCHED_SOFTIRQ if it is time to do periodic load balancing. 4436 * Trigger the SCHED_SOFTIRQ if it is time to do periodic load balancing.
4062 * 4437 *
@@ -4114,7 +4489,9 @@ static inline void trigger_load_balance(struct rq *rq, int cpu)
4114 cpumask_test_cpu(cpu, nohz.cpu_mask)) 4489 cpumask_test_cpu(cpu, nohz.cpu_mask))
4115 return; 4490 return;
4116#endif 4491#endif
4117 if (time_after_eq(jiffies, rq->next_balance)) 4492 /* Don't need to rebalance while attached to NULL domain */
4493 if (time_after_eq(jiffies, rq->next_balance) &&
4494 likely(!on_null_domain(cpu)))
4118 raise_softirq(SCHED_SOFTIRQ); 4495 raise_softirq(SCHED_SOFTIRQ);
4119} 4496}
4120 4497
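
on_null_domain() exists because a CPU whose rq->sd is NULL (an isolated CPU, for example) has no domain hierarchy to balance, so raising SCHED_SOFTIRQ for it is pure overhead. The combined condition, restated as a standalone predicate (a model, not kernel code; the signed subtraction imitates the wrap-safe time_after_eq()):

    #include <stddef.h>

    struct rq_model {
            unsigned long next_balance;
            void *sd;               /* NULL when no domain is attached */
    };

    static int should_raise_softirq(const struct rq_model *rq,
                                    unsigned long now)
    {
            return (long)(now - rq->next_balance) >= 0 &&  /* balance due */
                   rq->sd != NULL;                         /* !on_null_domain() */
    }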
@@ -4508,11 +4885,33 @@ static inline void schedule_debug(struct task_struct *prev)
4508#endif 4885#endif
4509} 4886}
4510 4887
4888static void put_prev_task(struct rq *rq, struct task_struct *prev)
4889{
4890 if (prev->state == TASK_RUNNING) {
4891 u64 runtime = prev->se.sum_exec_runtime;
4892
4893 runtime -= prev->se.prev_sum_exec_runtime;
4894 runtime = min_t(u64, runtime, 2*sysctl_sched_migration_cost);
4895
4896 /*
4897 * In order to avoid avg_overlap growing stale when we are
4898 * indeed overlapping and hence not getting put to sleep, grow
4899 * the avg_overlap on preemption.
4900 *
4901 * We use the average preemption runtime because that
4902 * correlates to the amount of cache footprint a task can
4903 * build up.
4904 */
4905 update_avg(&prev->se.avg_overlap, runtime);
4906 }
4907 prev->sched_class->put_prev_task(rq, prev);
4908}
4909
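
The clamp-then-average in the new put_prev_task() is easier to see with numbers. update_avg(), defined earlier in sched.c, folds one eighth of the difference into the running average; the sketch below restates it, and the 0.5 ms migration cost is an assumed sysctl value, not a figure from the patch:

    #include <stdio.h>

    typedef unsigned long long u64;
    typedef long long s64;

    /* Same shape as the kernel's update_avg(): an exponential moving
     * average with weight 1/8 on the new sample. */
    static void update_avg(u64 *avg, u64 sample)
    {
            s64 diff = (s64)(sample - *avg);
            *avg += diff >> 3;
    }

    int main(void)
    {
            u64 avg_overlap = 0;
            u64 migration_cost = 500000;    /* 0.5 ms, assumed sysctl value */
            u64 ran = 3000000;              /* task preempted after 3 ms */

            /* Clamp exactly as put_prev_task() does above... */
            if (ran > 2 * migration_cost)
                    ran = 2 * migration_cost;       /* -> 1 ms */

            /* ...then fold it in: 0 + (1000000 - 0) / 8 = 125000 */
            update_avg(&avg_overlap, ran);
            printf("avg_overlap = %llu ns\n", avg_overlap);
            return 0;
    }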
4511/* 4910/*
4512 * Pick up the highest-prio task: 4911 * Pick up the highest-prio task:
4513 */ 4912 */
4514static inline struct task_struct * 4913static inline struct task_struct *
4515pick_next_task(struct rq *rq, struct task_struct *prev) 4914pick_next_task(struct rq *rq)
4516{ 4915{
4517 const struct sched_class *class; 4916 const struct sched_class *class;
4518 struct task_struct *p; 4917 struct task_struct *p;
@@ -4584,8 +4983,8 @@ need_resched_nonpreemptible:
4584 if (unlikely(!rq->nr_running)) 4983 if (unlikely(!rq->nr_running))
4585 idle_balance(cpu, rq); 4984 idle_balance(cpu, rq);
4586 4985
4587 prev->sched_class->put_prev_task(rq, prev); 4986 put_prev_task(rq, prev);
4588 next = pick_next_task(rq, prev); 4987 next = pick_next_task(rq);
4589 4988
4590 if (likely(prev != next)) { 4989 if (likely(prev != next)) {
4591 sched_info_switch(prev, next); 4990 sched_info_switch(prev, next);
@@ -4707,7 +5106,7 @@ asmlinkage void __sched preempt_schedule(void)
4707 * between schedule and now. 5106 * between schedule and now.
4708 */ 5107 */
4709 barrier(); 5108 barrier();
4710 } while (unlikely(test_thread_flag(TIF_NEED_RESCHED))); 5109 } while (need_resched());
4711} 5110}
4712EXPORT_SYMBOL(preempt_schedule); 5111EXPORT_SYMBOL(preempt_schedule);
4713 5112
@@ -4736,7 +5135,7 @@ asmlinkage void __sched preempt_schedule_irq(void)
4736 * between schedule and now. 5135 * between schedule and now.
4737 */ 5136 */
4738 barrier(); 5137 barrier();
4739 } while (unlikely(test_thread_flag(TIF_NEED_RESCHED))); 5138 } while (need_resched());
4740} 5139}
4741 5140
4742#endif /* CONFIG_PREEMPT */ 5141#endif /* CONFIG_PREEMPT */
@@ -4797,11 +5196,17 @@ void __wake_up_locked(wait_queue_head_t *q, unsigned int mode)
4797 __wake_up_common(q, mode, 1, 0, NULL); 5196 __wake_up_common(q, mode, 1, 0, NULL);
4798} 5197}
4799 5198
5199void __wake_up_locked_key(wait_queue_head_t *q, unsigned int mode, void *key)
5200{
5201 __wake_up_common(q, mode, 1, 0, key);
5202}
5203
4800/** 5204/**
4801 * __wake_up_sync - wake up threads blocked on a waitqueue. 5205 * __wake_up_sync_key - wake up threads blocked on a waitqueue.
4802 * @q: the waitqueue 5206 * @q: the waitqueue
4803 * @mode: which threads 5207 * @mode: which threads
4804 * @nr_exclusive: how many wake-one or wake-many threads to wake up 5208 * @nr_exclusive: how many wake-one or wake-many threads to wake up
5209 * @key: opaque value to be passed to wakeup targets
4805 * 5210 *
4806 * The sync wakeup differs in that the waker knows that it will schedule 5211 * The sync wakeup differs in that the waker knows that it will schedule
4807 * away soon, so while the target thread will be woken up, it will not 5212 * away soon, so while the target thread will be woken up, it will not
@@ -4810,8 +5215,8 @@ void __wake_up_locked(wait_queue_head_t *q, unsigned int mode)
4810 * 5215 *
4811 * On UP it can prevent extra preemption. 5216 * On UP it can prevent extra preemption.
4812 */ 5217 */
4813void 5218void __wake_up_sync_key(wait_queue_head_t *q, unsigned int mode,
4814__wake_up_sync(wait_queue_head_t *q, unsigned int mode, int nr_exclusive) 5219 int nr_exclusive, void *key)
4815{ 5220{
4816 unsigned long flags; 5221 unsigned long flags;
4817 int sync = 1; 5222 int sync = 1;
@@ -4823,9 +5228,18 @@ __wake_up_sync(wait_queue_head_t *q, unsigned int mode, int nr_exclusive)
4823 sync = 0; 5228 sync = 0;
4824 5229
4825 spin_lock_irqsave(&q->lock, flags); 5230 spin_lock_irqsave(&q->lock, flags);
4826 __wake_up_common(q, mode, nr_exclusive, sync, NULL); 5231 __wake_up_common(q, mode, nr_exclusive, sync, key);
4827 spin_unlock_irqrestore(&q->lock, flags); 5232 spin_unlock_irqrestore(&q->lock, flags);
4828} 5233}
5234EXPORT_SYMBOL_GPL(__wake_up_sync_key);
5235
5236/*
5237 * __wake_up_sync - see __wake_up_sync_key()
5238 */
5239void __wake_up_sync(wait_queue_head_t *q, unsigned int mode, int nr_exclusive)
5240{
5241 __wake_up_sync_key(q, mode, nr_exclusive, NULL);
5242}
4829EXPORT_SYMBOL_GPL(__wake_up_sync); /* For internal use only */ 5243EXPORT_SYMBOL_GPL(__wake_up_sync); /* For internal use only */
4830 5244
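
The *_key variants thread an opaque pointer through __wake_up_common() to each wait entry's wake function, which can use it to decide whether that particular waiter should fire. A hypothetical wake callback sketching the filtering idea; the type and field names are invented, not the kernel's wait-queue types:

    struct waiter_model {
            unsigned long wanted_events;    /* what this waiter sleeps on */
    };

    /* Wake only if the key (here: a bitmask of events that fired)
     * intersects what this waiter cares about. */
    static int keyed_wake_function(struct waiter_model *w, void *key)
    {
            unsigned long events = (unsigned long)key;

            if (key && !(events & w->wanted_events))
                    return 0;       /* skip this waiter, keep scanning */
            return 1;               /* wake it */
    }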
4831/** 5245/**
@@ -5210,7 +5624,7 @@ SYSCALL_DEFINE1(nice, int, increment)
5210 if (increment > 40) 5624 if (increment > 40)
5211 increment = 40; 5625 increment = 40;
5212 5626
5213 nice = PRIO_TO_NICE(current->static_prio) + increment; 5627 nice = TASK_NICE(current) + increment;
5214 if (nice < -20) 5628 if (nice < -20)
5215 nice = -20; 5629 nice = -20;
5216 if (nice > 19) 5630 if (nice > 19)
@@ -6483,7 +6897,7 @@ static void migrate_dead_tasks(unsigned int dead_cpu)
6483 if (!rq->nr_running) 6897 if (!rq->nr_running)
6484 break; 6898 break;
6485 update_rq_clock(rq); 6899 update_rq_clock(rq);
6486 next = pick_next_task(rq, rq->curr); 6900 next = pick_next_task(rq);
6487 if (!next) 6901 if (!next)
6488 break; 6902 break;
6489 next->sched_class->put_prev_task(rq, next); 6903 next->sched_class->put_prev_task(rq, next);
@@ -7314,7 +7728,7 @@ cpu_to_core_group(int cpu, const struct cpumask *cpu_map,
7314{ 7728{
7315 int group; 7729 int group;
7316 7730
7317 cpumask_and(mask, &per_cpu(cpu_sibling_map, cpu), cpu_map); 7731 cpumask_and(mask, topology_thread_cpumask(cpu), cpu_map);
7318 group = cpumask_first(mask); 7732 group = cpumask_first(mask);
7319 if (sg) 7733 if (sg)
7320 *sg = &per_cpu(sched_group_core, group).sg; 7734 *sg = &per_cpu(sched_group_core, group).sg;
@@ -7343,7 +7757,7 @@ cpu_to_phys_group(int cpu, const struct cpumask *cpu_map,
7343 cpumask_and(mask, cpu_coregroup_mask(cpu), cpu_map); 7757 cpumask_and(mask, cpu_coregroup_mask(cpu), cpu_map);
7344 group = cpumask_first(mask); 7758 group = cpumask_first(mask);
7345#elif defined(CONFIG_SCHED_SMT) 7759#elif defined(CONFIG_SCHED_SMT)
7346 cpumask_and(mask, &per_cpu(cpu_sibling_map, cpu), cpu_map); 7760 cpumask_and(mask, topology_thread_cpumask(cpu), cpu_map);
7347 group = cpumask_first(mask); 7761 group = cpumask_first(mask);
7348#else 7762#else
7349 group = cpu; 7763 group = cpu;
@@ -7686,7 +8100,7 @@ static int __build_sched_domains(const struct cpumask *cpu_map,
7686 SD_INIT(sd, SIBLING); 8100 SD_INIT(sd, SIBLING);
7687 set_domain_attribute(sd, attr); 8101 set_domain_attribute(sd, attr);
7688 cpumask_and(sched_domain_span(sd), 8102 cpumask_and(sched_domain_span(sd),
7689 &per_cpu(cpu_sibling_map, i), cpu_map); 8103 topology_thread_cpumask(i), cpu_map);
7690 sd->parent = p; 8104 sd->parent = p;
7691 p->child = sd; 8105 p->child = sd;
7692 cpu_to_cpu_group(i, cpu_map, &sd->groups, tmpmask); 8106 cpu_to_cpu_group(i, cpu_map, &sd->groups, tmpmask);
@@ -7697,7 +8111,7 @@ static int __build_sched_domains(const struct cpumask *cpu_map,
7697 /* Set up CPU (sibling) groups */ 8111 /* Set up CPU (sibling) groups */
7698 for_each_cpu(i, cpu_map) { 8112 for_each_cpu(i, cpu_map) {
7699 cpumask_and(this_sibling_map, 8113 cpumask_and(this_sibling_map,
7700 &per_cpu(cpu_sibling_map, i), cpu_map); 8114 topology_thread_cpumask(i), cpu_map);
7701 if (i != cpumask_first(this_sibling_map)) 8115 if (i != cpumask_first(this_sibling_map))
7702 continue; 8116 continue;
7703 8117
@@ -8278,11 +8692,15 @@ static void init_rt_rq(struct rt_rq *rt_rq, struct rq *rq)
8278 __set_bit(MAX_RT_PRIO, array->bitmap); 8692 __set_bit(MAX_RT_PRIO, array->bitmap);
8279 8693
8280#if defined CONFIG_SMP || defined CONFIG_RT_GROUP_SCHED 8694#if defined CONFIG_SMP || defined CONFIG_RT_GROUP_SCHED
8281 rt_rq->highest_prio = MAX_RT_PRIO; 8695 rt_rq->highest_prio.curr = MAX_RT_PRIO;
8696#ifdef CONFIG_SMP
8697 rt_rq->highest_prio.next = MAX_RT_PRIO;
8698#endif
8282#endif 8699#endif
8283#ifdef CONFIG_SMP 8700#ifdef CONFIG_SMP
8284 rt_rq->rt_nr_migratory = 0; 8701 rt_rq->rt_nr_migratory = 0;
8285 rt_rq->overloaded = 0; 8702 rt_rq->overloaded = 0;
8703 plist_head_init(&rq->rt.pushable_tasks, &rq->lock);
8286#endif 8704#endif
8287 8705
8288 rt_rq->rt_time = 0; 8706 rt_rq->rt_time = 0;
@@ -8369,6 +8787,9 @@ void __init sched_init(void)
8369#ifdef CONFIG_USER_SCHED 8787#ifdef CONFIG_USER_SCHED
8370 alloc_size *= 2; 8788 alloc_size *= 2;
8371#endif 8789#endif
8790#ifdef CONFIG_CPUMASK_OFFSTACK
8791 alloc_size += num_possible_cpus() * cpumask_size();
8792#endif
8372 /* 8793 /*
8373 * As sched_init() is called before page_alloc is setup, 8794 * As sched_init() is called before page_alloc is setup,
8374 * we use alloc_bootmem(). 8795 * we use alloc_bootmem().
@@ -8406,6 +8827,12 @@ void __init sched_init(void)
8406 ptr += nr_cpu_ids * sizeof(void **); 8827 ptr += nr_cpu_ids * sizeof(void **);
8407#endif /* CONFIG_USER_SCHED */ 8828#endif /* CONFIG_USER_SCHED */
8408#endif /* CONFIG_RT_GROUP_SCHED */ 8829#endif /* CONFIG_RT_GROUP_SCHED */
8830#ifdef CONFIG_CPUMASK_OFFSTACK
8831 for_each_possible_cpu(i) {
8832 per_cpu(load_balance_tmpmask, i) = (void *)ptr;
8833 ptr += cpumask_size();
8834 }
8835#endif /* CONFIG_CPUMASK_OFFSTACK */
8409 } 8836 }
8410 8837
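
A quick sizing example for this carve-out; the figures are assumed, not taken from the patch. With CONFIG_CPUMASK_OFFSTACK, cpumask_size() scales with the runtime nr_cpu_ids rather than the compile-time NR_CPUS, so even a large-NR_CPUS kernel reserves very little on a small box:

    #include <stdio.h>

    int main(void)
    {
            unsigned int nr_cpu_ids = 8;                    /* this machine */
            unsigned int longs = (nr_cpu_ids + 63) / 64;    /* BITS_TO_LONGS */
            unsigned int mask_bytes = longs * 8;            /* cpumask_size() */

            /* one scratch mask per possible CPU: 8 * 8 = 64 bytes */
            printf("extra bootmem: %u bytes\n", nr_cpu_ids * mask_bytes);
            return 0;
    }

That is the payoff of CPUMASK_OFFSTACK: mask storage tracks the machine, not the kernel's compile-time maximum.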
8411#ifdef CONFIG_SMP 8838#ifdef CONFIG_SMP
@@ -9658,7 +10085,7 @@ static void cpuacct_charge(struct task_struct *tsk, u64 cputime)
9658 struct cpuacct *ca; 10085 struct cpuacct *ca;
9659 int cpu; 10086 int cpu;
9660 10087
9661 if (!cpuacct_subsys.active) 10088 if (unlikely(!cpuacct_subsys.active))
9662 return; 10089 return;
9663 10090
9664 cpu = task_cpu(tsk); 10091 cpu = task_cpu(tsk);
diff --git a/kernel/sched_clock.c b/kernel/sched_clock.c
index 7ec82c1c61c5..819f17ac796e 100644
--- a/kernel/sched_clock.c
+++ b/kernel/sched_clock.c
@@ -45,9 +45,6 @@ static __read_mostly int sched_clock_running;
45 45
46#ifdef CONFIG_HAVE_UNSTABLE_SCHED_CLOCK 46#ifdef CONFIG_HAVE_UNSTABLE_SCHED_CLOCK
47__read_mostly int sched_clock_stable; 47__read_mostly int sched_clock_stable;
48#else
49static const int sched_clock_stable = 1;
50#endif
51 48
52struct sched_clock_data { 49struct sched_clock_data {
53 /* 50 /*
@@ -116,14 +113,9 @@ static u64 __update_sched_clock(struct sched_clock_data *scd, u64 now)
116 s64 delta = now - scd->tick_raw; 113 s64 delta = now - scd->tick_raw;
117 u64 clock, min_clock, max_clock; 114 u64 clock, min_clock, max_clock;
118 115
119 WARN_ON_ONCE(!irqs_disabled());
120
121 if (unlikely(delta < 0)) 116 if (unlikely(delta < 0))
122 delta = 0; 117 delta = 0;
123 118
124 if (unlikely(!sched_clock_running))
125 return 0ull;
126
127 /* 119 /*
128 * scd->clock = clamp(scd->tick_gtod + delta, 120 * scd->clock = clamp(scd->tick_gtod + delta,
129 * max(scd->tick_gtod, scd->clock), 121 * max(scd->tick_gtod, scd->clock),
@@ -213,18 +205,20 @@ u64 sched_clock_cpu(int cpu)
213 return clock; 205 return clock;
214} 206}
215 207
216#ifdef CONFIG_HAVE_UNSTABLE_SCHED_CLOCK
217
218void sched_clock_tick(void) 208void sched_clock_tick(void)
219{ 209{
220 struct sched_clock_data *scd = this_scd(); 210 struct sched_clock_data *scd;
221 u64 now, now_gtod; 211 u64 now, now_gtod;
222 212
213 if (sched_clock_stable)
214 return;
215
223 if (unlikely(!sched_clock_running)) 216 if (unlikely(!sched_clock_running))
224 return; 217 return;
225 218
226 WARN_ON_ONCE(!irqs_disabled()); 219 WARN_ON_ONCE(!irqs_disabled());
227 220
221 scd = this_scd();
228 now_gtod = ktime_to_ns(ktime_get()); 222 now_gtod = ktime_to_ns(ktime_get());
229 now = sched_clock(); 223 now = sched_clock();
230 224
@@ -257,6 +251,21 @@ void sched_clock_idle_wakeup_event(u64 delta_ns)
257} 251}
258EXPORT_SYMBOL_GPL(sched_clock_idle_wakeup_event); 252EXPORT_SYMBOL_GPL(sched_clock_idle_wakeup_event);
259 253
254#else /* CONFIG_HAVE_UNSTABLE_SCHED_CLOCK */
255
256void sched_clock_init(void)
257{
258 sched_clock_running = 1;
259}
260
261u64 sched_clock_cpu(int cpu)
262{
263 if (unlikely(!sched_clock_running))
264 return 0;
265
266 return sched_clock();
267}
268
260#endif /* CONFIG_HAVE_UNSTABLE_SCHED_CLOCK */ 269#endif /* CONFIG_HAVE_UNSTABLE_SCHED_CLOCK */
261 270
262unsigned long long cpu_clock(int cpu) 271unsigned long long cpu_clock(int cpu)
diff --git a/kernel/sched_cpupri.h b/kernel/sched_cpupri.h
index 642a94ef8a0a..9a7e859b8fbf 100644
--- a/kernel/sched_cpupri.h
+++ b/kernel/sched_cpupri.h
@@ -25,7 +25,7 @@ struct cpupri {
25 25
26#ifdef CONFIG_SMP 26#ifdef CONFIG_SMP
27int cpupri_find(struct cpupri *cp, 27int cpupri_find(struct cpupri *cp,
28 struct task_struct *p, cpumask_t *lowest_mask); 28 struct task_struct *p, struct cpumask *lowest_mask);
29void cpupri_set(struct cpupri *cp, int cpu, int pri); 29void cpupri_set(struct cpupri *cp, int cpu, int pri);
30int cpupri_init(struct cpupri *cp, bool bootmem); 30int cpupri_init(struct cpupri *cp, bool bootmem);
31void cpupri_cleanup(struct cpupri *cp); 31void cpupri_cleanup(struct cpupri *cp);
diff --git a/kernel/sched_debug.c b/kernel/sched_debug.c
index 16eeba4e4169..467ca72f1657 100644
--- a/kernel/sched_debug.c
+++ b/kernel/sched_debug.c
@@ -272,7 +272,6 @@ static void print_cpu(struct seq_file *m, int cpu)
272 P(nr_switches); 272 P(nr_switches);
273 P(nr_load_updates); 273 P(nr_load_updates);
274 P(nr_uninterruptible); 274 P(nr_uninterruptible);
275 SEQ_printf(m, " .%-30s: %lu\n", "jiffies", jiffies);
276 PN(next_balance); 275 PN(next_balance);
277 P(curr->pid); 276 P(curr->pid);
278 PN(clock); 277 PN(clock);
@@ -287,9 +286,6 @@ static void print_cpu(struct seq_file *m, int cpu)
287#ifdef CONFIG_SCHEDSTATS 286#ifdef CONFIG_SCHEDSTATS
288#define P(n) SEQ_printf(m, " .%-30s: %d\n", #n, rq->n); 287#define P(n) SEQ_printf(m, " .%-30s: %d\n", #n, rq->n);
289 288
290 P(yld_exp_empty);
291 P(yld_act_empty);
292 P(yld_both_empty);
293 P(yld_count); 289 P(yld_count);
294 290
295 P(sched_switch); 291 P(sched_switch);
@@ -314,7 +310,7 @@ static int sched_debug_show(struct seq_file *m, void *v)
314 u64 now = ktime_to_ns(ktime_get()); 310 u64 now = ktime_to_ns(ktime_get());
315 int cpu; 311 int cpu;
316 312
317 SEQ_printf(m, "Sched Debug Version: v0.08, %s %.*s\n", 313 SEQ_printf(m, "Sched Debug Version: v0.09, %s %.*s\n",
318 init_utsname()->release, 314 init_utsname()->release,
319 (int)strcspn(init_utsname()->version, " "), 315 (int)strcspn(init_utsname()->version, " "),
320 init_utsname()->version); 316 init_utsname()->version);
@@ -325,6 +321,7 @@ static int sched_debug_show(struct seq_file *m, void *v)
325 SEQ_printf(m, " .%-40s: %Ld\n", #x, (long long)(x)) 321 SEQ_printf(m, " .%-40s: %Ld\n", #x, (long long)(x))
326#define PN(x) \ 322#define PN(x) \
327 SEQ_printf(m, " .%-40s: %Ld.%06ld\n", #x, SPLIT_NS(x)) 323 SEQ_printf(m, " .%-40s: %Ld.%06ld\n", #x, SPLIT_NS(x))
324 P(jiffies);
328 PN(sysctl_sched_latency); 325 PN(sysctl_sched_latency);
329 PN(sysctl_sched_min_granularity); 326 PN(sysctl_sched_min_granularity);
330 PN(sysctl_sched_wakeup_granularity); 327 PN(sysctl_sched_wakeup_granularity);
@@ -397,6 +394,7 @@ void proc_sched_show_task(struct task_struct *p, struct seq_file *m)
397 PN(se.vruntime); 394 PN(se.vruntime);
398 PN(se.sum_exec_runtime); 395 PN(se.sum_exec_runtime);
399 PN(se.avg_overlap); 396 PN(se.avg_overlap);
397 PN(se.avg_wakeup);
400 398
401 nr_switches = p->nvcsw + p->nivcsw; 399 nr_switches = p->nvcsw + p->nivcsw;
402 400
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index 0566f2a03c42..3816f217f119 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -1314,16 +1314,63 @@ out:
1314} 1314}
1315#endif /* CONFIG_SMP */ 1315#endif /* CONFIG_SMP */
1316 1316
1317static unsigned long wakeup_gran(struct sched_entity *se) 1317/*
1318 * Adaptive granularity
1319 *
1320 * se->avg_wakeup gives the average time a task runs until it does a wakeup,
1321 * with the limit of wakeup_gran -- when it never does a wakeup.
1322 *
1323 * So the smaller avg_wakeup is, the faster we want this task to preempt,
1324 * but we don't want to treat the preemptee unfairly and therefore allow it
1325 * to run for at least the amount of time we'd like to run.
1326 *
1327 * NOTE: we use 2*avg_wakeup to increase the probability of actually doing one
1328 *
1329 * NOTE: we use *nr_running to scale with load; this nicely matches the
1330 * degrading latency on load.
1331 */
1332static unsigned long
1333adaptive_gran(struct sched_entity *curr, struct sched_entity *se)
1334{
1335 u64 this_run = curr->sum_exec_runtime - curr->prev_sum_exec_runtime;
1336 u64 expected_wakeup = 2*se->avg_wakeup * cfs_rq_of(se)->nr_running;
1337 u64 gran = 0;
1338
1339 if (this_run < expected_wakeup)
1340 gran = expected_wakeup - this_run;
1341
1342 return min_t(s64, gran, sysctl_sched_wakeup_granularity);
1343}
1344
1345static unsigned long
1346wakeup_gran(struct sched_entity *curr, struct sched_entity *se)
1318{ 1347{
1319 unsigned long gran = sysctl_sched_wakeup_granularity; 1348 unsigned long gran = sysctl_sched_wakeup_granularity;
1320 1349
1350 if (cfs_rq_of(curr)->curr && sched_feat(ADAPTIVE_GRAN))
1351 gran = adaptive_gran(curr, se);
1352
1321 /* 1353 /*
1322 * More easily preempt - nice tasks, while not making it harder for 1354 * Since curr is running now, convert the gran from real-time
1323 * + nice tasks. 1355 * to virtual-time in its units.
1324 */ 1356 */
1325 if (!sched_feat(ASYM_GRAN) || se->load.weight > NICE_0_LOAD) 1357 if (sched_feat(ASYM_GRAN)) {
1326 gran = calc_delta_fair(sysctl_sched_wakeup_granularity, se); 1358 /*
1359 * By using 'se' instead of 'curr' we penalize light tasks, so
1360 * they get preempted easier. That is, if 'se' < 'curr' then
1361 * the resulting gran will be larger, therefore penalizing the
1362 * lighter, if otoh 'se' > 'curr' then the resulting gran will
1363 * be smaller, again penalizing the lighter task.
1364 *
1365 * This is especially important for buddies when the leftmost
1366 * task is higher priority than the buddy.
1367 */
1368 if (unlikely(se->load.weight != NICE_0_LOAD))
1369 gran = calc_delta_fair(gran, se);
1370 } else {
1371 if (unlikely(curr->load.weight != NICE_0_LOAD))
1372 gran = calc_delta_fair(gran, curr);
1373 }
1327 1374
1328 return gran; 1375 return gran;
1329} 1376}
@@ -1350,7 +1397,7 @@ wakeup_preempt_entity(struct sched_entity *curr, struct sched_entity *se)
1350 if (vdiff <= 0) 1397 if (vdiff <= 0)
1351 return -1; 1398 return -1;
1352 1399
1353 gran = wakeup_gran(curr); 1400 gran = wakeup_gran(curr, se);
1354 if (vdiff > gran) 1401 if (vdiff > gran)
1355 return 1; 1402 return 1;
1356 1403
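
adaptive_gran() is easiest to sanity-check with concrete numbers; everything below is invented for illustration (the wakeup-granularity default depends on configuration):

    #include <stdio.h>

    typedef unsigned long long u64;

    int main(void)
    {
            u64 wakeup_granularity = 5000000; /* 5 ms, assumed sysctl value */
            u64 avg_wakeup = 1000000;  /* woken task usually runs ~1 ms
                                        * before waking someone itself */
            u64 nr_running = 2;
            u64 this_run = 1000000;    /* curr has run 1 ms since wakeup */
            u64 gran = 0;

            /* expected_wakeup = 2 * 1 ms * 2 = 4 ms */
            u64 expected_wakeup = 2 * avg_wakeup * nr_running;

            if (this_run < expected_wakeup)
                    gran = expected_wakeup - this_run;      /* 3 ms */
            if (gran > wakeup_granularity)
                    gran = wakeup_granularity;              /* clamp */

            printf("gran = %llu ns\n", gran);               /* 3000000 */
            return 0;
    }

So a task that wakes others frequently (small avg_wakeup) yields a small gran and preempts quickly, while one that never wakes anyone is held to the full granularity.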
diff --git a/kernel/sched_features.h b/kernel/sched_features.h
index 07bc02e99ab1..4569bfa7df9b 100644
--- a/kernel/sched_features.h
+++ b/kernel/sched_features.h
@@ -1,5 +1,6 @@
1SCHED_FEAT(NEW_FAIR_SLEEPERS, 1) 1SCHED_FEAT(NEW_FAIR_SLEEPERS, 1)
2SCHED_FEAT(NORMALIZED_SLEEPER, 1) 2SCHED_FEAT(NORMALIZED_SLEEPER, 0)
3SCHED_FEAT(ADAPTIVE_GRAN, 1)
3SCHED_FEAT(WAKEUP_PREEMPT, 1) 4SCHED_FEAT(WAKEUP_PREEMPT, 1)
4SCHED_FEAT(START_DEBIT, 1) 5SCHED_FEAT(START_DEBIT, 1)
5SCHED_FEAT(AFFINE_WAKEUPS, 1) 6SCHED_FEAT(AFFINE_WAKEUPS, 1)
diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c
index da932f4c8524..299d012b4394 100644
--- a/kernel/sched_rt.c
+++ b/kernel/sched_rt.c
@@ -3,6 +3,40 @@
3 * policies) 3 * policies)
4 */ 4 */
5 5
6static inline struct task_struct *rt_task_of(struct sched_rt_entity *rt_se)
7{
8 return container_of(rt_se, struct task_struct, rt);
9}
10
11#ifdef CONFIG_RT_GROUP_SCHED
12
13static inline struct rq *rq_of_rt_rq(struct rt_rq *rt_rq)
14{
15 return rt_rq->rq;
16}
17
18static inline struct rt_rq *rt_rq_of_se(struct sched_rt_entity *rt_se)
19{
20 return rt_se->rt_rq;
21}
22
23#else /* CONFIG_RT_GROUP_SCHED */
24
25static inline struct rq *rq_of_rt_rq(struct rt_rq *rt_rq)
26{
27 return container_of(rt_rq, struct rq, rt);
28}
29
30static inline struct rt_rq *rt_rq_of_se(struct sched_rt_entity *rt_se)
31{
32 struct task_struct *p = rt_task_of(rt_se);
33 struct rq *rq = task_rq(p);
34
35 return &rq->rt;
36}
37
38#endif /* CONFIG_RT_GROUP_SCHED */
39
6#ifdef CONFIG_SMP 40#ifdef CONFIG_SMP
7 41
8static inline int rt_overloaded(struct rq *rq) 42static inline int rt_overloaded(struct rq *rq)
@@ -37,25 +71,69 @@ static inline void rt_clear_overload(struct rq *rq)
37 cpumask_clear_cpu(rq->cpu, rq->rd->rto_mask); 71 cpumask_clear_cpu(rq->cpu, rq->rd->rto_mask);
38} 72}
39 73
40static void update_rt_migration(struct rq *rq) 74static void update_rt_migration(struct rt_rq *rt_rq)
41{ 75{
42 if (rq->rt.rt_nr_migratory && (rq->rt.rt_nr_running > 1)) { 76 if (rt_rq->rt_nr_migratory && (rt_rq->rt_nr_running > 1)) {
43 if (!rq->rt.overloaded) { 77 if (!rt_rq->overloaded) {
44 rt_set_overload(rq); 78 rt_set_overload(rq_of_rt_rq(rt_rq));
45 rq->rt.overloaded = 1; 79 rt_rq->overloaded = 1;
46 } 80 }
47 } else if (rq->rt.overloaded) { 81 } else if (rt_rq->overloaded) {
48 rt_clear_overload(rq); 82 rt_clear_overload(rq_of_rt_rq(rt_rq));
49 rq->rt.overloaded = 0; 83 rt_rq->overloaded = 0;
50 } 84 }
51} 85}
52#endif /* CONFIG_SMP */
53 86
54static inline struct task_struct *rt_task_of(struct sched_rt_entity *rt_se) 87static void inc_rt_migration(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
88{
89 if (rt_se->nr_cpus_allowed > 1)
90 rt_rq->rt_nr_migratory++;
91
92 update_rt_migration(rt_rq);
93}
94
95static void dec_rt_migration(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
96{
97 if (rt_se->nr_cpus_allowed > 1)
98 rt_rq->rt_nr_migratory--;
99
100 update_rt_migration(rt_rq);
101}
102
103static void enqueue_pushable_task(struct rq *rq, struct task_struct *p)
104{
105 plist_del(&p->pushable_tasks, &rq->rt.pushable_tasks);
106 plist_node_init(&p->pushable_tasks, p->prio);
107 plist_add(&p->pushable_tasks, &rq->rt.pushable_tasks);
108}
109
110static void dequeue_pushable_task(struct rq *rq, struct task_struct *p)
111{
112 plist_del(&p->pushable_tasks, &rq->rt.pushable_tasks);
113}
114
115#else
116
117static inline void enqueue_pushable_task(struct rq *rq, struct task_struct *p)
55{ 118{
56 return container_of(rt_se, struct task_struct, rt);
57} 119}
58 120
121static inline void dequeue_pushable_task(struct rq *rq, struct task_struct *p)
122{
123}
124
125static inline
126void inc_rt_migration(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
127{
128}
129
130static inline
131void dec_rt_migration(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
132{
133}
134
135#endif /* CONFIG_SMP */
136
59static inline int on_rt_rq(struct sched_rt_entity *rt_se) 137static inline int on_rt_rq(struct sched_rt_entity *rt_se)
60{ 138{
61 return !list_empty(&rt_se->run_list); 139 return !list_empty(&rt_se->run_list);
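
pushable_tasks is a plist, kept sorted by p->prio so that its head is always the best push candidate; pick_next_pushable_task() further down depends on exactly that invariant. A toy sorted list to make the ordering concrete (a model, not the kernel's plist API):

    #include <stdio.h>

    struct node { int prio; struct node *next; };  /* lower = higher prio */

    static void sorted_insert(struct node **head, struct node *n)
    {
            while (*head && (*head)->prio <= n->prio)
                    head = &(*head)->next;
            n->next = *head;
            *head = n;
    }

    int main(void)
    {
            struct node a = { .prio = 20 }, b = { .prio = 5 },
                        c = { .prio = 10 };
            struct node *head = NULL;

            sorted_insert(&head, &a);
            sorted_insert(&head, &b);
            sorted_insert(&head, &c);
            printf("first pushable prio: %d\n", head->prio);  /* 5 */
            return 0;
    }

Note how enqueue_pushable_task() above deletes before re-initialising and re-adding the node: that is how a task whose prio changed gets resorted rather than left in a stale position.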
@@ -79,16 +157,6 @@ static inline u64 sched_rt_period(struct rt_rq *rt_rq)
79#define for_each_leaf_rt_rq(rt_rq, rq) \ 157#define for_each_leaf_rt_rq(rt_rq, rq) \
80 list_for_each_entry_rcu(rt_rq, &rq->leaf_rt_rq_list, leaf_rt_rq_list) 158 list_for_each_entry_rcu(rt_rq, &rq->leaf_rt_rq_list, leaf_rt_rq_list)
81 159
82static inline struct rq *rq_of_rt_rq(struct rt_rq *rt_rq)
83{
84 return rt_rq->rq;
85}
86
87static inline struct rt_rq *rt_rq_of_se(struct sched_rt_entity *rt_se)
88{
89 return rt_se->rt_rq;
90}
91
92#define for_each_sched_rt_entity(rt_se) \ 160#define for_each_sched_rt_entity(rt_se) \
93 for (; rt_se; rt_se = rt_se->parent) 161 for (; rt_se; rt_se = rt_se->parent)
94 162
@@ -108,7 +176,7 @@ static void sched_rt_rq_enqueue(struct rt_rq *rt_rq)
108 if (rt_rq->rt_nr_running) { 176 if (rt_rq->rt_nr_running) {
109 if (rt_se && !on_rt_rq(rt_se)) 177 if (rt_se && !on_rt_rq(rt_se))
110 enqueue_rt_entity(rt_se); 178 enqueue_rt_entity(rt_se);
111 if (rt_rq->highest_prio < curr->prio) 179 if (rt_rq->highest_prio.curr < curr->prio)
112 resched_task(curr); 180 resched_task(curr);
113 } 181 }
114} 182}
@@ -176,19 +244,6 @@ static inline u64 sched_rt_period(struct rt_rq *rt_rq)
176#define for_each_leaf_rt_rq(rt_rq, rq) \ 244#define for_each_leaf_rt_rq(rt_rq, rq) \
177 for (rt_rq = &rq->rt; rt_rq; rt_rq = NULL) 245 for (rt_rq = &rq->rt; rt_rq; rt_rq = NULL)
178 246
179static inline struct rq *rq_of_rt_rq(struct rt_rq *rt_rq)
180{
181 return container_of(rt_rq, struct rq, rt);
182}
183
184static inline struct rt_rq *rt_rq_of_se(struct sched_rt_entity *rt_se)
185{
186 struct task_struct *p = rt_task_of(rt_se);
187 struct rq *rq = task_rq(p);
188
189 return &rq->rt;
190}
191
192#define for_each_sched_rt_entity(rt_se) \ 247#define for_each_sched_rt_entity(rt_se) \
193 for (; rt_se; rt_se = NULL) 248 for (; rt_se; rt_se = NULL)
194 249
@@ -473,7 +528,7 @@ static inline int rt_se_prio(struct sched_rt_entity *rt_se)
473 struct rt_rq *rt_rq = group_rt_rq(rt_se); 528 struct rt_rq *rt_rq = group_rt_rq(rt_se);
474 529
475 if (rt_rq) 530 if (rt_rq)
476 return rt_rq->highest_prio; 531 return rt_rq->highest_prio.curr;
477#endif 532#endif
478 533
479 return rt_task_of(rt_se)->prio; 534 return rt_task_of(rt_se)->prio;
@@ -547,91 +602,174 @@ static void update_curr_rt(struct rq *rq)
547 } 602 }
548} 603}
549 604
550static inline 605#if defined CONFIG_SMP
551void inc_rt_tasks(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq) 606
607static struct task_struct *pick_next_highest_task_rt(struct rq *rq, int cpu);
608
609static inline int next_prio(struct rq *rq)
552{ 610{
553 WARN_ON(!rt_prio(rt_se_prio(rt_se))); 611 struct task_struct *next = pick_next_highest_task_rt(rq, rq->cpu);
554 rt_rq->rt_nr_running++; 612
555#if defined CONFIG_SMP || defined CONFIG_RT_GROUP_SCHED 613 if (next && rt_prio(next->prio))
556 if (rt_se_prio(rt_se) < rt_rq->highest_prio) { 614 return next->prio;
557#ifdef CONFIG_SMP 615 else
558 struct rq *rq = rq_of_rt_rq(rt_rq); 616 return MAX_RT_PRIO;
559#endif 617}
618
619static void
620inc_rt_prio_smp(struct rt_rq *rt_rq, int prio, int prev_prio)
621{
622 struct rq *rq = rq_of_rt_rq(rt_rq);
623
624 if (prio < prev_prio) {
625
626 /*
627 * If the new task is higher in priority than anything on the
628 * run-queue, we know that the previous high becomes our
629 * next-highest.
630 */
631 rt_rq->highest_prio.next = prev_prio;
560 632
561 rt_rq->highest_prio = rt_se_prio(rt_se);
562#ifdef CONFIG_SMP
563 if (rq->online) 633 if (rq->online)
564 cpupri_set(&rq->rd->cpupri, rq->cpu, 634 cpupri_set(&rq->rd->cpupri, rq->cpu, prio);
565 rt_se_prio(rt_se));
566#endif
567 }
568#endif
569#ifdef CONFIG_SMP
570 if (rt_se->nr_cpus_allowed > 1) {
571 struct rq *rq = rq_of_rt_rq(rt_rq);
572 635
573 rq->rt.rt_nr_migratory++; 636 } else if (prio == rt_rq->highest_prio.curr)
574 } 637 /*
638 * If the next task is equal in priority to the highest on
639 * the run-queue, then we implicitly know that the next highest
640 * task cannot be any lower than current
641 */
642 rt_rq->highest_prio.next = prio;
643 else if (prio < rt_rq->highest_prio.next)
644 /*
645 * Otherwise, we need to recompute next-highest
646 */
647 rt_rq->highest_prio.next = next_prio(rq);
648}
575 649
576 update_rt_migration(rq_of_rt_rq(rt_rq)); 650static void
577#endif 651dec_rt_prio_smp(struct rt_rq *rt_rq, int prio, int prev_prio)
578#ifdef CONFIG_RT_GROUP_SCHED 652{
579 if (rt_se_boosted(rt_se)) 653 struct rq *rq = rq_of_rt_rq(rt_rq);
580 rt_rq->rt_nr_boosted++;
581 654
582 if (rt_rq->tg) 655 if (rt_rq->rt_nr_running && (prio <= rt_rq->highest_prio.next))
583 start_rt_bandwidth(&rt_rq->tg->rt_bandwidth); 656 rt_rq->highest_prio.next = next_prio(rq);
584#else 657
585 start_rt_bandwidth(&def_rt_bandwidth); 658 if (rq->online && rt_rq->highest_prio.curr != prev_prio)
586#endif 659 cpupri_set(&rq->rd->cpupri, rq->cpu, rt_rq->highest_prio.curr);
587} 660}
588 661
662#else /* CONFIG_SMP */
663
589static inline 664static inline
590void dec_rt_tasks(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq) 665void inc_rt_prio_smp(struct rt_rq *rt_rq, int prio, int prev_prio) {}
591{ 666static inline
592#ifdef CONFIG_SMP 667void dec_rt_prio_smp(struct rt_rq *rt_rq, int prio, int prev_prio) {}
593 int highest_prio = rt_rq->highest_prio; 668
594#endif 669#endif /* CONFIG_SMP */
595 670
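
The highest_prio.{curr,next} bookkeeping deserves a trace. The sketch below mirrors the enqueue side of inc_rt_prio()/inc_rt_prio_smp(); on the branch where the new prio lands between curr and next, the kernel recomputes via next_prio(), which this simplification replaces with a direct assignment:

    #include <stdio.h>

    #define MAX_RT_PRIO 100

    struct prio_track { int curr, next; };  /* lower value = higher prio */

    static void track_enqueue(struct prio_track *t, int prio)
    {
            if (prio < t->curr) {
                    t->next = t->curr;      /* old best becomes next-best */
                    t->curr = prio;
            } else if (prio == t->curr) {
                    t->next = prio;         /* a twin: next can't be lower */
            } else if (prio < t->next) {
                    t->next = prio;         /* kernel: next_prio(rq) here */
            }
    }

    int main(void)
    {
            struct prio_track t = { MAX_RT_PRIO, MAX_RT_PRIO };

            track_enqueue(&t, 20);  /* curr=20  next=100 */
            track_enqueue(&t, 10);  /* curr=10  next=20  */
            track_enqueue(&t, 10);  /* curr=10  next=10  */
            printf("curr=%d next=%d\n", t.curr, t.next);
            return 0;
    }

Dequeue is the expensive direction: when the departing prio equals curr, dec_rt_prio() rescans the priority bitmap, and dec_rt_prio_smp() may call next_prio() to rebuild next.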
596 WARN_ON(!rt_prio(rt_se_prio(rt_se)));
597 WARN_ON(!rt_rq->rt_nr_running);
598 rt_rq->rt_nr_running--;
599#if defined CONFIG_SMP || defined CONFIG_RT_GROUP_SCHED 671#if defined CONFIG_SMP || defined CONFIG_RT_GROUP_SCHED
672static void
673inc_rt_prio(struct rt_rq *rt_rq, int prio)
674{
675 int prev_prio = rt_rq->highest_prio.curr;
676
677 if (prio < prev_prio)
678 rt_rq->highest_prio.curr = prio;
679
680 inc_rt_prio_smp(rt_rq, prio, prev_prio);
681}
682
683static void
684dec_rt_prio(struct rt_rq *rt_rq, int prio)
685{
686 int prev_prio = rt_rq->highest_prio.curr;
687
600 if (rt_rq->rt_nr_running) { 688 if (rt_rq->rt_nr_running) {
601 struct rt_prio_array *array;
602 689
603 WARN_ON(rt_se_prio(rt_se) < rt_rq->highest_prio); 690 WARN_ON(prio < prev_prio);
604 if (rt_se_prio(rt_se) == rt_rq->highest_prio) { 691
605 /* recalculate */ 692 /*
606 array = &rt_rq->active; 693 * This may have been our highest task, and therefore
607 rt_rq->highest_prio = 694 * we may have some recomputation to do
695 */
696 if (prio == prev_prio) {
697 struct rt_prio_array *array = &rt_rq->active;
698
699 rt_rq->highest_prio.curr =
608 sched_find_first_bit(array->bitmap); 700 sched_find_first_bit(array->bitmap);
609 } /* otherwise leave rq->highest prio alone */ 701 }
702
610 } else 703 } else
611 rt_rq->highest_prio = MAX_RT_PRIO; 704 rt_rq->highest_prio.curr = MAX_RT_PRIO;
612#endif
613#ifdef CONFIG_SMP
614 if (rt_se->nr_cpus_allowed > 1) {
615 struct rq *rq = rq_of_rt_rq(rt_rq);
616 rq->rt.rt_nr_migratory--;
617 }
618 705
619 if (rt_rq->highest_prio != highest_prio) { 706 dec_rt_prio_smp(rt_rq, prio, prev_prio);
620 struct rq *rq = rq_of_rt_rq(rt_rq); 707}
621 708
622 if (rq->online) 709#else
623 cpupri_set(&rq->rd->cpupri, rq->cpu, 710
624 rt_rq->highest_prio); 711static inline void inc_rt_prio(struct rt_rq *rt_rq, int prio) {}
625 } 712static inline void dec_rt_prio(struct rt_rq *rt_rq, int prio) {}
713
714#endif /* CONFIG_SMP || CONFIG_RT_GROUP_SCHED */
626 715
627 update_rt_migration(rq_of_rt_rq(rt_rq));
628#endif /* CONFIG_SMP */
629#ifdef CONFIG_RT_GROUP_SCHED 716#ifdef CONFIG_RT_GROUP_SCHED
717
718static void
719inc_rt_group(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
720{
721 if (rt_se_boosted(rt_se))
722 rt_rq->rt_nr_boosted++;
723
724 if (rt_rq->tg)
725 start_rt_bandwidth(&rt_rq->tg->rt_bandwidth);
726}
727
728static void
729dec_rt_group(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
730{
630 if (rt_se_boosted(rt_se)) 731 if (rt_se_boosted(rt_se))
631 rt_rq->rt_nr_boosted--; 732 rt_rq->rt_nr_boosted--;
632 733
633 WARN_ON(!rt_rq->rt_nr_running && rt_rq->rt_nr_boosted); 734 WARN_ON(!rt_rq->rt_nr_running && rt_rq->rt_nr_boosted);
634#endif 735}
736
737#else /* CONFIG_RT_GROUP_SCHED */
738
739static void
740inc_rt_group(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
741{
742 start_rt_bandwidth(&def_rt_bandwidth);
743}
744
745static inline
746void dec_rt_group(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq) {}
747
748#endif /* CONFIG_RT_GROUP_SCHED */
749
750static inline
751void inc_rt_tasks(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
752{
753 int prio = rt_se_prio(rt_se);
754
755 WARN_ON(!rt_prio(prio));
756 rt_rq->rt_nr_running++;
757
758 inc_rt_prio(rt_rq, prio);
759 inc_rt_migration(rt_se, rt_rq);
760 inc_rt_group(rt_se, rt_rq);
761}
762
763static inline
764void dec_rt_tasks(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
765{
766 WARN_ON(!rt_prio(rt_se_prio(rt_se)));
767 WARN_ON(!rt_rq->rt_nr_running);
768 rt_rq->rt_nr_running--;
769
770 dec_rt_prio(rt_rq, rt_se_prio(rt_se));
771 dec_rt_migration(rt_se, rt_rq);
772 dec_rt_group(rt_se, rt_rq);
635} 773}
636 774
637static void __enqueue_rt_entity(struct sched_rt_entity *rt_se) 775static void __enqueue_rt_entity(struct sched_rt_entity *rt_se)
@@ -718,6 +856,9 @@ static void enqueue_task_rt(struct rq *rq, struct task_struct *p, int wakeup)
718 856
719 enqueue_rt_entity(rt_se); 857 enqueue_rt_entity(rt_se);
720 858
859 if (!task_current(rq, p) && p->rt.nr_cpus_allowed > 1)
860 enqueue_pushable_task(rq, p);
861
721 inc_cpu_load(rq, p->se.load.weight); 862 inc_cpu_load(rq, p->se.load.weight);
722} 863}
723 864
@@ -728,6 +869,8 @@ static void dequeue_task_rt(struct rq *rq, struct task_struct *p, int sleep)
728 update_curr_rt(rq); 869 update_curr_rt(rq);
729 dequeue_rt_entity(rt_se); 870 dequeue_rt_entity(rt_se);
730 871
872 dequeue_pushable_task(rq, p);
873
731 dec_cpu_load(rq, p->se.load.weight); 874 dec_cpu_load(rq, p->se.load.weight);
732} 875}
733 876
@@ -878,7 +1021,7 @@ static struct sched_rt_entity *pick_next_rt_entity(struct rq *rq,
878 return next; 1021 return next;
879} 1022}
880 1023
881static struct task_struct *pick_next_task_rt(struct rq *rq) 1024static struct task_struct *_pick_next_task_rt(struct rq *rq)
882{ 1025{
883 struct sched_rt_entity *rt_se; 1026 struct sched_rt_entity *rt_se;
884 struct task_struct *p; 1027 struct task_struct *p;
@@ -900,6 +1043,18 @@ static struct task_struct *pick_next_task_rt(struct rq *rq)
900 1043
901 p = rt_task_of(rt_se); 1044 p = rt_task_of(rt_se);
902 p->se.exec_start = rq->clock; 1045 p->se.exec_start = rq->clock;
1046
1047 return p;
1048}
1049
1050static struct task_struct *pick_next_task_rt(struct rq *rq)
1051{
1052 struct task_struct *p = _pick_next_task_rt(rq);
1053
1054 /* The running task is never eligible for pushing */
1055 if (p)
1056 dequeue_pushable_task(rq, p);
1057
903 return p; 1058 return p;
904} 1059}
905 1060
@@ -907,6 +1062,13 @@ static void put_prev_task_rt(struct rq *rq, struct task_struct *p)
907{ 1062{
908 update_curr_rt(rq); 1063 update_curr_rt(rq);
909 p->se.exec_start = 0; 1064 p->se.exec_start = 0;
1065
1066 /*
1067 * The previous task needs to be made eligible for pushing
1068 * if it is still active
1069 */
1070 if (p->se.on_rq && p->rt.nr_cpus_allowed > 1)
1071 enqueue_pushable_task(rq, p);
910} 1072}
911 1073
912#ifdef CONFIG_SMP 1074#ifdef CONFIG_SMP
@@ -1080,7 +1242,7 @@ static struct rq *find_lock_lowest_rq(struct task_struct *task, struct rq *rq)
1080 } 1242 }
1081 1243
1082 /* If this rq is still suitable use it. */ 1244 /* If this rq is still suitable use it. */
1083 if (lowest_rq->rt.highest_prio > task->prio) 1245 if (lowest_rq->rt.highest_prio.curr > task->prio)
1084 break; 1246 break;
1085 1247
1086 /* try again */ 1248 /* try again */
@@ -1091,6 +1253,31 @@ static struct rq *find_lock_lowest_rq(struct task_struct *task, struct rq *rq)
1091 return lowest_rq; 1253 return lowest_rq;
1092} 1254}
1093 1255
1256static inline int has_pushable_tasks(struct rq *rq)
1257{
1258 return !plist_head_empty(&rq->rt.pushable_tasks);
1259}
1260
1261static struct task_struct *pick_next_pushable_task(struct rq *rq)
1262{
1263 struct task_struct *p;
1264
1265 if (!has_pushable_tasks(rq))
1266 return NULL;
1267
1268 p = plist_first_entry(&rq->rt.pushable_tasks,
1269 struct task_struct, pushable_tasks);
1270
1271 BUG_ON(rq->cpu != task_cpu(p));
1272 BUG_ON(task_current(rq, p));
1273 BUG_ON(p->rt.nr_cpus_allowed <= 1);
1274
1275 BUG_ON(!p->se.on_rq);
1276 BUG_ON(!rt_task(p));
1277
1278 return p;
1279}
1280
1094/* 1281/*
1095 * If the current CPU has more than one RT task, see if the non 1282 * If the current CPU has more than one RT task, see if the non
1096 * running task can migrate over to a CPU that is running a task 1283 * running task can migrate over to a CPU that is running a task
@@ -1100,13 +1287,11 @@ static int push_rt_task(struct rq *rq)
1100{ 1287{
1101 struct task_struct *next_task; 1288 struct task_struct *next_task;
1102 struct rq *lowest_rq; 1289 struct rq *lowest_rq;
1103 int ret = 0;
1104 int paranoid = RT_MAX_TRIES;
1105 1290
1106 if (!rq->rt.overloaded) 1291 if (!rq->rt.overloaded)
1107 return 0; 1292 return 0;
1108 1293
1109 next_task = pick_next_highest_task_rt(rq, -1); 1294 next_task = pick_next_pushable_task(rq);
1110 if (!next_task) 1295 if (!next_task)
1111 return 0; 1296 return 0;
1112 1297
@@ -1135,16 +1320,34 @@ static int push_rt_task(struct rq *rq)
1135 struct task_struct *task; 1320 struct task_struct *task;
1136 /* 1321 /*
1137 * find lock_lowest_rq releases rq->lock 1322 * find lock_lowest_rq releases rq->lock
1138 * so it is possible that next_task has changed. 1323 * so it is possible that next_task has migrated.
1139 * If it has, then try again. 1324 *
1325 * We need to make sure that the task is still on the same
1326 * run-queue and is also still the next task eligible for
1327 * pushing.
1140 */ 1328 */
1141 task = pick_next_highest_task_rt(rq, -1); 1329 task = pick_next_pushable_task(rq);
1142 if (unlikely(task != next_task) && task && paranoid--) { 1330 if (task_cpu(next_task) == rq->cpu && task == next_task) {
1143 put_task_struct(next_task); 1331 /*
1144 next_task = task; 1332 * If we get here, the task hasnt moved at all, but
1145 goto retry; 1333 * it has failed to push. We will not try again,
1334 * since the other cpus will pull from us when they
1335 * are ready.
1336 */
1337 dequeue_pushable_task(rq, next_task);
1338 goto out;
1146 } 1339 }
1147 goto out; 1340
1341 if (!task)
1342 /* No more tasks, just exit */
1343 goto out;
1344
1345 /*
1346 * Something has shifted, try again.
1347 */
1348 put_task_struct(next_task);
1349 next_task = task;
1350 goto retry;
1148 } 1351 }
1149 1352
1150 deactivate_task(rq, next_task, 0); 1353 deactivate_task(rq, next_task, 0);
@@ -1155,23 +1358,12 @@ static int push_rt_task(struct rq *rq)
1155 1358
1156 double_unlock_balance(rq, lowest_rq); 1359 double_unlock_balance(rq, lowest_rq);
1157 1360
1158 ret = 1;
1159out: 1361out:
1160 put_task_struct(next_task); 1362 put_task_struct(next_task);
1161 1363
1162 return ret; 1364 return 1;
1163} 1365}
1164 1366
1165/*
1166 * TODO: Currently we just use the second highest prio task on
1167 * the queue, and stop when it can't migrate (or there's
1168 * no more RT tasks). There may be a case where a lower
1169 * priority RT task has a different affinity than the
1170 * higher RT task. In this case the lower RT task could
1171 * possibly be able to migrate where as the higher priority
1172 * RT task could not. We currently ignore this issue.
1173 * Enhancements are welcome!
1174 */
1175static void push_rt_tasks(struct rq *rq) 1367static void push_rt_tasks(struct rq *rq)
1176{ 1368{
1177 /* push_rt_task will return true if it moved an RT */ 1369 /* push_rt_task will return true if it moved an RT */
@@ -1182,33 +1374,35 @@ static void push_rt_tasks(struct rq *rq)
1182static int pull_rt_task(struct rq *this_rq) 1374static int pull_rt_task(struct rq *this_rq)
1183{ 1375{
1184 int this_cpu = this_rq->cpu, ret = 0, cpu; 1376 int this_cpu = this_rq->cpu, ret = 0, cpu;
1185 struct task_struct *p, *next; 1377 struct task_struct *p;
1186 struct rq *src_rq; 1378 struct rq *src_rq;
1187 1379
1188 if (likely(!rt_overloaded(this_rq))) 1380 if (likely(!rt_overloaded(this_rq)))
1189 return 0; 1381 return 0;
1190 1382
1191 next = pick_next_task_rt(this_rq);
1192
1193 for_each_cpu(cpu, this_rq->rd->rto_mask) { 1383 for_each_cpu(cpu, this_rq->rd->rto_mask) {
1194 if (this_cpu == cpu) 1384 if (this_cpu == cpu)
1195 continue; 1385 continue;
1196 1386
1197 src_rq = cpu_rq(cpu); 1387 src_rq = cpu_rq(cpu);
1388
1389 /*
1390 * Don't bother taking the src_rq->lock if the next highest
1391 * task is known to be lower-priority than our current task.
1392 * This may look racy, but if this value is about to go
1393 * logically higher, the src_rq will push this task away.
1394 * And if it's going logically lower, we do not care
1395 */
1396 if (src_rq->rt.highest_prio.next >=
1397 this_rq->rt.highest_prio.curr)
1398 continue;
1399
1198 /* 1400 /*
1199 * We can potentially drop this_rq's lock in 1401 * We can potentially drop this_rq's lock in
1200 * double_lock_balance, and another CPU could 1402 * double_lock_balance, and another CPU could
1201 * steal our next task - hence we must cause 1403 * alter this_rq
1202 * the caller to recalculate the next task
1203 * in that case:
1204 */ 1404 */
1205 if (double_lock_balance(this_rq, src_rq)) { 1405 double_lock_balance(this_rq, src_rq);
1206 struct task_struct *old_next = next;
1207
1208 next = pick_next_task_rt(this_rq);
1209 if (next != old_next)
1210 ret = 1;
1211 }
1212 1406
1213 /* 1407 /*
1214 * Are there still pullable RT tasks? 1408 * Are there still pullable RT tasks?
@@ -1222,7 +1416,7 @@ static int pull_rt_task(struct rq *this_rq)
1222 * Do we have an RT task that preempts 1416 * Do we have an RT task that preempts
1223 * the to-be-scheduled task? 1417 * the to-be-scheduled task?
1224 */ 1418 */
1225 if (p && (!next || (p->prio < next->prio))) { 1419 if (p && (p->prio < this_rq->rt.highest_prio.curr)) {
1226 WARN_ON(p == src_rq->curr); 1420 WARN_ON(p == src_rq->curr);
1227 WARN_ON(!p->se.on_rq); 1421 WARN_ON(!p->se.on_rq);
1228 1422
@@ -1232,12 +1426,9 @@ static int pull_rt_task(struct rq *this_rq)
1232 * This is just that p is wakeing up and hasn't 1426 * This is just that p is wakeing up and hasn't
1233 * had a chance to schedule. We only pull 1427 * had a chance to schedule. We only pull
1234 * p if it is lower in priority than the 1428 * p if it is lower in priority than the
1235 * current task on the run queue or 1429 * current task on the run queue
1236 * this_rq next task is lower in prio than
1237 * the current task on that rq.
1238 */ 1430 */
1239 if (p->prio < src_rq->curr->prio || 1431 if (p->prio < src_rq->curr->prio)
1240 (next && next->prio < src_rq->curr->prio))
1241 goto skip; 1432 goto skip;
1242 1433
1243 ret = 1; 1434 ret = 1;
@@ -1250,13 +1441,7 @@ static int pull_rt_task(struct rq *this_rq)
1250 * case there's an even higher prio task 1441 * case there's an even higher prio task
1251 * in another runqueue. (low likelihood 1442 * in another runqueue. (low likelihood
1252 * but possible) 1443 * but possible)
1253 *
1254 * Update next so that we won't pick a task
1255 * on another cpu with a priority lower (or equal)
1256 * than the one we just picked.
1257 */ 1444 */
1258 next = p;
1259
1260 } 1445 }
1261 skip: 1446 skip:
1262 double_unlock_balance(this_rq, src_rq); 1447 double_unlock_balance(this_rq, src_rq);
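
The skipped lock earlier in this hunk is the payoff of tracking highest_prio.next: since the source rq's best task is running there, only its second-best is pullable, and if even that one is no better than what this rq already has, taking src_rq->lock is wasted work. Restated as a predicate (lower numeric prio = higher priority; a restatement, not kernel code):

    /* Mirrors the continue-condition in pull_rt_task() above. */
    static int worth_locking(int src_next_prio, int this_curr_prio)
    {
            return src_next_prio < this_curr_prio;
    }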
@@ -1268,24 +1453,27 @@ static int pull_rt_task(struct rq *this_rq)
1268static void pre_schedule_rt(struct rq *rq, struct task_struct *prev) 1453static void pre_schedule_rt(struct rq *rq, struct task_struct *prev)
1269{ 1454{
1270 /* Try to pull RT tasks here if we lower this rq's prio */ 1455 /* Try to pull RT tasks here if we lower this rq's prio */
1271 if (unlikely(rt_task(prev)) && rq->rt.highest_prio > prev->prio) 1456 if (unlikely(rt_task(prev)) && rq->rt.highest_prio.curr > prev->prio)
1272 pull_rt_task(rq); 1457 pull_rt_task(rq);
1273} 1458}
1274 1459
1460/*
1461 * assumes rq->lock is held
1462 */
1463static int needs_post_schedule_rt(struct rq *rq)
1464{
1465 return has_pushable_tasks(rq);
1466}
1467
1275static void post_schedule_rt(struct rq *rq) 1468static void post_schedule_rt(struct rq *rq)
1276{ 1469{
1277 /* 1470 /*
1278 * If we have more than one rt_task queued, then 1471 * This is only called if needs_post_schedule_rt() indicates that
1279 * see if we can push the other rt_tasks off to other CPUS. 1472 * we need to push tasks away
1280 * Note we may release the rq lock, and since
1281 * the lock was owned by prev, we need to release it
1282 * first via finish_lock_switch and then reaquire it here.
1283 * first via finish_lock_switch and then reacquire it here. 1473 */
1284 if (unlikely(rq->rt.overloaded)) { 1474 spin_lock_irq(&rq->lock);
1285 spin_lock_irq(&rq->lock); 1475 push_rt_tasks(rq);
1286 push_rt_tasks(rq); 1476 spin_unlock_irq(&rq->lock);
1287 spin_unlock_irq(&rq->lock);
1288 }
1289} 1477}
1290 1478
1291/* 1479/*
@@ -1296,7 +1484,8 @@ static void task_wake_up_rt(struct rq *rq, struct task_struct *p)
1296{ 1484{
1297 if (!task_running(rq, p) && 1485 if (!task_running(rq, p) &&
1298 !test_tsk_need_resched(rq->curr) && 1486 !test_tsk_need_resched(rq->curr) &&
1299 rq->rt.overloaded) 1487 has_pushable_tasks(rq) &&
1488 p->rt.nr_cpus_allowed > 1)
1300 push_rt_tasks(rq); 1489 push_rt_tasks(rq);
1301} 1490}
1302 1491
@@ -1332,6 +1521,24 @@ static void set_cpus_allowed_rt(struct task_struct *p,
1332 if (p->se.on_rq && (weight != p->rt.nr_cpus_allowed)) { 1521 if (p->se.on_rq && (weight != p->rt.nr_cpus_allowed)) {
1333 struct rq *rq = task_rq(p); 1522 struct rq *rq = task_rq(p);
1334 1523
1524 if (!task_current(rq, p)) {
1525 /*
1526 * Make sure we dequeue this task from the pushable list
1527 * before going further. It will either remain off of
1528 * the list because we are no longer pushable, or it
1529 * will be requeued.
1530 */
1531 if (p->rt.nr_cpus_allowed > 1)
1532 dequeue_pushable_task(rq, p);
1533
1534 /*
1535 * Requeue if our weight is changing and still > 1
1536 */
1537 if (weight > 1)
1538 enqueue_pushable_task(rq, p);
1539
1540 }
1541
1335 if ((p->rt.nr_cpus_allowed <= 1) && (weight > 1)) { 1542 if ((p->rt.nr_cpus_allowed <= 1) && (weight > 1)) {
1336 rq->rt.rt_nr_migratory++; 1543 rq->rt.rt_nr_migratory++;
1337 } else if ((p->rt.nr_cpus_allowed > 1) && (weight <= 1)) { 1544 } else if ((p->rt.nr_cpus_allowed > 1) && (weight <= 1)) {
@@ -1339,7 +1546,7 @@ static void set_cpus_allowed_rt(struct task_struct *p,
1339 rq->rt.rt_nr_migratory--; 1546 rq->rt.rt_nr_migratory--;
1340 } 1547 }
1341 1548
1342 update_rt_migration(rq); 1549 update_rt_migration(&rq->rt);
1343 } 1550 }
1344 1551
1345 cpumask_copy(&p->cpus_allowed, new_mask); 1552 cpumask_copy(&p->cpus_allowed, new_mask);
@@ -1354,7 +1561,7 @@ static void rq_online_rt(struct rq *rq)
1354 1561
1355 __enable_runtime(rq); 1562 __enable_runtime(rq);
1356 1563
1357 cpupri_set(&rq->rd->cpupri, rq->cpu, rq->rt.highest_prio); 1564 cpupri_set(&rq->rd->cpupri, rq->cpu, rq->rt.highest_prio.curr);
1358} 1565}
1359 1566
1360/* Assumes rq->lock is held */ 1567/* Assumes rq->lock is held */
@@ -1446,7 +1653,7 @@ static void prio_changed_rt(struct rq *rq, struct task_struct *p,
1446 * can release the rq lock and p could migrate. 1653 * can release the rq lock and p could migrate.
1447 * Only reschedule if p is still on the same runqueue. 1654 * Only reschedule if p is still on the same runqueue.
1448 */ 1655 */
1449 if (p->prio > rq->rt.highest_prio && rq->curr == p) 1656 if (p->prio > rq->rt.highest_prio.curr && rq->curr == p)
1450 resched_task(p); 1657 resched_task(p);
1451#else 1658#else
1452 /* For UP simply resched on drop of prio */ 1659 /* For UP simply resched on drop of prio */
@@ -1517,6 +1724,9 @@ static void set_curr_task_rt(struct rq *rq)
1517 struct task_struct *p = rq->curr; 1724 struct task_struct *p = rq->curr;
1518 1725
1519 p->se.exec_start = rq->clock; 1726 p->se.exec_start = rq->clock;
1727
1728 /* The running task is never eligible for pushing */
1729 dequeue_pushable_task(rq, p);
1520} 1730}
1521 1731
1522static const struct sched_class rt_sched_class = { 1732static const struct sched_class rt_sched_class = {
@@ -1539,6 +1749,7 @@ static const struct sched_class rt_sched_class = {
1539 .rq_online = rq_online_rt, 1749 .rq_online = rq_online_rt,
1540 .rq_offline = rq_offline_rt, 1750 .rq_offline = rq_offline_rt,
1541 .pre_schedule = pre_schedule_rt, 1751 .pre_schedule = pre_schedule_rt,
1752 .needs_post_schedule = needs_post_schedule_rt,
1542 .post_schedule = post_schedule_rt, 1753 .post_schedule = post_schedule_rt,
1543 .task_wake_up = task_wake_up_rt, 1754 .task_wake_up = task_wake_up_rt,
1544 .switched_from = switched_from_rt, 1755 .switched_from = switched_from_rt,
diff --git a/kernel/sched_stats.h b/kernel/sched_stats.h
index a8f93dd374e1..32d2bd4061b0 100644
--- a/kernel/sched_stats.h
+++ b/kernel/sched_stats.h
@@ -4,7 +4,7 @@
4 * bump this up when changing the output format or the meaning of an existing 4 * bump this up when changing the output format or the meaning of an existing
5 * format, so that tools can adapt (or abort) 5 * format, so that tools can adapt (or abort)
6 */ 6 */
7#define SCHEDSTAT_VERSION 14 7#define SCHEDSTAT_VERSION 15
8 8
9static int show_schedstat(struct seq_file *seq, void *v) 9static int show_schedstat(struct seq_file *seq, void *v)
10{ 10{
@@ -26,9 +26,8 @@ static int show_schedstat(struct seq_file *seq, void *v)
26 26
27 /* runqueue-specific stats */ 27 /* runqueue-specific stats */
28 seq_printf(seq, 28 seq_printf(seq,
29 "cpu%d %u %u %u %u %u %u %u %u %u %llu %llu %lu", 29 "cpu%d %u %u %u %u %u %u %llu %llu %lu",
30 cpu, rq->yld_both_empty, 30 cpu, rq->yld_count,
31 rq->yld_act_empty, rq->yld_exp_empty, rq->yld_count,
32 rq->sched_switch, rq->sched_count, rq->sched_goidle, 31 rq->sched_switch, rq->sched_count, rq->sched_goidle,
33 rq->ttwu_count, rq->ttwu_local, 32 rq->ttwu_count, rq->ttwu_local,
34 rq->rq_cpu_time, 33 rq->rq_cpu_time,
diff --git a/kernel/signal.c b/kernel/signal.c
index 2a74fe87c0dd..d8034737db4c 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -55,10 +55,22 @@ static int sig_handler_ignored(void __user *handler, int sig)
55 (handler == SIG_DFL && sig_kernel_ignore(sig)); 55 (handler == SIG_DFL && sig_kernel_ignore(sig));
56} 56}
57 57
58static int sig_ignored(struct task_struct *t, int sig) 58static int sig_task_ignored(struct task_struct *t, int sig,
59 int from_ancestor_ns)
59{ 60{
60 void __user *handler; 61 void __user *handler;
61 62
63 handler = sig_handler(t, sig);
64
65 if (unlikely(t->signal->flags & SIGNAL_UNKILLABLE) &&
66 handler == SIG_DFL && !from_ancestor_ns)
67 return 1;
68
69 return sig_handler_ignored(handler, sig);
70}
71
72static int sig_ignored(struct task_struct *t, int sig, int from_ancestor_ns)
73{
62 /* 74 /*
63 * Blocked signals are never ignored, since the 75 * Blocked signals are never ignored, since the
64 * signal handler may change by the time it is 76 * signal handler may change by the time it is
@@ -67,14 +79,13 @@ static int sig_ignored(struct task_struct *t, int sig)
67 if (sigismember(&t->blocked, sig) || sigismember(&t->real_blocked, sig)) 79 if (sigismember(&t->blocked, sig) || sigismember(&t->real_blocked, sig))
68 return 0; 80 return 0;
69 81
70 handler = sig_handler(t, sig); 82 if (!sig_task_ignored(t, sig, from_ancestor_ns))
71 if (!sig_handler_ignored(handler, sig))
72 return 0; 83 return 0;
73 84
74 /* 85 /*
75 * Tracers may want to know about even ignored signals. 86 * Tracers may want to know about even ignored signals.
76 */ 87 */
77 return !tracehook_consider_ignored_signal(t, sig, handler); 88 return !tracehook_consider_ignored_signal(t, sig);
78} 89}
79 90
80/* 91/*
@@ -318,7 +329,7 @@ int unhandled_signal(struct task_struct *tsk, int sig)
318 return 1; 329 return 1;
319 if (handler != SIG_IGN && handler != SIG_DFL) 330 if (handler != SIG_IGN && handler != SIG_DFL)
320 return 0; 331 return 0;
321 return !tracehook_consider_fatal_signal(tsk, sig, handler); 332 return !tracehook_consider_fatal_signal(tsk, sig);
322} 333}
323 334
324 335
@@ -624,7 +635,7 @@ static int check_kill_permission(int sig, struct siginfo *info,
624 * Returns true if the signal should be actually delivered, otherwise 635 * Returns true if the signal should be actually delivered, otherwise
625 * it should be dropped. 636 * it should be dropped.
626 */ 637 */
627static int prepare_signal(int sig, struct task_struct *p) 638static int prepare_signal(int sig, struct task_struct *p, int from_ancestor_ns)
628{ 639{
629 struct signal_struct *signal = p->signal; 640 struct signal_struct *signal = p->signal;
630 struct task_struct *t; 641 struct task_struct *t;
@@ -708,7 +719,7 @@ static int prepare_signal(int sig, struct task_struct *p)
708 } 719 }
709 } 720 }
710 721
711 return !sig_ignored(p, sig); 722 return !sig_ignored(p, sig, from_ancestor_ns);
712} 723}
713 724
714/* 725/*
@@ -777,7 +788,7 @@ static void complete_signal(int sig, struct task_struct *p, int group)
777 !(signal->flags & (SIGNAL_UNKILLABLE | SIGNAL_GROUP_EXIT)) && 788 !(signal->flags & (SIGNAL_UNKILLABLE | SIGNAL_GROUP_EXIT)) &&
778 !sigismember(&t->real_blocked, sig) && 789 !sigismember(&t->real_blocked, sig) &&
779 (sig == SIGKILL || 790 (sig == SIGKILL ||
780 !tracehook_consider_fatal_signal(t, sig, SIG_DFL))) { 791 !tracehook_consider_fatal_signal(t, sig))) {
781 /* 792 /*
782 * This signal will be fatal to the whole group. 793 * This signal will be fatal to the whole group.
783 */ 794 */
@@ -813,8 +824,8 @@ static inline int legacy_queue(struct sigpending *signals, int sig)
813 return (sig < SIGRTMIN) && sigismember(&signals->signal, sig); 824 return (sig < SIGRTMIN) && sigismember(&signals->signal, sig);
814} 825}
815 826
816static int send_signal(int sig, struct siginfo *info, struct task_struct *t, 827static int __send_signal(int sig, struct siginfo *info, struct task_struct *t,
817 int group) 828 int group, int from_ancestor_ns)
818{ 829{
819 struct sigpending *pending; 830 struct sigpending *pending;
820 struct sigqueue *q; 831 struct sigqueue *q;
@@ -822,7 +833,8 @@ static int send_signal(int sig, struct siginfo *info, struct task_struct *t,
822 trace_sched_signal_send(sig, t); 833 trace_sched_signal_send(sig, t);
823 834
824 assert_spin_locked(&t->sighand->siglock); 835 assert_spin_locked(&t->sighand->siglock);
825 if (!prepare_signal(sig, t)) 836
837 if (!prepare_signal(sig, t, from_ancestor_ns))
826 return 0; 838 return 0;
827 839
828 pending = group ? &t->signal->shared_pending : &t->pending; 840 pending = group ? &t->signal->shared_pending : &t->pending;
@@ -871,6 +883,8 @@ static int send_signal(int sig, struct siginfo *info, struct task_struct *t,
871 break; 883 break;
872 default: 884 default:
873 copy_siginfo(&q->info, info); 885 copy_siginfo(&q->info, info);
886 if (from_ancestor_ns)
887 q->info.si_pid = 0;
874 break; 888 break;
875 } 889 }
876 } else if (!is_si_special(info)) { 890 } else if (!is_si_special(info)) {
@@ -889,6 +903,20 @@ out_set:
889 return 0; 903 return 0;
890} 904}
891 905
906static int send_signal(int sig, struct siginfo *info, struct task_struct *t,
907 int group)
908{
909 int from_ancestor_ns = 0;
910
911#ifdef CONFIG_PID_NS
912 if (!is_si_special(info) && SI_FROMUSER(info) &&
913 task_pid_nr_ns(current, task_active_pid_ns(t)) <= 0)
914 from_ancestor_ns = 1;
915#endif
916
917 return __send_signal(sig, info, t, group, from_ancestor_ns);
918}
919
892int print_fatal_signals; 920int print_fatal_signals;
893 921
894static void print_fatal_signal(struct pt_regs *regs, int signr) 922static void print_fatal_signal(struct pt_regs *regs, int signr)
@@ -1133,7 +1161,7 @@ int kill_pid_info_as_uid(int sig, struct siginfo *info, struct pid *pid,
1133 if (sig && p->sighand) { 1161 if (sig && p->sighand) {
1134 unsigned long flags; 1162 unsigned long flags;
1135 spin_lock_irqsave(&p->sighand->siglock, flags); 1163 spin_lock_irqsave(&p->sighand->siglock, flags);
1136 ret = __group_send_sig_info(sig, info, p); 1164 ret = __send_signal(sig, info, p, 1, 0);
1137 spin_unlock_irqrestore(&p->sighand->siglock, flags); 1165 spin_unlock_irqrestore(&p->sighand->siglock, flags);
1138 } 1166 }
1139out_unlock: 1167out_unlock:
@@ -1320,7 +1348,7 @@ int send_sigqueue(struct sigqueue *q, struct task_struct *t, int group)
1320 goto ret; 1348 goto ret;
1321 1349
1322 ret = 1; /* the signal is ignored */ 1350 ret = 1; /* the signal is ignored */
1323 if (!prepare_signal(sig, t)) 1351 if (!prepare_signal(sig, t, 0))
1324 goto out; 1352 goto out;
1325 1353
1326 ret = 0; 1354 ret = 0;
@@ -1575,7 +1603,15 @@ static void ptrace_stop(int exit_code, int clear_code, siginfo_t *info)
1575 read_lock(&tasklist_lock); 1603 read_lock(&tasklist_lock);
1576 if (may_ptrace_stop()) { 1604 if (may_ptrace_stop()) {
1577 do_notify_parent_cldstop(current, CLD_TRAPPED); 1605 do_notify_parent_cldstop(current, CLD_TRAPPED);
1606 /*
1607 * Don't want to allow preemption here, because
1608 * sys_ptrace() needs this task to be inactive.
1609 *
1610 * XXX: implement read_unlock_no_resched().
1611 */
1612 preempt_disable();
1578 read_unlock(&tasklist_lock); 1613 read_unlock(&tasklist_lock);
1614 preempt_enable_no_resched();
1579 schedule(); 1615 schedule();
1580 } else { 1616 } else {
1581 /* 1617 /*
@@ -1836,9 +1872,16 @@ relock:
1836 1872
1837 /* 1873 /*
1838 * Global init gets no signals it doesn't want. 1874 * Global init gets no signals it doesn't want.
1875 * Container-init gets no signals it doesn't want from same
1876 * container.
1877 *
1878 * Note that if global/container-init sees a sig_kernel_only()
1879 * signal here, the signal must have been generated internally
1880 * or must have come from an ancestor namespace. In either
1881 * case, the signal cannot be dropped.
1839 */ 1882 */
1840 if (unlikely(signal->flags & SIGNAL_UNKILLABLE) && 1883 if (unlikely(signal->flags & SIGNAL_UNKILLABLE) &&
1841 !signal_group_exit(signal)) 1884 !sig_kernel_only(signr))
1842 continue; 1885 continue;
1843 1886
1844 if (sig_kernel_stop(signr)) { 1887 if (sig_kernel_stop(signr)) {
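The signal.c changes thread a from_ancestor_ns flag down from send_signal(): a sender that has no pid in the receiver's namespace (task_pid_nr_ns() <= 0) is in an ancestor namespace, its si_pid is zeroed on delivery, and SIGNAL_UNKILLABLE tasks (global and container init) only auto-ignore default-action signals that originate inside their own namespace. The ptrace_stop() hunk separately closes a preemption window between dropping tasklist_lock and schedule(). A user-space sketch of the si_pid behaviour, assuming root (CLONE_NEWPID needs CAP_SYS_ADMIN) and a kernel carrying this patch; the one-second sleep is a crude stand-in for real synchronisation:

#define _GNU_SOURCE
#include <sched.h>
#include <signal.h>
#include <stdio.h>
#include <string.h>
#include <sys/wait.h>
#include <unistd.h>

static void handler(int sig, siginfo_t *info, void *ctx)
{
	/* With the patch above, a sender in an ancestor pid namespace has
	 * no pid here, so the kernel reports si_pid == 0.
	 * (printf in a handler is fine for a demo, not for production.) */
	printf("child: got sig %d, si_pid=%d\n", sig, (int)info->si_pid);
}

static int child_fn(void *arg)
{
	struct sigaction sa;

	memset(&sa, 0, sizeof(sa));
	sa.sa_sigaction = handler;
	sa.sa_flags = SA_SIGINFO;
	sigaction(SIGUSR1, &sa, NULL);
	pause();		/* wait for the signal */
	return 0;
}

int main(void)
{
	static char stack[64 * 1024];
	pid_t pid;

	/* CLONE_NEWPID requires CAP_SYS_ADMIN; run as root. */
	pid = clone(child_fn, stack + sizeof(stack), CLONE_NEWPID | SIGCHLD, NULL);
	if (pid < 0) {
		perror("clone");
		return 1;
	}
	sleep(1);		/* crude: let the child install its handler */
	kill(pid, SIGUSR1);	/* we are in the ancestor namespace */
	waitpid(pid, NULL, 0);
	return 0;
}

The child should report si_pid=0, because the parent has no pid inside the child's namespace.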
diff --git a/kernel/slow-work.c b/kernel/slow-work.c
new file mode 100644
index 000000000000..cf2bc01186ef
--- /dev/null
+++ b/kernel/slow-work.c
@@ -0,0 +1,640 @@
1/* Worker thread pool for slow items, such as filesystem lookups or mkdirs
2 *
3 * Copyright (C) 2008 Red Hat, Inc. All Rights Reserved.
4 * Written by David Howells (dhowells@redhat.com)
5 *
6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public Licence
8 * as published by the Free Software Foundation; either version
9 * 2 of the Licence, or (at your option) any later version.
10 *
11 * See Documentation/slow-work.txt
12 */
13
14#include <linux/module.h>
15#include <linux/slow-work.h>
16#include <linux/kthread.h>
17#include <linux/freezer.h>
18#include <linux/wait.h>
19
20#define SLOW_WORK_CULL_TIMEOUT (5 * HZ) /* cull threads 5s after running out of
21 * things to do */
22#define SLOW_WORK_OOM_TIMEOUT (5 * HZ) /* can't start new threads for 5s after
23 * OOM */
24
25static void slow_work_cull_timeout(unsigned long);
26static void slow_work_oom_timeout(unsigned long);
27
28#ifdef CONFIG_SYSCTL
29static int slow_work_min_threads_sysctl(struct ctl_table *, int, struct file *,
30 void __user *, size_t *, loff_t *);
31
32static int slow_work_max_threads_sysctl(struct ctl_table *, int , struct file *,
33 void __user *, size_t *, loff_t *);
34#endif
35
36/*
37 * The pool of threads has at least min threads in it as long as someone is
38 * using the facility, and may have as many as max.
39 *
40 * A portion of the pool may be processing very slow operations.
41 */
42static unsigned slow_work_min_threads = 2;
43static unsigned slow_work_max_threads = 4;
44static unsigned vslow_work_proportion = 50; /* % of threads that may process
45 * very slow work */
46
47#ifdef CONFIG_SYSCTL
48static const int slow_work_min_min_threads = 2;
49static int slow_work_max_max_threads = 255;
50static const int slow_work_min_vslow = 1;
51static const int slow_work_max_vslow = 99;
52
53ctl_table slow_work_sysctls[] = {
54 {
55 .ctl_name = CTL_UNNUMBERED,
56 .procname = "min-threads",
57 .data = &slow_work_min_threads,
58 .maxlen = sizeof(unsigned),
59 .mode = 0644,
60 .proc_handler = slow_work_min_threads_sysctl,
61 .extra1 = (void *) &slow_work_min_min_threads,
62 .extra2 = &slow_work_max_threads,
63 },
64 {
65 .ctl_name = CTL_UNNUMBERED,
66 .procname = "max-threads",
67 .data = &slow_work_max_threads,
68 .maxlen = sizeof(unsigned),
69 .mode = 0644,
70 .proc_handler = slow_work_max_threads_sysctl,
71 .extra1 = &slow_work_min_threads,
72 .extra2 = (void *) &slow_work_max_max_threads,
73 },
74 {
75 .ctl_name = CTL_UNNUMBERED,
76 .procname = "vslow-percentage",
77 .data = &vslow_work_proportion,
78 .maxlen = sizeof(unsigned),
79 .mode = 0644,
80 .proc_handler = &proc_dointvec_minmax,
81 .extra1 = (void *) &slow_work_min_vslow,
82 .extra2 = (void *) &slow_work_max_vslow,
83 },
84 { .ctl_name = 0 }
85};
86#endif
87
88/*
89 * The active state of the thread pool
90 */
91static atomic_t slow_work_thread_count;
92static atomic_t vslow_work_executing_count;
93
94static bool slow_work_may_not_start_new_thread;
95static bool slow_work_cull; /* cull a thread due to lack of activity */
96static DEFINE_TIMER(slow_work_cull_timer, slow_work_cull_timeout, 0, 0);
97static DEFINE_TIMER(slow_work_oom_timer, slow_work_oom_timeout, 0, 0);
98static struct slow_work slow_work_new_thread; /* new thread starter */
99
100/*
101 * The queues of work items and the lock governing access to them. These are
102 * shared between all the CPUs. It doesn't make sense to have per-CPU queues
103 * as the number of threads bears no relation to the number of CPUs.
104 *
105 * There are two queues of work items: one for slow work items, and one for
106 * very slow work items.
107 */
108static LIST_HEAD(slow_work_queue);
109static LIST_HEAD(vslow_work_queue);
110static DEFINE_SPINLOCK(slow_work_queue_lock);
111
112/*
113 * The thread controls. A variable used to signal to the threads that they
114 * should exit when the queue is empty, a waitqueue used by the threads to wait
115 * for signals, and a completion set by the last thread to exit.
116 */
117static bool slow_work_threads_should_exit;
118static DECLARE_WAIT_QUEUE_HEAD(slow_work_thread_wq);
119static DECLARE_COMPLETION(slow_work_last_thread_exited);
120
121/*
122 * The number of users of the thread pool and its lock. Whilst this is zero we
123 * have no threads hanging around, and when this reaches zero, we wait for all
124 * active or queued work items to complete and kill all the threads we do have.
125 */
126static int slow_work_user_count;
127static DEFINE_MUTEX(slow_work_user_lock);
128
129/*
130 * Calculate the maximum number of active threads in the pool that are
131 * permitted to process very slow work items.
132 *
133 * The answer is rounded up to at least 1, but may not equal or exceed the
134 * maximum number of the threads in the pool. This means we always have at
135 * least one thread that can process slow work items, and we always have at
136 * least one thread that won't get tied up doing so.
137 */
138static unsigned slow_work_calc_vsmax(void)
139{
140 unsigned vsmax;
141
142 vsmax = atomic_read(&slow_work_thread_count) * vslow_work_proportion;
143 vsmax /= 100;
144 vsmax = max(vsmax, 1U);
145 return min(vsmax, slow_work_max_threads - 1);
146}
147
148/*
149 * Attempt to execute stuff queued on a slow thread. Return true if we managed
150 * it, false if there was nothing to do.
151 */
152static bool slow_work_execute(void)
153{
154 struct slow_work *work = NULL;
155 unsigned vsmax;
156 bool very_slow;
157
158 vsmax = slow_work_calc_vsmax();
159
160 /* see if we can schedule a new thread to be started if we're not
161 * keeping up with the work */
162 if (!waitqueue_active(&slow_work_thread_wq) &&
163 (!list_empty(&slow_work_queue) || !list_empty(&vslow_work_queue)) &&
164 atomic_read(&slow_work_thread_count) < slow_work_max_threads &&
165 !slow_work_may_not_start_new_thread)
166 slow_work_enqueue(&slow_work_new_thread);
167
168 /* find something to execute */
169 spin_lock_irq(&slow_work_queue_lock);
170 if (!list_empty(&vslow_work_queue) &&
171 atomic_read(&vslow_work_executing_count) < vsmax) {
172 work = list_entry(vslow_work_queue.next,
173 struct slow_work, link);
174 if (test_and_set_bit_lock(SLOW_WORK_EXECUTING, &work->flags))
175 BUG();
176 list_del_init(&work->link);
177 atomic_inc(&vslow_work_executing_count);
178 very_slow = true;
179 } else if (!list_empty(&slow_work_queue)) {
180 work = list_entry(slow_work_queue.next,
181 struct slow_work, link);
182 if (test_and_set_bit_lock(SLOW_WORK_EXECUTING, &work->flags))
183 BUG();
184 list_del_init(&work->link);
185 very_slow = false;
186 } else {
187 very_slow = false; /* avoid the compiler warning */
188 }
189 spin_unlock_irq(&slow_work_queue_lock);
190
191 if (!work)
192 return false;
193
194 if (!test_and_clear_bit(SLOW_WORK_PENDING, &work->flags))
195 BUG();
196
197 work->ops->execute(work);
198
199 if (very_slow)
200 atomic_dec(&vslow_work_executing_count);
201 clear_bit_unlock(SLOW_WORK_EXECUTING, &work->flags);
202
203 /* if someone tried to enqueue the item whilst we were executing it,
204 * then it'll be left unenqueued to avoid multiple threads trying to
205 * execute it simultaneously
206 *
207 * there is, however, a race between us testing the pending flag and
208 * getting the spinlock, and between the enqueuer setting the pending
209 * flag and getting the spinlock, so we use a deferral bit to tell us
210 * if the enqueuer got there first
211 */
212 if (test_bit(SLOW_WORK_PENDING, &work->flags)) {
213 spin_lock_irq(&slow_work_queue_lock);
214
215 if (!test_bit(SLOW_WORK_EXECUTING, &work->flags) &&
216 test_and_clear_bit(SLOW_WORK_ENQ_DEFERRED, &work->flags))
217 goto auto_requeue;
218
219 spin_unlock_irq(&slow_work_queue_lock);
220 }
221
222 work->ops->put_ref(work);
223 return true;
224
225auto_requeue:
226 /* we must complete the enqueue operation
227 * - we transfer our ref on the item back to the appropriate queue
228 * - don't wake another thread up as we're awake already
229 */
230 if (test_bit(SLOW_WORK_VERY_SLOW, &work->flags))
231 list_add_tail(&work->link, &vslow_work_queue);
232 else
233 list_add_tail(&work->link, &slow_work_queue);
234 spin_unlock_irq(&slow_work_queue_lock);
235 return true;
236}
237
238/**
239 * slow_work_enqueue - Schedule a slow work item for processing
240 * @work: The work item to queue
241 *
242 * Schedule a slow work item for processing. If the item is already undergoing
243 * execution, this guarantees not to re-enter the execution routine until the
244 * first execution finishes.
245 *
246 * The item is pinned by this function as it retains a reference to it, managed
247 * through the item operations. The item is unpinned once it has been
248 * executed.
249 *
250 * An item may hog the thread that is running it for a relatively large amount
251 * of time, sufficient, for example, to perform several lookup, mkdir, create
252 * and setxattr operations. It may sleep on I/O and may sleep to obtain locks.
253 *
254 * Conversely, if a number of items are awaiting processing, it may take some
255 * time before any given item is given attention. The number of threads in the
256 * pool may be increased to deal with demand, but only up to a limit.
257 *
258 * If SLOW_WORK_VERY_SLOW is set on the work item, then it will be placed in
259 * the very slow queue, from which only a portion of the threads will be
260 * allowed to pick items to execute. This ensures that very slow items won't
261 * overly block ones that are just ordinarily slow.
262 *
263 * Returns 0 if successful, -EAGAIN if not.
264 */
265int slow_work_enqueue(struct slow_work *work)
266{
267 unsigned long flags;
268
269 BUG_ON(slow_work_user_count <= 0);
270 BUG_ON(!work);
271 BUG_ON(!work->ops);
272 BUG_ON(!work->ops->get_ref);
273
274 /* when honouring an enqueue request, we only promise that we will run
275 * the work function in the future; we do not promise to run it once
276 * per enqueue request
277 *
278 * we use the PENDING bit to merge together repeat requests without
279 * having to disable IRQs and take the spinlock, whilst still
280 * maintaining our promise
281 */
282 if (!test_and_set_bit_lock(SLOW_WORK_PENDING, &work->flags)) {
283 spin_lock_irqsave(&slow_work_queue_lock, flags);
284
285 /* we promise that we will not attempt to execute the work
286 * function in more than one thread simultaneously
287 *
288 * this, however, leaves us with a problem if we're asked to
289 * enqueue the work whilst someone is executing the work
290 * function as simply queueing the work immediately means that
291 * another thread may try executing it whilst it is already
292 * under execution
293 *
294 * to deal with this, we set the ENQ_DEFERRED bit instead of
295 * enqueueing, and the thread currently executing the work
296 * function will enqueue the work item when the work function
297 * returns and it has cleared the EXECUTING bit
298 */
299 if (test_bit(SLOW_WORK_EXECUTING, &work->flags)) {
300 set_bit(SLOW_WORK_ENQ_DEFERRED, &work->flags);
301 } else {
302 if (work->ops->get_ref(work) < 0)
303 goto cant_get_ref;
304 if (test_bit(SLOW_WORK_VERY_SLOW, &work->flags))
305 list_add_tail(&work->link, &vslow_work_queue);
306 else
307 list_add_tail(&work->link, &slow_work_queue);
308 wake_up(&slow_work_thread_wq);
309 }
310
311 spin_unlock_irqrestore(&slow_work_queue_lock, flags);
312 }
313 return 0;
314
315cant_get_ref:
316 spin_unlock_irqrestore(&slow_work_queue_lock, flags);
317 return -EAGAIN;
318}
319EXPORT_SYMBOL(slow_work_enqueue);
320
321/*
322 * Worker thread culling algorithm
323 */
324static bool slow_work_cull_thread(void)
325{
326 unsigned long flags;
327 bool do_cull = false;
328
329 spin_lock_irqsave(&slow_work_queue_lock, flags);
330
331 if (slow_work_cull) {
332 slow_work_cull = false;
333
334 if (list_empty(&slow_work_queue) &&
335 list_empty(&vslow_work_queue) &&
336 atomic_read(&slow_work_thread_count) >
337 slow_work_min_threads) {
338 mod_timer(&slow_work_cull_timer,
339 jiffies + SLOW_WORK_CULL_TIMEOUT);
340 do_cull = true;
341 }
342 }
343
344 spin_unlock_irqrestore(&slow_work_queue_lock, flags);
345 return do_cull;
346}
347
348/*
349 * Determine if there is slow work available for dispatch
350 */
351static inline bool slow_work_available(int vsmax)
352{
353 return !list_empty(&slow_work_queue) ||
354 (!list_empty(&vslow_work_queue) &&
355 atomic_read(&vslow_work_executing_count) < vsmax);
356}
357
358/*
359 * Worker thread dispatcher
360 */
361static int slow_work_thread(void *_data)
362{
363 int vsmax;
364
365 DEFINE_WAIT(wait);
366
367 set_freezable();
368 set_user_nice(current, -5);
369
370 for (;;) {
371 vsmax = vslow_work_proportion;
372 vsmax *= atomic_read(&slow_work_thread_count);
373 vsmax /= 100;
374
375 prepare_to_wait(&slow_work_thread_wq, &wait,
376 TASK_INTERRUPTIBLE);
377 if (!freezing(current) &&
378 !slow_work_threads_should_exit &&
379 !slow_work_available(vsmax) &&
380 !slow_work_cull)
381 schedule();
382 finish_wait(&slow_work_thread_wq, &wait);
383
384 try_to_freeze();
385
386 vsmax = vslow_work_proportion;
387 vsmax *= atomic_read(&slow_work_thread_count);
388 vsmax /= 100;
389
390 if (slow_work_available(vsmax) && slow_work_execute()) {
391 cond_resched();
392 if (list_empty(&slow_work_queue) &&
393 list_empty(&vslow_work_queue) &&
394 atomic_read(&slow_work_thread_count) >
395 slow_work_min_threads)
396 mod_timer(&slow_work_cull_timer,
397 jiffies + SLOW_WORK_CULL_TIMEOUT);
398 continue;
399 }
400
401 if (slow_work_threads_should_exit)
402 break;
403
404 if (slow_work_cull && slow_work_cull_thread())
405 break;
406 }
407
408 if (atomic_dec_and_test(&slow_work_thread_count))
409 complete_and_exit(&slow_work_last_thread_exited, 0);
410 return 0;
411}
412
413/*
414 * Handle thread cull timer expiration
415 */
416static void slow_work_cull_timeout(unsigned long data)
417{
418 slow_work_cull = true;
419 wake_up(&slow_work_thread_wq);
420}
421
422/*
423 * Get a reference on slow work thread starter
424 */
425static int slow_work_new_thread_get_ref(struct slow_work *work)
426{
427 return 0;
428}
429
430/*
431 * Drop a reference on slow work thread starter
432 */
433static void slow_work_new_thread_put_ref(struct slow_work *work)
434{
435}
436
437/*
438 * Start a new slow work thread
439 */
440static void slow_work_new_thread_execute(struct slow_work *work)
441{
442 struct task_struct *p;
443
444 if (slow_work_threads_should_exit)
445 return;
446
447 if (atomic_read(&slow_work_thread_count) >= slow_work_max_threads)
448 return;
449
450 if (!mutex_trylock(&slow_work_user_lock))
451 return;
452
453 slow_work_may_not_start_new_thread = true;
454 atomic_inc(&slow_work_thread_count);
455 p = kthread_run(slow_work_thread, NULL, "kslowd");
456 if (IS_ERR(p)) {
457 printk(KERN_DEBUG "Slow work thread pool: OOM\n");
458 if (atomic_dec_and_test(&slow_work_thread_count))
459 BUG(); /* we're running on a slow work thread... */
460 mod_timer(&slow_work_oom_timer,
461 jiffies + SLOW_WORK_OOM_TIMEOUT);
462 } else {
463 /* ratelimit the starting of new threads */
464 mod_timer(&slow_work_oom_timer, jiffies + 1);
465 }
466
467 mutex_unlock(&slow_work_user_lock);
468}
469
470static const struct slow_work_ops slow_work_new_thread_ops = {
471 .get_ref = slow_work_new_thread_get_ref,
472 .put_ref = slow_work_new_thread_put_ref,
473 .execute = slow_work_new_thread_execute,
474};
475
476/*
477 * post-OOM new thread start suppression expiration
478 */
479static void slow_work_oom_timeout(unsigned long data)
480{
481 slow_work_may_not_start_new_thread = false;
482}
483
484#ifdef CONFIG_SYSCTL
485/*
486 * Handle adjustment of the minimum number of threads
487 */
488static int slow_work_min_threads_sysctl(struct ctl_table *table, int write,
489 struct file *filp, void __user *buffer,
490 size_t *lenp, loff_t *ppos)
491{
492 int ret = proc_dointvec_minmax(table, write, filp, buffer, lenp, ppos);
493 int n;
494
495 if (ret == 0) {
496 mutex_lock(&slow_work_user_lock);
497 if (slow_work_user_count > 0) {
498 /* see if we need to start or stop threads */
499 n = atomic_read(&slow_work_thread_count) -
500 slow_work_min_threads;
501
502 if (n < 0 && !slow_work_may_not_start_new_thread)
503 slow_work_enqueue(&slow_work_new_thread);
504 else if (n > 0)
505 mod_timer(&slow_work_cull_timer,
506 jiffies + SLOW_WORK_CULL_TIMEOUT);
507 }
508 mutex_unlock(&slow_work_user_lock);
509 }
510
511 return ret;
512}
513
514/*
515 * Handle adjustment of the maximum number of threads
516 */
517static int slow_work_max_threads_sysctl(struct ctl_table *table, int write,
518 struct file *filp, void __user *buffer,
519 size_t *lenp, loff_t *ppos)
520{
521 int ret = proc_dointvec_minmax(table, write, filp, buffer, lenp, ppos);
522 int n;
523
524 if (ret == 0) {
525 mutex_lock(&slow_work_user_lock);
526 if (slow_work_user_count > 0) {
527 /* see if we need to stop threads */
528 n = slow_work_max_threads -
529 atomic_read(&slow_work_thread_count);
530
531 if (n < 0)
532 mod_timer(&slow_work_cull_timer,
533 jiffies + SLOW_WORK_CULL_TIMEOUT);
534 }
535 mutex_unlock(&slow_work_user_lock);
536 }
537
538 return ret;
539}
540#endif /* CONFIG_SYSCTL */
541
542/**
543 * slow_work_register_user - Register a user of the facility
544 *
545 * Register a user of the facility, starting up the initial threads if there
546 * aren't any other users at this point. This will return 0 if successful, or
547 * an error if not.
548 */
549int slow_work_register_user(void)
550{
551 struct task_struct *p;
552 int loop;
553
554 mutex_lock(&slow_work_user_lock);
555
556 if (slow_work_user_count == 0) {
557 printk(KERN_NOTICE "Slow work thread pool: Starting up\n");
558 init_completion(&slow_work_last_thread_exited);
559
560 slow_work_threads_should_exit = false;
561 slow_work_init(&slow_work_new_thread,
562 &slow_work_new_thread_ops);
563 slow_work_may_not_start_new_thread = false;
564 slow_work_cull = false;
565
566 /* start the minimum number of threads */
567 for (loop = 0; loop < slow_work_min_threads; loop++) {
568 atomic_inc(&slow_work_thread_count);
569 p = kthread_run(slow_work_thread, NULL, "kslowd");
570 if (IS_ERR(p))
571 goto error;
572 }
573 printk(KERN_NOTICE "Slow work thread pool: Ready\n");
574 }
575
576 slow_work_user_count++;
577 mutex_unlock(&slow_work_user_lock);
578 return 0;
579
580error:
581 if (atomic_dec_and_test(&slow_work_thread_count))
582 complete(&slow_work_last_thread_exited);
583 if (loop > 0) {
584 printk(KERN_ERR "Slow work thread pool:"
585 " Aborting startup on ENOMEM\n");
586 slow_work_threads_should_exit = true;
587 wake_up_all(&slow_work_thread_wq);
588 wait_for_completion(&slow_work_last_thread_exited);
589 printk(KERN_ERR "Slow work thread pool: Aborted\n");
590 }
591 mutex_unlock(&slow_work_user_lock);
592 return PTR_ERR(p);
593}
594EXPORT_SYMBOL(slow_work_register_user);
595
596/**
597 * slow_work_unregister_user - Unregister a user of the facility
598 *
599 * Unregister a user of the facility, killing all the threads if this was the
600 * last one.
601 */
602void slow_work_unregister_user(void)
603{
604 mutex_lock(&slow_work_user_lock);
605
606 BUG_ON(slow_work_user_count <= 0);
607
608 slow_work_user_count--;
609 if (slow_work_user_count == 0) {
610 printk(KERN_NOTICE "Slow work thread pool: Shutting down\n");
611 slow_work_threads_should_exit = true;
612 wake_up_all(&slow_work_thread_wq);
613 wait_for_completion(&slow_work_last_thread_exited);
614 printk(KERN_NOTICE "Slow work thread pool:"
615 " Shut down complete\n");
616 }
617
618 del_timer_sync(&slow_work_cull_timer);
619
620 mutex_unlock(&slow_work_user_lock);
621}
622EXPORT_SYMBOL(slow_work_unregister_user);
623
624/*
625 * Initialise the slow work facility
626 */
627static int __init init_slow_work(void)
628{
629 unsigned nr_cpus = num_possible_cpus();
630
631 if (slow_work_max_threads < nr_cpus)
632 slow_work_max_threads = nr_cpus;
633#ifdef CONFIG_SYSCTL
634 if (slow_work_max_max_threads < nr_cpus * 2)
635 slow_work_max_max_threads = nr_cpus * 2;
636#endif
637 return 0;
638}
639
640subsys_initcall(init_slow_work);
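The core of slow-work is the three-bit handshake on work->flags: PENDING merges repeated enqueues, EXECUTING keeps execution non-reentrant, and ENQ_DEFERRED records an enqueue that raced with execution so the executing thread requeues the item itself. The pool also clamps the very-slow share via slow_work_calc_vsmax() to [1, max_threads - 1], so at least one thread can always take very slow items and at least one never does. A pthread analogue of the handshake, a minimal sketch with a single item and a single worker; the names are mine, not the kernel's:

/* User-space analogue of the slow-work enqueue/execute handshake above.
 * PENDING merges repeat enqueues, EXECUTING keeps the item non-reentrant,
 * and ENQ_DEFERRED marks an enqueue that raced with execution so the
 * worker requeues the item itself.  Single item, single worker, for brevity.
 */
#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

enum { W_PENDING = 1, W_EXECUTING = 2, W_ENQ_DEFERRED = 4 };

static pthread_mutex_t qlock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t qcond = PTHREAD_COND_INITIALIZER;
static int flags;
static bool queued;

static void work_enqueue(void)
{
	pthread_mutex_lock(&qlock);
	if (!(flags & W_PENDING)) {
		flags |= W_PENDING;
		if (flags & W_EXECUTING) {
			flags |= W_ENQ_DEFERRED;	/* worker will requeue */
		} else {
			queued = true;
			pthread_cond_signal(&qcond);
		}
	}
	pthread_mutex_unlock(&qlock);
}

static void *worker(void *arg)
{
	for (int i = 0; i < 2; i++) {
		pthread_mutex_lock(&qlock);
		while (!queued)
			pthread_cond_wait(&qcond, &qlock);
		queued = false;
		flags |= W_EXECUTING;
		flags &= ~W_PENDING;	/* new enqueues may set it again */
		pthread_mutex_unlock(&qlock);

		printf("executing work item (pass %d)\n", i + 1);
		if (i == 0)
			work_enqueue();	/* race: enqueue during execution */

		pthread_mutex_lock(&qlock);
		flags &= ~W_EXECUTING;
		if (flags & W_ENQ_DEFERRED) {	/* complete the deferred enqueue */
			flags &= ~W_ENQ_DEFERRED;
			queued = true;
		}
		pthread_mutex_unlock(&qlock);
	}
	return NULL;
}

int main(void)
{
	pthread_t t;

	pthread_create(&t, NULL, worker, NULL);
	work_enqueue();
	pthread_join(t, NULL);
	return 0;
}

The enqueue made during execution is not run reentrantly; the worker completes it on a second pass, mirroring the auto_requeue path above.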
diff --git a/kernel/smp.c b/kernel/smp.c
index bbedbb7efe32..858baac568ee 100644
--- a/kernel/smp.c
+++ b/kernel/smp.c
@@ -2,40 +2,82 @@
2 * Generic helpers for smp ipi calls 2 * Generic helpers for smp ipi calls
3 * 3 *
4 * (C) Jens Axboe <jens.axboe@oracle.com> 2008 4 * (C) Jens Axboe <jens.axboe@oracle.com> 2008
5 *
6 */ 5 */
7#include <linux/init.h>
8#include <linux/module.h>
9#include <linux/percpu.h>
10#include <linux/rcupdate.h> 6#include <linux/rcupdate.h>
11#include <linux/rculist.h> 7#include <linux/rculist.h>
8#include <linux/kernel.h>
9#include <linux/module.h>
10#include <linux/percpu.h>
11#include <linux/init.h>
12#include <linux/smp.h> 12#include <linux/smp.h>
13#include <linux/cpu.h>
13 14
14static DEFINE_PER_CPU(struct call_single_queue, call_single_queue); 15static DEFINE_PER_CPU(struct call_single_queue, call_single_queue);
15static LIST_HEAD(call_function_queue); 16
16__cacheline_aligned_in_smp DEFINE_SPINLOCK(call_function_lock); 17static struct {
18 struct list_head queue;
19 spinlock_t lock;
20} call_function __cacheline_aligned_in_smp =
21 {
22 .queue = LIST_HEAD_INIT(call_function.queue),
23 .lock = __SPIN_LOCK_UNLOCKED(call_function.lock),
24 };
17 25
18enum { 26enum {
19 CSD_FLAG_WAIT = 0x01, 27 CSD_FLAG_LOCK = 0x01,
20 CSD_FLAG_ALLOC = 0x02,
21 CSD_FLAG_LOCK = 0x04,
22}; 28};
23 29
24struct call_function_data { 30struct call_function_data {
25 struct call_single_data csd; 31 struct call_single_data csd;
26 spinlock_t lock; 32 spinlock_t lock;
27 unsigned int refs; 33 unsigned int refs;
28 struct rcu_head rcu_head; 34 cpumask_var_t cpumask;
29 unsigned long cpumask_bits[];
30}; 35};
31 36
32struct call_single_queue { 37struct call_single_queue {
33 struct list_head list; 38 struct list_head list;
34 spinlock_t lock; 39 spinlock_t lock;
40};
41
42static DEFINE_PER_CPU(struct call_function_data, cfd_data) = {
43 .lock = __SPIN_LOCK_UNLOCKED(cfd_data.lock),
44};
45
46static int
47hotplug_cfd(struct notifier_block *nfb, unsigned long action, void *hcpu)
48{
49 long cpu = (long)hcpu;
50 struct call_function_data *cfd = &per_cpu(cfd_data, cpu);
51
52 switch (action) {
53 case CPU_UP_PREPARE:
54 case CPU_UP_PREPARE_FROZEN:
55 if (!alloc_cpumask_var_node(&cfd->cpumask, GFP_KERNEL,
56 cpu_to_node(cpu)))
57 return NOTIFY_BAD;
58 break;
59
 60#ifdef CONFIG_HOTPLUG_CPU
61 case CPU_UP_CANCELED:
62 case CPU_UP_CANCELED_FROZEN:
63
64 case CPU_DEAD:
65 case CPU_DEAD_FROZEN:
66 free_cpumask_var(cfd->cpumask);
67 break;
68#endif
69 };
70
71 return NOTIFY_OK;
72}
73
74static struct notifier_block __cpuinitdata hotplug_cfd_notifier = {
75 .notifier_call = hotplug_cfd,
35}; 76};
36 77
37static int __cpuinit init_call_single_data(void) 78static int __cpuinit init_call_single_data(void)
38{ 79{
80 void *cpu = (void *)(long)smp_processor_id();
39 int i; 81 int i;
40 82
41 for_each_possible_cpu(i) { 83 for_each_possible_cpu(i) {
@@ -44,29 +86,63 @@ static int __cpuinit init_call_single_data(void)
44 spin_lock_init(&q->lock); 86 spin_lock_init(&q->lock);
45 INIT_LIST_HEAD(&q->list); 87 INIT_LIST_HEAD(&q->list);
46 } 88 }
89
90 hotplug_cfd(&hotplug_cfd_notifier, CPU_UP_PREPARE, cpu);
91 register_cpu_notifier(&hotplug_cfd_notifier);
92
47 return 0; 93 return 0;
48} 94}
49early_initcall(init_call_single_data); 95early_initcall(init_call_single_data);
50 96
51static void csd_flag_wait(struct call_single_data *data) 97/*
98 * csd_lock/csd_unlock used to serialize access to per-cpu csd resources
99 *
100 * For non-synchronous ipi calls the csd can still be in use by the
101 * previous function call. For multi-cpu calls its even more interesting
102 * as we'll have to ensure no other cpu is observing our csd.
103 */
104static void csd_lock_wait(struct call_single_data *data)
52{ 105{
53 /* Wait for response */ 106 while (data->flags & CSD_FLAG_LOCK)
54 do {
55 if (!(data->flags & CSD_FLAG_WAIT))
56 break;
57 cpu_relax(); 107 cpu_relax();
58 } while (1); 108}
109
110static void csd_lock(struct call_single_data *data)
111{
112 csd_lock_wait(data);
113 data->flags = CSD_FLAG_LOCK;
114
115 /*
116 * prevent CPU from reordering the above assignment
117 * to ->flags with any subsequent assignments to other
118 * fields of the specified call_single_data structure:
119 */
120 smp_mb();
121}
122
123static void csd_unlock(struct call_single_data *data)
124{
125 WARN_ON(!(data->flags & CSD_FLAG_LOCK));
126
127 /*
128 * ensure we're all done before releasing data:
129 */
130 smp_mb();
131
132 data->flags &= ~CSD_FLAG_LOCK;
59} 133}
60 134
61/* 135/*
62 * Insert a previously allocated call_single_data element for execution 136 * Insert a previously allocated call_single_data element
63 * on the given CPU. data must already have ->func, ->info, and ->flags set. 137 * for execution on the given CPU. data must already have
138 * ->func, ->info, and ->flags set.
64 */ 139 */
65static void generic_exec_single(int cpu, struct call_single_data *data) 140static
141void generic_exec_single(int cpu, struct call_single_data *data, int wait)
66{ 142{
67 struct call_single_queue *dst = &per_cpu(call_single_queue, cpu); 143 struct call_single_queue *dst = &per_cpu(call_single_queue, cpu);
68 int wait = data->flags & CSD_FLAG_WAIT, ipi;
69 unsigned long flags; 144 unsigned long flags;
145 int ipi;
70 146
71 spin_lock_irqsave(&dst->lock, flags); 147 spin_lock_irqsave(&dst->lock, flags);
72 ipi = list_empty(&dst->list); 148 ipi = list_empty(&dst->list);
@@ -74,24 +150,21 @@ static void generic_exec_single(int cpu, struct call_single_data *data)
74 spin_unlock_irqrestore(&dst->lock, flags); 150 spin_unlock_irqrestore(&dst->lock, flags);
75 151
76 /* 152 /*
77 * Make the list addition visible before sending the ipi. 153 * The list addition should be visible before sending the IPI
154 * handler locks the list to pull the entry off it because of
155 * normal cache coherency rules implied by spinlocks.
156 *
157 * If IPIs can go out of order to the cache coherency protocol
158 * in an architecture, sufficient synchronisation should be added
159 * to arch code to make it appear to obey cache coherency WRT
160 * locking and barrier primitives. Generic code isn't really
161 * equipped to do the right thing...
78 */ 162 */
79 smp_mb();
80
81 if (ipi) 163 if (ipi)
82 arch_send_call_function_single_ipi(cpu); 164 arch_send_call_function_single_ipi(cpu);
83 165
84 if (wait) 166 if (wait)
85 csd_flag_wait(data); 167 csd_lock_wait(data);
86}
87
88static void rcu_free_call_data(struct rcu_head *head)
89{
90 struct call_function_data *data;
91
92 data = container_of(head, struct call_function_data, rcu_head);
93
94 kfree(data);
95} 168}
96 169
97/* 170/*
@@ -104,99 +177,83 @@ void generic_smp_call_function_interrupt(void)
104 int cpu = get_cpu(); 177 int cpu = get_cpu();
105 178
106 /* 179 /*
107 * It's ok to use list_for_each_rcu() here even though we may delete 180 * Ensure entry is visible on call_function_queue after we have
108 * 'pos', since list_del_rcu() doesn't clear ->next 181 * entered the IPI. See comment in smp_call_function_many.
182 * If we don't have this, then we may miss an entry on the list
183 * and never get another IPI to process it.
184 */
185 smp_mb();
186
187 /*
188 * It's ok to use list_for_each_rcu() here even though we may
189 * delete 'pos', since list_del_rcu() doesn't clear ->next
109 */ 190 */
110 rcu_read_lock(); 191 list_for_each_entry_rcu(data, &call_function.queue, csd.list) {
111 list_for_each_entry_rcu(data, &call_function_queue, csd.list) {
112 int refs; 192 int refs;
113 193
114 if (!cpumask_test_cpu(cpu, to_cpumask(data->cpumask_bits))) 194 spin_lock(&data->lock);
195 if (!cpumask_test_cpu(cpu, data->cpumask)) {
196 spin_unlock(&data->lock);
115 continue; 197 continue;
198 }
199 cpumask_clear_cpu(cpu, data->cpumask);
200 spin_unlock(&data->lock);
116 201
117 data->csd.func(data->csd.info); 202 data->csd.func(data->csd.info);
118 203
119 spin_lock(&data->lock); 204 spin_lock(&data->lock);
120 cpumask_clear_cpu(cpu, to_cpumask(data->cpumask_bits));
121 WARN_ON(data->refs == 0); 205 WARN_ON(data->refs == 0);
122 data->refs--; 206 refs = --data->refs;
123 refs = data->refs; 207 if (!refs) {
208 spin_lock(&call_function.lock);
209 list_del_rcu(&data->csd.list);
210 spin_unlock(&call_function.lock);
211 }
124 spin_unlock(&data->lock); 212 spin_unlock(&data->lock);
125 213
126 if (refs) 214 if (refs)
127 continue; 215 continue;
128 216
129 spin_lock(&call_function_lock); 217 csd_unlock(&data->csd);
130 list_del_rcu(&data->csd.list);
131 spin_unlock(&call_function_lock);
132
133 if (data->csd.flags & CSD_FLAG_WAIT) {
134 /*
135 * serialize stores to data with the flag clear
136 * and wakeup
137 */
138 smp_wmb();
139 data->csd.flags &= ~CSD_FLAG_WAIT;
140 }
141 if (data->csd.flags & CSD_FLAG_ALLOC)
142 call_rcu(&data->rcu_head, rcu_free_call_data);
143 } 218 }
144 rcu_read_unlock();
145 219
146 put_cpu(); 220 put_cpu();
147} 221}
148 222
149/* 223/*
150 * Invoked by arch to handle an IPI for call function single. Must be called 224 * Invoked by arch to handle an IPI for call function single. Must be
151 * from the arch with interrupts disabled. 225 * called from the arch with interrupts disabled.
152 */ 226 */
153void generic_smp_call_function_single_interrupt(void) 227void generic_smp_call_function_single_interrupt(void)
154{ 228{
155 struct call_single_queue *q = &__get_cpu_var(call_single_queue); 229 struct call_single_queue *q = &__get_cpu_var(call_single_queue);
230 unsigned int data_flags;
156 LIST_HEAD(list); 231 LIST_HEAD(list);
157 232
158 /* 233 spin_lock(&q->lock);
159 * Need to see other stores to list head for checking whether 234 list_replace_init(&q->list, &list);
160 * list is empty without holding q->lock 235 spin_unlock(&q->lock);
161 */ 236
162 smp_read_barrier_depends(); 237 while (!list_empty(&list)) {
163 while (!list_empty(&q->list)) { 238 struct call_single_data *data;
164 unsigned int data_flags; 239
165 240 data = list_entry(list.next, struct call_single_data, list);
166 spin_lock(&q->lock); 241 list_del(&data->list);
167 list_replace_init(&q->list, &list); 242
168 spin_unlock(&q->lock); 243 /*
169 244 * 'data' can be invalid after this call if flags == 0
170 while (!list_empty(&list)) { 245 * (when called through generic_exec_single()),
171 struct call_single_data *data; 246 * so save them away before making the call:
172 247 */
173 data = list_entry(list.next, struct call_single_data, 248 data_flags = data->flags;
174 list); 249
175 list_del(&data->list); 250 data->func(data->info);
176 251
177 /*
178 * 'data' can be invalid after this call if
179 * flags == 0 (when called through
180 * generic_exec_single(), so save them away before
181 * making the call.
182 */
183 data_flags = data->flags;
184
185 data->func(data->info);
186
187 if (data_flags & CSD_FLAG_WAIT) {
188 smp_wmb();
189 data->flags &= ~CSD_FLAG_WAIT;
190 } else if (data_flags & CSD_FLAG_LOCK) {
191 smp_wmb();
192 data->flags &= ~CSD_FLAG_LOCK;
193 } else if (data_flags & CSD_FLAG_ALLOC)
194 kfree(data);
195 }
196 /* 252 /*
197 * See comment on outer loop 253 * Unlocked CSDs are valid through generic_exec_single():
198 */ 254 */
199 smp_read_barrier_depends(); 255 if (data_flags & CSD_FLAG_LOCK)
256 csd_unlock(data);
200 } 257 }
201} 258}
202 259
@@ -215,65 +272,45 @@ static DEFINE_PER_CPU(struct call_single_data, csd_data);
215int smp_call_function_single(int cpu, void (*func) (void *info), void *info, 272int smp_call_function_single(int cpu, void (*func) (void *info), void *info,
216 int wait) 273 int wait)
217{ 274{
218 struct call_single_data d; 275 struct call_single_data d = {
276 .flags = 0,
277 };
219 unsigned long flags; 278 unsigned long flags;
220 /* prevent preemption and reschedule on another processor, 279 int this_cpu;
221 as well as CPU removal */
222 int me = get_cpu();
223 int err = 0; 280 int err = 0;
224 281
282 /*
283 * prevent preemption and reschedule on another processor,
284 * as well as CPU removal
285 */
286 this_cpu = get_cpu();
287
225 /* Can deadlock when called with interrupts disabled */ 288 /* Can deadlock when called with interrupts disabled */
226 WARN_ON(irqs_disabled()); 289 WARN_ON_ONCE(irqs_disabled() && !oops_in_progress);
227 290
228 if (cpu == me) { 291 if (cpu == this_cpu) {
229 local_irq_save(flags); 292 local_irq_save(flags);
230 func(info); 293 func(info);
231 local_irq_restore(flags); 294 local_irq_restore(flags);
232 } else if ((unsigned)cpu < nr_cpu_ids && cpu_online(cpu)) { 295 } else {
233 struct call_single_data *data; 296 if ((unsigned)cpu < nr_cpu_ids && cpu_online(cpu)) {
297 struct call_single_data *data = &d;
298
299 if (!wait)
300 data = &__get_cpu_var(csd_data);
234 301
235 if (!wait) { 302 csd_lock(data);
236 /* 303
237 * We are calling a function on a single CPU 304 data->func = func;
238 * and we are not going to wait for it to finish. 305 data->info = info;
239 * We first try to allocate the data, but if we 306 generic_exec_single(cpu, data, wait);
240 * fail, we fall back to use a per cpu data to pass
241 * the information to that CPU. Since all callers
242 * of this code will use the same data, we must
243 * synchronize the callers to prevent a new caller
244 * from corrupting the data before the callee
245 * can access it.
246 *
247 * The CSD_FLAG_LOCK is used to let us know when
248 * the IPI handler is done with the data.
249 * The first caller will set it, and the callee
250 * will clear it. The next caller must wait for
251 * it to clear before we set it again. This
252 * will make sure the callee is done with the
253 * data before a new caller will use it.
254 */
255 data = kmalloc(sizeof(*data), GFP_ATOMIC);
256 if (data)
257 data->flags = CSD_FLAG_ALLOC;
258 else {
259 data = &per_cpu(csd_data, me);
260 while (data->flags & CSD_FLAG_LOCK)
261 cpu_relax();
262 data->flags = CSD_FLAG_LOCK;
263 }
264 } else { 307 } else {
265 data = &d; 308 err = -ENXIO; /* CPU not online */
266 data->flags = CSD_FLAG_WAIT;
267 } 309 }
268
269 data->func = func;
270 data->info = info;
271 generic_exec_single(cpu, data);
272 } else {
273 err = -ENXIO; /* CPU not online */
274 } 310 }
275 311
276 put_cpu(); 312 put_cpu();
313
277 return err; 314 return err;
278} 315}
279EXPORT_SYMBOL(smp_call_function_single); 316EXPORT_SYMBOL(smp_call_function_single);
@@ -283,23 +320,26 @@ EXPORT_SYMBOL(smp_call_function_single);
283 * @cpu: The CPU to run on. 320 * @cpu: The CPU to run on.
284 * @data: Pre-allocated and setup data structure 321 * @data: Pre-allocated and setup data structure
285 * 322 *
286 * Like smp_call_function_single(), but allow caller to pass in a pre-allocated 323 * Like smp_call_function_single(), but allow caller to pass in a
287 * data structure. Useful for embedding @data inside other structures, for 324 * pre-allocated data structure. Useful for embedding @data inside
288 * instance. 325 * other structures, for instance.
289 *
290 */ 326 */
291void __smp_call_function_single(int cpu, struct call_single_data *data) 327void __smp_call_function_single(int cpu, struct call_single_data *data,
328 int wait)
292{ 329{
330 csd_lock(data);
331
293 /* Can deadlock when called with interrupts disabled */ 332 /* Can deadlock when called with interrupts disabled */
294 WARN_ON((data->flags & CSD_FLAG_WAIT) && irqs_disabled()); 333 WARN_ON_ONCE(wait && irqs_disabled() && !oops_in_progress);
295 334
296 generic_exec_single(cpu, data); 335 generic_exec_single(cpu, data, wait);
297} 336}
298 337
299/* FIXME: Shim for archs using old arch_send_call_function_ipi API. */ 338/* Deprecated: shim for archs using old arch_send_call_function_ipi API. */
339
300#ifndef arch_send_call_function_ipi_mask 340#ifndef arch_send_call_function_ipi_mask
301#define arch_send_call_function_ipi_mask(maskp) \ 341# define arch_send_call_function_ipi_mask(maskp) \
302 arch_send_call_function_ipi(*(maskp)) 342 arch_send_call_function_ipi(*(maskp))
303#endif 343#endif
304 344
305/** 345/**
@@ -307,7 +347,8 @@ void __smp_call_function_single(int cpu, struct call_single_data *data)
307 * @mask: The set of cpus to run on (only runs on online subset). 347 * @mask: The set of cpus to run on (only runs on online subset).
308 * @func: The function to run. This must be fast and non-blocking. 348 * @func: The function to run. This must be fast and non-blocking.
309 * @info: An arbitrary pointer to pass to the function. 349 * @info: An arbitrary pointer to pass to the function.
310 * @wait: If true, wait (atomically) until function has completed on other CPUs. 350 * @wait: If true, wait (atomically) until function has completed
351 * on other CPUs.
311 * 352 *
312 * If @wait is true, then returns once @func has returned. Note that @wait 353 * If @wait is true, then returns once @func has returned. Note that @wait
313 * will be implicitly turned on in case of allocation failures, since 354 * will be implicitly turned on in case of allocation failures, since
@@ -318,27 +359,27 @@ void __smp_call_function_single(int cpu, struct call_single_data *data)
318 * must be disabled when calling this function. 359 * must be disabled when calling this function.
319 */ 360 */
320void smp_call_function_many(const struct cpumask *mask, 361void smp_call_function_many(const struct cpumask *mask,
321 void (*func)(void *), void *info, 362 void (*func)(void *), void *info, bool wait)
322 bool wait)
323{ 363{
324 struct call_function_data *data; 364 struct call_function_data *data;
325 unsigned long flags; 365 unsigned long flags;
326 int cpu, next_cpu; 366 int cpu, next_cpu, this_cpu = smp_processor_id();
327 367
328 /* Can deadlock when called with interrupts disabled */ 368 /* Can deadlock when called with interrupts disabled */
329 WARN_ON(irqs_disabled()); 369 WARN_ON_ONCE(irqs_disabled() && !oops_in_progress);
330 370
331 /* So, what's a CPU they want? Ignoring this one. */ 371 /* So, what's a CPU they want? Ignoring this one. */
332 cpu = cpumask_first_and(mask, cpu_online_mask); 372 cpu = cpumask_first_and(mask, cpu_online_mask);
333 if (cpu == smp_processor_id()) 373 if (cpu == this_cpu)
334 cpu = cpumask_next_and(cpu, mask, cpu_online_mask); 374 cpu = cpumask_next_and(cpu, mask, cpu_online_mask);
375
335 /* No online cpus? We're done. */ 376 /* No online cpus? We're done. */
336 if (cpu >= nr_cpu_ids) 377 if (cpu >= nr_cpu_ids)
337 return; 378 return;
338 379
339 /* Do we have another CPU which isn't us? */ 380 /* Do we have another CPU which isn't us? */
340 next_cpu = cpumask_next_and(cpu, mask, cpu_online_mask); 381 next_cpu = cpumask_next_and(cpu, mask, cpu_online_mask);
341 if (next_cpu == smp_processor_id()) 382 if (next_cpu == this_cpu)
342 next_cpu = cpumask_next_and(next_cpu, mask, cpu_online_mask); 383 next_cpu = cpumask_next_and(next_cpu, mask, cpu_online_mask);
343 384
344 /* Fastpath: do that cpu by itself. */ 385 /* Fastpath: do that cpu by itself. */
@@ -347,43 +388,40 @@ void smp_call_function_many(const struct cpumask *mask,
347 return; 388 return;
348 } 389 }
349 390
350 data = kmalloc(sizeof(*data) + cpumask_size(), GFP_ATOMIC); 391 data = &__get_cpu_var(cfd_data);
351 if (unlikely(!data)) { 392 csd_lock(&data->csd);
352 /* Slow path. */
353 for_each_online_cpu(cpu) {
354 if (cpu == smp_processor_id())
355 continue;
356 if (cpumask_test_cpu(cpu, mask))
357 smp_call_function_single(cpu, func, info, wait);
358 }
359 return;
360 }
361 393
362 spin_lock_init(&data->lock); 394 spin_lock_irqsave(&data->lock, flags);
363 data->csd.flags = CSD_FLAG_ALLOC;
364 if (wait)
365 data->csd.flags |= CSD_FLAG_WAIT;
366 data->csd.func = func; 395 data->csd.func = func;
367 data->csd.info = info; 396 data->csd.info = info;
368 cpumask_and(to_cpumask(data->cpumask_bits), mask, cpu_online_mask); 397 cpumask_and(data->cpumask, mask, cpu_online_mask);
369 cpumask_clear_cpu(smp_processor_id(), to_cpumask(data->cpumask_bits)); 398 cpumask_clear_cpu(this_cpu, data->cpumask);
370 data->refs = cpumask_weight(to_cpumask(data->cpumask_bits)); 399 data->refs = cpumask_weight(data->cpumask);
371 400
372 spin_lock_irqsave(&call_function_lock, flags); 401 spin_lock(&call_function.lock);
373 list_add_tail_rcu(&data->csd.list, &call_function_queue); 402 /*
374 spin_unlock_irqrestore(&call_function_lock, flags); 403 * Place entry at the _HEAD_ of the list, so that any cpu still
404 * observing the entry in generic_smp_call_function_interrupt()
405 * will not miss any other list entries:
406 */
407 list_add_rcu(&data->csd.list, &call_function.queue);
408 spin_unlock(&call_function.lock);
409
410 spin_unlock_irqrestore(&data->lock, flags);
375 411
376 /* 412 /*
377 * Make the list addition visible before sending the ipi. 413 * Make the list addition visible before sending the ipi.
414 * (IPIs must obey or appear to obey normal Linux cache
415 * coherency rules -- see comment in generic_exec_single).
378 */ 416 */
379 smp_mb(); 417 smp_mb();
380 418
381 /* Send a message to all CPUs in the map */ 419 /* Send a message to all CPUs in the map */
382 arch_send_call_function_ipi_mask(to_cpumask(data->cpumask_bits)); 420 arch_send_call_function_ipi_mask(data->cpumask);
383 421
384 /* optionally wait for the CPUs to complete */ 422 /* Optionally wait for the CPUs to complete */
385 if (wait) 423 if (wait)
386 csd_flag_wait(&data->csd); 424 csd_lock_wait(&data->csd);
387} 425}
388EXPORT_SYMBOL(smp_call_function_many); 426EXPORT_SYMBOL(smp_call_function_many);
389 427
@@ -391,7 +429,8 @@ EXPORT_SYMBOL(smp_call_function_many);
391 * smp_call_function(): Run a function on all other CPUs. 429 * smp_call_function(): Run a function on all other CPUs.
392 * @func: The function to run. This must be fast and non-blocking. 430 * @func: The function to run. This must be fast and non-blocking.
393 * @info: An arbitrary pointer to pass to the function. 431 * @info: An arbitrary pointer to pass to the function.
394 * @wait: If true, wait (atomically) until function has completed on other CPUs. 432 * @wait: If true, wait (atomically) until function has completed
433 * on other CPUs.
395 * 434 *
396 * Returns 0. 435 * Returns 0.
397 * 436 *
@@ -407,26 +446,27 @@ int smp_call_function(void (*func)(void *), void *info, int wait)
407 preempt_disable(); 446 preempt_disable();
408 smp_call_function_many(cpu_online_mask, func, info, wait); 447 smp_call_function_many(cpu_online_mask, func, info, wait);
409 preempt_enable(); 448 preempt_enable();
449
410 return 0; 450 return 0;
411} 451}
412EXPORT_SYMBOL(smp_call_function); 452EXPORT_SYMBOL(smp_call_function);
413 453
414void ipi_call_lock(void) 454void ipi_call_lock(void)
415{ 455{
416 spin_lock(&call_function_lock); 456 spin_lock(&call_function.lock);
417} 457}
418 458
419void ipi_call_unlock(void) 459void ipi_call_unlock(void)
420{ 460{
421 spin_unlock(&call_function_lock); 461 spin_unlock(&call_function.lock);
422} 462}
423 463
424void ipi_call_lock_irq(void) 464void ipi_call_lock_irq(void)
425{ 465{
426 spin_lock_irq(&call_function_lock); 466 spin_lock_irq(&call_function.lock);
427} 467}
428 468
429void ipi_call_unlock_irq(void) 469void ipi_call_unlock_irq(void)
430{ 470{
431 spin_unlock_irq(&call_function_lock); 471 spin_unlock_irq(&call_function.lock);
432} 472}
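The smp.c rework retires CSD_FLAG_WAIT/CSD_FLAG_ALLOC (and with them the kmalloc/call_rcu lifetime scheme) in favour of a single CSD_FLAG_LOCK ownership bit on per-cpu data: a csd is owned while the bit is set, and full barriers order the bit against the func/info payload. New entries also go at the head of call_function.queue so a cpu still walking the list cannot miss entries behind it. A C11 atomics sketch of the ownership protocol, with seq_cst/release standing in for the kernel's smp_mb(); this is an analogy, not the kernel code:

/* C11 sketch of the csd_lock()/csd_unlock() ownership protocol above.
 * The LOCK bit is a hand-off: the sender may not touch the csd again
 * until the remote side clears the bit, and fences order the flag
 * against the func/info payload (the role smp_mb() plays in the kernel).
 */
#include <stdatomic.h>
#include <sched.h>

struct call_single_data {
	atomic_uint flags;
	void (*func)(void *);
	void *info;
};
#define CSD_FLAG_LOCK 0x01u

static void csd_lock_wait(struct call_single_data *d)
{
	while (atomic_load_explicit(&d->flags, memory_order_acquire) & CSD_FLAG_LOCK)
		sched_yield();	/* cpu_relax() stand-in */
}

static void csd_lock(struct call_single_data *d)
{
	csd_lock_wait(d);
	/* seq_cst keeps the payload writes below from moving above this */
	atomic_store_explicit(&d->flags, CSD_FLAG_LOCK, memory_order_seq_cst);
}

static void csd_unlock(struct call_single_data *d)
{
	/* consumer is done with func/info; release hands the csd back */
	atomic_store_explicit(&d->flags, 0, memory_order_release);
}

int main(void)
{
	struct call_single_data d = { .flags = 0 };

	csd_lock(&d);
	d.func = 0;		/* payload writes go here */
	d.info = 0;
	csd_unlock(&d);
	csd_lock_wait(&d);	/* returns immediately: unlocked */
	return 0;
}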
diff --git a/kernel/softirq.c b/kernel/softirq.c
index 65ff3e3961b4..2fecefacdc5b 100644
--- a/kernel/softirq.c
+++ b/kernel/softirq.c
@@ -65,7 +65,7 @@ char *softirq_to_name[NR_SOFTIRQS] = {
65 * to the pending events, so lets the scheduler to balance 65 * to the pending events, so lets the scheduler to balance
66 * the softirq load for us. 66 * the softirq load for us.
67 */ 67 */
68static inline void wakeup_softirqd(void) 68void wakeup_softirqd(void)
69{ 69{
70 /* Interrupts are disabled: no need to stop preemption */ 70 /* Interrupts are disabled: no need to stop preemption */
71 struct task_struct *tsk = __get_cpu_var(ksoftirqd); 71 struct task_struct *tsk = __get_cpu_var(ksoftirqd);
@@ -518,7 +518,7 @@ static int __try_remote_softirq(struct call_single_data *cp, int cpu, int softir
518 cp->flags = 0; 518 cp->flags = 0;
519 cp->priv = softirq; 519 cp->priv = softirq;
520 520
521 __smp_call_function_single(cpu, cp); 521 __smp_call_function_single(cpu, cp, 0);
522 return 0; 522 return 0;
523 } 523 }
524 return 1; 524 return 1;
diff --git a/kernel/spinlock.c b/kernel/spinlock.c
index 29ab20749dd3..7932653c4ebd 100644
--- a/kernel/spinlock.c
+++ b/kernel/spinlock.c
@@ -121,7 +121,8 @@ unsigned long __lockfunc _read_lock_irqsave(rwlock_t *lock)
121 local_irq_save(flags); 121 local_irq_save(flags);
122 preempt_disable(); 122 preempt_disable();
123 rwlock_acquire_read(&lock->dep_map, 0, 0, _RET_IP_); 123 rwlock_acquire_read(&lock->dep_map, 0, 0, _RET_IP_);
124 LOCK_CONTENDED(lock, _raw_read_trylock, _raw_read_lock); 124 LOCK_CONTENDED_FLAGS(lock, _raw_read_trylock, _raw_read_lock,
125 _raw_read_lock_flags, &flags);
125 return flags; 126 return flags;
126} 127}
127EXPORT_SYMBOL(_read_lock_irqsave); 128EXPORT_SYMBOL(_read_lock_irqsave);
@@ -151,7 +152,8 @@ unsigned long __lockfunc _write_lock_irqsave(rwlock_t *lock)
151 local_irq_save(flags); 152 local_irq_save(flags);
152 preempt_disable(); 153 preempt_disable();
153 rwlock_acquire(&lock->dep_map, 0, 0, _RET_IP_); 154 rwlock_acquire(&lock->dep_map, 0, 0, _RET_IP_);
154 LOCK_CONTENDED(lock, _raw_write_trylock, _raw_write_lock); 155 LOCK_CONTENDED_FLAGS(lock, _raw_write_trylock, _raw_write_lock,
156 _raw_write_lock_flags, &flags);
155 return flags; 157 return flags;
156} 158}
157EXPORT_SYMBOL(_write_lock_irqsave); 159EXPORT_SYMBOL(_write_lock_irqsave);
@@ -299,16 +301,8 @@ unsigned long __lockfunc _spin_lock_irqsave_nested(spinlock_t *lock, int subclas
299 local_irq_save(flags); 301 local_irq_save(flags);
300 preempt_disable(); 302 preempt_disable();
301 spin_acquire(&lock->dep_map, subclass, 0, _RET_IP_); 303 spin_acquire(&lock->dep_map, subclass, 0, _RET_IP_);
302 /* 304 LOCK_CONTENDED_FLAGS(lock, _raw_spin_trylock, _raw_spin_lock,
303 * On lockdep we dont want the hand-coded irq-enable of 305 _raw_spin_lock_flags, &flags);
304 * _raw_spin_lock_flags() code, because lockdep assumes
305 * that interrupts are not re-enabled during lock-acquire:
306 */
307#ifdef CONFIG_LOCKDEP
308 LOCK_CONTENDED(lock, _raw_spin_trylock, _raw_spin_lock);
309#else
310 _raw_spin_lock_flags(lock, &flags);
311#endif
312 return flags; 306 return flags;
313} 307}
314EXPORT_SYMBOL(_spin_lock_irqsave_nested); 308EXPORT_SYMBOL(_spin_lock_irqsave_nested);
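LOCK_CONTENDED_FLAGS() factors out the #ifdef that _spin_lock_irqsave_nested() used to open-code: under lockdep the plain contended path is kept (lockdep assumes interrupts stay disabled across acquire), otherwise the arch _flags variant may spin with interrupts re-enabled. A sketch of that logic, modelled on the removed code rather than the exact upstream macro, whose guard symbol may differ:

/* Hedged reconstruction of what LOCK_CONTENDED_FLAGS factors out,
 * based on the open-coded #ifdef deleted above -- not a verbatim copy
 * of the upstream definition.
 */
#ifdef CONFIG_LOCKDEP
# define LOCK_CONTENDED_FLAGS(_lock, try, lock, lockfl, flags)	\
	LOCK_CONTENDED((_lock), (try), (lock))
#else
# define LOCK_CONTENDED_FLAGS(_lock, try, lock, lockfl, flags)	\
	lockfl((_lock), (flags))
#endif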
diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c
index 74541ca49536..912823e2a11b 100644
--- a/kernel/stop_machine.c
+++ b/kernel/stop_machine.c
@@ -44,7 +44,7 @@ static DEFINE_MUTEX(setup_lock);
44static int refcount; 44static int refcount;
45static struct workqueue_struct *stop_machine_wq; 45static struct workqueue_struct *stop_machine_wq;
46static struct stop_machine_data active, idle; 46static struct stop_machine_data active, idle;
47static const cpumask_t *active_cpus; 47static const struct cpumask *active_cpus;
48static void *stop_machine_work; 48static void *stop_machine_work;
49 49
50static void set_state(enum stopmachine_state newstate) 50static void set_state(enum stopmachine_state newstate)
diff --git a/kernel/sys.c b/kernel/sys.c
index 37f458e6882a..51dbb55604e8 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -34,6 +34,7 @@
34#include <linux/seccomp.h> 34#include <linux/seccomp.h>
35#include <linux/cpu.h> 35#include <linux/cpu.h>
36#include <linux/ptrace.h> 36#include <linux/ptrace.h>
37#include <linux/fs_struct.h>
37 38
38#include <linux/compat.h> 39#include <linux/compat.h>
39#include <linux/syscalls.h> 40#include <linux/syscalls.h>
@@ -1013,10 +1014,8 @@ SYSCALL_DEFINE2(setpgid, pid_t, pid, pid_t, pgid)
1013 if (err) 1014 if (err)
1014 goto out; 1015 goto out;
1015 1016
1016 if (task_pgrp(p) != pgrp) { 1017 if (task_pgrp(p) != pgrp)
1017 change_pid(p, PIDTYPE_PGID, pgrp); 1018 change_pid(p, PIDTYPE_PGID, pgrp);
1018 set_task_pgrp(p, pid_nr(pgrp));
1019 }
1020 1019
1021 err = 0; 1020 err = 0;
1022out: 1021out:
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index c5ef44ff850f..b125e3387568 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -48,6 +48,7 @@
48#include <linux/acpi.h> 48#include <linux/acpi.h>
49#include <linux/reboot.h> 49#include <linux/reboot.h>
50#include <linux/ftrace.h> 50#include <linux/ftrace.h>
51#include <linux/slow-work.h>
51 52
52#include <asm/uaccess.h> 53#include <asm/uaccess.h>
53#include <asm/processor.h> 54#include <asm/processor.h>
@@ -95,12 +96,9 @@ static int sixty = 60;
95static int neg_one = -1; 96static int neg_one = -1;
96#endif 97#endif
97 98
98#if defined(CONFIG_MMU) && defined(CONFIG_FILE_LOCKING)
99static int two = 2;
100#endif
101
102static int zero; 99static int zero;
103static int one = 1; 100static int __maybe_unused one = 1;
101static int __maybe_unused two = 2;
104static unsigned long one_ul = 1; 102static unsigned long one_ul = 1;
105static int one_hundred = 100; 103static int one_hundred = 100;
106 104
@@ -900,6 +898,14 @@ static struct ctl_table kern_table[] = {
900 .proc_handler = &scan_unevictable_handler, 898 .proc_handler = &scan_unevictable_handler,
901 }, 899 },
902#endif 900#endif
901#ifdef CONFIG_SLOW_WORK
902 {
903 .ctl_name = CTL_UNNUMBERED,
904 .procname = "slow-work",
905 .mode = 0555,
906 .child = slow_work_sysctls,
907 },
908#endif
903/* 909/*
904 * NOTE: do not add new entries to this table unless you have read 910 * NOTE: do not add new entries to this table unless you have read
905 * Documentation/sysctl/ctl_unnumbered.txt 911 * Documentation/sysctl/ctl_unnumbered.txt
@@ -1010,7 +1016,7 @@ static struct ctl_table vm_table[] = {
1010 .data = &dirty_expire_interval, 1016 .data = &dirty_expire_interval,
1011 .maxlen = sizeof(dirty_expire_interval), 1017 .maxlen = sizeof(dirty_expire_interval),
1012 .mode = 0644, 1018 .mode = 0644,
1013 .proc_handler = &proc_dointvec_userhz_jiffies, 1019 .proc_handler = &proc_dointvec,
1014 }, 1020 },
1015 { 1021 {
1016 .ctl_name = VM_NR_PDFLUSH_THREADS, 1022 .ctl_name = VM_NR_PDFLUSH_THREADS,
@@ -1373,10 +1379,7 @@ static struct ctl_table fs_table[] = {
1373 .data = &lease_break_time, 1379 .data = &lease_break_time,
1374 .maxlen = sizeof(int), 1380 .maxlen = sizeof(int),
1375 .mode = 0644, 1381 .mode = 0644,
1376 .proc_handler = &proc_dointvec_minmax, 1382 .proc_handler = &proc_dointvec,
1377 .strategy = &sysctl_intvec,
1378 .extra1 = &zero,
1379 .extra2 = &two,
1380 }, 1383 },
1381#endif 1384#endif
1382#ifdef CONFIG_AIO 1385#ifdef CONFIG_AIO
@@ -1417,7 +1420,10 @@ static struct ctl_table fs_table[] = {
1417 .data = &suid_dumpable, 1420 .data = &suid_dumpable,
1418 .maxlen = sizeof(int), 1421 .maxlen = sizeof(int),
1419 .mode = 0644, 1422 .mode = 0644,
1420 .proc_handler = &proc_dointvec, 1423 .proc_handler = &proc_dointvec_minmax,
1424 .strategy = &sysctl_intvec,
1425 .extra1 = &zero,
1426 .extra2 = &two,
1421 }, 1427 },
1422#if defined(CONFIG_BINFMT_MISC) || defined(CONFIG_BINFMT_MISC_MODULE) 1428#if defined(CONFIG_BINFMT_MISC) || defined(CONFIG_BINFMT_MISC_MODULE)
1423 { 1429 {
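The two fs_table hunks above trade places: lease_break_time loses the [0, 2] clamp it never needed, while suid_dumpable gains it, since only the values 0 (off), 1 (on) and 2 (suidsafe) are meaningful. A hedged sketch of the resulting suid_dumpable entry; the ctl_name and procname fields are assumptions, only the lines visible in the hunk come from the patch:

        {
                .ctl_name       = CTL_UNNUMBERED,       /* assumed */
                .procname       = "suid_dumpable",      /* assumed */
                .data           = &suid_dumpable,
                .maxlen         = sizeof(int),
                .mode           = 0644,
                .proc_handler   = &proc_dointvec_minmax,
                .strategy       = &sysctl_intvec,
                .extra1         = &zero,                /* reject < 0 */
                .extra2         = &two,                 /* reject > 2 */
        },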
diff --git a/kernel/sysctl_check.c b/kernel/sysctl_check.c
index fafeb48f27c0..b38423ca711a 100644
--- a/kernel/sysctl_check.c
+++ b/kernel/sysctl_check.c
@@ -219,6 +219,7 @@ static const struct trans_ctl_table trans_net_ipv4_conf_vars_table[] = {
219 { NET_IPV4_CONF_ARP_IGNORE, "arp_ignore" }, 219 { NET_IPV4_CONF_ARP_IGNORE, "arp_ignore" },
220 { NET_IPV4_CONF_PROMOTE_SECONDARIES, "promote_secondaries" }, 220 { NET_IPV4_CONF_PROMOTE_SECONDARIES, "promote_secondaries" },
221 { NET_IPV4_CONF_ARP_ACCEPT, "arp_accept" }, 221 { NET_IPV4_CONF_ARP_ACCEPT, "arp_accept" },
222 { NET_IPV4_CONF_ARP_NOTIFY, "arp_notify" },
222 {} 223 {}
223}; 224};
224 225
diff --git a/kernel/time/Makefile b/kernel/time/Makefile
index 905b0b50792d..0b0a6366c9d4 100644
--- a/kernel/time/Makefile
+++ b/kernel/time/Makefile
@@ -1,4 +1,4 @@
1obj-y += timekeeping.o ntp.o clocksource.o jiffies.o timer_list.o 1obj-y += timekeeping.o ntp.o clocksource.o jiffies.o timer_list.o timecompare.o
2 2
3obj-$(CONFIG_GENERIC_CLOCKEVENTS_BUILD) += clockevents.o 3obj-$(CONFIG_GENERIC_CLOCKEVENTS_BUILD) += clockevents.o
4obj-$(CONFIG_GENERIC_CLOCKEVENTS) += tick-common.o 4obj-$(CONFIG_GENERIC_CLOCKEVENTS) += tick-common.o
diff --git a/kernel/time/clockevents.c b/kernel/time/clockevents.c
index ea2f48af83cf..d13be216a790 100644
--- a/kernel/time/clockevents.c
+++ b/kernel/time/clockevents.c
@@ -68,6 +68,17 @@ void clockevents_set_mode(struct clock_event_device *dev,
68 if (dev->mode != mode) { 68 if (dev->mode != mode) {
69 dev->set_mode(mode, dev); 69 dev->set_mode(mode, dev);
70 dev->mode = mode; 70 dev->mode = mode;
71
72 /*
73 * A nsec2cyc multiplicator of 0 is invalid and we'd crash
74 * on it, so fix it up and emit a warning:
75 */
76 if (mode == CLOCK_EVT_MODE_ONESHOT) {
77 if (unlikely(!dev->mult)) {
78 dev->mult = 1;
79 WARN_ON(1);
80 }
81 }
71 } 82 }
72} 83}
73 84
@@ -168,15 +179,6 @@ void clockevents_register_device(struct clock_event_device *dev)
168 BUG_ON(dev->mode != CLOCK_EVT_MODE_UNUSED); 179 BUG_ON(dev->mode != CLOCK_EVT_MODE_UNUSED);
169 BUG_ON(!dev->cpumask); 180 BUG_ON(!dev->cpumask);
170 181
171 /*
172 * A nsec2cyc multiplicator of 0 is invalid and we'd crash
173 * on it, so fix it up and emit a warning:
174 */
175 if (unlikely(!dev->mult)) {
176 dev->mult = 1;
177 WARN_ON(1);
178 }
179
180 spin_lock(&clockevents_lock); 182 spin_lock(&clockevents_lock);
181 183
182 list_add(&dev->list, &clockevent_devices); 184 list_add(&dev->list, &clockevent_devices);
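Both clockevents hunks above move the zero-mult sanity check from registration time to the first switch into one-shot mode, the mode in which ->mult feeds the delta-to-nanoseconds conversion. A hedged sketch modelled on clockevent_delta2ns() (do_div() from asm/div64.h) shows why ->mult == 0 would trap:

static unsigned long long sketch_delta2ns(unsigned long latch,
                                          struct clock_event_device *evt)
{
        unsigned long long clc = (unsigned long long) latch << evt->shift;

        do_div(clc, evt->mult);         /* divide-by-zero if ->mult were 0 */
        return clc;
}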
diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c
index ca89e1593f08..c46c931a7fe7 100644
--- a/kernel/time/clocksource.c
+++ b/kernel/time/clocksource.c
@@ -31,6 +31,82 @@
31#include <linux/sched.h> /* for spin_unlock_irq() using preempt_count() m68k */ 31#include <linux/sched.h> /* for spin_unlock_irq() using preempt_count() m68k */
32#include <linux/tick.h> 32#include <linux/tick.h>
33 33
34void timecounter_init(struct timecounter *tc,
35 const struct cyclecounter *cc,
36 u64 start_tstamp)
37{
38 tc->cc = cc;
39 tc->cycle_last = cc->read(cc);
40 tc->nsec = start_tstamp;
41}
42EXPORT_SYMBOL(timecounter_init);
43
44/**
45 * timecounter_read_delta - get nanoseconds since last call of this function
46 * @tc: Pointer to time counter
47 *
48 * When the underlying cycle counter runs over, this will be handled
49 * correctly as long as it does not run over more than once between
50 * calls.
51 *
52 * The first call to this function for a new time counter initializes
53 * the time tracking and returns an undefined result.
54 */
55static u64 timecounter_read_delta(struct timecounter *tc)
56{
57 cycle_t cycle_now, cycle_delta;
58 u64 ns_offset;
59
60 /* read cycle counter: */
61 cycle_now = tc->cc->read(tc->cc);
62
63 /* calculate the delta since the last timecounter_read_delta(): */
64 cycle_delta = (cycle_now - tc->cycle_last) & tc->cc->mask;
65
66 /* convert to nanoseconds: */
67 ns_offset = cyclecounter_cyc2ns(tc->cc, cycle_delta);
68
69 /* update time stamp of timecounter_read_delta() call: */
70 tc->cycle_last = cycle_now;
71
72 return ns_offset;
73}
74
75u64 timecounter_read(struct timecounter *tc)
76{
77 u64 nsec;
78
79 /* increment time by nanoseconds since last call */
80 nsec = timecounter_read_delta(tc);
81 nsec += tc->nsec;
82 tc->nsec = nsec;
83
84 return nsec;
85}
86EXPORT_SYMBOL(timecounter_read);
87
88u64 timecounter_cyc2time(struct timecounter *tc,
89 cycle_t cycle_tstamp)
90{
91 u64 cycle_delta = (cycle_tstamp - tc->cycle_last) & tc->cc->mask;
92 u64 nsec;
93
94 /*
95 * Instead of always treating cycle_tstamp as more recent
96 * than tc->cycle_last, detect when it is too far in the
 97 * future and treat it as an old time stamp instead.
98 */
99 if (cycle_delta > tc->cc->mask / 2) {
100 cycle_delta = (tc->cycle_last - cycle_tstamp) & tc->cc->mask;
101 nsec = tc->nsec - cyclecounter_cyc2ns(tc->cc, cycle_delta);
102 } else {
103 nsec = cyclecounter_cyc2ns(tc->cc, cycle_delta) + tc->nsec;
104 }
105
106 return nsec;
107}
108EXPORT_SYMBOL(timecounter_cyc2time);
109
34/* XXX - Would like a better way for initializing curr_clocksource */ 110/* XXX - Would like a better way for initializing curr_clocksource */
35extern struct clocksource clocksource_jiffies; 111extern struct clocksource clocksource_jiffies;
36 112
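The timecounter/cyclecounter pair added above gives drivers a free-running nanosecond clock on top of any wrapping hardware counter. A hedged usage sketch; the MMIO register and the mask/mult/shift scaling are made up for illustration (this pairing corresponds to a 32-bit, 1 GHz counter):

static void __iomem *sketch_counter_reg;        /* assumed mapped elsewhere */

static cycle_t sketch_hw_read(const struct cyclecounter *cc)
{
        return (cycle_t)readl(sketch_counter_reg);
}

static struct cyclecounter sketch_cc = {
        .read   = sketch_hw_read,
        .mask   = CLOCKSOURCE_MASK(32),         /* 32-bit wrapping counter */
        .mult   = 1 << 10,                      /* ns = (cyc * mult) >> shift */
        .shift  = 10,                           /* 1 ns/cycle, i.e. 1 GHz */
};

static struct timecounter sketch_tc;

static void sketch_start(void)
{
        /* start counting nanoseconds from 0 at the current cycle */
        timecounter_init(&sketch_tc, &sketch_cc, 0);
}

static u64 sketch_now_ns(void)
{
        /* valid as long as the counter wraps at most once between calls */
        return timecounter_read(&sketch_tc);
}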
diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c
index f5f793d92415..7fc64375ff43 100644
--- a/kernel/time/ntp.c
+++ b/kernel/time/ntp.c
@@ -1,71 +1,129 @@
1/* 1/*
2 * linux/kernel/time/ntp.c
3 *
4 * NTP state machine interfaces and logic. 2 * NTP state machine interfaces and logic.
5 * 3 *
6 * This code was mainly moved from kernel/timer.c and kernel/time.c 4 * This code was mainly moved from kernel/timer.c and kernel/time.c
7 * Please see those files for relevant copyright info and historical 5 * Please see those files for relevant copyright info and historical
8 * changelogs. 6 * changelogs.
9 */ 7 */
10
11#include <linux/mm.h>
12#include <linux/time.h>
13#include <linux/timex.h>
14#include <linux/jiffies.h>
15#include <linux/hrtimer.h>
16#include <linux/capability.h> 8#include <linux/capability.h>
17#include <linux/math64.h>
18#include <linux/clocksource.h> 9#include <linux/clocksource.h>
19#include <linux/workqueue.h> 10#include <linux/workqueue.h>
20#include <asm/timex.h> 11#include <linux/hrtimer.h>
12#include <linux/jiffies.h>
13#include <linux/math64.h>
14#include <linux/timex.h>
15#include <linux/time.h>
16#include <linux/mm.h>
21 17
22/* 18/*
23 * Timekeeping variables 19 * NTP timekeeping variables:
24 */ 20 */
25unsigned long tick_usec = TICK_USEC; /* USER_HZ period (usec) */
26unsigned long tick_nsec; /* ACTHZ period (nsec) */
27u64 tick_length;
28static u64 tick_length_base;
29 21
30static struct hrtimer leap_timer; 22/* USER_HZ period (usecs): */
23unsigned long tick_usec = TICK_USEC;
31 24
32#define MAX_TICKADJ 500 /* microsecs */ 25/* ACTHZ period (nsecs): */
33#define MAX_TICKADJ_SCALED (((u64)(MAX_TICKADJ * NSEC_PER_USEC) << \ 26unsigned long tick_nsec;
34 NTP_SCALE_SHIFT) / NTP_INTERVAL_FREQ) 27
28u64 tick_length;
29static u64 tick_length_base;
30
31static struct hrtimer leap_timer;
32
33#define MAX_TICKADJ 500LL /* usecs */
34#define MAX_TICKADJ_SCALED \
35 (((MAX_TICKADJ * NSEC_PER_USEC) << NTP_SCALE_SHIFT) / NTP_INTERVAL_FREQ)
35 36
36/* 37/*
37 * phase-lock loop variables 38 * phase-lock loop variables
38 */ 39 */
39/* TIME_ERROR prevents overwriting the CMOS clock */
40static int time_state = TIME_OK; /* clock synchronization status */
41int time_status = STA_UNSYNC; /* clock status bits */
42static long time_tai; /* TAI offset (s) */
43static s64 time_offset; /* time adjustment (ns) */
44static long time_constant = 2; /* pll time constant */
45long time_maxerror = NTP_PHASE_LIMIT; /* maximum error (us) */
46long time_esterror = NTP_PHASE_LIMIT; /* estimated error (us) */
47static s64 time_freq; /* frequency offset (scaled ns/s)*/
48static long time_reftime; /* time at last adjustment (s) */
49long time_adjust;
50static long ntp_tick_adj;
51 40
41/*
42 * clock synchronization status
43 *
44 * (TIME_ERROR prevents overwriting the CMOS clock)
45 */
46static int time_state = TIME_OK;
47
48/* clock status bits: */
49int time_status = STA_UNSYNC;
50
51/* TAI offset (secs): */
52static long time_tai;
53
54/* time adjustment (nsecs): */
55static s64 time_offset;
56
57/* pll time constant: */
58static long time_constant = 2;
59
60/* maximum error (usecs): */
61long time_maxerror = NTP_PHASE_LIMIT;
62
63/* estimated error (usecs): */
64long time_esterror = NTP_PHASE_LIMIT;
65
66/* frequency offset (scaled nsecs/secs): */
67static s64 time_freq;
68
69/* time at last adjustment (secs): */
70static long time_reftime;
71
72long time_adjust;
73
74/* constant (boot-param configurable) NTP tick adjustment (upscaled) */
75static s64 ntp_tick_adj;
76
77/*
78 * NTP methods:
79 */
80
81/*
82 * Update (tick_length, tick_length_base, tick_nsec), based
83 * on (tick_usec, ntp_tick_adj, time_freq):
84 */
52static void ntp_update_frequency(void) 85static void ntp_update_frequency(void)
53{ 86{
54 u64 second_length = (u64)(tick_usec * NSEC_PER_USEC * USER_HZ) 87 u64 second_length;
55 << NTP_SCALE_SHIFT; 88 u64 new_base;
56 second_length += (s64)ntp_tick_adj << NTP_SCALE_SHIFT; 89
57 second_length += time_freq; 90 second_length = (u64)(tick_usec * NSEC_PER_USEC * USER_HZ)
91 << NTP_SCALE_SHIFT;
92
93 second_length += ntp_tick_adj;
94 second_length += time_freq;
58 95
59 tick_length_base = second_length; 96 tick_nsec = div_u64(second_length, HZ) >> NTP_SCALE_SHIFT;
97 new_base = div_u64(second_length, NTP_INTERVAL_FREQ);
60 98
61 tick_nsec = div_u64(second_length, HZ) >> NTP_SCALE_SHIFT; 99 /*
62 tick_length_base = div_u64(tick_length_base, NTP_INTERVAL_FREQ); 100 * Don't wait for the next second_overflow, apply
101 * the change to the tick length immediately:
102 */
103 tick_length += new_base - tick_length_base;
104 tick_length_base = new_base;
105}
106
107static inline s64 ntp_update_offset_fll(s64 offset64, long secs)
108{
109 time_status &= ~STA_MODE;
110
111 if (secs < MINSEC)
112 return 0;
113
114 if (!(time_status & STA_FLL) && (secs <= MAXSEC))
115 return 0;
116
117 time_status |= STA_MODE;
118
119 return div_s64(offset64 << (NTP_SCALE_SHIFT - SHIFT_FLL), secs);
63} 120}
64 121
65static void ntp_update_offset(long offset) 122static void ntp_update_offset(long offset)
66{ 123{
67 long mtemp;
68 s64 freq_adj; 124 s64 freq_adj;
125 s64 offset64;
126 long secs;
69 127
70 if (!(time_status & STA_PLL)) 128 if (!(time_status & STA_PLL))
71 return; 129 return;
@@ -84,24 +142,23 @@ static void ntp_update_offset(long offset)
84 * Select how the frequency is to be controlled 142 * Select how the frequency is to be controlled
85 * and in which mode (PLL or FLL). 143 * and in which mode (PLL or FLL).
86 */ 144 */
87 if (time_status & STA_FREQHOLD || time_reftime == 0) 145 secs = xtime.tv_sec - time_reftime;
88 time_reftime = xtime.tv_sec; 146 if (unlikely(time_status & STA_FREQHOLD))
89 mtemp = xtime.tv_sec - time_reftime; 147 secs = 0;
148
90 time_reftime = xtime.tv_sec; 149 time_reftime = xtime.tv_sec;
91 150
92 freq_adj = (s64)offset * mtemp; 151 offset64 = offset;
93 freq_adj <<= NTP_SCALE_SHIFT - 2 * (SHIFT_PLL + 2 + time_constant); 152 freq_adj = (offset64 * secs) <<
94 time_status &= ~STA_MODE; 153 (NTP_SCALE_SHIFT - 2 * (SHIFT_PLL + 2 + time_constant));
95 if (mtemp >= MINSEC && (time_status & STA_FLL || mtemp > MAXSEC)) {
96 freq_adj += div_s64((s64)offset << (NTP_SCALE_SHIFT - SHIFT_FLL),
97 mtemp);
98 time_status |= STA_MODE;
99 }
100 freq_adj += time_freq;
101 freq_adj = min(freq_adj, MAXFREQ_SCALED);
102 time_freq = max(freq_adj, -MAXFREQ_SCALED);
103 154
104 time_offset = div_s64((s64)offset << NTP_SCALE_SHIFT, NTP_INTERVAL_FREQ); 155 freq_adj += ntp_update_offset_fll(offset64, secs);
156
157 freq_adj = min(freq_adj + time_freq, MAXFREQ_SCALED);
158
159 time_freq = max(freq_adj, -MAXFREQ_SCALED);
160
161 time_offset = div_s64(offset64 << NTP_SCALE_SHIFT, NTP_INTERVAL_FREQ);
105} 162}
106 163
107/** 164/**
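Restating the math of the rewritten ntp_update_offset() above, with everything in the NTP_SCALE_SHIFT fixed-point domain, secs the seconds since the last update and offset in nanoseconds:

/*
 * pll_term  = (offset * secs) << (NTP_SCALE_SHIFT
 *                                 - 2 * (SHIFT_PLL + 2 + time_constant));
 * fll_term  = (offset << (NTP_SCALE_SHIFT - SHIFT_FLL)) / secs;
 *             (0 unless ntp_update_offset_fll() enables FLL mode)
 * time_freq = clamp(time_freq + pll_term + fll_term,
 *                   -MAXFREQ_SCALED, MAXFREQ_SCALED);
 */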
@@ -111,15 +168,15 @@ static void ntp_update_offset(long offset)
111 */ 168 */
112void ntp_clear(void) 169void ntp_clear(void)
113{ 170{
114 time_adjust = 0; /* stop active adjtime() */ 171 time_adjust = 0; /* stop active adjtime() */
115 time_status |= STA_UNSYNC; 172 time_status |= STA_UNSYNC;
116 time_maxerror = NTP_PHASE_LIMIT; 173 time_maxerror = NTP_PHASE_LIMIT;
117 time_esterror = NTP_PHASE_LIMIT; 174 time_esterror = NTP_PHASE_LIMIT;
118 175
119 ntp_update_frequency(); 176 ntp_update_frequency();
120 177
121 tick_length = tick_length_base; 178 tick_length = tick_length_base;
122 time_offset = 0; 179 time_offset = 0;
123} 180}
124 181
125/* 182/*
@@ -140,8 +197,8 @@ static enum hrtimer_restart ntp_leap_second(struct hrtimer *timer)
140 xtime.tv_sec--; 197 xtime.tv_sec--;
141 wall_to_monotonic.tv_sec++; 198 wall_to_monotonic.tv_sec++;
142 time_state = TIME_OOP; 199 time_state = TIME_OOP;
143 printk(KERN_NOTICE "Clock: " 200 printk(KERN_NOTICE
144 "inserting leap second 23:59:60 UTC\n"); 201 "Clock: inserting leap second 23:59:60 UTC\n");
145 hrtimer_add_expires_ns(&leap_timer, NSEC_PER_SEC); 202 hrtimer_add_expires_ns(&leap_timer, NSEC_PER_SEC);
146 res = HRTIMER_RESTART; 203 res = HRTIMER_RESTART;
147 break; 204 break;
@@ -150,8 +207,8 @@ static enum hrtimer_restart ntp_leap_second(struct hrtimer *timer)
150 time_tai--; 207 time_tai--;
151 wall_to_monotonic.tv_sec--; 208 wall_to_monotonic.tv_sec--;
152 time_state = TIME_WAIT; 209 time_state = TIME_WAIT;
153 printk(KERN_NOTICE "Clock: " 210 printk(KERN_NOTICE
154 "deleting leap second 23:59:59 UTC\n"); 211 "Clock: deleting leap second 23:59:59 UTC\n");
155 break; 212 break;
156 case TIME_OOP: 213 case TIME_OOP:
157 time_tai++; 214 time_tai++;
@@ -179,7 +236,7 @@ static enum hrtimer_restart ntp_leap_second(struct hrtimer *timer)
179 */ 236 */
180void second_overflow(void) 237void second_overflow(void)
181{ 238{
182 s64 time_adj; 239 s64 delta;
183 240
184 /* Bump the maxerror field */ 241 /* Bump the maxerror field */
185 time_maxerror += MAXFREQ / NSEC_PER_USEC; 242 time_maxerror += MAXFREQ / NSEC_PER_USEC;
@@ -192,24 +249,30 @@ void second_overflow(void)
192 * Compute the phase adjustment for the next second. The offset is 249 * Compute the phase adjustment for the next second. The offset is
193 * reduced by a fixed factor times the time constant. 250 * reduced by a fixed factor times the time constant.
194 */ 251 */
195 tick_length = tick_length_base; 252 tick_length = tick_length_base;
196 time_adj = shift_right(time_offset, SHIFT_PLL + time_constant); 253
197 time_offset -= time_adj; 254 delta = shift_right(time_offset, SHIFT_PLL + time_constant);
198 tick_length += time_adj; 255 time_offset -= delta;
199 256 tick_length += delta;
200 if (unlikely(time_adjust)) { 257
201 if (time_adjust > MAX_TICKADJ) { 258 if (!time_adjust)
202 time_adjust -= MAX_TICKADJ; 259 return;
203 tick_length += MAX_TICKADJ_SCALED; 260
204 } else if (time_adjust < -MAX_TICKADJ) { 261 if (time_adjust > MAX_TICKADJ) {
205 time_adjust += MAX_TICKADJ; 262 time_adjust -= MAX_TICKADJ;
206 tick_length -= MAX_TICKADJ_SCALED; 263 tick_length += MAX_TICKADJ_SCALED;
207 } else { 264 return;
208 tick_length += (s64)(time_adjust * NSEC_PER_USEC /
209 NTP_INTERVAL_FREQ) << NTP_SCALE_SHIFT;
210 time_adjust = 0;
211 }
212 } 265 }
266
267 if (time_adjust < -MAX_TICKADJ) {
268 time_adjust += MAX_TICKADJ;
269 tick_length -= MAX_TICKADJ_SCALED;
270 return;
271 }
272
273 tick_length += (s64)(time_adjust * NSEC_PER_USEC / NTP_INTERVAL_FREQ)
274 << NTP_SCALE_SHIFT;
275 time_adjust = 0;
213} 276}
214 277
215#ifdef CONFIG_GENERIC_CMOS_UPDATE 278#ifdef CONFIG_GENERIC_CMOS_UPDATE
@@ -233,12 +296,13 @@ static void sync_cmos_clock(struct work_struct *work)
233 * This code is run on a timer. If the clock is set, that timer 296 * This code is run on a timer. If the clock is set, that timer
234 * may not expire at the correct time. Thus, we adjust... 297 * may not expire at the correct time. Thus, we adjust...
235 */ 298 */
236 if (!ntp_synced()) 299 if (!ntp_synced()) {
237 /* 300 /*
238 * Not synced, exit, do not restart a timer (if one is 301 * Not synced, exit, do not restart a timer (if one is
239 * running, let it run out). 302 * running, let it run out).
240 */ 303 */
241 return; 304 return;
305 }
242 306
243 getnstimeofday(&now); 307 getnstimeofday(&now);
244 if (abs(now.tv_nsec - (NSEC_PER_SEC / 2)) <= tick_nsec / 2) 308 if (abs(now.tv_nsec - (NSEC_PER_SEC / 2)) <= tick_nsec / 2)
@@ -270,7 +334,116 @@ static void notify_cmos_timer(void)
270static inline void notify_cmos_timer(void) { } 334static inline void notify_cmos_timer(void) { }
271#endif 335#endif
272 336
273/* adjtimex mainly allows reading (and writing, if superuser) of 337/*
338 * Start the leap seconds timer:
339 */
340static inline void ntp_start_leap_timer(struct timespec *ts)
341{
342 long now = ts->tv_sec;
343
344 if (time_status & STA_INS) {
345 time_state = TIME_INS;
346 now += 86400 - now % 86400;
347 hrtimer_start(&leap_timer, ktime_set(now, 0), HRTIMER_MODE_ABS);
348
349 return;
350 }
351
352 if (time_status & STA_DEL) {
353 time_state = TIME_DEL;
354 now += 86400 - (now + 1) % 86400;
355 hrtimer_start(&leap_timer, ktime_set(now, 0), HRTIMER_MODE_ABS);
356 }
357}
358
359/*
360 * Propagate a new txc->status value into the NTP state:
361 */
362static inline void process_adj_status(struct timex *txc, struct timespec *ts)
363{
364 if ((time_status & STA_PLL) && !(txc->status & STA_PLL)) {
365 time_state = TIME_OK;
366 time_status = STA_UNSYNC;
367 }
368
369 /*
370 * If we turn on PLL adjustments then reset the
371 * reference time to current time.
372 */
373 if (!(time_status & STA_PLL) && (txc->status & STA_PLL))
374 time_reftime = xtime.tv_sec;
375
376 /* only set allowed bits */
377 time_status &= STA_RONLY;
378 time_status |= txc->status & ~STA_RONLY;
379
380 switch (time_state) {
381 case TIME_OK:
382 ntp_start_leap_timer(ts);
383 break;
384 case TIME_INS:
385 case TIME_DEL:
386 time_state = TIME_OK;
387 ntp_start_leap_timer(ts);
388 case TIME_WAIT:
389 if (!(time_status & (STA_INS | STA_DEL)))
390 time_state = TIME_OK;
391 break;
392 case TIME_OOP:
393 hrtimer_restart(&leap_timer);
394 break;
395 }
396}
397/*
398 * Called with the xtime lock held, so we can access and modify
399 * all the global NTP state:
400 */
401static inline void process_adjtimex_modes(struct timex *txc, struct timespec *ts)
402{
403 if (txc->modes & ADJ_STATUS)
404 process_adj_status(txc, ts);
405
406 if (txc->modes & ADJ_NANO)
407 time_status |= STA_NANO;
408
409 if (txc->modes & ADJ_MICRO)
410 time_status &= ~STA_NANO;
411
412 if (txc->modes & ADJ_FREQUENCY) {
413 time_freq = txc->freq * PPM_SCALE;
414 time_freq = min(time_freq, MAXFREQ_SCALED);
415 time_freq = max(time_freq, -MAXFREQ_SCALED);
416 }
417
418 if (txc->modes & ADJ_MAXERROR)
419 time_maxerror = txc->maxerror;
420
421 if (txc->modes & ADJ_ESTERROR)
422 time_esterror = txc->esterror;
423
424 if (txc->modes & ADJ_TIMECONST) {
425 time_constant = txc->constant;
426 if (!(time_status & STA_NANO))
427 time_constant += 4;
428 time_constant = min(time_constant, (long)MAXTC);
429 time_constant = max(time_constant, 0l);
430 }
431
432 if (txc->modes & ADJ_TAI && txc->constant > 0)
433 time_tai = txc->constant;
434
435 if (txc->modes & ADJ_OFFSET)
436 ntp_update_offset(txc->offset);
437
438 if (txc->modes & ADJ_TICK)
439 tick_usec = txc->tick;
440
441 if (txc->modes & (ADJ_TICK|ADJ_FREQUENCY|ADJ_OFFSET))
442 ntp_update_frequency();
443}
444
445/*
446 * adjtimex mainly allows reading (and writing, if superuser) of
274 * kernel time-keeping variables. used by xntpd. 447 * kernel time-keeping variables. used by xntpd.
275 */ 448 */
276int do_adjtimex(struct timex *txc) 449int do_adjtimex(struct timex *txc)
@@ -291,11 +464,14 @@ int do_adjtimex(struct timex *txc)
291 if (txc->modes && !capable(CAP_SYS_TIME)) 464 if (txc->modes && !capable(CAP_SYS_TIME))
292 return -EPERM; 465 return -EPERM;
293 466
294 /* if the quartz is off by more than 10% something is VERY wrong! */ 467 /*
468 * if the quartz is off by more than 10% then
469 * something is VERY wrong!
470 */
295 if (txc->modes & ADJ_TICK && 471 if (txc->modes & ADJ_TICK &&
296 (txc->tick < 900000/USER_HZ || 472 (txc->tick < 900000/USER_HZ ||
297 txc->tick > 1100000/USER_HZ)) 473 txc->tick > 1100000/USER_HZ))
298 return -EINVAL; 474 return -EINVAL;
299 475
300 if (txc->modes & ADJ_STATUS && time_state != TIME_OK) 476 if (txc->modes & ADJ_STATUS && time_state != TIME_OK)
301 hrtimer_cancel(&leap_timer); 477 hrtimer_cancel(&leap_timer);
@@ -305,7 +481,6 @@ int do_adjtimex(struct timex *txc)
305 481
306 write_seqlock_irq(&xtime_lock); 482 write_seqlock_irq(&xtime_lock);
307 483
308 /* If there are input parameters, then process them */
309 if (txc->modes & ADJ_ADJTIME) { 484 if (txc->modes & ADJ_ADJTIME) {
310 long save_adjust = time_adjust; 485 long save_adjust = time_adjust;
311 486
@@ -315,98 +490,24 @@ int do_adjtimex(struct timex *txc)
315 ntp_update_frequency(); 490 ntp_update_frequency();
316 } 491 }
317 txc->offset = save_adjust; 492 txc->offset = save_adjust;
318 goto adj_done; 493 } else {
319 }
320 if (txc->modes) {
321 long sec;
322
323 if (txc->modes & ADJ_STATUS) {
324 if ((time_status & STA_PLL) &&
325 !(txc->status & STA_PLL)) {
326 time_state = TIME_OK;
327 time_status = STA_UNSYNC;
328 }
329 /* only set allowed bits */
330 time_status &= STA_RONLY;
331 time_status |= txc->status & ~STA_RONLY;
332
333 switch (time_state) {
334 case TIME_OK:
335 start_timer:
336 sec = ts.tv_sec;
337 if (time_status & STA_INS) {
338 time_state = TIME_INS;
339 sec += 86400 - sec % 86400;
340 hrtimer_start(&leap_timer, ktime_set(sec, 0), HRTIMER_MODE_ABS);
341 } else if (time_status & STA_DEL) {
342 time_state = TIME_DEL;
343 sec += 86400 - (sec + 1) % 86400;
344 hrtimer_start(&leap_timer, ktime_set(sec, 0), HRTIMER_MODE_ABS);
345 }
346 break;
347 case TIME_INS:
348 case TIME_DEL:
349 time_state = TIME_OK;
350 goto start_timer;
351 break;
352 case TIME_WAIT:
353 if (!(time_status & (STA_INS | STA_DEL)))
354 time_state = TIME_OK;
355 break;
356 case TIME_OOP:
357 hrtimer_restart(&leap_timer);
358 break;
359 }
360 }
361
362 if (txc->modes & ADJ_NANO)
363 time_status |= STA_NANO;
364 if (txc->modes & ADJ_MICRO)
365 time_status &= ~STA_NANO;
366
367 if (txc->modes & ADJ_FREQUENCY) {
368 time_freq = (s64)txc->freq * PPM_SCALE;
369 time_freq = min(time_freq, MAXFREQ_SCALED);
370 time_freq = max(time_freq, -MAXFREQ_SCALED);
371 }
372
373 if (txc->modes & ADJ_MAXERROR)
374 time_maxerror = txc->maxerror;
375 if (txc->modes & ADJ_ESTERROR)
376 time_esterror = txc->esterror;
377
378 if (txc->modes & ADJ_TIMECONST) {
379 time_constant = txc->constant;
380 if (!(time_status & STA_NANO))
381 time_constant += 4;
382 time_constant = min(time_constant, (long)MAXTC);
383 time_constant = max(time_constant, 0l);
384 }
385
386 if (txc->modes & ADJ_TAI && txc->constant > 0)
387 time_tai = txc->constant;
388
389 if (txc->modes & ADJ_OFFSET)
390 ntp_update_offset(txc->offset);
391 if (txc->modes & ADJ_TICK)
392 tick_usec = txc->tick;
393 494
394 if (txc->modes & (ADJ_TICK|ADJ_FREQUENCY|ADJ_OFFSET)) 495 /* If there are input parameters, then process them: */
395 ntp_update_frequency(); 496 if (txc->modes)
396 } 497 process_adjtimex_modes(txc, &ts);
397 498
398 txc->offset = shift_right(time_offset * NTP_INTERVAL_FREQ, 499 txc->offset = shift_right(time_offset * NTP_INTERVAL_FREQ,
399 NTP_SCALE_SHIFT); 500 NTP_SCALE_SHIFT);
400 if (!(time_status & STA_NANO)) 501 if (!(time_status & STA_NANO))
401 txc->offset /= NSEC_PER_USEC; 502 txc->offset /= NSEC_PER_USEC;
503 }
402 504
403adj_done:
404 result = time_state; /* mostly `TIME_OK' */ 505 result = time_state; /* mostly `TIME_OK' */
405 if (time_status & (STA_UNSYNC|STA_CLOCKERR)) 506 if (time_status & (STA_UNSYNC|STA_CLOCKERR))
406 result = TIME_ERROR; 507 result = TIME_ERROR;
407 508
408 txc->freq = shift_right((time_freq >> PPM_SCALE_INV_SHIFT) * 509 txc->freq = shift_right((time_freq >> PPM_SCALE_INV_SHIFT) *
409 (s64)PPM_SCALE_INV, NTP_SCALE_SHIFT); 510 PPM_SCALE_INV, NTP_SCALE_SHIFT);
410 txc->maxerror = time_maxerror; 511 txc->maxerror = time_maxerror;
411 txc->esterror = time_esterror; 512 txc->esterror = time_esterror;
412 txc->status = time_status; 513 txc->status = time_status;
@@ -425,6 +526,7 @@ adj_done:
425 txc->calcnt = 0; 526 txc->calcnt = 0;
426 txc->errcnt = 0; 527 txc->errcnt = 0;
427 txc->stbcnt = 0; 528 txc->stbcnt = 0;
529
428 write_sequnlock_irq(&xtime_lock); 530 write_sequnlock_irq(&xtime_lock);
429 531
430 txc->time.tv_sec = ts.tv_sec; 532 txc->time.tv_sec = ts.tv_sec;
@@ -440,6 +542,8 @@ adj_done:
440static int __init ntp_tick_adj_setup(char *str) 542static int __init ntp_tick_adj_setup(char *str)
441{ 543{
442 ntp_tick_adj = simple_strtol(str, NULL, 0); 544 ntp_tick_adj = simple_strtol(str, NULL, 0);
545 ntp_tick_adj <<= NTP_SCALE_SHIFT;
546
443 return 1; 547 return 1;
444} 548}
445 549
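The boot-parameter hunk directly above upscales ntp_tick_adj once at parse time, which is what lets the earlier ntp_update_frequency() hunk add it without shifting on every call. The resulting tick-length math, restated:

/*
 * second_length = (tick_usec * NSEC_PER_USEC * USER_HZ) << NTP_SCALE_SHIFT
 *               + ntp_tick_adj                (pre-shifted at boot)
 *               + time_freq;
 * tick_nsec        = (second_length / HZ) >> NTP_SCALE_SHIFT;
 * tick_length_base =  second_length / NTP_INTERVAL_FREQ;
 */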
diff --git a/kernel/time/timecompare.c b/kernel/time/timecompare.c
new file mode 100644
index 000000000000..71e7f1a19156
--- /dev/null
+++ b/kernel/time/timecompare.c
@@ -0,0 +1,191 @@
1/*
2 * Copyright (C) 2009 Intel Corporation.
3 * Author: Patrick Ohly <patrick.ohly@intel.com>
4 *
5 * This program is free software; you can redistribute it and/or modify
6 * it under the terms of the GNU General Public License as published by
7 * the Free Software Foundation; either version 2 of the License, or
8 * (at your option) any later version.
9 *
10 * This program is distributed in the hope that it will be useful,
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 * GNU General Public License for more details.
14 *
15 * You should have received a copy of the GNU General Public License
16 * along with this program; if not, write to the Free Software
17 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
18 */
19
20#include <linux/timecompare.h>
21#include <linux/module.h>
22#include <linux/math64.h>
23
24/*
25 * fixed point arithmetic scale factor for skew
26 *
27 * Usually one would measure skew in ppb (parts per billion, 1e9), but
 28 * using a power of 2 simplifies the math.
29 */
30#define TIMECOMPARE_SKEW_RESOLUTION (((s64)1)<<30)
31
32ktime_t timecompare_transform(struct timecompare *sync,
33 u64 source_tstamp)
34{
35 u64 nsec;
36
37 nsec = source_tstamp + sync->offset;
38 nsec += (s64)(source_tstamp - sync->last_update) * sync->skew /
39 TIMECOMPARE_SKEW_RESOLUTION;
40
41 return ns_to_ktime(nsec);
42}
43EXPORT_SYMBOL(timecompare_transform);
44
45int timecompare_offset(struct timecompare *sync,
46 s64 *offset,
47 u64 *source_tstamp)
48{
49 u64 start_source = 0, end_source = 0;
50 struct {
51 s64 offset;
52 s64 duration_target;
53 } buffer[10], sample, *samples;
54 int counter = 0, i;
55 int used;
56 int index;
57 int num_samples = sync->num_samples;
58
59 if (num_samples > sizeof(buffer)/sizeof(buffer[0])) {
60 samples = kmalloc(sizeof(*samples) * num_samples, GFP_ATOMIC);
61 if (!samples) {
62 samples = buffer;
63 num_samples = sizeof(buffer)/sizeof(buffer[0]);
64 }
65 } else {
66 samples = buffer;
67 }
68
69 /* run until we have enough valid samples, but do not try forever */
70 i = 0;
71 counter = 0;
72 while (1) {
73 u64 ts;
74 ktime_t start, end;
75
76 start = sync->target();
77 ts = timecounter_read(sync->source);
78 end = sync->target();
79
80 if (!i)
81 start_source = ts;
82
83 /* ignore negative durations */
84 sample.duration_target = ktime_to_ns(ktime_sub(end, start));
85 if (sample.duration_target >= 0) {
86 /*
 87 * assume symmetric delay to and from source:
88 * average target time corresponds to measured
89 * source time
90 */
91 sample.offset =
92 ktime_to_ns(ktime_add(end, start)) / 2 -
93 ts;
94
95 /* simple insertion sort based on duration */
96 index = counter - 1;
97 while (index >= 0) {
98 if (samples[index].duration_target <
99 sample.duration_target)
100 break;
101 samples[index + 1] = samples[index];
102 index--;
103 }
104 samples[index + 1] = sample;
105 counter++;
106 }
107
108 i++;
109 if (counter >= num_samples || i >= 100000) {
110 end_source = ts;
111 break;
112 }
113 }
114
115 *source_tstamp = (end_source + start_source) / 2;
116
117 /* remove outliers by only using 75% of the samples */
118 used = counter * 3 / 4;
119 if (!used)
120 used = counter;
121 if (used) {
122 /* calculate average */
123 s64 off = 0;
124 for (index = 0; index < used; index++)
125 off += samples[index].offset;
126 *offset = div_s64(off, used);
127 }
128
129 if (samples && samples != buffer)
130 kfree(samples);
131
132 return used;
133}
134EXPORT_SYMBOL(timecompare_offset);
135
136void __timecompare_update(struct timecompare *sync,
137 u64 source_tstamp)
138{
139 s64 offset;
140 u64 average_time;
141
142 if (!timecompare_offset(sync, &offset, &average_time))
143 return;
144
145 if (!sync->last_update) {
146 sync->last_update = average_time;
147 sync->offset = offset;
148 sync->skew = 0;
149 } else {
150 s64 delta_nsec = average_time - sync->last_update;
151
152 /* avoid division by negative or small deltas */
153 if (delta_nsec >= 10000) {
154 s64 delta_offset_nsec = offset - sync->offset;
155 s64 skew; /* delta_offset_nsec *
156 TIMECOMPARE_SKEW_RESOLUTION /
157 delta_nsec */
158 u64 divisor;
159
160 /* div_s64() is limited to 32 bit divisor */
161 skew = delta_offset_nsec * TIMECOMPARE_SKEW_RESOLUTION;
162 divisor = delta_nsec;
163 while (unlikely(divisor >= ((s64)1) << 32)) {
164 /* divide both by 2; beware, right shift
 165 of negative value is implementation-
 166 defined and can only be used for
167 the positive divisor */
168 skew = div_s64(skew, 2);
169 divisor >>= 1;
170 }
171 skew = div_s64(skew, divisor);
172
173 /*
174 * Calculate new overall skew as 4/16 the
175 * old value and 12/16 the new one. This is
176 * a rather arbitrary tradeoff between
177 * only using the latest measurement (0/16 and
178 * 16/16) and even more weight on past measurements.
179 */
180#define TIMECOMPARE_NEW_SKEW_PER_16 12
181 sync->skew =
182 div_s64((16 - TIMECOMPARE_NEW_SKEW_PER_16) *
183 sync->skew +
184 TIMECOMPARE_NEW_SKEW_PER_16 * skew,
185 16);
186 sync->last_update = average_time;
187 sync->offset = offset;
188 }
189 }
190}
191EXPORT_SYMBOL(__timecompare_update);
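A hedged sketch of wiring the new timecompare code to a device clock; the field names follow their use in timecompare_offset() above, the device timecounter is assumed to be initialized elsewhere, and ktime_get serves as the system-time callback:

static struct timecounter sketch_dev_tc;        /* assumed initialized */

static struct timecompare sketch_sync = {
        .source         = &sketch_dev_tc,
        .target         = ktime_get,    /* reference clock readout */
        .num_samples    = 10,           /* fits the on-stack sample buffer */
};

static ktime_t sketch_dev_to_sys(u64 dev_tstamp)
{
        /* re-measure offset and skew, then map the device timestamp */
        __timecompare_update(&sketch_sync, dev_tstamp);
        return timecompare_transform(&sketch_sync, dev_tstamp);
}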
diff --git a/kernel/timer.c b/kernel/timer.c
index ef1c385bc572..b4555568b4e4 100644
--- a/kernel/timer.c
+++ b/kernel/timer.c
@@ -600,11 +600,14 @@ static struct tvec_base *lock_timer_base(struct timer_list *timer,
600 } 600 }
601} 601}
602 602
603int __mod_timer(struct timer_list *timer, unsigned long expires) 603static inline int
604__mod_timer(struct timer_list *timer, unsigned long expires, bool pending_only)
604{ 605{
605 struct tvec_base *base, *new_base; 606 struct tvec_base *base, *new_base;
606 unsigned long flags; 607 unsigned long flags;
607 int ret = 0; 608 int ret;
609
610 ret = 0;
608 611
609 timer_stats_timer_set_start_info(timer); 612 timer_stats_timer_set_start_info(timer);
610 BUG_ON(!timer->function); 613 BUG_ON(!timer->function);
@@ -614,6 +617,9 @@ int __mod_timer(struct timer_list *timer, unsigned long expires)
614 if (timer_pending(timer)) { 617 if (timer_pending(timer)) {
615 detach_timer(timer, 0); 618 detach_timer(timer, 0);
616 ret = 1; 619 ret = 1;
620 } else {
621 if (pending_only)
622 goto out_unlock;
617 } 623 }
618 624
619 debug_timer_activate(timer); 625 debug_timer_activate(timer);
@@ -640,42 +646,28 @@ int __mod_timer(struct timer_list *timer, unsigned long expires)
640 646
641 timer->expires = expires; 647 timer->expires = expires;
642 internal_add_timer(base, timer); 648 internal_add_timer(base, timer);
649
650out_unlock:
643 spin_unlock_irqrestore(&base->lock, flags); 651 spin_unlock_irqrestore(&base->lock, flags);
644 652
645 return ret; 653 return ret;
646} 654}
647 655
648EXPORT_SYMBOL(__mod_timer);
649
650/** 656/**
651 * add_timer_on - start a timer on a particular CPU 657 * mod_timer_pending - modify a pending timer's timeout
652 * @timer: the timer to be added 658 * @timer: the pending timer to be modified
653 * @cpu: the CPU to start it on 659 * @expires: new timeout in jiffies
654 * 660 *
655 * This is not very scalable on SMP. Double adds are not possible. 661 * mod_timer_pending() is the same for pending timers as mod_timer(),
662 * but will not re-activate and modify already deleted timers.
663 *
664 * It is useful for unserialized use of timers.
656 */ 665 */
657void add_timer_on(struct timer_list *timer, int cpu) 666int mod_timer_pending(struct timer_list *timer, unsigned long expires)
658{ 667{
659 struct tvec_base *base = per_cpu(tvec_bases, cpu); 668 return __mod_timer(timer, expires, true);
660 unsigned long flags;
661
662 timer_stats_timer_set_start_info(timer);
663 BUG_ON(timer_pending(timer) || !timer->function);
664 spin_lock_irqsave(&base->lock, flags);
665 timer_set_base(timer, base);
666 debug_timer_activate(timer);
667 internal_add_timer(base, timer);
668 /*
669 * Check whether the other CPU is idle and needs to be
670 * triggered to reevaluate the timer wheel when nohz is
671 * active. We are protected against the other CPU fiddling
672 * with the timer by holding the timer base lock. This also
673 * makes sure that a CPU on the way to idle can not evaluate
674 * the timer wheel.
675 */
676 wake_up_idle_cpu(cpu);
677 spin_unlock_irqrestore(&base->lock, flags);
678} 669}
670EXPORT_SYMBOL(mod_timer_pending);
679 671
680/** 672/**
681 * mod_timer - modify a timer's timeout 673 * mod_timer - modify a timer's timeout
@@ -699,9 +691,6 @@ void add_timer_on(struct timer_list *timer, int cpu)
699 */ 691 */
700int mod_timer(struct timer_list *timer, unsigned long expires) 692int mod_timer(struct timer_list *timer, unsigned long expires)
701{ 693{
702 BUG_ON(!timer->function);
703
704 timer_stats_timer_set_start_info(timer);
705 /* 694 /*
706 * This is a common optimization triggered by the 695 * This is a common optimization triggered by the
707 * networking code - if the timer is re-modified 696 * networking code - if the timer is re-modified
@@ -710,12 +699,62 @@ int mod_timer(struct timer_list *timer, unsigned long expires)
710 if (timer->expires == expires && timer_pending(timer)) 699 if (timer->expires == expires && timer_pending(timer))
711 return 1; 700 return 1;
712 701
713 return __mod_timer(timer, expires); 702 return __mod_timer(timer, expires, false);
714} 703}
715
716EXPORT_SYMBOL(mod_timer); 704EXPORT_SYMBOL(mod_timer);
717 705
718/** 706/**
707 * add_timer - start a timer
708 * @timer: the timer to be added
709 *
710 * The kernel will do a ->function(->data) callback from the
711 * timer interrupt at the ->expires point in the future. The
712 * current time is 'jiffies'.
713 *
714 * The timer's ->expires, ->function (and if the handler uses it, ->data)
 715 * fields must be set prior to calling this function.
716 *
717 * Timers with an ->expires field in the past will be executed in the next
718 * timer tick.
719 */
720void add_timer(struct timer_list *timer)
721{
722 BUG_ON(timer_pending(timer));
723 mod_timer(timer, timer->expires);
724}
725EXPORT_SYMBOL(add_timer);
726
727/**
728 * add_timer_on - start a timer on a particular CPU
729 * @timer: the timer to be added
730 * @cpu: the CPU to start it on
731 *
732 * This is not very scalable on SMP. Double adds are not possible.
733 */
734void add_timer_on(struct timer_list *timer, int cpu)
735{
736 struct tvec_base *base = per_cpu(tvec_bases, cpu);
737 unsigned long flags;
738
739 timer_stats_timer_set_start_info(timer);
740 BUG_ON(timer_pending(timer) || !timer->function);
741 spin_lock_irqsave(&base->lock, flags);
742 timer_set_base(timer, base);
743 debug_timer_activate(timer);
744 internal_add_timer(base, timer);
745 /*
746 * Check whether the other CPU is idle and needs to be
747 * triggered to reevaluate the timer wheel when nohz is
748 * active. We are protected against the other CPU fiddling
749 * with the timer by holding the timer base lock. This also
750 * makes sure that a CPU on the way to idle can not evaluate
751 * the timer wheel.
752 */
753 wake_up_idle_cpu(cpu);
754 spin_unlock_irqrestore(&base->lock, flags);
755}
756
757/**
 719 * del_timer - deactivate a timer. 758 * del_timer - deactivate a timer.
720 * @timer: the timer to be deactivated 759 * @timer: the timer to be deactivated
721 * 760 *
@@ -744,7 +783,6 @@ int del_timer(struct timer_list *timer)
744 783
745 return ret; 784 return ret;
746} 785}
747
748EXPORT_SYMBOL(del_timer); 786EXPORT_SYMBOL(del_timer);
749 787
750#ifdef CONFIG_SMP 788#ifdef CONFIG_SMP
@@ -778,7 +816,6 @@ out:
778 816
779 return ret; 817 return ret;
780} 818}
781
782EXPORT_SYMBOL(try_to_del_timer_sync); 819EXPORT_SYMBOL(try_to_del_timer_sync);
783 820
784/** 821/**
@@ -816,7 +853,6 @@ int del_timer_sync(struct timer_list *timer)
816 cpu_relax(); 853 cpu_relax();
817 } 854 }
818} 855}
819
820EXPORT_SYMBOL(del_timer_sync); 856EXPORT_SYMBOL(del_timer_sync);
821#endif 857#endif
822 858
@@ -1314,7 +1350,7 @@ signed long __sched schedule_timeout(signed long timeout)
1314 expire = timeout + jiffies; 1350 expire = timeout + jiffies;
1315 1351
1316 setup_timer_on_stack(&timer, process_timeout, (unsigned long)current); 1352 setup_timer_on_stack(&timer, process_timeout, (unsigned long)current);
1317 __mod_timer(&timer, expire); 1353 __mod_timer(&timer, expire, false);
1318 schedule(); 1354 schedule();
1319 del_singleshot_timer_sync(&timer); 1355 del_singleshot_timer_sync(&timer);
1320 1356
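A hedged sketch contrasting mod_timer() with the mod_timer_pending() variant introduced above: the latter is a no-op on a timer that is not pending, so it cannot accidentally re-arm a timer that another path has just deleted:

static struct timer_list sketch_timer;          /* assumed set up elsewhere */

static void sketch_extend_deadline(void)
{
        /* re-arms even if the timer already fired or was deleted */
        mod_timer(&sketch_timer, jiffies + HZ);

        /* only pushes the deadline out while the timer is still pending */
        mod_timer_pending(&sketch_timer, jiffies + 2 * HZ);
}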
diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig
index 8a4136096d7d..23b96ebbf893 100644
--- a/kernel/trace/Kconfig
+++ b/kernel/trace/Kconfig
@@ -99,11 +99,10 @@ config FUNCTION_GRAPH_TRACER
99 help 99 help
100 Enable the kernel to trace a function at both its return 100 Enable the kernel to trace a function at both its return
101 and its entry. 101 and its entry.
102 It's first purpose is to trace the duration of functions and 102 Its first purpose is to trace the duration of functions and
103 draw a call graph for each thread with some informations like 103 draw a call graph for each thread with some information like
104 the return value. 104 the return value. This is done by setting the current return
105 This is done by setting the current return address on the current 105 address on the current task structure into a stack of calls.
106 task structure into a stack of calls.
107 106
108 107
109config IRQSOFF_TRACER 108config IRQSOFF_TRACER
diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile
index 0e45c206c2f9..2630f5121ec1 100644
--- a/kernel/trace/Makefile
+++ b/kernel/trace/Makefile
@@ -45,5 +45,6 @@ obj-$(CONFIG_EVENT_TRACER) += events.o
45obj-$(CONFIG_EVENT_TRACER) += trace_export.o 45obj-$(CONFIG_EVENT_TRACER) += trace_export.o
46obj-$(CONFIG_FTRACE_SYSCALLS) += trace_syscalls.o 46obj-$(CONFIG_FTRACE_SYSCALLS) += trace_syscalls.o
47obj-$(CONFIG_EVENT_PROFILE) += trace_event_profile.o 47obj-$(CONFIG_EVENT_PROFILE) += trace_event_profile.o
48obj-$(CONFIG_EVENT_TRACER) += trace_events_filter.o
48 49
49libftrace-y := ftrace.o 50libftrace-y := ftrace.o
diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c
index b171778e3863..947c5b3f90c4 100644
--- a/kernel/trace/blktrace.c
+++ b/kernel/trace/blktrace.c
@@ -30,7 +30,7 @@
30static unsigned int blktrace_seq __read_mostly = 1; 30static unsigned int blktrace_seq __read_mostly = 1;
31 31
32static struct trace_array *blk_tr; 32static struct trace_array *blk_tr;
33static int __read_mostly blk_tracer_enabled; 33static bool blk_tracer_enabled __read_mostly;
34 34
35/* Select an alternative, minimalistic output than the original one */ 35/* Select an alternative, minimalistic output than the original one */
36#define TRACE_BLK_OPT_CLASSIC 0x1 36#define TRACE_BLK_OPT_CLASSIC 0x1
@@ -47,10 +47,9 @@ static struct tracer_flags blk_tracer_flags = {
47}; 47};
48 48
49/* Global reference count of probes */ 49/* Global reference count of probes */
50static DEFINE_MUTEX(blk_probe_mutex);
51static atomic_t blk_probes_ref = ATOMIC_INIT(0); 50static atomic_t blk_probes_ref = ATOMIC_INIT(0);
52 51
53static int blk_register_tracepoints(void); 52static void blk_register_tracepoints(void);
54static void blk_unregister_tracepoints(void); 53static void blk_unregister_tracepoints(void);
55 54
56/* 55/*
@@ -60,22 +59,39 @@ static void trace_note(struct blk_trace *bt, pid_t pid, int action,
60 const void *data, size_t len) 59 const void *data, size_t len)
61{ 60{
62 struct blk_io_trace *t; 61 struct blk_io_trace *t;
62 struct ring_buffer_event *event = NULL;
63 int pc = 0;
64 int cpu = smp_processor_id();
65 bool blk_tracer = blk_tracer_enabled;
66
67 if (blk_tracer) {
68 pc = preempt_count();
69 event = trace_buffer_lock_reserve(blk_tr, TRACE_BLK,
70 sizeof(*t) + len,
71 0, pc);
72 if (!event)
73 return;
74 t = ring_buffer_event_data(event);
75 goto record_it;
76 }
63 77
64 if (!bt->rchan) 78 if (!bt->rchan)
65 return; 79 return;
66 80
67 t = relay_reserve(bt->rchan, sizeof(*t) + len); 81 t = relay_reserve(bt->rchan, sizeof(*t) + len);
68 if (t) { 82 if (t) {
69 const int cpu = smp_processor_id();
70
71 t->magic = BLK_IO_TRACE_MAGIC | BLK_IO_TRACE_VERSION; 83 t->magic = BLK_IO_TRACE_MAGIC | BLK_IO_TRACE_VERSION;
72 t->time = ktime_to_ns(ktime_get()); 84 t->time = ktime_to_ns(ktime_get());
85record_it:
73 t->device = bt->dev; 86 t->device = bt->dev;
74 t->action = action; 87 t->action = action;
75 t->pid = pid; 88 t->pid = pid;
76 t->cpu = cpu; 89 t->cpu = cpu;
77 t->pdu_len = len; 90 t->pdu_len = len;
78 memcpy((void *) t + sizeof(*t), data, len); 91 memcpy((void *) t + sizeof(*t), data, len);
92
93 if (blk_tracer)
94 trace_buffer_unlock_commit(blk_tr, event, 0, pc);
79 } 95 }
80} 96}
81 97
@@ -111,14 +127,8 @@ void __trace_note_message(struct blk_trace *bt, const char *fmt, ...)
111 unsigned long flags; 127 unsigned long flags;
112 char *buf; 128 char *buf;
113 129
114 if (blk_tr) { 130 if (unlikely(bt->trace_state != Blktrace_running &&
115 va_start(args, fmt); 131 !blk_tracer_enabled))
116 ftrace_vprintk(fmt, args);
117 va_end(args);
118 return;
119 }
120
121 if (!bt->msg_data)
122 return; 132 return;
123 133
124 local_irq_save(flags); 134 local_irq_save(flags);
@@ -148,8 +158,8 @@ static int act_log_check(struct blk_trace *bt, u32 what, sector_t sector,
148/* 158/*
149 * Data direction bit lookup 159 * Data direction bit lookup
150 */ 160 */
151static u32 ddir_act[2] __read_mostly = { BLK_TC_ACT(BLK_TC_READ), 161static const u32 ddir_act[2] = { BLK_TC_ACT(BLK_TC_READ),
152 BLK_TC_ACT(BLK_TC_WRITE) }; 162 BLK_TC_ACT(BLK_TC_WRITE) };
153 163
154/* The ilog2() calls fall out because they're constant */ 164/* The ilog2() calls fall out because they're constant */
155#define MASK_TC_BIT(rw, __name) ((rw & (1 << BIO_RW_ ## __name)) << \ 165#define MASK_TC_BIT(rw, __name) ((rw & (1 << BIO_RW_ ## __name)) << \
@@ -169,9 +179,9 @@ static void __blk_add_trace(struct blk_trace *bt, sector_t sector, int bytes,
169 unsigned long *sequence; 179 unsigned long *sequence;
170 pid_t pid; 180 pid_t pid;
171 int cpu, pc = 0; 181 int cpu, pc = 0;
182 bool blk_tracer = blk_tracer_enabled;
172 183
173 if (unlikely(bt->trace_state != Blktrace_running || 184 if (unlikely(bt->trace_state != Blktrace_running && !blk_tracer))
174 !blk_tracer_enabled))
175 return; 185 return;
176 186
177 what |= ddir_act[rw & WRITE]; 187 what |= ddir_act[rw & WRITE];
@@ -186,7 +196,7 @@ static void __blk_add_trace(struct blk_trace *bt, sector_t sector, int bytes,
186 return; 196 return;
187 cpu = raw_smp_processor_id(); 197 cpu = raw_smp_processor_id();
188 198
189 if (blk_tr) { 199 if (blk_tracer) {
190 tracing_record_cmdline(current); 200 tracing_record_cmdline(current);
191 201
192 pc = preempt_count(); 202 pc = preempt_count();
@@ -236,7 +246,7 @@ record_it:
236 if (pdu_len) 246 if (pdu_len)
237 memcpy((void *) t + sizeof(*t), pdu_data, pdu_len); 247 memcpy((void *) t + sizeof(*t), pdu_data, pdu_len);
238 248
239 if (blk_tr) { 249 if (blk_tracer) {
240 trace_buffer_unlock_commit(blk_tr, event, 0, pc); 250 trace_buffer_unlock_commit(blk_tr, event, 0, pc);
241 return; 251 return;
242 } 252 }
@@ -248,7 +258,7 @@ record_it:
248static struct dentry *blk_tree_root; 258static struct dentry *blk_tree_root;
249static DEFINE_MUTEX(blk_tree_mutex); 259static DEFINE_MUTEX(blk_tree_mutex);
250 260
251static void blk_trace_cleanup(struct blk_trace *bt) 261static void blk_trace_free(struct blk_trace *bt)
252{ 262{
253 debugfs_remove(bt->msg_file); 263 debugfs_remove(bt->msg_file);
254 debugfs_remove(bt->dropped_file); 264 debugfs_remove(bt->dropped_file);
@@ -256,10 +266,13 @@ static void blk_trace_cleanup(struct blk_trace *bt)
256 free_percpu(bt->sequence); 266 free_percpu(bt->sequence);
257 free_percpu(bt->msg_data); 267 free_percpu(bt->msg_data);
258 kfree(bt); 268 kfree(bt);
259 mutex_lock(&blk_probe_mutex); 269}
270
271static void blk_trace_cleanup(struct blk_trace *bt)
272{
273 blk_trace_free(bt);
260 if (atomic_dec_and_test(&blk_probes_ref)) 274 if (atomic_dec_and_test(&blk_probes_ref))
261 blk_unregister_tracepoints(); 275 blk_unregister_tracepoints();
262 mutex_unlock(&blk_probe_mutex);
263} 276}
264 277
265int blk_trace_remove(struct request_queue *q) 278int blk_trace_remove(struct request_queue *q)
@@ -270,8 +283,7 @@ int blk_trace_remove(struct request_queue *q)
270 if (!bt) 283 if (!bt)
271 return -EINVAL; 284 return -EINVAL;
272 285
273 if (bt->trace_state == Blktrace_setup || 286 if (bt->trace_state != Blktrace_running)
274 bt->trace_state == Blktrace_stopped)
275 blk_trace_cleanup(bt); 287 blk_trace_cleanup(bt);
276 288
277 return 0; 289 return 0;
@@ -414,11 +426,11 @@ int do_blk_trace_setup(struct request_queue *q, char *name, dev_t dev,
414 if (buts->name[i] == '/') 426 if (buts->name[i] == '/')
415 buts->name[i] = '_'; 427 buts->name[i] = '_';
416 428
417 ret = -ENOMEM;
418 bt = kzalloc(sizeof(*bt), GFP_KERNEL); 429 bt = kzalloc(sizeof(*bt), GFP_KERNEL);
419 if (!bt) 430 if (!bt)
420 goto err; 431 return -ENOMEM;
421 432
433 ret = -ENOMEM;
422 bt->sequence = alloc_percpu(unsigned long); 434 bt->sequence = alloc_percpu(unsigned long);
423 if (!bt->sequence) 435 if (!bt->sequence)
424 goto err; 436 goto err;
@@ -429,11 +441,15 @@ int do_blk_trace_setup(struct request_queue *q, char *name, dev_t dev,
429 441
430 ret = -ENOENT; 442 ret = -ENOENT;
431 443
444 mutex_lock(&blk_tree_mutex);
432 if (!blk_tree_root) { 445 if (!blk_tree_root) {
433 blk_tree_root = debugfs_create_dir("block", NULL); 446 blk_tree_root = debugfs_create_dir("block", NULL);
434 if (!blk_tree_root) 447 if (!blk_tree_root) {
435 return -ENOMEM; 448 mutex_unlock(&blk_tree_mutex);
449 goto err;
450 }
436 } 451 }
452 mutex_unlock(&blk_tree_mutex);
437 453
438 dir = debugfs_create_dir(buts->name, blk_tree_root); 454 dir = debugfs_create_dir(buts->name, blk_tree_root);
439 455
@@ -471,14 +487,6 @@ int do_blk_trace_setup(struct request_queue *q, char *name, dev_t dev,
471 bt->pid = buts->pid; 487 bt->pid = buts->pid;
472 bt->trace_state = Blktrace_setup; 488 bt->trace_state = Blktrace_setup;
473 489
474 mutex_lock(&blk_probe_mutex);
475 if (atomic_add_return(1, &blk_probes_ref) == 1) {
476 ret = blk_register_tracepoints();
477 if (ret)
478 goto probe_err;
479 }
480 mutex_unlock(&blk_probe_mutex);
481
482 ret = -EBUSY; 490 ret = -EBUSY;
483 old_bt = xchg(&q->blk_trace, bt); 491 old_bt = xchg(&q->blk_trace, bt);
484 if (old_bt) { 492 if (old_bt) {
@@ -486,22 +494,12 @@ int do_blk_trace_setup(struct request_queue *q, char *name, dev_t dev,
486 goto err; 494 goto err;
487 } 495 }
488 496
497 if (atomic_inc_return(&blk_probes_ref) == 1)
498 blk_register_tracepoints();
499
489 return 0; 500 return 0;
490probe_err:
491 atomic_dec(&blk_probes_ref);
492 mutex_unlock(&blk_probe_mutex);
493err: 501err:
494 if (bt) { 502 blk_trace_free(bt);
495 if (bt->msg_file)
496 debugfs_remove(bt->msg_file);
497 if (bt->dropped_file)
498 debugfs_remove(bt->dropped_file);
499 free_percpu(bt->sequence);
500 free_percpu(bt->msg_data);
501 if (bt->rchan)
502 relay_close(bt->rchan);
503 kfree(bt);
504 }
505 return ret; 503 return ret;
506} 504}
507 505
@@ -863,7 +861,7 @@ void blk_add_driver_data(struct request_queue *q,
863} 861}
864EXPORT_SYMBOL_GPL(blk_add_driver_data); 862EXPORT_SYMBOL_GPL(blk_add_driver_data);
865 863
866static int blk_register_tracepoints(void) 864static void blk_register_tracepoints(void)
867{ 865{
868 int ret; 866 int ret;
869 867
@@ -901,7 +899,6 @@ static int blk_register_tracepoints(void)
901 WARN_ON(ret); 899 WARN_ON(ret);
902 ret = register_trace_block_remap(blk_add_trace_remap); 900 ret = register_trace_block_remap(blk_add_trace_remap);
903 WARN_ON(ret); 901 WARN_ON(ret);
904 return 0;
905} 902}
906 903
907static void blk_unregister_tracepoints(void) 904static void blk_unregister_tracepoints(void)
@@ -934,25 +931,31 @@ static void blk_unregister_tracepoints(void)
934static void fill_rwbs(char *rwbs, const struct blk_io_trace *t) 931static void fill_rwbs(char *rwbs, const struct blk_io_trace *t)
935{ 932{
936 int i = 0; 933 int i = 0;
934 int tc = t->action >> BLK_TC_SHIFT;
935
936 if (t->action == BLK_TN_MESSAGE) {
937 rwbs[i++] = 'N';
938 goto out;
939 }
937 940
938 if (t->action & BLK_TC_DISCARD) 941 if (tc & BLK_TC_DISCARD)
939 rwbs[i++] = 'D'; 942 rwbs[i++] = 'D';
940 else if (t->action & BLK_TC_WRITE) 943 else if (tc & BLK_TC_WRITE)
941 rwbs[i++] = 'W'; 944 rwbs[i++] = 'W';
942 else if (t->bytes) 945 else if (t->bytes)
943 rwbs[i++] = 'R'; 946 rwbs[i++] = 'R';
944 else 947 else
945 rwbs[i++] = 'N'; 948 rwbs[i++] = 'N';
946 949
947 if (t->action & BLK_TC_AHEAD) 950 if (tc & BLK_TC_AHEAD)
948 rwbs[i++] = 'A'; 951 rwbs[i++] = 'A';
949 if (t->action & BLK_TC_BARRIER) 952 if (tc & BLK_TC_BARRIER)
950 rwbs[i++] = 'B'; 953 rwbs[i++] = 'B';
951 if (t->action & BLK_TC_SYNC) 954 if (tc & BLK_TC_SYNC)
952 rwbs[i++] = 'S'; 955 rwbs[i++] = 'S';
953 if (t->action & BLK_TC_META) 956 if (tc & BLK_TC_META)
954 rwbs[i++] = 'M'; 957 rwbs[i++] = 'M';
955 958out:
956 rwbs[i] = '\0'; 959 rwbs[i] = '\0';
957} 960}
958 961
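The fill_rwbs() rewrite above depends on the blk_io_trace action layout: the low bits below BLK_TC_SHIFT carry the event type, the bits at and above it carry the category mask. A hedged decoding sketch:

static void sketch_decode_action(u32 action)
{
        u16 what = action & ((1 << BLK_TC_SHIFT) - 1);  /* event type */
        u32 tc   = action >> BLK_TC_SHIFT;              /* category bits */

        if (tc & BLK_TC_WRITE)
                printk(KERN_DEBUG "write event, type %u\n", what);
}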
@@ -979,7 +982,7 @@ static inline unsigned long long t_sector(const struct trace_entry *ent)
979 982
980static inline __u16 t_error(const struct trace_entry *ent) 983static inline __u16 t_error(const struct trace_entry *ent)
981{ 984{
982 return te_blk_io_trace(ent)->sector; 985 return te_blk_io_trace(ent)->error;
983} 986}
984 987
985static __u64 get_pdu_int(const struct trace_entry *ent) 988static __u64 get_pdu_int(const struct trace_entry *ent)
@@ -999,29 +1002,31 @@ static void get_pdu_remap(const struct trace_entry *ent,
999 r->sector = be64_to_cpu(sector); 1002 r->sector = be64_to_cpu(sector);
1000} 1003}
1001 1004
1002static int blk_log_action_iter(struct trace_iterator *iter, const char *act) 1005typedef int (blk_log_action_t) (struct trace_iterator *iter, const char *act);
1006
1007static int blk_log_action_classic(struct trace_iterator *iter, const char *act)
1003{ 1008{
1004 char rwbs[6]; 1009 char rwbs[6];
1005 unsigned long long ts = ns2usecs(iter->ts); 1010 unsigned long long ts = iter->ts;
1006 unsigned long usec_rem = do_div(ts, USEC_PER_SEC); 1011 unsigned long nsec_rem = do_div(ts, NSEC_PER_SEC);
1007 unsigned secs = (unsigned long)ts; 1012 unsigned secs = (unsigned long)ts;
1008 const struct trace_entry *ent = iter->ent; 1013 const struct blk_io_trace *t = te_blk_io_trace(iter->ent);
1009 const struct blk_io_trace *t = (const struct blk_io_trace *)ent;
1010 1014
1011 fill_rwbs(rwbs, t); 1015 fill_rwbs(rwbs, t);
1012 1016
1013 return trace_seq_printf(&iter->seq, 1017 return trace_seq_printf(&iter->seq,
1014 "%3d,%-3d %2d %5d.%06lu %5u %2s %3s ", 1018 "%3d,%-3d %2d %5d.%09lu %5u %2s %3s ",
1015 MAJOR(t->device), MINOR(t->device), iter->cpu, 1019 MAJOR(t->device), MINOR(t->device), iter->cpu,
1016 secs, usec_rem, ent->pid, act, rwbs); 1020 secs, nsec_rem, iter->ent->pid, act, rwbs);
1017} 1021}
1018 1022
1019static int blk_log_action_seq(struct trace_seq *s, const struct blk_io_trace *t, 1023static int blk_log_action(struct trace_iterator *iter, const char *act)
1020 const char *act)
1021{ 1024{
1022 char rwbs[6]; 1025 char rwbs[6];
1026 const struct blk_io_trace *t = te_blk_io_trace(iter->ent);
1027
1023 fill_rwbs(rwbs, t); 1028 fill_rwbs(rwbs, t);
1024 return trace_seq_printf(s, "%3d,%-3d %2s %3s ", 1029 return trace_seq_printf(&iter->seq, "%3d,%-3d %2s %3s ",
1025 MAJOR(t->device), MINOR(t->device), act, rwbs); 1030 MAJOR(t->device), MINOR(t->device), act, rwbs);
1026} 1031}
1027 1032
@@ -1085,6 +1090,17 @@ static int blk_log_split(struct trace_seq *s, const struct trace_entry *ent)
1085 get_pdu_int(ent), cmd); 1090 get_pdu_int(ent), cmd);
1086} 1091}
1087 1092
1093static int blk_log_msg(struct trace_seq *s, const struct trace_entry *ent)
1094{
1095 int ret;
1096 const struct blk_io_trace *t = te_blk_io_trace(ent);
1097
1098 ret = trace_seq_putmem(s, t + 1, t->pdu_len);
1099 if (ret)
1100 return trace_seq_putc(s, '\n');
1101 return ret;
1102}
1103
1088/* 1104/*
1089 * struct tracer operations 1105 * struct tracer operations
1090 */ 1106 */
@@ -1099,11 +1115,7 @@ static void blk_tracer_print_header(struct seq_file *m)
1099 1115
1100static void blk_tracer_start(struct trace_array *tr) 1116static void blk_tracer_start(struct trace_array *tr)
1101{ 1117{
1102 mutex_lock(&blk_probe_mutex); 1118 blk_tracer_enabled = true;
1103 if (atomic_add_return(1, &blk_probes_ref) == 1)
1104 if (blk_register_tracepoints())
1105 atomic_dec(&blk_probes_ref);
1106 mutex_unlock(&blk_probe_mutex);
1107 trace_flags &= ~TRACE_ITER_CONTEXT_INFO; 1119 trace_flags &= ~TRACE_ITER_CONTEXT_INFO;
1108} 1120}
1109 1121
@@ -1111,38 +1123,24 @@ static int blk_tracer_init(struct trace_array *tr)
1111{ 1123{
1112 blk_tr = tr; 1124 blk_tr = tr;
1113 blk_tracer_start(tr); 1125 blk_tracer_start(tr);
1114 mutex_lock(&blk_probe_mutex);
1115 blk_tracer_enabled++;
1116 mutex_unlock(&blk_probe_mutex);
1117 return 0; 1126 return 0;
1118} 1127}
1119 1128
1120static void blk_tracer_stop(struct trace_array *tr) 1129static void blk_tracer_stop(struct trace_array *tr)
1121{ 1130{
1131 blk_tracer_enabled = false;
1122 trace_flags |= TRACE_ITER_CONTEXT_INFO; 1132 trace_flags |= TRACE_ITER_CONTEXT_INFO;
1123 mutex_lock(&blk_probe_mutex);
1124 if (atomic_dec_and_test(&blk_probes_ref))
1125 blk_unregister_tracepoints();
1126 mutex_unlock(&blk_probe_mutex);
1127} 1133}
1128 1134
1129static void blk_tracer_reset(struct trace_array *tr) 1135static void blk_tracer_reset(struct trace_array *tr)
1130{ 1136{
1131 if (!atomic_read(&blk_probes_ref))
1132 return;
1133
1134 mutex_lock(&blk_probe_mutex);
1135 blk_tracer_enabled--;
1136 WARN_ON(blk_tracer_enabled < 0);
1137 mutex_unlock(&blk_probe_mutex);
1138
1139 blk_tracer_stop(tr); 1137 blk_tracer_stop(tr);
1140} 1138}
1141 1139
1142static struct { 1140static const struct {
1143 const char *act[2]; 1141 const char *act[2];
1144 int (*print)(struct trace_seq *s, const struct trace_entry *ent); 1142 int (*print)(struct trace_seq *s, const struct trace_entry *ent);
1145} what2act[] __read_mostly = { 1143} what2act[] = {
1146 [__BLK_TA_QUEUE] = {{ "Q", "queue" }, blk_log_generic }, 1144 [__BLK_TA_QUEUE] = {{ "Q", "queue" }, blk_log_generic },
1147 [__BLK_TA_BACKMERGE] = {{ "M", "backmerge" }, blk_log_generic }, 1145 [__BLK_TA_BACKMERGE] = {{ "M", "backmerge" }, blk_log_generic },
1148 [__BLK_TA_FRONTMERGE] = {{ "F", "frontmerge" }, blk_log_generic }, 1146 [__BLK_TA_FRONTMERGE] = {{ "F", "frontmerge" }, blk_log_generic },
@@ -1160,29 +1158,48 @@ static struct {
1160 [__BLK_TA_REMAP] = {{ "A", "remap" }, blk_log_remap }, 1158 [__BLK_TA_REMAP] = {{ "A", "remap" }, blk_log_remap },
1161}; 1159};
1162 1160
1163static enum print_line_t blk_trace_event_print(struct trace_iterator *iter, 1161static enum print_line_t print_one_line(struct trace_iterator *iter,
1164 int flags) 1162 bool classic)
1165{ 1163{
1166 struct trace_seq *s = &iter->seq; 1164 struct trace_seq *s = &iter->seq;
1167 const struct blk_io_trace *t = (struct blk_io_trace *)iter->ent; 1165 const struct blk_io_trace *t;
1168 const u16 what = t->action & ((1 << BLK_TC_SHIFT) - 1); 1166 u16 what;
1169 int ret; 1167 int ret;
1168 bool long_act;
1169 blk_log_action_t *log_action;
1170 1170
1171 if (!trace_print_context(iter)) 1171 t = te_blk_io_trace(iter->ent);
1172 return TRACE_TYPE_PARTIAL_LINE; 1172 what = t->action & ((1 << BLK_TC_SHIFT) - 1);
1173 long_act = !!(trace_flags & TRACE_ITER_VERBOSE);
1174 log_action = classic ? &blk_log_action_classic : &blk_log_action;
1173 1175
1174 if (unlikely(what == 0 || what > ARRAY_SIZE(what2act))) 1176 if (t->action == BLK_TN_MESSAGE) {
1177 ret = log_action(iter, long_act ? "message" : "m");
1178 if (ret)
1179 ret = blk_log_msg(s, iter->ent);
1180 goto out;
1181 }
1182
1183 if (unlikely(what == 0 || what >= ARRAY_SIZE(what2act)))
1175 ret = trace_seq_printf(s, "Bad pc action %x\n", what); 1184 ret = trace_seq_printf(s, "Bad pc action %x\n", what);
1176 else { 1185 else {
1177 const bool long_act = !!(trace_flags & TRACE_ITER_VERBOSE); 1186 ret = log_action(iter, what2act[what].act[long_act]);
1178 ret = blk_log_action_seq(s, t, what2act[what].act[long_act]);
1179 if (ret) 1187 if (ret)
1180 ret = what2act[what].print(s, iter->ent); 1188 ret = what2act[what].print(s, iter->ent);
1181 } 1189 }
1182 1190out:
1183 return ret ? TRACE_TYPE_HANDLED : TRACE_TYPE_PARTIAL_LINE; 1191 return ret ? TRACE_TYPE_HANDLED : TRACE_TYPE_PARTIAL_LINE;
1184} 1192}
1185 1193
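print_one_line() folds the classic and ftrace output paths into one body by choosing a blk_log_action_t function pointer up front. A standalone sketch of the same dispatch shape, assuming nothing beyond the C standard library; all names below are made up for illustration:

#include <stdio.h>

typedef int (*log_action_t)(const char *act);

static int log_action_classic(const char *act)
{
	return printf("[classic] %s\n", act) > 0;
}

static int log_action_ftrace(const char *act)
{
	return printf("[ftrace]  %s\n", act) > 0;
}

static int print_one(int classic, const char *act)
{
	log_action_t log_action = classic ? log_action_classic
					  : log_action_ftrace;

	return log_action(act);		/* shared path, no format branching below */
}

int main(void)
{
	print_one(1, "queue");
	print_one(0, "queue");
	return 0;
}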
1194static enum print_line_t blk_trace_event_print(struct trace_iterator *iter,
1195 int flags)
1196{
1197 if (!trace_print_context(iter))
1198 return TRACE_TYPE_PARTIAL_LINE;
1199
1200 return print_one_line(iter, false);
1201}
1202
1186static int blk_trace_synthesize_old_trace(struct trace_iterator *iter) 1203static int blk_trace_synthesize_old_trace(struct trace_iterator *iter)
1187{ 1204{
1188 struct trace_seq *s = &iter->seq; 1205 struct trace_seq *s = &iter->seq;
@@ -1190,7 +1207,7 @@ static int blk_trace_synthesize_old_trace(struct trace_iterator *iter)
1190 const int offset = offsetof(struct blk_io_trace, sector); 1207 const int offset = offsetof(struct blk_io_trace, sector);
1191 struct blk_io_trace old = { 1208 struct blk_io_trace old = {
1192 .magic = BLK_IO_TRACE_MAGIC | BLK_IO_TRACE_VERSION, 1209 .magic = BLK_IO_TRACE_MAGIC | BLK_IO_TRACE_VERSION,
1193 .time = ns2usecs(iter->ts), 1210 .time = iter->ts,
1194 }; 1211 };
1195 1212
1196 if (!trace_seq_putmem(s, &old, offset)) 1213 if (!trace_seq_putmem(s, &old, offset))
@@ -1208,26 +1225,10 @@ blk_trace_event_print_binary(struct trace_iterator *iter, int flags)
1208 1225
1209static enum print_line_t blk_tracer_print_line(struct trace_iterator *iter) 1226static enum print_line_t blk_tracer_print_line(struct trace_iterator *iter)
1210{ 1227{
1211 const struct blk_io_trace *t;
1212 u16 what;
1213 int ret;
1214
1215 if (!(blk_tracer_flags.val & TRACE_BLK_OPT_CLASSIC)) 1228 if (!(blk_tracer_flags.val & TRACE_BLK_OPT_CLASSIC))
1216 return TRACE_TYPE_UNHANDLED; 1229 return TRACE_TYPE_UNHANDLED;
1217 1230
1218 t = (const struct blk_io_trace *)iter->ent; 1231 return print_one_line(iter, true);
1219 what = t->action & ((1 << BLK_TC_SHIFT) - 1);
1220
1221 if (unlikely(what == 0 || what > ARRAY_SIZE(what2act)))
1222 ret = trace_seq_printf(&iter->seq, "Bad pc action %x\n", what);
1223 else {
1224 const bool long_act = !!(trace_flags & TRACE_ITER_VERBOSE);
1225 ret = blk_log_action_iter(iter, what2act[what].act[long_act]);
1226 if (ret)
1227 ret = what2act[what].print(&iter->seq, iter->ent);
1228 }
1229
1230 return ret ? TRACE_TYPE_HANDLED : TRACE_TYPE_PARTIAL_LINE;
1231} 1232}
1232 1233
1233static struct tracer blk_tracer __read_mostly = { 1234static struct tracer blk_tracer __read_mostly = {
@@ -1273,7 +1274,10 @@ static int blk_trace_remove_queue(struct request_queue *q)
1273 if (bt == NULL) 1274 if (bt == NULL)
1274 return -EINVAL; 1275 return -EINVAL;
1275 1276
1276 kfree(bt); 1277 if (atomic_dec_and_test(&blk_probes_ref))
1278 blk_unregister_tracepoints();
1279
1280 blk_trace_free(bt);
1277 return 0; 1281 return 0;
1278} 1282}
1279 1283
@@ -1283,26 +1287,33 @@ static int blk_trace_remove_queue(struct request_queue *q)
1283static int blk_trace_setup_queue(struct request_queue *q, dev_t dev) 1287static int blk_trace_setup_queue(struct request_queue *q, dev_t dev)
1284{ 1288{
1285 struct blk_trace *old_bt, *bt = NULL; 1289 struct blk_trace *old_bt, *bt = NULL;
1286 int ret; 1290 int ret = -ENOMEM;
1287 1291
1288 ret = -ENOMEM;
1289 bt = kzalloc(sizeof(*bt), GFP_KERNEL); 1292 bt = kzalloc(sizeof(*bt), GFP_KERNEL);
1290 if (!bt) 1293 if (!bt)
1291 goto err; 1294 return -ENOMEM;
1295
1296 bt->msg_data = __alloc_percpu(BLK_TN_MAX_MSG, __alignof__(char));
1297 if (!bt->msg_data)
1298 goto free_bt;
1292 1299
1293 bt->dev = dev; 1300 bt->dev = dev;
1294 bt->act_mask = (u16)-1; 1301 bt->act_mask = (u16)-1;
1295 bt->end_lba = -1ULL; 1302 bt->end_lba = -1ULL;
1296 bt->trace_state = Blktrace_running;
1297 1303
1298 old_bt = xchg(&q->blk_trace, bt); 1304 old_bt = xchg(&q->blk_trace, bt);
1299 if (old_bt != NULL) { 1305 if (old_bt != NULL) {
1300 (void)xchg(&q->blk_trace, old_bt); 1306 (void)xchg(&q->blk_trace, old_bt);
1301 kfree(bt);
1302 ret = -EBUSY; 1307 ret = -EBUSY;
1308 goto free_bt;
1303 } 1309 }
1310
1311 if (atomic_inc_return(&blk_probes_ref) == 1)
1312 blk_register_tracepoints();
1304 return 0; 1313 return 0;
1305err: 1314
1315free_bt:
1316 blk_trace_free(bt);
1306 return ret; 1317 return ret;
1307} 1318}
1308 1319
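blk_trace_setup_queue() and blk_trace_remove_queue() now share the tracepoint machinery through blk_probes_ref: the first tracer to come up registers the probes, and the last one to go away unregisters them. A hedged userspace sketch of that first-in/last-out refcount idiom, where plain ints stand in for atomic_t and the register/unregister hooks are stubs:

#include <stdio.h>

static int probes_ref;

static void register_probes(void)   { puts("tracepoints registered"); }
static void unregister_probes(void) { puts("tracepoints unregistered"); }

static void tracer_attach(void)
{
	if (++probes_ref == 1)		/* atomic_inc_return(&blk_probes_ref) == 1 */
		register_probes();
}

static void tracer_detach(void)
{
	if (--probes_ref == 0)		/* atomic_dec_and_test(&blk_probes_ref) */
		unregister_probes();
}

int main(void)
{
	tracer_attach();	/* first user: registers */
	tracer_attach();	/* second user: no-op */
	tracer_detach();	/* still one user left */
	tracer_detach();	/* last user: unregisters */
	return 0;
}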
@@ -1310,72 +1321,6 @@ err:
1310 * sysfs interface to enable and configure tracing 1321 * sysfs interface to enable and configure tracing
1311 */ 1322 */
1312 1323
1313static ssize_t sysfs_blk_trace_enable_show(struct device *dev,
1314 struct device_attribute *attr,
1315 char *buf)
1316{
1317 struct hd_struct *p = dev_to_part(dev);
1318 struct block_device *bdev;
1319 ssize_t ret = -ENXIO;
1320
1321 lock_kernel();
1322 bdev = bdget(part_devt(p));
1323 if (bdev != NULL) {
1324 struct request_queue *q = bdev_get_queue(bdev);
1325
1326 if (q != NULL) {
1327 mutex_lock(&bdev->bd_mutex);
1328 ret = sprintf(buf, "%u\n", !!q->blk_trace);
1329 mutex_unlock(&bdev->bd_mutex);
1330 }
1331
1332 bdput(bdev);
1333 }
1334
1335 unlock_kernel();
1336 return ret;
1337}
1338
1339static ssize_t sysfs_blk_trace_enable_store(struct device *dev,
1340 struct device_attribute *attr,
1341 const char *buf, size_t count)
1342{
1343 struct block_device *bdev;
1344 struct request_queue *q;
1345 struct hd_struct *p;
1346 int value;
1347 ssize_t ret = -ENXIO;
1348
1349 if (count == 0 || sscanf(buf, "%d", &value) != 1)
1350 goto out;
1351
1352 lock_kernel();
1353 p = dev_to_part(dev);
1354 bdev = bdget(part_devt(p));
1355 if (bdev == NULL)
1356 goto out_unlock_kernel;
1357
1358 q = bdev_get_queue(bdev);
1359 if (q == NULL)
1360 goto out_bdput;
1361
1362 mutex_lock(&bdev->bd_mutex);
1363 if (value)
1364 ret = blk_trace_setup_queue(q, bdev->bd_dev);
1365 else
1366 ret = blk_trace_remove_queue(q);
1367 mutex_unlock(&bdev->bd_mutex);
1368
1369 if (ret == 0)
1370 ret = count;
1371out_bdput:
1372 bdput(bdev);
1373out_unlock_kernel:
1374 unlock_kernel();
1375out:
1376 return ret;
1377}
1378
1379static ssize_t sysfs_blk_trace_attr_show(struct device *dev, 1324static ssize_t sysfs_blk_trace_attr_show(struct device *dev,
1380 struct device_attribute *attr, 1325 struct device_attribute *attr,
1381 char *buf); 1326 char *buf);
@@ -1387,8 +1332,7 @@ static ssize_t sysfs_blk_trace_attr_store(struct device *dev,
1387 sysfs_blk_trace_attr_show, \ 1332 sysfs_blk_trace_attr_show, \
1388 sysfs_blk_trace_attr_store) 1333 sysfs_blk_trace_attr_store)
1389 1334
1390static DEVICE_ATTR(enable, S_IRUGO | S_IWUSR, 1335static BLK_TRACE_DEVICE_ATTR(enable);
1391 sysfs_blk_trace_enable_show, sysfs_blk_trace_enable_store);
1392static BLK_TRACE_DEVICE_ATTR(act_mask); 1336static BLK_TRACE_DEVICE_ATTR(act_mask);
1393static BLK_TRACE_DEVICE_ATTR(pid); 1337static BLK_TRACE_DEVICE_ATTR(pid);
1394static BLK_TRACE_DEVICE_ATTR(start_lba); 1338static BLK_TRACE_DEVICE_ATTR(start_lba);
@@ -1408,53 +1352,85 @@ struct attribute_group blk_trace_attr_group = {
1408 .attrs = blk_trace_attrs, 1352 .attrs = blk_trace_attrs,
1409}; 1353};
1410 1354
1411static int blk_str2act_mask(const char *str) 1355static const struct {
1356 int mask;
1357 const char *str;
1358} mask_maps[] = {
1359 { BLK_TC_READ, "read" },
1360 { BLK_TC_WRITE, "write" },
1361 { BLK_TC_BARRIER, "barrier" },
1362 { BLK_TC_SYNC, "sync" },
1363 { BLK_TC_QUEUE, "queue" },
1364 { BLK_TC_REQUEUE, "requeue" },
1365 { BLK_TC_ISSUE, "issue" },
1366 { BLK_TC_COMPLETE, "complete" },
1367 { BLK_TC_FS, "fs" },
1368 { BLK_TC_PC, "pc" },
1369 { BLK_TC_AHEAD, "ahead" },
1370 { BLK_TC_META, "meta" },
1371 { BLK_TC_DISCARD, "discard" },
1372 { BLK_TC_DRV_DATA, "drv_data" },
1373};
1374
1375static int blk_trace_str2mask(const char *str)
1412{ 1376{
1377 int i;
1413 int mask = 0; 1378 int mask = 0;
1414 char *copy = kstrdup(str, GFP_KERNEL), *s; 1379 char *s, *token;
1415 1380
1416 if (copy == NULL) 1381 s = kstrdup(str, GFP_KERNEL);
1382 if (s == NULL)
1417 return -ENOMEM; 1383 return -ENOMEM;
1418 1384 s = strstrip(s);
1419 s = strstrip(copy);
1420 1385
1421 while (1) { 1386 while (1) {
1422 char *sep = strchr(s, ','); 1387 token = strsep(&s, ",");
1423 1388 if (token == NULL)
1424 if (sep != NULL)
1425 *sep = '\0';
1426
1427 if (strcasecmp(s, "barrier") == 0)
1428 mask |= BLK_TC_BARRIER;
1429 else if (strcasecmp(s, "complete") == 0)
1430 mask |= BLK_TC_COMPLETE;
1431 else if (strcasecmp(s, "fs") == 0)
1432 mask |= BLK_TC_FS;
1433 else if (strcasecmp(s, "issue") == 0)
1434 mask |= BLK_TC_ISSUE;
1435 else if (strcasecmp(s, "pc") == 0)
1436 mask |= BLK_TC_PC;
1437 else if (strcasecmp(s, "queue") == 0)
1438 mask |= BLK_TC_QUEUE;
1439 else if (strcasecmp(s, "read") == 0)
1440 mask |= BLK_TC_READ;
1441 else if (strcasecmp(s, "requeue") == 0)
1442 mask |= BLK_TC_REQUEUE;
1443 else if (strcasecmp(s, "sync") == 0)
1444 mask |= BLK_TC_SYNC;
1445 else if (strcasecmp(s, "write") == 0)
1446 mask |= BLK_TC_WRITE;
1447
1448 if (sep == NULL)
1449 break; 1389 break;
1450 1390
1451 s = sep + 1; 1391 if (*token == '\0')
1392 continue;
1393
1394 for (i = 0; i < ARRAY_SIZE(mask_maps); i++) {
1395 if (strcasecmp(token, mask_maps[i].str) == 0) {
1396 mask |= mask_maps[i].mask;
1397 break;
1398 }
1399 }
1400 if (i == ARRAY_SIZE(mask_maps)) {
1401 mask = -EINVAL;
1402 break;
1403 }
1452 } 1404 }
1453 kfree(copy); 1405 kfree(s);
1454 1406
1455 return mask; 1407 return mask;
1456} 1408}
1457 1409
1410static ssize_t blk_trace_mask2str(char *buf, int mask)
1411{
1412 int i;
1413 char *p = buf;
1414
1415 for (i = 0; i < ARRAY_SIZE(mask_maps); i++) {
1416 if (mask & mask_maps[i].mask) {
1417 p += sprintf(p, "%s%s",
1418 (p == buf) ? "" : ",", mask_maps[i].str);
1419 }
1420 }
1421 *p++ = '\n';
1422
1423 return p - buf;
1424}
1425
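blk_trace_str2mask() and blk_trace_mask2str() form a round trip over the mask_maps[] table: a comma-separated category list is parsed with strsep(), one unknown name rejects the whole string, and set bits are printed back by name. A self-contained sketch of the pair, trimmed to three categories, with -EINVAL modeled as -1 and the whitespace stripping of the kernel version omitted:

#define _GNU_SOURCE		/* strdup(), strsep(), strcasecmp() */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <strings.h>

#define NELEM(a) (sizeof(a) / sizeof((a)[0]))

static const struct { int mask; const char *str; } maps[] = {
	{ 1 << 0, "read" }, { 1 << 1, "write" }, { 1 << 2, "sync" },
};

static int str2mask(const char *str)
{
	int mask = 0;
	char *copy = strdup(str), *s = copy, *token;
	size_t i;

	if (!copy)
		return -1;
	while ((token = strsep(&s, ",")) != NULL) {
		if (*token == '\0')
			continue;	/* tolerate empty fields: "read,,sync" */
		for (i = 0; i < NELEM(maps); i++) {
			if (strcasecmp(token, maps[i].str) == 0) {
				mask |= maps[i].mask;
				break;
			}
		}
		if (i == NELEM(maps)) {
			mask = -1;	/* one unknown name rejects the string */
			break;
		}
	}
	free(copy);
	return mask;
}

static void mask2str(int mask, char *buf)
{
	char *p = buf;
	size_t i;

	for (i = 0; i < NELEM(maps); i++)
		if (mask & maps[i].mask)
			p += sprintf(p, "%s%s", p == buf ? "" : ",",
				     maps[i].str);
	*p = '\0';
}

int main(void)
{
	char buf[64];
	int mask = str2mask("read,SYNC");

	mask2str(mask, buf);
	printf("mask %#x -> \"%s\"\n", mask, buf);	/* mask 0x5 -> "read,sync" */
	return 0;
}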
1426static struct request_queue *blk_trace_get_queue(struct block_device *bdev)
1427{
1428 if (bdev->bd_disk == NULL)
1429 return NULL;
1430
1431 return bdev_get_queue(bdev);
1432}
1433
1458static ssize_t sysfs_blk_trace_attr_show(struct device *dev, 1434static ssize_t sysfs_blk_trace_attr_show(struct device *dev,
1459 struct device_attribute *attr, 1435 struct device_attribute *attr,
1460 char *buf) 1436 char *buf)
@@ -1469,20 +1445,29 @@ static ssize_t sysfs_blk_trace_attr_show(struct device *dev,
1469 if (bdev == NULL) 1445 if (bdev == NULL)
1470 goto out_unlock_kernel; 1446 goto out_unlock_kernel;
1471 1447
1472 q = bdev_get_queue(bdev); 1448 q = blk_trace_get_queue(bdev);
1473 if (q == NULL) 1449 if (q == NULL)
1474 goto out_bdput; 1450 goto out_bdput;
1451
1475 mutex_lock(&bdev->bd_mutex); 1452 mutex_lock(&bdev->bd_mutex);
1453
1454 if (attr == &dev_attr_enable) {
1455 ret = sprintf(buf, "%u\n", !!q->blk_trace);
1456 goto out_unlock_bdev;
1457 }
1458
1476 if (q->blk_trace == NULL) 1459 if (q->blk_trace == NULL)
1477 ret = sprintf(buf, "disabled\n"); 1460 ret = sprintf(buf, "disabled\n");
1478 else if (attr == &dev_attr_act_mask) 1461 else if (attr == &dev_attr_act_mask)
1479 ret = sprintf(buf, "%#x\n", q->blk_trace->act_mask); 1462 ret = blk_trace_mask2str(buf, q->blk_trace->act_mask);
1480 else if (attr == &dev_attr_pid) 1463 else if (attr == &dev_attr_pid)
1481 ret = sprintf(buf, "%u\n", q->blk_trace->pid); 1464 ret = sprintf(buf, "%u\n", q->blk_trace->pid);
1482 else if (attr == &dev_attr_start_lba) 1465 else if (attr == &dev_attr_start_lba)
1483 ret = sprintf(buf, "%llu\n", q->blk_trace->start_lba); 1466 ret = sprintf(buf, "%llu\n", q->blk_trace->start_lba);
1484 else if (attr == &dev_attr_end_lba) 1467 else if (attr == &dev_attr_end_lba)
1485 ret = sprintf(buf, "%llu\n", q->blk_trace->end_lba); 1468 ret = sprintf(buf, "%llu\n", q->blk_trace->end_lba);
1469
1470out_unlock_bdev:
1486 mutex_unlock(&bdev->bd_mutex); 1471 mutex_unlock(&bdev->bd_mutex);
1487out_bdput: 1472out_bdput:
1488 bdput(bdev); 1473 bdput(bdev);
@@ -1499,7 +1484,7 @@ static ssize_t sysfs_blk_trace_attr_store(struct device *dev,
1499 struct request_queue *q; 1484 struct request_queue *q;
1500 struct hd_struct *p; 1485 struct hd_struct *p;
1501 u64 value; 1486 u64 value;
1502 ssize_t ret = -ENXIO; 1487 ssize_t ret = -EINVAL;
1503 1488
1504 if (count == 0) 1489 if (count == 0)
1505 goto out; 1490 goto out;
@@ -1507,24 +1492,36 @@ static ssize_t sysfs_blk_trace_attr_store(struct device *dev,
1507 if (attr == &dev_attr_act_mask) { 1492 if (attr == &dev_attr_act_mask) {
1508 if (sscanf(buf, "%llx", &value) != 1) { 1493 if (sscanf(buf, "%llx", &value) != 1) {
1509 /* Assume it is a list of trace category names */ 1494 /* Assume it is a list of trace category names */
1510 value = blk_str2act_mask(buf); 1495 ret = blk_trace_str2mask(buf);
1511 if (value < 0) 1496 if (ret < 0)
1512 goto out; 1497 goto out;
1498 value = ret;
1513 } 1499 }
1514 } else if (sscanf(buf, "%llu", &value) != 1) 1500 } else if (sscanf(buf, "%llu", &value) != 1)
1515 goto out; 1501 goto out;
1516 1502
1503 ret = -ENXIO;
1504
1517 lock_kernel(); 1505 lock_kernel();
1518 p = dev_to_part(dev); 1506 p = dev_to_part(dev);
1519 bdev = bdget(part_devt(p)); 1507 bdev = bdget(part_devt(p));
1520 if (bdev == NULL) 1508 if (bdev == NULL)
1521 goto out_unlock_kernel; 1509 goto out_unlock_kernel;
1522 1510
1523 q = bdev_get_queue(bdev); 1511 q = blk_trace_get_queue(bdev);
1524 if (q == NULL) 1512 if (q == NULL)
1525 goto out_bdput; 1513 goto out_bdput;
1526 1514
1527 mutex_lock(&bdev->bd_mutex); 1515 mutex_lock(&bdev->bd_mutex);
1516
1517 if (attr == &dev_attr_enable) {
1518 if (value)
1519 ret = blk_trace_setup_queue(q, bdev->bd_dev);
1520 else
1521 ret = blk_trace_remove_queue(q);
1522 goto out_unlock_bdev;
1523 }
1524
1528 ret = 0; 1525 ret = 0;
1529 if (q->blk_trace == NULL) 1526 if (q->blk_trace == NULL)
1530 ret = blk_trace_setup_queue(q, bdev->bd_dev); 1527 ret = blk_trace_setup_queue(q, bdev->bd_dev);
@@ -1538,13 +1535,15 @@ static ssize_t sysfs_blk_trace_attr_store(struct device *dev,
1538 q->blk_trace->start_lba = value; 1535 q->blk_trace->start_lba = value;
1539 else if (attr == &dev_attr_end_lba) 1536 else if (attr == &dev_attr_end_lba)
1540 q->blk_trace->end_lba = value; 1537 q->blk_trace->end_lba = value;
1541 ret = count;
1542 } 1538 }
1539
1540out_unlock_bdev:
1543 mutex_unlock(&bdev->bd_mutex); 1541 mutex_unlock(&bdev->bd_mutex);
1544out_bdput: 1542out_bdput:
1545 bdput(bdev); 1543 bdput(bdev);
1546out_unlock_kernel: 1544out_unlock_kernel:
1547 unlock_kernel(); 1545 unlock_kernel();
1548out: 1546out:
1549 return ret; 1547 return ret ? ret : count;
1550} 1548}
1549
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index c7f4a4be05dc..678e3d6caf85 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -928,9 +928,14 @@ void ftrace_release(void *start, unsigned long size)
928 928
929 mutex_lock(&ftrace_lock); 929 mutex_lock(&ftrace_lock);
930 do_for_each_ftrace_rec(pg, rec) { 930 do_for_each_ftrace_rec(pg, rec) {
931 if ((rec->ip >= s) && (rec->ip < e) && 931 if ((rec->ip >= s) && (rec->ip < e)) {
932 !(rec->flags & FTRACE_FL_FREE)) 932 /*
933 * rec->ip is changed in ftrace_free_rec()
934 * It should not between s and e if record was freed.
935 */
936 FTRACE_WARN_ON(rec->flags & FTRACE_FL_FREE);
933 ftrace_free_rec(rec); 937 ftrace_free_rec(rec);
938 }
934 } while_for_each_ftrace_rec(); 939 } while_for_each_ftrace_rec();
935 mutex_unlock(&ftrace_lock); 940 mutex_unlock(&ftrace_lock);
936} 941}
@@ -3287,6 +3292,9 @@ void unregister_ftrace_graph(void)
3287{ 3292{
3288 mutex_lock(&ftrace_lock); 3293 mutex_lock(&ftrace_lock);
3289 3294
 3295	if (unlikely(!atomic_read(&ftrace_graph_active)))
3296 goto out;
3297
3290 atomic_dec(&ftrace_graph_active); 3298 atomic_dec(&ftrace_graph_active);
3291 unregister_trace_sched_switch(ftrace_graph_probe_sched_switch); 3299 unregister_trace_sched_switch(ftrace_graph_probe_sched_switch);
3292 ftrace_graph_return = (trace_func_graph_ret_t)ftrace_stub; 3300 ftrace_graph_return = (trace_func_graph_ret_t)ftrace_stub;
@@ -3294,6 +3302,7 @@ void unregister_ftrace_graph(void)
3294 ftrace_shutdown(FTRACE_STOP_FUNC_RET); 3302 ftrace_shutdown(FTRACE_STOP_FUNC_RET);
3295 unregister_pm_notifier(&ftrace_suspend_notifier); 3303 unregister_pm_notifier(&ftrace_suspend_notifier);
3296 3304
3305 out:
3297 mutex_unlock(&ftrace_lock); 3306 mutex_unlock(&ftrace_lock);
3298} 3307}
3299 3308
diff --git a/kernel/trace/kmemtrace.c b/kernel/trace/kmemtrace.c
index ae201b3eda89..5011f4d91e37 100644
--- a/kernel/trace/kmemtrace.c
+++ b/kernel/trace/kmemtrace.c
@@ -6,14 +6,16 @@
6 * Copyright (C) 2008 Frederic Weisbecker <fweisbec@gmail.com> 6 * Copyright (C) 2008 Frederic Weisbecker <fweisbec@gmail.com>
7 */ 7 */
8 8
9#include <linux/dcache.h> 9#include <linux/tracepoint.h>
10#include <linux/seq_file.h>
10#include <linux/debugfs.h> 11#include <linux/debugfs.h>
12#include <linux/dcache.h>
11#include <linux/fs.h> 13#include <linux/fs.h>
12#include <linux/seq_file.h> 14
13#include <trace/kmemtrace.h> 15#include <trace/kmemtrace.h>
14 16
15#include "trace.h"
16#include "trace_output.h" 17#include "trace_output.h"
18#include "trace.h"
17 19
 18/* Select an alternative, minimalistic output instead of the original one */ 20/* Select an alternative, minimalistic output instead of the original one */
 19#define TRACE_KMEM_OPT_MINIMAL 0x1 21#define TRACE_KMEM_OPT_MINIMAL 0x1
@@ -25,14 +27,156 @@ static struct tracer_opt kmem_opts[] = {
25}; 27};
26 28
27static struct tracer_flags kmem_tracer_flags = { 29static struct tracer_flags kmem_tracer_flags = {
28 .val = 0, 30 .val = 0,
29 .opts = kmem_opts 31 .opts = kmem_opts
30}; 32};
31 33
32
33static bool kmem_tracing_enabled __read_mostly;
34static struct trace_array *kmemtrace_array; 34static struct trace_array *kmemtrace_array;
35 35
36/* Trace allocations */
37static inline void kmemtrace_alloc(enum kmemtrace_type_id type_id,
38 unsigned long call_site,
39 const void *ptr,
40 size_t bytes_req,
41 size_t bytes_alloc,
42 gfp_t gfp_flags,
43 int node)
44{
45 struct trace_array *tr = kmemtrace_array;
46 struct kmemtrace_alloc_entry *entry;
47 struct ring_buffer_event *event;
48
49 event = ring_buffer_lock_reserve(tr->buffer, sizeof(*entry));
50 if (!event)
51 return;
52
53 entry = ring_buffer_event_data(event);
54 tracing_generic_entry_update(&entry->ent, 0, 0);
55
56 entry->ent.type = TRACE_KMEM_ALLOC;
57 entry->type_id = type_id;
58 entry->call_site = call_site;
59 entry->ptr = ptr;
60 entry->bytes_req = bytes_req;
61 entry->bytes_alloc = bytes_alloc;
62 entry->gfp_flags = gfp_flags;
63 entry->node = node;
64
65 ring_buffer_unlock_commit(tr->buffer, event);
66
67 trace_wake_up();
68}
69
70static inline void kmemtrace_free(enum kmemtrace_type_id type_id,
71 unsigned long call_site,
72 const void *ptr)
73{
74 struct trace_array *tr = kmemtrace_array;
75 struct kmemtrace_free_entry *entry;
76 struct ring_buffer_event *event;
77
78 event = ring_buffer_lock_reserve(tr->buffer, sizeof(*entry));
79 if (!event)
80 return;
81 entry = ring_buffer_event_data(event);
82 tracing_generic_entry_update(&entry->ent, 0, 0);
83
84 entry->ent.type = TRACE_KMEM_FREE;
85 entry->type_id = type_id;
86 entry->call_site = call_site;
87 entry->ptr = ptr;
88
89 ring_buffer_unlock_commit(tr->buffer, event);
90
91 trace_wake_up();
92}
93
94static void kmemtrace_kmalloc(unsigned long call_site,
95 const void *ptr,
96 size_t bytes_req,
97 size_t bytes_alloc,
98 gfp_t gfp_flags)
99{
100 kmemtrace_alloc(KMEMTRACE_TYPE_KMALLOC, call_site, ptr,
101 bytes_req, bytes_alloc, gfp_flags, -1);
102}
103
104static void kmemtrace_kmem_cache_alloc(unsigned long call_site,
105 const void *ptr,
106 size_t bytes_req,
107 size_t bytes_alloc,
108 gfp_t gfp_flags)
109{
110 kmemtrace_alloc(KMEMTRACE_TYPE_CACHE, call_site, ptr,
111 bytes_req, bytes_alloc, gfp_flags, -1);
112}
113
114static void kmemtrace_kmalloc_node(unsigned long call_site,
115 const void *ptr,
116 size_t bytes_req,
117 size_t bytes_alloc,
118 gfp_t gfp_flags,
119 int node)
120{
121 kmemtrace_alloc(KMEMTRACE_TYPE_KMALLOC, call_site, ptr,
122 bytes_req, bytes_alloc, gfp_flags, node);
123}
124
125static void kmemtrace_kmem_cache_alloc_node(unsigned long call_site,
126 const void *ptr,
127 size_t bytes_req,
128 size_t bytes_alloc,
129 gfp_t gfp_flags,
130 int node)
131{
132 kmemtrace_alloc(KMEMTRACE_TYPE_CACHE, call_site, ptr,
133 bytes_req, bytes_alloc, gfp_flags, node);
134}
135
136static void kmemtrace_kfree(unsigned long call_site, const void *ptr)
137{
138 kmemtrace_free(KMEMTRACE_TYPE_KMALLOC, call_site, ptr);
139}
140
141static void kmemtrace_kmem_cache_free(unsigned long call_site, const void *ptr)
142{
143 kmemtrace_free(KMEMTRACE_TYPE_CACHE, call_site, ptr);
144}
145
146static int kmemtrace_start_probes(void)
147{
148 int err;
149
150 err = register_trace_kmalloc(kmemtrace_kmalloc);
151 if (err)
152 return err;
153 err = register_trace_kmem_cache_alloc(kmemtrace_kmem_cache_alloc);
154 if (err)
155 return err;
156 err = register_trace_kmalloc_node(kmemtrace_kmalloc_node);
157 if (err)
158 return err;
159 err = register_trace_kmem_cache_alloc_node(kmemtrace_kmem_cache_alloc_node);
160 if (err)
161 return err;
162 err = register_trace_kfree(kmemtrace_kfree);
163 if (err)
164 return err;
165 err = register_trace_kmem_cache_free(kmemtrace_kmem_cache_free);
166
167 return err;
168}
169
170static void kmemtrace_stop_probes(void)
171{
172 unregister_trace_kmalloc(kmemtrace_kmalloc);
173 unregister_trace_kmem_cache_alloc(kmemtrace_kmem_cache_alloc);
174 unregister_trace_kmalloc_node(kmemtrace_kmalloc_node);
175 unregister_trace_kmem_cache_alloc_node(kmemtrace_kmem_cache_alloc_node);
176 unregister_trace_kfree(kmemtrace_kfree);
177 unregister_trace_kmem_cache_free(kmemtrace_kmem_cache_free);
178}
179
36static int kmem_trace_init(struct trace_array *tr) 180static int kmem_trace_init(struct trace_array *tr)
37{ 181{
38 int cpu; 182 int cpu;
@@ -41,14 +185,14 @@ static int kmem_trace_init(struct trace_array *tr)
41 for_each_cpu_mask(cpu, cpu_possible_map) 185 for_each_cpu_mask(cpu, cpu_possible_map)
42 tracing_reset(tr, cpu); 186 tracing_reset(tr, cpu);
43 187
44 kmem_tracing_enabled = true; 188 kmemtrace_start_probes();
45 189
46 return 0; 190 return 0;
47} 191}
48 192
49static void kmem_trace_reset(struct trace_array *tr) 193static void kmem_trace_reset(struct trace_array *tr)
50{ 194{
51 kmem_tracing_enabled = false; 195 kmemtrace_stop_probes();
52} 196}
53 197
54static void kmemtrace_headers(struct seq_file *s) 198static void kmemtrace_headers(struct seq_file *s)
@@ -66,47 +210,84 @@ static void kmemtrace_headers(struct seq_file *s)
66} 210}
67 211
68/* 212/*
69 * The two following functions give the original output from kmemtrace, 213 * The following functions give the original output from kmemtrace,
70 * or something close to....perhaps they need some missing things 214 * plus the origin CPU, since reordering occurs in-kernel now.
71 */ 215 */
216
217#define KMEMTRACE_USER_ALLOC 0
218#define KMEMTRACE_USER_FREE 1
219
220struct kmemtrace_user_event {
221 u8 event_id;
222 u8 type_id;
223 u16 event_size;
224 u32 cpu;
225 u64 timestamp;
226 unsigned long call_site;
227 unsigned long ptr;
228};
229
230struct kmemtrace_user_event_alloc {
231 size_t bytes_req;
232 size_t bytes_alloc;
233 unsigned gfp_flags;
234 int node;
235};
236
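The two structs above define the binary records now handed to userspace: every record starts with a kmemtrace_user_event header whose event_size gives the total record length, and ALLOC records append a kmemtrace_user_event_alloc trailer. A sketch of how a reader could walk such a stream; the buffer is hand-built here, and the struct layouts merely mirror the ones above:

#include <stdio.h>
#include <string.h>
#include <stdint.h>

#define USER_ALLOC	0
#define USER_FREE	1

struct user_event {
	uint8_t		event_id;
	uint8_t		type_id;
	uint16_t	event_size;	/* total bytes in this record */
	uint32_t	cpu;
	uint64_t	timestamp;
	unsigned long	call_site;
	unsigned long	ptr;
};

struct user_event_alloc {
	size_t		bytes_req;
	size_t		bytes_alloc;
	unsigned	gfp_flags;
	int		node;
};

static void walk(const unsigned char *buf, size_t len)
{
	size_t off = 0;

	while (off + sizeof(struct user_event) <= len) {
		const struct user_event *ev = (const void *)(buf + off);

		printf("cpu %u %s ptr %#lx", (unsigned)ev->cpu,
		       ev->event_id == USER_ALLOC ? "alloc" : "free",
		       ev->ptr);
		if (ev->event_id == USER_ALLOC) {
			const struct user_event_alloc *a =
				(const void *)(ev + 1);

			printf(" req %zu got %zu", a->bytes_req,
			       a->bytes_alloc);
		}
		putchar('\n');
		off += ev->event_size;	/* event_size makes records skippable */
	}
}

int main(void)
{
	/* aligned so the struct overlays below are valid */
	_Alignas(struct user_event) unsigned char
		buf[sizeof(struct user_event) + sizeof(struct user_event_alloc)];
	struct user_event ev = {
		.event_id = USER_ALLOC, .type_id = 0,
		.event_size = sizeof(buf), .cpu = 2,
		.timestamp = 123456, .call_site = 0xc0ffee, .ptr = 0x1000,
	};
	struct user_event_alloc a = {
		.bytes_req = 32, .bytes_alloc = 64, .gfp_flags = 0, .node = -1,
	};

	memcpy(buf, &ev, sizeof(ev));
	memcpy(buf + sizeof(ev), &a, sizeof(a));
	walk(buf, sizeof(buf));
	return 0;
}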
72static enum print_line_t 237static enum print_line_t
73kmemtrace_print_alloc_original(struct trace_iterator *iter, 238kmemtrace_print_alloc_user(struct trace_iterator *iter,
74 struct kmemtrace_alloc_entry *entry) 239 struct kmemtrace_alloc_entry *entry)
75{ 240{
241 struct kmemtrace_user_event_alloc *ev_alloc;
76 struct trace_seq *s = &iter->seq; 242 struct trace_seq *s = &iter->seq;
77 int ret; 243 struct kmemtrace_user_event *ev;
244
245 ev = trace_seq_reserve(s, sizeof(*ev));
246 if (!ev)
247 return TRACE_TYPE_PARTIAL_LINE;
78 248
79 /* Taken from the old linux/kmemtrace.h */ 249 ev->event_id = KMEMTRACE_USER_ALLOC;
80 ret = trace_seq_printf(s, "type_id %d call_site %lu ptr %lu " 250 ev->type_id = entry->type_id;
81 "bytes_req %lu bytes_alloc %lu gfp_flags %lu node %d\n", 251 ev->event_size = sizeof(*ev) + sizeof(*ev_alloc);
82 entry->type_id, entry->call_site, (unsigned long) entry->ptr, 252 ev->cpu = iter->cpu;
83 (unsigned long) entry->bytes_req, (unsigned long) entry->bytes_alloc, 253 ev->timestamp = iter->ts;
84 (unsigned long) entry->gfp_flags, entry->node); 254 ev->call_site = entry->call_site;
255 ev->ptr = (unsigned long)entry->ptr;
85 256
86 if (!ret) 257 ev_alloc = trace_seq_reserve(s, sizeof(*ev_alloc));
258 if (!ev_alloc)
87 return TRACE_TYPE_PARTIAL_LINE; 259 return TRACE_TYPE_PARTIAL_LINE;
88 260
261 ev_alloc->bytes_req = entry->bytes_req;
262 ev_alloc->bytes_alloc = entry->bytes_alloc;
263 ev_alloc->gfp_flags = entry->gfp_flags;
264 ev_alloc->node = entry->node;
265
89 return TRACE_TYPE_HANDLED; 266 return TRACE_TYPE_HANDLED;
90} 267}
91 268
92static enum print_line_t 269static enum print_line_t
93kmemtrace_print_free_original(struct trace_iterator *iter, 270kmemtrace_print_free_user(struct trace_iterator *iter,
94 struct kmemtrace_free_entry *entry) 271 struct kmemtrace_free_entry *entry)
95{ 272{
96 struct trace_seq *s = &iter->seq; 273 struct trace_seq *s = &iter->seq;
97 int ret; 274 struct kmemtrace_user_event *ev;
98 275
99 /* Taken from the old linux/kmemtrace.h */ 276 ev = trace_seq_reserve(s, sizeof(*ev));
100 ret = trace_seq_printf(s, "type_id %d call_site %lu ptr %lu\n", 277 if (!ev)
101 entry->type_id, entry->call_site, (unsigned long) entry->ptr);
102
103 if (!ret)
104 return TRACE_TYPE_PARTIAL_LINE; 278 return TRACE_TYPE_PARTIAL_LINE;
105 279
280 ev->event_id = KMEMTRACE_USER_FREE;
281 ev->type_id = entry->type_id;
282 ev->event_size = sizeof(*ev);
283 ev->cpu = iter->cpu;
284 ev->timestamp = iter->ts;
285 ev->call_site = entry->call_site;
286 ev->ptr = (unsigned long)entry->ptr;
287
106 return TRACE_TYPE_HANDLED; 288 return TRACE_TYPE_HANDLED;
107} 289}
108 290
109
 110/* The two following functions provide a more minimalistic output */
111static enum print_line_t 292static enum print_line_t
112kmemtrace_print_alloc_compress(struct trace_iterator *iter, 293kmemtrace_print_alloc_compress(struct trace_iterator *iter,
@@ -178,7 +359,7 @@ kmemtrace_print_alloc_compress(struct trace_iterator *iter,
178 359
179static enum print_line_t 360static enum print_line_t
180kmemtrace_print_free_compress(struct trace_iterator *iter, 361kmemtrace_print_free_compress(struct trace_iterator *iter,
181 struct kmemtrace_free_entry *entry) 362 struct kmemtrace_free_entry *entry)
182{ 363{
183 struct trace_seq *s = &iter->seq; 364 struct trace_seq *s = &iter->seq;
184 int ret; 365 int ret;
@@ -239,20 +420,22 @@ static enum print_line_t kmemtrace_print_line(struct trace_iterator *iter)
239 switch (entry->type) { 420 switch (entry->type) {
240 case TRACE_KMEM_ALLOC: { 421 case TRACE_KMEM_ALLOC: {
241 struct kmemtrace_alloc_entry *field; 422 struct kmemtrace_alloc_entry *field;
423
242 trace_assign_type(field, entry); 424 trace_assign_type(field, entry);
243 if (kmem_tracer_flags.val & TRACE_KMEM_OPT_MINIMAL) 425 if (kmem_tracer_flags.val & TRACE_KMEM_OPT_MINIMAL)
244 return kmemtrace_print_alloc_compress(iter, field); 426 return kmemtrace_print_alloc_compress(iter, field);
245 else 427 else
246 return kmemtrace_print_alloc_original(iter, field); 428 return kmemtrace_print_alloc_user(iter, field);
247 } 429 }
248 430
249 case TRACE_KMEM_FREE: { 431 case TRACE_KMEM_FREE: {
250 struct kmemtrace_free_entry *field; 432 struct kmemtrace_free_entry *field;
433
251 trace_assign_type(field, entry); 434 trace_assign_type(field, entry);
252 if (kmem_tracer_flags.val & TRACE_KMEM_OPT_MINIMAL) 435 if (kmem_tracer_flags.val & TRACE_KMEM_OPT_MINIMAL)
253 return kmemtrace_print_free_compress(iter, field); 436 return kmemtrace_print_free_compress(iter, field);
254 else 437 else
255 return kmemtrace_print_free_original(iter, field); 438 return kmemtrace_print_free_user(iter, field);
256 } 439 }
257 440
258 default: 441 default:
@@ -260,70 +443,13 @@ static enum print_line_t kmemtrace_print_line(struct trace_iterator *iter)
260 } 443 }
261} 444}
262 445
263/* Trace allocations */
264void kmemtrace_mark_alloc_node(enum kmemtrace_type_id type_id,
265 unsigned long call_site,
266 const void *ptr,
267 size_t bytes_req,
268 size_t bytes_alloc,
269 gfp_t gfp_flags,
270 int node)
271{
272 struct ring_buffer_event *event;
273 struct kmemtrace_alloc_entry *entry;
274 struct trace_array *tr = kmemtrace_array;
275
276 if (!kmem_tracing_enabled)
277 return;
278
279 event = trace_buffer_lock_reserve(tr, TRACE_KMEM_ALLOC,
280 sizeof(*entry), 0, 0);
281 if (!event)
282 return;
283 entry = ring_buffer_event_data(event);
284
285 entry->call_site = call_site;
286 entry->ptr = ptr;
287 entry->bytes_req = bytes_req;
288 entry->bytes_alloc = bytes_alloc;
289 entry->gfp_flags = gfp_flags;
290 entry->node = node;
291
292 trace_buffer_unlock_commit(tr, event, 0, 0);
293}
294EXPORT_SYMBOL(kmemtrace_mark_alloc_node);
295
296void kmemtrace_mark_free(enum kmemtrace_type_id type_id,
297 unsigned long call_site,
298 const void *ptr)
299{
300 struct ring_buffer_event *event;
301 struct kmemtrace_free_entry *entry;
302 struct trace_array *tr = kmemtrace_array;
303
304 if (!kmem_tracing_enabled)
305 return;
306
307 event = trace_buffer_lock_reserve(tr, TRACE_KMEM_FREE,
308 sizeof(*entry), 0, 0);
309 if (!event)
310 return;
311 entry = ring_buffer_event_data(event);
312 entry->type_id = type_id;
313 entry->call_site = call_site;
314 entry->ptr = ptr;
315
316 trace_buffer_unlock_commit(tr, event, 0, 0);
317}
318EXPORT_SYMBOL(kmemtrace_mark_free);
319
320static struct tracer kmem_tracer __read_mostly = { 446static struct tracer kmem_tracer __read_mostly = {
321 .name = "kmemtrace", 447 .name = "kmemtrace",
322 .init = kmem_trace_init, 448 .init = kmem_trace_init,
323 .reset = kmem_trace_reset, 449 .reset = kmem_trace_reset,
324 .print_line = kmemtrace_print_line, 450 .print_line = kmemtrace_print_line,
325 .print_header = kmemtrace_headers, 451 .print_header = kmemtrace_headers,
326 .flags = &kmem_tracer_flags 452 .flags = &kmem_tracer_flags
327}; 453};
328 454
329void kmemtrace_init(void) 455void kmemtrace_init(void)
@@ -335,5 +461,4 @@ static int __init init_kmem_tracer(void)
335{ 461{
336 return register_tracer(&kmem_tracer); 462 return register_tracer(&kmem_tracer);
337} 463}
338
339device_initcall(init_kmem_tracer); 464device_initcall(init_kmem_tracer);
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index 808b14bbf076..960cbf44c844 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -189,16 +189,65 @@ enum {
189 RB_LEN_TIME_STAMP = 16, 189 RB_LEN_TIME_STAMP = 16,
190}; 190};
191 191
192/* inline for ring buffer fast paths */ 192static inline int rb_null_event(struct ring_buffer_event *event)
193{
194 return event->type == RINGBUF_TYPE_PADDING && event->time_delta == 0;
195}
196
197static inline int rb_discarded_event(struct ring_buffer_event *event)
198{
199 return event->type == RINGBUF_TYPE_PADDING && event->time_delta;
200}
201
202static void rb_event_set_padding(struct ring_buffer_event *event)
203{
204 event->type = RINGBUF_TYPE_PADDING;
205 event->time_delta = 0;
206}
207
208/**
209 * ring_buffer_event_discard - discard an event in the ring buffer
 210 * @event: the event to discard
 211 *
 212 * Sometimes an event that is in the ring buffer needs to be ignored.
 213 * This function lets the user discard an event in the ring buffer
 214 * so that it will not be read later.
 215 *
 216 * Note, it is up to the user to be careful with this and to protect
 217 * against races. If the user discards an event that has already
 218 * been consumed, it is possible that the ring buffer could be
 219 * corrupted.
 220 */
221void ring_buffer_event_discard(struct ring_buffer_event *event)
222{
223 event->type = RINGBUF_TYPE_PADDING;
224 /* time delta must be non zero */
225 if (!event->time_delta)
226 event->time_delta = 1;
227}
228
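Taken together, rb_null_event(), rb_discarded_event() and ring_buffer_event_discard() define a small encoding: PADDING with time_delta == 0 is end-of-page filler, while PADDING with a non-zero time_delta is a discarded event that still reports its length so readers can step over it. A compact illustration of that encoding; field names follow struct ring_buffer_event, the values are invented:

#include <stdio.h>

enum { TYPE_PADDING, TYPE_DATA };

struct event { unsigned type; unsigned time_delta; unsigned len; };

static int is_null_event(const struct event *e)
{
	return e->type == TYPE_PADDING && e->time_delta == 0;
}

static int is_discarded_event(const struct event *e)
{
	return e->type == TYPE_PADDING && e->time_delta;
}

static void discard(struct event *e)
{
	e->type = TYPE_PADDING;
	if (!e->time_delta)
		e->time_delta = 1;	/* must stay non-zero to mean "discarded" */
}

int main(void)
{
	struct event data = { TYPE_DATA, 42, 3 };
	struct event filler = { TYPE_PADDING, 0, 0 };

	discard(&data);
	printf("data: null=%d discarded=%d (len still %u)\n",
	       is_null_event(&data), is_discarded_event(&data), data.len);
	printf("filler: null=%d discarded=%d\n",
	       is_null_event(&filler), is_discarded_event(&filler));
	return 0;
}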
193static unsigned 229static unsigned
194rb_event_length(struct ring_buffer_event *event) 230rb_event_data_length(struct ring_buffer_event *event)
195{ 231{
196 unsigned length; 232 unsigned length;
197 233
234 if (event->len)
235 length = event->len * RB_ALIGNMENT;
236 else
237 length = event->array[0];
238 return length + RB_EVNT_HDR_SIZE;
239}
240
241/* inline for ring buffer fast paths */
242static unsigned
243rb_event_length(struct ring_buffer_event *event)
244{
198 switch (event->type) { 245 switch (event->type) {
199 case RINGBUF_TYPE_PADDING: 246 case RINGBUF_TYPE_PADDING:
200 /* undefined */ 247 if (rb_null_event(event))
201 return -1; 248 /* undefined */
249 return -1;
250 return rb_event_data_length(event);
202 251
203 case RINGBUF_TYPE_TIME_EXTEND: 252 case RINGBUF_TYPE_TIME_EXTEND:
204 return RB_LEN_TIME_EXTEND; 253 return RB_LEN_TIME_EXTEND;
@@ -207,11 +256,7 @@ rb_event_length(struct ring_buffer_event *event)
207 return RB_LEN_TIME_STAMP; 256 return RB_LEN_TIME_STAMP;
208 257
209 case RINGBUF_TYPE_DATA: 258 case RINGBUF_TYPE_DATA:
210 if (event->len) 259 return rb_event_data_length(event);
211 length = event->len * RB_ALIGNMENT;
212 else
213 length = event->array[0];
214 return length + RB_EVNT_HDR_SIZE;
215 default: 260 default:
216 BUG(); 261 BUG();
217 } 262 }
@@ -518,7 +563,6 @@ static void rb_free_cpu_buffer(struct ring_buffer_per_cpu *cpu_buffer)
518 struct list_head *head = &cpu_buffer->pages; 563 struct list_head *head = &cpu_buffer->pages;
519 struct buffer_page *bpage, *tmp; 564 struct buffer_page *bpage, *tmp;
520 565
521 list_del_init(&cpu_buffer->reader_page->list);
522 free_buffer_page(cpu_buffer->reader_page); 566 free_buffer_page(cpu_buffer->reader_page);
523 567
524 list_for_each_entry_safe(bpage, tmp, head, list) { 568 list_for_each_entry_safe(bpage, tmp, head, list) {
@@ -845,11 +889,6 @@ int ring_buffer_resize(struct ring_buffer *buffer, unsigned long size)
845} 889}
846EXPORT_SYMBOL_GPL(ring_buffer_resize); 890EXPORT_SYMBOL_GPL(ring_buffer_resize);
847 891
848static inline int rb_null_event(struct ring_buffer_event *event)
849{
850 return event->type == RINGBUF_TYPE_PADDING;
851}
852
853static inline void * 892static inline void *
854__rb_data_page_index(struct buffer_data_page *bpage, unsigned index) 893__rb_data_page_index(struct buffer_data_page *bpage, unsigned index)
855{ 894{
@@ -1219,7 +1258,7 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer,
1219 if (tail < BUF_PAGE_SIZE) { 1258 if (tail < BUF_PAGE_SIZE) {
1220 /* Mark the rest of the page with padding */ 1259 /* Mark the rest of the page with padding */
1221 event = __rb_page_index(tail_page, tail); 1260 event = __rb_page_index(tail_page, tail);
1222 event->type = RINGBUF_TYPE_PADDING; 1261 rb_event_set_padding(event);
1223 } 1262 }
1224 1263
1225 if (tail <= BUF_PAGE_SIZE) 1264 if (tail <= BUF_PAGE_SIZE)
@@ -1969,7 +2008,7 @@ static void rb_advance_reader(struct ring_buffer_per_cpu *cpu_buffer)
1969 2008
1970 event = rb_reader_event(cpu_buffer); 2009 event = rb_reader_event(cpu_buffer);
1971 2010
1972 if (event->type == RINGBUF_TYPE_DATA) 2011 if (event->type == RINGBUF_TYPE_DATA || rb_discarded_event(event))
1973 cpu_buffer->entries--; 2012 cpu_buffer->entries--;
1974 2013
1975 rb_update_read_stamp(cpu_buffer, event); 2014 rb_update_read_stamp(cpu_buffer, event);
@@ -2052,9 +2091,18 @@ rb_buffer_peek(struct ring_buffer *buffer, int cpu, u64 *ts)
2052 2091
2053 switch (event->type) { 2092 switch (event->type) {
2054 case RINGBUF_TYPE_PADDING: 2093 case RINGBUF_TYPE_PADDING:
2055 RB_WARN_ON(cpu_buffer, 1); 2094 if (rb_null_event(event))
2095 RB_WARN_ON(cpu_buffer, 1);
2096 /*
2097 * Because the writer could be discarding every
 2098 * event it creates (which would probably be bad),
 2099 * going back to "again" here could mean we never
 2100 * catch up, triggering the warn-on or locking up
 2101 * the box. Return the padding instead; the caller
 2102 * will release the current locks and try again.
2103 */
2056 rb_advance_reader(cpu_buffer); 2104 rb_advance_reader(cpu_buffer);
2057 return NULL; 2105 return event;
2058 2106
2059 case RINGBUF_TYPE_TIME_EXTEND: 2107 case RINGBUF_TYPE_TIME_EXTEND:
2060 /* Internal data, OK to advance */ 2108 /* Internal data, OK to advance */
@@ -2115,8 +2163,12 @@ rb_iter_peek(struct ring_buffer_iter *iter, u64 *ts)
2115 2163
2116 switch (event->type) { 2164 switch (event->type) {
2117 case RINGBUF_TYPE_PADDING: 2165 case RINGBUF_TYPE_PADDING:
2118 rb_inc_iter(iter); 2166 if (rb_null_event(event)) {
2119 goto again; 2167 rb_inc_iter(iter);
2168 goto again;
2169 }
2170 rb_advance_iter(iter);
2171 return event;
2120 2172
2121 case RINGBUF_TYPE_TIME_EXTEND: 2173 case RINGBUF_TYPE_TIME_EXTEND:
2122 /* Internal data, OK to advance */ 2174 /* Internal data, OK to advance */
@@ -2163,10 +2215,16 @@ ring_buffer_peek(struct ring_buffer *buffer, int cpu, u64 *ts)
2163 if (!cpumask_test_cpu(cpu, buffer->cpumask)) 2215 if (!cpumask_test_cpu(cpu, buffer->cpumask))
2164 return NULL; 2216 return NULL;
2165 2217
2218 again:
2166 spin_lock_irqsave(&cpu_buffer->reader_lock, flags); 2219 spin_lock_irqsave(&cpu_buffer->reader_lock, flags);
2167 event = rb_buffer_peek(buffer, cpu, ts); 2220 event = rb_buffer_peek(buffer, cpu, ts);
2168 spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); 2221 spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags);
2169 2222
2223 if (event && event->type == RINGBUF_TYPE_PADDING) {
2224 cpu_relax();
2225 goto again;
2226 }
2227
2170 return event; 2228 return event;
2171} 2229}
2172 2230
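The pattern added to ring_buffer_peek(), and repeated in the iterator, consume and read paths below, is deliberate: rb_buffer_peek() can now return a discarded PADDING event, and the caller retries after dropping the reader lock rather than spinning with it held. A toy sketch of that lock-drop-and-retry shape, with the locks and cpu_relax() reduced to comments and the "stream" a fixed array:

#include <stdio.h>

enum { TYPE_PADDING, TYPE_DATA };

struct event { unsigned type; };

static struct event stream[] = {
	{ TYPE_PADDING }, { TYPE_PADDING }, { TYPE_DATA },
};
static unsigned pos;

/* stands in for rb_buffer_peek(); advances past whatever it returns */
static struct event *peek_locked(void)
{
	return pos < sizeof(stream) / sizeof(stream[0]) ? &stream[pos++] : NULL;
}

static struct event *buffer_peek(void)
{
	struct event *ev;
again:
	/* spin_lock_irqsave(&reader_lock, flags); */
	ev = peek_locked();
	/* spin_unlock_irqrestore(&reader_lock, flags); */

	if (ev && ev->type == TYPE_PADDING) {
		/* cpu_relax(); -- give the writer room before retrying */
		goto again;
	}
	return ev;
}

int main(void)
{
	struct event *ev = buffer_peek();

	printf("got %s\n", ev && ev->type == TYPE_DATA ? "data" : "nothing");
	return 0;
}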
@@ -2185,10 +2243,16 @@ ring_buffer_iter_peek(struct ring_buffer_iter *iter, u64 *ts)
2185 struct ring_buffer_event *event; 2243 struct ring_buffer_event *event;
2186 unsigned long flags; 2244 unsigned long flags;
2187 2245
2246 again:
2188 spin_lock_irqsave(&cpu_buffer->reader_lock, flags); 2247 spin_lock_irqsave(&cpu_buffer->reader_lock, flags);
2189 event = rb_iter_peek(iter, ts); 2248 event = rb_iter_peek(iter, ts);
2190 spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); 2249 spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags);
2191 2250
2251 if (event && event->type == RINGBUF_TYPE_PADDING) {
2252 cpu_relax();
2253 goto again;
2254 }
2255
2192 return event; 2256 return event;
2193} 2257}
2194 2258
@@ -2207,6 +2271,7 @@ ring_buffer_consume(struct ring_buffer *buffer, int cpu, u64 *ts)
2207 struct ring_buffer_event *event = NULL; 2271 struct ring_buffer_event *event = NULL;
2208 unsigned long flags; 2272 unsigned long flags;
2209 2273
2274 again:
2210 /* might be called in atomic */ 2275 /* might be called in atomic */
2211 preempt_disable(); 2276 preempt_disable();
2212 2277
@@ -2228,6 +2293,11 @@ ring_buffer_consume(struct ring_buffer *buffer, int cpu, u64 *ts)
2228 out: 2293 out:
2229 preempt_enable(); 2294 preempt_enable();
2230 2295
2296 if (event && event->type == RINGBUF_TYPE_PADDING) {
2297 cpu_relax();
2298 goto again;
2299 }
2300
2231 return event; 2301 return event;
2232} 2302}
2233EXPORT_SYMBOL_GPL(ring_buffer_consume); 2303EXPORT_SYMBOL_GPL(ring_buffer_consume);
@@ -2306,6 +2376,7 @@ ring_buffer_read(struct ring_buffer_iter *iter, u64 *ts)
2306 struct ring_buffer_per_cpu *cpu_buffer = iter->cpu_buffer; 2376 struct ring_buffer_per_cpu *cpu_buffer = iter->cpu_buffer;
2307 unsigned long flags; 2377 unsigned long flags;
2308 2378
2379 again:
2309 spin_lock_irqsave(&cpu_buffer->reader_lock, flags); 2380 spin_lock_irqsave(&cpu_buffer->reader_lock, flags);
2310 event = rb_iter_peek(iter, ts); 2381 event = rb_iter_peek(iter, ts);
2311 if (!event) 2382 if (!event)
@@ -2315,6 +2386,11 @@ ring_buffer_read(struct ring_buffer_iter *iter, u64 *ts)
2315 out: 2386 out:
2316 spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); 2387 spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags);
2317 2388
2389 if (event && event->type == RINGBUF_TYPE_PADDING) {
2390 cpu_relax();
2391 goto again;
2392 }
2393
2318 return event; 2394 return event;
2319} 2395}
2320EXPORT_SYMBOL_GPL(ring_buffer_read); 2396EXPORT_SYMBOL_GPL(ring_buffer_read);
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 5d1a16cae376..2a81decf99bc 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -852,15 +852,25 @@ static void ftrace_trace_stack(struct trace_array *tr,
852static void ftrace_trace_userstack(struct trace_array *tr, 852static void ftrace_trace_userstack(struct trace_array *tr,
853 unsigned long flags, int pc); 853 unsigned long flags, int pc);
854 854
855void trace_buffer_unlock_commit(struct trace_array *tr, 855static inline void __trace_buffer_unlock_commit(struct trace_array *tr,
856 struct ring_buffer_event *event, 856 struct ring_buffer_event *event,
857 unsigned long flags, int pc) 857 unsigned long flags, int pc,
858 int wake)
858{ 859{
859 ring_buffer_unlock_commit(tr->buffer, event); 860 ring_buffer_unlock_commit(tr->buffer, event);
860 861
861 ftrace_trace_stack(tr, flags, 6, pc); 862 ftrace_trace_stack(tr, flags, 6, pc);
862 ftrace_trace_userstack(tr, flags, pc); 863 ftrace_trace_userstack(tr, flags, pc);
863 trace_wake_up(); 864
865 if (wake)
866 trace_wake_up();
867}
868
869void trace_buffer_unlock_commit(struct trace_array *tr,
870 struct ring_buffer_event *event,
871 unsigned long flags, int pc)
872{
873 __trace_buffer_unlock_commit(tr, event, flags, pc, 1);
864} 874}
865 875
866struct ring_buffer_event * 876struct ring_buffer_event *
@@ -874,7 +884,13 @@ trace_current_buffer_lock_reserve(unsigned char type, unsigned long len,
874void trace_current_buffer_unlock_commit(struct ring_buffer_event *event, 884void trace_current_buffer_unlock_commit(struct ring_buffer_event *event,
875 unsigned long flags, int pc) 885 unsigned long flags, int pc)
876{ 886{
877 return trace_buffer_unlock_commit(&global_trace, event, flags, pc); 887 return __trace_buffer_unlock_commit(&global_trace, event, flags, pc, 1);
888}
889
890void trace_nowake_buffer_unlock_commit(struct ring_buffer_event *event,
891 unsigned long flags, int pc)
892{
893 return __trace_buffer_unlock_commit(&global_trace, event, flags, pc, 0);
878} 894}
879 895
880void 896void
@@ -900,7 +916,7 @@ trace_function(struct trace_array *tr,
900} 916}
901 917
902#ifdef CONFIG_FUNCTION_GRAPH_TRACER 918#ifdef CONFIG_FUNCTION_GRAPH_TRACER
903static void __trace_graph_entry(struct trace_array *tr, 919static int __trace_graph_entry(struct trace_array *tr,
904 struct ftrace_graph_ent *trace, 920 struct ftrace_graph_ent *trace,
905 unsigned long flags, 921 unsigned long flags,
906 int pc) 922 int pc)
@@ -909,15 +925,17 @@ static void __trace_graph_entry(struct trace_array *tr,
909 struct ftrace_graph_ent_entry *entry; 925 struct ftrace_graph_ent_entry *entry;
910 926
911 if (unlikely(local_read(&__get_cpu_var(ftrace_cpu_disabled)))) 927 if (unlikely(local_read(&__get_cpu_var(ftrace_cpu_disabled))))
912 return; 928 return 0;
913 929
914 event = trace_buffer_lock_reserve(&global_trace, TRACE_GRAPH_ENT, 930 event = trace_buffer_lock_reserve(&global_trace, TRACE_GRAPH_ENT,
915 sizeof(*entry), flags, pc); 931 sizeof(*entry), flags, pc);
916 if (!event) 932 if (!event)
917 return; 933 return 0;
918 entry = ring_buffer_event_data(event); 934 entry = ring_buffer_event_data(event);
919 entry->graph_ent = *trace; 935 entry->graph_ent = *trace;
920 ring_buffer_unlock_commit(global_trace.buffer, event); 936 ring_buffer_unlock_commit(global_trace.buffer, event);
937
938 return 1;
921} 939}
922 940
923static void __trace_graph_return(struct trace_array *tr, 941static void __trace_graph_return(struct trace_array *tr,
@@ -1138,6 +1156,7 @@ int trace_graph_entry(struct ftrace_graph_ent *trace)
1138 struct trace_array_cpu *data; 1156 struct trace_array_cpu *data;
1139 unsigned long flags; 1157 unsigned long flags;
1140 long disabled; 1158 long disabled;
1159 int ret;
1141 int cpu; 1160 int cpu;
1142 int pc; 1161 int pc;
1143 1162
@@ -1153,15 +1172,18 @@ int trace_graph_entry(struct ftrace_graph_ent *trace)
1153 disabled = atomic_inc_return(&data->disabled); 1172 disabled = atomic_inc_return(&data->disabled);
1154 if (likely(disabled == 1)) { 1173 if (likely(disabled == 1)) {
1155 pc = preempt_count(); 1174 pc = preempt_count();
1156 __trace_graph_entry(tr, trace, flags, pc); 1175 ret = __trace_graph_entry(tr, trace, flags, pc);
1176 } else {
1177 ret = 0;
1157 } 1178 }
1158 /* Only do the atomic if it is not already set */ 1179 /* Only do the atomic if it is not already set */
1159 if (!test_tsk_trace_graph(current)) 1180 if (!test_tsk_trace_graph(current))
1160 set_tsk_trace_graph(current); 1181 set_tsk_trace_graph(current);
1182
1161 atomic_dec(&data->disabled); 1183 atomic_dec(&data->disabled);
1162 local_irq_restore(flags); 1184 local_irq_restore(flags);
1163 1185
1164 return 1; 1186 return ret;
1165} 1187}
1166 1188
1167void trace_graph_return(struct ftrace_graph_ret *trace) 1189void trace_graph_return(struct ftrace_graph_ret *trace)
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index e3429a8ab059..fec6521ffa13 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -182,6 +182,12 @@ struct trace_power {
182 struct power_trace state_data; 182 struct power_trace state_data;
183}; 183};
184 184
185enum kmemtrace_type_id {
186 KMEMTRACE_TYPE_KMALLOC = 0, /* kmalloc() or kfree(). */
187 KMEMTRACE_TYPE_CACHE, /* kmem_cache_*(). */
188 KMEMTRACE_TYPE_PAGES, /* __get_free_pages() and friends. */
189};
190
185struct kmemtrace_alloc_entry { 191struct kmemtrace_alloc_entry {
186 struct trace_entry ent; 192 struct trace_entry ent;
187 enum kmemtrace_type_id type_id; 193 enum kmemtrace_type_id type_id;
@@ -483,6 +489,8 @@ trace_current_buffer_lock_reserve(unsigned char type, unsigned long len,
483 unsigned long flags, int pc); 489 unsigned long flags, int pc);
484void trace_current_buffer_unlock_commit(struct ring_buffer_event *event, 490void trace_current_buffer_unlock_commit(struct ring_buffer_event *event,
485 unsigned long flags, int pc); 491 unsigned long flags, int pc);
492void trace_nowake_buffer_unlock_commit(struct ring_buffer_event *event,
493 unsigned long flags, int pc);
486 494
487struct trace_entry *tracing_get_trace_entry(struct trace_array *tr, 495struct trace_entry *tracing_get_trace_entry(struct trace_array *tr,
488 struct trace_array_cpu *data); 496 struct trace_array_cpu *data);
@@ -778,16 +786,27 @@ enum {
778 TRACE_EVENT_TYPE_RAW = 2, 786 TRACE_EVENT_TYPE_RAW = 2,
779}; 787};
780 788
789struct ftrace_event_field {
790 struct list_head link;
791 char *name;
792 char *type;
793 int offset;
794 int size;
795};
796
781struct ftrace_event_call { 797struct ftrace_event_call {
782 char *name; 798 char *name;
783 char *system; 799 char *system;
784 struct dentry *dir; 800 struct dentry *dir;
785 int enabled; 801 int enabled;
786 int (*regfunc)(void); 802 int (*regfunc)(void);
787 void (*unregfunc)(void); 803 void (*unregfunc)(void);
788 int id; 804 int id;
789 int (*raw_init)(void); 805 int (*raw_init)(void);
790 int (*show_format)(struct trace_seq *s); 806 int (*show_format)(struct trace_seq *s);
807 int (*define_fields)(void);
808 struct list_head fields;
809 struct filter_pred **preds;
791 810
792#ifdef CONFIG_EVENT_PROFILE 811#ifdef CONFIG_EVENT_PROFILE
793 atomic_t profile_count; 812 atomic_t profile_count;
@@ -796,6 +815,51 @@ struct ftrace_event_call {
796#endif 815#endif
797}; 816};
798 817
818struct event_subsystem {
819 struct list_head list;
820 const char *name;
821 struct dentry *entry;
822 struct filter_pred **preds;
823};
824
825#define events_for_each(event) \
826 for (event = __start_ftrace_events; \
827 (unsigned long)event < (unsigned long)__stop_ftrace_events; \
828 event++)
829
830#define MAX_FILTER_PRED 8
831
832struct filter_pred;
833
834typedef int (*filter_pred_fn_t) (struct filter_pred *pred, void *event);
835
836struct filter_pred {
837 filter_pred_fn_t fn;
838 u64 val;
839 char *str_val;
840 int str_len;
841 char *field_name;
842 int offset;
843 int not;
844 int or;
845 int compound;
846 int clear;
847};
848
849int trace_define_field(struct ftrace_event_call *call, char *type,
850 char *name, int offset, int size);
851extern void filter_free_pred(struct filter_pred *pred);
852extern void filter_print_preds(struct filter_pred **preds,
853 struct trace_seq *s);
854extern int filter_parse(char **pbuf, struct filter_pred *pred);
855extern int filter_add_pred(struct ftrace_event_call *call,
856 struct filter_pred *pred);
857extern void filter_free_preds(struct ftrace_event_call *call);
858extern int filter_match_preds(struct ftrace_event_call *call, void *rec);
859extern void filter_free_subsystem_preds(struct event_subsystem *system);
860extern int filter_add_subsystem_pred(struct event_subsystem *system,
861 struct filter_pred *pred);
862
799void event_trace_printk(unsigned long ip, const char *fmt, ...); 863void event_trace_printk(unsigned long ip, const char *fmt, ...);
800extern struct ftrace_event_call __start_ftrace_events[]; 864extern struct ftrace_event_call __start_ftrace_events[];
801extern struct ftrace_event_call __stop_ftrace_events[]; 865extern struct ftrace_event_call __stop_ftrace_events[];
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index 3047b56f6637..64ec4d278ffb 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -19,6 +19,39 @@
19 19
20static DEFINE_MUTEX(event_mutex); 20static DEFINE_MUTEX(event_mutex);
21 21
22int trace_define_field(struct ftrace_event_call *call, char *type,
23 char *name, int offset, int size)
24{
25 struct ftrace_event_field *field;
26
27 field = kzalloc(sizeof(*field), GFP_KERNEL);
28 if (!field)
29 goto err;
30
31 field->name = kstrdup(name, GFP_KERNEL);
32 if (!field->name)
33 goto err;
34
35 field->type = kstrdup(type, GFP_KERNEL);
36 if (!field->type)
37 goto err;
38
39 field->offset = offset;
40 field->size = size;
41 list_add(&field->link, &call->fields);
42
43 return 0;
44
45err:
46 if (field) {
47 kfree(field->name);
48 kfree(field->type);
49 }
50 kfree(field);
51
52 return -ENOMEM;
53}
54
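trace_define_field() uses the classic single-exit cleanup: because kfree(NULL) is a no-op, one err label can release whichever of the partial allocations succeeded. A userspace rendering of the same idiom, with strdup()/free() standing in for kstrdup()/kfree():

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

struct field { char *name; char *type; };

static int define_field(struct field **out, const char *type,
			const char *name)
{
	struct field *f = calloc(1, sizeof(*f));

	if (!f)
		goto err;
	f->name = strdup(name);
	if (!f->name)
		goto err;
	f->type = strdup(type);
	if (!f->type)
		goto err;

	*out = f;
	return 0;

err:
	if (f) {			/* free(NULL) is safe: no per-member branches */
		free(f->name);
		free(f->type);
	}
	free(f);
	return -1;
}

int main(void)
{
	struct field *f;

	if (define_field(&f, "unsigned long", "call_site") == 0) {
		printf("%s %s\n", f->type, f->name);
		free(f->name);
		free(f->type);
		free(f);
	}
	return 0;
}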
22static void ftrace_clear_events(void) 55static void ftrace_clear_events(void)
23{ 56{
24 struct ftrace_event_call *call = (void *)__start_ftrace_events; 57 struct ftrace_event_call *call = (void *)__start_ftrace_events;
@@ -343,7 +376,8 @@ event_enable_write(struct file *filp, const char __user *ubuf, size_t cnt,
343 376
344#undef FIELD 377#undef FIELD
345#define FIELD(type, name) \ 378#define FIELD(type, name) \
346 #type, #name, offsetof(typeof(field), name), sizeof(field.name) 379 #type, "common_" #name, offsetof(typeof(field), name), \
380 sizeof(field.name)
347 381
348static int trace_write_header(struct trace_seq *s) 382static int trace_write_header(struct trace_seq *s)
349{ 383{
@@ -430,6 +464,139 @@ event_id_read(struct file *filp, char __user *ubuf, size_t cnt, loff_t *ppos)
430 return r; 464 return r;
431} 465}
432 466
467static ssize_t
468event_filter_read(struct file *filp, char __user *ubuf, size_t cnt,
469 loff_t *ppos)
470{
471 struct ftrace_event_call *call = filp->private_data;
472 struct trace_seq *s;
473 int r;
474
475 if (*ppos)
476 return 0;
477
478 s = kmalloc(sizeof(*s), GFP_KERNEL);
479 if (!s)
480 return -ENOMEM;
481
482 trace_seq_init(s);
483
484 filter_print_preds(call->preds, s);
485 r = simple_read_from_buffer(ubuf, cnt, ppos, s->buffer, s->len);
486
487 kfree(s);
488
489 return r;
490}
491
492static ssize_t
493event_filter_write(struct file *filp, const char __user *ubuf, size_t cnt,
494 loff_t *ppos)
495{
496 struct ftrace_event_call *call = filp->private_data;
497 char buf[64], *pbuf = buf;
498 struct filter_pred *pred;
499 int err;
500
501 if (cnt >= sizeof(buf))
502 return -EINVAL;
503
504 if (copy_from_user(&buf, ubuf, cnt))
505 return -EFAULT;
506
507 pred = kzalloc(sizeof(*pred), GFP_KERNEL);
508 if (!pred)
509 return -ENOMEM;
510
511 err = filter_parse(&pbuf, pred);
512 if (err < 0) {
513 filter_free_pred(pred);
514 return err;
515 }
516
517 if (pred->clear) {
518 filter_free_preds(call);
519 filter_free_pred(pred);
520 return cnt;
521 }
522
523 if (filter_add_pred(call, pred)) {
524 filter_free_pred(pred);
525 return -EINVAL;
526 }
527
528 *ppos += cnt;
529
530 return cnt;
531}
532
533static ssize_t
534subsystem_filter_read(struct file *filp, char __user *ubuf, size_t cnt,
535 loff_t *ppos)
536{
537 struct event_subsystem *system = filp->private_data;
538 struct trace_seq *s;
539 int r;
540
541 if (*ppos)
542 return 0;
543
544 s = kmalloc(sizeof(*s), GFP_KERNEL);
545 if (!s)
546 return -ENOMEM;
547
548 trace_seq_init(s);
549
550 filter_print_preds(system->preds, s);
551 r = simple_read_from_buffer(ubuf, cnt, ppos, s->buffer, s->len);
552
553 kfree(s);
554
555 return r;
556}
557
558static ssize_t
559subsystem_filter_write(struct file *filp, const char __user *ubuf, size_t cnt,
560 loff_t *ppos)
561{
562 struct event_subsystem *system = filp->private_data;
563 char buf[64], *pbuf = buf;
564 struct filter_pred *pred;
565 int err;
566
567 if (cnt >= sizeof(buf))
568 return -EINVAL;
569
570 if (copy_from_user(&buf, ubuf, cnt))
571 return -EFAULT;
572
573 pred = kzalloc(sizeof(*pred), GFP_KERNEL);
574 if (!pred)
575 return -ENOMEM;
576
577 err = filter_parse(&pbuf, pred);
578 if (err < 0) {
579 filter_free_pred(pred);
580 return err;
581 }
582
583 if (pred->clear) {
584 filter_free_subsystem_preds(system);
585 filter_free_pred(pred);
586 return cnt;
587 }
588
589 if (filter_add_subsystem_pred(system, pred)) {
590 filter_free_subsystem_preds(system);
591 filter_free_pred(pred);
592 return -EINVAL;
593 }
594
595 *ppos += cnt;
596
597 return cnt;
598}
599
433static const struct seq_operations show_event_seq_ops = { 600static const struct seq_operations show_event_seq_ops = {
434 .start = t_start, 601 .start = t_start,
435 .next = t_next, 602 .next = t_next,
@@ -475,6 +642,18 @@ static const struct file_operations ftrace_event_id_fops = {
475 .read = event_id_read, 642 .read = event_id_read,
476}; 643};
477 644
645static const struct file_operations ftrace_event_filter_fops = {
646 .open = tracing_open_generic,
647 .read = event_filter_read,
648 .write = event_filter_write,
649};
650
651static const struct file_operations ftrace_subsystem_filter_fops = {
652 .open = tracing_open_generic,
653 .read = subsystem_filter_read,
654 .write = subsystem_filter_write,
655};
656
478static struct dentry *event_trace_events_dir(void) 657static struct dentry *event_trace_events_dir(void)
479{ 658{
480 static struct dentry *d_tracer; 659 static struct dentry *d_tracer;
@@ -495,12 +674,6 @@ static struct dentry *event_trace_events_dir(void)
495 return d_events; 674 return d_events;
496} 675}
497 676
498struct event_subsystem {
499 struct list_head list;
500 const char *name;
501 struct dentry *entry;
502};
503
504static LIST_HEAD(event_subsystems); 677static LIST_HEAD(event_subsystems);
505 678
506static struct dentry * 679static struct dentry *
@@ -533,6 +706,8 @@ event_subsystem_dir(const char *name, struct dentry *d_events)
533 system->name = name; 706 system->name = name;
534 list_add(&system->list, &event_subsystems); 707 list_add(&system->list, &event_subsystems);
535 708
709 system->preds = NULL;
710
536 return system->entry; 711 return system->entry;
537} 712}
538 713
@@ -581,6 +756,20 @@ event_create_dir(struct ftrace_event_call *call, struct dentry *d_events)
 			   call->name);
 	}
 
+	if (call->define_fields) {
+		ret = call->define_fields();
+		if (ret < 0) {
+			pr_warning("Could not initialize trace point"
+				   " events/%s\n", call->name);
+			return ret;
+		}
+		entry = debugfs_create_file("filter", 0644, call->dir, call,
+					    &ftrace_event_filter_fops);
+		if (!entry)
+			pr_warning("Could not create debugfs "
+				   "'%s/filter' entry\n", call->name);
+	}
+
 	/* A trace may not want to export its format */
 	if (!call->show_format)
 		return 0;
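
The handlers above surface each event's predicate list as a "filter" file in debugfs. As a rough userspace sketch of driving that interface (the mount point and the sched_switch/next_pid names are assumptions for illustration, not part of this diff):

/*
 * Hypothetical userspace sketch -- not part of this commit. Assumes
 * debugfs is mounted at /sys/kernel/debug and that the sched_switch
 * event (with a next_pid field) is available.
 */
#include <fcntl.h>
#include <unistd.h>

int main(void)
{
	const char *path =
		"/sys/kernel/debug/tracing/events/sched/sched_switch/filter";
	int fd = open(path, O_WRONLY);

	if (fd < 0)
		return 1;
	/* install a predicate: field ==/!= value */
	write(fd, "next_pid == 42\n", 15);
	/* writing "0" clears all predicates again */
	write(fd, "0\n", 2);
	close(fd);
	return 0;
}
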
diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c
new file mode 100644
index 000000000000..026be412f356
--- /dev/null
+++ b/kernel/trace/trace_events_filter.c
@@ -0,0 +1,427 @@
+/*
+ * trace_events_filter - generic event filtering
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
+ *
+ * Copyright (C) 2009 Tom Zanussi <tzanussi@gmail.com>
+ */
+
+#include <linux/debugfs.h>
+#include <linux/uaccess.h>
+#include <linux/module.h>
+#include <linux/ctype.h>
+
+#include "trace.h"
+#include "trace_output.h"
+
+static int filter_pred_64(struct filter_pred *pred, void *event)
+{
+	u64 *addr = (u64 *)(event + pred->offset);
+	u64 val = (u64)pred->val;
+	int match;
+
+	match = (val == *addr) ^ pred->not;
+
+	return match;
+}
+
+static int filter_pred_32(struct filter_pred *pred, void *event)
+{
+	u32 *addr = (u32 *)(event + pred->offset);
+	u32 val = (u32)pred->val;
+	int match;
+
+	match = (val == *addr) ^ pred->not;
+
+	return match;
+}
+
+static int filter_pred_16(struct filter_pred *pred, void *event)
+{
+	u16 *addr = (u16 *)(event + pred->offset);
+	u16 val = (u16)pred->val;
+	int match;
+
+	match = (val == *addr) ^ pred->not;
+
+	return match;
+}
+
+static int filter_pred_8(struct filter_pred *pred, void *event)
+{
+	u8 *addr = (u8 *)(event + pred->offset);
+	u8 val = (u8)pred->val;
+	int match;
+
+	match = (val == *addr) ^ pred->not;
+
+	return match;
+}
+
+static int filter_pred_string(struct filter_pred *pred, void *event)
+{
+	char *addr = (char *)(event + pred->offset);
+	int cmp, match;
+
+	cmp = strncmp(addr, pred->str_val, pred->str_len);
+
+	match = (!cmp) ^ pred->not;
+
+	return match;
+}
+
+/* return 1 if event matches, 0 otherwise (discard) */
+int filter_match_preds(struct ftrace_event_call *call, void *rec)
+{
+	int i, matched, and_failed = 0;
+	struct filter_pred *pred;
+
+	for (i = 0; i < MAX_FILTER_PRED; i++) {
+		if (call->preds[i]) {
+			pred = call->preds[i];
+			if (and_failed && !pred->or)
+				continue;
+			matched = pred->fn(pred, rec);
+			if (!matched && !pred->or) {
+				and_failed = 1;
+				continue;
+			} else if (matched && pred->or)
+				return 1;
+		} else
+			break;
+	}
+
+	if (and_failed)
+		return 0;
+
+	return 1;
+}
+
+void filter_print_preds(struct filter_pred **preds, struct trace_seq *s)
+{
+	char *field_name;
+	struct filter_pred *pred;
+	int i;
+
+	if (!preds) {
+		trace_seq_printf(s, "none\n");
+		return;
+	}
+
+	for (i = 0; i < MAX_FILTER_PRED; i++) {
+		if (preds[i]) {
+			pred = preds[i];
+			field_name = pred->field_name;
+			if (i)
+				trace_seq_printf(s, pred->or ? "|| " : "&& ");
+			trace_seq_printf(s, "%s ", field_name);
+			trace_seq_printf(s, pred->not ? "!= " : "== ");
+			if (pred->str_val)
+				trace_seq_printf(s, "%s\n", pred->str_val);
+			else
+				trace_seq_printf(s, "%llu\n", pred->val);
+		} else
+			break;
+	}
+}
+
+static struct ftrace_event_field *
+find_event_field(struct ftrace_event_call *call, char *name)
+{
+	struct ftrace_event_field *field;
+
+	list_for_each_entry(field, &call->fields, link) {
+		if (!strcmp(field->name, name))
+			return field;
+	}
+
+	return NULL;
+}
+
+void filter_free_pred(struct filter_pred *pred)
+{
+	if (!pred)
+		return;
+
+	kfree(pred->field_name);
+	kfree(pred->str_val);
+	kfree(pred);
+}
+
+void filter_free_preds(struct ftrace_event_call *call)
+{
+	int i;
+
+	if (call->preds) {
+		for (i = 0; i < MAX_FILTER_PRED; i++)
+			filter_free_pred(call->preds[i]);
+		kfree(call->preds);
+		call->preds = NULL;
+	}
+}
+
+void filter_free_subsystem_preds(struct event_subsystem *system)
+{
+	struct ftrace_event_call *call = __start_ftrace_events;
+	int i;
+
+	if (system->preds) {
+		for (i = 0; i < MAX_FILTER_PRED; i++)
+			filter_free_pred(system->preds[i]);
+		kfree(system->preds);
+		system->preds = NULL;
+	}
+
+	events_for_each(call) {
+		if (!call->name || !call->regfunc)
+			continue;
+
+		if (!strcmp(call->system, system->name))
+			filter_free_preds(call);
+	}
+}
+
+static int __filter_add_pred(struct ftrace_event_call *call,
+			     struct filter_pred *pred)
+{
+	int i;
+
+	if (call->preds && !pred->compound)
+		filter_free_preds(call);
+
+	if (!call->preds) {
+		call->preds = kzalloc(MAX_FILTER_PRED * sizeof(pred),
+				      GFP_KERNEL);
+		if (!call->preds)
+			return -ENOMEM;
+	}
+
+	for (i = 0; i < MAX_FILTER_PRED; i++) {
+		if (!call->preds[i]) {
+			call->preds[i] = pred;
+			return 0;
+		}
+	}
+
+	return -ENOMEM;
+}
+
+static int is_string_field(const char *type)
+{
+	if (strchr(type, '[') && strstr(type, "char"))
+		return 1;
+
+	return 0;
+}
+
+int filter_add_pred(struct ftrace_event_call *call, struct filter_pred *pred)
+{
+	struct ftrace_event_field *field;
+
+	field = find_event_field(call, pred->field_name);
+	if (!field)
+		return -EINVAL;
+
+	pred->offset = field->offset;
+
+	if (is_string_field(field->type)) {
+		if (!pred->str_val)
+			return -EINVAL;
+		pred->fn = filter_pred_string;
+		pred->str_len = field->size;
+		return __filter_add_pred(call, pred);
+	} else {
+		if (pred->str_val)
+			return -EINVAL;
+	}
+
+	switch (field->size) {
+	case 8:
+		pred->fn = filter_pred_64;
+		break;
+	case 4:
+		pred->fn = filter_pred_32;
+		break;
+	case 2:
+		pred->fn = filter_pred_16;
+		break;
+	case 1:
+		pred->fn = filter_pred_8;
+		break;
+	default:
+		return -EINVAL;
+	}
+
+	return __filter_add_pred(call, pred);
+}
+
+static struct filter_pred *copy_pred(struct filter_pred *pred)
+{
+	struct filter_pred *new_pred = kmalloc(sizeof(*pred), GFP_KERNEL);
+	if (!new_pred)
+		return NULL;
+
+	memcpy(new_pred, pred, sizeof(*pred));
+
+	if (pred->field_name) {
+		new_pred->field_name = kstrdup(pred->field_name, GFP_KERNEL);
+		if (!new_pred->field_name) {
+			kfree(new_pred);
+			return NULL;
+		}
+	}
+
+	if (pred->str_val) {
+		new_pred->str_val = kstrdup(pred->str_val, GFP_KERNEL);
+		if (!new_pred->str_val) {
+			filter_free_pred(new_pred);
+			return NULL;
+		}
+	}
+
+	return new_pred;
+}
+
+int filter_add_subsystem_pred(struct event_subsystem *system,
+			      struct filter_pred *pred)
+{
+	struct ftrace_event_call *call = __start_ftrace_events;
+	struct filter_pred *event_pred;
+	int i;
+
+	if (system->preds && !pred->compound)
+		filter_free_subsystem_preds(system);
+
+	if (!system->preds) {
+		system->preds = kzalloc(MAX_FILTER_PRED * sizeof(pred),
+					GFP_KERNEL);
+		if (!system->preds)
+			return -ENOMEM;
+	}
+
+	for (i = 0; i < MAX_FILTER_PRED; i++) {
+		if (!system->preds[i]) {
+			system->preds[i] = pred;
+			break;
+		}
+	}
+
+	if (i == MAX_FILTER_PRED)
+		return -EINVAL;
+
+	events_for_each(call) {
+		int err;
+
+		if (!call->name || !call->regfunc)
+			continue;
+
+		if (strcmp(call->system, system->name))
+			continue;
+
+		if (!find_event_field(call, pred->field_name))
+			continue;
+
+		event_pred = copy_pred(pred);
+		if (!event_pred)
+			goto oom;
+
+		err = filter_add_pred(call, event_pred);
+		if (err)
+			filter_free_pred(event_pred);
+		if (err == -ENOMEM)
+			goto oom;
+	}
+
+	return 0;
+
+oom:
+	system->preds[i] = NULL;
+	return -ENOMEM;
+}
+
+int filter_parse(char **pbuf, struct filter_pred *pred)
+{
+	char *tmp, *tok, *val_str = NULL;
+	int tok_n = 0;
+
+	/* field ==/!= number, or/and field ==/!= number, number */
+	while ((tok = strsep(pbuf, " \n"))) {
+		if (tok_n == 0) {
+			if (!strcmp(tok, "0")) {
+				pred->clear = 1;
+				return 0;
+			} else if (!strcmp(tok, "&&")) {
+				pred->or = 0;
+				pred->compound = 1;
+			} else if (!strcmp(tok, "||")) {
+				pred->or = 1;
+				pred->compound = 1;
+			} else
+				pred->field_name = tok;
+			tok_n = 1;
+			continue;
+		}
+		if (tok_n == 1) {
+			if (!pred->field_name)
+				pred->field_name = tok;
+			else if (!strcmp(tok, "!="))
+				pred->not = 1;
+			else if (!strcmp(tok, "=="))
+				pred->not = 0;
+			else {
+				pred->field_name = NULL;
+				return -EINVAL;
+			}
+			tok_n = 2;
+			continue;
+		}
+		if (tok_n == 2) {
+			if (pred->compound) {
+				if (!strcmp(tok, "!="))
+					pred->not = 1;
+				else if (!strcmp(tok, "=="))
+					pred->not = 0;
+				else {
+					pred->field_name = NULL;
+					return -EINVAL;
+				}
+			} else {
+				val_str = tok;
+				break; /* done */
+			}
+			tok_n = 3;
+			continue;
+		}
+		if (tok_n == 3) {
+			val_str = tok;
+			break; /* done */
+		}
+	}
+
+	pred->field_name = kstrdup(pred->field_name, GFP_KERNEL);
+	if (!pred->field_name)
+		return -ENOMEM;
+
+	pred->val = simple_strtoull(val_str, &tmp, 10);
+	if (tmp == val_str) {
+		pred->str_val = kstrdup(val_str, GFP_KERNEL);
+		if (!pred->str_val)
+			return -ENOMEM;
+	}
+
+	return 0;
+}
+
+
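
filter_match_preds() above evaluates the predicate array left to right: a failed AND-term poisons the chain unless a later OR-term matches, and a matching OR-term succeeds immediately. A standalone mock of just that loop, with the kernel types replaced by toy ones (illustration only):

/*
 * Standalone mock of the filter_match_preds() loop; not kernel code.
 */
#include <stdio.h>

struct pred {
	int or;		/* joined to the previous pred with || rather than && */
	int not;	/* != instead of == */
	int val;
};

static int match(const struct pred *p, int n, int field)
{
	int i, matched, and_failed = 0;

	for (i = 0; i < n; i++) {
		if (and_failed && !p[i].or)
			continue;	/* skip &&-terms of a failed chain */
		matched = (field == p[i].val) ^ p[i].not;
		if (!matched && !p[i].or)
			and_failed = 1;
		else if (matched && p[i].or)
			return 1;	/* a matching ||-term ends the scan */
	}
	return !and_failed;
}

int main(void)
{
	/* "val == 1 || val == 3" */
	const struct pred p[] = { { 0, 0, 1 }, { 1, 0, 3 } };

	printf("%d %d %d\n", match(p, 2, 1), match(p, 2, 3), match(p, 2, 2));
	return 0;	/* prints: 1 1 0 */
}
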
diff --git a/kernel/trace/trace_events_stage_2.h b/kernel/trace/trace_events_stage_2.h
index 5117c43f5c67..30743f7d4110 100644
--- a/kernel/trace/trace_events_stage_2.h
+++ b/kernel/trace/trace_events_stage_2.h
@@ -129,3 +129,48 @@ ftrace_format_##call(struct trace_seq *s) \
 }
 
 #include <trace/trace_event_types.h>
+
+#undef __field
+#define __field(type, item)						\
+	ret = trace_define_field(event_call, #type, #item,		\
+				 offsetof(typeof(field), item),		\
+				 sizeof(field.item));			\
+	if (ret)							\
+		return ret;
+
+#undef __array
+#define __array(type, item, len)					\
+	ret = trace_define_field(event_call, #type "[" #len "]", #item,	\
+				 offsetof(typeof(field), item),		\
+				 sizeof(field.item));			\
+	if (ret)							\
+		return ret;
+
+#define __common_field(type, item)					\
+	ret = trace_define_field(event_call, #type, "common_" #item,	\
+				 offsetof(typeof(field.ent), item),	\
+				 sizeof(field.ent.item));		\
+	if (ret)							\
+		return ret;
+
+#undef TRACE_EVENT
+#define TRACE_EVENT(call, proto, args, tstruct, func, print)		\
+int									\
+ftrace_define_fields_##call(void)					\
+{									\
+	struct ftrace_raw_##call field;					\
+	struct ftrace_event_call *event_call = &event_##call;		\
+	int ret;							\
+									\
+	__common_field(unsigned char, type);				\
+	__common_field(unsigned char, flags);				\
+	__common_field(unsigned char, preempt_count);			\
+	__common_field(int, pid);					\
+	__common_field(int, tgid);					\
+									\
+	tstruct;							\
+									\
+	return ret;							\
+}
+
+#include <trace/trace_event_types.h>
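
With this, each TRACE_EVENT() also expands to an ftrace_define_fields_<call>() that registers every field's type, name, offset, and size for the filter code. A user-space mock of the offsetof/sizeof pattern the __field and __common_field macros generate (the foo/bar names are hypothetical, chosen for illustration):

/*
 * User-space mock of the field-definition pattern; not kernel code.
 */
#include <stddef.h>
#include <stdio.h>

struct trace_entry {
	unsigned char type, flags, preempt_count;
	int pid, tgid;
};

struct ftrace_raw_foo {
	struct trace_entry ent;	/* common header, as in the kernel */
	int bar;		/* from a hypothetical __field(int, bar) */
};

static int trace_define_field(const char *type, const char *name,
			      size_t offset, size_t size)
{
	printf("%-14s %-12s offset=%2zu size=%zu\n", type, name, offset, size);
	return 0;
}

int main(void)
{
	struct ftrace_raw_foo field;

	/* what __common_field(int, pid) expands to */
	trace_define_field("int", "common_pid",
			   offsetof(struct trace_entry, pid),
			   sizeof(field.ent.pid));
	/* what __field(int, bar) expands to */
	trace_define_field("int", "bar",
			   offsetof(struct ftrace_raw_foo, bar),
			   sizeof(field.bar));
	return 0;
}
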
diff --git a/kernel/trace/trace_events_stage_3.h b/kernel/trace/trace_events_stage_3.h
index 6b3261ca988c..9d2fa78cecca 100644
--- a/kernel/trace/trace_events_stage_3.h
+++ b/kernel/trace/trace_events_stage_3.h
@@ -204,6 +204,7 @@ static struct ftrace_event_call event_##call; \
 									\
 static void ftrace_raw_event_##call(proto)				\
 {									\
+	struct ftrace_event_call *call = &event_##call;			\
 	struct ring_buffer_event *event;				\
 	struct ftrace_raw_##call *entry;				\
 	unsigned long irq_flags;					\
@@ -221,7 +222,11 @@ static void ftrace_raw_event_##call(proto) \
 									\
 	assign;								\
 									\
-	trace_current_buffer_unlock_commit(event, irq_flags, pc);	\
+	if (call->preds && !filter_match_preds(call, entry))		\
+		ring_buffer_event_discard(event);			\
+									\
+	trace_nowake_buffer_unlock_commit(event, irq_flags, pc);	\
+									\
 }									\
 									\
 static int ftrace_raw_reg_event_##call(void)				\
@@ -252,6 +257,7 @@ static int ftrace_raw_init_event_##call(void) \
 	if (!id)							\
 		return -ENODEV;						\
 	event_##call.id = id;						\
+	INIT_LIST_HEAD(&event_##call.fields);				\
 	return 0;							\
 }									\
 									\
@@ -264,6 +270,7 @@ __attribute__((section("_ftrace_events"))) event_##call = { \
 	.regfunc		= ftrace_raw_reg_event_##call,		\
 	.unregfunc		= ftrace_raw_unreg_event_##call,	\
 	.show_format		= ftrace_format_##call,			\
+	.define_fields		= ftrace_define_fields_##call,		\
 	_TRACE_PROFILE_INIT(call)					\
 }
 
diff --git a/kernel/trace/trace_nop.c b/kernel/trace/trace_nop.c
index 9aa84bde23cd..394f94417e2f 100644
--- a/kernel/trace/trace_nop.c
+++ b/kernel/trace/trace_nop.c
@@ -91,6 +91,7 @@ struct tracer nop_trace __read_mostly =
 	.name		= "nop",
 	.init		= nop_trace_init,
 	.reset		= nop_trace_reset,
+	.wait_pipe	= poll_wait_pipe,
 #ifdef CONFIG_FTRACE_SELFTEST
 	.selftest	= trace_selftest_startup_nop,
 #endif
diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c
index a3b6e3fd7044..aeac358ee231 100644
--- a/kernel/trace/trace_output.c
+++ b/kernel/trace/trace_output.c
@@ -147,7 +147,7 @@ int trace_seq_putc(struct trace_seq *s, unsigned char c)
 	return 1;
 }
 
-int trace_seq_putmem(struct trace_seq *s, void *mem, size_t len)
+int trace_seq_putmem(struct trace_seq *s, const void *mem, size_t len)
 {
 	if (len > ((PAGE_SIZE - 1) - s->len))
 		return 0;
@@ -158,10 +158,10 @@ int trace_seq_putmem(struct trace_seq *s, void *mem, size_t len)
 	return len;
 }
 
-int trace_seq_putmem_hex(struct trace_seq *s, void *mem, size_t len)
+int trace_seq_putmem_hex(struct trace_seq *s, const void *mem, size_t len)
 {
 	unsigned char hex[HEX_CHARS];
-	unsigned char *data = mem;
+	const unsigned char *data = mem;
 	int i, j;
 
 #ifdef __BIG_ENDIAN
@@ -177,6 +177,19 @@ int trace_seq_putmem_hex(struct trace_seq *s, void *mem, size_t len)
 	return trace_seq_putmem(s, hex, j);
 }
 
+void *trace_seq_reserve(struct trace_seq *s, size_t len)
+{
+	void *ret;
+
+	if (len > ((PAGE_SIZE - 1) - s->len))
+		return NULL;
+
+	ret = s->buffer + s->len;
+	s->len += len;
+
+	return ret;
+}
+
 int trace_seq_path(struct trace_seq *s, struct path *path)
 {
 	unsigned char *p;
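
trace_seq_reserve() hands back a window of the seq buffer so a caller can fill binary data in place instead of going through the printf path. A standalone sketch of that reserve-then-fill pattern, with trace_seq mocked up for user space (illustration only):

/*
 * Mock of the reserve-then-fill pattern; not kernel code.
 */
#include <stdio.h>
#include <string.h>

#define PAGE_SIZE 4096

struct trace_seq {
	unsigned char buffer[PAGE_SIZE];
	unsigned int len;
};

static void *trace_seq_reserve(struct trace_seq *s, size_t len)
{
	void *ret;

	if (len > ((PAGE_SIZE - 1) - s->len))
		return NULL;	/* not enough room left in the page */
	ret = s->buffer + s->len;
	s->len += len;
	return ret;
}

int main(void)
{
	struct trace_seq s = { .len = 0 };
	struct { int pid; long ts; } rec = { 42, 1234 };
	void *p = trace_seq_reserve(&s, sizeof(rec));

	if (p)
		memcpy(p, &rec, sizeof(rec));	/* fill the reserved region */
	printf("used %u bytes\n", s.len);
	return 0;
}
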
diff --git a/kernel/trace/trace_output.h b/kernel/trace/trace_output.h
index 1eac2973374e..91630217fb46 100644
--- a/kernel/trace/trace_output.h
+++ b/kernel/trace/trace_output.h
@@ -31,24 +31,27 @@ seq_print_ip_sym(struct trace_seq *s, unsigned long ip,
 		      unsigned long sym_flags);
 extern ssize_t trace_seq_to_user(struct trace_seq *s, char __user *ubuf,
 				 size_t cnt);
-int trace_seq_puts(struct trace_seq *s, const char *str);
-int trace_seq_putc(struct trace_seq *s, unsigned char c);
-int trace_seq_putmem(struct trace_seq *s, void *mem, size_t len);
-int trace_seq_putmem_hex(struct trace_seq *s, void *mem, size_t len);
-int trace_seq_path(struct trace_seq *s, struct path *path);
-int seq_print_userip_objs(const struct userstack_entry *entry,
-			  struct trace_seq *s, unsigned long sym_flags);
-int seq_print_user_ip(struct trace_seq *s, struct mm_struct *mm,
-		      unsigned long ip, unsigned long sym_flags);
+extern int trace_seq_puts(struct trace_seq *s, const char *str);
+extern int trace_seq_putc(struct trace_seq *s, unsigned char c);
+extern int trace_seq_putmem(struct trace_seq *s, const void *mem, size_t len);
+extern int trace_seq_putmem_hex(struct trace_seq *s, const void *mem,
+				size_t len);
+extern void *trace_seq_reserve(struct trace_seq *s, size_t len);
+extern int trace_seq_path(struct trace_seq *s, struct path *path);
+extern int seq_print_userip_objs(const struct userstack_entry *entry,
+				 struct trace_seq *s, unsigned long sym_flags);
+extern int seq_print_user_ip(struct trace_seq *s, struct mm_struct *mm,
+			     unsigned long ip, unsigned long sym_flags);
 
-int trace_print_context(struct trace_iterator *iter);
-int trace_print_lat_context(struct trace_iterator *iter);
+extern int trace_print_context(struct trace_iterator *iter);
+extern int trace_print_lat_context(struct trace_iterator *iter);
 
-struct trace_event *ftrace_find_event(int type);
-int register_ftrace_event(struct trace_event *event);
-int unregister_ftrace_event(struct trace_event *event);
+extern struct trace_event *ftrace_find_event(int type);
+extern int register_ftrace_event(struct trace_event *event);
+extern int unregister_ftrace_event(struct trace_event *event);
 
-enum print_line_t trace_nop_print(struct trace_iterator *iter, int flags);
+extern enum print_line_t trace_nop_print(struct trace_iterator *iter,
+					 int flags);
 
 #define MAX_MEMHEX_BYTES	8
 #define HEX_CHARS		(MAX_MEMHEX_BYTES*2 + 1)
diff --git a/kernel/trace/trace_stat.c b/kernel/trace/trace_stat.c
index f8f48d84b2c3..fdde3a4a94cd 100644
--- a/kernel/trace/trace_stat.c
+++ b/kernel/trace/trace_stat.c
@@ -125,23 +125,21 @@ static int stat_seq_init(struct tracer_stat_session *session)
 		INIT_LIST_HEAD(&new_entry->list);
 		new_entry->stat = stat;
 
-		list_for_each_entry(iter_entry, &session->stat_list, list) {
+		list_for_each_entry_reverse(iter_entry, &session->stat_list,
+				list) {
 
 			/* Insertion with a descendent sorting */
-			if (ts->stat_cmp(new_entry->stat,
-					 iter_entry->stat) > 0) {
+			if (ts->stat_cmp(iter_entry->stat,
+					new_entry->stat) >= 0) {
 
-				list_add_tail(&new_entry->list,
-					      &iter_entry->list);
-				break;
-
-			/* The current smaller value */
-			} else if (list_is_last(&iter_entry->list,
-						&session->stat_list)) {
 				list_add(&new_entry->list, &iter_entry->list);
 				break;
 			}
 		}
+
+		/* The current larger value */
+		if (list_empty(&new_entry->list))
+			list_add(&new_entry->list, &session->stat_list);
 	}
 exit:
 	mutex_unlock(&session->stat_mutex);
@@ -163,7 +161,7 @@ static void *stat_seq_start(struct seq_file *s, loff_t *pos)
 
 	/* If we are in the beginning of the file, print the headers */
 	if (!*pos && session->ts->stat_headers)
-		session->ts->stat_headers(s);
+		return SEQ_START_TOKEN;
 
 	return seq_list_start(&session->stat_list, *pos);
 }
@@ -172,6 +170,9 @@ static void *stat_seq_next(struct seq_file *s, void *p, loff_t *pos)
 {
 	struct tracer_stat_session *session = s->private;
 
+	if (p == SEQ_START_TOKEN)
+		return seq_list_start(&session->stat_list, *pos);
+
 	return seq_list_next(p, &session->stat_list, pos);
 }
 
@@ -186,6 +187,9 @@ static int stat_seq_show(struct seq_file *s, void *v)
 	struct tracer_stat_session *session = s->private;
 	struct trace_stat_list *l = list_entry(v, struct trace_stat_list, list);
 
+	if (v == SEQ_START_TOKEN)
+		return session->ts->stat_headers(s);
+
 	return session->ts->stat_show(s, l->stat);
 }
 
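
This change moves header printing into the iteration itself: stat_seq_start() returns SEQ_START_TOKEN on the first call and stat_seq_show() emits the headers when it sees it, so they come out exactly once per open. A minimal user-space mock of the pattern (names local to this sketch; the real seq_file contract differs in details):

/*
 * Mock of the SEQ_START_TOKEN pattern; not kernel code.
 */
#include <stdio.h>

#define SEQ_START_TOKEN ((void *)1)

static int data[] = { 3, 1, 4 };
#define NDATA 3

static void *start(long *pos)
{
	if (*pos == 0)
		return SEQ_START_TOKEN;	/* first call: emit headers */
	return (*pos <= NDATA) ? &data[*pos - 1] : NULL;
}

static void *next(void *p, long *pos)
{
	(*pos)++;
	return (*pos <= NDATA) ? &data[*pos - 1] : NULL;
}

static void show(void *p)
{
	if (p == SEQ_START_TOKEN)
		printf("# value\n");	/* header row, printed once */
	else
		printf("%d\n", *(int *)p);
}

int main(void)
{
	long pos = 0;
	void *p;

	for (p = start(&pos); p; p = next(p, &pos))
		show(p);
	return 0;
}
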
diff --git a/kernel/trace/trace_workqueue.c b/kernel/trace/trace_workqueue.c
index ee533c2e161b..984b9175c13d 100644
--- a/kernel/trace/trace_workqueue.c
+++ b/kernel/trace/trace_workqueue.c
@@ -196,6 +196,11 @@ static int workqueue_stat_show(struct seq_file *s, void *p)
 	struct pid *pid;
 	struct task_struct *tsk;
 
+	spin_lock_irqsave(&workqueue_cpu_stat(cpu)->lock, flags);
+	if (&cws->list == workqueue_cpu_stat(cpu)->list.next)
+		seq_printf(s, "\n");
+	spin_unlock_irqrestore(&workqueue_cpu_stat(cpu)->lock, flags);
+
 	pid = find_get_pid(cws->pid);
 	if (pid) {
 		tsk = get_pid_task(pid, PIDTYPE_PID);
@@ -208,18 +213,13 @@ static int workqueue_stat_show(struct seq_file *s, void *p)
 		put_pid(pid);
 	}
 
-	spin_lock_irqsave(&workqueue_cpu_stat(cpu)->lock, flags);
-	if (&cws->list == workqueue_cpu_stat(cpu)->list.next)
-		seq_printf(s, "\n");
-	spin_unlock_irqrestore(&workqueue_cpu_stat(cpu)->lock, flags);
-
 	return 0;
 }
 
 static int workqueue_stat_headers(struct seq_file *s)
 {
 	seq_printf(s, "# CPU  INSERTED  EXECUTED   NAME\n");
-	seq_printf(s, "# |      |         |          |\n\n");
+	seq_printf(s, "# |      |         |          |\n");
 	return 0;
 }
 
diff --git a/kernel/user.c b/kernel/user.c
index fbb300e6191f..850e0ba41c1e 100644
--- a/kernel/user.c
+++ b/kernel/user.c
@@ -20,7 +20,7 @@
 
 struct user_namespace init_user_ns = {
 	.kref = {
-		.refcount	= ATOMIC_INIT(1),
+		.refcount	= ATOMIC_INIT(2),
 	},
 	.creator = &root_user,
 };
diff --git a/kernel/utsname_sysctl.c b/kernel/utsname_sysctl.c
index 3b34b3545936..92359cc747a7 100644
--- a/kernel/utsname_sysctl.c
+++ b/kernel/utsname_sysctl.c
@@ -37,7 +37,7 @@ static void put_uts(ctl_table *table, int write, void *which)
 	up_write(&uts_sem);
 }
 
-#ifdef CONFIG_PROC_FS
+#ifdef CONFIG_PROC_SYSCTL
 /*
  * Special case of dostring for the UTS structure. This has locks
  * to observe. Should this be in kernel/sys.c ????
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index e53ee18ef431..b6b966ce1451 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -49,8 +49,6 @@ struct cpu_workqueue_struct {
 
 	struct workqueue_struct *wq;
 	struct task_struct *thread;
-
-	int run_depth;		/* Detect run_workqueue() recursion depth */
 } ____cacheline_aligned;
 
 /*
@@ -269,13 +267,6 @@ DEFINE_TRACE(workqueue_execution);
 static void run_workqueue(struct cpu_workqueue_struct *cwq)
 {
 	spin_lock_irq(&cwq->lock);
-	cwq->run_depth++;
-	if (cwq->run_depth > 3) {
-		/* morton gets to eat his hat */
-		printk("%s: recursion depth exceeded: %d\n",
-			__func__, cwq->run_depth);
-		dump_stack();
-	}
 	while (!list_empty(&cwq->worklist)) {
 		struct work_struct *work = list_entry(cwq->worklist.next,
 						struct work_struct, entry);
@@ -318,7 +309,6 @@ static void run_workqueue(struct cpu_workqueue_struct *cwq)
 		spin_lock_irq(&cwq->lock);
 		cwq->current_work = NULL;
 	}
-	cwq->run_depth--;
 	spin_unlock_irq(&cwq->lock);
 }
 
@@ -375,29 +365,20 @@ static void insert_wq_barrier(struct cpu_workqueue_struct *cwq,
 
 static int flush_cpu_workqueue(struct cpu_workqueue_struct *cwq)
 {
-	int active;
+	int active = 0;
+	struct wq_barrier barr;
 
-	if (cwq->thread == current) {
-		/*
-		 * Probably keventd trying to flush its own queue. So simply run
-		 * it by hand rather than deadlocking.
-		 */
-		run_workqueue(cwq);
-		active = 1;
-	} else {
-		struct wq_barrier barr;
+	WARN_ON(cwq->thread == current);
 
-		active = 0;
-		spin_lock_irq(&cwq->lock);
-		if (!list_empty(&cwq->worklist) || cwq->current_work != NULL) {
-			insert_wq_barrier(cwq, &barr, &cwq->worklist);
-			active = 1;
-		}
-		spin_unlock_irq(&cwq->lock);
-
-		if (active)
-			wait_for_completion(&barr.done);
-	}
+	spin_lock_irq(&cwq->lock);
+	if (!list_empty(&cwq->worklist) || cwq->current_work != NULL) {
+		insert_wq_barrier(cwq, &barr, &cwq->worklist);
+		active = 1;
+	}
+	spin_unlock_irq(&cwq->lock);
+
+	if (active)
+		wait_for_completion(&barr.done);
 
 	return active;
 }
@@ -423,7 +404,7 @@ void flush_workqueue(struct workqueue_struct *wq)
 	might_sleep();
 	lock_map_acquire(&wq->lockdep_map);
 	lock_map_release(&wq->lockdep_map);
-	for_each_cpu_mask_nr(cpu, *cpu_map)
+	for_each_cpu(cpu, cpu_map)
 		flush_cpu_workqueue(per_cpu_ptr(wq->cpu_wq, cpu));
 }
 EXPORT_SYMBOL_GPL(flush_workqueue);
@@ -554,7 +535,7 @@ static void wait_on_work(struct work_struct *work)
 	wq = cwq->wq;
 	cpu_map = wq_cpu_map(wq);
 
-	for_each_cpu_mask_nr(cpu, *cpu_map)
+	for_each_cpu(cpu, cpu_map)
 		wait_on_cpu_work(per_cpu_ptr(wq->cpu_wq, cpu), work);
 }
 
@@ -925,7 +906,7 @@ void destroy_workqueue(struct workqueue_struct *wq)
 	list_del(&wq->list);
 	spin_unlock(&workqueue_lock);
 
-	for_each_cpu_mask_nr(cpu, *cpu_map)
+	for_each_cpu(cpu, cpu_map)
 		cleanup_workqueue_thread(per_cpu_ptr(wq->cpu_wq, cpu));
 	cpu_maps_update_done();
 