Diffstat (limited to 'kernel')
-rw-r--r-- kernel/.gitignore | 1
-rw-r--r-- kernel/Makefile | 7
-rw-r--r-- kernel/audit.c | 365
-rw-r--r-- kernel/audit.h | 15
-rw-r--r-- kernel/audit_tree.c | 20
-rw-r--r-- kernel/audit_watch.c | 24
-rw-r--r-- kernel/auditfilter.c | 93
-rw-r--r-- kernel/auditsc.c | 44
-rw-r--r-- kernel/bounds.c | 2
-rw-r--r-- kernel/capability.c | 2
-rw-r--r-- kernel/cgroup.c | 1239
-rw-r--r-- kernel/cgroup_freezer.c | 7
-rw-r--r-- kernel/context_tracking.c | 8
-rw-r--r-- kernel/cpu/idle.c | 17
-rw-r--r-- kernel/cpuset.c | 79
-rw-r--r-- kernel/events/core.c | 60
-rw-r--r-- kernel/events/ring_buffer.c | 42
-rw-r--r-- kernel/events/uprobes.c | 64
-rw-r--r-- kernel/exit.c | 1
-rw-r--r-- kernel/extable.c | 4
-rw-r--r-- kernel/fork.c | 22
-rw-r--r-- kernel/freezer.c | 6
-rw-r--r-- kernel/futex.c | 210
-rw-r--r-- kernel/hrtimer.c | 3
-rw-r--r-- kernel/irq/pm.c | 2
-rw-r--r-- kernel/kexec.c | 5
-rw-r--r-- kernel/locking/lockdep.c | 4
-rw-r--r-- kernel/locking/mutex-debug.c | 7
-rw-r--r-- kernel/locking/rtmutex-debug.c | 8
-rw-r--r-- kernel/locking/rtmutex.c | 166
-rw-r--r-- kernel/locking/rtmutex_common.h | 23
-rw-r--r-- kernel/module.c | 6
-rw-r--r-- kernel/padata.c | 11
-rw-r--r-- kernel/panic.c | 2
-rw-r--r-- kernel/params.c | 25
-rw-r--r-- kernel/posix-cpu-timers.c | 327
-rw-r--r-- kernel/power/console.c | 1
-rw-r--r-- kernel/power/snapshot.c | 2
-rw-r--r-- kernel/printk/printk.c | 10
-rw-r--r-- kernel/rcu/rcu.h | 5
-rw-r--r-- kernel/rcu/srcu.c | 57
-rw-r--r-- kernel/rcu/torture.c | 75
-rw-r--r-- kernel/rcu/tree.c | 97
-rw-r--r-- kernel/rcu/tree.h | 12
-rw-r--r-- kernel/rcu/tree_plugin.h | 106
-rw-r--r-- kernel/rcu/tree_trace.c | 3
-rw-r--r-- kernel/rcu/update.c | 5
-rw-r--r-- kernel/reboot.c | 2
-rw-r--r-- kernel/sched/Makefile | 5
-rw-r--r-- kernel/sched/clock.c | 78
-rw-r--r-- kernel/sched/core.c | 847
-rw-r--r-- kernel/sched/cpuacct.c | 18
-rw-r--r-- kernel/sched/cpudeadline.c | 216
-rw-r--r-- kernel/sched/cpudeadline.h | 33
-rw-r--r-- kernel/sched/deadline.c | 1640
-rw-r--r-- kernel/sched/debug.c | 4
-rw-r--r-- kernel/sched/fair.c | 269
-rw-r--r-- kernel/sched/rt.c | 16
-rw-r--r-- kernel/sched/sched.h | 146
-rw-r--r-- kernel/sched/stop_task.c | 2
-rw-r--r-- kernel/softirq.c | 92
-rw-r--r-- kernel/sysctl.c | 18
-rw-r--r-- kernel/system_certificates.S | 14
-rw-r--r-- kernel/system_keyring.c | 4
-rw-r--r-- kernel/time/sched_clock.c | 6
-rw-r--r-- kernel/time/tick-broadcast.c | 6
-rw-r--r-- kernel/time/tick-common.c | 16
-rw-r--r-- kernel/time/tick-internal.h | 5
-rw-r--r-- kernel/time/tick-sched.c | 67
-rw-r--r-- kernel/time/timekeeping.c | 55
-rw-r--r-- kernel/timer.c | 5
-rw-r--r-- kernel/trace/Makefile | 1
-rw-r--r-- kernel/trace/ftrace.c | 281
-rw-r--r-- kernel/trace/ring_buffer.c | 2
-rw-r--r-- kernel/trace/trace.c | 57
-rw-r--r-- kernel/trace/trace.h | 193
-rw-r--r-- kernel/trace/trace_event_perf.c | 8
-rw-r--r-- kernel/trace/trace_events.c | 52
-rw-r--r-- kernel/trace/trace_events_filter.c | 12
-rw-r--r-- kernel/trace/trace_events_trigger.c | 1437
-rw-r--r-- kernel/trace/trace_kprobe.c | 838
-rw-r--r-- kernel/trace/trace_probe.c | 440
-rw-r--r-- kernel/trace/trace_probe.h | 224
-rw-r--r-- kernel/trace/trace_sched_wakeup.c | 65
-rw-r--r-- kernel/trace/trace_selftest.c | 33
-rw-r--r-- kernel/trace/trace_stack.c | 2
-rw-r--r-- kernel/trace/trace_syscalls.c | 24
-rw-r--r-- kernel/trace/trace_uprobe.c | 487
-rw-r--r-- kernel/user.c | 6
-rw-r--r-- kernel/workqueue.c | 84
90 files changed, 8186 insertions(+), 2918 deletions(-)
diff --git a/kernel/.gitignore b/kernel/.gitignore
index b3097bde4e9c..790d83c7d160 100644
--- a/kernel/.gitignore
+++ b/kernel/.gitignore
@@ -5,3 +5,4 @@ config_data.h
5config_data.gz 5config_data.gz
6timeconst.h 6timeconst.h
7hz.bc 7hz.bc
8x509_certificate_list
diff --git a/kernel/Makefile b/kernel/Makefile
index bbaf7d59c1bb..bc010ee272b6 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -137,9 +137,10 @@ $(obj)/timeconst.h: $(obj)/hz.bc $(src)/timeconst.bc FORCE
137############################################################################### 137###############################################################################
138ifeq ($(CONFIG_SYSTEM_TRUSTED_KEYRING),y) 138ifeq ($(CONFIG_SYSTEM_TRUSTED_KEYRING),y)
139X509_CERTIFICATES-y := $(wildcard *.x509) $(wildcard $(srctree)/*.x509) 139X509_CERTIFICATES-y := $(wildcard *.x509) $(wildcard $(srctree)/*.x509)
140X509_CERTIFICATES-$(CONFIG_MODULE_SIG) += signing_key.x509 140X509_CERTIFICATES-$(CONFIG_MODULE_SIG) += $(objtree)/signing_key.x509
141X509_CERTIFICATES := $(sort $(foreach CERT,$(X509_CERTIFICATES-y), \ 141X509_CERTIFICATES-raw := $(sort $(foreach CERT,$(X509_CERTIFICATES-y), \
142 $(or $(realpath $(CERT)),$(CERT)))) 142 $(or $(realpath $(CERT)),$(CERT))))
143X509_CERTIFICATES := $(subst $(realpath $(objtree))/,,$(X509_CERTIFICATES-raw))
143 144
144ifeq ($(X509_CERTIFICATES),) 145ifeq ($(X509_CERTIFICATES),)
145$(warning *** No X.509 certificates found ***) 146$(warning *** No X.509 certificates found ***)
@@ -164,9 +165,9 @@ $(obj)/x509_certificate_list: $(X509_CERTIFICATES) $(obj)/.x509.list
164targets += $(obj)/.x509.list 165targets += $(obj)/.x509.list
165$(obj)/.x509.list: 166$(obj)/.x509.list:
166 @echo $(X509_CERTIFICATES) >$@ 167 @echo $(X509_CERTIFICATES) >$@
168endif
167 169
168clean-files := x509_certificate_list .x509.list 170clean-files := x509_certificate_list .x509.list
169endif
170 171
171ifeq ($(CONFIG_MODULE_SIG),y) 172ifeq ($(CONFIG_MODULE_SIG),y)
172############################################################################### 173###############################################################################
diff --git a/kernel/audit.c b/kernel/audit.c
index 906ae5a0233a..34c5a2310fbf 100644
--- a/kernel/audit.c
+++ b/kernel/audit.c
@@ -41,6 +41,8 @@
41 * Example user-space utilities: http://people.redhat.com/sgrubb/audit/ 41 * Example user-space utilities: http://people.redhat.com/sgrubb/audit/
42 */ 42 */
43 43
44#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
45
44#include <linux/init.h> 46#include <linux/init.h>
45#include <asm/types.h> 47#include <asm/types.h>
46#include <linux/atomic.h> 48#include <linux/atomic.h>
@@ -63,6 +65,7 @@
63#include <linux/freezer.h> 65#include <linux/freezer.h>
64#include <linux/tty.h> 66#include <linux/tty.h>
65#include <linux/pid_namespace.h> 67#include <linux/pid_namespace.h>
68#include <net/netns/generic.h>
66 69
67#include "audit.h" 70#include "audit.h"
68 71
@@ -76,16 +79,16 @@ static int audit_initialized;
76#define AUDIT_OFF 0 79#define AUDIT_OFF 0
77#define AUDIT_ON 1 80#define AUDIT_ON 1
78#define AUDIT_LOCKED 2 81#define AUDIT_LOCKED 2
79int audit_enabled; 82u32 audit_enabled;
80int audit_ever_enabled; 83u32 audit_ever_enabled;
81 84
82EXPORT_SYMBOL_GPL(audit_enabled); 85EXPORT_SYMBOL_GPL(audit_enabled);
83 86
84/* Default state when kernel boots without any parameters. */ 87/* Default state when kernel boots without any parameters. */
85static int audit_default; 88static u32 audit_default;
86 89
87/* If auditing cannot proceed, audit_failure selects what happens. */ 90/* If auditing cannot proceed, audit_failure selects what happens. */
88static int audit_failure = AUDIT_FAIL_PRINTK; 91static u32 audit_failure = AUDIT_FAIL_PRINTK;
89 92
90/* 93/*
91 * If audit records are to be written to the netlink socket, audit_pid 94 * If audit records are to be written to the netlink socket, audit_pid
@@ -93,17 +96,19 @@ static int audit_failure = AUDIT_FAIL_PRINTK;
93 * the portid to use to send netlink messages to that process. 96 * the portid to use to send netlink messages to that process.
94 */ 97 */
95int audit_pid; 98int audit_pid;
96static int audit_nlk_portid; 99static __u32 audit_nlk_portid;
97 100
98/* If audit_rate_limit is non-zero, limit the rate of sending audit records 101/* If audit_rate_limit is non-zero, limit the rate of sending audit records
99 * to that number per second. This prevents DoS attacks, but results in 102 * to that number per second. This prevents DoS attacks, but results in
100 * audit records being dropped. */ 103 * audit records being dropped. */
101static int audit_rate_limit; 104static u32 audit_rate_limit;
102 105
103/* Number of outstanding audit_buffers allowed. */ 106/* Number of outstanding audit_buffers allowed.
104static int audit_backlog_limit = 64; 107 * When set to zero, this means unlimited. */
105static int audit_backlog_wait_time = 60 * HZ; 108static u32 audit_backlog_limit = 64;
106static int audit_backlog_wait_overflow = 0; 109#define AUDIT_BACKLOG_WAIT_TIME (60 * HZ)
110static u32 audit_backlog_wait_time = AUDIT_BACKLOG_WAIT_TIME;
111static u32 audit_backlog_wait_overflow = 0;
107 112
108/* The identity of the user shutting down the audit system. */ 113/* The identity of the user shutting down the audit system. */
109kuid_t audit_sig_uid = INVALID_UID; 114kuid_t audit_sig_uid = INVALID_UID;
@@ -121,6 +126,7 @@ static atomic_t audit_lost = ATOMIC_INIT(0);
121 126
122/* The netlink socket. */ 127/* The netlink socket. */
123static struct sock *audit_sock; 128static struct sock *audit_sock;
129int audit_net_id;
124 130
125/* Hash for inode-based rules */ 131/* Hash for inode-based rules */
126struct list_head audit_inode_hash[AUDIT_INODE_BUCKETS]; 132struct list_head audit_inode_hash[AUDIT_INODE_BUCKETS];
@@ -175,27 +181,27 @@ struct audit_buffer {
175}; 181};
176 182
177struct audit_reply { 183struct audit_reply {
178 int pid; 184 __u32 portid;
185 pid_t pid;
179 struct sk_buff *skb; 186 struct sk_buff *skb;
180}; 187};
181 188
182static void audit_set_pid(struct audit_buffer *ab, pid_t pid) 189static void audit_set_portid(struct audit_buffer *ab, __u32 portid)
183{ 190{
184 if (ab) { 191 if (ab) {
185 struct nlmsghdr *nlh = nlmsg_hdr(ab->skb); 192 struct nlmsghdr *nlh = nlmsg_hdr(ab->skb);
186 nlh->nlmsg_pid = pid; 193 nlh->nlmsg_pid = portid;
187 } 194 }
188} 195}
189 196
190void audit_panic(const char *message) 197void audit_panic(const char *message)
191{ 198{
192 switch (audit_failure) 199 switch (audit_failure) {
193 {
194 case AUDIT_FAIL_SILENT: 200 case AUDIT_FAIL_SILENT:
195 break; 201 break;
196 case AUDIT_FAIL_PRINTK: 202 case AUDIT_FAIL_PRINTK:
197 if (printk_ratelimit()) 203 if (printk_ratelimit())
198 printk(KERN_ERR "audit: %s\n", message); 204 pr_err("%s\n", message);
199 break; 205 break;
200 case AUDIT_FAIL_PANIC: 206 case AUDIT_FAIL_PANIC:
201 /* test audit_pid since printk is always losey, why bother? */ 207 /* test audit_pid since printk is always losey, why bother? */
@@ -266,9 +272,7 @@ void audit_log_lost(const char *message)
266 272
267 if (print) { 273 if (print) {
268 if (printk_ratelimit()) 274 if (printk_ratelimit())
269 printk(KERN_WARNING 275 pr_warn("audit_lost=%u audit_rate_limit=%u audit_backlog_limit=%u\n",
270 "audit: audit_lost=%d audit_rate_limit=%d "
271 "audit_backlog_limit=%d\n",
272 atomic_read(&audit_lost), 276 atomic_read(&audit_lost),
273 audit_rate_limit, 277 audit_rate_limit,
274 audit_backlog_limit); 278 audit_backlog_limit);
@@ -276,7 +280,7 @@ void audit_log_lost(const char *message)
276 } 280 }
277} 281}
278 282
279static int audit_log_config_change(char *function_name, int new, int old, 283static int audit_log_config_change(char *function_name, u32 new, u32 old,
280 int allow_changes) 284 int allow_changes)
281{ 285{
282 struct audit_buffer *ab; 286 struct audit_buffer *ab;
@@ -285,7 +289,7 @@ static int audit_log_config_change(char *function_name, int new, int old,
285 ab = audit_log_start(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE); 289 ab = audit_log_start(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE);
286 if (unlikely(!ab)) 290 if (unlikely(!ab))
287 return rc; 291 return rc;
288 audit_log_format(ab, "%s=%d old=%d", function_name, new, old); 292 audit_log_format(ab, "%s=%u old=%u", function_name, new, old);
289 audit_log_session_info(ab); 293 audit_log_session_info(ab);
290 rc = audit_log_task_context(ab); 294 rc = audit_log_task_context(ab);
291 if (rc) 295 if (rc)
@@ -295,9 +299,10 @@ static int audit_log_config_change(char *function_name, int new, int old,
295 return rc; 299 return rc;
296} 300}
297 301
298static int audit_do_config_change(char *function_name, int *to_change, int new) 302static int audit_do_config_change(char *function_name, u32 *to_change, u32 new)
299{ 303{
300 int allow_changes, rc = 0, old = *to_change; 304 int allow_changes, rc = 0;
305 u32 old = *to_change;
301 306
302 /* check if we are locked */ 307 /* check if we are locked */
303 if (audit_enabled == AUDIT_LOCKED) 308 if (audit_enabled == AUDIT_LOCKED)
@@ -320,17 +325,23 @@ static int audit_do_config_change(char *function_name, int *to_change, int new)
320 return rc; 325 return rc;
321} 326}
322 327
323static int audit_set_rate_limit(int limit) 328static int audit_set_rate_limit(u32 limit)
324{ 329{
325 return audit_do_config_change("audit_rate_limit", &audit_rate_limit, limit); 330 return audit_do_config_change("audit_rate_limit", &audit_rate_limit, limit);
326} 331}
327 332
328static int audit_set_backlog_limit(int limit) 333static int audit_set_backlog_limit(u32 limit)
329{ 334{
330 return audit_do_config_change("audit_backlog_limit", &audit_backlog_limit, limit); 335 return audit_do_config_change("audit_backlog_limit", &audit_backlog_limit, limit);
331} 336}
332 337
333static int audit_set_enabled(int state) 338static int audit_set_backlog_wait_time(u32 timeout)
339{
340 return audit_do_config_change("audit_backlog_wait_time",
341 &audit_backlog_wait_time, timeout);
342}
343
344static int audit_set_enabled(u32 state)
334{ 345{
335 int rc; 346 int rc;
336 if (state < AUDIT_OFF || state > AUDIT_LOCKED) 347 if (state < AUDIT_OFF || state > AUDIT_LOCKED)
@@ -343,7 +354,7 @@ static int audit_set_enabled(int state)
343 return rc; 354 return rc;
344} 355}
345 356
346static int audit_set_failure(int state) 357static int audit_set_failure(u32 state)
347{ 358{
348 if (state != AUDIT_FAIL_SILENT 359 if (state != AUDIT_FAIL_SILENT
349 && state != AUDIT_FAIL_PRINTK 360 && state != AUDIT_FAIL_PRINTK
@@ -365,7 +376,8 @@ static int audit_set_failure(int state)
365static void audit_hold_skb(struct sk_buff *skb) 376static void audit_hold_skb(struct sk_buff *skb)
366{ 377{
367 if (audit_default && 378 if (audit_default &&
368 skb_queue_len(&audit_skb_hold_queue) < audit_backlog_limit) 379 (!audit_backlog_limit ||
380 skb_queue_len(&audit_skb_hold_queue) < audit_backlog_limit))
369 skb_queue_tail(&audit_skb_hold_queue, skb); 381 skb_queue_tail(&audit_skb_hold_queue, skb);
370 else 382 else
371 kfree_skb(skb); 383 kfree_skb(skb);
@@ -382,7 +394,7 @@ static void audit_printk_skb(struct sk_buff *skb)
382 394
383 if (nlh->nlmsg_type != AUDIT_EOE) { 395 if (nlh->nlmsg_type != AUDIT_EOE) {
384 if (printk_ratelimit()) 396 if (printk_ratelimit())
385 printk(KERN_NOTICE "type=%d %s\n", nlh->nlmsg_type, data); 397 pr_notice("type=%d %s\n", nlh->nlmsg_type, data);
386 else 398 else
387 audit_log_lost("printk limit exceeded\n"); 399 audit_log_lost("printk limit exceeded\n");
388 } 400 }
@@ -398,9 +410,12 @@ static void kauditd_send_skb(struct sk_buff *skb)
398 err = netlink_unicast(audit_sock, skb, audit_nlk_portid, 0); 410 err = netlink_unicast(audit_sock, skb, audit_nlk_portid, 0);
399 if (err < 0) { 411 if (err < 0) {
400 BUG_ON(err != -ECONNREFUSED); /* Shouldn't happen */ 412 BUG_ON(err != -ECONNREFUSED); /* Shouldn't happen */
401 printk(KERN_ERR "audit: *NO* daemon at audit_pid=%d\n", audit_pid); 413 if (audit_pid) {
402 audit_log_lost("auditd disappeared\n"); 414 pr_err("*NO* daemon at audit_pid=%d\n", audit_pid);
403 audit_pid = 0; 415 audit_log_lost("auditd disappeared\n");
416 audit_pid = 0;
417 audit_sock = NULL;
418 }
404 /* we might get lucky and get this in the next auditd */ 419 /* we might get lucky and get this in the next auditd */
405 audit_hold_skb(skb); 420 audit_hold_skb(skb);
406 } else 421 } else
@@ -457,8 +472,10 @@ static int kauditd_thread(void *dummy)
457 flush_hold_queue(); 472 flush_hold_queue();
458 473
459 skb = skb_dequeue(&audit_skb_queue); 474 skb = skb_dequeue(&audit_skb_queue);
460 wake_up(&audit_backlog_wait); 475
461 if (skb) { 476 if (skb) {
477 if (skb_queue_len(&audit_skb_queue) <= audit_backlog_limit)
478 wake_up(&audit_backlog_wait);
462 if (audit_pid) 479 if (audit_pid)
463 kauditd_send_skb(skb); 480 kauditd_send_skb(skb);
464 else 481 else
@@ -482,22 +499,23 @@ static int kauditd_thread(void *dummy)
482int audit_send_list(void *_dest) 499int audit_send_list(void *_dest)
483{ 500{
484 struct audit_netlink_list *dest = _dest; 501 struct audit_netlink_list *dest = _dest;
485 int pid = dest->pid;
486 struct sk_buff *skb; 502 struct sk_buff *skb;
503 struct net *net = get_net_ns_by_pid(dest->pid);
504 struct audit_net *aunet = net_generic(net, audit_net_id);
487 505
488 /* wait for parent to finish and send an ACK */ 506 /* wait for parent to finish and send an ACK */
489 mutex_lock(&audit_cmd_mutex); 507 mutex_lock(&audit_cmd_mutex);
490 mutex_unlock(&audit_cmd_mutex); 508 mutex_unlock(&audit_cmd_mutex);
491 509
492 while ((skb = __skb_dequeue(&dest->q)) != NULL) 510 while ((skb = __skb_dequeue(&dest->q)) != NULL)
493 netlink_unicast(audit_sock, skb, pid, 0); 511 netlink_unicast(aunet->nlsk, skb, dest->portid, 0);
494 512
495 kfree(dest); 513 kfree(dest);
496 514
497 return 0; 515 return 0;
498} 516}
499 517
500struct sk_buff *audit_make_reply(int pid, int seq, int type, int done, 518struct sk_buff *audit_make_reply(__u32 portid, int seq, int type, int done,
501 int multi, const void *payload, int size) 519 int multi, const void *payload, int size)
502{ 520{
503 struct sk_buff *skb; 521 struct sk_buff *skb;
@@ -510,7 +528,7 @@ struct sk_buff *audit_make_reply(int pid, int seq, int type, int done,
510 if (!skb) 528 if (!skb)
511 return NULL; 529 return NULL;
512 530
513 nlh = nlmsg_put(skb, pid, seq, t, size, flags); 531 nlh = nlmsg_put(skb, portid, seq, t, size, flags);
514 if (!nlh) 532 if (!nlh)
515 goto out_kfree_skb; 533 goto out_kfree_skb;
516 data = nlmsg_data(nlh); 534 data = nlmsg_data(nlh);
@@ -525,19 +543,21 @@ out_kfree_skb:
525static int audit_send_reply_thread(void *arg) 543static int audit_send_reply_thread(void *arg)
526{ 544{
527 struct audit_reply *reply = (struct audit_reply *)arg; 545 struct audit_reply *reply = (struct audit_reply *)arg;
546 struct net *net = get_net_ns_by_pid(reply->pid);
547 struct audit_net *aunet = net_generic(net, audit_net_id);
528 548
529 mutex_lock(&audit_cmd_mutex); 549 mutex_lock(&audit_cmd_mutex);
530 mutex_unlock(&audit_cmd_mutex); 550 mutex_unlock(&audit_cmd_mutex);
531 551
532 /* Ignore failure. It'll only happen if the sender goes away, 552 /* Ignore failure. It'll only happen if the sender goes away,
533 because our timeout is set to infinite. */ 553 because our timeout is set to infinite. */
534 netlink_unicast(audit_sock, reply->skb, reply->pid, 0); 554 netlink_unicast(aunet->nlsk , reply->skb, reply->portid, 0);
535 kfree(reply); 555 kfree(reply);
536 return 0; 556 return 0;
537} 557}
538/** 558/**
539 * audit_send_reply - send an audit reply message via netlink 559 * audit_send_reply - send an audit reply message via netlink
540 * @pid: process id to send reply to 560 * @portid: netlink port to which to send reply
541 * @seq: sequence number 561 * @seq: sequence number
542 * @type: audit message type 562 * @type: audit message type
543 * @done: done (last) flag 563 * @done: done (last) flag
@@ -545,11 +565,11 @@ static int audit_send_reply_thread(void *arg)
545 * @payload: payload data 565 * @payload: payload data
546 * @size: payload size 566 * @size: payload size
547 * 567 *
548 * Allocates an skb, builds the netlink message, and sends it to the pid. 568 * Allocates an skb, builds the netlink message, and sends it to the port id.
549 * No failure notifications. 569 * No failure notifications.
550 */ 570 */
551static void audit_send_reply(int pid, int seq, int type, int done, int multi, 571static void audit_send_reply(__u32 portid, int seq, int type, int done,
552 const void *payload, int size) 572 int multi, const void *payload, int size)
553{ 573{
554 struct sk_buff *skb; 574 struct sk_buff *skb;
555 struct task_struct *tsk; 575 struct task_struct *tsk;
@@ -559,11 +579,12 @@ static void audit_send_reply(int pid, int seq, int type, int done, int multi,
559 if (!reply) 579 if (!reply)
560 return; 580 return;
561 581
562 skb = audit_make_reply(pid, seq, type, done, multi, payload, size); 582 skb = audit_make_reply(portid, seq, type, done, multi, payload, size);
563 if (!skb) 583 if (!skb)
564 goto out; 584 goto out;
565 585
566 reply->pid = pid; 586 reply->portid = portid;
587 reply->pid = task_pid_vnr(current);
567 reply->skb = skb; 588 reply->skb = skb;
568 589
569 tsk = kthread_run(audit_send_reply_thread, reply, "audit_send_reply"); 590 tsk = kthread_run(audit_send_reply_thread, reply, "audit_send_reply");
@@ -663,8 +684,12 @@ static void audit_log_feature_change(int which, u32 old_feature, u32 new_feature
663{ 684{
664 struct audit_buffer *ab; 685 struct audit_buffer *ab;
665 686
687 if (audit_enabled == AUDIT_OFF)
688 return;
689
666 ab = audit_log_start(NULL, GFP_KERNEL, AUDIT_FEATURE_CHANGE); 690 ab = audit_log_start(NULL, GFP_KERNEL, AUDIT_FEATURE_CHANGE);
667 audit_log_format(ab, "feature=%s new=%d old=%d old_lock=%d new_lock=%d res=%d", 691 audit_log_task_info(ab, current);
692 audit_log_format(ab, "feature=%s old=%u new=%u old_lock=%u new_lock=%u res=%d",
668 audit_feature_names[which], !!old_feature, !!new_feature, 693 audit_feature_names[which], !!old_feature, !!new_feature,
669 !!old_lock, !!new_lock, res); 694 !!old_lock, !!new_lock, res);
670 audit_log_end(ab); 695 audit_log_end(ab);
@@ -694,7 +719,7 @@ static int audit_set_feature(struct sk_buff *skb)
694 old_lock = af.lock & feature; 719 old_lock = af.lock & feature;
695 720
696 /* are we changing a locked feature? */ 721 /* are we changing a locked feature? */
697 if ((af.lock & feature) && (new_feature != old_feature)) { 722 if (old_lock && (new_feature != old_feature)) {
698 audit_log_feature_change(i, old_feature, new_feature, 723 audit_log_feature_change(i, old_feature, new_feature,
699 old_lock, new_lock, 0); 724 old_lock, new_lock, 0);
700 return -EPERM; 725 return -EPERM;
@@ -732,7 +757,6 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
732{ 757{
733 u32 seq; 758 u32 seq;
734 void *data; 759 void *data;
735 struct audit_status *status_get, status_set;
736 int err; 760 int err;
737 struct audit_buffer *ab; 761 struct audit_buffer *ab;
738 u16 msg_type = nlh->nlmsg_type; 762 u16 msg_type = nlh->nlmsg_type;
@@ -758,48 +782,70 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
758 data = nlmsg_data(nlh); 782 data = nlmsg_data(nlh);
759 783
760 switch (msg_type) { 784 switch (msg_type) {
761 case AUDIT_GET: 785 case AUDIT_GET: {
762 memset(&status_set, 0, sizeof(status_set)); 786 struct audit_status s;
763 status_set.enabled = audit_enabled; 787 memset(&s, 0, sizeof(s));
764 status_set.failure = audit_failure; 788 s.enabled = audit_enabled;
765 status_set.pid = audit_pid; 789 s.failure = audit_failure;
766 status_set.rate_limit = audit_rate_limit; 790 s.pid = audit_pid;
767 status_set.backlog_limit = audit_backlog_limit; 791 s.rate_limit = audit_rate_limit;
768 status_set.lost = atomic_read(&audit_lost); 792 s.backlog_limit = audit_backlog_limit;
769 status_set.backlog = skb_queue_len(&audit_skb_queue); 793 s.lost = atomic_read(&audit_lost);
794 s.backlog = skb_queue_len(&audit_skb_queue);
795 s.version = AUDIT_VERSION_LATEST;
796 s.backlog_wait_time = audit_backlog_wait_time;
770 audit_send_reply(NETLINK_CB(skb).portid, seq, AUDIT_GET, 0, 0, 797 audit_send_reply(NETLINK_CB(skb).portid, seq, AUDIT_GET, 0, 0,
771 &status_set, sizeof(status_set)); 798 &s, sizeof(s));
772 break; 799 break;
773 case AUDIT_SET: 800 }
774 if (nlmsg_len(nlh) < sizeof(struct audit_status)) 801 case AUDIT_SET: {
775 return -EINVAL; 802 struct audit_status s;
776 status_get = (struct audit_status *)data; 803 memset(&s, 0, sizeof(s));
777 if (status_get->mask & AUDIT_STATUS_ENABLED) { 804 /* guard against past and future API changes */
778 err = audit_set_enabled(status_get->enabled); 805 memcpy(&s, data, min_t(size_t, sizeof(s), nlmsg_len(nlh)));
806 if (s.mask & AUDIT_STATUS_ENABLED) {
807 err = audit_set_enabled(s.enabled);
779 if (err < 0) 808 if (err < 0)
780 return err; 809 return err;
781 } 810 }
782 if (status_get->mask & AUDIT_STATUS_FAILURE) { 811 if (s.mask & AUDIT_STATUS_FAILURE) {
783 err = audit_set_failure(status_get->failure); 812 err = audit_set_failure(s.failure);
784 if (err < 0) 813 if (err < 0)
785 return err; 814 return err;
786 } 815 }
787 if (status_get->mask & AUDIT_STATUS_PID) { 816 if (s.mask & AUDIT_STATUS_PID) {
788 int new_pid = status_get->pid; 817 int new_pid = s.pid;
789 818
819 if ((!new_pid) && (task_tgid_vnr(current) != audit_pid))
820 return -EACCES;
790 if (audit_enabled != AUDIT_OFF) 821 if (audit_enabled != AUDIT_OFF)
791 audit_log_config_change("audit_pid", new_pid, audit_pid, 1); 822 audit_log_config_change("audit_pid", new_pid, audit_pid, 1);
792 audit_pid = new_pid; 823 audit_pid = new_pid;
793 audit_nlk_portid = NETLINK_CB(skb).portid; 824 audit_nlk_portid = NETLINK_CB(skb).portid;
825 audit_sock = skb->sk;
794 } 826 }
795 if (status_get->mask & AUDIT_STATUS_RATE_LIMIT) { 827 if (s.mask & AUDIT_STATUS_RATE_LIMIT) {
796 err = audit_set_rate_limit(status_get->rate_limit); 828 err = audit_set_rate_limit(s.rate_limit);
829 if (err < 0)
830 return err;
831 }
832 if (s.mask & AUDIT_STATUS_BACKLOG_LIMIT) {
833 err = audit_set_backlog_limit(s.backlog_limit);
834 if (err < 0)
835 return err;
836 }
837 if (s.mask & AUDIT_STATUS_BACKLOG_WAIT_TIME) {
838 if (sizeof(s) > (size_t)nlh->nlmsg_len)
839 return -EINVAL;
840 if (s.backlog_wait_time < 0 ||
841 s.backlog_wait_time > 10*AUDIT_BACKLOG_WAIT_TIME)
842 return -EINVAL;
843 err = audit_set_backlog_wait_time(s.backlog_wait_time);
797 if (err < 0) 844 if (err < 0)
798 return err; 845 return err;
799 } 846 }
800 if (status_get->mask & AUDIT_STATUS_BACKLOG_LIMIT)
801 err = audit_set_backlog_limit(status_get->backlog_limit);
802 break; 847 break;
848 }
803 case AUDIT_GET_FEATURE: 849 case AUDIT_GET_FEATURE:
804 err = audit_get_feature(skb); 850 err = audit_get_feature(skb);
805 if (err) 851 if (err)
@@ -817,13 +863,14 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
817 return 0; 863 return 0;
818 864
819 err = audit_filter_user(msg_type); 865 err = audit_filter_user(msg_type);
820 if (err == 1) { 866 if (err == 1) { /* match or error */
821 err = 0; 867 err = 0;
822 if (msg_type == AUDIT_USER_TTY) { 868 if (msg_type == AUDIT_USER_TTY) {
823 err = tty_audit_push_current(); 869 err = tty_audit_push_current();
824 if (err) 870 if (err)
825 break; 871 break;
826 } 872 }
873 mutex_unlock(&audit_cmd_mutex);
827 audit_log_common_recv_msg(&ab, msg_type); 874 audit_log_common_recv_msg(&ab, msg_type);
828 if (msg_type != AUDIT_USER_TTY) 875 if (msg_type != AUDIT_USER_TTY)
829 audit_log_format(ab, " msg='%.*s'", 876 audit_log_format(ab, " msg='%.*s'",
@@ -839,8 +886,9 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
839 size--; 886 size--;
840 audit_log_n_untrustedstring(ab, data, size); 887 audit_log_n_untrustedstring(ab, data, size);
841 } 888 }
842 audit_set_pid(ab, NETLINK_CB(skb).portid); 889 audit_set_portid(ab, NETLINK_CB(skb).portid);
843 audit_log_end(ab); 890 audit_log_end(ab);
891 mutex_lock(&audit_cmd_mutex);
844 } 892 }
845 break; 893 break;
846 case AUDIT_ADD_RULE: 894 case AUDIT_ADD_RULE:
@@ -853,11 +901,12 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
853 audit_log_end(ab); 901 audit_log_end(ab);
854 return -EPERM; 902 return -EPERM;
855 } 903 }
856 /* fallthrough */ 904 err = audit_rule_change(msg_type, NETLINK_CB(skb).portid,
857 case AUDIT_LIST_RULES:
858 err = audit_receive_filter(msg_type, NETLINK_CB(skb).portid,
859 seq, data, nlmsg_len(nlh)); 905 seq, data, nlmsg_len(nlh));
860 break; 906 break;
907 case AUDIT_LIST_RULES:
908 err = audit_list_rules_send(NETLINK_CB(skb).portid, seq);
909 break;
861 case AUDIT_TRIM: 910 case AUDIT_TRIM:
862 audit_trim_trees(); 911 audit_trim_trees();
863 audit_log_common_recv_msg(&ab, AUDIT_CONFIG_CHANGE); 912 audit_log_common_recv_msg(&ab, AUDIT_CONFIG_CHANGE);
@@ -939,20 +988,33 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
939 break; 988 break;
940 } 989 }
941 case AUDIT_TTY_SET: { 990 case AUDIT_TTY_SET: {
942 struct audit_tty_status s; 991 struct audit_tty_status s, old;
943 struct task_struct *tsk = current; 992 struct task_struct *tsk = current;
993 struct audit_buffer *ab;
944 994
945 memset(&s, 0, sizeof(s)); 995 memset(&s, 0, sizeof(s));
946 /* guard against past and future API changes */ 996 /* guard against past and future API changes */
947 memcpy(&s, data, min_t(size_t, sizeof(s), nlmsg_len(nlh))); 997 memcpy(&s, data, min_t(size_t, sizeof(s), nlmsg_len(nlh)));
998 /* check if new data is valid */
948 if ((s.enabled != 0 && s.enabled != 1) || 999 if ((s.enabled != 0 && s.enabled != 1) ||
949 (s.log_passwd != 0 && s.log_passwd != 1)) 1000 (s.log_passwd != 0 && s.log_passwd != 1))
950 return -EINVAL; 1001 err = -EINVAL;
951 1002
952 spin_lock(&tsk->sighand->siglock); 1003 spin_lock(&tsk->sighand->siglock);
953 tsk->signal->audit_tty = s.enabled; 1004 old.enabled = tsk->signal->audit_tty;
954 tsk->signal->audit_tty_log_passwd = s.log_passwd; 1005 old.log_passwd = tsk->signal->audit_tty_log_passwd;
1006 if (!err) {
1007 tsk->signal->audit_tty = s.enabled;
1008 tsk->signal->audit_tty_log_passwd = s.log_passwd;
1009 }
955 spin_unlock(&tsk->sighand->siglock); 1010 spin_unlock(&tsk->sighand->siglock);
1011
1012 audit_log_common_recv_msg(&ab, AUDIT_CONFIG_CHANGE);
1013 audit_log_format(ab, " op=tty_set old-enabled=%d new-enabled=%d"
1014 " old-log_passwd=%d new-log_passwd=%d res=%d",
1015 old.enabled, s.enabled, old.log_passwd,
1016 s.log_passwd, !err);
1017 audit_log_end(ab);
956 break; 1018 break;
957 } 1019 }
958 default: 1020 default:
@@ -998,24 +1060,55 @@ static void audit_receive(struct sk_buff *skb)
998 mutex_unlock(&audit_cmd_mutex); 1060 mutex_unlock(&audit_cmd_mutex);
999} 1061}
1000 1062
1001/* Initialize audit support at boot time. */ 1063static int __net_init audit_net_init(struct net *net)
1002static int __init audit_init(void)
1003{ 1064{
1004 int i;
1005 struct netlink_kernel_cfg cfg = { 1065 struct netlink_kernel_cfg cfg = {
1006 .input = audit_receive, 1066 .input = audit_receive,
1007 }; 1067 };
1008 1068
1069 struct audit_net *aunet = net_generic(net, audit_net_id);
1070
1071 aunet->nlsk = netlink_kernel_create(net, NETLINK_AUDIT, &cfg);
1072 if (aunet->nlsk == NULL) {
1073 audit_panic("cannot initialize netlink socket in namespace");
1074 return -ENOMEM;
1075 }
1076 aunet->nlsk->sk_sndtimeo = MAX_SCHEDULE_TIMEOUT;
1077 return 0;
1078}
1079
1080static void __net_exit audit_net_exit(struct net *net)
1081{
1082 struct audit_net *aunet = net_generic(net, audit_net_id);
1083 struct sock *sock = aunet->nlsk;
1084 if (sock == audit_sock) {
1085 audit_pid = 0;
1086 audit_sock = NULL;
1087 }
1088
1089 rcu_assign_pointer(aunet->nlsk, NULL);
1090 synchronize_net();
1091 netlink_kernel_release(sock);
1092}
1093
1094static struct pernet_operations audit_net_ops __net_initdata = {
1095 .init = audit_net_init,
1096 .exit = audit_net_exit,
1097 .id = &audit_net_id,
1098 .size = sizeof(struct audit_net),
1099};
1100
1101/* Initialize audit support at boot time. */
1102static int __init audit_init(void)
1103{
1104 int i;
1105
1009 if (audit_initialized == AUDIT_DISABLED) 1106 if (audit_initialized == AUDIT_DISABLED)
1010 return 0; 1107 return 0;
1011 1108
1012 printk(KERN_INFO "audit: initializing netlink socket (%s)\n", 1109 pr_info("initializing netlink subsys (%s)\n",
1013 audit_default ? "enabled" : "disabled"); 1110 audit_default ? "enabled" : "disabled");
1014 audit_sock = netlink_kernel_create(&init_net, NETLINK_AUDIT, &cfg); 1111 register_pernet_subsys(&audit_net_ops);
1015 if (!audit_sock)
1016 audit_panic("cannot initialize netlink socket");
1017 else
1018 audit_sock->sk_sndtimeo = MAX_SCHEDULE_TIMEOUT;
1019 1112
1020 skb_queue_head_init(&audit_skb_queue); 1113 skb_queue_head_init(&audit_skb_queue);
1021 skb_queue_head_init(&audit_skb_hold_queue); 1114 skb_queue_head_init(&audit_skb_hold_queue);
@@ -1039,22 +1132,32 @@ static int __init audit_enable(char *str)
1039 if (!audit_default) 1132 if (!audit_default)
1040 audit_initialized = AUDIT_DISABLED; 1133 audit_initialized = AUDIT_DISABLED;
1041 1134
1042 printk(KERN_INFO "audit: %s", audit_default ? "enabled" : "disabled"); 1135 pr_info("%s\n", audit_default ?
1136 "enabled (after initialization)" : "disabled (until reboot)");
1043 1137
1044 if (audit_initialized == AUDIT_INITIALIZED) { 1138 return 1;
1045 audit_enabled = audit_default; 1139}
1046 audit_ever_enabled |= !!audit_default; 1140__setup("audit=", audit_enable);
1047 } else if (audit_initialized == AUDIT_UNINITIALIZED) { 1141
1048 printk(" (after initialization)"); 1142/* Process kernel command-line parameter at boot time.
1049 } else { 1143 * audit_backlog_limit=<n> */
1050 printk(" (until reboot)"); 1144static int __init audit_backlog_limit_set(char *str)
1145{
1146 u32 audit_backlog_limit_arg;
1147
1148 pr_info("audit_backlog_limit: ");
1149 if (kstrtouint(str, 0, &audit_backlog_limit_arg)) {
1150 pr_cont("using default of %u, unable to parse %s\n",
1151 audit_backlog_limit, str);
1152 return 1;
1051 } 1153 }
1052 printk("\n"); 1154
1155 audit_backlog_limit = audit_backlog_limit_arg;
1156 pr_cont("%d\n", audit_backlog_limit);
1053 1157
1054 return 1; 1158 return 1;
1055} 1159}
1056 1160__setup("audit_backlog_limit=", audit_backlog_limit_set);
1057__setup("audit=", audit_enable);
1058 1161
1059static void audit_buffer_free(struct audit_buffer *ab) 1162static void audit_buffer_free(struct audit_buffer *ab)
1060{ 1163{
@@ -1165,18 +1268,20 @@ static inline void audit_get_stamp(struct audit_context *ctx,
1165/* 1268/*
1166 * Wait for auditd to drain the queue a little 1269 * Wait for auditd to drain the queue a little
1167 */ 1270 */
1168static void wait_for_auditd(unsigned long sleep_time) 1271static long wait_for_auditd(long sleep_time)
1169{ 1272{
1170 DECLARE_WAITQUEUE(wait, current); 1273 DECLARE_WAITQUEUE(wait, current);
1171 set_current_state(TASK_UNINTERRUPTIBLE); 1274 set_current_state(TASK_UNINTERRUPTIBLE);
1172 add_wait_queue(&audit_backlog_wait, &wait); 1275 add_wait_queue_exclusive(&audit_backlog_wait, &wait);
1173 1276
1174 if (audit_backlog_limit && 1277 if (audit_backlog_limit &&
1175 skb_queue_len(&audit_skb_queue) > audit_backlog_limit) 1278 skb_queue_len(&audit_skb_queue) > audit_backlog_limit)
1176 schedule_timeout(sleep_time); 1279 sleep_time = schedule_timeout(sleep_time);
1177 1280
1178 __set_current_state(TASK_RUNNING); 1281 __set_current_state(TASK_RUNNING);
1179 remove_wait_queue(&audit_backlog_wait, &wait); 1282 remove_wait_queue(&audit_backlog_wait, &wait);
1283
1284 return sleep_time;
1180} 1285}
1181 1286
1182/** 1287/**
@@ -1200,7 +1305,8 @@ struct audit_buffer *audit_log_start(struct audit_context *ctx, gfp_t gfp_mask,
1200 struct audit_buffer *ab = NULL; 1305 struct audit_buffer *ab = NULL;
1201 struct timespec t; 1306 struct timespec t;
1202 unsigned int uninitialized_var(serial); 1307 unsigned int uninitialized_var(serial);
1203 int reserve; 1308 int reserve = 5; /* Allow atomic callers to go up to five
1309 entries over the normal backlog limit */
1204 unsigned long timeout_start = jiffies; 1310 unsigned long timeout_start = jiffies;
1205 1311
1206 if (audit_initialized != AUDIT_INITIALIZED) 1312 if (audit_initialized != AUDIT_INITIALIZED)
@@ -1209,36 +1315,37 @@ struct audit_buffer *audit_log_start(struct audit_context *ctx, gfp_t gfp_mask,
1209 if (unlikely(audit_filter_type(type))) 1315 if (unlikely(audit_filter_type(type)))
1210 return NULL; 1316 return NULL;
1211 1317
1212 if (gfp_mask & __GFP_WAIT) 1318 if (gfp_mask & __GFP_WAIT) {
1213 reserve = 0; 1319 if (audit_pid && audit_pid == current->pid)
1214 else 1320 gfp_mask &= ~__GFP_WAIT;
1215 reserve = 5; /* Allow atomic callers to go up to five 1321 else
1216 entries over the normal backlog limit */ 1322 reserve = 0;
1323 }
1217 1324
1218 while (audit_backlog_limit 1325 while (audit_backlog_limit
1219 && skb_queue_len(&audit_skb_queue) > audit_backlog_limit + reserve) { 1326 && skb_queue_len(&audit_skb_queue) > audit_backlog_limit + reserve) {
1220 if (gfp_mask & __GFP_WAIT && audit_backlog_wait_time) { 1327 if (gfp_mask & __GFP_WAIT && audit_backlog_wait_time) {
1221 unsigned long sleep_time; 1328 long sleep_time;
1222 1329
1223 sleep_time = timeout_start + audit_backlog_wait_time - 1330 sleep_time = timeout_start + audit_backlog_wait_time - jiffies;
1224 jiffies; 1331 if (sleep_time > 0) {
1225 if ((long)sleep_time > 0) { 1332 sleep_time = wait_for_auditd(sleep_time);
1226 wait_for_auditd(sleep_time); 1333 if (sleep_time > 0)
1227 continue; 1334 continue;
1228 } 1335 }
1229 } 1336 }
1230 if (audit_rate_check() && printk_ratelimit()) 1337 if (audit_rate_check() && printk_ratelimit())
1231 printk(KERN_WARNING 1338 pr_warn("audit_backlog=%d > audit_backlog_limit=%d\n",
1232 "audit: audit_backlog=%d > " 1339 skb_queue_len(&audit_skb_queue),
1233 "audit_backlog_limit=%d\n", 1340 audit_backlog_limit);
1234 skb_queue_len(&audit_skb_queue),
1235 audit_backlog_limit);
1236 audit_log_lost("backlog limit exceeded"); 1341 audit_log_lost("backlog limit exceeded");
1237 audit_backlog_wait_time = audit_backlog_wait_overflow; 1342 audit_backlog_wait_time = audit_backlog_wait_overflow;
1238 wake_up(&audit_backlog_wait); 1343 wake_up(&audit_backlog_wait);
1239 return NULL; 1344 return NULL;
1240 } 1345 }
1241 1346
1347 audit_backlog_wait_time = AUDIT_BACKLOG_WAIT_TIME;
1348
1242 ab = audit_buffer_alloc(ctx, gfp_mask, type); 1349 ab = audit_buffer_alloc(ctx, gfp_mask, type);
1243 if (!ab) { 1350 if (!ab) {
1244 audit_log_lost("out of memory in audit_log_start"); 1351 audit_log_lost("out of memory in audit_log_start");
@@ -1356,7 +1463,6 @@ void audit_log_n_hex(struct audit_buffer *ab, const unsigned char *buf,
1356 int i, avail, new_len; 1463 int i, avail, new_len;
1357 unsigned char *ptr; 1464 unsigned char *ptr;
1358 struct sk_buff *skb; 1465 struct sk_buff *skb;
1359 static const unsigned char *hex = "0123456789ABCDEF";
1360 1466
1361 if (!ab) 1467 if (!ab)
1362 return; 1468 return;
@@ -1374,10 +1480,8 @@ void audit_log_n_hex(struct audit_buffer *ab, const unsigned char *buf,
1374 } 1480 }
1375 1481
1376 ptr = skb_tail_pointer(skb); 1482 ptr = skb_tail_pointer(skb);
1377 for (i=0; i<len; i++) { 1483 for (i = 0; i < len; i++)
1378 *ptr++ = hex[(buf[i] & 0xF0)>>4]; /* Upper nibble */ 1484 ptr = hex_byte_pack_upper(ptr, buf[i]);
1379 *ptr++ = hex[buf[i] & 0x0F]; /* Lower nibble */
1380 }
1381 *ptr = 0; 1485 *ptr = 0;
1382 skb_put(skb, len << 1); /* new string is twice the old string */ 1486 skb_put(skb, len << 1); /* new string is twice the old string */
1383} 1487}
@@ -1491,7 +1595,7 @@ void audit_log_d_path(struct audit_buffer *ab, const char *prefix,
1491 1595
1492void audit_log_session_info(struct audit_buffer *ab) 1596void audit_log_session_info(struct audit_buffer *ab)
1493{ 1597{
1494 u32 sessionid = audit_get_sessionid(current); 1598 unsigned int sessionid = audit_get_sessionid(current);
1495 uid_t auid = from_kuid(&init_user_ns, audit_get_loginuid(current)); 1599 uid_t auid = from_kuid(&init_user_ns, audit_get_loginuid(current));
1496 1600
1497 audit_log_format(ab, " auid=%u ses=%u", auid, sessionid); 1601 audit_log_format(ab, " auid=%u ses=%u", auid, sessionid);
@@ -1716,7 +1820,7 @@ void audit_log_task_info(struct audit_buffer *ab, struct task_struct *tsk)
1716 audit_log_format(ab, 1820 audit_log_format(ab,
1717 " ppid=%ld pid=%d auid=%u uid=%u gid=%u" 1821 " ppid=%ld pid=%d auid=%u uid=%u gid=%u"
1718 " euid=%u suid=%u fsuid=%u" 1822 " euid=%u suid=%u fsuid=%u"
1719 " egid=%u sgid=%u fsgid=%u ses=%u tty=%s", 1823 " egid=%u sgid=%u fsgid=%u tty=%s ses=%u",
1720 sys_getppid(), 1824 sys_getppid(),
1721 tsk->pid, 1825 tsk->pid,
1722 from_kuid(&init_user_ns, audit_get_loginuid(tsk)), 1826 from_kuid(&init_user_ns, audit_get_loginuid(tsk)),
@@ -1728,7 +1832,7 @@ void audit_log_task_info(struct audit_buffer *ab, struct task_struct *tsk)
1728 from_kgid(&init_user_ns, cred->egid), 1832 from_kgid(&init_user_ns, cred->egid),
1729 from_kgid(&init_user_ns, cred->sgid), 1833 from_kgid(&init_user_ns, cred->sgid),
1730 from_kgid(&init_user_ns, cred->fsgid), 1834 from_kgid(&init_user_ns, cred->fsgid),
1731 audit_get_sessionid(tsk), tty); 1835 tty, audit_get_sessionid(tsk));
1732 1836
1733 get_task_comm(name, tsk); 1837 get_task_comm(name, tsk);
1734 audit_log_format(ab, " comm="); 1838 audit_log_format(ab, " comm=");
@@ -1739,7 +1843,8 @@ void audit_log_task_info(struct audit_buffer *ab, struct task_struct *tsk)
1739 if (mm->exe_file) 1843 if (mm->exe_file)
1740 audit_log_d_path(ab, " exe=", &mm->exe_file->f_path); 1844 audit_log_d_path(ab, " exe=", &mm->exe_file->f_path);
1741 up_read(&mm->mmap_sem); 1845 up_read(&mm->mmap_sem);
1742 } 1846 } else
1847 audit_log_format(ab, " exe=(null)");
1743 audit_log_task_context(ab); 1848 audit_log_task_context(ab);
1744} 1849}
1745EXPORT_SYMBOL(audit_log_task_info); 1850EXPORT_SYMBOL(audit_log_task_info);
diff --git a/kernel/audit.h b/kernel/audit.h
index b779642b29af..57cc64d67718 100644
--- a/kernel/audit.h
+++ b/kernel/audit.h
@@ -209,7 +209,7 @@ struct audit_context {
209#endif 209#endif
210}; 210};
211 211
212extern int audit_ever_enabled; 212extern u32 audit_ever_enabled;
213 213
214extern void audit_copy_inode(struct audit_names *name, 214extern void audit_copy_inode(struct audit_names *name,
215 const struct dentry *dentry, 215 const struct dentry *dentry,
@@ -240,18 +240,23 @@ extern int audit_uid_comparator(kuid_t left, u32 op, kuid_t right);
240extern int audit_gid_comparator(kgid_t left, u32 op, kgid_t right); 240extern int audit_gid_comparator(kgid_t left, u32 op, kgid_t right);
241extern int parent_len(const char *path); 241extern int parent_len(const char *path);
242extern int audit_compare_dname_path(const char *dname, const char *path, int plen); 242extern int audit_compare_dname_path(const char *dname, const char *path, int plen);
243extern struct sk_buff * audit_make_reply(int pid, int seq, int type, 243extern struct sk_buff *audit_make_reply(__u32 portid, int seq, int type,
244 int done, int multi, 244 int done, int multi,
245 const void *payload, int size); 245 const void *payload, int size);
246extern void audit_panic(const char *message); 246extern void audit_panic(const char *message);
247 247
248struct audit_netlink_list { 248struct audit_netlink_list {
249 int pid; 249 __u32 portid;
250 pid_t pid;
250 struct sk_buff_head q; 251 struct sk_buff_head q;
251}; 252};
252 253
253int audit_send_list(void *); 254int audit_send_list(void *);
254 255
256struct audit_net {
257 struct sock *nlsk;
258};
259
255extern int selinux_audit_rule_update(void); 260extern int selinux_audit_rule_update(void);
256 261
257extern struct mutex audit_filter_mutex; 262extern struct mutex audit_filter_mutex;
diff --git a/kernel/audit_tree.c b/kernel/audit_tree.c
index 43c307dc9453..67ccf0e7cca9 100644
--- a/kernel/audit_tree.c
+++ b/kernel/audit_tree.c
@@ -912,12 +912,13 @@ static void evict_chunk(struct audit_chunk *chunk)
912} 912}
913 913
914static int audit_tree_handle_event(struct fsnotify_group *group, 914static int audit_tree_handle_event(struct fsnotify_group *group,
915 struct inode *to_tell,
915 struct fsnotify_mark *inode_mark, 916 struct fsnotify_mark *inode_mark,
916 struct fsnotify_mark *vfsmonut_mark, 917 struct fsnotify_mark *vfsmount_mark,
917 struct fsnotify_event *event) 918 u32 mask, void *data, int data_type,
919 const unsigned char *file_name)
918{ 920{
919 BUG(); 921 return 0;
920 return -EOPNOTSUPP;
921} 922}
922 923
923static void audit_tree_freeing_mark(struct fsnotify_mark *entry, struct fsnotify_group *group) 924static void audit_tree_freeing_mark(struct fsnotify_mark *entry, struct fsnotify_group *group)
@@ -933,19 +934,8 @@ static void audit_tree_freeing_mark(struct fsnotify_mark *entry, struct fsnotify
933 BUG_ON(atomic_read(&entry->refcnt) < 1); 934 BUG_ON(atomic_read(&entry->refcnt) < 1);
934} 935}
935 936
936static bool audit_tree_send_event(struct fsnotify_group *group, struct inode *inode,
937 struct fsnotify_mark *inode_mark,
938 struct fsnotify_mark *vfsmount_mark,
939 __u32 mask, void *data, int data_type)
940{
941 return false;
942}
943
944static const struct fsnotify_ops audit_tree_ops = { 937static const struct fsnotify_ops audit_tree_ops = {
945 .handle_event = audit_tree_handle_event, 938 .handle_event = audit_tree_handle_event,
946 .should_send_event = audit_tree_send_event,
947 .free_group_priv = NULL,
948 .free_event_priv = NULL,
949 .freeing_mark = audit_tree_freeing_mark, 939 .freeing_mark = audit_tree_freeing_mark,
950}; 940};
951 941
diff --git a/kernel/audit_watch.c b/kernel/audit_watch.c
index 22831c4d369c..2596fac5dcb4 100644
--- a/kernel/audit_watch.c
+++ b/kernel/audit_watch.c
@@ -465,35 +465,27 @@ void audit_remove_watch_rule(struct audit_krule *krule)
465 } 465 }
466} 466}
467 467
468static bool audit_watch_should_send_event(struct fsnotify_group *group, struct inode *inode,
469 struct fsnotify_mark *inode_mark,
470 struct fsnotify_mark *vfsmount_mark,
471 __u32 mask, void *data, int data_type)
472{
473 return true;
474}
475
476/* Update watch data in audit rules based on fsnotify events. */ 468/* Update watch data in audit rules based on fsnotify events. */
477static int audit_watch_handle_event(struct fsnotify_group *group, 469static int audit_watch_handle_event(struct fsnotify_group *group,
470 struct inode *to_tell,
478 struct fsnotify_mark *inode_mark, 471 struct fsnotify_mark *inode_mark,
479 struct fsnotify_mark *vfsmount_mark, 472 struct fsnotify_mark *vfsmount_mark,
480 struct fsnotify_event *event) 473 u32 mask, void *data, int data_type,
474 const unsigned char *dname)
481{ 475{
482 struct inode *inode; 476 struct inode *inode;
483 __u32 mask = event->mask;
484 const char *dname = event->file_name;
485 struct audit_parent *parent; 477 struct audit_parent *parent;
486 478
487 parent = container_of(inode_mark, struct audit_parent, mark); 479 parent = container_of(inode_mark, struct audit_parent, mark);
488 480
489 BUG_ON(group != audit_watch_group); 481 BUG_ON(group != audit_watch_group);
490 482
491 switch (event->data_type) { 483 switch (data_type) {
492 case (FSNOTIFY_EVENT_PATH): 484 case (FSNOTIFY_EVENT_PATH):
493 inode = event->path.dentry->d_inode; 485 inode = ((struct path *)data)->dentry->d_inode;
494 break; 486 break;
495 case (FSNOTIFY_EVENT_INODE): 487 case (FSNOTIFY_EVENT_INODE):
496 inode = event->inode; 488 inode = (struct inode *)data;
497 break; 489 break;
498 default: 490 default:
499 BUG(); 491 BUG();
@@ -512,11 +504,7 @@ static int audit_watch_handle_event(struct fsnotify_group *group,
512} 504}
513 505
514static const struct fsnotify_ops audit_watch_fsnotify_ops = { 506static const struct fsnotify_ops audit_watch_fsnotify_ops = {
515 .should_send_event = audit_watch_should_send_event,
516 .handle_event = audit_watch_handle_event, 507 .handle_event = audit_watch_handle_event,
517 .free_group_priv = NULL,
518 .freeing_mark = NULL,
519 .free_event_priv = NULL,
520}; 508};
521 509
522static int __init audit_watch_init(void) 510static int __init audit_watch_init(void)
diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c
index 51f3fd4c1ed3..14a78cca384e 100644
--- a/kernel/auditfilter.c
+++ b/kernel/auditfilter.c
@@ -972,7 +972,7 @@ out:
972} 972}
973 973
974/* List rules using struct audit_rule_data. */ 974/* List rules using struct audit_rule_data. */
975static void audit_list_rules(int pid, int seq, struct sk_buff_head *q) 975static void audit_list_rules(__u32 portid, int seq, struct sk_buff_head *q)
976{ 976{
977 struct sk_buff *skb; 977 struct sk_buff *skb;
978 struct audit_krule *r; 978 struct audit_krule *r;
@@ -987,14 +987,15 @@ static void audit_list_rules(int pid, int seq, struct sk_buff_head *q)
987 data = audit_krule_to_data(r); 987 data = audit_krule_to_data(r);
988 if (unlikely(!data)) 988 if (unlikely(!data))
989 break; 989 break;
990 skb = audit_make_reply(pid, seq, AUDIT_LIST_RULES, 0, 1, 990 skb = audit_make_reply(portid, seq, AUDIT_LIST_RULES,
991 data, sizeof(*data) + data->buflen); 991 0, 1, data,
992 sizeof(*data) + data->buflen);
992 if (skb) 993 if (skb)
993 skb_queue_tail(q, skb); 994 skb_queue_tail(q, skb);
994 kfree(data); 995 kfree(data);
995 } 996 }
996 } 997 }
997 skb = audit_make_reply(pid, seq, AUDIT_LIST_RULES, 1, 1, NULL, 0); 998 skb = audit_make_reply(portid, seq, AUDIT_LIST_RULES, 1, 1, NULL, 0);
998 if (skb) 999 if (skb)
999 skb_queue_tail(q, skb); 1000 skb_queue_tail(q, skb);
1000} 1001}
@@ -1004,7 +1005,7 @@ static void audit_log_rule_change(char *action, struct audit_krule *rule, int re
1004{ 1005{
1005 struct audit_buffer *ab; 1006 struct audit_buffer *ab;
1006 uid_t loginuid = from_kuid(&init_user_ns, audit_get_loginuid(current)); 1007 uid_t loginuid = from_kuid(&init_user_ns, audit_get_loginuid(current));
1007 u32 sessionid = audit_get_sessionid(current); 1008 unsigned int sessionid = audit_get_sessionid(current);
1008 1009
1009 if (!audit_enabled) 1010 if (!audit_enabled)
1010 return; 1011 return;
@@ -1022,45 +1023,20 @@ static void audit_log_rule_change(char *action, struct audit_krule *rule, int re
1022} 1023}
1023 1024
1024/** 1025/**
1025 * audit_receive_filter - apply all rules to the specified message type 1026 * audit_rule_change - apply all rules to the specified message type
1026 * @type: audit message type 1027 * @type: audit message type
1027 * @pid: target pid for netlink audit messages 1028 * @portid: target port id for netlink audit messages
1028 * @seq: netlink audit message sequence (serial) number 1029 * @seq: netlink audit message sequence (serial) number
1029 * @data: payload data 1030 * @data: payload data
1030 * @datasz: size of payload data 1031 * @datasz: size of payload data
1031 */ 1032 */
1032int audit_receive_filter(int type, int pid, int seq, void *data, size_t datasz) 1033int audit_rule_change(int type, __u32 portid, int seq, void *data,
1034 size_t datasz)
1033{ 1035{
1034 struct task_struct *tsk;
1035 struct audit_netlink_list *dest;
1036 int err = 0; 1036 int err = 0;
1037 struct audit_entry *entry; 1037 struct audit_entry *entry;
1038 1038
1039 switch (type) { 1039 switch (type) {
1040 case AUDIT_LIST_RULES:
1041 /* We can't just spew out the rules here because we might fill
1042 * the available socket buffer space and deadlock waiting for
1043 * auditctl to read from it... which isn't ever going to
1044 * happen if we're actually running in the context of auditctl
1045 * trying to _send_ the stuff */
1046
1047 dest = kmalloc(sizeof(struct audit_netlink_list), GFP_KERNEL);
1048 if (!dest)
1049 return -ENOMEM;
1050 dest->pid = pid;
1051 skb_queue_head_init(&dest->q);
1052
1053 mutex_lock(&audit_filter_mutex);
1054 audit_list_rules(pid, seq, &dest->q);
1055 mutex_unlock(&audit_filter_mutex);
1056
1057 tsk = kthread_run(audit_send_list, dest, "audit_send_list");
1058 if (IS_ERR(tsk)) {
1059 skb_queue_purge(&dest->q);
1060 kfree(dest);
1061 err = PTR_ERR(tsk);
1062 }
1063 break;
1064 case AUDIT_ADD_RULE: 1040 case AUDIT_ADD_RULE:
1065 entry = audit_data_to_entry(data, datasz); 1041 entry = audit_data_to_entry(data, datasz);
1066 if (IS_ERR(entry)) 1042 if (IS_ERR(entry))
@@ -1087,6 +1063,44 @@ int audit_receive_filter(int type, int pid, int seq, void *data, size_t datasz)
1087 return err; 1063 return err;
1088} 1064}
1089 1065
1066/**
1067 * audit_list_rules_send - list the audit rules
1068 * @portid: target portid for netlink audit messages
1069 * @seq: netlink audit message sequence (serial) number
1070 */
1071int audit_list_rules_send(__u32 portid, int seq)
1072{
1073 struct task_struct *tsk;
1074 struct audit_netlink_list *dest;
1075 int err = 0;
1076
1077 /* We can't just spew out the rules here because we might fill
1078 * the available socket buffer space and deadlock waiting for
1079 * auditctl to read from it... which isn't ever going to
1080 * happen if we're actually running in the context of auditctl
1081 * trying to _send_ the stuff */
1082
1083 dest = kmalloc(sizeof(struct audit_netlink_list), GFP_KERNEL);
1084 if (!dest)
1085 return -ENOMEM;
1086 dest->portid = portid;
1087 dest->pid = task_pid_vnr(current);
1088 skb_queue_head_init(&dest->q);
1089
1090 mutex_lock(&audit_filter_mutex);
1091 audit_list_rules(portid, seq, &dest->q);
1092 mutex_unlock(&audit_filter_mutex);
1093
1094 tsk = kthread_run(audit_send_list, dest, "audit_send_list");
1095 if (IS_ERR(tsk)) {
1096 skb_queue_purge(&dest->q);
1097 kfree(dest);
1098 err = PTR_ERR(tsk);
1099 }
1100
1101 return err;
1102}
1103
1090int audit_comparator(u32 left, u32 op, u32 right) 1104int audit_comparator(u32 left, u32 op, u32 right)
1091{ 1105{
1092 switch (op) { 1106 switch (op) {
@@ -1276,19 +1290,22 @@ int audit_filter_user(int type)
1276{ 1290{
1277 enum audit_state state = AUDIT_DISABLED; 1291 enum audit_state state = AUDIT_DISABLED;
1278 struct audit_entry *e; 1292 struct audit_entry *e;
1279 int ret = 1; 1293 int rc, ret;
1294
1295 ret = 1; /* Audit by default */
1280 1296
1281 rcu_read_lock(); 1297 rcu_read_lock();
1282 list_for_each_entry_rcu(e, &audit_filter_list[AUDIT_FILTER_USER], list) { 1298 list_for_each_entry_rcu(e, &audit_filter_list[AUDIT_FILTER_USER], list) {
1283 if (audit_filter_user_rules(&e->rule, type, &state)) { 1299 rc = audit_filter_user_rules(&e->rule, type, &state);
1284 if (state == AUDIT_DISABLED) 1300 if (rc) {
1301 if (rc > 0 && state == AUDIT_DISABLED)
1285 ret = 0; 1302 ret = 0;
1286 break; 1303 break;
1287 } 1304 }
1288 } 1305 }
1289 rcu_read_unlock(); 1306 rcu_read_unlock();
1290 1307
1291 return ret; /* Audit by default */ 1308 return ret;
1292} 1309}
1293 1310
1294int audit_filter_type(int type) 1311int audit_filter_type(int type)
diff --git a/kernel/auditsc.c b/kernel/auditsc.c
index 90594c9f7552..10176cd5956a 100644
--- a/kernel/auditsc.c
+++ b/kernel/auditsc.c
@@ -1969,18 +1969,24 @@ static void audit_log_set_loginuid(kuid_t koldloginuid, kuid_t kloginuid,
1969 int rc) 1969 int rc)
1970{ 1970{
1971 struct audit_buffer *ab; 1971 struct audit_buffer *ab;
1972 uid_t uid, ologinuid, nloginuid; 1972 uid_t uid, oldloginuid, loginuid;
1973
1974 if (!audit_enabled)
1975 return;
1973 1976
1974 uid = from_kuid(&init_user_ns, task_uid(current)); 1977 uid = from_kuid(&init_user_ns, task_uid(current));
1975 ologinuid = from_kuid(&init_user_ns, koldloginuid); 1978 oldloginuid = from_kuid(&init_user_ns, koldloginuid);
1976 nloginuid = from_kuid(&init_user_ns, kloginuid), 1979 loginuid = from_kuid(&init_user_ns, kloginuid),
1977 1980
1978 ab = audit_log_start(NULL, GFP_KERNEL, AUDIT_LOGIN); 1981 ab = audit_log_start(NULL, GFP_KERNEL, AUDIT_LOGIN);
1979 if (!ab) 1982 if (!ab)
1980 return; 1983 return;
1981 audit_log_format(ab, "pid=%d uid=%u old auid=%u new auid=%u old " 1984 audit_log_format(ab, "pid=%d uid=%u"
1982 "ses=%u new ses=%u res=%d", current->pid, uid, ologinuid, 1985 " old-auid=%u new-auid=%u old-ses=%u new-ses=%u"
1983 nloginuid, oldsessionid, sessionid, !rc); 1986 " res=%d",
1987 current->pid, uid,
1988 oldloginuid, loginuid, oldsessionid, sessionid,
1989 !rc);
1984 audit_log_end(ab); 1990 audit_log_end(ab);
1985} 1991}
1986 1992
@@ -2008,7 +2014,7 @@ int audit_set_loginuid(kuid_t loginuid)
2008 2014
2009 /* are we setting or clearing? */ 2015 /* are we setting or clearing? */
2010 if (uid_valid(loginuid)) 2016 if (uid_valid(loginuid))
2011 sessionid = atomic_inc_return(&session_id); 2017 sessionid = (unsigned int)atomic_inc_return(&session_id);
2012 2018
2013 task->sessionid = sessionid; 2019 task->sessionid = sessionid;
2014 task->loginuid = loginuid; 2020 task->loginuid = loginuid;
@@ -2321,18 +2327,16 @@ int __audit_log_bprm_fcaps(struct linux_binprm *bprm,
2321 2327
2322/** 2328/**
2323 * __audit_log_capset - store information about the arguments to the capset syscall 2329 * __audit_log_capset - store information about the arguments to the capset syscall
2324 * @pid: target pid of the capset call
2325 * @new: the new credentials 2330 * @new: the new credentials
2326 * @old: the old (current) credentials 2331 * @old: the old (current) credentials
2327 * 2332 *
2328 * Record the aguments userspace sent to sys_capset for later printing by the 2333 * Record the aguments userspace sent to sys_capset for later printing by the
2329 * audit system if applicable 2334 * audit system if applicable
2330 */ 2335 */
2331void __audit_log_capset(pid_t pid, 2336void __audit_log_capset(const struct cred *new, const struct cred *old)
2332 const struct cred *new, const struct cred *old)
2333{ 2337{
2334 struct audit_context *context = current->audit_context; 2338 struct audit_context *context = current->audit_context;
2335 context->capset.pid = pid; 2339 context->capset.pid = task_pid_nr(current);
2336 context->capset.cap.effective = new->cap_effective; 2340 context->capset.cap.effective = new->cap_effective;
2337 context->capset.cap.inheritable = new->cap_effective; 2341 context->capset.cap.inheritable = new->cap_effective;
2338 context->capset.cap.permitted = new->cap_permitted; 2342 context->capset.cap.permitted = new->cap_permitted;
@@ -2352,6 +2356,7 @@ static void audit_log_task(struct audit_buffer *ab)
2352 kuid_t auid, uid; 2356 kuid_t auid, uid;
2353 kgid_t gid; 2357 kgid_t gid;
2354 unsigned int sessionid; 2358 unsigned int sessionid;
2359 struct mm_struct *mm = current->mm;
2355 2360
2356 auid = audit_get_loginuid(current); 2361 auid = audit_get_loginuid(current);
2357 sessionid = audit_get_sessionid(current); 2362 sessionid = audit_get_sessionid(current);
@@ -2365,15 +2370,15 @@ static void audit_log_task(struct audit_buffer *ab)
2365 audit_log_task_context(ab); 2370 audit_log_task_context(ab);
2366 audit_log_format(ab, " pid=%d comm=", current->pid); 2371 audit_log_format(ab, " pid=%d comm=", current->pid);
2367 audit_log_untrustedstring(ab, current->comm); 2372 audit_log_untrustedstring(ab, current->comm);
2373 if (mm) {
2374 down_read(&mm->mmap_sem);
2375 if (mm->exe_file)
2376 audit_log_d_path(ab, " exe=", &mm->exe_file->f_path);
2377 up_read(&mm->mmap_sem);
2378 } else
2379 audit_log_format(ab, " exe=(null)");
2368} 2380}
2369 2381
2370static void audit_log_abend(struct audit_buffer *ab, char *reason, long signr)
2371{
2372 audit_log_task(ab);
2373 audit_log_format(ab, " reason=");
2374 audit_log_string(ab, reason);
2375 audit_log_format(ab, " sig=%ld", signr);
2376}
2377/** 2382/**
2378 * audit_core_dumps - record information about processes that end abnormally 2383 * audit_core_dumps - record information about processes that end abnormally
2379 * @signr: signal value 2384 * @signr: signal value
@@ -2394,7 +2399,8 @@ void audit_core_dumps(long signr)
2394 ab = audit_log_start(NULL, GFP_KERNEL, AUDIT_ANOM_ABEND); 2399 ab = audit_log_start(NULL, GFP_KERNEL, AUDIT_ANOM_ABEND);
2395 if (unlikely(!ab)) 2400 if (unlikely(!ab))
2396 return; 2401 return;
2397 audit_log_abend(ab, "memory violation", signr); 2402 audit_log_task(ab);
2403 audit_log_format(ab, " sig=%ld", signr);
2398 audit_log_end(ab); 2404 audit_log_end(ab);
2399} 2405}
2400 2406
diff --git a/kernel/bounds.c b/kernel/bounds.c
index 5253204afdca..9fd4246b04b8 100644
--- a/kernel/bounds.c
+++ b/kernel/bounds.c
@@ -22,6 +22,6 @@ void foo(void)
22#ifdef CONFIG_SMP 22#ifdef CONFIG_SMP
23 DEFINE(NR_CPUS_BITS, ilog2(CONFIG_NR_CPUS)); 23 DEFINE(NR_CPUS_BITS, ilog2(CONFIG_NR_CPUS));
24#endif 24#endif
25 DEFINE(BLOATED_SPINLOCKS, sizeof(spinlock_t) > sizeof(int)); 25 DEFINE(SPINLOCK_SIZE, sizeof(spinlock_t));
26 /* End of constants */ 26 /* End of constants */
27} 27}
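bounds.c relies on the asm-offsets trick: each DEFINE() plants a marker string in the generated assembly, and the build turns those markers into #define lines in a header, which is how SPINLOCK_SIZE becomes a plain constant. A rough userspace approximation of the mechanism, assuming GCC/Clang extended asm (the real macro lives in include/linux/kbuild.h):

/* Compile with:  cc -S bounds_demo.c -o - | grep -F -- '->'
 * Each DEFINE() emits "->NAME <value> <expr>" into the assembly output;
 * a build script then rewrites such lines into "#define NAME <value>".
 */
#define DEFINE(sym, val) \
        asm volatile("\n.ascii \"->" #sym " %0 " #val "\"" : : "i" (val))

struct demo_lock { int owner; int waiters; };

void foo(void)
{
        DEFINE(DEMO_LOCK_SIZE, sizeof(struct demo_lock));
}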
diff --git a/kernel/capability.c b/kernel/capability.c
index 4e66bf9275b0..34019c57888d 100644
--- a/kernel/capability.c
+++ b/kernel/capability.c
@@ -277,7 +277,7 @@ SYSCALL_DEFINE2(capset, cap_user_header_t, header, const cap_user_data_t, data)
277 if (ret < 0) 277 if (ret < 0)
278 goto error; 278 goto error;
279 279
280 audit_log_capset(pid, new, current_cred()); 280 audit_log_capset(new, current_cred());
281 281
282 return commit_creds(new); 282 return commit_creds(new);
283 283
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 4c62513fe19f..e2f46ba37f72 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -41,7 +41,6 @@
41#include <linux/rcupdate.h> 41#include <linux/rcupdate.h>
42#include <linux/sched.h> 42#include <linux/sched.h>
43#include <linux/backing-dev.h> 43#include <linux/backing-dev.h>
44#include <linux/seq_file.h>
45#include <linux/slab.h> 44#include <linux/slab.h>
46#include <linux/magic.h> 45#include <linux/magic.h>
47#include <linux/spinlock.h> 46#include <linux/spinlock.h>
@@ -56,15 +55,20 @@
56#include <linux/pid_namespace.h> 55#include <linux/pid_namespace.h>
57#include <linux/idr.h> 56#include <linux/idr.h>
58#include <linux/vmalloc.h> /* TODO: replace with more sophisticated array */ 57#include <linux/vmalloc.h> /* TODO: replace with more sophisticated array */
59#include <linux/eventfd.h>
60#include <linux/poll.h>
61#include <linux/flex_array.h> /* used in cgroup_attach_task */ 58#include <linux/flex_array.h> /* used in cgroup_attach_task */
62#include <linux/kthread.h> 59#include <linux/kthread.h>
63#include <linux/file.h>
64 60
65#include <linux/atomic.h> 61#include <linux/atomic.h>
66 62
67/* 63/*
64 * pidlists linger the following amount before being destroyed. The goal
65 * is avoiding frequent destruction in the middle of consecutive read calls
66 * Expiring in the middle is a performance problem not a correctness one.
67 * 1 sec should be enough.
68 */
69#define CGROUP_PIDLIST_DESTROY_DELAY HZ
70
71/*
68 * cgroup_mutex is the master lock. Any modification to cgroup or its 72 * cgroup_mutex is the master lock. Any modification to cgroup or its
69 * hierarchy must be performed while holding it. 73 * hierarchy must be performed while holding it.
70 * 74 *
@@ -89,6 +93,33 @@ static DEFINE_MUTEX(cgroup_mutex);
89 93
90static DEFINE_MUTEX(cgroup_root_mutex); 94static DEFINE_MUTEX(cgroup_root_mutex);
91 95
96#define cgroup_assert_mutex_or_rcu_locked() \
97 rcu_lockdep_assert(rcu_read_lock_held() || \
98 lockdep_is_held(&cgroup_mutex), \
99 "cgroup_mutex or RCU read lock required");
100
101#ifdef CONFIG_LOCKDEP
102#define cgroup_assert_mutex_or_root_locked() \
103 WARN_ON_ONCE(debug_locks && (!lockdep_is_held(&cgroup_mutex) && \
104 !lockdep_is_held(&cgroup_root_mutex)))
105#else
106#define cgroup_assert_mutex_or_root_locked() do { } while (0)
107#endif
108
109/*
110 * cgroup destruction makes heavy use of work items and there can be a lot
111 * of concurrent destructions. Use a separate workqueue so that cgroup
112 * destruction work items don't end up filling up max_active of system_wq
113 * which may lead to deadlock.
114 */
115static struct workqueue_struct *cgroup_destroy_wq;
116
117/*
118 * pidlist destructions need to be flushed on cgroup destruction. Use a
119 * separate workqueue as flush domain.
120 */
121static struct workqueue_struct *cgroup_pidlist_destroy_wq;
122
92/* 123/*
93 * Generate an array of cgroup subsystem pointers. At boot time, this is 124 * Generate an array of cgroup subsystem pointers. At boot time, this is
94 * populated with the built in subsystems, and modular subsystems are 125 * populated with the built in subsystems, and modular subsystems are
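The hunk above dedicates separate workqueues to cgroup and pidlist destruction so that a burst of teardown work cannot occupy all of system_wq's max_active slots and deadlock against other queued work. A minimal module-style sketch of that pattern, assuming the standard workqueue API of this kernel generation (alloc_workqueue/queue_work/destroy_workqueue); every name below is a placeholder:

#include <linux/module.h>
#include <linux/workqueue.h>

static struct workqueue_struct *demo_destroy_wq;

static void demo_destroy_fn(struct work_struct *work)
{
        /* potentially slow teardown runs here without tying up system_wq */
}

static DECLARE_WORK(demo_destroy_work, demo_destroy_fn);

static int __init demo_init(void)
{
        /* max_active = 1: destruction items are serialized on their own
         * queue instead of competing for system_wq's shared limit. */
        demo_destroy_wq = alloc_workqueue("demo_destroy", 0, 1);
        if (!demo_destroy_wq)
                return -ENOMEM;
        queue_work(demo_destroy_wq, &demo_destroy_work);
        return 0;
}

static void __exit demo_exit(void)
{
        destroy_workqueue(demo_destroy_wq);     /* drains pending work first */
}

module_init(demo_init);
module_exit(demo_exit);
MODULE_LICENSE("GPL");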
@@ -111,49 +142,6 @@ static struct cgroupfs_root cgroup_dummy_root;
111/* dummy_top is a shorthand for the dummy hierarchy's top cgroup */ 142/* dummy_top is a shorthand for the dummy hierarchy's top cgroup */
112static struct cgroup * const cgroup_dummy_top = &cgroup_dummy_root.top_cgroup; 143static struct cgroup * const cgroup_dummy_top = &cgroup_dummy_root.top_cgroup;
113 144
114/*
115 * cgroupfs file entry, pointed to from leaf dentry->d_fsdata.
116 */
117struct cfent {
118 struct list_head node;
119 struct dentry *dentry;
120 struct cftype *type;
121 struct cgroup_subsys_state *css;
122
123 /* file xattrs */
124 struct simple_xattrs xattrs;
125};
126
127/*
128 * cgroup_event represents events which userspace want to receive.
129 */
130struct cgroup_event {
131 /*
132 * css which the event belongs to.
133 */
134 struct cgroup_subsys_state *css;
135 /*
136 * Control file which the event associated.
137 */
138 struct cftype *cft;
139 /*
140 * eventfd to signal userspace about the event.
141 */
142 struct eventfd_ctx *eventfd;
143 /*
144 * Each of these stored in a list by the cgroup.
145 */
146 struct list_head list;
147 /*
148 * All fields below needed to unregister event when
149 * userspace closes eventfd.
150 */
151 poll_table pt;
152 wait_queue_head_t *wqh;
153 wait_queue_t wait;
154 struct work_struct remove;
155};
156
157/* The list of hierarchy roots */ 145/* The list of hierarchy roots */
158 146
159static LIST_HEAD(cgroup_roots); 147static LIST_HEAD(cgroup_roots);
@@ -191,6 +179,8 @@ static void cgroup_destroy_css_killed(struct cgroup *cgrp);
191static int cgroup_destroy_locked(struct cgroup *cgrp); 179static int cgroup_destroy_locked(struct cgroup *cgrp);
192static int cgroup_addrm_files(struct cgroup *cgrp, struct cftype cfts[], 180static int cgroup_addrm_files(struct cgroup *cgrp, struct cftype cfts[],
193 bool is_add); 181 bool is_add);
182static int cgroup_file_release(struct inode *inode, struct file *file);
183static void cgroup_pidlist_destroy_all(struct cgroup *cgrp);
194 184
195/** 185/**
196 * cgroup_css - obtain a cgroup's css for the specified subsystem 186 * cgroup_css - obtain a cgroup's css for the specified subsystem
@@ -253,16 +243,32 @@ static int notify_on_release(const struct cgroup *cgrp)
253} 243}
254 244
255/** 245/**
246 * for_each_css - iterate all css's of a cgroup
247 * @css: the iteration cursor
248 * @ssid: the index of the subsystem, CGROUP_SUBSYS_COUNT after reaching the end
249 * @cgrp: the target cgroup to iterate css's of
250 *
251 * Should be called under cgroup_mutex.
252 */
253#define for_each_css(css, ssid, cgrp) \
254 for ((ssid) = 0; (ssid) < CGROUP_SUBSYS_COUNT; (ssid)++) \
255 if (!((css) = rcu_dereference_check( \
256 (cgrp)->subsys[(ssid)], \
257 lockdep_is_held(&cgroup_mutex)))) { } \
258 else
259
260/**
256 * for_each_subsys - iterate all loaded cgroup subsystems 261 * for_each_subsys - iterate all loaded cgroup subsystems
257 * @ss: the iteration cursor 262 * @ss: the iteration cursor
258 * @i: the index of @ss, CGROUP_SUBSYS_COUNT after reaching the end 263 * @ssid: the index of @ss, CGROUP_SUBSYS_COUNT after reaching the end
259 * 264 *
260 * Should be called under cgroup_mutex. 265 * Iterates through all loaded subsystems. Should be called under
266 * cgroup_mutex or cgroup_root_mutex.
261 */ 267 */
262#define for_each_subsys(ss, i) \ 268#define for_each_subsys(ss, ssid) \
263 for ((i) = 0; (i) < CGROUP_SUBSYS_COUNT; (i)++) \ 269 for (({ cgroup_assert_mutex_or_root_locked(); (ssid) = 0; }); \
264 if (({ lockdep_assert_held(&cgroup_mutex); \ 270 (ssid) < CGROUP_SUBSYS_COUNT; (ssid)++) \
265 !((ss) = cgroup_subsys[i]); })) { } \ 271 if (!((ss) = cgroup_subsys[(ssid)])) { } \
266 else 272 else
267 273
268/** 274/**
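for_each_css() above replaces the old per-root subsystem walk with an iteration over the css's actually attached to one cgroup, with the rcu_dereference_check() keyed to cgroup_mutex. A hypothetical caller, kept deliberately trivial (assumes cgroup_mutex is held, as the macro's comment requires):

/* sketch only: not part of the patch */
static void demo_list_css(struct cgroup *cgrp)
{
        struct cgroup_subsys_state *css;
        int ssid;

        for_each_css(css, ssid, cgrp)
                pr_info("cgroup has a css for subsystem %s\n", css->ss->name);
}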
@@ -277,10 +283,6 @@ static int notify_on_release(const struct cgroup *cgrp)
277 for ((i) = 0; (i) < CGROUP_BUILTIN_SUBSYS_COUNT && \ 283 for ((i) = 0; (i) < CGROUP_BUILTIN_SUBSYS_COUNT && \
278 (((ss) = cgroup_subsys[i]) || true); (i)++) 284 (((ss) = cgroup_subsys[i]) || true); (i)++)
279 285
280/* iterate each subsystem attached to a hierarchy */
281#define for_each_root_subsys(root, ss) \
282 list_for_each_entry((ss), &(root)->subsys_list, sibling)
283
284/* iterate across the active hierarchies */ 286/* iterate across the active hierarchies */
285#define for_each_active_root(root) \ 287#define for_each_active_root(root) \
286 list_for_each_entry((root), &cgroup_roots, root_list) 288 list_for_each_entry((root), &cgroup_roots, root_list)
@@ -854,11 +856,7 @@ static void cgroup_free_fn(struct work_struct *work)
854 */ 856 */
855 deactivate_super(cgrp->root->sb); 857 deactivate_super(cgrp->root->sb);
856 858
857 /* 859 cgroup_pidlist_destroy_all(cgrp);
858 * if we're getting rid of the cgroup, refcount should ensure
859 * that there are no pidlists left.
860 */
861 BUG_ON(!list_empty(&cgrp->pidlists));
862 860
863 simple_xattrs_free(&cgrp->xattrs); 861 simple_xattrs_free(&cgrp->xattrs);
864 862
@@ -871,7 +869,7 @@ static void cgroup_free_rcu(struct rcu_head *head)
871 struct cgroup *cgrp = container_of(head, struct cgroup, rcu_head); 869 struct cgroup *cgrp = container_of(head, struct cgroup, rcu_head);
872 870
873 INIT_WORK(&cgrp->destroy_work, cgroup_free_fn); 871 INIT_WORK(&cgrp->destroy_work, cgroup_free_fn);
874 schedule_work(&cgrp->destroy_work); 872 queue_work(cgroup_destroy_wq, &cgrp->destroy_work);
875} 873}
876 874
877static void cgroup_diput(struct dentry *dentry, struct inode *inode) 875static void cgroup_diput(struct dentry *dentry, struct inode *inode)
@@ -881,6 +879,16 @@ static void cgroup_diput(struct dentry *dentry, struct inode *inode)
881 struct cgroup *cgrp = dentry->d_fsdata; 879 struct cgroup *cgrp = dentry->d_fsdata;
882 880
883 BUG_ON(!(cgroup_is_dead(cgrp))); 881 BUG_ON(!(cgroup_is_dead(cgrp)));
882
883 /*
884 * XXX: cgrp->id is only used to look up css's. As cgroup
885 * and css's lifetimes will be decoupled, it should be made
886 * per-subsystem and moved to css->id so that lookups are
887 * successful until the target css is released.
888 */
889 idr_remove(&cgrp->root->cgroup_idr, cgrp->id);
890 cgrp->id = -1;
891
884 call_rcu(&cgrp->rcu_head, cgroup_free_rcu); 892 call_rcu(&cgrp->rcu_head, cgroup_free_rcu);
885 } else { 893 } else {
886 struct cfent *cfe = __d_cfe(dentry); 894 struct cfent *cfe = __d_cfe(dentry);
@@ -1031,7 +1039,6 @@ static int rebind_subsystems(struct cgroupfs_root *root,
1031 cgroup_css(cgroup_dummy_top, ss)); 1039 cgroup_css(cgroup_dummy_top, ss));
1032 cgroup_css(cgrp, ss)->cgroup = cgrp; 1040 cgroup_css(cgrp, ss)->cgroup = cgrp;
1033 1041
1034 list_move(&ss->sibling, &root->subsys_list);
1035 ss->root = root; 1042 ss->root = root;
1036 if (ss->bind) 1043 if (ss->bind)
1037 ss->bind(cgroup_css(cgrp, ss)); 1044 ss->bind(cgroup_css(cgrp, ss));
@@ -1050,7 +1057,6 @@ static int rebind_subsystems(struct cgroupfs_root *root,
1050 RCU_INIT_POINTER(cgrp->subsys[i], NULL); 1057 RCU_INIT_POINTER(cgrp->subsys[i], NULL);
1051 1058
1052 cgroup_subsys[i]->root = &cgroup_dummy_root; 1059 cgroup_subsys[i]->root = &cgroup_dummy_root;
1053 list_move(&ss->sibling, &cgroup_dummy_root.subsys_list);
1054 1060
1055 /* subsystem is now free - drop reference on module */ 1061 /* subsystem is now free - drop reference on module */
1056 module_put(ss->module); 1062 module_put(ss->module);
@@ -1077,10 +1083,12 @@ static int cgroup_show_options(struct seq_file *seq, struct dentry *dentry)
1077{ 1083{
1078 struct cgroupfs_root *root = dentry->d_sb->s_fs_info; 1084 struct cgroupfs_root *root = dentry->d_sb->s_fs_info;
1079 struct cgroup_subsys *ss; 1085 struct cgroup_subsys *ss;
1086 int ssid;
1080 1087
1081 mutex_lock(&cgroup_root_mutex); 1088 mutex_lock(&cgroup_root_mutex);
1082 for_each_root_subsys(root, ss) 1089 for_each_subsys(ss, ssid)
1083 seq_printf(seq, ",%s", ss->name); 1090 if (root->subsys_mask & (1 << ssid))
1091 seq_printf(seq, ",%s", ss->name);
1084 if (root->flags & CGRP_ROOT_SANE_BEHAVIOR) 1092 if (root->flags & CGRP_ROOT_SANE_BEHAVIOR)
1085 seq_puts(seq, ",sane_behavior"); 1093 seq_puts(seq, ",sane_behavior");
1086 if (root->flags & CGRP_ROOT_NOPREFIX) 1094 if (root->flags & CGRP_ROOT_NOPREFIX)
@@ -1343,8 +1351,6 @@ static void init_cgroup_housekeeping(struct cgroup *cgrp)
1343 INIT_LIST_HEAD(&cgrp->pidlists); 1351 INIT_LIST_HEAD(&cgrp->pidlists);
1344 mutex_init(&cgrp->pidlist_mutex); 1352 mutex_init(&cgrp->pidlist_mutex);
1345 cgrp->dummy_css.cgroup = cgrp; 1353 cgrp->dummy_css.cgroup = cgrp;
1346 INIT_LIST_HEAD(&cgrp->event_list);
1347 spin_lock_init(&cgrp->event_list_lock);
1348 simple_xattrs_init(&cgrp->xattrs); 1354 simple_xattrs_init(&cgrp->xattrs);
1349} 1355}
1350 1356
@@ -1352,7 +1358,6 @@ static void init_cgroup_root(struct cgroupfs_root *root)
1352{ 1358{
1353 struct cgroup *cgrp = &root->top_cgroup; 1359 struct cgroup *cgrp = &root->top_cgroup;
1354 1360
1355 INIT_LIST_HEAD(&root->subsys_list);
1356 INIT_LIST_HEAD(&root->root_list); 1361 INIT_LIST_HEAD(&root->root_list);
1357 root->number_of_cgroups = 1; 1362 root->number_of_cgroups = 1;
1358 cgrp->root = root; 1363 cgrp->root = root;
@@ -1674,7 +1679,8 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type,
1674 return ERR_PTR(ret); 1679 return ERR_PTR(ret);
1675} 1680}
1676 1681
1677static void cgroup_kill_sb(struct super_block *sb) { 1682static void cgroup_kill_sb(struct super_block *sb)
1683{
1678 struct cgroupfs_root *root = sb->s_fs_info; 1684 struct cgroupfs_root *root = sb->s_fs_info;
1679 struct cgroup *cgrp = &root->top_cgroup; 1685 struct cgroup *cgrp = &root->top_cgroup;
1680 struct cgrp_cset_link *link, *tmp_link; 1686 struct cgrp_cset_link *link, *tmp_link;
@@ -1957,8 +1963,8 @@ static int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk,
1957 bool threadgroup) 1963 bool threadgroup)
1958{ 1964{
1959 int retval, i, group_size; 1965 int retval, i, group_size;
1960 struct cgroup_subsys *ss, *failed_ss = NULL;
1961 struct cgroupfs_root *root = cgrp->root; 1966 struct cgroupfs_root *root = cgrp->root;
1967 struct cgroup_subsys_state *css, *failed_css = NULL;
1962 /* threadgroup list cursor and array */ 1968 /* threadgroup list cursor and array */
1963 struct task_struct *leader = tsk; 1969 struct task_struct *leader = tsk;
1964 struct task_and_cgroup *tc; 1970 struct task_and_cgroup *tc;
@@ -2031,13 +2037,11 @@ static int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk,
2031 /* 2037 /*
2032 * step 1: check that we can legitimately attach to the cgroup. 2038 * step 1: check that we can legitimately attach to the cgroup.
2033 */ 2039 */
2034 for_each_root_subsys(root, ss) { 2040 for_each_css(css, i, cgrp) {
2035 struct cgroup_subsys_state *css = cgroup_css(cgrp, ss); 2041 if (css->ss->can_attach) {
2036 2042 retval = css->ss->can_attach(css, &tset);
2037 if (ss->can_attach) {
2038 retval = ss->can_attach(css, &tset);
2039 if (retval) { 2043 if (retval) {
2040 failed_ss = ss; 2044 failed_css = css;
2041 goto out_cancel_attach; 2045 goto out_cancel_attach;
2042 } 2046 }
2043 } 2047 }
@@ -2073,12 +2077,9 @@ static int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk,
2073 /* 2077 /*
2074 * step 4: do subsystem attach callbacks. 2078 * step 4: do subsystem attach callbacks.
2075 */ 2079 */
2076 for_each_root_subsys(root, ss) { 2080 for_each_css(css, i, cgrp)
2077 struct cgroup_subsys_state *css = cgroup_css(cgrp, ss); 2081 if (css->ss->attach)
2078 2082 css->ss->attach(css, &tset);
2079 if (ss->attach)
2080 ss->attach(css, &tset);
2081 }
2082 2083
2083 /* 2084 /*
2084 * step 5: success! and cleanup 2085 * step 5: success! and cleanup
@@ -2095,13 +2096,11 @@ out_put_css_set_refs:
2095 } 2096 }
2096out_cancel_attach: 2097out_cancel_attach:
2097 if (retval) { 2098 if (retval) {
2098 for_each_root_subsys(root, ss) { 2099 for_each_css(css, i, cgrp) {
2099 struct cgroup_subsys_state *css = cgroup_css(cgrp, ss); 2100 if (css == failed_css)
2100
2101 if (ss == failed_ss)
2102 break; 2101 break;
2103 if (ss->cancel_attach) 2102 if (css->ss->cancel_attach)
2104 ss->cancel_attach(css, &tset); 2103 css->ss->cancel_attach(css, &tset);
2105 } 2104 }
2106 } 2105 }
2107out_free_group_list: 2106out_free_group_list:
@@ -2129,7 +2128,7 @@ retry_find_task:
2129 tsk = find_task_by_vpid(pid); 2128 tsk = find_task_by_vpid(pid);
2130 if (!tsk) { 2129 if (!tsk) {
2131 rcu_read_unlock(); 2130 rcu_read_unlock();
2132 ret= -ESRCH; 2131 ret = -ESRCH;
2133 goto out_unlock_cgroup; 2132 goto out_unlock_cgroup;
2134 } 2133 }
2135 /* 2134 /*
@@ -2241,10 +2240,9 @@ static int cgroup_release_agent_write(struct cgroup_subsys_state *css,
2241 return 0; 2240 return 0;
2242} 2241}
2243 2242
2244static int cgroup_release_agent_show(struct cgroup_subsys_state *css, 2243static int cgroup_release_agent_show(struct seq_file *seq, void *v)
2245 struct cftype *cft, struct seq_file *seq)
2246{ 2244{
2247 struct cgroup *cgrp = css->cgroup; 2245 struct cgroup *cgrp = seq_css(seq)->cgroup;
2248 2246
2249 if (!cgroup_lock_live_group(cgrp)) 2247 if (!cgroup_lock_live_group(cgrp))
2250 return -ENODEV; 2248 return -ENODEV;
@@ -2254,174 +2252,129 @@ static int cgroup_release_agent_show(struct cgroup_subsys_state *css,
2254 return 0; 2252 return 0;
2255} 2253}
2256 2254
2257static int cgroup_sane_behavior_show(struct cgroup_subsys_state *css, 2255static int cgroup_sane_behavior_show(struct seq_file *seq, void *v)
2258 struct cftype *cft, struct seq_file *seq)
2259{ 2256{
2260 seq_printf(seq, "%d\n", cgroup_sane_behavior(css->cgroup)); 2257 struct cgroup *cgrp = seq_css(seq)->cgroup;
2258
2259 seq_printf(seq, "%d\n", cgroup_sane_behavior(cgrp));
2261 return 0; 2260 return 0;
2262} 2261}
2263 2262
2264/* A buffer size big enough for numbers or short strings */ 2263/* A buffer size big enough for numbers or short strings */
2265#define CGROUP_LOCAL_BUFFER_SIZE 64 2264#define CGROUP_LOCAL_BUFFER_SIZE 64
2266 2265
2267static ssize_t cgroup_write_X64(struct cgroup_subsys_state *css, 2266static ssize_t cgroup_file_write(struct file *file, const char __user *userbuf,
2268 struct cftype *cft, struct file *file, 2267 size_t nbytes, loff_t *ppos)
2269 const char __user *userbuf, size_t nbytes,
2270 loff_t *unused_ppos)
2271{ 2268{
2272 char buffer[CGROUP_LOCAL_BUFFER_SIZE]; 2269 struct cfent *cfe = __d_cfe(file->f_dentry);
2273 int retval = 0; 2270 struct cftype *cft = __d_cft(file->f_dentry);
2274 char *end; 2271 struct cgroup_subsys_state *css = cfe->css;
2272 size_t max_bytes = cft->max_write_len ?: CGROUP_LOCAL_BUFFER_SIZE - 1;
2273 char *buf;
2274 int ret;
2275 2275
2276 if (!nbytes) 2276 if (nbytes >= max_bytes)
2277 return -EINVAL;
2278 if (nbytes >= sizeof(buffer))
2279 return -E2BIG; 2277 return -E2BIG;
2280 if (copy_from_user(buffer, userbuf, nbytes))
2281 return -EFAULT;
2282 2278
2283 buffer[nbytes] = 0; /* nul-terminate */ 2279 buf = kmalloc(nbytes + 1, GFP_KERNEL);
2284 if (cft->write_u64) { 2280 if (!buf)
2285 u64 val = simple_strtoull(strstrip(buffer), &end, 0); 2281 return -ENOMEM;
2286 if (*end) 2282
2287 return -EINVAL; 2283 if (copy_from_user(buf, userbuf, nbytes)) {
2288 retval = cft->write_u64(css, cft, val); 2284 ret = -EFAULT;
2285 goto out_free;
2286 }
2287
2288 buf[nbytes] = '\0';
2289
2290 if (cft->write_string) {
2291 ret = cft->write_string(css, cft, strstrip(buf));
2292 } else if (cft->write_u64) {
2293 unsigned long long v;
2294 ret = kstrtoull(buf, 0, &v);
2295 if (!ret)
2296 ret = cft->write_u64(css, cft, v);
2297 } else if (cft->write_s64) {
2298 long long v;
2299 ret = kstrtoll(buf, 0, &v);
2300 if (!ret)
2301 ret = cft->write_s64(css, cft, v);
2302 } else if (cft->trigger) {
2303 ret = cft->trigger(css, (unsigned int)cft->private);
2289 } else { 2304 } else {
2290 s64 val = simple_strtoll(strstrip(buffer), &end, 0); 2305 ret = -EINVAL;
2291 if (*end)
2292 return -EINVAL;
2293 retval = cft->write_s64(css, cft, val);
2294 } 2306 }
2295 if (!retval) 2307out_free:
2296 retval = nbytes; 2308 kfree(buf);
2297 return retval; 2309 return ret ?: nbytes;
2298} 2310}
2299 2311
2300static ssize_t cgroup_write_string(struct cgroup_subsys_state *css, 2312/*
2301 struct cftype *cft, struct file *file, 2313 * seqfile ops/methods for returning structured data. Currently just
2302 const char __user *userbuf, size_t nbytes, 2314 * supports string->u64 maps, but can be extended in future.
2303 loff_t *unused_ppos) 2315 */
2316
2317static void *cgroup_seqfile_start(struct seq_file *seq, loff_t *ppos)
2304{ 2318{
2305 char local_buffer[CGROUP_LOCAL_BUFFER_SIZE]; 2319 struct cftype *cft = seq_cft(seq);
2306 int retval = 0;
2307 size_t max_bytes = cft->max_write_len;
2308 char *buffer = local_buffer;
2309 2320
2310 if (!max_bytes) 2321 if (cft->seq_start) {
2311 max_bytes = sizeof(local_buffer) - 1; 2322 return cft->seq_start(seq, ppos);
2312 if (nbytes >= max_bytes) 2323 } else {
2313 return -E2BIG; 2324 /*
2314 /* Allocate a dynamic buffer if we need one */ 2325 * The same behavior and code as single_open(). Returns
2315 if (nbytes >= sizeof(local_buffer)) { 2326 * !NULL if pos is at the beginning; otherwise, NULL.
2316 buffer = kmalloc(nbytes + 1, GFP_KERNEL); 2327 */
2317 if (buffer == NULL) 2328 return NULL + !*ppos;
2318 return -ENOMEM;
2319 }
2320 if (nbytes && copy_from_user(buffer, userbuf, nbytes)) {
2321 retval = -EFAULT;
2322 goto out;
2323 } 2329 }
2324
2325 buffer[nbytes] = 0; /* nul-terminate */
2326 retval = cft->write_string(css, cft, strstrip(buffer));
2327 if (!retval)
2328 retval = nbytes;
2329out:
2330 if (buffer != local_buffer)
2331 kfree(buffer);
2332 return retval;
2333} 2330}
2334 2331
2335static ssize_t cgroup_file_write(struct file *file, const char __user *buf, 2332static void *cgroup_seqfile_next(struct seq_file *seq, void *v, loff_t *ppos)
2336 size_t nbytes, loff_t *ppos)
2337{ 2333{
2338 struct cfent *cfe = __d_cfe(file->f_dentry); 2334 struct cftype *cft = seq_cft(seq);
2339 struct cftype *cft = __d_cft(file->f_dentry);
2340 struct cgroup_subsys_state *css = cfe->css;
2341 2335
2342 if (cft->write) 2336 if (cft->seq_next) {
2343 return cft->write(css, cft, file, buf, nbytes, ppos); 2337 return cft->seq_next(seq, v, ppos);
2344 if (cft->write_u64 || cft->write_s64) 2338 } else {
2345 return cgroup_write_X64(css, cft, file, buf, nbytes, ppos); 2339 /*
2346 if (cft->write_string) 2340 * The same behavior and code as single_open(), always
2347 return cgroup_write_string(css, cft, file, buf, nbytes, ppos); 2341 * terminate after the initial read.
2348 if (cft->trigger) { 2342 */
2349 int ret = cft->trigger(css, (unsigned int)cft->private); 2343 ++*ppos;
2350 return ret ? ret : nbytes; 2344 return NULL;
2351 } 2345 }
2352 return -EINVAL;
2353} 2346}
2354 2347
2355static ssize_t cgroup_read_u64(struct cgroup_subsys_state *css, 2348static void cgroup_seqfile_stop(struct seq_file *seq, void *v)
2356 struct cftype *cft, struct file *file,
2357 char __user *buf, size_t nbytes, loff_t *ppos)
2358{ 2349{
2359 char tmp[CGROUP_LOCAL_BUFFER_SIZE]; 2350 struct cftype *cft = seq_cft(seq);
2360 u64 val = cft->read_u64(css, cft);
2361 int len = sprintf(tmp, "%llu\n", (unsigned long long) val);
2362 2351
2363 return simple_read_from_buffer(buf, nbytes, ppos, tmp, len); 2352 if (cft->seq_stop)
2353 cft->seq_stop(seq, v);
2364} 2354}
2365 2355
2366static ssize_t cgroup_read_s64(struct cgroup_subsys_state *css, 2356static int cgroup_seqfile_show(struct seq_file *m, void *arg)
2367 struct cftype *cft, struct file *file,
2368 char __user *buf, size_t nbytes, loff_t *ppos)
2369{ 2357{
2370 char tmp[CGROUP_LOCAL_BUFFER_SIZE]; 2358 struct cftype *cft = seq_cft(m);
2371 s64 val = cft->read_s64(css, cft); 2359 struct cgroup_subsys_state *css = seq_css(m);
2372 int len = sprintf(tmp, "%lld\n", (long long) val);
2373 2360
2374 return simple_read_from_buffer(buf, nbytes, ppos, tmp, len); 2361 if (cft->seq_show)
2375} 2362 return cft->seq_show(m, arg);
2376 2363
2377static ssize_t cgroup_file_read(struct file *file, char __user *buf,
2378 size_t nbytes, loff_t *ppos)
2379{
2380 struct cfent *cfe = __d_cfe(file->f_dentry);
2381 struct cftype *cft = __d_cft(file->f_dentry);
2382 struct cgroup_subsys_state *css = cfe->css;
2383
2384 if (cft->read)
2385 return cft->read(css, cft, file, buf, nbytes, ppos);
2386 if (cft->read_u64) 2364 if (cft->read_u64)
2387 return cgroup_read_u64(css, cft, file, buf, nbytes, ppos); 2365 seq_printf(m, "%llu\n", cft->read_u64(css, cft));
2388 if (cft->read_s64) 2366 else if (cft->read_s64)
2389 return cgroup_read_s64(css, cft, file, buf, nbytes, ppos); 2367 seq_printf(m, "%lld\n", cft->read_s64(css, cft));
2390 return -EINVAL; 2368 else
2391} 2369 return -EINVAL;
2392 2370 return 0;
2393/*
2394 * seqfile ops/methods for returning structured data. Currently just
2395 * supports string->u64 maps, but can be extended in future.
2396 */
2397
2398static int cgroup_map_add(struct cgroup_map_cb *cb, const char *key, u64 value)
2399{
2400 struct seq_file *sf = cb->state;
2401 return seq_printf(sf, "%s %llu\n", key, (unsigned long long)value);
2402}
2403
2404static int cgroup_seqfile_show(struct seq_file *m, void *arg)
2405{
2406 struct cfent *cfe = m->private;
2407 struct cftype *cft = cfe->type;
2408 struct cgroup_subsys_state *css = cfe->css;
2409
2410 if (cft->read_map) {
2411 struct cgroup_map_cb cb = {
2412 .fill = cgroup_map_add,
2413 .state = m,
2414 };
2415 return cft->read_map(css, cft, &cb);
2416 }
2417 return cft->read_seq_string(css, cft, m);
2418} 2371}
2419 2372
2420static const struct file_operations cgroup_seqfile_operations = { 2373static struct seq_operations cgroup_seq_operations = {
2421 .read = seq_read, 2374 .start = cgroup_seqfile_start,
2422 .write = cgroup_file_write, 2375 .next = cgroup_seqfile_next,
2423 .llseek = seq_lseek, 2376 .stop = cgroup_seqfile_stop,
2424 .release = single_release, 2377 .show = cgroup_seqfile_show,
2425}; 2378};
2426 2379
2427static int cgroup_file_open(struct inode *inode, struct file *file) 2380static int cgroup_file_open(struct inode *inode, struct file *file)
@@ -2430,6 +2383,7 @@ static int cgroup_file_open(struct inode *inode, struct file *file)
2430 struct cftype *cft = __d_cft(file->f_dentry); 2383 struct cftype *cft = __d_cft(file->f_dentry);
2431 struct cgroup *cgrp = __d_cgrp(cfe->dentry->d_parent); 2384 struct cgroup *cgrp = __d_cgrp(cfe->dentry->d_parent);
2432 struct cgroup_subsys_state *css; 2385 struct cgroup_subsys_state *css;
2386 struct cgroup_open_file *of;
2433 int err; 2387 int err;
2434 2388
2435 err = generic_file_open(inode, file); 2389 err = generic_file_open(inode, file);
@@ -2459,30 +2413,26 @@ static int cgroup_file_open(struct inode *inode, struct file *file)
2459 WARN_ON_ONCE(cfe->css && cfe->css != css); 2413 WARN_ON_ONCE(cfe->css && cfe->css != css);
2460 cfe->css = css; 2414 cfe->css = css;
2461 2415
2462 if (cft->read_map || cft->read_seq_string) { 2416 of = __seq_open_private(file, &cgroup_seq_operations,
2463 file->f_op = &cgroup_seqfile_operations; 2417 sizeof(struct cgroup_open_file));
2464 err = single_open(file, cgroup_seqfile_show, cfe); 2418 if (of) {
2465 } else if (cft->open) { 2419 of->cfe = cfe;
2466 err = cft->open(inode, file); 2420 return 0;
2467 } 2421 }
2468 2422
2469 if (css->ss && err) 2423 if (css->ss)
2470 css_put(css); 2424 css_put(css);
2471 return err; 2425 return -ENOMEM;
2472} 2426}
2473 2427
2474static int cgroup_file_release(struct inode *inode, struct file *file) 2428static int cgroup_file_release(struct inode *inode, struct file *file)
2475{ 2429{
2476 struct cfent *cfe = __d_cfe(file->f_dentry); 2430 struct cfent *cfe = __d_cfe(file->f_dentry);
2477 struct cftype *cft = __d_cft(file->f_dentry);
2478 struct cgroup_subsys_state *css = cfe->css; 2431 struct cgroup_subsys_state *css = cfe->css;
2479 int ret = 0;
2480 2432
2481 if (cft->release)
2482 ret = cft->release(inode, file);
2483 if (css->ss) 2433 if (css->ss)
2484 css_put(css); 2434 css_put(css);
2485 return ret; 2435 return seq_release_private(inode, file);
2486} 2436}
2487 2437
2488/* 2438/*
@@ -2593,7 +2543,7 @@ static ssize_t cgroup_listxattr(struct dentry *dentry, char *buf, size_t size)
2593} 2543}
2594 2544
2595static const struct file_operations cgroup_file_operations = { 2545static const struct file_operations cgroup_file_operations = {
2596 .read = cgroup_file_read, 2546 .read = seq_read,
2597 .write = cgroup_file_write, 2547 .write = cgroup_file_write,
2598 .llseek = generic_file_llseek, 2548 .llseek = generic_file_llseek,
2599 .open = cgroup_file_open, 2549 .open = cgroup_file_open,
@@ -2618,16 +2568,6 @@ static const struct inode_operations cgroup_dir_inode_operations = {
2618 .removexattr = cgroup_removexattr, 2568 .removexattr = cgroup_removexattr,
2619}; 2569};
2620 2570
2621/*
2622 * Check if a file is a control file
2623 */
2624static inline struct cftype *__file_cft(struct file *file)
2625{
2626 if (file_inode(file)->i_fop != &cgroup_file_operations)
2627 return ERR_PTR(-EINVAL);
2628 return __d_cft(file->f_dentry);
2629}
2630
2631static int cgroup_create_file(struct dentry *dentry, umode_t mode, 2571static int cgroup_create_file(struct dentry *dentry, umode_t mode,
2632 struct super_block *sb) 2572 struct super_block *sb)
2633{ 2573{
@@ -2685,12 +2625,11 @@ static umode_t cgroup_file_mode(const struct cftype *cft)
2685 if (cft->mode) 2625 if (cft->mode)
2686 return cft->mode; 2626 return cft->mode;
2687 2627
2688 if (cft->read || cft->read_u64 || cft->read_s64 || 2628 if (cft->read_u64 || cft->read_s64 || cft->seq_show)
2689 cft->read_map || cft->read_seq_string)
2690 mode |= S_IRUGO; 2629 mode |= S_IRUGO;
2691 2630
2692 if (cft->write || cft->write_u64 || cft->write_s64 || 2631 if (cft->write_u64 || cft->write_s64 || cft->write_string ||
2693 cft->write_string || cft->trigger) 2632 cft->trigger)
2694 mode |= S_IWUSR; 2633 mode |= S_IWUSR;
2695 2634
2696 return mode; 2635 return mode;
@@ -2986,9 +2925,9 @@ static void cgroup_enable_task_cg_lists(void)
2986 * @parent_css: css whose children to walk 2925 * @parent_css: css whose children to walk
2987 * 2926 *
2988 * This function returns the next child of @parent_css and should be called 2927 * This function returns the next child of @parent_css and should be called
2989 * under RCU read lock. The only requirement is that @parent_css and 2928 * under either cgroup_mutex or RCU read lock. The only requirement is
2990 * @pos_css are accessible. The next sibling is guaranteed to be returned 2929 * that @parent_css and @pos_css are accessible. The next sibling is
2991 * regardless of their states. 2930 * guaranteed to be returned regardless of their states.
2992 */ 2931 */
2993struct cgroup_subsys_state * 2932struct cgroup_subsys_state *
2994css_next_child(struct cgroup_subsys_state *pos_css, 2933css_next_child(struct cgroup_subsys_state *pos_css,
@@ -2998,7 +2937,7 @@ css_next_child(struct cgroup_subsys_state *pos_css,
2998 struct cgroup *cgrp = parent_css->cgroup; 2937 struct cgroup *cgrp = parent_css->cgroup;
2999 struct cgroup *next; 2938 struct cgroup *next;
3000 2939
3001 WARN_ON_ONCE(!rcu_read_lock_held()); 2940 cgroup_assert_mutex_or_rcu_locked();
3002 2941
3003 /* 2942 /*
3004 * @pos could already have been removed. Once a cgroup is removed, 2943 * @pos could already have been removed. Once a cgroup is removed,
@@ -3045,10 +2984,10 @@ EXPORT_SYMBOL_GPL(css_next_child);
3045 * to visit for pre-order traversal of @root's descendants. @root is 2984 * to visit for pre-order traversal of @root's descendants. @root is
3046 * included in the iteration and the first node to be visited. 2985 * included in the iteration and the first node to be visited.
3047 * 2986 *
3048 * While this function requires RCU read locking, it doesn't require the 2987 * While this function requires cgroup_mutex or RCU read locking, it
3049 * whole traversal to be contained in a single RCU critical section. This 2988 * doesn't require the whole traversal to be contained in a single critical
3050 * function will return the correct next descendant as long as both @pos 2989 * section. This function will return the correct next descendant as long
3051 * and @root are accessible and @pos is a descendant of @root. 2990 * as both @pos and @root are accessible and @pos is a descendant of @root.
3052 */ 2991 */
3053struct cgroup_subsys_state * 2992struct cgroup_subsys_state *
3054css_next_descendant_pre(struct cgroup_subsys_state *pos, 2993css_next_descendant_pre(struct cgroup_subsys_state *pos,
@@ -3056,7 +2995,7 @@ css_next_descendant_pre(struct cgroup_subsys_state *pos,
3056{ 2995{
3057 struct cgroup_subsys_state *next; 2996 struct cgroup_subsys_state *next;
3058 2997
3059 WARN_ON_ONCE(!rcu_read_lock_held()); 2998 cgroup_assert_mutex_or_rcu_locked();
3060 2999
3061 /* if first iteration, visit @root */ 3000 /* if first iteration, visit @root */
3062 if (!pos) 3001 if (!pos)
@@ -3087,17 +3026,17 @@ EXPORT_SYMBOL_GPL(css_next_descendant_pre);
3087 * is returned. This can be used during pre-order traversal to skip 3026 * is returned. This can be used during pre-order traversal to skip
3088 * subtree of @pos. 3027 * subtree of @pos.
3089 * 3028 *
3090 * While this function requires RCU read locking, it doesn't require the 3029 * While this function requires cgroup_mutex or RCU read locking, it
3091 * whole traversal to be contained in a single RCU critical section. This 3030 * doesn't require the whole traversal to be contained in a single critical
3092 * function will return the correct rightmost descendant as long as @pos is 3031 * section. This function will return the correct rightmost descendant as
3093 * accessible. 3032 * long as @pos is accessible.
3094 */ 3033 */
3095struct cgroup_subsys_state * 3034struct cgroup_subsys_state *
3096css_rightmost_descendant(struct cgroup_subsys_state *pos) 3035css_rightmost_descendant(struct cgroup_subsys_state *pos)
3097{ 3036{
3098 struct cgroup_subsys_state *last, *tmp; 3037 struct cgroup_subsys_state *last, *tmp;
3099 3038
3100 WARN_ON_ONCE(!rcu_read_lock_held()); 3039 cgroup_assert_mutex_or_rcu_locked();
3101 3040
3102 do { 3041 do {
3103 last = pos; 3042 last = pos;
@@ -3133,10 +3072,11 @@ css_leftmost_descendant(struct cgroup_subsys_state *pos)
3133 * to visit for post-order traversal of @root's descendants. @root is 3072 * to visit for post-order traversal of @root's descendants. @root is
3134 * included in the iteration and the last node to be visited. 3073 * included in the iteration and the last node to be visited.
3135 * 3074 *
3136 * While this function requires RCU read locking, it doesn't require the 3075 * While this function requires cgroup_mutex or RCU read locking, it
3137 * whole traversal to be contained in a single RCU critical section. This 3076 * doesn't require the whole traversal to be contained in a single critical
3138 * function will return the correct next descendant as long as both @pos 3077 * section. This function will return the correct next descendant as long
3139 * and @cgroup are accessible and @pos is a descendant of @cgroup. 3078 * as both @pos and @cgroup are accessible and @pos is a descendant of
3079 * @cgroup.
3140 */ 3080 */
3141struct cgroup_subsys_state * 3081struct cgroup_subsys_state *
3142css_next_descendant_post(struct cgroup_subsys_state *pos, 3082css_next_descendant_post(struct cgroup_subsys_state *pos,
@@ -3144,7 +3084,7 @@ css_next_descendant_post(struct cgroup_subsys_state *pos,
3144{ 3084{
3145 struct cgroup_subsys_state *next; 3085 struct cgroup_subsys_state *next;
3146 3086
3147 WARN_ON_ONCE(!rcu_read_lock_held()); 3087 cgroup_assert_mutex_or_rcu_locked();
3148 3088
3149 /* if first iteration, visit leftmost descendant which may be @root */ 3089 /* if first iteration, visit leftmost descendant which may be @root */
3150 if (!pos) 3090 if (!pos)
@@ -3483,14 +3423,12 @@ struct cgroup_pidlist {
3483 pid_t *list; 3423 pid_t *list;
3484 /* how many elements the above list has */ 3424 /* how many elements the above list has */
3485 int length; 3425 int length;
3486 /* how many files are using the current array */
3487 int use_count;
3488 /* each of these stored in a list by its cgroup */ 3426 /* each of these stored in a list by its cgroup */
3489 struct list_head links; 3427 struct list_head links;
3490 /* pointer to the cgroup we belong to, for list removal purposes */ 3428 /* pointer to the cgroup we belong to, for list removal purposes */
3491 struct cgroup *owner; 3429 struct cgroup *owner;
3492 /* protects the other fields */ 3430 /* for delayed destruction */
3493 struct rw_semaphore rwsem; 3431 struct delayed_work destroy_dwork;
3494}; 3432};
3495 3433
3496/* 3434/*
@@ -3506,6 +3444,7 @@ static void *pidlist_allocate(int count)
3506 else 3444 else
3507 return kmalloc(count * sizeof(pid_t), GFP_KERNEL); 3445 return kmalloc(count * sizeof(pid_t), GFP_KERNEL);
3508} 3446}
3447
3509static void pidlist_free(void *p) 3448static void pidlist_free(void *p)
3510{ 3449{
3511 if (is_vmalloc_addr(p)) 3450 if (is_vmalloc_addr(p))
@@ -3515,6 +3454,47 @@ static void pidlist_free(void *p)
3515} 3454}
3516 3455
3517/* 3456/*
3457 * Used to destroy all pidlists lingering waiting for destroy timer. None
3458 * should be left afterwards.
3459 */
3460static void cgroup_pidlist_destroy_all(struct cgroup *cgrp)
3461{
3462 struct cgroup_pidlist *l, *tmp_l;
3463
3464 mutex_lock(&cgrp->pidlist_mutex);
3465 list_for_each_entry_safe(l, tmp_l, &cgrp->pidlists, links)
3466 mod_delayed_work(cgroup_pidlist_destroy_wq, &l->destroy_dwork, 0);
3467 mutex_unlock(&cgrp->pidlist_mutex);
3468
3469 flush_workqueue(cgroup_pidlist_destroy_wq);
3470 BUG_ON(!list_empty(&cgrp->pidlists));
3471}
3472
3473static void cgroup_pidlist_destroy_work_fn(struct work_struct *work)
3474{
3475 struct delayed_work *dwork = to_delayed_work(work);
3476 struct cgroup_pidlist *l = container_of(dwork, struct cgroup_pidlist,
3477 destroy_dwork);
3478 struct cgroup_pidlist *tofree = NULL;
3479
3480 mutex_lock(&l->owner->pidlist_mutex);
3481
3482 /*
3483 * Destroy iff we didn't get queued again. The state won't change
3484 * as destroy_dwork can only be queued while locked.
3485 */
3486 if (!delayed_work_pending(dwork)) {
3487 list_del(&l->links);
3488 pidlist_free(l->list);
3489 put_pid_ns(l->key.ns);
3490 tofree = l;
3491 }
3492
3493 mutex_unlock(&l->owner->pidlist_mutex);
3494 kfree(tofree);
3495}
3496
3497/*
3518 * pidlist_uniq - given a kmalloc()ed list, strip out all duplicate entries 3498 * pidlist_uniq - given a kmalloc()ed list, strip out all duplicate entries
3519 * Returns the number of unique elements. 3499 * Returns the number of unique elements.
3520 */ 3500 */
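cgroup_pidlist_destroy_work_fn() above frees a pidlist only if nobody re-armed destroy_dwork while the delay was running, which is what lets consecutive readers keep extending the one-second linger. A stripped-down sketch of that "destroy unless re-queued" idiom, assuming the delayed-work API used here (mod_delayed_work, delayed_work_pending, to_delayed_work); the structure, lock and queue names are placeholders and callers are expected to re-arm under the same lock:

#include <linux/mutex.h>
#include <linux/slab.h>
#include <linux/workqueue.h>

struct demo_cache {
        struct mutex lock;
        struct delayed_work destroy_dwork;
        void *payload;
};

/* Users re-arm the timer on every access, while holding c->lock:
 *      mod_delayed_work(demo_wq, &c->destroy_dwork, HZ);
 */
static void demo_destroy_fn(struct work_struct *work)
{
        struct delayed_work *dwork = to_delayed_work(work);
        struct demo_cache *c = container_of(dwork, struct demo_cache,
                                            destroy_dwork);
        void *tofree = NULL;

        mutex_lock(&c->lock);
        /* Free only if no user re-armed the work since it fired; the
         * pending state cannot change while c->lock is held. */
        if (!delayed_work_pending(dwork)) {
                tofree = c->payload;
                c->payload = NULL;
        }
        mutex_unlock(&c->lock);
        kfree(tofree);
}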
@@ -3544,52 +3524,92 @@ after:
3544 return dest; 3524 return dest;
3545} 3525}
3546 3526
3527/*
3528 * The two pid files - task and cgroup.procs - guaranteed that the result
3529 * is sorted, which forced this whole pidlist fiasco. As pid order is
3530 * different per namespace, each namespace needs differently sorted list,
3531 * making it impossible to use, for example, single rbtree of member tasks
3532 * sorted by task pointer. As pidlists can be fairly large, allocating one
3533 * per open file is dangerous, so cgroup had to implement shared pool of
3534 * pidlists keyed by cgroup and namespace.
3535 *
3536 * All this extra complexity was caused by the original implementation
3537 * committing to an entirely unnecessary property. In the long term, we
3538 * want to do away with it. Explicitly scramble sort order if
3539 * sane_behavior so that no such expectation exists in the new interface.
3540 *
3541 * Scrambling is done by swapping every two consecutive bits, which is
3542 * non-identity one-to-one mapping which disturbs sort order sufficiently.
3543 */
3544static pid_t pid_fry(pid_t pid)
3545{
3546 unsigned a = pid & 0x55555555;
3547 unsigned b = pid & 0xAAAAAAAA;
3548
3549 return (a << 1) | (b >> 1);
3550}
3551
3552static pid_t cgroup_pid_fry(struct cgroup *cgrp, pid_t pid)
3553{
3554 if (cgroup_sane_behavior(cgrp))
3555 return pid_fry(pid);
3556 else
3557 return pid;
3558}
3559
3547static int cmppid(const void *a, const void *b) 3560static int cmppid(const void *a, const void *b)
3548{ 3561{
3549 return *(pid_t *)a - *(pid_t *)b; 3562 return *(pid_t *)a - *(pid_t *)b;
3550} 3563}
3551 3564
3565static int fried_cmppid(const void *a, const void *b)
3566{
3567 return pid_fry(*(pid_t *)a) - pid_fry(*(pid_t *)b);
3568}
3569
3570static struct cgroup_pidlist *cgroup_pidlist_find(struct cgroup *cgrp,
3571 enum cgroup_filetype type)
3572{
3573 struct cgroup_pidlist *l;
3574 /* don't need task_nsproxy() if we're looking at ourself */
3575 struct pid_namespace *ns = task_active_pid_ns(current);
3576
3577 lockdep_assert_held(&cgrp->pidlist_mutex);
3578
3579 list_for_each_entry(l, &cgrp->pidlists, links)
3580 if (l->key.type == type && l->key.ns == ns)
3581 return l;
3582 return NULL;
3583}
3584
3552/* 3585/*
3553 * find the appropriate pidlist for our purpose (given procs vs tasks) 3586 * find the appropriate pidlist for our purpose (given procs vs tasks)
3554 * returns with the lock on that pidlist already held, and takes care 3587 * returns with the lock on that pidlist already held, and takes care
3555 * of the use count, or returns NULL with no locks held if we're out of 3588 * of the use count, or returns NULL with no locks held if we're out of
3556 * memory. 3589 * memory.
3557 */ 3590 */
3558static struct cgroup_pidlist *cgroup_pidlist_find(struct cgroup *cgrp, 3591static struct cgroup_pidlist *cgroup_pidlist_find_create(struct cgroup *cgrp,
3559 enum cgroup_filetype type) 3592 enum cgroup_filetype type)
3560{ 3593{
3561 struct cgroup_pidlist *l; 3594 struct cgroup_pidlist *l;
3562 /* don't need task_nsproxy() if we're looking at ourself */
3563 struct pid_namespace *ns = task_active_pid_ns(current);
3564 3595
3565 /* 3596 lockdep_assert_held(&cgrp->pidlist_mutex);
3566 * We can't drop the pidlist_mutex before taking the l->rwsem in case 3597
3567 * the last ref-holder is trying to remove l from the list at the same 3598 l = cgroup_pidlist_find(cgrp, type);
3568 * time. Holding the pidlist_mutex precludes somebody taking whichever 3599 if (l)
3569 * list we find out from under us - compare release_pid_array(). 3600 return l;
3570 */ 3601
3571 mutex_lock(&cgrp->pidlist_mutex);
3572 list_for_each_entry(l, &cgrp->pidlists, links) {
3573 if (l->key.type == type && l->key.ns == ns) {
3574 /* make sure l doesn't vanish out from under us */
3575 down_write(&l->rwsem);
3576 mutex_unlock(&cgrp->pidlist_mutex);
3577 return l;
3578 }
3579 }
3580 /* entry not found; create a new one */ 3602 /* entry not found; create a new one */
3581 l = kzalloc(sizeof(struct cgroup_pidlist), GFP_KERNEL); 3603 l = kzalloc(sizeof(struct cgroup_pidlist), GFP_KERNEL);
3582 if (!l) { 3604 if (!l)
3583 mutex_unlock(&cgrp->pidlist_mutex);
3584 return l; 3605 return l;
3585 } 3606
3586 init_rwsem(&l->rwsem); 3607 INIT_DELAYED_WORK(&l->destroy_dwork, cgroup_pidlist_destroy_work_fn);
3587 down_write(&l->rwsem);
3588 l->key.type = type; 3608 l->key.type = type;
3589 l->key.ns = get_pid_ns(ns); 3609 /* don't need task_nsproxy() if we're looking at ourself */
3610 l->key.ns = get_pid_ns(task_active_pid_ns(current));
3590 l->owner = cgrp; 3611 l->owner = cgrp;
3591 list_add(&l->links, &cgrp->pidlists); 3612 list_add(&l->links, &cgrp->pidlists);
3592 mutex_unlock(&cgrp->pidlist_mutex);
3593 return l; 3613 return l;
3594} 3614}
3595 3615
@@ -3606,6 +3626,8 @@ static int pidlist_array_load(struct cgroup *cgrp, enum cgroup_filetype type,
3606 struct task_struct *tsk; 3626 struct task_struct *tsk;
3607 struct cgroup_pidlist *l; 3627 struct cgroup_pidlist *l;
3608 3628
3629 lockdep_assert_held(&cgrp->pidlist_mutex);
3630
3609 /* 3631 /*
3610 * If cgroup gets more users after we read count, we won't have 3632 * If cgroup gets more users after we read count, we won't have
3611 * enough space - tough. This race is indistinguishable to the 3633 * enough space - tough. This race is indistinguishable to the
@@ -3632,20 +3654,24 @@ static int pidlist_array_load(struct cgroup *cgrp, enum cgroup_filetype type,
3632 css_task_iter_end(&it); 3654 css_task_iter_end(&it);
3633 length = n; 3655 length = n;
3634 /* now sort & (if procs) strip out duplicates */ 3656 /* now sort & (if procs) strip out duplicates */
3635 sort(array, length, sizeof(pid_t), cmppid, NULL); 3657 if (cgroup_sane_behavior(cgrp))
3658 sort(array, length, sizeof(pid_t), fried_cmppid, NULL);
3659 else
3660 sort(array, length, sizeof(pid_t), cmppid, NULL);
3636 if (type == CGROUP_FILE_PROCS) 3661 if (type == CGROUP_FILE_PROCS)
3637 length = pidlist_uniq(array, length); 3662 length = pidlist_uniq(array, length);
3638 l = cgroup_pidlist_find(cgrp, type); 3663
3664 l = cgroup_pidlist_find_create(cgrp, type);
3639 if (!l) { 3665 if (!l) {
3666 mutex_unlock(&cgrp->pidlist_mutex);
3640 pidlist_free(array); 3667 pidlist_free(array);
3641 return -ENOMEM; 3668 return -ENOMEM;
3642 } 3669 }
3643 /* store array, freeing old if necessary - lock already held */ 3670
3671 /* store array, freeing old if necessary */
3644 pidlist_free(l->list); 3672 pidlist_free(l->list);
3645 l->list = array; 3673 l->list = array;
3646 l->length = length; 3674 l->length = length;
3647 l->use_count++;
3648 up_write(&l->rwsem);
3649 *lp = l; 3675 *lp = l;
3650 return 0; 3676 return 0;
3651} 3677}
@@ -3719,20 +3745,45 @@ static void *cgroup_pidlist_start(struct seq_file *s, loff_t *pos)
3719 * after a seek to the start). Use a binary-search to find the 3745 * after a seek to the start). Use a binary-search to find the
3720 * next pid to display, if any 3746 * next pid to display, if any
3721 */ 3747 */
3722 struct cgroup_pidlist *l = s->private; 3748 struct cgroup_open_file *of = s->private;
3749 struct cgroup *cgrp = seq_css(s)->cgroup;
3750 struct cgroup_pidlist *l;
3751 enum cgroup_filetype type = seq_cft(s)->private;
3723 int index = 0, pid = *pos; 3752 int index = 0, pid = *pos;
3724 int *iter; 3753 int *iter, ret;
3754
3755 mutex_lock(&cgrp->pidlist_mutex);
3756
3757 /*
3758 * !NULL @of->priv indicates that this isn't the first start()
3759 * after open. If the matching pidlist is around, we can use that.
3760 * Look for it. Note that @of->priv can't be used directly. It
3761 * could already have been destroyed.
3762 */
3763 if (of->priv)
3764 of->priv = cgroup_pidlist_find(cgrp, type);
3765
3766 /*
3767 * Either this is the first start() after open or the matching
3768 * pidlist has been destroyed in between. Create a new one.
3769 */
3770 if (!of->priv) {
3771 ret = pidlist_array_load(cgrp, type,
3772 (struct cgroup_pidlist **)&of->priv);
3773 if (ret)
3774 return ERR_PTR(ret);
3775 }
3776 l = of->priv;
3725 3777
3726 down_read(&l->rwsem);
3727 if (pid) { 3778 if (pid) {
3728 int end = l->length; 3779 int end = l->length;
3729 3780
3730 while (index < end) { 3781 while (index < end) {
3731 int mid = (index + end) / 2; 3782 int mid = (index + end) / 2;
3732 if (l->list[mid] == pid) { 3783 if (cgroup_pid_fry(cgrp, l->list[mid]) == pid) {
3733 index = mid; 3784 index = mid;
3734 break; 3785 break;
3735 } else if (l->list[mid] <= pid) 3786 } else if (cgroup_pid_fry(cgrp, l->list[mid]) <= pid)
3736 index = mid + 1; 3787 index = mid + 1;
3737 else 3788 else
3738 end = mid; 3789 end = mid;
@@ -3743,19 +3794,25 @@ static void *cgroup_pidlist_start(struct seq_file *s, loff_t *pos)
3743 return NULL; 3794 return NULL;
3744 /* Update the abstract position to be the actual pid that we found */ 3795 /* Update the abstract position to be the actual pid that we found */
3745 iter = l->list + index; 3796 iter = l->list + index;
3746 *pos = *iter; 3797 *pos = cgroup_pid_fry(cgrp, *iter);
3747 return iter; 3798 return iter;
3748} 3799}
3749 3800
3750static void cgroup_pidlist_stop(struct seq_file *s, void *v) 3801static void cgroup_pidlist_stop(struct seq_file *s, void *v)
3751{ 3802{
3752 struct cgroup_pidlist *l = s->private; 3803 struct cgroup_open_file *of = s->private;
3753 up_read(&l->rwsem); 3804 struct cgroup_pidlist *l = of->priv;
3805
3806 if (l)
3807 mod_delayed_work(cgroup_pidlist_destroy_wq, &l->destroy_dwork,
3808 CGROUP_PIDLIST_DESTROY_DELAY);
3809 mutex_unlock(&seq_css(s)->cgroup->pidlist_mutex);
3754} 3810}
3755 3811
3756static void *cgroup_pidlist_next(struct seq_file *s, void *v, loff_t *pos) 3812static void *cgroup_pidlist_next(struct seq_file *s, void *v, loff_t *pos)
3757{ 3813{
3758 struct cgroup_pidlist *l = s->private; 3814 struct cgroup_open_file *of = s->private;
3815 struct cgroup_pidlist *l = of->priv;
3759 pid_t *p = v; 3816 pid_t *p = v;
3760 pid_t *end = l->list + l->length; 3817 pid_t *end = l->list + l->length;
3761 /* 3818 /*
@@ -3766,7 +3823,7 @@ static void *cgroup_pidlist_next(struct seq_file *s, void *v, loff_t *pos)
3766 if (p >= end) { 3823 if (p >= end) {
3767 return NULL; 3824 return NULL;
3768 } else { 3825 } else {
3769 *pos = *p; 3826 *pos = cgroup_pid_fry(seq_css(s)->cgroup, *p);
3770 return p; 3827 return p;
3771 } 3828 }
3772} 3829}
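cgroup_pidlist_start() above resumes from the pid saved in *pos rather than from a remembered array index, because the pidlist may have been rebuilt (or destroyed and recreated) between reads; the binary search then relocates the cursor in whatever array is current. A userspace model of that resume-by-key idea (the array contents are made up):

#include <stdio.h>

/* Return the slot for `pos`: the element itself if still present, else the
 * next larger one, mirroring the binary search in cgroup_pidlist_start(). */
static int resume_index(const int *list, int length, int pos)
{
        int index = 0, end = length;

        while (index < end) {
                int mid = (index + end) / 2;

                if (list[mid] == pos)
                        return mid;
                else if (list[mid] < pos)
                        index = mid + 1;
                else
                        end = mid;
        }
        return index;
}

int main(void)
{
        int pids[] = { 3, 7, 12, 40, 41, 95 };  /* freshly rebuilt list */
        int n = sizeof(pids) / sizeof(pids[0]);
        int i;

        /* The reader last returned pid 12; it may be gone from the new
         * list, so restart at it or at the next larger pid. */
        for (i = resume_index(pids, n, 12); i < n; i++)
                printf("%d\n", pids[i]);
        return 0;
}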
@@ -3787,92 +3844,6 @@ static const struct seq_operations cgroup_pidlist_seq_operations = {
3787 .show = cgroup_pidlist_show, 3844 .show = cgroup_pidlist_show,
3788}; 3845};
3789 3846
3790static void cgroup_release_pid_array(struct cgroup_pidlist *l)
3791{
3792 /*
3793 * the case where we're the last user of this particular pidlist will
3794 * have us remove it from the cgroup's list, which entails taking the
3795 * mutex. since in pidlist_find the pidlist->lock depends on cgroup->
3796 * pidlist_mutex, we have to take pidlist_mutex first.
3797 */
3798 mutex_lock(&l->owner->pidlist_mutex);
3799 down_write(&l->rwsem);
3800 BUG_ON(!l->use_count);
3801 if (!--l->use_count) {
3802 /* we're the last user if refcount is 0; remove and free */
3803 list_del(&l->links);
3804 mutex_unlock(&l->owner->pidlist_mutex);
3805 pidlist_free(l->list);
3806 put_pid_ns(l->key.ns);
3807 up_write(&l->rwsem);
3808 kfree(l);
3809 return;
3810 }
3811 mutex_unlock(&l->owner->pidlist_mutex);
3812 up_write(&l->rwsem);
3813}
3814
3815static int cgroup_pidlist_release(struct inode *inode, struct file *file)
3816{
3817 struct cgroup_pidlist *l;
3818 if (!(file->f_mode & FMODE_READ))
3819 return 0;
3820 /*
3821 * the seq_file will only be initialized if the file was opened for
3822 * reading; hence we check if it's not null only in that case.
3823 */
3824 l = ((struct seq_file *)file->private_data)->private;
3825 cgroup_release_pid_array(l);
3826 return seq_release(inode, file);
3827}
3828
3829static const struct file_operations cgroup_pidlist_operations = {
3830 .read = seq_read,
3831 .llseek = seq_lseek,
3832 .write = cgroup_file_write,
3833 .release = cgroup_pidlist_release,
3834};
3835
3836/*
3837 * The following functions handle opens on a file that displays a pidlist
3838 * (tasks or procs). Prepare an array of the process/thread IDs of whoever's
3839 * in the cgroup.
3840 */
3841/* helper function for the two below it */
3842static int cgroup_pidlist_open(struct file *file, enum cgroup_filetype type)
3843{
3844 struct cgroup *cgrp = __d_cgrp(file->f_dentry->d_parent);
3845 struct cgroup_pidlist *l;
3846 int retval;
3847
3848 /* Nothing to do for write-only files */
3849 if (!(file->f_mode & FMODE_READ))
3850 return 0;
3851
3852 /* have the array populated */
3853 retval = pidlist_array_load(cgrp, type, &l);
3854 if (retval)
3855 return retval;
3856 /* configure file information */
3857 file->f_op = &cgroup_pidlist_operations;
3858
3859 retval = seq_open(file, &cgroup_pidlist_seq_operations);
3860 if (retval) {
3861 cgroup_release_pid_array(l);
3862 return retval;
3863 }
3864 ((struct seq_file *)file->private_data)->private = l;
3865 return 0;
3866}
3867static int cgroup_tasks_open(struct inode *unused, struct file *file)
3868{
3869 return cgroup_pidlist_open(file, CGROUP_FILE_TASKS);
3870}
3871static int cgroup_procs_open(struct inode *unused, struct file *file)
3872{
3873 return cgroup_pidlist_open(file, CGROUP_FILE_PROCS);
3874}
3875
3876static u64 cgroup_read_notify_on_release(struct cgroup_subsys_state *css, 3847static u64 cgroup_read_notify_on_release(struct cgroup_subsys_state *css,
3877 struct cftype *cft) 3848 struct cftype *cft)
3878{ 3849{
@@ -3907,202 +3878,6 @@ static void cgroup_dput(struct cgroup *cgrp)
3907 deactivate_super(sb); 3878 deactivate_super(sb);
3908} 3879}
3909 3880
3910/*
3911 * Unregister event and free resources.
3912 *
3913 * Gets called from workqueue.
3914 */
3915static void cgroup_event_remove(struct work_struct *work)
3916{
3917 struct cgroup_event *event = container_of(work, struct cgroup_event,
3918 remove);
3919 struct cgroup_subsys_state *css = event->css;
3920
3921 remove_wait_queue(event->wqh, &event->wait);
3922
3923 event->cft->unregister_event(css, event->cft, event->eventfd);
3924
3925 /* Notify userspace the event is going away. */
3926 eventfd_signal(event->eventfd, 1);
3927
3928 eventfd_ctx_put(event->eventfd);
3929 kfree(event);
3930 css_put(css);
3931}
3932
3933/*
3934 * Gets called on POLLHUP on eventfd when user closes it.
3935 *
3936 * Called with wqh->lock held and interrupts disabled.
3937 */
3938static int cgroup_event_wake(wait_queue_t *wait, unsigned mode,
3939 int sync, void *key)
3940{
3941 struct cgroup_event *event = container_of(wait,
3942 struct cgroup_event, wait);
3943 struct cgroup *cgrp = event->css->cgroup;
3944 unsigned long flags = (unsigned long)key;
3945
3946 if (flags & POLLHUP) {
3947 /*
3948 * If the event has been detached at cgroup removal, we
3949 * can simply return knowing the other side will cleanup
3950 * for us.
3951 *
3952 * We can't race against event freeing since the other
3953 * side will require wqh->lock via remove_wait_queue(),
3954 * which we hold.
3955 */
3956 spin_lock(&cgrp->event_list_lock);
3957 if (!list_empty(&event->list)) {
3958 list_del_init(&event->list);
3959 /*
3960 * We are in atomic context, but cgroup_event_remove()
3961 * may sleep, so we have to call it in workqueue.
3962 */
3963 schedule_work(&event->remove);
3964 }
3965 spin_unlock(&cgrp->event_list_lock);
3966 }
3967
3968 return 0;
3969}
3970
3971static void cgroup_event_ptable_queue_proc(struct file *file,
3972 wait_queue_head_t *wqh, poll_table *pt)
3973{
3974 struct cgroup_event *event = container_of(pt,
3975 struct cgroup_event, pt);
3976
3977 event->wqh = wqh;
3978 add_wait_queue(wqh, &event->wait);
3979}
3980
3981/*
3982 * Parse input and register new cgroup event handler.
3983 *
3984 * Input must be in format '<event_fd> <control_fd> <args>'.
3985 * Interpretation of args is defined by control file implementation.
3986 */
3987static int cgroup_write_event_control(struct cgroup_subsys_state *dummy_css,
3988 struct cftype *cft, const char *buffer)
3989{
3990 struct cgroup *cgrp = dummy_css->cgroup;
3991 struct cgroup_event *event;
3992 struct cgroup_subsys_state *cfile_css;
3993 unsigned int efd, cfd;
3994 struct fd efile;
3995 struct fd cfile;
3996 char *endp;
3997 int ret;
3998
3999 efd = simple_strtoul(buffer, &endp, 10);
4000 if (*endp != ' ')
4001 return -EINVAL;
4002 buffer = endp + 1;
4003
4004 cfd = simple_strtoul(buffer, &endp, 10);
4005 if ((*endp != ' ') && (*endp != '\0'))
4006 return -EINVAL;
4007 buffer = endp + 1;
4008
4009 event = kzalloc(sizeof(*event), GFP_KERNEL);
4010 if (!event)
4011 return -ENOMEM;
4012
4013 INIT_LIST_HEAD(&event->list);
4014 init_poll_funcptr(&event->pt, cgroup_event_ptable_queue_proc);
4015 init_waitqueue_func_entry(&event->wait, cgroup_event_wake);
4016 INIT_WORK(&event->remove, cgroup_event_remove);
4017
4018 efile = fdget(efd);
4019 if (!efile.file) {
4020 ret = -EBADF;
4021 goto out_kfree;
4022 }
4023
4024 event->eventfd = eventfd_ctx_fileget(efile.file);
4025 if (IS_ERR(event->eventfd)) {
4026 ret = PTR_ERR(event->eventfd);
4027 goto out_put_efile;
4028 }
4029
4030 cfile = fdget(cfd);
4031 if (!cfile.file) {
4032 ret = -EBADF;
4033 goto out_put_eventfd;
4034 }
4035
 4036 /* the process needs read permission on the control file */
4037 /* AV: shouldn't we check that it's been opened for read instead? */
4038 ret = inode_permission(file_inode(cfile.file), MAY_READ);
4039 if (ret < 0)
4040 goto out_put_cfile;
4041
4042 event->cft = __file_cft(cfile.file);
4043 if (IS_ERR(event->cft)) {
4044 ret = PTR_ERR(event->cft);
4045 goto out_put_cfile;
4046 }
4047
4048 if (!event->cft->ss) {
4049 ret = -EBADF;
4050 goto out_put_cfile;
4051 }
4052
4053 /*
4054 * Determine the css of @cfile, verify it belongs to the same
4055 * cgroup as cgroup.event_control, and associate @event with it.
4056 * Remaining events are automatically removed on cgroup destruction
4057 * but the removal is asynchronous, so take an extra ref.
4058 */
4059 rcu_read_lock();
4060
4061 ret = -EINVAL;
4062 event->css = cgroup_css(cgrp, event->cft->ss);
4063 cfile_css = css_from_dir(cfile.file->f_dentry->d_parent, event->cft->ss);
4064 if (event->css && event->css == cfile_css && css_tryget(event->css))
4065 ret = 0;
4066
4067 rcu_read_unlock();
4068 if (ret)
4069 goto out_put_cfile;
4070
4071 if (!event->cft->register_event || !event->cft->unregister_event) {
4072 ret = -EINVAL;
4073 goto out_put_css;
4074 }
4075
4076 ret = event->cft->register_event(event->css, event->cft,
4077 event->eventfd, buffer);
4078 if (ret)
4079 goto out_put_css;
4080
4081 efile.file->f_op->poll(efile.file, &event->pt);
4082
4083 spin_lock(&cgrp->event_list_lock);
4084 list_add(&event->list, &cgrp->event_list);
4085 spin_unlock(&cgrp->event_list_lock);
4086
4087 fdput(cfile);
4088 fdput(efile);
4089
4090 return 0;
4091
4092out_put_css:
4093 css_put(event->css);
4094out_put_cfile:
4095 fdput(cfile);
4096out_put_eventfd:
4097 eventfd_ctx_put(event->eventfd);
4098out_put_efile:
4099 fdput(efile);
4100out_kfree:
4101 kfree(event);
4102
4103 return ret;
4104}
4105
4106static u64 cgroup_clone_children_read(struct cgroup_subsys_state *css, 3881static u64 cgroup_clone_children_read(struct cgroup_subsys_state *css,
4107 struct cftype *cft) 3882 struct cftype *cft)
4108{ 3883{
@@ -4122,17 +3897,15 @@ static int cgroup_clone_children_write(struct cgroup_subsys_state *css,
4122static struct cftype cgroup_base_files[] = { 3897static struct cftype cgroup_base_files[] = {
4123 { 3898 {
4124 .name = "cgroup.procs", 3899 .name = "cgroup.procs",
4125 .open = cgroup_procs_open, 3900 .seq_start = cgroup_pidlist_start,
3901 .seq_next = cgroup_pidlist_next,
3902 .seq_stop = cgroup_pidlist_stop,
3903 .seq_show = cgroup_pidlist_show,
3904 .private = CGROUP_FILE_PROCS,
4126 .write_u64 = cgroup_procs_write, 3905 .write_u64 = cgroup_procs_write,
4127 .release = cgroup_pidlist_release,
4128 .mode = S_IRUGO | S_IWUSR, 3906 .mode = S_IRUGO | S_IWUSR,
4129 }, 3907 },
4130 { 3908 {
4131 .name = "cgroup.event_control",
4132 .write_string = cgroup_write_event_control,
4133 .mode = S_IWUGO,
4134 },
4135 {
4136 .name = "cgroup.clone_children", 3909 .name = "cgroup.clone_children",
4137 .flags = CFTYPE_INSANE, 3910 .flags = CFTYPE_INSANE,
4138 .read_u64 = cgroup_clone_children_read, 3911 .read_u64 = cgroup_clone_children_read,
@@ -4141,7 +3914,7 @@ static struct cftype cgroup_base_files[] = {
4141 { 3914 {
4142 .name = "cgroup.sane_behavior", 3915 .name = "cgroup.sane_behavior",
4143 .flags = CFTYPE_ONLY_ON_ROOT, 3916 .flags = CFTYPE_ONLY_ON_ROOT,
4144 .read_seq_string = cgroup_sane_behavior_show, 3917 .seq_show = cgroup_sane_behavior_show,
4145 }, 3918 },
4146 3919
4147 /* 3920 /*
@@ -4152,9 +3925,12 @@ static struct cftype cgroup_base_files[] = {
4152 { 3925 {
4153 .name = "tasks", 3926 .name = "tasks",
4154 .flags = CFTYPE_INSANE, /* use "procs" instead */ 3927 .flags = CFTYPE_INSANE, /* use "procs" instead */
4155 .open = cgroup_tasks_open, 3928 .seq_start = cgroup_pidlist_start,
3929 .seq_next = cgroup_pidlist_next,
3930 .seq_stop = cgroup_pidlist_stop,
3931 .seq_show = cgroup_pidlist_show,
3932 .private = CGROUP_FILE_TASKS,
4156 .write_u64 = cgroup_tasks_write, 3933 .write_u64 = cgroup_tasks_write,
4157 .release = cgroup_pidlist_release,
4158 .mode = S_IRUGO | S_IWUSR, 3934 .mode = S_IRUGO | S_IWUSR,
4159 }, 3935 },
4160 { 3936 {
@@ -4166,7 +3942,7 @@ static struct cftype cgroup_base_files[] = {
4166 { 3942 {
4167 .name = "release_agent", 3943 .name = "release_agent",
4168 .flags = CFTYPE_INSANE | CFTYPE_ONLY_ON_ROOT, 3944 .flags = CFTYPE_INSANE | CFTYPE_ONLY_ON_ROOT,
4169 .read_seq_string = cgroup_release_agent_show, 3945 .seq_show = cgroup_release_agent_show,
4170 .write_string = cgroup_release_agent_write, 3946 .write_string = cgroup_release_agent_write,
4171 .max_write_len = PATH_MAX, 3947 .max_write_len = PATH_MAX,
4172 }, 3948 },
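
The hunks above replace the old .open/.read_seq_string callbacks in cgroup_base_files[] with the seq_file-based .seq_show interface (plus the seq_start/next/stop trio for the pidlist files). For orientation, a minimal cftype written against the new interface could look like the sketch below; example_show() and "example.stat" are made-up names, and seq_css() is the helper this series uses to get from a seq_file back to its css.

/* Sketch only -- hypothetical controller file using .seq_show. */
static int example_show(struct seq_file *sf, void *v)
{
	struct cgroup_subsys_state *css = seq_css(sf);

	seq_printf(sf, "cgroup id %d\n", css->cgroup->id);
	return 0;
}

static struct cftype example_files[] = {
	{
		.name = "example.stat",
		.seq_show = example_show,
	},
	{ }	/* terminate */
};
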
@@ -4249,7 +4025,7 @@ static void css_free_rcu_fn(struct rcu_head *rcu_head)
4249 * css_put(). dput() requires process context which we don't have. 4025 * css_put(). dput() requires process context which we don't have.
4250 */ 4026 */
4251 INIT_WORK(&css->destroy_work, css_free_work_fn); 4027 INIT_WORK(&css->destroy_work, css_free_work_fn);
4252 schedule_work(&css->destroy_work); 4028 queue_work(cgroup_destroy_wq, &css->destroy_work);
4253} 4029}
4254 4030
4255static void css_release(struct percpu_ref *ref) 4031static void css_release(struct percpu_ref *ref)
@@ -4257,6 +4033,7 @@ static void css_release(struct percpu_ref *ref)
4257 struct cgroup_subsys_state *css = 4033 struct cgroup_subsys_state *css =
4258 container_of(ref, struct cgroup_subsys_state, refcnt); 4034 container_of(ref, struct cgroup_subsys_state, refcnt);
4259 4035
4036 rcu_assign_pointer(css->cgroup->subsys[css->ss->subsys_id], NULL);
4260 call_rcu(&css->rcu_head, css_free_rcu_fn); 4037 call_rcu(&css->rcu_head, css_free_rcu_fn);
4261} 4038}
4262 4039
@@ -4311,6 +4088,62 @@ static void offline_css(struct cgroup_subsys_state *css)
4311 RCU_INIT_POINTER(css->cgroup->subsys[ss->subsys_id], css); 4088 RCU_INIT_POINTER(css->cgroup->subsys[ss->subsys_id], css);
4312} 4089}
4313 4090
4091/**
4092 * create_css - create a cgroup_subsys_state
4093 * @cgrp: the cgroup new css will be associated with
4094 * @ss: the subsys of new css
4095 *
4096 * Create a new css associated with @cgrp - @ss pair. On success, the new
4097 * css is online and installed in @cgrp with all interface files created.
4098 * Returns 0 on success, -errno on failure.
4099 */
4100static int create_css(struct cgroup *cgrp, struct cgroup_subsys *ss)
4101{
4102 struct cgroup *parent = cgrp->parent;
4103 struct cgroup_subsys_state *css;
4104 int err;
4105
4106 lockdep_assert_held(&cgrp->dentry->d_inode->i_mutex);
4107 lockdep_assert_held(&cgroup_mutex);
4108
4109 css = ss->css_alloc(cgroup_css(parent, ss));
4110 if (IS_ERR(css))
4111 return PTR_ERR(css);
4112
4113 err = percpu_ref_init(&css->refcnt, css_release);
4114 if (err)
4115 goto err_free;
4116
4117 init_css(css, ss, cgrp);
4118
4119 err = cgroup_populate_dir(cgrp, 1 << ss->subsys_id);
4120 if (err)
4121 goto err_free;
4122
4123 err = online_css(css);
4124 if (err)
4125 goto err_free;
4126
4127 dget(cgrp->dentry);
4128 css_get(css->parent);
4129
4130 if (ss->broken_hierarchy && !ss->warned_broken_hierarchy &&
4131 parent->parent) {
4132 pr_warning("cgroup: %s (%d) created nested cgroup for controller \"%s\" which has incomplete hierarchy support. Nested cgroups may change behavior in the future.\n",
4133 current->comm, current->pid, ss->name);
4134 if (!strcmp(ss->name, "memory"))
4135 pr_warning("cgroup: \"memory\" requires setting use_hierarchy to 1 on the root.\n");
4136 ss->warned_broken_hierarchy = true;
4137 }
4138
4139 return 0;
4140
4141err_free:
4142 percpu_ref_cancel_init(&css->refcnt);
4143 ss->css_free(css);
4144 return err;
4145}
4146
4314/* 4147/*
4315 * cgroup_create - create a cgroup 4148 * cgroup_create - create a cgroup
4316 * @parent: cgroup that will be parent of the new cgroup 4149 * @parent: cgroup that will be parent of the new cgroup
@@ -4322,11 +4155,10 @@ static void offline_css(struct cgroup_subsys_state *css)
4322static long cgroup_create(struct cgroup *parent, struct dentry *dentry, 4155static long cgroup_create(struct cgroup *parent, struct dentry *dentry,
4323 umode_t mode) 4156 umode_t mode)
4324{ 4157{
4325 struct cgroup_subsys_state *css_ar[CGROUP_SUBSYS_COUNT] = { };
4326 struct cgroup *cgrp; 4158 struct cgroup *cgrp;
4327 struct cgroup_name *name; 4159 struct cgroup_name *name;
4328 struct cgroupfs_root *root = parent->root; 4160 struct cgroupfs_root *root = parent->root;
4329 int err = 0; 4161 int ssid, err = 0;
4330 struct cgroup_subsys *ss; 4162 struct cgroup_subsys *ss;
4331 struct super_block *sb = root->sb; 4163 struct super_block *sb = root->sb;
4332 4164
@@ -4382,23 +4214,6 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry,
4382 if (test_bit(CGRP_CPUSET_CLONE_CHILDREN, &parent->flags)) 4214 if (test_bit(CGRP_CPUSET_CLONE_CHILDREN, &parent->flags))
4383 set_bit(CGRP_CPUSET_CLONE_CHILDREN, &cgrp->flags); 4215 set_bit(CGRP_CPUSET_CLONE_CHILDREN, &cgrp->flags);
4384 4216
4385 for_each_root_subsys(root, ss) {
4386 struct cgroup_subsys_state *css;
4387
4388 css = ss->css_alloc(cgroup_css(parent, ss));
4389 if (IS_ERR(css)) {
4390 err = PTR_ERR(css);
4391 goto err_free_all;
4392 }
4393 css_ar[ss->subsys_id] = css;
4394
4395 err = percpu_ref_init(&css->refcnt, css_release);
4396 if (err)
4397 goto err_free_all;
4398
4399 init_css(css, ss, cgrp);
4400 }
4401
4402 /* 4217 /*
4403 * Create directory. cgroup_create_file() returns with the new 4218 * Create directory. cgroup_create_file() returns with the new
4404 * directory locked on success so that it can be populated without 4219 * directory locked on success so that it can be populated without
@@ -4406,7 +4221,7 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry,
4406 */ 4221 */
4407 err = cgroup_create_file(dentry, S_IFDIR | mode, sb); 4222 err = cgroup_create_file(dentry, S_IFDIR | mode, sb);
4408 if (err < 0) 4223 if (err < 0)
4409 goto err_free_all; 4224 goto err_unlock;
4410 lockdep_assert_held(&dentry->d_inode->i_mutex); 4225 lockdep_assert_held(&dentry->d_inode->i_mutex);
4411 4226
4412 cgrp->serial_nr = cgroup_serial_nr_next++; 4227 cgrp->serial_nr = cgroup_serial_nr_next++;
@@ -4415,59 +4230,34 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry,
4415 list_add_tail_rcu(&cgrp->sibling, &cgrp->parent->children); 4230 list_add_tail_rcu(&cgrp->sibling, &cgrp->parent->children);
4416 root->number_of_cgroups++; 4231 root->number_of_cgroups++;
4417 4232
4418 /* each css holds a ref to the cgroup's dentry and the parent css */
4419 for_each_root_subsys(root, ss) {
4420 struct cgroup_subsys_state *css = css_ar[ss->subsys_id];
4421
4422 dget(dentry);
4423 css_get(css->parent);
4424 }
4425
4426 /* hold a ref to the parent's dentry */ 4233 /* hold a ref to the parent's dentry */
4427 dget(parent->dentry); 4234 dget(parent->dentry);
4428 4235
4429 /* creation succeeded, notify subsystems */ 4236 /*
4430 for_each_root_subsys(root, ss) { 4237 * @cgrp is now fully operational. If something fails after this
4431 struct cgroup_subsys_state *css = css_ar[ss->subsys_id]; 4238 * point, it'll be released via the normal destruction path.
4432 4239 */
4433 err = online_css(css);
4434 if (err)
4435 goto err_destroy;
4436
4437 if (ss->broken_hierarchy && !ss->warned_broken_hierarchy &&
4438 parent->parent) {
4439 pr_warning("cgroup: %s (%d) created nested cgroup for controller \"%s\" which has incomplete hierarchy support. Nested cgroups may change behavior in the future.\n",
4440 current->comm, current->pid, ss->name);
4441 if (!strcmp(ss->name, "memory"))
4442 pr_warning("cgroup: \"memory\" requires setting use_hierarchy to 1 on the root.\n");
4443 ss->warned_broken_hierarchy = true;
4444 }
4445 }
4446
4447 idr_replace(&root->cgroup_idr, cgrp, cgrp->id); 4240 idr_replace(&root->cgroup_idr, cgrp, cgrp->id);
4448 4241
4449 err = cgroup_addrm_files(cgrp, cgroup_base_files, true); 4242 err = cgroup_addrm_files(cgrp, cgroup_base_files, true);
4450 if (err) 4243 if (err)
4451 goto err_destroy; 4244 goto err_destroy;
4452 4245
4453 err = cgroup_populate_dir(cgrp, root->subsys_mask); 4246 /* let's create and online css's */
4454 if (err) 4247 for_each_subsys(ss, ssid) {
4455 goto err_destroy; 4248 if (root->subsys_mask & (1 << ssid)) {
4249 err = create_css(cgrp, ss);
4250 if (err)
4251 goto err_destroy;
4252 }
4253 }
4456 4254
4457 mutex_unlock(&cgroup_mutex); 4255 mutex_unlock(&cgroup_mutex);
4458 mutex_unlock(&cgrp->dentry->d_inode->i_mutex); 4256 mutex_unlock(&cgrp->dentry->d_inode->i_mutex);
4459 4257
4460 return 0; 4258 return 0;
4461 4259
4462err_free_all: 4260err_unlock:
4463 for_each_root_subsys(root, ss) {
4464 struct cgroup_subsys_state *css = css_ar[ss->subsys_id];
4465
4466 if (css) {
4467 percpu_ref_cancel_init(&css->refcnt);
4468 ss->css_free(css);
4469 }
4470 }
4471 mutex_unlock(&cgroup_mutex); 4261 mutex_unlock(&cgroup_mutex);
4472 /* Release the reference count that we took on the superblock */ 4262 /* Release the reference count that we took on the superblock */
4473 deactivate_super(sb); 4263 deactivate_super(sb);
@@ -4539,7 +4329,7 @@ static void css_killed_ref_fn(struct percpu_ref *ref)
4539 container_of(ref, struct cgroup_subsys_state, refcnt); 4329 container_of(ref, struct cgroup_subsys_state, refcnt);
4540 4330
4541 INIT_WORK(&css->destroy_work, css_killed_work_fn); 4331 INIT_WORK(&css->destroy_work, css_killed_work_fn);
4542 schedule_work(&css->destroy_work); 4332 queue_work(cgroup_destroy_wq, &css->destroy_work);
4543} 4333}
4544 4334
4545/** 4335/**
@@ -4602,10 +4392,10 @@ static int cgroup_destroy_locked(struct cgroup *cgrp)
4602 __releases(&cgroup_mutex) __acquires(&cgroup_mutex) 4392 __releases(&cgroup_mutex) __acquires(&cgroup_mutex)
4603{ 4393{
4604 struct dentry *d = cgrp->dentry; 4394 struct dentry *d = cgrp->dentry;
4605 struct cgroup_event *event, *tmp; 4395 struct cgroup_subsys_state *css;
4606 struct cgroup_subsys *ss;
4607 struct cgroup *child; 4396 struct cgroup *child;
4608 bool empty; 4397 bool empty;
4398 int ssid;
4609 4399
4610 lockdep_assert_held(&d->d_inode->i_mutex); 4400 lockdep_assert_held(&d->d_inode->i_mutex);
4611 lockdep_assert_held(&cgroup_mutex); 4401 lockdep_assert_held(&cgroup_mutex);
@@ -4641,8 +4431,8 @@ static int cgroup_destroy_locked(struct cgroup *cgrp)
4641 * will be invoked to perform the rest of destruction once the 4431 * will be invoked to perform the rest of destruction once the
4642 * percpu refs of all css's are confirmed to be killed. 4432 * percpu refs of all css's are confirmed to be killed.
4643 */ 4433 */
4644 for_each_root_subsys(cgrp->root, ss) 4434 for_each_css(css, ssid, cgrp)
4645 kill_css(cgroup_css(cgrp, ss)); 4435 kill_css(css);
4646 4436
4647 /* 4437 /*
4648 * Mark @cgrp dead. This prevents further task migration and child 4438 * Mark @cgrp dead. This prevents further task migration and child
@@ -4677,18 +4467,6 @@ static int cgroup_destroy_locked(struct cgroup *cgrp)
4677 dget(d); 4467 dget(d);
4678 cgroup_d_remove_dir(d); 4468 cgroup_d_remove_dir(d);
4679 4469
4680 /*
4681 * Unregister events and notify userspace.
4682 * Notify userspace about cgroup removing only after rmdir of cgroup
4683 * directory to avoid race between userspace and kernelspace.
4684 */
4685 spin_lock(&cgrp->event_list_lock);
4686 list_for_each_entry_safe(event, tmp, &cgrp->event_list, list) {
4687 list_del_init(&event->list);
4688 schedule_work(&event->remove);
4689 }
4690 spin_unlock(&cgrp->event_list_lock);
4691
4692 return 0; 4470 return 0;
4693}; 4471};
4694 4472
@@ -4711,14 +4489,6 @@ static void cgroup_destroy_css_killed(struct cgroup *cgrp)
4711 /* delete this cgroup from parent->children */ 4489 /* delete this cgroup from parent->children */
4712 list_del_rcu(&cgrp->sibling); 4490 list_del_rcu(&cgrp->sibling);
4713 4491
4714 /*
4715 * We should remove the cgroup object from idr before its grace
4716 * period starts, so we won't be looking up a cgroup while the
4717 * cgroup is being freed.
4718 */
4719 idr_remove(&cgrp->root->cgroup_idr, cgrp->id);
4720 cgrp->id = -1;
4721
4722 dput(d); 4492 dput(d);
4723 4493
4724 set_bit(CGRP_RELEASABLE, &parent->flags); 4494 set_bit(CGRP_RELEASABLE, &parent->flags);
@@ -4767,7 +4537,6 @@ static void __init cgroup_init_subsys(struct cgroup_subsys *ss)
4767 cgroup_init_cftsets(ss); 4537 cgroup_init_cftsets(ss);
4768 4538
4769 /* Create the top cgroup state for this subsystem */ 4539 /* Create the top cgroup state for this subsystem */
4770 list_add(&ss->sibling, &cgroup_dummy_root.subsys_list);
4771 ss->root = &cgroup_dummy_root; 4540 ss->root = &cgroup_dummy_root;
4772 css = ss->css_alloc(cgroup_css(cgroup_dummy_top, ss)); 4541 css = ss->css_alloc(cgroup_css(cgroup_dummy_top, ss));
4773 /* We don't handle early failures gracefully */ 4542 /* We don't handle early failures gracefully */
@@ -4841,6 +4610,7 @@ int __init_or_module cgroup_load_subsys(struct cgroup_subsys *ss)
4841 cgroup_init_cftsets(ss); 4610 cgroup_init_cftsets(ss);
4842 4611
4843 mutex_lock(&cgroup_mutex); 4612 mutex_lock(&cgroup_mutex);
4613 mutex_lock(&cgroup_root_mutex);
4844 cgroup_subsys[ss->subsys_id] = ss; 4614 cgroup_subsys[ss->subsys_id] = ss;
4845 4615
4846 /* 4616 /*
@@ -4852,11 +4622,11 @@ int __init_or_module cgroup_load_subsys(struct cgroup_subsys *ss)
4852 if (IS_ERR(css)) { 4622 if (IS_ERR(css)) {
4853 /* failure case - need to deassign the cgroup_subsys[] slot. */ 4623 /* failure case - need to deassign the cgroup_subsys[] slot. */
4854 cgroup_subsys[ss->subsys_id] = NULL; 4624 cgroup_subsys[ss->subsys_id] = NULL;
4625 mutex_unlock(&cgroup_root_mutex);
4855 mutex_unlock(&cgroup_mutex); 4626 mutex_unlock(&cgroup_mutex);
4856 return PTR_ERR(css); 4627 return PTR_ERR(css);
4857 } 4628 }
4858 4629
4859 list_add(&ss->sibling, &cgroup_dummy_root.subsys_list);
4860 ss->root = &cgroup_dummy_root; 4630 ss->root = &cgroup_dummy_root;
4861 4631
4862 /* our new subsystem will be attached to the dummy hierarchy. */ 4632 /* our new subsystem will be attached to the dummy hierarchy. */
@@ -4886,14 +4656,18 @@ int __init_or_module cgroup_load_subsys(struct cgroup_subsys *ss)
4886 write_unlock(&css_set_lock); 4656 write_unlock(&css_set_lock);
4887 4657
4888 ret = online_css(css); 4658 ret = online_css(css);
4889 if (ret) 4659 if (ret) {
4660 ss->css_free(css);
4890 goto err_unload; 4661 goto err_unload;
4662 }
4891 4663
4892 /* success! */ 4664 /* success! */
4665 mutex_unlock(&cgroup_root_mutex);
4893 mutex_unlock(&cgroup_mutex); 4666 mutex_unlock(&cgroup_mutex);
4894 return 0; 4667 return 0;
4895 4668
4896err_unload: 4669err_unload:
4670 mutex_unlock(&cgroup_root_mutex);
4897 mutex_unlock(&cgroup_mutex); 4671 mutex_unlock(&cgroup_mutex);
4898 /* @ss can't be mounted here as try_module_get() would fail */ 4672 /* @ss can't be mounted here as try_module_get() would fail */
4899 cgroup_unload_subsys(ss); 4673 cgroup_unload_subsys(ss);
@@ -4912,6 +4686,7 @@ EXPORT_SYMBOL_GPL(cgroup_load_subsys);
4912void cgroup_unload_subsys(struct cgroup_subsys *ss) 4686void cgroup_unload_subsys(struct cgroup_subsys *ss)
4913{ 4687{
4914 struct cgrp_cset_link *link; 4688 struct cgrp_cset_link *link;
4689 struct cgroup_subsys_state *css;
4915 4690
4916 BUG_ON(ss->module == NULL); 4691 BUG_ON(ss->module == NULL);
4917 4692
@@ -4923,15 +4698,15 @@ void cgroup_unload_subsys(struct cgroup_subsys *ss)
4923 BUG_ON(ss->root != &cgroup_dummy_root); 4698 BUG_ON(ss->root != &cgroup_dummy_root);
4924 4699
4925 mutex_lock(&cgroup_mutex); 4700 mutex_lock(&cgroup_mutex);
4701 mutex_lock(&cgroup_root_mutex);
4926 4702
4927 offline_css(cgroup_css(cgroup_dummy_top, ss)); 4703 css = cgroup_css(cgroup_dummy_top, ss);
4704 if (css)
4705 offline_css(css);
4928 4706
4929 /* deassign the subsys_id */ 4707 /* deassign the subsys_id */
4930 cgroup_subsys[ss->subsys_id] = NULL; 4708 cgroup_subsys[ss->subsys_id] = NULL;
4931 4709
4932 /* remove subsystem from the dummy root's list of subsystems */
4933 list_del_init(&ss->sibling);
4934
4935 /* 4710 /*
4936 * disentangle the css from all css_sets attached to the dummy 4711 * disentangle the css from all css_sets attached to the dummy
4937 * top. as in loading, we need to pay our respects to the hashtable 4712 * top. as in loading, we need to pay our respects to the hashtable
@@ -4954,9 +4729,11 @@ void cgroup_unload_subsys(struct cgroup_subsys *ss)
4954 * need to free before marking as null because ss->css_free needs 4729 * need to free before marking as null because ss->css_free needs
4955 * the cgrp->subsys pointer to find their state. 4730 * the cgrp->subsys pointer to find their state.
4956 */ 4731 */
4957 ss->css_free(cgroup_css(cgroup_dummy_top, ss)); 4732 if (css)
4733 ss->css_free(css);
4958 RCU_INIT_POINTER(cgroup_dummy_top->subsys[ss->subsys_id], NULL); 4734 RCU_INIT_POINTER(cgroup_dummy_top->subsys[ss->subsys_id], NULL);
4959 4735
4736 mutex_unlock(&cgroup_root_mutex);
4960 mutex_unlock(&cgroup_mutex); 4737 mutex_unlock(&cgroup_mutex);
4961} 4738}
4962EXPORT_SYMBOL_GPL(cgroup_unload_subsys); 4739EXPORT_SYMBOL_GPL(cgroup_unload_subsys);
@@ -5063,6 +4840,31 @@ out:
5063 return err; 4840 return err;
5064} 4841}
5065 4842
4843static int __init cgroup_wq_init(void)
4844{
4845 /*
4846 * There isn't much point in executing destruction path in
4847 * parallel. Good chunk is serialized with cgroup_mutex anyway.
4848 * Use 1 for @max_active.
4849 *
4850 * We would prefer to do this in cgroup_init() above, but that
4851 * is called before init_workqueues(): so leave this until after.
4852 */
4853 cgroup_destroy_wq = alloc_workqueue("cgroup_destroy", 0, 1);
4854 BUG_ON(!cgroup_destroy_wq);
4855
4856 /*
4857 * Used to destroy pidlists and separate to serve as flush domain.
4858 * Cap @max_active to 1 too.
4859 */
4860 cgroup_pidlist_destroy_wq = alloc_workqueue("cgroup_pidlist_destroy",
4861 0, 1);
4862 BUG_ON(!cgroup_pidlist_destroy_wq);
4863
4864 return 0;
4865}
4866core_initcall(cgroup_wq_init);
4867
5066/* 4868/*
5067 * proc_cgroup_show() 4869 * proc_cgroup_show()
5068 * - Print task's cgroup paths into seq_file, one line for each hierarchy 4870 * - Print task's cgroup paths into seq_file, one line for each hierarchy
@@ -5102,11 +4904,12 @@ int proc_cgroup_show(struct seq_file *m, void *v)
5102 for_each_active_root(root) { 4904 for_each_active_root(root) {
5103 struct cgroup_subsys *ss; 4905 struct cgroup_subsys *ss;
5104 struct cgroup *cgrp; 4906 struct cgroup *cgrp;
5105 int count = 0; 4907 int ssid, count = 0;
5106 4908
5107 seq_printf(m, "%d:", root->hierarchy_id); 4909 seq_printf(m, "%d:", root->hierarchy_id);
5108 for_each_root_subsys(root, ss) 4910 for_each_subsys(ss, ssid)
5109 seq_printf(m, "%s%s", count++ ? "," : "", ss->name); 4911 if (root->subsys_mask & (1 << ssid))
4912 seq_printf(m, "%s%s", count++ ? "," : "", ss->name);
5110 if (strlen(root->name)) 4913 if (strlen(root->name))
5111 seq_printf(m, "%sname=%s", count ? "," : "", 4914 seq_printf(m, "%sname=%s", count ? "," : "",
5112 root->name); 4915 root->name);
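
proc_cgroup_show() now iterates every registered subsystem by id and filters on root->subsys_mask instead of walking the root's subsys_list. The for_each_subsys() iterator itself is not part of this hunk; given the cgroup_subsys[] array seen elsewhere in this patch, its shape is presumably something like the sketch below (details may differ from the real macro).

/* Presumed shape of the ssid-based iterator; NULL slots (unloaded
 * modular subsystems) are left for the caller to filter out, e.g.
 * via root->subsys_mask as above. */
#define for_each_subsys(ss, ssid)					\
	for ((ssid) = 0; (ssid) < CGROUP_SUBSYS_COUNT &&		\
	     (((ss) = cgroup_subsys[(ssid)]) || true); (ssid)++)
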
@@ -5447,16 +5250,16 @@ __setup("cgroup_disable=", cgroup_disable);
5447 * @dentry: directory dentry of interest 5250 * @dentry: directory dentry of interest
5448 * @ss: subsystem of interest 5251 * @ss: subsystem of interest
5449 * 5252 *
5450 * Must be called under RCU read lock. The caller is responsible for 5253 * Must be called under cgroup_mutex or RCU read lock. The caller is
5451 * pinning the returned css if it needs to be accessed outside the RCU 5254 * responsible for pinning the returned css if it needs to be accessed
5452 * critical section. 5255 * outside the critical section.
5453 */ 5256 */
5454struct cgroup_subsys_state *css_from_dir(struct dentry *dentry, 5257struct cgroup_subsys_state *css_from_dir(struct dentry *dentry,
5455 struct cgroup_subsys *ss) 5258 struct cgroup_subsys *ss)
5456{ 5259{
5457 struct cgroup *cgrp; 5260 struct cgroup *cgrp;
5458 5261
5459 WARN_ON_ONCE(!rcu_read_lock_held()); 5262 cgroup_assert_mutex_or_rcu_locked();
5460 5263
5461 /* is @dentry a cgroup dir? */ 5264 /* is @dentry a cgroup dir? */
5462 if (!dentry->d_inode || 5265 if (!dentry->d_inode ||
@@ -5479,9 +5282,7 @@ struct cgroup_subsys_state *css_from_id(int id, struct cgroup_subsys *ss)
5479{ 5282{
5480 struct cgroup *cgrp; 5283 struct cgroup *cgrp;
5481 5284
5482 rcu_lockdep_assert(rcu_read_lock_held() || 5285 cgroup_assert_mutex_or_rcu_locked();
5483 lockdep_is_held(&cgroup_mutex),
5484 "css_from_id() needs proper protection");
5485 5286
5486 cgrp = idr_find(&ss->root->cgroup_idr, id); 5287 cgrp = idr_find(&ss->root->cgroup_idr, id);
5487 if (cgrp) 5288 if (cgrp)
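
Both css_from_dir() and css_from_id() now rely on cgroup_assert_mutex_or_rcu_locked(). The helper's definition is not shown in this diff, but judging by the rcu_lockdep_assert() call it replaces, it presumably amounts to:

/* Presumed definition, reconstructed from the assertion it replaces
 * above; the message text is a guess. */
#define cgroup_assert_mutex_or_rcu_locked()				\
	rcu_lockdep_assert(rcu_read_lock_held() ||			\
			   lockdep_is_held(&cgroup_mutex),		\
			   "cgroup_mutex or RCU read lock required");
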
@@ -5529,9 +5330,7 @@ static u64 current_css_set_refcount_read(struct cgroup_subsys_state *css,
5529 return count; 5330 return count;
5530} 5331}
5531 5332
5532static int current_css_set_cg_links_read(struct cgroup_subsys_state *css, 5333static int current_css_set_cg_links_read(struct seq_file *seq, void *v)
5533 struct cftype *cft,
5534 struct seq_file *seq)
5535{ 5334{
5536 struct cgrp_cset_link *link; 5335 struct cgrp_cset_link *link;
5537 struct css_set *cset; 5336 struct css_set *cset;
@@ -5556,9 +5355,9 @@ static int current_css_set_cg_links_read(struct cgroup_subsys_state *css,
5556} 5355}
5557 5356
5558#define MAX_TASKS_SHOWN_PER_CSS 25 5357#define MAX_TASKS_SHOWN_PER_CSS 25
5559static int cgroup_css_links_read(struct cgroup_subsys_state *css, 5358static int cgroup_css_links_read(struct seq_file *seq, void *v)
5560 struct cftype *cft, struct seq_file *seq)
5561{ 5359{
5360 struct cgroup_subsys_state *css = seq_css(seq);
5562 struct cgrp_cset_link *link; 5361 struct cgrp_cset_link *link;
5563 5362
5564 read_lock(&css_set_lock); 5363 read_lock(&css_set_lock);
@@ -5604,12 +5403,12 @@ static struct cftype debug_files[] = {
5604 5403
5605 { 5404 {
5606 .name = "current_css_set_cg_links", 5405 .name = "current_css_set_cg_links",
5607 .read_seq_string = current_css_set_cg_links_read, 5406 .seq_show = current_css_set_cg_links_read,
5608 }, 5407 },
5609 5408
5610 { 5409 {
5611 .name = "cgroup_css_links", 5410 .name = "cgroup_css_links",
5612 .read_seq_string = cgroup_css_links_read, 5411 .seq_show = cgroup_css_links_read,
5613 }, 5412 },
5614 5413
5615 { 5414 {
diff --git a/kernel/cgroup_freezer.c b/kernel/cgroup_freezer.c
index f0ff64d0ebaa..6c3154e477f6 100644
--- a/kernel/cgroup_freezer.c
+++ b/kernel/cgroup_freezer.c
@@ -301,10 +301,9 @@ out_unlock:
301 spin_unlock_irq(&freezer->lock); 301 spin_unlock_irq(&freezer->lock);
302} 302}
303 303
304static int freezer_read(struct cgroup_subsys_state *css, struct cftype *cft, 304static int freezer_read(struct seq_file *m, void *v)
305 struct seq_file *m)
306{ 305{
307 struct cgroup_subsys_state *pos; 306 struct cgroup_subsys_state *css = seq_css(m), *pos;
308 307
309 rcu_read_lock(); 308 rcu_read_lock();
310 309
@@ -458,7 +457,7 @@ static struct cftype files[] = {
458 { 457 {
459 .name = "state", 458 .name = "state",
460 .flags = CFTYPE_NOT_ON_ROOT, 459 .flags = CFTYPE_NOT_ON_ROOT,
461 .read_seq_string = freezer_read, 460 .seq_show = freezer_read,
462 .write_string = freezer_write, 461 .write_string = freezer_write,
463 }, 462 },
464 { 463 {
diff --git a/kernel/context_tracking.c b/kernel/context_tracking.c
index e5f3917aa05b..6cb20d2e7ee0 100644
--- a/kernel/context_tracking.c
+++ b/kernel/context_tracking.c
@@ -53,10 +53,10 @@ void context_tracking_user_enter(void)
53 /* 53 /*
54 * Repeat the user_enter() check here because some archs may be calling 54 * Repeat the user_enter() check here because some archs may be calling
55 * this from asm and if no CPU needs context tracking, they shouldn't 55 * this from asm and if no CPU needs context tracking, they shouldn't
56 * go further. Repeat the check here until they support the static key 56 * go further. Repeat the check here until they support the inline static
57 * check. 57 * key check.
58 */ 58 */
59 if (!static_key_false(&context_tracking_enabled)) 59 if (!context_tracking_is_enabled())
60 return; 60 return;
61 61
62 /* 62 /*
@@ -160,7 +160,7 @@ void context_tracking_user_exit(void)
160{ 160{
161 unsigned long flags; 161 unsigned long flags;
162 162
163 if (!static_key_false(&context_tracking_enabled)) 163 if (!context_tracking_is_enabled())
164 return; 164 return;
165 165
166 if (in_interrupt()) 166 if (in_interrupt())
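
The open-coded static_key_false() tests are replaced by context_tracking_is_enabled(). The wrapper is defined outside this diff; presumably it is a thin static inline over the same static key, along the lines of:

/* Assumed definition (not part of this patch): keeps the static-key
 * fast path while giving the check a readable name. */
static inline bool context_tracking_is_enabled(void)
{
	return static_key_false(&context_tracking_enabled);
}
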
diff --git a/kernel/cpu/idle.c b/kernel/cpu/idle.c
index 988573a9a387..277f494c2a9a 100644
--- a/kernel/cpu/idle.c
+++ b/kernel/cpu/idle.c
@@ -105,14 +105,17 @@ static void cpu_idle_loop(void)
105 __current_set_polling(); 105 __current_set_polling();
106 } 106 }
107 arch_cpu_idle_exit(); 107 arch_cpu_idle_exit();
108 /*
109 * We need to test and propagate the TIF_NEED_RESCHED
 110 * bit here because we might not have sent the
111 * reschedule IPI to idle tasks.
112 */
113 if (tif_need_resched())
114 set_preempt_need_resched();
115 } 108 }
109
110 /*
111 * Since we fell out of the loop above, we know
112 * TIF_NEED_RESCHED must be set, propagate it into
113 * PREEMPT_NEED_RESCHED.
114 *
115 * This is required because for polling idle loops we will
116 * not have had an IPI to fold the state for us.
117 */
118 preempt_set_need_resched();
116 tick_nohz_idle_exit(); 119 tick_nohz_idle_exit();
117 schedule_preempt_disabled(); 120 schedule_preempt_disabled();
118 } 121 }
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index 6bf981e13c43..4410ac6a55f1 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -1033,8 +1033,10 @@ static void cpuset_change_task_nodemask(struct task_struct *tsk,
1033 need_loop = task_has_mempolicy(tsk) || 1033 need_loop = task_has_mempolicy(tsk) ||
1034 !nodes_intersects(*newmems, tsk->mems_allowed); 1034 !nodes_intersects(*newmems, tsk->mems_allowed);
1035 1035
1036 if (need_loop) 1036 if (need_loop) {
1037 local_irq_disable();
1037 write_seqcount_begin(&tsk->mems_allowed_seq); 1038 write_seqcount_begin(&tsk->mems_allowed_seq);
1039 }
1038 1040
1039 nodes_or(tsk->mems_allowed, tsk->mems_allowed, *newmems); 1041 nodes_or(tsk->mems_allowed, tsk->mems_allowed, *newmems);
1040 mpol_rebind_task(tsk, newmems, MPOL_REBIND_STEP1); 1042 mpol_rebind_task(tsk, newmems, MPOL_REBIND_STEP1);
@@ -1042,8 +1044,10 @@ static void cpuset_change_task_nodemask(struct task_struct *tsk,
1042 mpol_rebind_task(tsk, newmems, MPOL_REBIND_STEP2); 1044 mpol_rebind_task(tsk, newmems, MPOL_REBIND_STEP2);
1043 tsk->mems_allowed = *newmems; 1045 tsk->mems_allowed = *newmems;
1044 1046
1045 if (need_loop) 1047 if (need_loop) {
1046 write_seqcount_end(&tsk->mems_allowed_seq); 1048 write_seqcount_end(&tsk->mems_allowed_seq);
1049 local_irq_enable();
1050 }
1047 1051
1048 task_unlock(tsk); 1052 task_unlock(tsk);
1049} 1053}
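
The added local_irq_disable()/local_irq_enable() around the write_seqcount_begin()/end() pair presumably protects against a reader running from interrupt context on the same CPU, which would otherwise spin forever on the write section it interrupted. For reference, the read side of a seqcount such as mems_allowed_seq follows the usual retry loop; a generic sketch (the real readers live in the allocator/mempolicy paths):

/* Generic seqcount read-side sketch, not taken from this patch. */
static nodemask_t read_mems_allowed(struct task_struct *tsk)
{
	nodemask_t nodes;
	unsigned int seq;

	do {
		seq = read_seqcount_begin(&tsk->mems_allowed_seq);
		nodes = tsk->mems_allowed;
	} while (read_seqcount_retry(&tsk->mems_allowed_seq, seq));

	return nodes;
}
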
@@ -1727,66 +1731,41 @@ out_unlock:
1727 * used, list of ranges of sequential numbers, is variable length, 1731 * used, list of ranges of sequential numbers, is variable length,
1728 * and since these maps can change value dynamically, one could read 1732 * and since these maps can change value dynamically, one could read
1729 * gibberish by doing partial reads while a list was changing. 1733 * gibberish by doing partial reads while a list was changing.
1730 * A single large read to a buffer that crosses a page boundary is
1731 * ok, because the result being copied to user land is not recomputed
1732 * across a page fault.
1733 */ 1734 */
1734 1735static int cpuset_common_seq_show(struct seq_file *sf, void *v)
1735static size_t cpuset_sprintf_cpulist(char *page, struct cpuset *cs)
1736{ 1736{
1737 size_t count; 1737 struct cpuset *cs = css_cs(seq_css(sf));
1738 1738 cpuset_filetype_t type = seq_cft(sf)->private;
1739 mutex_lock(&callback_mutex); 1739 ssize_t count;
1740 count = cpulist_scnprintf(page, PAGE_SIZE, cs->cpus_allowed); 1740 char *buf, *s;
1741 mutex_unlock(&callback_mutex); 1741 int ret = 0;
1742 1742
1743 return count; 1743 count = seq_get_buf(sf, &buf);
1744} 1744 s = buf;
1745
1746static size_t cpuset_sprintf_memlist(char *page, struct cpuset *cs)
1747{
1748 size_t count;
1749 1745
1750 mutex_lock(&callback_mutex); 1746 mutex_lock(&callback_mutex);
1751 count = nodelist_scnprintf(page, PAGE_SIZE, cs->mems_allowed);
1752 mutex_unlock(&callback_mutex);
1753
1754 return count;
1755}
1756
1757static ssize_t cpuset_common_file_read(struct cgroup_subsys_state *css,
1758 struct cftype *cft, struct file *file,
1759 char __user *buf, size_t nbytes,
1760 loff_t *ppos)
1761{
1762 struct cpuset *cs = css_cs(css);
1763 cpuset_filetype_t type = cft->private;
1764 char *page;
1765 ssize_t retval = 0;
1766 char *s;
1767
1768 if (!(page = (char *)__get_free_page(GFP_TEMPORARY)))
1769 return -ENOMEM;
1770
1771 s = page;
1772 1747
1773 switch (type) { 1748 switch (type) {
1774 case FILE_CPULIST: 1749 case FILE_CPULIST:
1775 s += cpuset_sprintf_cpulist(s, cs); 1750 s += cpulist_scnprintf(s, count, cs->cpus_allowed);
1776 break; 1751 break;
1777 case FILE_MEMLIST: 1752 case FILE_MEMLIST:
1778 s += cpuset_sprintf_memlist(s, cs); 1753 s += nodelist_scnprintf(s, count, cs->mems_allowed);
1779 break; 1754 break;
1780 default: 1755 default:
1781 retval = -EINVAL; 1756 ret = -EINVAL;
1782 goto out; 1757 goto out_unlock;
1783 } 1758 }
1784 *s++ = '\n';
1785 1759
1786 retval = simple_read_from_buffer(buf, nbytes, ppos, page, s - page); 1760 if (s < buf + count - 1) {
1787out: 1761 *s++ = '\n';
1788 free_page((unsigned long)page); 1762 seq_commit(sf, s - buf);
1789 return retval; 1763 } else {
1764 seq_commit(sf, -1);
1765 }
1766out_unlock:
1767 mutex_unlock(&callback_mutex);
1768 return ret;
1790} 1769}
1791 1770
1792static u64 cpuset_read_u64(struct cgroup_subsys_state *css, struct cftype *cft) 1771static u64 cpuset_read_u64(struct cgroup_subsys_state *css, struct cftype *cft)
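
The rewritten cpuset_common_seq_show() formats straight into the seq_file buffer via seq_get_buf() and signals overflow with seq_commit(sf, -1), which makes the seq_file core retry with a larger buffer. Stripped to its skeleton (example_list_show() and its output are hypothetical):

/* Skeleton of the seq_get_buf()/seq_commit() pattern used above;
 * assumes seq_get_buf() returned a non-empty buffer. */
static int example_list_show(struct seq_file *sf, void *v)
{
	char *buf;
	size_t count = seq_get_buf(sf, &buf);
	int len;

	len = scnprintf(buf, count, "%s\n", "0-3,5,7");
	if (len < count - 1)
		seq_commit(sf, len);	/* output fit, commit len bytes */
	else
		seq_commit(sf, -1);	/* overflow, retry with bigger buf */
	return 0;
}
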
@@ -1843,7 +1822,7 @@ static s64 cpuset_read_s64(struct cgroup_subsys_state *css, struct cftype *cft)
1843static struct cftype files[] = { 1822static struct cftype files[] = {
1844 { 1823 {
1845 .name = "cpus", 1824 .name = "cpus",
1846 .read = cpuset_common_file_read, 1825 .seq_show = cpuset_common_seq_show,
1847 .write_string = cpuset_write_resmask, 1826 .write_string = cpuset_write_resmask,
1848 .max_write_len = (100U + 6 * NR_CPUS), 1827 .max_write_len = (100U + 6 * NR_CPUS),
1849 .private = FILE_CPULIST, 1828 .private = FILE_CPULIST,
@@ -1851,7 +1830,7 @@ static struct cftype files[] = {
1851 1830
1852 { 1831 {
1853 .name = "mems", 1832 .name = "mems",
1854 .read = cpuset_common_file_read, 1833 .seq_show = cpuset_common_seq_show,
1855 .write_string = cpuset_write_resmask, 1834 .write_string = cpuset_write_resmask,
1856 .max_write_len = (100U + 6 * MAX_NUMNODES), 1835 .max_write_len = (100U + 6 * MAX_NUMNODES),
1857 .private = FILE_MEMLIST, 1836 .private = FILE_MEMLIST,
diff --git a/kernel/events/core.c b/kernel/events/core.c
index d724e7757cd1..56003c6edfd3 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -119,7 +119,8 @@ static int cpu_function_call(int cpu, int (*func) (void *info), void *info)
119 119
120#define PERF_FLAG_ALL (PERF_FLAG_FD_NO_GROUP |\ 120#define PERF_FLAG_ALL (PERF_FLAG_FD_NO_GROUP |\
121 PERF_FLAG_FD_OUTPUT |\ 121 PERF_FLAG_FD_OUTPUT |\
122 PERF_FLAG_PID_CGROUP) 122 PERF_FLAG_PID_CGROUP |\
123 PERF_FLAG_FD_CLOEXEC)
123 124
124/* 125/*
125 * branch priv levels that need permission checks 126 * branch priv levels that need permission checks
@@ -1396,6 +1397,8 @@ event_sched_out(struct perf_event *event,
1396 if (event->state != PERF_EVENT_STATE_ACTIVE) 1397 if (event->state != PERF_EVENT_STATE_ACTIVE)
1397 return; 1398 return;
1398 1399
1400 perf_pmu_disable(event->pmu);
1401
1399 event->state = PERF_EVENT_STATE_INACTIVE; 1402 event->state = PERF_EVENT_STATE_INACTIVE;
1400 if (event->pending_disable) { 1403 if (event->pending_disable) {
1401 event->pending_disable = 0; 1404 event->pending_disable = 0;
@@ -1412,6 +1415,8 @@ event_sched_out(struct perf_event *event,
1412 ctx->nr_freq--; 1415 ctx->nr_freq--;
1413 if (event->attr.exclusive || !cpuctx->active_oncpu) 1416 if (event->attr.exclusive || !cpuctx->active_oncpu)
1414 cpuctx->exclusive = 0; 1417 cpuctx->exclusive = 0;
1418
1419 perf_pmu_enable(event->pmu);
1415} 1420}
1416 1421
1417static void 1422static void
@@ -1652,6 +1657,7 @@ event_sched_in(struct perf_event *event,
1652 struct perf_event_context *ctx) 1657 struct perf_event_context *ctx)
1653{ 1658{
1654 u64 tstamp = perf_event_time(event); 1659 u64 tstamp = perf_event_time(event);
1660 int ret = 0;
1655 1661
1656 if (event->state <= PERF_EVENT_STATE_OFF) 1662 if (event->state <= PERF_EVENT_STATE_OFF)
1657 return 0; 1663 return 0;
@@ -1674,10 +1680,13 @@ event_sched_in(struct perf_event *event,
1674 */ 1680 */
1675 smp_wmb(); 1681 smp_wmb();
1676 1682
1683 perf_pmu_disable(event->pmu);
1684
1677 if (event->pmu->add(event, PERF_EF_START)) { 1685 if (event->pmu->add(event, PERF_EF_START)) {
1678 event->state = PERF_EVENT_STATE_INACTIVE; 1686 event->state = PERF_EVENT_STATE_INACTIVE;
1679 event->oncpu = -1; 1687 event->oncpu = -1;
1680 return -EAGAIN; 1688 ret = -EAGAIN;
1689 goto out;
1681 } 1690 }
1682 1691
1683 event->tstamp_running += tstamp - event->tstamp_stopped; 1692 event->tstamp_running += tstamp - event->tstamp_stopped;
@@ -1693,7 +1702,10 @@ event_sched_in(struct perf_event *event,
1693 if (event->attr.exclusive) 1702 if (event->attr.exclusive)
1694 cpuctx->exclusive = 1; 1703 cpuctx->exclusive = 1;
1695 1704
1696 return 0; 1705out:
1706 perf_pmu_enable(event->pmu);
1707
1708 return ret;
1697} 1709}
1698 1710
1699static int 1711static int
@@ -2743,6 +2755,8 @@ static void perf_adjust_freq_unthr_context(struct perf_event_context *ctx,
2743 if (!event_filter_match(event)) 2755 if (!event_filter_match(event))
2744 continue; 2756 continue;
2745 2757
2758 perf_pmu_disable(event->pmu);
2759
2746 hwc = &event->hw; 2760 hwc = &event->hw;
2747 2761
2748 if (hwc->interrupts == MAX_INTERRUPTS) { 2762 if (hwc->interrupts == MAX_INTERRUPTS) {
@@ -2752,7 +2766,7 @@ static void perf_adjust_freq_unthr_context(struct perf_event_context *ctx,
2752 } 2766 }
2753 2767
2754 if (!event->attr.freq || !event->attr.sample_freq) 2768 if (!event->attr.freq || !event->attr.sample_freq)
2755 continue; 2769 goto next;
2756 2770
2757 /* 2771 /*
2758 * stop the event and update event->count 2772 * stop the event and update event->count
@@ -2774,6 +2788,8 @@ static void perf_adjust_freq_unthr_context(struct perf_event_context *ctx,
2774 perf_adjust_period(event, period, delta, false); 2788 perf_adjust_period(event, period, delta, false);
2775 2789
2776 event->pmu->start(event, delta > 0 ? PERF_EF_RELOAD : 0); 2790 event->pmu->start(event, delta > 0 ? PERF_EF_RELOAD : 0);
2791 next:
2792 perf_pmu_enable(event->pmu);
2777 } 2793 }
2778 2794
2779 perf_pmu_enable(ctx->pmu); 2795 perf_pmu_enable(ctx->pmu);
@@ -3527,7 +3543,7 @@ static void perf_event_for_each(struct perf_event *event,
3527static int perf_event_period(struct perf_event *event, u64 __user *arg) 3543static int perf_event_period(struct perf_event *event, u64 __user *arg)
3528{ 3544{
3529 struct perf_event_context *ctx = event->ctx; 3545 struct perf_event_context *ctx = event->ctx;
3530 int ret = 0; 3546 int ret = 0, active;
3531 u64 value; 3547 u64 value;
3532 3548
3533 if (!is_sampling_event(event)) 3549 if (!is_sampling_event(event))
@@ -3551,6 +3567,20 @@ static int perf_event_period(struct perf_event *event, u64 __user *arg)
3551 event->attr.sample_period = value; 3567 event->attr.sample_period = value;
3552 event->hw.sample_period = value; 3568 event->hw.sample_period = value;
3553 } 3569 }
3570
3571 active = (event->state == PERF_EVENT_STATE_ACTIVE);
3572 if (active) {
3573 perf_pmu_disable(ctx->pmu);
3574 event->pmu->stop(event, PERF_EF_UPDATE);
3575 }
3576
3577 local64_set(&event->hw.period_left, 0);
3578
3579 if (active) {
3580 event->pmu->start(event, PERF_EF_RELOAD);
3581 perf_pmu_enable(ctx->pmu);
3582 }
3583
3554unlock: 3584unlock:
3555 raw_spin_unlock_irq(&ctx->lock); 3585 raw_spin_unlock_irq(&ctx->lock);
3556 3586
@@ -5680,11 +5710,6 @@ static void swevent_hlist_put(struct perf_event *event)
5680{ 5710{
5681 int cpu; 5711 int cpu;
5682 5712
5683 if (event->cpu != -1) {
5684 swevent_hlist_put_cpu(event, event->cpu);
5685 return;
5686 }
5687
5688 for_each_possible_cpu(cpu) 5713 for_each_possible_cpu(cpu)
5689 swevent_hlist_put_cpu(event, cpu); 5714 swevent_hlist_put_cpu(event, cpu);
5690} 5715}
@@ -5718,9 +5743,6 @@ static int swevent_hlist_get(struct perf_event *event)
5718 int err; 5743 int err;
5719 int cpu, failed_cpu; 5744 int cpu, failed_cpu;
5720 5745
5721 if (event->cpu != -1)
5722 return swevent_hlist_get_cpu(event, event->cpu);
5723
5724 get_online_cpus(); 5746 get_online_cpus();
5725 for_each_possible_cpu(cpu) { 5747 for_each_possible_cpu(cpu) {
5726 err = swevent_hlist_get_cpu(event, cpu); 5748 err = swevent_hlist_get_cpu(event, cpu);
@@ -6663,6 +6685,9 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu,
6663 INIT_LIST_HEAD(&event->event_entry); 6685 INIT_LIST_HEAD(&event->event_entry);
6664 INIT_LIST_HEAD(&event->sibling_list); 6686 INIT_LIST_HEAD(&event->sibling_list);
6665 INIT_LIST_HEAD(&event->rb_entry); 6687 INIT_LIST_HEAD(&event->rb_entry);
6688 INIT_LIST_HEAD(&event->active_entry);
6689 INIT_HLIST_NODE(&event->hlist_entry);
6690
6666 6691
6667 init_waitqueue_head(&event->waitq); 6692 init_waitqueue_head(&event->waitq);
6668 init_irq_work(&event->pending, perf_pending_event); 6693 init_irq_work(&event->pending, perf_pending_event);
@@ -6973,6 +6998,7 @@ SYSCALL_DEFINE5(perf_event_open,
6973 int event_fd; 6998 int event_fd;
6974 int move_group = 0; 6999 int move_group = 0;
6975 int err; 7000 int err;
7001 int f_flags = O_RDWR;
6976 7002
6977 /* for future expandability... */ 7003 /* for future expandability... */
6978 if (flags & ~PERF_FLAG_ALL) 7004 if (flags & ~PERF_FLAG_ALL)
@@ -7001,7 +7027,10 @@ SYSCALL_DEFINE5(perf_event_open,
7001 if ((flags & PERF_FLAG_PID_CGROUP) && (pid == -1 || cpu == -1)) 7027 if ((flags & PERF_FLAG_PID_CGROUP) && (pid == -1 || cpu == -1))
7002 return -EINVAL; 7028 return -EINVAL;
7003 7029
7004 event_fd = get_unused_fd(); 7030 if (flags & PERF_FLAG_FD_CLOEXEC)
7031 f_flags |= O_CLOEXEC;
7032
7033 event_fd = get_unused_fd_flags(f_flags);
7005 if (event_fd < 0) 7034 if (event_fd < 0)
7006 return event_fd; 7035 return event_fd;
7007 7036
@@ -7123,7 +7152,8 @@ SYSCALL_DEFINE5(perf_event_open,
7123 goto err_context; 7152 goto err_context;
7124 } 7153 }
7125 7154
7126 event_file = anon_inode_getfile("[perf_event]", &perf_fops, event, O_RDWR); 7155 event_file = anon_inode_getfile("[perf_event]", &perf_fops, event,
7156 f_flags);
7127 if (IS_ERR(event_file)) { 7157 if (IS_ERR(event_file)) {
7128 err = PTR_ERR(event_file); 7158 err = PTR_ERR(event_file);
7129 goto err_context; 7159 goto err_context;
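
The new PERF_FLAG_FD_CLOEXEC flag, threaded through get_unused_fd_flags() and anon_inode_getfile() above, lets userspace obtain an O_CLOEXEC event fd atomically instead of racing a separate fcntl(FD_CLOEXEC) against fork()+exec(). A hypothetical caller:

/* Hypothetical userspace usage; the fallback #define is only for
 * illustration in case the UAPI header predates the flag. */
#include <linux/perf_event.h>
#include <sys/syscall.h>
#include <unistd.h>
#include <string.h>

#ifndef PERF_FLAG_FD_CLOEXEC
#define PERF_FLAG_FD_CLOEXEC (1UL << 3)
#endif

static int open_cycles_counter(pid_t pid)
{
	struct perf_event_attr attr;

	memset(&attr, 0, sizeof(attr));
	attr.size = sizeof(attr);
	attr.type = PERF_TYPE_HARDWARE;
	attr.config = PERF_COUNT_HW_CPU_CYCLES;

	/* The fd is installed with O_CLOEXEC already set. */
	return syscall(__NR_perf_event_open, &attr, pid, -1, -1,
		       PERF_FLAG_FD_CLOEXEC);
}
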
diff --git a/kernel/events/ring_buffer.c b/kernel/events/ring_buffer.c
index e8b168af135b..146a5792b1d2 100644
--- a/kernel/events/ring_buffer.c
+++ b/kernel/events/ring_buffer.c
@@ -61,19 +61,20 @@ again:
61 * 61 *
62 * kernel user 62 * kernel user
63 * 63 *
64 * READ ->data_tail READ ->data_head 64 * if (LOAD ->data_tail) { LOAD ->data_head
65 * smp_mb() (A) smp_rmb() (C) 65 * (A) smp_rmb() (C)
66 * WRITE $data READ $data 66 * STORE $data LOAD $data
67 * smp_wmb() (B) smp_mb() (D) 67 * smp_wmb() (B) smp_mb() (D)
68 * STORE ->data_head WRITE ->data_tail 68 * STORE ->data_head STORE ->data_tail
69 * }
69 * 70 *
70 * Where A pairs with D, and B pairs with C. 71 * Where A pairs with D, and B pairs with C.
71 * 72 *
72 * I don't think A needs to be a full barrier because we won't in fact 73 * In our case (A) is a control dependency that separates the load of
73 * write data until we see the store from userspace. So we simply don't 74 * the ->data_tail and the stores of $data. In case ->data_tail
74 * issue the data WRITE until we observe it. Be conservative for now. 75 * indicates there is no room in the buffer to store $data we do not.
75 * 76 *
76 * OTOH, D needs to be a full barrier since it separates the data READ 77 * D needs to be a full barrier since it separates the data READ
77 * from the tail WRITE. 78 * from the tail WRITE.
78 * 79 *
79 * For B a WMB is sufficient since it separates two WRITEs, and for C 80 * For B a WMB is sufficient since it separates two WRITEs, and for C
@@ -81,7 +82,7 @@ again:
81 * 82 *
82 * See perf_output_begin(). 83 * See perf_output_begin().
83 */ 84 */
84 smp_wmb(); 85 smp_wmb(); /* B, matches C */
85 rb->user_page->data_head = head; 86 rb->user_page->data_head = head;
86 87
87 /* 88 /*
@@ -144,17 +145,26 @@ int perf_output_begin(struct perf_output_handle *handle,
144 if (!rb->overwrite && 145 if (!rb->overwrite &&
145 unlikely(CIRC_SPACE(head, tail, perf_data_size(rb)) < size)) 146 unlikely(CIRC_SPACE(head, tail, perf_data_size(rb)) < size))
146 goto fail; 147 goto fail;
148
149 /*
150 * The above forms a control dependency barrier separating the
151 * @tail load above from the data stores below. Since the @tail
152 * load is required to compute the branch to fail below.
153 *
154 * A, matches D; the full memory barrier userspace SHOULD issue
155 * after reading the data and before storing the new tail
156 * position.
157 *
158 * See perf_output_put_handle().
159 */
160
147 head += size; 161 head += size;
148 } while (local_cmpxchg(&rb->head, offset, head) != offset); 162 } while (local_cmpxchg(&rb->head, offset, head) != offset);
149 163
150 /* 164 /*
151 * Separate the userpage->tail read from the data stores below. 165 * We rely on the implied barrier() by local_cmpxchg() to ensure
152 * Matches the MB userspace SHOULD issue after reading the data 166 * none of the data stores below can be lifted up by the compiler.
153 * and before storing the new tail position.
154 *
155 * See perf_output_put_handle().
156 */ 167 */
157 smp_mb();
158 168
159 if (unlikely(head - local_read(&rb->wakeup) > rb->watermark)) 169 if (unlikely(head - local_read(&rb->wakeup) > rb->watermark))
160 local_add(rb->watermark, &rb->wakeup); 170 local_add(rb->watermark, &rb->wakeup);
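
The reworked comments spell out the kernel/user ordering contract for the mmap'ed ring buffer: the kernel relies on a control dependency (A) on ->data_tail plus smp_wmb() (B) before publishing ->data_head, while userspace must issue a read barrier (C) after loading ->data_head and a full barrier (D) before storing ->data_tail. A userspace consumer honoring that contract might look like the sketch below (illustrative only; it ignores records that wrap past the end of the data area).

/* Illustrative consumer sketch; data_size must be a power of two and
 * wrapping records are not handled. */
#include <stdatomic.h>
#include <linux/perf_event.h>

static void drain_ring(struct perf_event_mmap_page *pg, char *data,
		       __u64 data_size)
{
	__u64 tail = pg->data_tail;
	/* (C): order the ->data_head load before the data loads. */
	__u64 head = __atomic_load_n(&pg->data_head, __ATOMIC_ACQUIRE);

	while (tail != head) {
		struct perf_event_header *ev =
			(void *)(data + (tail & (data_size - 1)));
		/* ... consume *ev ... */
		tail += ev->size;
	}

	/* (D): full barrier between the data loads and the tail store,
	 * pairing with the kernel's control dependency (A). */
	atomic_thread_fence(memory_order_seq_cst);
	__atomic_store_n(&pg->data_tail, tail, __ATOMIC_RELAXED);
}
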
diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c
index 24b7d6ca871b..307d87c0991a 100644
--- a/kernel/events/uprobes.c
+++ b/kernel/events/uprobes.c
@@ -73,6 +73,17 @@ struct uprobe {
73 struct inode *inode; /* Also hold a ref to inode */ 73 struct inode *inode; /* Also hold a ref to inode */
74 loff_t offset; 74 loff_t offset;
75 unsigned long flags; 75 unsigned long flags;
76
77 /*
78 * The generic code assumes that it has two members of unknown type
79 * owned by the arch-specific code:
80 *
81 * insn - copy_insn() saves the original instruction here for
82 * arch_uprobe_analyze_insn().
83 *
84 * ixol - potentially modified instruction to execute out of
85 * line, copied to xol_area by xol_get_insn_slot().
86 */
76 struct arch_uprobe arch; 87 struct arch_uprobe arch;
77}; 88};
78 89
@@ -86,6 +97,29 @@ struct return_instance {
86}; 97};
87 98
88/* 99/*
100 * Execute out of line area: anonymous executable mapping installed
101 * by the probed task to execute the copy of the original instruction
102 * mangled by set_swbp().
103 *
104 * On a breakpoint hit, thread contests for a slot. It frees the
105 * slot after singlestep. Currently a fixed number of slots are
106 * allocated.
107 */
108struct xol_area {
109 wait_queue_head_t wq; /* if all slots are busy */
110 atomic_t slot_count; /* number of in-use slots */
111 unsigned long *bitmap; /* 0 = free slot */
112 struct page *page;
113
114 /*
115 * We keep the vma's vm_start rather than a pointer to the vma
116 * itself. The probed process or a naughty kernel module could make
117 * the vma go away, and we must handle that reasonably gracefully.
118 */
119 unsigned long vaddr; /* Page(s) of instruction slots */
120};
121
122/*
89 * valid_vma: Verify if the specified vma is an executable vma 123 * valid_vma: Verify if the specified vma is an executable vma
90 * Relax restrictions while unregistering: vm_flags might have 124 * Relax restrictions while unregistering: vm_flags might have
91 * changed after breakpoint was inserted. 125 * changed after breakpoint was inserted.
@@ -330,7 +364,7 @@ int __weak set_swbp(struct arch_uprobe *auprobe, struct mm_struct *mm, unsigned
330int __weak 364int __weak
331set_orig_insn(struct arch_uprobe *auprobe, struct mm_struct *mm, unsigned long vaddr) 365set_orig_insn(struct arch_uprobe *auprobe, struct mm_struct *mm, unsigned long vaddr)
332{ 366{
333 return uprobe_write_opcode(mm, vaddr, *(uprobe_opcode_t *)auprobe->insn); 367 return uprobe_write_opcode(mm, vaddr, *(uprobe_opcode_t *)&auprobe->insn);
334} 368}
335 369
336static int match_uprobe(struct uprobe *l, struct uprobe *r) 370static int match_uprobe(struct uprobe *l, struct uprobe *r)
@@ -529,8 +563,8 @@ static int copy_insn(struct uprobe *uprobe, struct file *filp)
529{ 563{
530 struct address_space *mapping = uprobe->inode->i_mapping; 564 struct address_space *mapping = uprobe->inode->i_mapping;
531 loff_t offs = uprobe->offset; 565 loff_t offs = uprobe->offset;
532 void *insn = uprobe->arch.insn; 566 void *insn = &uprobe->arch.insn;
533 int size = MAX_UINSN_BYTES; 567 int size = sizeof(uprobe->arch.insn);
534 int len, err = -EIO; 568 int len, err = -EIO;
535 569
536 /* Copy only available bytes, -EIO if nothing was read */ 570 /* Copy only available bytes, -EIO if nothing was read */
@@ -569,7 +603,7 @@ static int prepare_uprobe(struct uprobe *uprobe, struct file *file,
569 goto out; 603 goto out;
570 604
571 ret = -ENOTSUPP; 605 ret = -ENOTSUPP;
572 if (is_trap_insn((uprobe_opcode_t *)uprobe->arch.insn)) 606 if (is_trap_insn((uprobe_opcode_t *)&uprobe->arch.insn))
573 goto out; 607 goto out;
574 608
575 ret = arch_uprobe_analyze_insn(&uprobe->arch, mm, vaddr); 609 ret = arch_uprobe_analyze_insn(&uprobe->arch, mm, vaddr);
@@ -1264,7 +1298,7 @@ static unsigned long xol_get_insn_slot(struct uprobe *uprobe)
1264 1298
1265 /* Initialize the slot */ 1299 /* Initialize the slot */
1266 copy_to_page(area->page, xol_vaddr, 1300 copy_to_page(area->page, xol_vaddr,
1267 uprobe->arch.ixol, sizeof(uprobe->arch.ixol)); 1301 &uprobe->arch.ixol, sizeof(uprobe->arch.ixol));
1268 /* 1302 /*
1269 * We probably need flush_icache_user_range() but it needs vma. 1303 * We probably need flush_icache_user_range() but it needs vma.
1270 * This should work on supported architectures too. 1304 * This should work on supported architectures too.
@@ -1403,12 +1437,10 @@ static void uprobe_warn(struct task_struct *t, const char *msg)
1403 1437
1404static void dup_xol_work(struct callback_head *work) 1438static void dup_xol_work(struct callback_head *work)
1405{ 1439{
1406 kfree(work);
1407
1408 if (current->flags & PF_EXITING) 1440 if (current->flags & PF_EXITING)
1409 return; 1441 return;
1410 1442
1411 if (!__create_xol_area(current->utask->vaddr)) 1443 if (!__create_xol_area(current->utask->dup_xol_addr))
1412 uprobe_warn(current, "dup xol area"); 1444 uprobe_warn(current, "dup xol area");
1413} 1445}
1414 1446
@@ -1419,7 +1451,6 @@ void uprobe_copy_process(struct task_struct *t, unsigned long flags)
1419{ 1451{
1420 struct uprobe_task *utask = current->utask; 1452 struct uprobe_task *utask = current->utask;
1421 struct mm_struct *mm = current->mm; 1453 struct mm_struct *mm = current->mm;
1422 struct callback_head *work;
1423 struct xol_area *area; 1454 struct xol_area *area;
1424 1455
1425 t->utask = NULL; 1456 t->utask = NULL;
@@ -1441,14 +1472,9 @@ void uprobe_copy_process(struct task_struct *t, unsigned long flags)
1441 if (mm == t->mm) 1472 if (mm == t->mm)
1442 return; 1473 return;
1443 1474
1444 /* TODO: move it into the union in uprobe_task */ 1475 t->utask->dup_xol_addr = area->vaddr;
1445 work = kmalloc(sizeof(*work), GFP_KERNEL); 1476 init_task_work(&t->utask->dup_xol_work, dup_xol_work);
1446 if (!work) 1477 task_work_add(t, &t->utask->dup_xol_work, true);
1447 return uprobe_warn(t, "dup xol area");
1448
1449 t->utask->vaddr = area->vaddr;
1450 init_task_work(work, dup_xol_work);
1451 task_work_add(t, work, true);
1452} 1478}
1453 1479
1454/* 1480/*
@@ -1828,6 +1854,10 @@ static void handle_swbp(struct pt_regs *regs)
1828 if (unlikely(!test_bit(UPROBE_COPY_INSN, &uprobe->flags))) 1854 if (unlikely(!test_bit(UPROBE_COPY_INSN, &uprobe->flags)))
1829 goto out; 1855 goto out;
1830 1856
1857 /* Tracing handlers use ->utask to communicate with fetch methods */
1858 if (!get_utask())
1859 goto out;
1860
1831 handler_chain(uprobe, regs); 1861 handler_chain(uprobe, regs);
1832 if (can_skip_sstep(uprobe, regs)) 1862 if (can_skip_sstep(uprobe, regs))
1833 goto out; 1863 goto out;
diff --git a/kernel/exit.c b/kernel/exit.c
index a949819055d5..1e77fc645317 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -74,6 +74,7 @@ static void __unhash_process(struct task_struct *p, bool group_dead)
74 __this_cpu_dec(process_counts); 74 __this_cpu_dec(process_counts);
75 } 75 }
76 list_del_rcu(&p->thread_group); 76 list_del_rcu(&p->thread_group);
77 list_del_rcu(&p->thread_node);
77} 78}
78 79
79/* 80/*
diff --git a/kernel/extable.c b/kernel/extable.c
index 832cb28105bb..763faf037ec1 100644
--- a/kernel/extable.c
+++ b/kernel/extable.c
@@ -61,7 +61,7 @@ const struct exception_table_entry *search_exception_tables(unsigned long addr)
61static inline int init_kernel_text(unsigned long addr) 61static inline int init_kernel_text(unsigned long addr)
62{ 62{
63 if (addr >= (unsigned long)_sinittext && 63 if (addr >= (unsigned long)_sinittext &&
64 addr <= (unsigned long)_einittext) 64 addr < (unsigned long)_einittext)
65 return 1; 65 return 1;
66 return 0; 66 return 0;
67} 67}
@@ -69,7 +69,7 @@ static inline int init_kernel_text(unsigned long addr)
69int core_kernel_text(unsigned long addr) 69int core_kernel_text(unsigned long addr)
70{ 70{
71 if (addr >= (unsigned long)_stext && 71 if (addr >= (unsigned long)_stext &&
72 addr <= (unsigned long)_etext) 72 addr < (unsigned long)_etext)
73 return 1; 73 return 1;
74 74
75 if (system_state == SYSTEM_BOOTING && 75 if (system_state == SYSTEM_BOOTING &&
diff --git a/kernel/fork.c b/kernel/fork.c
index 728d5be9548c..2f11bbe376b0 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -537,6 +537,7 @@ static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p)
537 spin_lock_init(&mm->page_table_lock); 537 spin_lock_init(&mm->page_table_lock);
538 mm_init_aio(mm); 538 mm_init_aio(mm);
539 mm_init_owner(mm, p); 539 mm_init_owner(mm, p);
540 clear_tlb_flush_pending(mm);
540 541
541 if (likely(!mm_alloc_pgd(mm))) { 542 if (likely(!mm_alloc_pgd(mm))) {
542 mm->def_flags = 0; 543 mm->def_flags = 0;
@@ -1034,6 +1035,11 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk)
1034 sig->nr_threads = 1; 1035 sig->nr_threads = 1;
1035 atomic_set(&sig->live, 1); 1036 atomic_set(&sig->live, 1);
1036 atomic_set(&sig->sigcnt, 1); 1037 atomic_set(&sig->sigcnt, 1);
1038
1039 /* list_add(thread_node, thread_head) without INIT_LIST_HEAD() */
1040 sig->thread_head = (struct list_head)LIST_HEAD_INIT(tsk->thread_node);
1041 tsk->thread_node = (struct list_head)LIST_HEAD_INIT(sig->thread_head);
1042
1037 init_waitqueue_head(&sig->wait_chldexit); 1043 init_waitqueue_head(&sig->wait_chldexit);
1038 sig->curr_target = tsk; 1044 sig->curr_target = tsk;
1039 init_sigpending(&sig->shared_pending); 1045 init_sigpending(&sig->shared_pending);
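
The cast-assignment pair above links the new signal_struct's thread_head and the first thread's thread_node into a two-node circular list without a separate INIT_LIST_HEAD(), exactly as the comment says; spelled out conventionally it would be:

/* Equivalent, more verbose form of the two assignments above. */
INIT_LIST_HEAD(&sig->thread_head);
list_add(&tsk->thread_node, &sig->thread_head);
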
@@ -1086,8 +1092,10 @@ static void rt_mutex_init_task(struct task_struct *p)
1086{ 1092{
1087 raw_spin_lock_init(&p->pi_lock); 1093 raw_spin_lock_init(&p->pi_lock);
1088#ifdef CONFIG_RT_MUTEXES 1094#ifdef CONFIG_RT_MUTEXES
1089 plist_head_init(&p->pi_waiters); 1095 p->pi_waiters = RB_ROOT;
1096 p->pi_waiters_leftmost = NULL;
1090 p->pi_blocked_on = NULL; 1097 p->pi_blocked_on = NULL;
1098 p->pi_top_task = NULL;
1091#endif 1099#endif
1092} 1100}
1093 1101
@@ -1171,7 +1179,7 @@ static struct task_struct *copy_process(unsigned long clone_flags,
1171 * do not allow it to share a thread group or signal handlers or 1179 * do not allow it to share a thread group or signal handlers or
1172 * parent with the forking task. 1180 * parent with the forking task.
1173 */ 1181 */
1174 if (clone_flags & (CLONE_SIGHAND | CLONE_PARENT)) { 1182 if (clone_flags & CLONE_SIGHAND) {
1175 if ((clone_flags & (CLONE_NEWUSER | CLONE_NEWPID)) || 1183 if ((clone_flags & (CLONE_NEWUSER | CLONE_NEWPID)) ||
1176 (task_active_pid_ns(current) != 1184 (task_active_pid_ns(current) !=
1177 current->nsproxy->pid_ns_for_children)) 1185 current->nsproxy->pid_ns_for_children))
@@ -1310,7 +1318,9 @@ static struct task_struct *copy_process(unsigned long clone_flags,
1310#endif 1318#endif
1311 1319
1312 /* Perform scheduler related setup. Assign this task to a CPU. */ 1320 /* Perform scheduler related setup. Assign this task to a CPU. */
1313 sched_fork(clone_flags, p); 1321 retval = sched_fork(clone_flags, p);
1322 if (retval)
1323 goto bad_fork_cleanup_policy;
1314 1324
1315 retval = perf_event_init_task(p); 1325 retval = perf_event_init_task(p);
1316 if (retval) 1326 if (retval)
@@ -1402,13 +1412,11 @@ static struct task_struct *copy_process(unsigned long clone_flags,
1402 p->tgid = p->pid; 1412 p->tgid = p->pid;
1403 } 1413 }
1404 1414
1405 p->pdeath_signal = 0;
1406 p->exit_state = 0;
1407
1408 p->nr_dirtied = 0; 1415 p->nr_dirtied = 0;
1409 p->nr_dirtied_pause = 128 >> (PAGE_SHIFT - 10); 1416 p->nr_dirtied_pause = 128 >> (PAGE_SHIFT - 10);
1410 p->dirty_paused_when = 0; 1417 p->dirty_paused_when = 0;
1411 1418
1419 p->pdeath_signal = 0;
1412 INIT_LIST_HEAD(&p->thread_group); 1420 INIT_LIST_HEAD(&p->thread_group);
1413 p->task_works = NULL; 1421 p->task_works = NULL;
1414 1422
@@ -1471,6 +1479,8 @@ static struct task_struct *copy_process(unsigned long clone_flags,
1471 atomic_inc(&current->signal->sigcnt); 1479 atomic_inc(&current->signal->sigcnt);
1472 list_add_tail_rcu(&p->thread_group, 1480 list_add_tail_rcu(&p->thread_group,
1473 &p->group_leader->thread_group); 1481 &p->group_leader->thread_group);
1482 list_add_tail_rcu(&p->thread_node,
1483 &p->signal->thread_head);
1474 } 1484 }
1475 attach_pid(p, PIDTYPE_PID); 1485 attach_pid(p, PIDTYPE_PID);
1476 nr_threads++; 1486 nr_threads++;
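Together with the list_del_rcu() added in kernel/exit.c above, the new signal->thread_head / task->thread_node pair gives every thread group an RCU-protected list of its threads. A hedged sketch of how a consumer could walk it (the loop body and handle_thread() are illustrative assumptions, not part of this patch):

	struct task_struct *t;

	rcu_read_lock();
	list_for_each_entry_rcu(t, &p->signal->thread_head, thread_node)
		handle_thread(t);	/* hypothetical per-thread action */
	rcu_read_unlock();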
diff --git a/kernel/freezer.c b/kernel/freezer.c
index b462fa197517..aa6a8aadb911 100644
--- a/kernel/freezer.c
+++ b/kernel/freezer.c
@@ -19,6 +19,12 @@ EXPORT_SYMBOL(system_freezing_cnt);
19bool pm_freezing; 19bool pm_freezing;
20bool pm_nosig_freezing; 20bool pm_nosig_freezing;
21 21
22/*
23 * Temporary export for the deadlock workaround in ata_scsi_hotplug().
24 * Remove once the hack becomes unnecessary.
25 */
26EXPORT_SYMBOL_GPL(pm_freezing);
27
22/* protects freezing and frozen transitions */ 28/* protects freezing and frozen transitions */
23static DEFINE_SPINLOCK(freezer_lock); 29static DEFINE_SPINLOCK(freezer_lock);
24 30
diff --git a/kernel/futex.c b/kernel/futex.c
index 80ba086f021d..44a1261cb9ff 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -63,14 +63,101 @@
63#include <linux/sched/rt.h> 63#include <linux/sched/rt.h>
64#include <linux/hugetlb.h> 64#include <linux/hugetlb.h>
65#include <linux/freezer.h> 65#include <linux/freezer.h>
66#include <linux/bootmem.h>
66 67
67#include <asm/futex.h> 68#include <asm/futex.h>
68 69
69#include "locking/rtmutex_common.h" 70#include "locking/rtmutex_common.h"
70 71
71int __read_mostly futex_cmpxchg_enabled; 72/*
73 * Basic futex operation and ordering guarantees:
74 *
75 * The waiter reads the futex value in user space and calls
76 * futex_wait(). This function computes the hash bucket and acquires
77 * the hash bucket lock. After that it reads the futex user space value
78 * again and verifies that the data has not changed. If it has not changed
79 * it enqueues itself into the hash bucket, releases the hash bucket lock
80 * and schedules.
81 *
82 * The waker side modifies the user space value of the futex and calls
83 * futex_wake(). This function computes the hash bucket and acquires the
84 * hash bucket lock. Then it looks for waiters on that futex in the hash
85 * bucket and wakes them.
86 *
87 * In futex wake up scenarios where no tasks are blocked on a futex, taking
88 * the hb spinlock can be avoided and simply return. In order for this
89 * optimization to work, ordering guarantees must exist so that the waiter
90 * being added to the list is acknowledged when the list is concurrently being
91 * checked by the waker, avoiding scenarios like the following:
92 *
93 * CPU 0 CPU 1
94 * val = *futex;
95 * sys_futex(WAIT, futex, val);
96 * futex_wait(futex, val);
97 * uval = *futex;
98 * *futex = newval;
99 * sys_futex(WAKE, futex);
100 * futex_wake(futex);
101 * if (queue_empty())
102 * return;
103 * if (uval == val)
104 * lock(hash_bucket(futex));
105 * queue();
106 * unlock(hash_bucket(futex));
107 * schedule();
108 *
109 * This would cause the waiter on CPU 0 to wait forever because it
110 * missed the transition of the user space value from val to newval
111 * and the waker did not find the waiter in the hash bucket queue.
112 *
113 * The correct serialization ensures that a waiter either observes
114 * the changed user space value before blocking or is woken by a
115 * concurrent waker:
116 *
117 * CPU 0 CPU 1
118 * val = *futex;
119 * sys_futex(WAIT, futex, val);
120 * futex_wait(futex, val);
121 *
122 * waiters++;
123 * mb(); (A) <-- paired with -.
124 * |
125 * lock(hash_bucket(futex)); |
126 * |
127 * uval = *futex; |
128 * | *futex = newval;
129 * | sys_futex(WAKE, futex);
130 * | futex_wake(futex);
131 * |
132 * `-------> mb(); (B)
133 * if (uval == val)
134 * queue();
135 * unlock(hash_bucket(futex));
136 * schedule(); if (waiters)
137 * lock(hash_bucket(futex));
138 * wake_waiters(futex);
139 * unlock(hash_bucket(futex));
140 *
141 * Where (A) orders the waiters increment and the futex value read -- this
142 * is guaranteed by the head counter in the hb spinlock; and where (B)
143 * orders the write to futex and the waiters read -- this is done by the
144 * barriers in get_futex_key_refs(), through either ihold or atomic_inc,
145 * depending on the futex type.
146 *
147 * This yields the following case (where X:=waiters, Y:=futex):
148 *
149 * X = Y = 0
150 *
151 * w[X]=1 w[Y]=1
152 * MB MB
153 * r[Y]=y r[X]=x
154 *
155 * Which guarantees that x==0 && y==0 is impossible; which translates back into
156 * the guarantee that we cannot both miss the futex variable change and the
157 * enqueue.
158 */
72 159
73#define FUTEX_HASHBITS (CONFIG_BASE_SMALL ? 4 : 8) 160int __read_mostly futex_cmpxchg_enabled;
74 161
75/* 162/*
76 * Futex flags used to encode options to functions and preserve them across 163 * Futex flags used to encode options to functions and preserve them across
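The ordering comment above is the classic store-buffering argument: each side stores to its own variable, issues a full barrier, then loads the other side's variable, so at least one side must observe the other's store. Below is a user-space C11 model of that argument, with waiters standing in for "a task is (being) queued in the hash bucket" and futex_val for the user space futex word; it is a sketch of the reasoning, not kernel code:

#include <stdatomic.h>
#include <stdbool.h>

static atomic_int futex_val;	/* the futex word, Y in the comment  */
static atomic_int waiters;	/* queued tasks,   X in the comment  */

/* Waiter path: corresponds to futex_wait(); the fence is MB (A),
 * implied in the kernel by taking hb->lock. Returns true if the
 * task would go to sleep. */
static bool waiter_would_sleep(int expected)
{
	atomic_fetch_add_explicit(&waiters, 1, memory_order_relaxed);
	atomic_thread_fence(memory_order_seq_cst);		/* MB (A) */
	return atomic_load_explicit(&futex_val,
				    memory_order_relaxed) == expected;
}

/* Waker path: corresponds to futex_wake(); the fence is MB (B),
 * implied in the kernel by get_futex_key_refs(). Returns true if
 * the waker would take the bucket lock and wake someone. */
static bool waker_sees_waiter(int newval)
{
	atomic_store_explicit(&futex_val, newval, memory_order_relaxed);
	atomic_thread_fence(memory_order_seq_cst);		/* MB (B) */
	return atomic_load_explicit(&waiters,
				    memory_order_relaxed) != 0;
}

/* Starting from futex_val == expected (and newval != expected), the
 * combination "waiter_would_sleep() == true" and
 * "waker_sees_waiter() == false" cannot happen when the two run
 * concurrently: that is exactly the x == 0 && y == 0 outcome the
 * comment rules out. */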
@@ -149,9 +236,41 @@ static const struct futex_q futex_q_init = {
149struct futex_hash_bucket { 236struct futex_hash_bucket {
150 spinlock_t lock; 237 spinlock_t lock;
151 struct plist_head chain; 238 struct plist_head chain;
152}; 239} ____cacheline_aligned_in_smp;
240
241static unsigned long __read_mostly futex_hashsize;
242
243static struct futex_hash_bucket *futex_queues;
244
245static inline void futex_get_mm(union futex_key *key)
246{
247 atomic_inc(&key->private.mm->mm_count);
248 /*
249 * Ensure futex_get_mm() implies a full barrier such that
250 * get_futex_key() implies a full barrier. This is relied upon
251 * as full barrier (B), see the ordering comment above.
252 */
253 smp_mb__after_atomic_inc();
254}
255
256static inline bool hb_waiters_pending(struct futex_hash_bucket *hb)
257{
258#ifdef CONFIG_SMP
259 /*
260 * Tasks trying to enter the critical region are most likely
261 * potential waiters that will be added to the plist. Ensure
262 * that wakers won't miss to-be-slept tasks in the window between
263 * the wait call and the actual plist_add.
264 */
265 if (spin_is_locked(&hb->lock))
266 return true;
267 smp_rmb(); /* Make sure we check the lock state first */
153 268
154static struct futex_hash_bucket futex_queues[1<<FUTEX_HASHBITS]; 269 return !plist_head_empty(&hb->chain);
270#else
271 return true;
272#endif
273}
155 274
156/* 275/*
157 * We hash on the keys returned from get_futex_key (see below). 276 * We hash on the keys returned from get_futex_key (see below).
@@ -161,7 +280,7 @@ static struct futex_hash_bucket *hash_futex(union futex_key *key)
161 u32 hash = jhash2((u32*)&key->both.word, 280 u32 hash = jhash2((u32*)&key->both.word,
162 (sizeof(key->both.word)+sizeof(key->both.ptr))/4, 281 (sizeof(key->both.word)+sizeof(key->both.ptr))/4,
163 key->both.offset); 282 key->both.offset);
164 return &futex_queues[hash & ((1 << FUTEX_HASHBITS)-1)]; 283 return &futex_queues[hash & (futex_hashsize - 1)];
165} 284}
166 285
167/* 286/*
@@ -187,10 +306,10 @@ static void get_futex_key_refs(union futex_key *key)
187 306
188 switch (key->both.offset & (FUT_OFF_INODE|FUT_OFF_MMSHARED)) { 307 switch (key->both.offset & (FUT_OFF_INODE|FUT_OFF_MMSHARED)) {
189 case FUT_OFF_INODE: 308 case FUT_OFF_INODE:
190 ihold(key->shared.inode); 309 ihold(key->shared.inode); /* implies MB (B) */
191 break; 310 break;
192 case FUT_OFF_MMSHARED: 311 case FUT_OFF_MMSHARED:
193 atomic_inc(&key->private.mm->mm_count); 312 futex_get_mm(key); /* implies MB (B) */
194 break; 313 break;
195 } 314 }
196} 315}
@@ -251,6 +370,9 @@ get_futex_key(u32 __user *uaddr, int fshared, union futex_key *key, int rw)
251 return -EINVAL; 370 return -EINVAL;
252 address -= key->both.offset; 371 address -= key->both.offset;
253 372
373 if (unlikely(!access_ok(rw, uaddr, sizeof(u32))))
374 return -EFAULT;
375
254 /* 376 /*
255 * PROCESS_PRIVATE futexes are fast. 377 * PROCESS_PRIVATE futexes are fast.
256 * As the mm cannot disappear under us and the 'key' only needs 378 * As the mm cannot disappear under us and the 'key' only needs
@@ -259,11 +381,9 @@ get_futex_key(u32 __user *uaddr, int fshared, union futex_key *key, int rw)
259 * but access_ok() should be faster than find_vma() 381 * but access_ok() should be faster than find_vma()
260 */ 382 */
261 if (!fshared) { 383 if (!fshared) {
262 if (unlikely(!access_ok(VERIFY_WRITE, uaddr, sizeof(u32))))
263 return -EFAULT;
264 key->private.mm = mm; 384 key->private.mm = mm;
265 key->private.address = address; 385 key->private.address = address;
266 get_futex_key_refs(key); 386 get_futex_key_refs(key); /* implies MB (B) */
267 return 0; 387 return 0;
268 } 388 }
269 389
@@ -288,7 +408,7 @@ again:
288 put_page(page); 408 put_page(page);
289 /* serialize against __split_huge_page_splitting() */ 409 /* serialize against __split_huge_page_splitting() */
290 local_irq_disable(); 410 local_irq_disable();
291 if (likely(__get_user_pages_fast(address, 1, 1, &page) == 1)) { 411 if (likely(__get_user_pages_fast(address, 1, !ro, &page) == 1)) {
292 page_head = compound_head(page); 412 page_head = compound_head(page);
293 /* 413 /*
294 * page_head is valid pointer but we must pin 414 * page_head is valid pointer but we must pin
@@ -370,7 +490,7 @@ again:
370 key->shared.pgoff = basepage_index(page); 490 key->shared.pgoff = basepage_index(page);
371 } 491 }
372 492
373 get_futex_key_refs(key); 493 get_futex_key_refs(key); /* implies MB (B) */
374 494
375out: 495out:
376 unlock_page(page_head); 496 unlock_page(page_head);
@@ -597,13 +717,10 @@ lookup_pi_state(u32 uval, struct futex_hash_bucket *hb,
597{ 717{
598 struct futex_pi_state *pi_state = NULL; 718 struct futex_pi_state *pi_state = NULL;
599 struct futex_q *this, *next; 719 struct futex_q *this, *next;
600 struct plist_head *head;
601 struct task_struct *p; 720 struct task_struct *p;
602 pid_t pid = uval & FUTEX_TID_MASK; 721 pid_t pid = uval & FUTEX_TID_MASK;
603 722
604 head = &hb->chain; 723 plist_for_each_entry_safe(this, next, &hb->chain, list) {
605
606 plist_for_each_entry_safe(this, next, head, list) {
607 if (match_futex(&this->key, key)) { 724 if (match_futex(&this->key, key)) {
608 /* 725 /*
609 * Another waiter already exists - bump up 726 * Another waiter already exists - bump up
@@ -985,7 +1102,6 @@ futex_wake(u32 __user *uaddr, unsigned int flags, int nr_wake, u32 bitset)
985{ 1102{
986 struct futex_hash_bucket *hb; 1103 struct futex_hash_bucket *hb;
987 struct futex_q *this, *next; 1104 struct futex_q *this, *next;
988 struct plist_head *head;
989 union futex_key key = FUTEX_KEY_INIT; 1105 union futex_key key = FUTEX_KEY_INIT;
990 int ret; 1106 int ret;
991 1107
@@ -997,10 +1113,14 @@ futex_wake(u32 __user *uaddr, unsigned int flags, int nr_wake, u32 bitset)
997 goto out; 1113 goto out;
998 1114
999 hb = hash_futex(&key); 1115 hb = hash_futex(&key);
1116
1117 /* Make sure we really have tasks to wakeup */
1118 if (!hb_waiters_pending(hb))
1119 goto out_put_key;
1120
1000 spin_lock(&hb->lock); 1121 spin_lock(&hb->lock);
1001 head = &hb->chain;
1002 1122
1003 plist_for_each_entry_safe(this, next, head, list) { 1123 plist_for_each_entry_safe(this, next, &hb->chain, list) {
1004 if (match_futex (&this->key, &key)) { 1124 if (match_futex (&this->key, &key)) {
1005 if (this->pi_state || this->rt_waiter) { 1125 if (this->pi_state || this->rt_waiter) {
1006 ret = -EINVAL; 1126 ret = -EINVAL;
@@ -1018,6 +1138,7 @@ futex_wake(u32 __user *uaddr, unsigned int flags, int nr_wake, u32 bitset)
1018 } 1138 }
1019 1139
1020 spin_unlock(&hb->lock); 1140 spin_unlock(&hb->lock);
1141out_put_key:
1021 put_futex_key(&key); 1142 put_futex_key(&key);
1022out: 1143out:
1023 return ret; 1144 return ret;
@@ -1033,7 +1154,6 @@ futex_wake_op(u32 __user *uaddr1, unsigned int flags, u32 __user *uaddr2,
1033{ 1154{
1034 union futex_key key1 = FUTEX_KEY_INIT, key2 = FUTEX_KEY_INIT; 1155 union futex_key key1 = FUTEX_KEY_INIT, key2 = FUTEX_KEY_INIT;
1035 struct futex_hash_bucket *hb1, *hb2; 1156 struct futex_hash_bucket *hb1, *hb2;
1036 struct plist_head *head;
1037 struct futex_q *this, *next; 1157 struct futex_q *this, *next;
1038 int ret, op_ret; 1158 int ret, op_ret;
1039 1159
@@ -1081,9 +1201,7 @@ retry_private:
1081 goto retry; 1201 goto retry;
1082 } 1202 }
1083 1203
1084 head = &hb1->chain; 1204 plist_for_each_entry_safe(this, next, &hb1->chain, list) {
1085
1086 plist_for_each_entry_safe(this, next, head, list) {
1087 if (match_futex (&this->key, &key1)) { 1205 if (match_futex (&this->key, &key1)) {
1088 if (this->pi_state || this->rt_waiter) { 1206 if (this->pi_state || this->rt_waiter) {
1089 ret = -EINVAL; 1207 ret = -EINVAL;
@@ -1096,10 +1214,8 @@ retry_private:
1096 } 1214 }
1097 1215
1098 if (op_ret > 0) { 1216 if (op_ret > 0) {
1099 head = &hb2->chain;
1100
1101 op_ret = 0; 1217 op_ret = 0;
1102 plist_for_each_entry_safe(this, next, head, list) { 1218 plist_for_each_entry_safe(this, next, &hb2->chain, list) {
1103 if (match_futex (&this->key, &key2)) { 1219 if (match_futex (&this->key, &key2)) {
1104 if (this->pi_state || this->rt_waiter) { 1220 if (this->pi_state || this->rt_waiter) {
1105 ret = -EINVAL; 1221 ret = -EINVAL;
@@ -1269,7 +1385,6 @@ static int futex_requeue(u32 __user *uaddr1, unsigned int flags,
1269 int drop_count = 0, task_count = 0, ret; 1385 int drop_count = 0, task_count = 0, ret;
1270 struct futex_pi_state *pi_state = NULL; 1386 struct futex_pi_state *pi_state = NULL;
1271 struct futex_hash_bucket *hb1, *hb2; 1387 struct futex_hash_bucket *hb1, *hb2;
1272 struct plist_head *head1;
1273 struct futex_q *this, *next; 1388 struct futex_q *this, *next;
1274 u32 curval2; 1389 u32 curval2;
1275 1390
@@ -1392,8 +1507,7 @@ retry_private:
1392 } 1507 }
1393 } 1508 }
1394 1509
1395 head1 = &hb1->chain; 1510 plist_for_each_entry_safe(this, next, &hb1->chain, list) {
1396 plist_for_each_entry_safe(this, next, head1, list) {
1397 if (task_count - nr_wake >= nr_requeue) 1511 if (task_count - nr_wake >= nr_requeue)
1398 break; 1512 break;
1399 1513
@@ -1488,12 +1602,12 @@ static inline struct futex_hash_bucket *queue_lock(struct futex_q *q)
1488 hb = hash_futex(&q->key); 1602 hb = hash_futex(&q->key);
1489 q->lock_ptr = &hb->lock; 1603 q->lock_ptr = &hb->lock;
1490 1604
1491 spin_lock(&hb->lock); 1605 spin_lock(&hb->lock); /* implies MB (A) */
1492 return hb; 1606 return hb;
1493} 1607}
1494 1608
1495static inline void 1609static inline void
1496queue_unlock(struct futex_q *q, struct futex_hash_bucket *hb) 1610queue_unlock(struct futex_hash_bucket *hb)
1497 __releases(&hb->lock) 1611 __releases(&hb->lock)
1498{ 1612{
1499 spin_unlock(&hb->lock); 1613 spin_unlock(&hb->lock);
@@ -1866,7 +1980,7 @@ retry_private:
1866 ret = get_futex_value_locked(&uval, uaddr); 1980 ret = get_futex_value_locked(&uval, uaddr);
1867 1981
1868 if (ret) { 1982 if (ret) {
1869 queue_unlock(q, *hb); 1983 queue_unlock(*hb);
1870 1984
1871 ret = get_user(uval, uaddr); 1985 ret = get_user(uval, uaddr);
1872 if (ret) 1986 if (ret)
@@ -1880,7 +1994,7 @@ retry_private:
1880 } 1994 }
1881 1995
1882 if (uval != val) { 1996 if (uval != val) {
1883 queue_unlock(q, *hb); 1997 queue_unlock(*hb);
1884 ret = -EWOULDBLOCK; 1998 ret = -EWOULDBLOCK;
1885 } 1999 }
1886 2000
@@ -2028,7 +2142,7 @@ retry_private:
2028 * Task is exiting and we just wait for the 2142 * Task is exiting and we just wait for the
2029 * exit to complete. 2143 * exit to complete.
2030 */ 2144 */
2031 queue_unlock(&q, hb); 2145 queue_unlock(hb);
2032 put_futex_key(&q.key); 2146 put_futex_key(&q.key);
2033 cond_resched(); 2147 cond_resched();
2034 goto retry; 2148 goto retry;
@@ -2080,7 +2194,7 @@ retry_private:
2080 goto out_put_key; 2194 goto out_put_key;
2081 2195
2082out_unlock_put_key: 2196out_unlock_put_key:
2083 queue_unlock(&q, hb); 2197 queue_unlock(hb);
2084 2198
2085out_put_key: 2199out_put_key:
2086 put_futex_key(&q.key); 2200 put_futex_key(&q.key);
@@ -2090,7 +2204,7 @@ out:
2090 return ret != -EINTR ? ret : -ERESTARTNOINTR; 2204 return ret != -EINTR ? ret : -ERESTARTNOINTR;
2091 2205
2092uaddr_faulted: 2206uaddr_faulted:
2093 queue_unlock(&q, hb); 2207 queue_unlock(hb);
2094 2208
2095 ret = fault_in_user_writeable(uaddr); 2209 ret = fault_in_user_writeable(uaddr);
2096 if (ret) 2210 if (ret)
@@ -2112,7 +2226,6 @@ static int futex_unlock_pi(u32 __user *uaddr, unsigned int flags)
2112{ 2226{
2113 struct futex_hash_bucket *hb; 2227 struct futex_hash_bucket *hb;
2114 struct futex_q *this, *next; 2228 struct futex_q *this, *next;
2115 struct plist_head *head;
2116 union futex_key key = FUTEX_KEY_INIT; 2229 union futex_key key = FUTEX_KEY_INIT;
2117 u32 uval, vpid = task_pid_vnr(current); 2230 u32 uval, vpid = task_pid_vnr(current);
2118 int ret; 2231 int ret;
@@ -2152,9 +2265,7 @@ retry:
2152 * Ok, other tasks may need to be woken up - check waiters 2265 * Ok, other tasks may need to be woken up - check waiters
2153 * and do the wakeup if necessary: 2266 * and do the wakeup if necessary:
2154 */ 2267 */
2155 head = &hb->chain; 2268 plist_for_each_entry_safe(this, next, &hb->chain, list) {
2156
2157 plist_for_each_entry_safe(this, next, head, list) {
2158 if (!match_futex (&this->key, &key)) 2269 if (!match_futex (&this->key, &key))
2159 continue; 2270 continue;
2160 ret = wake_futex_pi(uaddr, uval, this); 2271 ret = wake_futex_pi(uaddr, uval, this);
@@ -2315,6 +2426,8 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags,
2315 * code while we sleep on uaddr. 2426 * code while we sleep on uaddr.
2316 */ 2427 */
2317 debug_rt_mutex_init_waiter(&rt_waiter); 2428 debug_rt_mutex_init_waiter(&rt_waiter);
2429 RB_CLEAR_NODE(&rt_waiter.pi_tree_entry);
2430 RB_CLEAR_NODE(&rt_waiter.tree_entry);
2318 rt_waiter.task = NULL; 2431 rt_waiter.task = NULL;
2319 2432
2320 ret = get_futex_key(uaddr2, flags & FLAGS_SHARED, &key2, VERIFY_WRITE); 2433 ret = get_futex_key(uaddr2, flags & FLAGS_SHARED, &key2, VERIFY_WRITE);
@@ -2733,8 +2846,21 @@ SYSCALL_DEFINE6(futex, u32 __user *, uaddr, int, op, u32, val,
2733static int __init futex_init(void) 2846static int __init futex_init(void)
2734{ 2847{
2735 u32 curval; 2848 u32 curval;
2736 int i; 2849 unsigned int futex_shift;
2850 unsigned long i;
2851
2852#if CONFIG_BASE_SMALL
2853 futex_hashsize = 16;
2854#else
2855 futex_hashsize = roundup_pow_of_two(256 * num_possible_cpus());
2856#endif
2737 2857
2858 futex_queues = alloc_large_system_hash("futex", sizeof(*futex_queues),
2859 futex_hashsize, 0,
2860 futex_hashsize < 256 ? HASH_SMALL : 0,
2861 &futex_shift, NULL,
2862 futex_hashsize, futex_hashsize);
2863 futex_hashsize = 1UL << futex_shift;
2738 /* 2864 /*
2739 * This will fail and we want it. Some arch implementations do 2865 * This will fail and we want it. Some arch implementations do
2740 * runtime detection of the futex_atomic_cmpxchg_inatomic() 2866 * runtime detection of the futex_atomic_cmpxchg_inatomic()
@@ -2748,7 +2874,7 @@ static int __init futex_init(void)
2748 if (cmpxchg_futex_value_locked(&curval, NULL, 0, 0) == -EFAULT) 2874 if (cmpxchg_futex_value_locked(&curval, NULL, 0, 0) == -EFAULT)
2749 futex_cmpxchg_enabled = 1; 2875 futex_cmpxchg_enabled = 1;
2750 2876
2751 for (i = 0; i < ARRAY_SIZE(futex_queues); i++) { 2877 for (i = 0; i < futex_hashsize; i++) {
2752 plist_head_init(&futex_queues[i].chain); 2878 plist_head_init(&futex_queues[i].chain);
2753 spin_lock_init(&futex_queues[i].lock); 2879 spin_lock_init(&futex_queues[i].lock);
2754 } 2880 }
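In the non-CONFIG_BASE_SMALL case above, futex_init() sizes the table at 256 buckets per possible CPU, rounded up to a power of two, and then re-derives the size from the shift alloc_large_system_hash() actually used, so the "hash & (futex_hashsize - 1)" mask in hash_futex() stays valid even if the allocator adjusted the size. A small stand-alone sketch of just the sizing arithmetic (plain C, illustrative only):

#include <stdio.h>

static unsigned long roundup_pow_of_two_ul(unsigned long v)
{
	unsigned long p = 1;

	while (p < v)
		p <<= 1;
	return p;
}

int main(void)
{
	const unsigned long cpus[] = { 1, 4, 16, 64 };

	for (unsigned int i = 0; i < sizeof(cpus) / sizeof(cpus[0]); i++) {
		unsigned long size = roundup_pow_of_two_ul(256 * cpus[i]);

		/* e.g. 4 CPUs -> 1024 buckets, 64 CPUs -> 16384 buckets */
		printf("%3lu CPUs -> %6lu buckets, mask 0x%lx\n",
		       cpus[i], size, size - 1);
	}
	return 0;
}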
diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c
index 383319bae3f7..09094361dce5 100644
--- a/kernel/hrtimer.c
+++ b/kernel/hrtimer.c
@@ -46,6 +46,7 @@
46#include <linux/sched.h> 46#include <linux/sched.h>
47#include <linux/sched/sysctl.h> 47#include <linux/sched/sysctl.h>
48#include <linux/sched/rt.h> 48#include <linux/sched/rt.h>
49#include <linux/sched/deadline.h>
49#include <linux/timer.h> 50#include <linux/timer.h>
50#include <linux/freezer.h> 51#include <linux/freezer.h>
51 52
@@ -1610,7 +1611,7 @@ long hrtimer_nanosleep(struct timespec *rqtp, struct timespec __user *rmtp,
1610 unsigned long slack; 1611 unsigned long slack;
1611 1612
1612 slack = current->timer_slack_ns; 1613 slack = current->timer_slack_ns;
1613 if (rt_task(current)) 1614 if (dl_task(current) || rt_task(current))
1614 slack = 0; 1615 slack = 0;
1615 1616
1616 hrtimer_init_on_stack(&t.timer, clockid, mode); 1617 hrtimer_init_on_stack(&t.timer, clockid, mode);
diff --git a/kernel/irq/pm.c b/kernel/irq/pm.c
index cb228bf21760..abcd6ca86cb7 100644
--- a/kernel/irq/pm.c
+++ b/kernel/irq/pm.c
@@ -50,7 +50,7 @@ static void resume_irqs(bool want_early)
50 bool is_early = desc->action && 50 bool is_early = desc->action &&
51 desc->action->flags & IRQF_EARLY_RESUME; 51 desc->action->flags & IRQF_EARLY_RESUME;
52 52
53 if (is_early != want_early) 53 if (!is_early && want_early)
54 continue; 54 continue;
55 55
56 raw_spin_lock_irqsave(&desc->lock, flags); 56 raw_spin_lock_irqsave(&desc->lock, flags);
diff --git a/kernel/kexec.c b/kernel/kexec.c
index 490afc03627e..9c970167e402 100644
--- a/kernel/kexec.c
+++ b/kernel/kexec.c
@@ -47,6 +47,9 @@ u32 vmcoreinfo_note[VMCOREINFO_NOTE_SIZE/4];
47size_t vmcoreinfo_size; 47size_t vmcoreinfo_size;
48size_t vmcoreinfo_max_size = sizeof(vmcoreinfo_data); 48size_t vmcoreinfo_max_size = sizeof(vmcoreinfo_data);
49 49
50/* Flag to indicate we are going to kexec a new kernel */
51bool kexec_in_progress = false;
52
50/* Location of the reserved area for the crash kernel */ 53/* Location of the reserved area for the crash kernel */
51struct resource crashk_res = { 54struct resource crashk_res = {
52 .name = "Crash kernel", 55 .name = "Crash kernel",
@@ -1675,7 +1678,9 @@ int kernel_kexec(void)
1675 } else 1678 } else
1676#endif 1679#endif
1677 { 1680 {
1681 kexec_in_progress = true;
1678 kernel_restart_prepare(NULL); 1682 kernel_restart_prepare(NULL);
1683 migrate_to_reboot_cpu();
1679 printk(KERN_EMERG "Starting new kernel\n"); 1684 printk(KERN_EMERG "Starting new kernel\n");
1680 machine_shutdown(); 1685 machine_shutdown();
1681 } 1686 }
diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c
index 576ba756a32d..eb8a54783fa0 100644
--- a/kernel/locking/lockdep.c
+++ b/kernel/locking/lockdep.c
@@ -590,6 +590,7 @@ static int very_verbose(struct lock_class *class)
590/* 590/*
591 * Is this the address of a static object: 591 * Is this the address of a static object:
592 */ 592 */
593#ifdef __KERNEL__
593static int static_obj(void *obj) 594static int static_obj(void *obj)
594{ 595{
595 unsigned long start = (unsigned long) &_stext, 596 unsigned long start = (unsigned long) &_stext,
@@ -616,6 +617,7 @@ static int static_obj(void *obj)
616 */ 617 */
617 return is_module_address(addr) || is_module_percpu_address(addr); 618 return is_module_address(addr) || is_module_percpu_address(addr);
618} 619}
620#endif
619 621
620/* 622/*
621 * To make lock name printouts unique, we calculate a unique 623 * To make lock name printouts unique, we calculate a unique
@@ -4115,6 +4117,7 @@ void debug_check_no_locks_held(void)
4115} 4117}
4116EXPORT_SYMBOL_GPL(debug_check_no_locks_held); 4118EXPORT_SYMBOL_GPL(debug_check_no_locks_held);
4117 4119
4120#ifdef __KERNEL__
4118void debug_show_all_locks(void) 4121void debug_show_all_locks(void)
4119{ 4122{
4120 struct task_struct *g, *p; 4123 struct task_struct *g, *p;
@@ -4172,6 +4175,7 @@ retry:
4172 read_unlock(&tasklist_lock); 4175 read_unlock(&tasklist_lock);
4173} 4176}
4174EXPORT_SYMBOL_GPL(debug_show_all_locks); 4177EXPORT_SYMBOL_GPL(debug_show_all_locks);
4178#endif
4175 4179
4176/* 4180/*
4177 * Careful: only use this function if you are sure that 4181 * Careful: only use this function if you are sure that
diff --git a/kernel/locking/mutex-debug.c b/kernel/locking/mutex-debug.c
index 7e3443fe1f48..faf6f5b53e77 100644
--- a/kernel/locking/mutex-debug.c
+++ b/kernel/locking/mutex-debug.c
@@ -75,7 +75,12 @@ void debug_mutex_unlock(struct mutex *lock)
75 return; 75 return;
76 76
77 DEBUG_LOCKS_WARN_ON(lock->magic != lock); 77 DEBUG_LOCKS_WARN_ON(lock->magic != lock);
78 DEBUG_LOCKS_WARN_ON(lock->owner != current); 78
79 if (!lock->owner)
80 DEBUG_LOCKS_WARN_ON(!lock->owner);
81 else
82 DEBUG_LOCKS_WARN_ON(lock->owner != current);
83
79 DEBUG_LOCKS_WARN_ON(!lock->wait_list.prev && !lock->wait_list.next); 84 DEBUG_LOCKS_WARN_ON(!lock->wait_list.prev && !lock->wait_list.next);
80 mutex_clear_owner(lock); 85 mutex_clear_owner(lock);
81} 86}
diff --git a/kernel/locking/rtmutex-debug.c b/kernel/locking/rtmutex-debug.c
index 13b243a323fa..49b2ed3dced8 100644
--- a/kernel/locking/rtmutex-debug.c
+++ b/kernel/locking/rtmutex-debug.c
@@ -24,7 +24,7 @@
24#include <linux/kallsyms.h> 24#include <linux/kallsyms.h>
25#include <linux/syscalls.h> 25#include <linux/syscalls.h>
26#include <linux/interrupt.h> 26#include <linux/interrupt.h>
27#include <linux/plist.h> 27#include <linux/rbtree.h>
28#include <linux/fs.h> 28#include <linux/fs.h>
29#include <linux/debug_locks.h> 29#include <linux/debug_locks.h>
30 30
@@ -57,7 +57,7 @@ static void printk_lock(struct rt_mutex *lock, int print_owner)
57 57
58void rt_mutex_debug_task_free(struct task_struct *task) 58void rt_mutex_debug_task_free(struct task_struct *task)
59{ 59{
60 DEBUG_LOCKS_WARN_ON(!plist_head_empty(&task->pi_waiters)); 60 DEBUG_LOCKS_WARN_ON(!RB_EMPTY_ROOT(&task->pi_waiters));
61 DEBUG_LOCKS_WARN_ON(task->pi_blocked_on); 61 DEBUG_LOCKS_WARN_ON(task->pi_blocked_on);
62} 62}
63 63
@@ -154,16 +154,12 @@ void debug_rt_mutex_proxy_unlock(struct rt_mutex *lock)
154void debug_rt_mutex_init_waiter(struct rt_mutex_waiter *waiter) 154void debug_rt_mutex_init_waiter(struct rt_mutex_waiter *waiter)
155{ 155{
156 memset(waiter, 0x11, sizeof(*waiter)); 156 memset(waiter, 0x11, sizeof(*waiter));
157 plist_node_init(&waiter->list_entry, MAX_PRIO);
158 plist_node_init(&waiter->pi_list_entry, MAX_PRIO);
159 waiter->deadlock_task_pid = NULL; 157 waiter->deadlock_task_pid = NULL;
160} 158}
161 159
162void debug_rt_mutex_free_waiter(struct rt_mutex_waiter *waiter) 160void debug_rt_mutex_free_waiter(struct rt_mutex_waiter *waiter)
163{ 161{
164 put_pid(waiter->deadlock_task_pid); 162 put_pid(waiter->deadlock_task_pid);
165 DEBUG_LOCKS_WARN_ON(!plist_node_empty(&waiter->list_entry));
166 DEBUG_LOCKS_WARN_ON(!plist_node_empty(&waiter->pi_list_entry));
167 memset(waiter, 0x22, sizeof(*waiter)); 163 memset(waiter, 0x22, sizeof(*waiter));
168} 164}
169 165
diff --git a/kernel/locking/rtmutex.c b/kernel/locking/rtmutex.c
index 0dd6aec1cb6a..2e960a2bab81 100644
--- a/kernel/locking/rtmutex.c
+++ b/kernel/locking/rtmutex.c
@@ -14,6 +14,7 @@
14#include <linux/export.h> 14#include <linux/export.h>
15#include <linux/sched.h> 15#include <linux/sched.h>
16#include <linux/sched/rt.h> 16#include <linux/sched/rt.h>
17#include <linux/sched/deadline.h>
17#include <linux/timer.h> 18#include <linux/timer.h>
18 19
19#include "rtmutex_common.h" 20#include "rtmutex_common.h"
@@ -91,10 +92,107 @@ static inline void mark_rt_mutex_waiters(struct rt_mutex *lock)
91} 92}
92#endif 93#endif
93 94
95static inline int
96rt_mutex_waiter_less(struct rt_mutex_waiter *left,
97 struct rt_mutex_waiter *right)
98{
99 if (left->prio < right->prio)
100 return 1;
101
102 /*
103 * If both waiters have dl_prio(), we check the deadlines of the
104 * associated tasks.
105 * If left waiter has a dl_prio(), and we didn't return 1 above,
106 * then right waiter has a dl_prio() too.
107 */
108 if (dl_prio(left->prio))
109 return (left->task->dl.deadline < right->task->dl.deadline);
110
111 return 0;
112}
113
114static void
115rt_mutex_enqueue(struct rt_mutex *lock, struct rt_mutex_waiter *waiter)
116{
117 struct rb_node **link = &lock->waiters.rb_node;
118 struct rb_node *parent = NULL;
119 struct rt_mutex_waiter *entry;
120 int leftmost = 1;
121
122 while (*link) {
123 parent = *link;
124 entry = rb_entry(parent, struct rt_mutex_waiter, tree_entry);
125 if (rt_mutex_waiter_less(waiter, entry)) {
126 link = &parent->rb_left;
127 } else {
128 link = &parent->rb_right;
129 leftmost = 0;
130 }
131 }
132
133 if (leftmost)
134 lock->waiters_leftmost = &waiter->tree_entry;
135
136 rb_link_node(&waiter->tree_entry, parent, link);
137 rb_insert_color(&waiter->tree_entry, &lock->waiters);
138}
139
140static void
141rt_mutex_dequeue(struct rt_mutex *lock, struct rt_mutex_waiter *waiter)
142{
143 if (RB_EMPTY_NODE(&waiter->tree_entry))
144 return;
145
146 if (lock->waiters_leftmost == &waiter->tree_entry)
147 lock->waiters_leftmost = rb_next(&waiter->tree_entry);
148
149 rb_erase(&waiter->tree_entry, &lock->waiters);
150 RB_CLEAR_NODE(&waiter->tree_entry);
151}
152
153static void
154rt_mutex_enqueue_pi(struct task_struct *task, struct rt_mutex_waiter *waiter)
155{
156 struct rb_node **link = &task->pi_waiters.rb_node;
157 struct rb_node *parent = NULL;
158 struct rt_mutex_waiter *entry;
159 int leftmost = 1;
160
161 while (*link) {
162 parent = *link;
163 entry = rb_entry(parent, struct rt_mutex_waiter, pi_tree_entry);
164 if (rt_mutex_waiter_less(waiter, entry)) {
165 link = &parent->rb_left;
166 } else {
167 link = &parent->rb_right;
168 leftmost = 0;
169 }
170 }
171
172 if (leftmost)
173 task->pi_waiters_leftmost = &waiter->pi_tree_entry;
174
175 rb_link_node(&waiter->pi_tree_entry, parent, link);
176 rb_insert_color(&waiter->pi_tree_entry, &task->pi_waiters);
177}
178
179static void
180rt_mutex_dequeue_pi(struct task_struct *task, struct rt_mutex_waiter *waiter)
181{
182 if (RB_EMPTY_NODE(&waiter->pi_tree_entry))
183 return;
184
185 if (task->pi_waiters_leftmost == &waiter->pi_tree_entry)
186 task->pi_waiters_leftmost = rb_next(&waiter->pi_tree_entry);
187
188 rb_erase(&waiter->pi_tree_entry, &task->pi_waiters);
189 RB_CLEAR_NODE(&waiter->pi_tree_entry);
190}
191
94/* 192/*
95 * Calculate task priority from the waiter list priority 193 * Calculate task priority from the waiter tree priority
96 * 194 *
97 * Return task->normal_prio when the waiter list is empty or when 195 * Return task->normal_prio when the waiter tree is empty or when
98 * the waiter is not allowed to do priority boosting 196 * the waiter is not allowed to do priority boosting
99 */ 197 */
100int rt_mutex_getprio(struct task_struct *task) 198int rt_mutex_getprio(struct task_struct *task)
@@ -102,10 +200,18 @@ int rt_mutex_getprio(struct task_struct *task)
102 if (likely(!task_has_pi_waiters(task))) 200 if (likely(!task_has_pi_waiters(task)))
103 return task->normal_prio; 201 return task->normal_prio;
104 202
105 return min(task_top_pi_waiter(task)->pi_list_entry.prio, 203 return min(task_top_pi_waiter(task)->prio,
106 task->normal_prio); 204 task->normal_prio);
107} 205}
108 206
207struct task_struct *rt_mutex_get_top_task(struct task_struct *task)
208{
209 if (likely(!task_has_pi_waiters(task)))
210 return NULL;
211
212 return task_top_pi_waiter(task)->task;
213}
214
109/* 215/*
110 * Adjust the priority of a task, after its pi_waiters got modified. 216 * Adjust the priority of a task, after its pi_waiters got modified.
111 * 217 *
@@ -115,7 +221,7 @@ static void __rt_mutex_adjust_prio(struct task_struct *task)
115{ 221{
116 int prio = rt_mutex_getprio(task); 222 int prio = rt_mutex_getprio(task);
117 223
118 if (task->prio != prio) 224 if (task->prio != prio || dl_prio(prio))
119 rt_mutex_setprio(task, prio); 225 rt_mutex_setprio(task, prio);
120} 226}
121 227
@@ -233,7 +339,7 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task,
233 * When deadlock detection is off then we check, if further 339 * When deadlock detection is off then we check, if further
234 * priority adjustment is necessary. 340 * priority adjustment is necessary.
235 */ 341 */
236 if (!detect_deadlock && waiter->list_entry.prio == task->prio) 342 if (!detect_deadlock && waiter->prio == task->prio)
237 goto out_unlock_pi; 343 goto out_unlock_pi;
238 344
239 lock = waiter->lock; 345 lock = waiter->lock;
@@ -254,9 +360,9 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task,
254 top_waiter = rt_mutex_top_waiter(lock); 360 top_waiter = rt_mutex_top_waiter(lock);
255 361
256 /* Requeue the waiter */ 362 /* Requeue the waiter */
257 plist_del(&waiter->list_entry, &lock->wait_list); 363 rt_mutex_dequeue(lock, waiter);
258 waiter->list_entry.prio = task->prio; 364 waiter->prio = task->prio;
259 plist_add(&waiter->list_entry, &lock->wait_list); 365 rt_mutex_enqueue(lock, waiter);
260 366
261 /* Release the task */ 367 /* Release the task */
262 raw_spin_unlock_irqrestore(&task->pi_lock, flags); 368 raw_spin_unlock_irqrestore(&task->pi_lock, flags);
@@ -280,17 +386,15 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task,
280 386
281 if (waiter == rt_mutex_top_waiter(lock)) { 387 if (waiter == rt_mutex_top_waiter(lock)) {
282 /* Boost the owner */ 388 /* Boost the owner */
283 plist_del(&top_waiter->pi_list_entry, &task->pi_waiters); 389 rt_mutex_dequeue_pi(task, top_waiter);
284 waiter->pi_list_entry.prio = waiter->list_entry.prio; 390 rt_mutex_enqueue_pi(task, waiter);
285 plist_add(&waiter->pi_list_entry, &task->pi_waiters);
286 __rt_mutex_adjust_prio(task); 391 __rt_mutex_adjust_prio(task);
287 392
288 } else if (top_waiter == waiter) { 393 } else if (top_waiter == waiter) {
289 /* Deboost the owner */ 394 /* Deboost the owner */
290 plist_del(&waiter->pi_list_entry, &task->pi_waiters); 395 rt_mutex_dequeue_pi(task, waiter);
291 waiter = rt_mutex_top_waiter(lock); 396 waiter = rt_mutex_top_waiter(lock);
292 waiter->pi_list_entry.prio = waiter->list_entry.prio; 397 rt_mutex_enqueue_pi(task, waiter);
293 plist_add(&waiter->pi_list_entry, &task->pi_waiters);
294 __rt_mutex_adjust_prio(task); 398 __rt_mutex_adjust_prio(task);
295 } 399 }
296 400
@@ -355,7 +459,7 @@ static int try_to_take_rt_mutex(struct rt_mutex *lock, struct task_struct *task,
355 * 3) it is top waiter 459 * 3) it is top waiter
356 */ 460 */
357 if (rt_mutex_has_waiters(lock)) { 461 if (rt_mutex_has_waiters(lock)) {
358 if (task->prio >= rt_mutex_top_waiter(lock)->list_entry.prio) { 462 if (task->prio >= rt_mutex_top_waiter(lock)->prio) {
359 if (!waiter || waiter != rt_mutex_top_waiter(lock)) 463 if (!waiter || waiter != rt_mutex_top_waiter(lock))
360 return 0; 464 return 0;
361 } 465 }
@@ -369,7 +473,7 @@ static int try_to_take_rt_mutex(struct rt_mutex *lock, struct task_struct *task,
369 473
370 /* remove the queued waiter. */ 474 /* remove the queued waiter. */
371 if (waiter) { 475 if (waiter) {
372 plist_del(&waiter->list_entry, &lock->wait_list); 476 rt_mutex_dequeue(lock, waiter);
373 task->pi_blocked_on = NULL; 477 task->pi_blocked_on = NULL;
374 } 478 }
375 479
@@ -379,8 +483,7 @@ static int try_to_take_rt_mutex(struct rt_mutex *lock, struct task_struct *task,
379 */ 483 */
380 if (rt_mutex_has_waiters(lock)) { 484 if (rt_mutex_has_waiters(lock)) {
381 top = rt_mutex_top_waiter(lock); 485 top = rt_mutex_top_waiter(lock);
382 top->pi_list_entry.prio = top->list_entry.prio; 486 rt_mutex_enqueue_pi(task, top);
383 plist_add(&top->pi_list_entry, &task->pi_waiters);
384 } 487 }
385 raw_spin_unlock_irqrestore(&task->pi_lock, flags); 488 raw_spin_unlock_irqrestore(&task->pi_lock, flags);
386 } 489 }
@@ -416,13 +519,12 @@ static int task_blocks_on_rt_mutex(struct rt_mutex *lock,
416 __rt_mutex_adjust_prio(task); 519 __rt_mutex_adjust_prio(task);
417 waiter->task = task; 520 waiter->task = task;
418 waiter->lock = lock; 521 waiter->lock = lock;
419 plist_node_init(&waiter->list_entry, task->prio); 522 waiter->prio = task->prio;
420 plist_node_init(&waiter->pi_list_entry, task->prio);
421 523
422 /* Get the top priority waiter on the lock */ 524 /* Get the top priority waiter on the lock */
423 if (rt_mutex_has_waiters(lock)) 525 if (rt_mutex_has_waiters(lock))
424 top_waiter = rt_mutex_top_waiter(lock); 526 top_waiter = rt_mutex_top_waiter(lock);
425 plist_add(&waiter->list_entry, &lock->wait_list); 527 rt_mutex_enqueue(lock, waiter);
426 528
427 task->pi_blocked_on = waiter; 529 task->pi_blocked_on = waiter;
428 530
@@ -433,8 +535,8 @@ static int task_blocks_on_rt_mutex(struct rt_mutex *lock,
433 535
434 if (waiter == rt_mutex_top_waiter(lock)) { 536 if (waiter == rt_mutex_top_waiter(lock)) {
435 raw_spin_lock_irqsave(&owner->pi_lock, flags); 537 raw_spin_lock_irqsave(&owner->pi_lock, flags);
436 plist_del(&top_waiter->pi_list_entry, &owner->pi_waiters); 538 rt_mutex_dequeue_pi(owner, top_waiter);
437 plist_add(&waiter->pi_list_entry, &owner->pi_waiters); 539 rt_mutex_enqueue_pi(owner, waiter);
438 540
439 __rt_mutex_adjust_prio(owner); 541 __rt_mutex_adjust_prio(owner);
440 if (owner->pi_blocked_on) 542 if (owner->pi_blocked_on)
@@ -486,7 +588,7 @@ static void wakeup_next_waiter(struct rt_mutex *lock)
486 * boosted mode and go back to normal after releasing 588 * boosted mode and go back to normal after releasing
487 * lock->wait_lock. 589 * lock->wait_lock.
488 */ 590 */
489 plist_del(&waiter->pi_list_entry, &current->pi_waiters); 591 rt_mutex_dequeue_pi(current, waiter);
490 592
491 rt_mutex_set_owner(lock, NULL); 593 rt_mutex_set_owner(lock, NULL);
492 594
@@ -510,7 +612,7 @@ static void remove_waiter(struct rt_mutex *lock,
510 int chain_walk = 0; 612 int chain_walk = 0;
511 613
512 raw_spin_lock_irqsave(&current->pi_lock, flags); 614 raw_spin_lock_irqsave(&current->pi_lock, flags);
513 plist_del(&waiter->list_entry, &lock->wait_list); 615 rt_mutex_dequeue(lock, waiter);
514 current->pi_blocked_on = NULL; 616 current->pi_blocked_on = NULL;
515 raw_spin_unlock_irqrestore(&current->pi_lock, flags); 617 raw_spin_unlock_irqrestore(&current->pi_lock, flags);
516 618
@@ -521,13 +623,13 @@ static void remove_waiter(struct rt_mutex *lock,
521 623
522 raw_spin_lock_irqsave(&owner->pi_lock, flags); 624 raw_spin_lock_irqsave(&owner->pi_lock, flags);
523 625
524 plist_del(&waiter->pi_list_entry, &owner->pi_waiters); 626 rt_mutex_dequeue_pi(owner, waiter);
525 627
526 if (rt_mutex_has_waiters(lock)) { 628 if (rt_mutex_has_waiters(lock)) {
527 struct rt_mutex_waiter *next; 629 struct rt_mutex_waiter *next;
528 630
529 next = rt_mutex_top_waiter(lock); 631 next = rt_mutex_top_waiter(lock);
530 plist_add(&next->pi_list_entry, &owner->pi_waiters); 632 rt_mutex_enqueue_pi(owner, next);
531 } 633 }
532 __rt_mutex_adjust_prio(owner); 634 __rt_mutex_adjust_prio(owner);
533 635
@@ -537,8 +639,6 @@ static void remove_waiter(struct rt_mutex *lock,
537 raw_spin_unlock_irqrestore(&owner->pi_lock, flags); 639 raw_spin_unlock_irqrestore(&owner->pi_lock, flags);
538 } 640 }
539 641
540 WARN_ON(!plist_node_empty(&waiter->pi_list_entry));
541
542 if (!chain_walk) 642 if (!chain_walk)
543 return; 643 return;
544 644
@@ -565,7 +665,8 @@ void rt_mutex_adjust_pi(struct task_struct *task)
565 raw_spin_lock_irqsave(&task->pi_lock, flags); 665 raw_spin_lock_irqsave(&task->pi_lock, flags);
566 666
567 waiter = task->pi_blocked_on; 667 waiter = task->pi_blocked_on;
568 if (!waiter || waiter->list_entry.prio == task->prio) { 668 if (!waiter || (waiter->prio == task->prio &&
669 !dl_prio(task->prio))) {
569 raw_spin_unlock_irqrestore(&task->pi_lock, flags); 670 raw_spin_unlock_irqrestore(&task->pi_lock, flags);
570 return; 671 return;
571 } 672 }
@@ -638,6 +739,8 @@ rt_mutex_slowlock(struct rt_mutex *lock, int state,
638 int ret = 0; 739 int ret = 0;
639 740
640 debug_rt_mutex_init_waiter(&waiter); 741 debug_rt_mutex_init_waiter(&waiter);
742 RB_CLEAR_NODE(&waiter.pi_tree_entry);
743 RB_CLEAR_NODE(&waiter.tree_entry);
641 744
642 raw_spin_lock(&lock->wait_lock); 745 raw_spin_lock(&lock->wait_lock);
643 746
@@ -904,7 +1007,8 @@ void __rt_mutex_init(struct rt_mutex *lock, const char *name)
904{ 1007{
905 lock->owner = NULL; 1008 lock->owner = NULL;
906 raw_spin_lock_init(&lock->wait_lock); 1009 raw_spin_lock_init(&lock->wait_lock);
907 plist_head_init(&lock->wait_list); 1010 lock->waiters = RB_ROOT;
1011 lock->waiters_leftmost = NULL;
908 1012
909 debug_rt_mutex_init(lock, name); 1013 debug_rt_mutex_init(lock, name);
910} 1014}
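A quick illustration of how the rt_mutex_waiter_less() comparator added above orders the new waiter rb-tree: in the kernel's internal scale a lower prio value is more important, and (by convention, assumed here rather than shown in this patch) SCHED_DEADLINE tasks carry a negative prio, so any deadline waiter sorts before any RT waiter, and ties between deadline waiters fall back to the earlier absolute deadline. A stand-alone mirror of the comparator with a few sample waiters (sketch, simplified types):

#include <stdio.h>

struct waiter { int prio; unsigned long long deadline; };

/* Mirror of rt_mutex_waiter_less(); "prio < 0" plays the role of dl_prio(). */
static int waiter_less(const struct waiter *l, const struct waiter *r)
{
	if (l->prio < r->prio)
		return 1;
	if (l->prio < 0)		/* left is a deadline waiter, so right is too */
		return l->deadline < r->deadline;
	return 0;
}

int main(void)
{
	struct waiter dl_early = { -1, 1000 };	/* SCHED_DEADLINE, earlier deadline */
	struct waiter dl_late  = { -1, 2000 };	/* SCHED_DEADLINE, later deadline   */
	struct waiter fifo10   = { 10, 0 };	/* SCHED_FIFO, kernel prio 10       */

	printf("%d %d %d\n",
	       waiter_less(&dl_early, &fifo10),		/* 1: deadline beats RT     */
	       waiter_less(&dl_early, &dl_late),	/* 1: earlier deadline wins */
	       waiter_less(&dl_late,  &dl_early));	/* 0                        */
	return 0;
}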
diff --git a/kernel/locking/rtmutex_common.h b/kernel/locking/rtmutex_common.h
index 53a66c85261b..7431a9c86f35 100644
--- a/kernel/locking/rtmutex_common.h
+++ b/kernel/locking/rtmutex_common.h
@@ -40,13 +40,13 @@ extern void schedule_rt_mutex_test(struct rt_mutex *lock);
40 * This is the control structure for tasks blocked on a rt_mutex, 40 * This is the control structure for tasks blocked on a rt_mutex,
41 * which is allocated on the kernel stack on of the blocked task. 41 * which is allocated on the kernel stack on of the blocked task.
42 * 42 *
43 * @list_entry: pi node to enqueue into the mutex waiters list 43 * @tree_entry: pi node to enqueue into the mutex waiters tree
44 * @pi_list_entry: pi node to enqueue into the mutex owner waiters list 44 * @pi_tree_entry: pi node to enqueue into the mutex owner waiters tree
45 * @task: task reference to the blocked task 45 * @task: task reference to the blocked task
46 */ 46 */
47struct rt_mutex_waiter { 47struct rt_mutex_waiter {
48 struct plist_node list_entry; 48 struct rb_node tree_entry;
49 struct plist_node pi_list_entry; 49 struct rb_node pi_tree_entry;
50 struct task_struct *task; 50 struct task_struct *task;
51 struct rt_mutex *lock; 51 struct rt_mutex *lock;
52#ifdef CONFIG_DEBUG_RT_MUTEXES 52#ifdef CONFIG_DEBUG_RT_MUTEXES
@@ -54,14 +54,15 @@ struct rt_mutex_waiter {
54 struct pid *deadlock_task_pid; 54 struct pid *deadlock_task_pid;
55 struct rt_mutex *deadlock_lock; 55 struct rt_mutex *deadlock_lock;
56#endif 56#endif
57 int prio;
57}; 58};
58 59
59/* 60/*
60 * Various helpers to access the waiters-plist: 61 * Various helpers to access the waiters-tree:
61 */ 62 */
62static inline int rt_mutex_has_waiters(struct rt_mutex *lock) 63static inline int rt_mutex_has_waiters(struct rt_mutex *lock)
63{ 64{
64 return !plist_head_empty(&lock->wait_list); 65 return !RB_EMPTY_ROOT(&lock->waiters);
65} 66}
66 67
67static inline struct rt_mutex_waiter * 68static inline struct rt_mutex_waiter *
@@ -69,8 +70,8 @@ rt_mutex_top_waiter(struct rt_mutex *lock)
69{ 70{
70 struct rt_mutex_waiter *w; 71 struct rt_mutex_waiter *w;
71 72
72 w = plist_first_entry(&lock->wait_list, struct rt_mutex_waiter, 73 w = rb_entry(lock->waiters_leftmost, struct rt_mutex_waiter,
73 list_entry); 74 tree_entry);
74 BUG_ON(w->lock != lock); 75 BUG_ON(w->lock != lock);
75 76
76 return w; 77 return w;
@@ -78,14 +79,14 @@ rt_mutex_top_waiter(struct rt_mutex *lock)
78 79
79static inline int task_has_pi_waiters(struct task_struct *p) 80static inline int task_has_pi_waiters(struct task_struct *p)
80{ 81{
81 return !plist_head_empty(&p->pi_waiters); 82 return !RB_EMPTY_ROOT(&p->pi_waiters);
82} 83}
83 84
84static inline struct rt_mutex_waiter * 85static inline struct rt_mutex_waiter *
85task_top_pi_waiter(struct task_struct *p) 86task_top_pi_waiter(struct task_struct *p)
86{ 87{
87 return plist_first_entry(&p->pi_waiters, struct rt_mutex_waiter, 88 return rb_entry(p->pi_waiters_leftmost, struct rt_mutex_waiter,
88 pi_list_entry); 89 pi_tree_entry);
89} 90}
90 91
91/* 92/*
diff --git a/kernel/module.c b/kernel/module.c
index f5a3b1e8ec51..d24fcf29cb64 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -815,10 +815,8 @@ SYSCALL_DEFINE2(delete_module, const char __user *, name_user,
815 return -EFAULT; 815 return -EFAULT;
816 name[MODULE_NAME_LEN-1] = '\0'; 816 name[MODULE_NAME_LEN-1] = '\0';
817 817
818 if (!(flags & O_NONBLOCK)) { 818 if (!(flags & O_NONBLOCK))
819 printk(KERN_WARNING 819 pr_warn("waiting module removal not supported: please upgrade\n");
820 "waiting module removal not supported: please upgrade");
821 }
822 820
823 if (mutex_lock_interruptible(&module_mutex) != 0) 821 if (mutex_lock_interruptible(&module_mutex) != 0)
824 return -EINTR; 822 return -EINTR;
diff --git a/kernel/padata.c b/kernel/padata.c
index 07af2c95dcfe..161402f0b517 100644
--- a/kernel/padata.c
+++ b/kernel/padata.c
@@ -46,6 +46,7 @@ static int padata_index_to_cpu(struct parallel_data *pd, int cpu_index)
46 46
47static int padata_cpu_hash(struct parallel_data *pd) 47static int padata_cpu_hash(struct parallel_data *pd)
48{ 48{
49 unsigned int seq_nr;
49 int cpu_index; 50 int cpu_index;
50 51
51 /* 52 /*
@@ -53,10 +54,8 @@ static int padata_cpu_hash(struct parallel_data *pd)
53 * seq_nr mod. number of cpus in use. 54 * seq_nr mod. number of cpus in use.
54 */ 55 */
55 56
56 spin_lock(&pd->seq_lock); 57 seq_nr = atomic_inc_return(&pd->seq_nr);
57 cpu_index = pd->seq_nr % cpumask_weight(pd->cpumask.pcpu); 58 cpu_index = seq_nr % cpumask_weight(pd->cpumask.pcpu);
58 pd->seq_nr++;
59 spin_unlock(&pd->seq_lock);
60 59
61 return padata_index_to_cpu(pd, cpu_index); 60 return padata_index_to_cpu(pd, cpu_index);
62} 61}
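The rewrite above drops pd->seq_lock entirely: atomic_inc_return() hands each submitter a unique sequence number, and the modulo over the number of active pcpu CPUs spreads jobs round-robin without serializing callers (seq_nr starts at -1, so the first job maps to index 0). The same pattern in plain C11, with illustrative names (a sketch, not the padata code):

#include <limits.h>
#include <stdatomic.h>

static atomic_uint seq_nr = ATOMIC_VAR_INIT(UINT_MAX);	/* i.e. -1 */

static unsigned int pick_cpu_index(unsigned int nr_active_cpus)
{
	/* fetch_add() + 1 is the C11 spelling of atomic_inc_return();
	 * unsigned wraparound is harmless, only the remainder matters. */
	unsigned int seq = atomic_fetch_add(&seq_nr, 1) + 1;

	return seq % nr_active_cpus;
}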
@@ -113,7 +112,7 @@ int padata_do_parallel(struct padata_instance *pinst,
113 112
114 rcu_read_lock_bh(); 113 rcu_read_lock_bh();
115 114
116 pd = rcu_dereference(pinst->pd); 115 pd = rcu_dereference_bh(pinst->pd);
117 116
118 err = -EINVAL; 117 err = -EINVAL;
119 if (!(pinst->flags & PADATA_INIT) || pinst->flags & PADATA_INVALID) 118 if (!(pinst->flags & PADATA_INIT) || pinst->flags & PADATA_INVALID)
@@ -429,7 +428,7 @@ static struct parallel_data *padata_alloc_pd(struct padata_instance *pinst,
429 padata_init_pqueues(pd); 428 padata_init_pqueues(pd);
430 padata_init_squeues(pd); 429 padata_init_squeues(pd);
431 setup_timer(&pd->timer, padata_reorder_timer, (unsigned long)pd); 430 setup_timer(&pd->timer, padata_reorder_timer, (unsigned long)pd);
432 pd->seq_nr = 0; 431 atomic_set(&pd->seq_nr, -1);
433 atomic_set(&pd->reorder_objects, 0); 432 atomic_set(&pd->reorder_objects, 0);
434 atomic_set(&pd->refcnt, 0); 433 atomic_set(&pd->refcnt, 0);
435 pd->pinst = pinst; 434 pd->pinst = pinst;
diff --git a/kernel/panic.c b/kernel/panic.c
index c00b4ceb39e8..6d6300375090 100644
--- a/kernel/panic.c
+++ b/kernel/panic.c
@@ -33,7 +33,7 @@ static int pause_on_oops;
33static int pause_on_oops_flag; 33static int pause_on_oops_flag;
34static DEFINE_SPINLOCK(pause_on_oops_lock); 34static DEFINE_SPINLOCK(pause_on_oops_lock);
35 35
36int panic_timeout; 36int panic_timeout = CONFIG_PANIC_TIMEOUT;
37EXPORT_SYMBOL_GPL(panic_timeout); 37EXPORT_SYMBOL_GPL(panic_timeout);
38 38
39ATOMIC_NOTIFIER_HEAD(panic_notifier_list); 39ATOMIC_NOTIFIER_HEAD(panic_notifier_list);
diff --git a/kernel/params.c b/kernel/params.c
index c00d5b502aa4..b00142e7f3ba 100644
--- a/kernel/params.c
+++ b/kernel/params.c
@@ -227,17 +227,10 @@ int parse_args(const char *doing,
227} 227}
228 228
229/* Lazy bastard, eh? */ 229/* Lazy bastard, eh? */
230#define STANDARD_PARAM_DEF(name, type, format, tmptype, strtolfn) \ 230#define STANDARD_PARAM_DEF(name, type, format, strtolfn) \
231 int param_set_##name(const char *val, const struct kernel_param *kp) \ 231 int param_set_##name(const char *val, const struct kernel_param *kp) \
232 { \ 232 { \
233 tmptype l; \ 233 return strtolfn(val, 0, (type *)kp->arg); \
234 int ret; \
235 \
236 ret = strtolfn(val, 0, &l); \
237 if (ret < 0 || ((type)l != l)) \
238 return ret < 0 ? ret : -EINVAL; \
239 *((type *)kp->arg) = l; \
240 return 0; \
241 } \ 234 } \
242 int param_get_##name(char *buffer, const struct kernel_param *kp) \ 235 int param_get_##name(char *buffer, const struct kernel_param *kp) \
243 { \ 236 { \
@@ -253,13 +246,13 @@ int parse_args(const char *doing,
253 EXPORT_SYMBOL(param_ops_##name) 246 EXPORT_SYMBOL(param_ops_##name)
254 247
255 248
256STANDARD_PARAM_DEF(byte, unsigned char, "%hhu", unsigned long, kstrtoul); 249STANDARD_PARAM_DEF(byte, unsigned char, "%hhu", kstrtou8);
257STANDARD_PARAM_DEF(short, short, "%hi", long, kstrtol); 250STANDARD_PARAM_DEF(short, short, "%hi", kstrtos16);
258STANDARD_PARAM_DEF(ushort, unsigned short, "%hu", unsigned long, kstrtoul); 251STANDARD_PARAM_DEF(ushort, unsigned short, "%hu", kstrtou16);
259STANDARD_PARAM_DEF(int, int, "%i", long, kstrtol); 252STANDARD_PARAM_DEF(int, int, "%i", kstrtoint);
260STANDARD_PARAM_DEF(uint, unsigned int, "%u", unsigned long, kstrtoul); 253STANDARD_PARAM_DEF(uint, unsigned int, "%u", kstrtouint);
261STANDARD_PARAM_DEF(long, long, "%li", long, kstrtol); 254STANDARD_PARAM_DEF(long, long, "%li", kstrtol);
262STANDARD_PARAM_DEF(ulong, unsigned long, "%lu", unsigned long, kstrtoul); 255STANDARD_PARAM_DEF(ulong, unsigned long, "%lu", kstrtoul);
263 256
264int param_set_charp(const char *val, const struct kernel_param *kp) 257int param_set_charp(const char *val, const struct kernel_param *kp)
265{ 258{
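With the tmptype/strtolfn dance gone, STANDARD_PARAM_DEF leans on the width-specific kstrto*() helpers, which already reject malformed and out-of-range input, so the old "(type)l != l" overflow check disappears from the macro. Hand-expanding the "int" entry from the table above gives roughly the following (set side only):

int param_set_int(const char *val, const struct kernel_param *kp)
{
	/* kstrtoint() returns 0 on success or -EINVAL/-ERANGE, so range
	 * checking no longer needs to be open-coded here. */
	return kstrtoint(val, 0, (int *)kp->arg);
}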
diff --git a/kernel/posix-cpu-timers.c b/kernel/posix-cpu-timers.c
index c7f31aa272f7..3b8946416a5f 100644
--- a/kernel/posix-cpu-timers.c
+++ b/kernel/posix-cpu-timers.c
@@ -233,7 +233,8 @@ void thread_group_cputimer(struct task_struct *tsk, struct task_cputime *times)
233 233
234/* 234/*
235 * Sample a process (thread group) clock for the given group_leader task. 235 * Sample a process (thread group) clock for the given group_leader task.
236 * Must be called with tasklist_lock held for reading. 236 * Must be called with task sighand lock held for safe while_each_thread()
237 * traversal.
237 */ 238 */
238static int cpu_clock_sample_group(const clockid_t which_clock, 239static int cpu_clock_sample_group(const clockid_t which_clock,
239 struct task_struct *p, 240 struct task_struct *p,
@@ -260,30 +261,53 @@ static int cpu_clock_sample_group(const clockid_t which_clock,
260 return 0; 261 return 0;
261} 262}
262 263
264static int posix_cpu_clock_get_task(struct task_struct *tsk,
265 const clockid_t which_clock,
266 struct timespec *tp)
267{
268 int err = -EINVAL;
269 unsigned long long rtn;
270
271 if (CPUCLOCK_PERTHREAD(which_clock)) {
272 if (same_thread_group(tsk, current))
273 err = cpu_clock_sample(which_clock, tsk, &rtn);
274 } else {
275 unsigned long flags;
276 struct sighand_struct *sighand;
277
278 /*
279 * while_each_thread() is not yet entirely RCU safe,
280 * keep locking the group while sampling process
281 * clock for now.
282 */
283 sighand = lock_task_sighand(tsk, &flags);
284 if (!sighand)
285 return err;
286
287 if (tsk == current || thread_group_leader(tsk))
288 err = cpu_clock_sample_group(which_clock, tsk, &rtn);
289
290 unlock_task_sighand(tsk, &flags);
291 }
292
293 if (!err)
294 sample_to_timespec(which_clock, rtn, tp);
295
296 return err;
297}
298
263 299
264static int posix_cpu_clock_get(const clockid_t which_clock, struct timespec *tp) 300static int posix_cpu_clock_get(const clockid_t which_clock, struct timespec *tp)
265{ 301{
266 const pid_t pid = CPUCLOCK_PID(which_clock); 302 const pid_t pid = CPUCLOCK_PID(which_clock);
267 int error = -EINVAL; 303 int err = -EINVAL;
268 unsigned long long rtn;
269 304
270 if (pid == 0) { 305 if (pid == 0) {
271 /* 306 /*
272 * Special case constant value for our own clocks. 307 * Special case constant value for our own clocks.
273 * We don't have to do any lookup to find ourselves. 308 * We don't have to do any lookup to find ourselves.
274 */ 309 */
275 if (CPUCLOCK_PERTHREAD(which_clock)) { 310 err = posix_cpu_clock_get_task(current, which_clock, tp);
276 /*
277 * Sampling just ourselves we can do with no locking.
278 */
279 error = cpu_clock_sample(which_clock,
280 current, &rtn);
281 } else {
282 read_lock(&tasklist_lock);
283 error = cpu_clock_sample_group(which_clock,
284 current, &rtn);
285 read_unlock(&tasklist_lock);
286 }
287 } else { 311 } else {
288 /* 312 /*
289 * Find the given PID, and validate that the caller 313 * Find the given PID, and validate that the caller
@@ -292,29 +316,12 @@ static int posix_cpu_clock_get(const clockid_t which_clock, struct timespec *tp)
292 struct task_struct *p; 316 struct task_struct *p;
293 rcu_read_lock(); 317 rcu_read_lock();
294 p = find_task_by_vpid(pid); 318 p = find_task_by_vpid(pid);
295 if (p) { 319 if (p)
296 if (CPUCLOCK_PERTHREAD(which_clock)) { 320 err = posix_cpu_clock_get_task(p, which_clock, tp);
297 if (same_thread_group(p, current)) {
298 error = cpu_clock_sample(which_clock,
299 p, &rtn);
300 }
301 } else {
302 read_lock(&tasklist_lock);
303 if (thread_group_leader(p) && p->sighand) {
304 error =
305 cpu_clock_sample_group(which_clock,
306 p, &rtn);
307 }
308 read_unlock(&tasklist_lock);
309 }
310 }
311 rcu_read_unlock(); 321 rcu_read_unlock();
312 } 322 }
313 323
314 if (error) 324 return err;
315 return error;
316 sample_to_timespec(which_clock, rtn, tp);
317 return 0;
318} 325}
319 326
320 327
@@ -371,36 +378,40 @@ static int posix_cpu_timer_create(struct k_itimer *new_timer)
371 */ 378 */
372static int posix_cpu_timer_del(struct k_itimer *timer) 379static int posix_cpu_timer_del(struct k_itimer *timer)
373{ 380{
374 struct task_struct *p = timer->it.cpu.task;
375 int ret = 0; 381 int ret = 0;
382 unsigned long flags;
383 struct sighand_struct *sighand;
384 struct task_struct *p = timer->it.cpu.task;
376 385
377 if (likely(p != NULL)) { 386 WARN_ON_ONCE(p == NULL);
378 read_lock(&tasklist_lock);
379 if (unlikely(p->sighand == NULL)) {
380 /*
381 * We raced with the reaping of the task.
382 * The deletion should have cleared us off the list.
383 */
384 BUG_ON(!list_empty(&timer->it.cpu.entry));
385 } else {
386 spin_lock(&p->sighand->siglock);
387 if (timer->it.cpu.firing)
388 ret = TIMER_RETRY;
389 else
390 list_del(&timer->it.cpu.entry);
391 spin_unlock(&p->sighand->siglock);
392 }
393 read_unlock(&tasklist_lock);
394 387
395 if (!ret) 388 /*
396 put_task_struct(p); 389 * Protect against sighand release/switch in exit/exec and process/
390 * thread timer list entry concurrent read/writes.
391 */
392 sighand = lock_task_sighand(p, &flags);
393 if (unlikely(sighand == NULL)) {
394 /*
395 * We raced with the reaping of the task.
396 * The deletion should have cleared us off the list.
397 */
398 WARN_ON_ONCE(!list_empty(&timer->it.cpu.entry));
399 } else {
400 if (timer->it.cpu.firing)
401 ret = TIMER_RETRY;
402 else
403 list_del(&timer->it.cpu.entry);
404
405 unlock_task_sighand(p, &flags);
397 } 406 }
398 407
408 if (!ret)
409 put_task_struct(p);
410
399 return ret; 411 return ret;
400} 412}
401 413
402static void cleanup_timers_list(struct list_head *head, 414static void cleanup_timers_list(struct list_head *head)
403 unsigned long long curr)
404{ 415{
405 struct cpu_timer_list *timer, *next; 416 struct cpu_timer_list *timer, *next;
406 417
@@ -414,16 +425,11 @@ static void cleanup_timers_list(struct list_head *head,
414 * time for later timer_gettime calls to return. 425 * time for later timer_gettime calls to return.
415 * This must be called with the siglock held. 426 * This must be called with the siglock held.
416 */ 427 */
417static void cleanup_timers(struct list_head *head, 428static void cleanup_timers(struct list_head *head)
418 cputime_t utime, cputime_t stime,
419 unsigned long long sum_exec_runtime)
420{ 429{
421 430 cleanup_timers_list(head);
422 cputime_t ptime = utime + stime; 431 cleanup_timers_list(++head);
423 432 cleanup_timers_list(++head);
424 cleanup_timers_list(head, cputime_to_expires(ptime));
425 cleanup_timers_list(++head, cputime_to_expires(utime));
426 cleanup_timers_list(++head, sum_exec_runtime);
427} 433}
428 434
429/* 435/*
@@ -433,41 +439,14 @@ static void cleanup_timers(struct list_head *head,
433 */ 439 */
434void posix_cpu_timers_exit(struct task_struct *tsk) 440void posix_cpu_timers_exit(struct task_struct *tsk)
435{ 441{
436 cputime_t utime, stime;
437
438 add_device_randomness((const void*) &tsk->se.sum_exec_runtime, 442 add_device_randomness((const void*) &tsk->se.sum_exec_runtime,
439 sizeof(unsigned long long)); 443 sizeof(unsigned long long));
440 task_cputime(tsk, &utime, &stime); 444 cleanup_timers(tsk->cpu_timers);
441 cleanup_timers(tsk->cpu_timers,
442 utime, stime, tsk->se.sum_exec_runtime);
443 445
444} 446}
445void posix_cpu_timers_exit_group(struct task_struct *tsk) 447void posix_cpu_timers_exit_group(struct task_struct *tsk)
446{ 448{
447 struct signal_struct *const sig = tsk->signal; 449 cleanup_timers(tsk->signal->cpu_timers);
448 cputime_t utime, stime;
449
450 task_cputime(tsk, &utime, &stime);
451 cleanup_timers(tsk->signal->cpu_timers,
452 utime + sig->utime, stime + sig->stime,
453 tsk->se.sum_exec_runtime + sig->sum_sched_runtime);
454}
455
456static void clear_dead_task(struct k_itimer *itimer, unsigned long long now)
457{
458 struct cpu_timer_list *timer = &itimer->it.cpu;
459
460 /*
461 * That's all for this thread or process.
462 * We leave our residual in expires to be reported.
463 */
464 put_task_struct(timer->task);
465 timer->task = NULL;
466 if (timer->expires < now) {
467 timer->expires = 0;
468 } else {
469 timer->expires -= now;
470 }
471} 450}
472 451
473static inline int expires_gt(cputime_t expires, cputime_t new_exp) 452static inline int expires_gt(cputime_t expires, cputime_t new_exp)
@@ -477,8 +456,7 @@ static inline int expires_gt(cputime_t expires, cputime_t new_exp)
477 456
478/* 457/*
479 * Insert the timer on the appropriate list before any timers that 458 * Insert the timer on the appropriate list before any timers that
480 * expire later. This must be called with the tasklist_lock held 459 * expire later. This must be called with the sighand lock held.
481 * for reading, interrupts disabled and p->sighand->siglock taken.
482 */ 460 */
483static void arm_timer(struct k_itimer *timer) 461static void arm_timer(struct k_itimer *timer)
484{ 462{
@@ -569,7 +547,8 @@ static void cpu_timer_fire(struct k_itimer *timer)
569 547
570/* 548/*
571 * Sample a process (thread group) timer for the given group_leader task. 549 * Sample a process (thread group) timer for the given group_leader task.
572 * Must be called with tasklist_lock held for reading. 550 * Must be called with task sighand lock held for safe while_each_thread()
551 * traversal.
573 */ 552 */
574static int cpu_timer_sample_group(const clockid_t which_clock, 553static int cpu_timer_sample_group(const clockid_t which_clock,
575 struct task_struct *p, 554 struct task_struct *p,
@@ -608,7 +587,8 @@ static DECLARE_WORK(nohz_kick_work, nohz_kick_work_fn);
608 */ 587 */
609static void posix_cpu_timer_kick_nohz(void) 588static void posix_cpu_timer_kick_nohz(void)
610{ 589{
611 schedule_work(&nohz_kick_work); 590 if (context_tracking_is_enabled())
591 schedule_work(&nohz_kick_work);
612} 592}
613 593
614bool posix_cpu_timers_can_stop_tick(struct task_struct *tsk) 594bool posix_cpu_timers_can_stop_tick(struct task_struct *tsk)
@@ -631,43 +611,39 @@ static inline void posix_cpu_timer_kick_nohz(void) { }
631 * If we return TIMER_RETRY, it's necessary to release the timer's lock 611 * If we return TIMER_RETRY, it's necessary to release the timer's lock
632 * and try again. (This happens when the timer is in the middle of firing.) 612 * and try again. (This happens when the timer is in the middle of firing.)
633 */ 613 */
634static int posix_cpu_timer_set(struct k_itimer *timer, int flags, 614static int posix_cpu_timer_set(struct k_itimer *timer, int timer_flags,
635 struct itimerspec *new, struct itimerspec *old) 615 struct itimerspec *new, struct itimerspec *old)
636{ 616{
617 unsigned long flags;
618 struct sighand_struct *sighand;
637 struct task_struct *p = timer->it.cpu.task; 619 struct task_struct *p = timer->it.cpu.task;
638 unsigned long long old_expires, new_expires, old_incr, val; 620 unsigned long long old_expires, new_expires, old_incr, val;
639 int ret; 621 int ret;
640 622
641 if (unlikely(p == NULL)) { 623 WARN_ON_ONCE(p == NULL);
642 /*
643 * Timer refers to a dead task's clock.
644 */
645 return -ESRCH;
646 }
647 624
648 new_expires = timespec_to_sample(timer->it_clock, &new->it_value); 625 new_expires = timespec_to_sample(timer->it_clock, &new->it_value);
649 626
650 read_lock(&tasklist_lock);
651 /* 627 /*
652 * We need the tasklist_lock to protect against reaping that 628 * Protect against sighand release/switch in exit/exec and p->cpu_timers
653 * clears p->sighand. If p has just been reaped, we can no 629 * and p->signal->cpu_timers read/write in arm_timer()
630 */
631 sighand = lock_task_sighand(p, &flags);
632 /*
633 * If p has just been reaped, we can no
654 * longer get any information about it at all. 634 * longer get any information about it at all.
655 */ 635 */
656 if (unlikely(p->sighand == NULL)) { 636 if (unlikely(sighand == NULL)) {
657 read_unlock(&tasklist_lock);
658 put_task_struct(p);
659 timer->it.cpu.task = NULL;
660 return -ESRCH; 637 return -ESRCH;
661 } 638 }
662 639
663 /* 640 /*
664 * Disarm any old timer after extracting its expiry time. 641 * Disarm any old timer after extracting its expiry time.
665 */ 642 */
666 BUG_ON(!irqs_disabled()); 643 WARN_ON_ONCE(!irqs_disabled());
667 644
668 ret = 0; 645 ret = 0;
669 old_incr = timer->it.cpu.incr; 646 old_incr = timer->it.cpu.incr;
670 spin_lock(&p->sighand->siglock);
671 old_expires = timer->it.cpu.expires; 647 old_expires = timer->it.cpu.expires;
672 if (unlikely(timer->it.cpu.firing)) { 648 if (unlikely(timer->it.cpu.firing)) {
673 timer->it.cpu.firing = -1; 649 timer->it.cpu.firing = -1;
@@ -724,12 +700,11 @@ static int posix_cpu_timer_set(struct k_itimer *timer, int flags,
724 * disable this firing since we are already reporting 700 * disable this firing since we are already reporting
725 * it as an overrun (thanks to bump_cpu_timer above). 701 * it as an overrun (thanks to bump_cpu_timer above).
726 */ 702 */
727 spin_unlock(&p->sighand->siglock); 703 unlock_task_sighand(p, &flags);
728 read_unlock(&tasklist_lock);
729 goto out; 704 goto out;
730 } 705 }
731 706
732 if (new_expires != 0 && !(flags & TIMER_ABSTIME)) { 707 if (new_expires != 0 && !(timer_flags & TIMER_ABSTIME)) {
733 new_expires += val; 708 new_expires += val;
734 } 709 }
735 710
@@ -743,9 +718,7 @@ static int posix_cpu_timer_set(struct k_itimer *timer, int flags,
743 arm_timer(timer); 718 arm_timer(timer);
744 } 719 }
745 720
746 spin_unlock(&p->sighand->siglock); 721 unlock_task_sighand(p, &flags);
747 read_unlock(&tasklist_lock);
748
749 /* 722 /*
750 * Install the new reload setting, and 723 * Install the new reload setting, and
751 * set up the signal and overrun bookkeeping. 724 * set up the signal and overrun bookkeeping.
@@ -787,7 +760,8 @@ static void posix_cpu_timer_get(struct k_itimer *timer, struct itimerspec *itp)
787{ 760{
788 unsigned long long now; 761 unsigned long long now;
789 struct task_struct *p = timer->it.cpu.task; 762 struct task_struct *p = timer->it.cpu.task;
790 int clear_dead; 763
764 WARN_ON_ONCE(p == NULL);
791 765
792 /* 766 /*
793 * Easy part: convert the reload time. 767 * Easy part: convert the reload time.
@@ -800,52 +774,34 @@ static void posix_cpu_timer_get(struct k_itimer *timer, struct itimerspec *itp)
800 return; 774 return;
801 } 775 }
802 776
803 if (unlikely(p == NULL)) {
804 /*
805 * This task already died and the timer will never fire.
806 * In this case, expires is actually the dead value.
807 */
808 dead:
809 sample_to_timespec(timer->it_clock, timer->it.cpu.expires,
810 &itp->it_value);
811 return;
812 }
813
814 /* 777 /*
815 * Sample the clock to take the difference with the expiry time. 778 * Sample the clock to take the difference with the expiry time.
816 */ 779 */
817 if (CPUCLOCK_PERTHREAD(timer->it_clock)) { 780 if (CPUCLOCK_PERTHREAD(timer->it_clock)) {
818 cpu_clock_sample(timer->it_clock, p, &now); 781 cpu_clock_sample(timer->it_clock, p, &now);
819 clear_dead = p->exit_state;
820 } else { 782 } else {
821 read_lock(&tasklist_lock); 783 struct sighand_struct *sighand;
822 if (unlikely(p->sighand == NULL)) { 784 unsigned long flags;
785
786 /*
787 * Protect against sighand release/switch in exit/exec and
788 * also make timer sampling safe if it ends up calling
789 * thread_group_cputime().
790 */
791 sighand = lock_task_sighand(p, &flags);
792 if (unlikely(sighand == NULL)) {
823 /* 793 /*
824 * The process has been reaped. 794 * The process has been reaped.
825 * We can't even collect a sample any more. 795 * We can't even collect a sample any more.
826 * Call the timer disarmed, nothing else to do. 796 * Call the timer disarmed, nothing else to do.
827 */ 797 */
828 put_task_struct(p);
829 timer->it.cpu.task = NULL;
830 timer->it.cpu.expires = 0; 798 timer->it.cpu.expires = 0;
831 read_unlock(&tasklist_lock); 799 sample_to_timespec(timer->it_clock, timer->it.cpu.expires,
832 goto dead; 800 &itp->it_value);
833 } else { 801 } else {
834 cpu_timer_sample_group(timer->it_clock, p, &now); 802 cpu_timer_sample_group(timer->it_clock, p, &now);
835 clear_dead = (unlikely(p->exit_state) && 803 unlock_task_sighand(p, &flags);
836 thread_group_empty(p));
837 } 804 }
838 read_unlock(&tasklist_lock);
839 }
840
841 if (unlikely(clear_dead)) {
842 /*
843 * We've noticed that the thread is dead, but
844 * not yet reaped. Take this opportunity to
845 * drop our task ref.
846 */
847 clear_dead_task(timer, now);
848 goto dead;
849 } 805 }
850 806
851 if (now < timer->it.cpu.expires) { 807 if (now < timer->it.cpu.expires) {
@@ -1059,14 +1015,12 @@ static void check_process_timers(struct task_struct *tsk,
1059 */ 1015 */
1060void posix_cpu_timer_schedule(struct k_itimer *timer) 1016void posix_cpu_timer_schedule(struct k_itimer *timer)
1061{ 1017{
1018 struct sighand_struct *sighand;
1019 unsigned long flags;
1062 struct task_struct *p = timer->it.cpu.task; 1020 struct task_struct *p = timer->it.cpu.task;
1063 unsigned long long now; 1021 unsigned long long now;
1064 1022
1065 if (unlikely(p == NULL)) 1023 WARN_ON_ONCE(p == NULL);
1066 /*
1067 * The task was cleaned up already, no future firings.
1068 */
1069 goto out;
1070 1024
1071 /* 1025 /*
1072 * Fetch the current sample and update the timer's expiry time. 1026 * Fetch the current sample and update the timer's expiry time.
@@ -1074,49 +1028,45 @@ void posix_cpu_timer_schedule(struct k_itimer *timer)
1074 if (CPUCLOCK_PERTHREAD(timer->it_clock)) { 1028 if (CPUCLOCK_PERTHREAD(timer->it_clock)) {
1075 cpu_clock_sample(timer->it_clock, p, &now); 1029 cpu_clock_sample(timer->it_clock, p, &now);
1076 bump_cpu_timer(timer, now); 1030 bump_cpu_timer(timer, now);
1077 if (unlikely(p->exit_state)) { 1031 if (unlikely(p->exit_state))
1078 clear_dead_task(timer, now); 1032 goto out;
1033
1034 /* Protect timer list r/w in arm_timer() */
1035 sighand = lock_task_sighand(p, &flags);
1036 if (!sighand)
1079 goto out; 1037 goto out;
1080 }
1081 read_lock(&tasklist_lock); /* arm_timer needs it. */
1082 spin_lock(&p->sighand->siglock);
1083 } else { 1038 } else {
1084 read_lock(&tasklist_lock); 1039 /*
1085 if (unlikely(p->sighand == NULL)) { 1040 * Protect arm_timer() and timer sampling in case of call to
1041 * thread_group_cputime().
1042 */
1043 sighand = lock_task_sighand(p, &flags);
1044 if (unlikely(sighand == NULL)) {
1086 /* 1045 /*
1087 * The process has been reaped. 1046 * The process has been reaped.
1088 * We can't even collect a sample any more. 1047 * We can't even collect a sample any more.
1089 */ 1048 */
1090 put_task_struct(p);
1091 timer->it.cpu.task = p = NULL;
1092 timer->it.cpu.expires = 0; 1049 timer->it.cpu.expires = 0;
1093 goto out_unlock; 1050 goto out;
1094 } else if (unlikely(p->exit_state) && thread_group_empty(p)) { 1051 } else if (unlikely(p->exit_state) && thread_group_empty(p)) {
1095 /* 1052 unlock_task_sighand(p, &flags);
1096 * We've noticed that the thread is dead, but 1053 /* Optimizations: if the process is dying, no need to rearm */
1097 * not yet reaped. Take this opportunity to 1054 goto out;
1098 * drop our task ref.
1099 */
1100 cpu_timer_sample_group(timer->it_clock, p, &now);
1101 clear_dead_task(timer, now);
1102 goto out_unlock;
1103 } 1055 }
1104 spin_lock(&p->sighand->siglock);
1105 cpu_timer_sample_group(timer->it_clock, p, &now); 1056 cpu_timer_sample_group(timer->it_clock, p, &now);
1106 bump_cpu_timer(timer, now); 1057 bump_cpu_timer(timer, now);
1107 /* Leave the tasklist_lock locked for the call below. */ 1058 /* Leave the sighand locked for the call below. */
1108 } 1059 }
1109 1060
1110 /* 1061 /*
1111 * Now re-arm for the new expiry time. 1062 * Now re-arm for the new expiry time.
1112 */ 1063 */
1113 BUG_ON(!irqs_disabled()); 1064 WARN_ON_ONCE(!irqs_disabled());
1114 arm_timer(timer); 1065 arm_timer(timer);
1115 spin_unlock(&p->sighand->siglock); 1066 unlock_task_sighand(p, &flags);
1116
1117out_unlock:
1118 read_unlock(&tasklist_lock);
1119 1067
1068 /* Kick full dynticks CPUs in case they need to tick on the new timer */
1069 posix_cpu_timer_kick_nohz();
1120out: 1070out:
1121 timer->it_overrun_last = timer->it_overrun; 1071 timer->it_overrun_last = timer->it_overrun;
1122 timer->it_overrun = -1; 1072 timer->it_overrun = -1;
@@ -1200,7 +1150,7 @@ void run_posix_cpu_timers(struct task_struct *tsk)
1200 struct k_itimer *timer, *next; 1150 struct k_itimer *timer, *next;
1201 unsigned long flags; 1151 unsigned long flags;
1202 1152
1203 BUG_ON(!irqs_disabled()); 1153 WARN_ON_ONCE(!irqs_disabled());
1204 1154
1205 /* 1155 /*
1206 * The fast path checks that there are no expired thread or thread 1156 * The fast path checks that there are no expired thread or thread
@@ -1256,13 +1206,6 @@ void run_posix_cpu_timers(struct task_struct *tsk)
1256 cpu_timer_fire(timer); 1206 cpu_timer_fire(timer);
1257 spin_unlock(&timer->it_lock); 1207 spin_unlock(&timer->it_lock);
1258 } 1208 }
1259
1260 /*
1261 * In case some timers were rescheduled after the queue got emptied,
1262 * wake up full dynticks CPUs.
1263 */
1264 if (tsk->signal->cputimer.running)
1265 posix_cpu_timer_kick_nohz();
1266} 1209}
1267 1210
1268/* 1211/*
@@ -1274,7 +1217,7 @@ void set_process_cpu_timer(struct task_struct *tsk, unsigned int clock_idx,
1274{ 1217{
1275 unsigned long long now; 1218 unsigned long long now;
1276 1219
1277 BUG_ON(clock_idx == CPUCLOCK_SCHED); 1220 WARN_ON_ONCE(clock_idx == CPUCLOCK_SCHED);
1278 cpu_timer_sample_group(clock_idx, tsk, &now); 1221 cpu_timer_sample_group(clock_idx, tsk, &now);
1279 1222
1280 if (oldval) { 1223 if (oldval) {
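For reference, the code patched above backs the userspace CLOCK_PROCESS_CPUTIME_ID / CLOCK_THREAD_CPUTIME_ID timers. A small standalone demo, plain POSIX and not taken from this patch (build with cc demo.c -lrt), arming a one-shot process CPU-time timer:

#define _POSIX_C_SOURCE 199309L
#include <signal.h>
#include <stdio.h>
#include <time.h>

static volatile sig_atomic_t fired;

static void on_timer(int sig)
{
	(void)sig;
	fired = 1;
}

int main(void)
{
	struct sigevent sev = { 0 };
	struct itimerspec its = { 0 };
	timer_t tid;

	signal(SIGRTMIN, on_timer);
	sev.sigev_notify = SIGEV_SIGNAL;
	sev.sigev_signo = SIGRTMIN;

	if (timer_create(CLOCK_PROCESS_CPUTIME_ID, &sev, &tid) != 0) {
		perror("timer_create");
		return 1;
	}

	/* One-shot: fire after this process has consumed 100ms of CPU time. */
	its.it_value.tv_nsec = 100 * 1000 * 1000;
	if (timer_settime(tid, 0, &its, NULL) != 0) {
		perror("timer_settime");
		return 1;
	}

	while (!fired)
		;	/* burn CPU so the process CPU clock advances */

	puts("process CPU timer fired");
	timer_delete(tid);
	return 0;
}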
diff --git a/kernel/power/console.c b/kernel/power/console.c
index 463aa6736751..eacb8bd8cab4 100644
--- a/kernel/power/console.c
+++ b/kernel/power/console.c
@@ -81,6 +81,7 @@ void pm_vt_switch_unregister(struct device *dev)
81 list_for_each_entry(tmp, &pm_vt_switch_list, head) { 81 list_for_each_entry(tmp, &pm_vt_switch_list, head) {
82 if (tmp->dev == dev) { 82 if (tmp->dev == dev) {
83 list_del(&tmp->head); 83 list_del(&tmp->head);
84 kfree(tmp);
84 break; 85 break;
85 } 86 }
86 } 87 }
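The one-line addition above plugs a leak: entries on pm_vt_switch_list are allocated when a driver registers, so unregister must kfree() the entry it unlinks. A minimal sketch of the same remove-and-free pattern (names illustrative, locking omitted); a plain list_for_each_entry() is enough here because iteration stops right after the single deletion:

#include <linux/device.h>
#include <linux/list.h>
#include <linux/slab.h>

struct switch_entry {
	struct list_head head;
	struct device *dev;
};

static void unregister_entry(struct list_head *entries, struct device *dev)
{
	struct switch_entry *tmp;

	list_for_each_entry(tmp, entries, head) {
		if (tmp->dev == dev) {
			list_del(&tmp->head);
			kfree(tmp);	/* without this, the entry leaks */
			break;
		}
	}
}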
diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c
index b38109e204af..d9f61a145802 100644
--- a/kernel/power/snapshot.c
+++ b/kernel/power/snapshot.c
@@ -637,7 +637,7 @@ __register_nosave_region(unsigned long start_pfn, unsigned long end_pfn,
637 BUG_ON(!region); 637 BUG_ON(!region);
638 } else 638 } else
639 /* This allocation cannot fail */ 639 /* This allocation cannot fail */
640 region = alloc_bootmem(sizeof(struct nosave_region)); 640 region = memblock_virt_alloc(sizeof(struct nosave_region), 0);
641 region->start_pfn = start_pfn; 641 region->start_pfn = start_pfn;
642 region->end_pfn = end_pfn; 642 region->end_pfn = end_pfn;
643 list_add_tail(&region->list, &nosave_regions); 643 list_add_tail(&region->list, &nosave_regions);
diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c
index be7c86bae576..f8b41bddc6dc 100644
--- a/kernel/printk/printk.c
+++ b/kernel/printk/printk.c
@@ -757,14 +757,10 @@ void __init setup_log_buf(int early)
757 return; 757 return;
758 758
759 if (early) { 759 if (early) {
760 unsigned long mem; 760 new_log_buf =
761 761 memblock_virt_alloc(new_log_buf_len, PAGE_SIZE);
762 mem = memblock_alloc(new_log_buf_len, PAGE_SIZE);
763 if (!mem)
764 return;
765 new_log_buf = __va(mem);
766 } else { 762 } else {
767 new_log_buf = alloc_bootmem_nopanic(new_log_buf_len); 763 new_log_buf = memblock_virt_alloc_nopanic(new_log_buf_len, 0);
768 } 764 }
769 765
770 if (unlikely(!new_log_buf)) { 766 if (unlikely(!new_log_buf)) {
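Both boot-time allocations above (the nosave region in snapshot.c and the early log buffer here) converge on the memblock_virt_alloc*() helpers instead of the older bootmem/memblock mix. A hedged sketch of the two flavors, assuming the semantics implied by the hunks: the plain variant never returns NULL (it panics on failure), while the _nopanic variant can fail and must be checked by the caller:

#include <linux/init.h>
#include <linux/types.h>
#include <linux/memblock.h>
#include <linux/mm.h>

/* Illustrative helper, not part of the patch. */
static void * __init early_buffer(phys_addr_t len, bool may_fail)
{
	if (!may_fail)
		return memblock_virt_alloc(len, PAGE_SIZE);	/* never NULL */

	return memblock_virt_alloc_nopanic(len, 0);		/* may be NULL */
}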
diff --git a/kernel/rcu/rcu.h b/kernel/rcu/rcu.h
index 7859a0a3951e..79c3877e9c5b 100644
--- a/kernel/rcu/rcu.h
+++ b/kernel/rcu/rcu.h
@@ -96,19 +96,22 @@ static inline void debug_rcu_head_unqueue(struct rcu_head *head)
96} 96}
97#endif /* #else !CONFIG_DEBUG_OBJECTS_RCU_HEAD */ 97#endif /* #else !CONFIG_DEBUG_OBJECTS_RCU_HEAD */
98 98
99extern void kfree(const void *); 99void kfree(const void *);
100 100
101static inline bool __rcu_reclaim(const char *rn, struct rcu_head *head) 101static inline bool __rcu_reclaim(const char *rn, struct rcu_head *head)
102{ 102{
103 unsigned long offset = (unsigned long)head->func; 103 unsigned long offset = (unsigned long)head->func;
104 104
105 rcu_lock_acquire(&rcu_callback_map);
105 if (__is_kfree_rcu_offset(offset)) { 106 if (__is_kfree_rcu_offset(offset)) {
106 RCU_TRACE(trace_rcu_invoke_kfree_callback(rn, head, offset)); 107 RCU_TRACE(trace_rcu_invoke_kfree_callback(rn, head, offset));
107 kfree((void *)head - offset); 108 kfree((void *)head - offset);
109 rcu_lock_release(&rcu_callback_map);
108 return 1; 110 return 1;
109 } else { 111 } else {
110 RCU_TRACE(trace_rcu_invoke_callback(rn, head)); 112 RCU_TRACE(trace_rcu_invoke_callback(rn, head));
111 head->func(head); 113 head->func(head);
114 rcu_lock_release(&rcu_callback_map);
112 return 0; 115 return 0;
113 } 116 }
114} 117}
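For context, the offset test in __rcu_reclaim() works because kfree_rcu() stores the offset of the rcu_head within its container in place of a real callback pointer, so small "addresses" are known to be offsets rather than functions. A standalone userspace toy of the same encoding (all names hypothetical, threshold arbitrary):

#include <stddef.h>
#include <stdio.h>
#include <stdlib.h>

struct fake_head {
	void (*func)(struct fake_head *);
};

struct widget {
	int payload;
	struct fake_head rcu;
};

/* Offsets this small can never be valid function addresses. */
#define IS_KFREE_OFFSET(off) ((off) < 4096)

static void reclaim(struct fake_head *head)
{
	unsigned long offset = (unsigned long)head->func;

	if (IS_KFREE_OFFSET(offset))
		free((char *)head - offset);	/* recover and free the container */
	else
		head->func(head);		/* ordinary callback */
}

int main(void)
{
	struct widget *w = malloc(sizeof(*w));

	/* Encode "kfree me" as the member offset instead of a function. */
	w->rcu.func = (void (*)(struct fake_head *))offsetof(struct widget, rcu);
	reclaim(&w->rcu);
	puts("container freed via encoded offset");
	return 0;
}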
diff --git a/kernel/rcu/srcu.c b/kernel/rcu/srcu.c
index 01d5ccb8bfe3..3318d8284384 100644
--- a/kernel/rcu/srcu.c
+++ b/kernel/rcu/srcu.c
@@ -363,6 +363,29 @@ static void srcu_flip(struct srcu_struct *sp)
363/* 363/*
364 * Enqueue an SRCU callback on the specified srcu_struct structure, 364 * Enqueue an SRCU callback on the specified srcu_struct structure,
365 * initiating grace-period processing if it is not already running. 365 * initiating grace-period processing if it is not already running.
366 *
367 * Note that all CPUs must agree that the grace period extended beyond
368 * all pre-existing SRCU read-side critical section. On systems with
369 * more than one CPU, this means that when "func()" is invoked, each CPU
370 * is guaranteed to have executed a full memory barrier since the end of
371 * its last corresponding SRCU read-side critical section whose beginning
372 * preceded the call to call_rcu(). It also means that each CPU executing
373 * an SRCU read-side critical section that continues beyond the start of
374 * "func()" must have executed a memory barrier after the call_rcu()
375 * but before the beginning of that SRCU read-side critical section.
376 * Note that these guarantees include CPUs that are offline, idle, or
377 * executing in user mode, as well as CPUs that are executing in the kernel.
378 *
379 * Furthermore, if CPU A invoked call_rcu() and CPU B invoked the
380 * resulting SRCU callback function "func()", then both CPU A and CPU
381 * B are guaranteed to execute a full memory barrier during the time
382 * interval between the call to call_rcu() and the invocation of "func()".
383 * This guarantee applies even if CPU A and CPU B are the same CPU (but
384 * again only if the system has more than one CPU).
385 *
386 * Of course, these guarantees apply only for invocations of call_srcu(),
387 * srcu_read_lock(), and srcu_read_unlock() that are all passed the same
388 * srcu_struct structure.
366 */ 389 */
367void call_srcu(struct srcu_struct *sp, struct rcu_head *head, 390void call_srcu(struct srcu_struct *sp, struct rcu_head *head,
368 void (*func)(struct rcu_head *head)) 391 void (*func)(struct rcu_head *head))
@@ -459,7 +482,30 @@ static void __synchronize_srcu(struct srcu_struct *sp, int trycount)
459 * Note that it is illegal to call synchronize_srcu() from the corresponding 482 * Note that it is illegal to call synchronize_srcu() from the corresponding
460 * SRCU read-side critical section; doing so will result in deadlock. 483 * SRCU read-side critical section; doing so will result in deadlock.
461 * However, it is perfectly legal to call synchronize_srcu() on one 484 * However, it is perfectly legal to call synchronize_srcu() on one
462 * srcu_struct from some other srcu_struct's read-side critical section. 485 * srcu_struct from some other srcu_struct's read-side critical section,
486 * as long as the resulting graph of srcu_structs is acyclic.
487 *
488 * There are memory-ordering constraints implied by synchronize_srcu().
489 * On systems with more than one CPU, when synchronize_srcu() returns,
490 * each CPU is guaranteed to have executed a full memory barrier since
491 * the end of its last corresponding SRCU-sched read-side critical section
492 * whose beginning preceded the call to synchronize_srcu(). In addition,
493 * each CPU having an SRCU read-side critical section that extends beyond
494 * the return from synchronize_srcu() is guaranteed to have executed a
495 * full memory barrier after the beginning of synchronize_srcu() and before
496 * the beginning of that SRCU read-side critical section. Note that these
497 * guarantees include CPUs that are offline, idle, or executing in user mode,
498 * as well as CPUs that are executing in the kernel.
499 *
500 * Furthermore, if CPU A invoked synchronize_srcu(), which returned
501 * to its caller on CPU B, then both CPU A and CPU B are guaranteed
502 * to have executed a full memory barrier during the execution of
503 * synchronize_srcu(). This guarantee applies even if CPU A and CPU B
504 * are the same CPU, but again only if the system has more than one CPU.
505 *
506 * Of course, these memory-ordering guarantees apply only when
507 * synchronize_srcu(), srcu_read_lock(), and srcu_read_unlock() are
508 * passed the same srcu_struct structure.
463 */ 509 */
464void synchronize_srcu(struct srcu_struct *sp) 510void synchronize_srcu(struct srcu_struct *sp)
465{ 511{
@@ -476,12 +522,8 @@ EXPORT_SYMBOL_GPL(synchronize_srcu);
476 * Wait for an SRCU grace period to elapse, but be more aggressive about 522 * Wait for an SRCU grace period to elapse, but be more aggressive about
477 * spinning rather than blocking when waiting. 523 * spinning rather than blocking when waiting.
478 * 524 *
479 * Note that it is also illegal to call synchronize_srcu_expedited() 525 * Note that synchronize_srcu_expedited() has the same deadlock and
480 * from the corresponding SRCU read-side critical section; 526 * memory-ordering properties as does synchronize_srcu().
481 * doing so will result in deadlock. However, it is perfectly legal
482 * to call synchronize_srcu_expedited() on one srcu_struct from some
483 * other srcu_struct's read-side critical section, as long as
484 * the resulting graph of srcu_structs is acyclic.
485 */ 527 */
486void synchronize_srcu_expedited(struct srcu_struct *sp) 528void synchronize_srcu_expedited(struct srcu_struct *sp)
487{ 529{
@@ -491,6 +533,7 @@ EXPORT_SYMBOL_GPL(synchronize_srcu_expedited);
491 533
492/** 534/**
493 * srcu_barrier - Wait until all in-flight call_srcu() callbacks complete. 535 * srcu_barrier - Wait until all in-flight call_srcu() callbacks complete.
536 * @sp: srcu_struct on which to wait for in-flight callbacks.
494 */ 537 */
495void srcu_barrier(struct srcu_struct *sp) 538void srcu_barrier(struct srcu_struct *sp)
496{ 539{
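A hedged sketch of the usage pattern the new call_srcu()/synchronize_srcu() comments describe: readers bracket access with srcu_read_lock()/srcu_read_unlock() on one srcu_struct, and the updater publishes a new pointer and waits for pre-existing readers on that same srcu_struct before freeing the old one. Names and structure below are illustrative, not taken from this file:

#include <linux/srcu.h>
#include <linux/rcupdate.h>
#include <linux/slab.h>

struct cfg {
	int val;
};

static struct cfg __rcu *cur_cfg;
DEFINE_SRCU(cfg_srcu);

static int read_cfg(void)
{
	int idx, val;

	idx = srcu_read_lock(&cfg_srcu);
	val = srcu_dereference(cur_cfg, &cfg_srcu)->val;
	srcu_read_unlock(&cfg_srcu, idx);
	return val;
}

static void update_cfg(struct cfg *newc)
{
	struct cfg *old = rcu_dereference_protected(cur_cfg, 1);

	rcu_assign_pointer(cur_cfg, newc);
	synchronize_srcu(&cfg_srcu);	/* pre-existing readers are done */
	kfree(old);
}

The synchronize_srcu() call could equally be a call_srcu() whose callback does the kfree(), relying on the ordering guarantees documented in the hunks above.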
diff --git a/kernel/rcu/torture.c b/kernel/rcu/torture.c
index 3929cd451511..732f8ae3086a 100644
--- a/kernel/rcu/torture.c
+++ b/kernel/rcu/torture.c
@@ -139,8 +139,6 @@ MODULE_PARM_DESC(verbose, "Enable verbose debugging printk()s");
139#define VERBOSE_PRINTK_ERRSTRING(s) \ 139#define VERBOSE_PRINTK_ERRSTRING(s) \
140 do { if (verbose) pr_alert("%s" TORTURE_FLAG "!!! " s "\n", torture_type); } while (0) 140 do { if (verbose) pr_alert("%s" TORTURE_FLAG "!!! " s "\n", torture_type); } while (0)
141 141
142static char printk_buf[4096];
143
144static int nrealreaders; 142static int nrealreaders;
145static struct task_struct *writer_task; 143static struct task_struct *writer_task;
146static struct task_struct **fakewriter_tasks; 144static struct task_struct **fakewriter_tasks;
@@ -376,7 +374,7 @@ struct rcu_torture_ops {
376 void (*call)(struct rcu_head *head, void (*func)(struct rcu_head *rcu)); 374 void (*call)(struct rcu_head *head, void (*func)(struct rcu_head *rcu));
377 void (*cb_barrier)(void); 375 void (*cb_barrier)(void);
378 void (*fqs)(void); 376 void (*fqs)(void);
379 int (*stats)(char *page); 377 void (*stats)(char *page);
380 int irq_capable; 378 int irq_capable;
381 int can_boost; 379 int can_boost;
382 const char *name; 380 const char *name;
@@ -578,21 +576,19 @@ static void srcu_torture_barrier(void)
578 srcu_barrier(&srcu_ctl); 576 srcu_barrier(&srcu_ctl);
579} 577}
580 578
581static int srcu_torture_stats(char *page) 579static void srcu_torture_stats(char *page)
582{ 580{
583 int cnt = 0;
584 int cpu; 581 int cpu;
585 int idx = srcu_ctl.completed & 0x1; 582 int idx = srcu_ctl.completed & 0x1;
586 583
587 cnt += sprintf(&page[cnt], "%s%s per-CPU(idx=%d):", 584 page += sprintf(page, "%s%s per-CPU(idx=%d):",
588 torture_type, TORTURE_FLAG, idx); 585 torture_type, TORTURE_FLAG, idx);
589 for_each_possible_cpu(cpu) { 586 for_each_possible_cpu(cpu) {
590 cnt += sprintf(&page[cnt], " %d(%lu,%lu)", cpu, 587 page += sprintf(page, " %d(%lu,%lu)", cpu,
591 per_cpu_ptr(srcu_ctl.per_cpu_ref, cpu)->c[!idx], 588 per_cpu_ptr(srcu_ctl.per_cpu_ref, cpu)->c[!idx],
592 per_cpu_ptr(srcu_ctl.per_cpu_ref, cpu)->c[idx]); 589 per_cpu_ptr(srcu_ctl.per_cpu_ref, cpu)->c[idx]);
593 } 590 }
594 cnt += sprintf(&page[cnt], "\n"); 591 sprintf(page, "\n");
595 return cnt;
596} 592}
597 593
598static void srcu_torture_synchronize_expedited(void) 594static void srcu_torture_synchronize_expedited(void)
@@ -1052,10 +1048,9 @@ rcu_torture_reader(void *arg)
1052/* 1048/*
1053 * Create an RCU-torture statistics message in the specified buffer. 1049 * Create an RCU-torture statistics message in the specified buffer.
1054 */ 1050 */
1055static int 1051static void
1056rcu_torture_printk(char *page) 1052rcu_torture_printk(char *page)
1057{ 1053{
1058 int cnt = 0;
1059 int cpu; 1054 int cpu;
1060 int i; 1055 int i;
1061 long pipesummary[RCU_TORTURE_PIPE_LEN + 1] = { 0 }; 1056 long pipesummary[RCU_TORTURE_PIPE_LEN + 1] = { 0 };
@@ -1071,8 +1066,8 @@ rcu_torture_printk(char *page)
1071 if (pipesummary[i] != 0) 1066 if (pipesummary[i] != 0)
1072 break; 1067 break;
1073 } 1068 }
1074 cnt += sprintf(&page[cnt], "%s%s ", torture_type, TORTURE_FLAG); 1069 page += sprintf(page, "%s%s ", torture_type, TORTURE_FLAG);
1075 cnt += sprintf(&page[cnt], 1070 page += sprintf(page,
1076 "rtc: %p ver: %lu tfle: %d rta: %d rtaf: %d rtf: %d ", 1071 "rtc: %p ver: %lu tfle: %d rta: %d rtaf: %d rtf: %d ",
1077 rcu_torture_current, 1072 rcu_torture_current,
1078 rcu_torture_current_version, 1073 rcu_torture_current_version,
@@ -1080,53 +1075,52 @@ rcu_torture_printk(char *page)
1080 atomic_read(&n_rcu_torture_alloc), 1075 atomic_read(&n_rcu_torture_alloc),
1081 atomic_read(&n_rcu_torture_alloc_fail), 1076 atomic_read(&n_rcu_torture_alloc_fail),
1082 atomic_read(&n_rcu_torture_free)); 1077 atomic_read(&n_rcu_torture_free));
1083 cnt += sprintf(&page[cnt], "rtmbe: %d rtbke: %ld rtbre: %ld ", 1078 page += sprintf(page, "rtmbe: %d rtbke: %ld rtbre: %ld ",
1084 atomic_read(&n_rcu_torture_mberror), 1079 atomic_read(&n_rcu_torture_mberror),
1085 n_rcu_torture_boost_ktrerror, 1080 n_rcu_torture_boost_ktrerror,
1086 n_rcu_torture_boost_rterror); 1081 n_rcu_torture_boost_rterror);
1087 cnt += sprintf(&page[cnt], "rtbf: %ld rtb: %ld nt: %ld ", 1082 page += sprintf(page, "rtbf: %ld rtb: %ld nt: %ld ",
1088 n_rcu_torture_boost_failure, 1083 n_rcu_torture_boost_failure,
1089 n_rcu_torture_boosts, 1084 n_rcu_torture_boosts,
1090 n_rcu_torture_timers); 1085 n_rcu_torture_timers);
1091 cnt += sprintf(&page[cnt], 1086 page += sprintf(page,
1092 "onoff: %ld/%ld:%ld/%ld %d,%d:%d,%d %lu:%lu (HZ=%d) ", 1087 "onoff: %ld/%ld:%ld/%ld %d,%d:%d,%d %lu:%lu (HZ=%d) ",
1093 n_online_successes, n_online_attempts, 1088 n_online_successes, n_online_attempts,
1094 n_offline_successes, n_offline_attempts, 1089 n_offline_successes, n_offline_attempts,
1095 min_online, max_online, 1090 min_online, max_online,
1096 min_offline, max_offline, 1091 min_offline, max_offline,
1097 sum_online, sum_offline, HZ); 1092 sum_online, sum_offline, HZ);
1098 cnt += sprintf(&page[cnt], "barrier: %ld/%ld:%ld", 1093 page += sprintf(page, "barrier: %ld/%ld:%ld",
1099 n_barrier_successes, 1094 n_barrier_successes,
1100 n_barrier_attempts, 1095 n_barrier_attempts,
1101 n_rcu_torture_barrier_error); 1096 n_rcu_torture_barrier_error);
1102 cnt += sprintf(&page[cnt], "\n%s%s ", torture_type, TORTURE_FLAG); 1097 page += sprintf(page, "\n%s%s ", torture_type, TORTURE_FLAG);
1103 if (atomic_read(&n_rcu_torture_mberror) != 0 || 1098 if (atomic_read(&n_rcu_torture_mberror) != 0 ||
1104 n_rcu_torture_barrier_error != 0 || 1099 n_rcu_torture_barrier_error != 0 ||
1105 n_rcu_torture_boost_ktrerror != 0 || 1100 n_rcu_torture_boost_ktrerror != 0 ||
1106 n_rcu_torture_boost_rterror != 0 || 1101 n_rcu_torture_boost_rterror != 0 ||
1107 n_rcu_torture_boost_failure != 0 || 1102 n_rcu_torture_boost_failure != 0 ||
1108 i > 1) { 1103 i > 1) {
1109 cnt += sprintf(&page[cnt], "!!! "); 1104 page += sprintf(page, "!!! ");
1110 atomic_inc(&n_rcu_torture_error); 1105 atomic_inc(&n_rcu_torture_error);
1111 WARN_ON_ONCE(1); 1106 WARN_ON_ONCE(1);
1112 } 1107 }
1113 cnt += sprintf(&page[cnt], "Reader Pipe: "); 1108 page += sprintf(page, "Reader Pipe: ");
1114 for (i = 0; i < RCU_TORTURE_PIPE_LEN + 1; i++) 1109 for (i = 0; i < RCU_TORTURE_PIPE_LEN + 1; i++)
1115 cnt += sprintf(&page[cnt], " %ld", pipesummary[i]); 1110 page += sprintf(page, " %ld", pipesummary[i]);
1116 cnt += sprintf(&page[cnt], "\n%s%s ", torture_type, TORTURE_FLAG); 1111 page += sprintf(page, "\n%s%s ", torture_type, TORTURE_FLAG);
1117 cnt += sprintf(&page[cnt], "Reader Batch: "); 1112 page += sprintf(page, "Reader Batch: ");
1118 for (i = 0; i < RCU_TORTURE_PIPE_LEN + 1; i++) 1113 for (i = 0; i < RCU_TORTURE_PIPE_LEN + 1; i++)
1119 cnt += sprintf(&page[cnt], " %ld", batchsummary[i]); 1114 page += sprintf(page, " %ld", batchsummary[i]);
1120 cnt += sprintf(&page[cnt], "\n%s%s ", torture_type, TORTURE_FLAG); 1115 page += sprintf(page, "\n%s%s ", torture_type, TORTURE_FLAG);
1121 cnt += sprintf(&page[cnt], "Free-Block Circulation: "); 1116 page += sprintf(page, "Free-Block Circulation: ");
1122 for (i = 0; i < RCU_TORTURE_PIPE_LEN + 1; i++) { 1117 for (i = 0; i < RCU_TORTURE_PIPE_LEN + 1; i++) {
1123 cnt += sprintf(&page[cnt], " %d", 1118 page += sprintf(page, " %d",
1124 atomic_read(&rcu_torture_wcount[i])); 1119 atomic_read(&rcu_torture_wcount[i]));
1125 } 1120 }
1126 cnt += sprintf(&page[cnt], "\n"); 1121 page += sprintf(page, "\n");
1127 if (cur_ops->stats) 1122 if (cur_ops->stats)
1128 cnt += cur_ops->stats(&page[cnt]); 1123 cur_ops->stats(page);
1129 return cnt;
1130} 1124}
1131 1125
1132/* 1126/*
@@ -1140,10 +1134,17 @@ rcu_torture_printk(char *page)
1140static void 1134static void
1141rcu_torture_stats_print(void) 1135rcu_torture_stats_print(void)
1142{ 1136{
1143 int cnt; 1137 int size = nr_cpu_ids * 200 + 8192;
1138 char *buf;
1144 1139
1145 cnt = rcu_torture_printk(printk_buf); 1140 buf = kmalloc(size, GFP_KERNEL);
1146 pr_alert("%s", printk_buf); 1141 if (!buf) {
1142 pr_err("rcu-torture: Out of memory, need: %d", size);
1143 return;
1144 }
1145 rcu_torture_printk(buf);
1146 pr_alert("%s", buf);
1147 kfree(buf);
1147} 1148}
1148 1149
1149/* 1150/*
@@ -1578,6 +1579,7 @@ static int rcu_torture_barrier_cbs(void *arg)
1578{ 1579{
1579 long myid = (long)arg; 1580 long myid = (long)arg;
1580 bool lastphase = 0; 1581 bool lastphase = 0;
1582 bool newphase;
1581 struct rcu_head rcu; 1583 struct rcu_head rcu;
1582 1584
1583 init_rcu_head_on_stack(&rcu); 1585 init_rcu_head_on_stack(&rcu);
@@ -1585,10 +1587,11 @@ static int rcu_torture_barrier_cbs(void *arg)
1585 set_user_nice(current, 19); 1587 set_user_nice(current, 19);
1586 do { 1588 do {
1587 wait_event(barrier_cbs_wq[myid], 1589 wait_event(barrier_cbs_wq[myid],
1588 barrier_phase != lastphase || 1590 (newphase =
1591 ACCESS_ONCE(barrier_phase)) != lastphase ||
1589 kthread_should_stop() || 1592 kthread_should_stop() ||
1590 fullstop != FULLSTOP_DONTSTOP); 1593 fullstop != FULLSTOP_DONTSTOP);
1591 lastphase = barrier_phase; 1594 lastphase = newphase;
1592 smp_mb(); /* ensure barrier_phase load before ->call(). */ 1595 smp_mb(); /* ensure barrier_phase load before ->call(). */
1593 if (kthread_should_stop() || fullstop != FULLSTOP_DONTSTOP) 1596 if (kthread_should_stop() || fullstop != FULLSTOP_DONTSTOP)
1594 break; 1597 break;
@@ -1625,7 +1628,7 @@ static int rcu_torture_barrier(void *arg)
1625 if (kthread_should_stop() || fullstop != FULLSTOP_DONTSTOP) 1628 if (kthread_should_stop() || fullstop != FULLSTOP_DONTSTOP)
1626 break; 1629 break;
1627 n_barrier_attempts++; 1630 n_barrier_attempts++;
1628 cur_ops->cb_barrier(); 1631 cur_ops->cb_barrier(); /* Implies smp_mb() for wait_event(). */
1629 if (atomic_read(&barrier_cbs_invoked) != n_barrier_cbs) { 1632 if (atomic_read(&barrier_cbs_invoked) != n_barrier_cbs) {
1630 n_rcu_torture_barrier_error++; 1633 n_rcu_torture_barrier_error++;
1631 WARN_ON_ONCE(1); 1634 WARN_ON_ONCE(1);
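The stats conversion above drops the running cnt index in favor of advancing the destination pointer by sprintf()'s return value (the number of characters written, excluding the terminating NUL). A standalone illustration of the same pattern:

#include <stdio.h>

static void fill(char *page)
{
	page += sprintf(page, "counts:");
	for (int i = 0; i < 3; i++)
		page += sprintf(page, " %d", i * 10);
	sprintf(page, "\n");	/* final write, nothing needs the length */
}

int main(void)
{
	char buf[64];

	fill(buf);
	fputs(buf, stdout);	/* prints "counts: 0 10 20" */
	return 0;
}

Since the callers no longer learn how much was written, the buffer has to be sized generously up front, which is what the nr_cpu_ids * 200 + 8192 kmalloc() in rcu_torture_stats_print() above provides.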
diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index dd081987a8ec..b3d116cd072d 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -369,6 +369,9 @@ static struct rcu_node *rcu_get_root(struct rcu_state *rsp)
369static void rcu_eqs_enter_common(struct rcu_dynticks *rdtp, long long oldval, 369static void rcu_eqs_enter_common(struct rcu_dynticks *rdtp, long long oldval,
370 bool user) 370 bool user)
371{ 371{
372 struct rcu_state *rsp;
373 struct rcu_data *rdp;
374
372 trace_rcu_dyntick(TPS("Start"), oldval, rdtp->dynticks_nesting); 375 trace_rcu_dyntick(TPS("Start"), oldval, rdtp->dynticks_nesting);
373 if (!user && !is_idle_task(current)) { 376 if (!user && !is_idle_task(current)) {
374 struct task_struct *idle __maybe_unused = 377 struct task_struct *idle __maybe_unused =
@@ -380,6 +383,10 @@ static void rcu_eqs_enter_common(struct rcu_dynticks *rdtp, long long oldval,
380 current->pid, current->comm, 383 current->pid, current->comm,
381 idle->pid, idle->comm); /* must be idle task! */ 384 idle->pid, idle->comm); /* must be idle task! */
382 } 385 }
386 for_each_rcu_flavor(rsp) {
387 rdp = this_cpu_ptr(rsp->rda);
388 do_nocb_deferred_wakeup(rdp);
389 }
383 rcu_prepare_for_idle(smp_processor_id()); 390 rcu_prepare_for_idle(smp_processor_id());
384 /* CPUs seeing atomic_inc() must see prior RCU read-side crit sects */ 391 /* CPUs seeing atomic_inc() must see prior RCU read-side crit sects */
385 smp_mb__before_atomic_inc(); /* See above. */ 392 smp_mb__before_atomic_inc(); /* See above. */
@@ -411,11 +418,12 @@ static void rcu_eqs_enter(bool user)
411 rdtp = this_cpu_ptr(&rcu_dynticks); 418 rdtp = this_cpu_ptr(&rcu_dynticks);
412 oldval = rdtp->dynticks_nesting; 419 oldval = rdtp->dynticks_nesting;
413 WARN_ON_ONCE((oldval & DYNTICK_TASK_NEST_MASK) == 0); 420 WARN_ON_ONCE((oldval & DYNTICK_TASK_NEST_MASK) == 0);
414 if ((oldval & DYNTICK_TASK_NEST_MASK) == DYNTICK_TASK_NEST_VALUE) 421 if ((oldval & DYNTICK_TASK_NEST_MASK) == DYNTICK_TASK_NEST_VALUE) {
415 rdtp->dynticks_nesting = 0; 422 rdtp->dynticks_nesting = 0;
416 else 423 rcu_eqs_enter_common(rdtp, oldval, user);
424 } else {
417 rdtp->dynticks_nesting -= DYNTICK_TASK_NEST_VALUE; 425 rdtp->dynticks_nesting -= DYNTICK_TASK_NEST_VALUE;
418 rcu_eqs_enter_common(rdtp, oldval, user); 426 }
419} 427}
420 428
421/** 429/**
@@ -533,11 +541,12 @@ static void rcu_eqs_exit(bool user)
533 rdtp = this_cpu_ptr(&rcu_dynticks); 541 rdtp = this_cpu_ptr(&rcu_dynticks);
534 oldval = rdtp->dynticks_nesting; 542 oldval = rdtp->dynticks_nesting;
535 WARN_ON_ONCE(oldval < 0); 543 WARN_ON_ONCE(oldval < 0);
536 if (oldval & DYNTICK_TASK_NEST_MASK) 544 if (oldval & DYNTICK_TASK_NEST_MASK) {
537 rdtp->dynticks_nesting += DYNTICK_TASK_NEST_VALUE; 545 rdtp->dynticks_nesting += DYNTICK_TASK_NEST_VALUE;
538 else 546 } else {
539 rdtp->dynticks_nesting = DYNTICK_TASK_EXIT_IDLE; 547 rdtp->dynticks_nesting = DYNTICK_TASK_EXIT_IDLE;
540 rcu_eqs_exit_common(rdtp, oldval, user); 548 rcu_eqs_exit_common(rdtp, oldval, user);
549 }
541} 550}
542 551
543/** 552/**
@@ -716,7 +725,7 @@ bool rcu_lockdep_current_cpu_online(void)
716 bool ret; 725 bool ret;
717 726
718 if (in_nmi()) 727 if (in_nmi())
719 return 1; 728 return true;
720 preempt_disable(); 729 preempt_disable();
721 rdp = this_cpu_ptr(&rcu_sched_data); 730 rdp = this_cpu_ptr(&rcu_sched_data);
722 rnp = rdp->mynode; 731 rnp = rdp->mynode;
@@ -755,6 +764,12 @@ static int dyntick_save_progress_counter(struct rcu_data *rdp,
755} 764}
756 765
757/* 766/*
767 * This function really isn't for public consumption, but RCU is special in
768 * that context switches can allow the state machine to make progress.
769 */
770extern void resched_cpu(int cpu);
771
772/*
758 * Return true if the specified CPU has passed through a quiescent 773 * Return true if the specified CPU has passed through a quiescent
759 * state by virtue of being in or having passed through an dynticks 774 * state by virtue of being in or having passed through an dynticks
760 * idle state since the last call to dyntick_save_progress_counter() 775 * idle state since the last call to dyntick_save_progress_counter()
@@ -812,16 +827,34 @@ static int rcu_implicit_dynticks_qs(struct rcu_data *rdp,
812 */ 827 */
813 rcu_kick_nohz_cpu(rdp->cpu); 828 rcu_kick_nohz_cpu(rdp->cpu);
814 829
830 /*
831 * Alternatively, the CPU might be running in the kernel
832 * for an extended period of time without a quiescent state.
833 * Attempt to force the CPU through the scheduler to gain the
834 * needed quiescent state, but only if the grace period has gone
835 * on for an uncommonly long time. If there are many stuck CPUs,
836 * we will beat on the first one until it gets unstuck, then move
837 * to the next. Only do this for the primary flavor of RCU.
838 */
839 if (rdp->rsp == rcu_state &&
840 ULONG_CMP_GE(ACCESS_ONCE(jiffies), rdp->rsp->jiffies_resched)) {
841 rdp->rsp->jiffies_resched += 5;
842 resched_cpu(rdp->cpu);
843 }
844
815 return 0; 845 return 0;
816} 846}
817 847
818static void record_gp_stall_check_time(struct rcu_state *rsp) 848static void record_gp_stall_check_time(struct rcu_state *rsp)
819{ 849{
820 unsigned long j = ACCESS_ONCE(jiffies); 850 unsigned long j = ACCESS_ONCE(jiffies);
851 unsigned long j1;
821 852
822 rsp->gp_start = j; 853 rsp->gp_start = j;
823 smp_wmb(); /* Record start time before stall time. */ 854 smp_wmb(); /* Record start time before stall time. */
824 rsp->jiffies_stall = j + rcu_jiffies_till_stall_check(); 855 j1 = rcu_jiffies_till_stall_check();
856 rsp->jiffies_stall = j + j1;
857 rsp->jiffies_resched = j + j1 / 2;
825} 858}
826 859
827/* 860/*
@@ -1133,8 +1166,10 @@ rcu_start_future_gp(struct rcu_node *rnp, struct rcu_data *rdp)
1133 * hold it, acquire the root rcu_node structure's lock in order to 1166 * hold it, acquire the root rcu_node structure's lock in order to
1134 * start one (if needed). 1167 * start one (if needed).
1135 */ 1168 */
1136 if (rnp != rnp_root) 1169 if (rnp != rnp_root) {
1137 raw_spin_lock(&rnp_root->lock); 1170 raw_spin_lock(&rnp_root->lock);
1171 smp_mb__after_unlock_lock();
1172 }
1138 1173
1139 /* 1174 /*
1140 * Get a new grace-period number. If there really is no grace 1175 * Get a new grace-period number. If there really is no grace
@@ -1354,6 +1389,7 @@ static void note_gp_changes(struct rcu_state *rsp, struct rcu_data *rdp)
1354 local_irq_restore(flags); 1389 local_irq_restore(flags);
1355 return; 1390 return;
1356 } 1391 }
1392 smp_mb__after_unlock_lock();
1357 __note_gp_changes(rsp, rnp, rdp); 1393 __note_gp_changes(rsp, rnp, rdp);
1358 raw_spin_unlock_irqrestore(&rnp->lock, flags); 1394 raw_spin_unlock_irqrestore(&rnp->lock, flags);
1359} 1395}
@@ -1368,6 +1404,7 @@ static int rcu_gp_init(struct rcu_state *rsp)
1368 1404
1369 rcu_bind_gp_kthread(); 1405 rcu_bind_gp_kthread();
1370 raw_spin_lock_irq(&rnp->lock); 1406 raw_spin_lock_irq(&rnp->lock);
1407 smp_mb__after_unlock_lock();
1371 if (rsp->gp_flags == 0) { 1408 if (rsp->gp_flags == 0) {
1372 /* Spurious wakeup, tell caller to go back to sleep. */ 1409 /* Spurious wakeup, tell caller to go back to sleep. */
1373 raw_spin_unlock_irq(&rnp->lock); 1410 raw_spin_unlock_irq(&rnp->lock);
@@ -1409,6 +1446,7 @@ static int rcu_gp_init(struct rcu_state *rsp)
1409 */ 1446 */
1410 rcu_for_each_node_breadth_first(rsp, rnp) { 1447 rcu_for_each_node_breadth_first(rsp, rnp) {
1411 raw_spin_lock_irq(&rnp->lock); 1448 raw_spin_lock_irq(&rnp->lock);
1449 smp_mb__after_unlock_lock();
1412 rdp = this_cpu_ptr(rsp->rda); 1450 rdp = this_cpu_ptr(rsp->rda);
1413 rcu_preempt_check_blocked_tasks(rnp); 1451 rcu_preempt_check_blocked_tasks(rnp);
1414 rnp->qsmask = rnp->qsmaskinit; 1452 rnp->qsmask = rnp->qsmaskinit;
@@ -1463,6 +1501,7 @@ static int rcu_gp_fqs(struct rcu_state *rsp, int fqs_state_in)
1463 /* Clear flag to prevent immediate re-entry. */ 1501 /* Clear flag to prevent immediate re-entry. */
1464 if (ACCESS_ONCE(rsp->gp_flags) & RCU_GP_FLAG_FQS) { 1502 if (ACCESS_ONCE(rsp->gp_flags) & RCU_GP_FLAG_FQS) {
1465 raw_spin_lock_irq(&rnp->lock); 1503 raw_spin_lock_irq(&rnp->lock);
1504 smp_mb__after_unlock_lock();
1466 rsp->gp_flags &= ~RCU_GP_FLAG_FQS; 1505 rsp->gp_flags &= ~RCU_GP_FLAG_FQS;
1467 raw_spin_unlock_irq(&rnp->lock); 1506 raw_spin_unlock_irq(&rnp->lock);
1468 } 1507 }
@@ -1480,6 +1519,7 @@ static void rcu_gp_cleanup(struct rcu_state *rsp)
1480 struct rcu_node *rnp = rcu_get_root(rsp); 1519 struct rcu_node *rnp = rcu_get_root(rsp);
1481 1520
1482 raw_spin_lock_irq(&rnp->lock); 1521 raw_spin_lock_irq(&rnp->lock);
1522 smp_mb__after_unlock_lock();
1483 gp_duration = jiffies - rsp->gp_start; 1523 gp_duration = jiffies - rsp->gp_start;
1484 if (gp_duration > rsp->gp_max) 1524 if (gp_duration > rsp->gp_max)
1485 rsp->gp_max = gp_duration; 1525 rsp->gp_max = gp_duration;
@@ -1505,16 +1545,19 @@ static void rcu_gp_cleanup(struct rcu_state *rsp)
1505 */ 1545 */
1506 rcu_for_each_node_breadth_first(rsp, rnp) { 1546 rcu_for_each_node_breadth_first(rsp, rnp) {
1507 raw_spin_lock_irq(&rnp->lock); 1547 raw_spin_lock_irq(&rnp->lock);
1548 smp_mb__after_unlock_lock();
1508 ACCESS_ONCE(rnp->completed) = rsp->gpnum; 1549 ACCESS_ONCE(rnp->completed) = rsp->gpnum;
1509 rdp = this_cpu_ptr(rsp->rda); 1550 rdp = this_cpu_ptr(rsp->rda);
1510 if (rnp == rdp->mynode) 1551 if (rnp == rdp->mynode)
1511 __note_gp_changes(rsp, rnp, rdp); 1552 __note_gp_changes(rsp, rnp, rdp);
1553 /* smp_mb() provided by prior unlock-lock pair. */
1512 nocb += rcu_future_gp_cleanup(rsp, rnp); 1554 nocb += rcu_future_gp_cleanup(rsp, rnp);
1513 raw_spin_unlock_irq(&rnp->lock); 1555 raw_spin_unlock_irq(&rnp->lock);
1514 cond_resched(); 1556 cond_resched();
1515 } 1557 }
1516 rnp = rcu_get_root(rsp); 1558 rnp = rcu_get_root(rsp);
1517 raw_spin_lock_irq(&rnp->lock); 1559 raw_spin_lock_irq(&rnp->lock);
1560 smp_mb__after_unlock_lock();
1518 rcu_nocb_gp_set(rnp, nocb); 1561 rcu_nocb_gp_set(rnp, nocb);
1519 1562
1520 rsp->completed = rsp->gpnum; /* Declare grace period done. */ 1563 rsp->completed = rsp->gpnum; /* Declare grace period done. */
@@ -1553,6 +1596,7 @@ static int __noreturn rcu_gp_kthread(void *arg)
1553 wait_event_interruptible(rsp->gp_wq, 1596 wait_event_interruptible(rsp->gp_wq,
1554 ACCESS_ONCE(rsp->gp_flags) & 1597 ACCESS_ONCE(rsp->gp_flags) &
1555 RCU_GP_FLAG_INIT); 1598 RCU_GP_FLAG_INIT);
1599 /* Locking provides needed memory barrier. */
1556 if (rcu_gp_init(rsp)) 1600 if (rcu_gp_init(rsp))
1557 break; 1601 break;
1558 cond_resched(); 1602 cond_resched();
@@ -1582,6 +1626,7 @@ static int __noreturn rcu_gp_kthread(void *arg)
1582 (!ACCESS_ONCE(rnp->qsmask) && 1626 (!ACCESS_ONCE(rnp->qsmask) &&
1583 !rcu_preempt_blocked_readers_cgp(rnp)), 1627 !rcu_preempt_blocked_readers_cgp(rnp)),
1584 j); 1628 j);
1629 /* Locking provides needed memory barriers. */
1585 /* If grace period done, leave loop. */ 1630 /* If grace period done, leave loop. */
1586 if (!ACCESS_ONCE(rnp->qsmask) && 1631 if (!ACCESS_ONCE(rnp->qsmask) &&
1587 !rcu_preempt_blocked_readers_cgp(rnp)) 1632 !rcu_preempt_blocked_readers_cgp(rnp))
@@ -1749,6 +1794,7 @@ rcu_report_qs_rnp(unsigned long mask, struct rcu_state *rsp,
1749 rnp_c = rnp; 1794 rnp_c = rnp;
1750 rnp = rnp->parent; 1795 rnp = rnp->parent;
1751 raw_spin_lock_irqsave(&rnp->lock, flags); 1796 raw_spin_lock_irqsave(&rnp->lock, flags);
1797 smp_mb__after_unlock_lock();
1752 WARN_ON_ONCE(rnp_c->qsmask); 1798 WARN_ON_ONCE(rnp_c->qsmask);
1753 } 1799 }
1754 1800
@@ -1778,6 +1824,7 @@ rcu_report_qs_rdp(int cpu, struct rcu_state *rsp, struct rcu_data *rdp)
1778 1824
1779 rnp = rdp->mynode; 1825 rnp = rdp->mynode;
1780 raw_spin_lock_irqsave(&rnp->lock, flags); 1826 raw_spin_lock_irqsave(&rnp->lock, flags);
1827 smp_mb__after_unlock_lock();
1781 if (rdp->passed_quiesce == 0 || rdp->gpnum != rnp->gpnum || 1828 if (rdp->passed_quiesce == 0 || rdp->gpnum != rnp->gpnum ||
1782 rnp->completed == rnp->gpnum) { 1829 rnp->completed == rnp->gpnum) {
1783 1830
@@ -1901,13 +1948,13 @@ rcu_send_cbs_to_orphanage(int cpu, struct rcu_state *rsp,
1901 * Adopt the RCU callbacks from the specified rcu_state structure's 1948 * Adopt the RCU callbacks from the specified rcu_state structure's
1902 * orphanage. The caller must hold the ->orphan_lock. 1949 * orphanage. The caller must hold the ->orphan_lock.
1903 */ 1950 */
1904static void rcu_adopt_orphan_cbs(struct rcu_state *rsp) 1951static void rcu_adopt_orphan_cbs(struct rcu_state *rsp, unsigned long flags)
1905{ 1952{
1906 int i; 1953 int i;
1907 struct rcu_data *rdp = __this_cpu_ptr(rsp->rda); 1954 struct rcu_data *rdp = __this_cpu_ptr(rsp->rda);
1908 1955
1909 /* No-CBs CPUs are handled specially. */ 1956 /* No-CBs CPUs are handled specially. */
1910 if (rcu_nocb_adopt_orphan_cbs(rsp, rdp)) 1957 if (rcu_nocb_adopt_orphan_cbs(rsp, rdp, flags))
1911 return; 1958 return;
1912 1959
1913 /* Do the accounting first. */ 1960 /* Do the accounting first. */
@@ -1986,12 +2033,13 @@ static void rcu_cleanup_dead_cpu(int cpu, struct rcu_state *rsp)
1986 2033
1987 /* Orphan the dead CPU's callbacks, and adopt them if appropriate. */ 2034 /* Orphan the dead CPU's callbacks, and adopt them if appropriate. */
1988 rcu_send_cbs_to_orphanage(cpu, rsp, rnp, rdp); 2035 rcu_send_cbs_to_orphanage(cpu, rsp, rnp, rdp);
1989 rcu_adopt_orphan_cbs(rsp); 2036 rcu_adopt_orphan_cbs(rsp, flags);
1990 2037
1991 /* Remove the outgoing CPU from the masks in the rcu_node hierarchy. */ 2038 /* Remove the outgoing CPU from the masks in the rcu_node hierarchy. */
1992 mask = rdp->grpmask; /* rnp->grplo is constant. */ 2039 mask = rdp->grpmask; /* rnp->grplo is constant. */
1993 do { 2040 do {
1994 raw_spin_lock(&rnp->lock); /* irqs already disabled. */ 2041 raw_spin_lock(&rnp->lock); /* irqs already disabled. */
2042 smp_mb__after_unlock_lock();
1995 rnp->qsmaskinit &= ~mask; 2043 rnp->qsmaskinit &= ~mask;
1996 if (rnp->qsmaskinit != 0) { 2044 if (rnp->qsmaskinit != 0) {
1997 if (rnp != rdp->mynode) 2045 if (rnp != rdp->mynode)
@@ -2202,6 +2250,7 @@ static void force_qs_rnp(struct rcu_state *rsp,
2202 cond_resched(); 2250 cond_resched();
2203 mask = 0; 2251 mask = 0;
2204 raw_spin_lock_irqsave(&rnp->lock, flags); 2252 raw_spin_lock_irqsave(&rnp->lock, flags);
2253 smp_mb__after_unlock_lock();
2205 if (!rcu_gp_in_progress(rsp)) { 2254 if (!rcu_gp_in_progress(rsp)) {
2206 raw_spin_unlock_irqrestore(&rnp->lock, flags); 2255 raw_spin_unlock_irqrestore(&rnp->lock, flags);
2207 return; 2256 return;
@@ -2231,6 +2280,7 @@ static void force_qs_rnp(struct rcu_state *rsp,
2231 rnp = rcu_get_root(rsp); 2280 rnp = rcu_get_root(rsp);
2232 if (rnp->qsmask == 0) { 2281 if (rnp->qsmask == 0) {
2233 raw_spin_lock_irqsave(&rnp->lock, flags); 2282 raw_spin_lock_irqsave(&rnp->lock, flags);
2283 smp_mb__after_unlock_lock();
2234 rcu_initiate_boost(rnp, flags); /* releases rnp->lock. */ 2284 rcu_initiate_boost(rnp, flags); /* releases rnp->lock. */
2235 } 2285 }
2236} 2286}
@@ -2263,6 +2313,7 @@ static void force_quiescent_state(struct rcu_state *rsp)
2263 2313
2264 /* Reached the root of the rcu_node tree, acquire lock. */ 2314 /* Reached the root of the rcu_node tree, acquire lock. */
2265 raw_spin_lock_irqsave(&rnp_old->lock, flags); 2315 raw_spin_lock_irqsave(&rnp_old->lock, flags);
2316 smp_mb__after_unlock_lock();
2266 raw_spin_unlock(&rnp_old->fqslock); 2317 raw_spin_unlock(&rnp_old->fqslock);
2267 if (ACCESS_ONCE(rsp->gp_flags) & RCU_GP_FLAG_FQS) { 2318 if (ACCESS_ONCE(rsp->gp_flags) & RCU_GP_FLAG_FQS) {
2268 rsp->n_force_qs_lh++; 2319 rsp->n_force_qs_lh++;
@@ -2303,6 +2354,9 @@ __rcu_process_callbacks(struct rcu_state *rsp)
2303 /* If there are callbacks ready, invoke them. */ 2354 /* If there are callbacks ready, invoke them. */
2304 if (cpu_has_callbacks_ready_to_invoke(rdp)) 2355 if (cpu_has_callbacks_ready_to_invoke(rdp))
2305 invoke_rcu_callbacks(rsp, rdp); 2356 invoke_rcu_callbacks(rsp, rdp);
2357
2358 /* Do any needed deferred wakeups of rcuo kthreads. */
2359 do_nocb_deferred_wakeup(rdp);
2306} 2360}
2307 2361
2308/* 2362/*
@@ -2378,6 +2432,7 @@ static void __call_rcu_core(struct rcu_state *rsp, struct rcu_data *rdp,
2378 struct rcu_node *rnp_root = rcu_get_root(rsp); 2432 struct rcu_node *rnp_root = rcu_get_root(rsp);
2379 2433
2380 raw_spin_lock(&rnp_root->lock); 2434 raw_spin_lock(&rnp_root->lock);
2435 smp_mb__after_unlock_lock();
2381 rcu_start_gp(rsp); 2436 rcu_start_gp(rsp);
2382 raw_spin_unlock(&rnp_root->lock); 2437 raw_spin_unlock(&rnp_root->lock);
2383 } else { 2438 } else {
@@ -2437,7 +2492,7 @@ __call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu),
2437 2492
2438 if (cpu != -1) 2493 if (cpu != -1)
2439 rdp = per_cpu_ptr(rsp->rda, cpu); 2494 rdp = per_cpu_ptr(rsp->rda, cpu);
2440 offline = !__call_rcu_nocb(rdp, head, lazy); 2495 offline = !__call_rcu_nocb(rdp, head, lazy, flags);
2441 WARN_ON_ONCE(offline); 2496 WARN_ON_ONCE(offline);
2442 /* _call_rcu() is illegal on offline CPU; leak the callback. */ 2497 /* _call_rcu() is illegal on offline CPU; leak the callback. */
2443 local_irq_restore(flags); 2498 local_irq_restore(flags);
@@ -2757,6 +2812,10 @@ static int __rcu_pending(struct rcu_state *rsp, struct rcu_data *rdp)
2757 /* Check for CPU stalls, if enabled. */ 2812 /* Check for CPU stalls, if enabled. */
2758 check_cpu_stall(rsp, rdp); 2813 check_cpu_stall(rsp, rdp);
2759 2814
2815 /* Is this CPU a NO_HZ_FULL CPU that should ignore RCU? */
2816 if (rcu_nohz_full_cpu(rsp))
2817 return 0;
2818
2760 /* Is the RCU core waiting for a quiescent state from this CPU? */ 2819 /* Is the RCU core waiting for a quiescent state from this CPU? */
2761 if (rcu_scheduler_fully_active && 2820 if (rcu_scheduler_fully_active &&
2762 rdp->qs_pending && !rdp->passed_quiesce) { 2821 rdp->qs_pending && !rdp->passed_quiesce) {
@@ -2790,6 +2849,12 @@ static int __rcu_pending(struct rcu_state *rsp, struct rcu_data *rdp)
2790 return 1; 2849 return 1;
2791 } 2850 }
2792 2851
2852 /* Does this CPU need a deferred NOCB wakeup? */
2853 if (rcu_nocb_need_deferred_wakeup(rdp)) {
2854 rdp->n_rp_nocb_defer_wakeup++;
2855 return 1;
2856 }
2857
2793 /* nothing to do */ 2858 /* nothing to do */
2794 rdp->n_rp_need_nothing++; 2859 rdp->n_rp_need_nothing++;
2795 return 0; 2860 return 0;
@@ -3214,9 +3279,9 @@ static void __init rcu_init_levelspread(struct rcu_state *rsp)
3214{ 3279{
3215 int i; 3280 int i;
3216 3281
3217 for (i = rcu_num_lvls - 1; i > 0; i--) 3282 rsp->levelspread[rcu_num_lvls - 1] = rcu_fanout_leaf;
3283 for (i = rcu_num_lvls - 2; i >= 0; i--)
3218 rsp->levelspread[i] = CONFIG_RCU_FANOUT; 3284 rsp->levelspread[i] = CONFIG_RCU_FANOUT;
3219 rsp->levelspread[0] = rcu_fanout_leaf;
3220} 3285}
3221#else /* #ifdef CONFIG_RCU_FANOUT_EXACT */ 3286#else /* #ifdef CONFIG_RCU_FANOUT_EXACT */
3222static void __init rcu_init_levelspread(struct rcu_state *rsp) 3287static void __init rcu_init_levelspread(struct rcu_state *rsp)
@@ -3346,6 +3411,8 @@ static void __init rcu_init_geometry(void)
3346 if (rcu_fanout_leaf == CONFIG_RCU_FANOUT_LEAF && 3411 if (rcu_fanout_leaf == CONFIG_RCU_FANOUT_LEAF &&
3347 nr_cpu_ids == NR_CPUS) 3412 nr_cpu_ids == NR_CPUS)
3348 return; 3413 return;
3414 pr_info("RCU: Adjusting geometry for rcu_fanout_leaf=%d, nr_cpu_ids=%d\n",
3415 rcu_fanout_leaf, nr_cpu_ids);
3349 3416
3350 /* 3417 /*
3351 * Compute number of nodes that can be handled an rcu_node tree 3418 * Compute number of nodes that can be handled an rcu_node tree
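One detail worth spelling out from the new stuck-CPU resched logic above: jiffies comparisons such as ULONG_CMP_GE(jiffies, rsp->jiffies_resched) must stay correct when the counter wraps, so they use unsigned subtraction rather than a plain >=. A small userspace model using the kernel's definition of ULONG_CMP_GE():

#include <limits.h>
#include <stdio.h>

/* Same definition as the kernel's: true iff a is not before b, wrap-safe. */
#define ULONG_CMP_GE(a, b) (ULONG_MAX / 2 >= (a) - (b))

int main(void)
{
	unsigned long now = ULONG_MAX - 5;	/* counter about to wrap */
	unsigned long deadline = now + 10;	/* wraps past zero */

	printf("plain >= : %d\n", now >= deadline);		/* 1: wrong */
	printf("wrap-safe: %d\n", ULONG_CMP_GE(now, deadline));	/* 0: right */
	return 0;
}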
diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h
index 52be957c9fe2..8c19873f1ac9 100644
--- a/kernel/rcu/tree.h
+++ b/kernel/rcu/tree.h
@@ -317,6 +317,7 @@ struct rcu_data {
317 unsigned long n_rp_cpu_needs_gp; 317 unsigned long n_rp_cpu_needs_gp;
318 unsigned long n_rp_gp_completed; 318 unsigned long n_rp_gp_completed;
319 unsigned long n_rp_gp_started; 319 unsigned long n_rp_gp_started;
320 unsigned long n_rp_nocb_defer_wakeup;
320 unsigned long n_rp_need_nothing; 321 unsigned long n_rp_need_nothing;
321 322
322 /* 6) _rcu_barrier() and OOM callbacks. */ 323 /* 6) _rcu_barrier() and OOM callbacks. */
@@ -335,6 +336,7 @@ struct rcu_data {
335 int nocb_p_count_lazy; /* (approximate). */ 336 int nocb_p_count_lazy; /* (approximate). */
336 wait_queue_head_t nocb_wq; /* For nocb kthreads to sleep on. */ 337 wait_queue_head_t nocb_wq; /* For nocb kthreads to sleep on. */
337 struct task_struct *nocb_kthread; 338 struct task_struct *nocb_kthread;
339 bool nocb_defer_wakeup; /* Defer wakeup of nocb_kthread. */
338#endif /* #ifdef CONFIG_RCU_NOCB_CPU */ 340#endif /* #ifdef CONFIG_RCU_NOCB_CPU */
339 341
340 /* 8) RCU CPU stall data. */ 342 /* 8) RCU CPU stall data. */
@@ -453,6 +455,8 @@ struct rcu_state {
453 /* but in jiffies. */ 455 /* but in jiffies. */
454 unsigned long jiffies_stall; /* Time at which to check */ 456 unsigned long jiffies_stall; /* Time at which to check */
455 /* for CPU stalls. */ 457 /* for CPU stalls. */
458 unsigned long jiffies_resched; /* Time at which to resched */
459 /* a reluctant CPU. */
456 unsigned long gp_max; /* Maximum GP duration in */ 460 unsigned long gp_max; /* Maximum GP duration in */
457 /* jiffies. */ 461 /* jiffies. */
458 const char *name; /* Name of structure. */ 462 const char *name; /* Name of structure. */
@@ -548,9 +552,12 @@ static void rcu_nocb_gp_set(struct rcu_node *rnp, int nrq);
548static void rcu_nocb_gp_cleanup(struct rcu_state *rsp, struct rcu_node *rnp); 552static void rcu_nocb_gp_cleanup(struct rcu_state *rsp, struct rcu_node *rnp);
549static void rcu_init_one_nocb(struct rcu_node *rnp); 553static void rcu_init_one_nocb(struct rcu_node *rnp);
550static bool __call_rcu_nocb(struct rcu_data *rdp, struct rcu_head *rhp, 554static bool __call_rcu_nocb(struct rcu_data *rdp, struct rcu_head *rhp,
551 bool lazy); 555 bool lazy, unsigned long flags);
552static bool rcu_nocb_adopt_orphan_cbs(struct rcu_state *rsp, 556static bool rcu_nocb_adopt_orphan_cbs(struct rcu_state *rsp,
553 struct rcu_data *rdp); 557 struct rcu_data *rdp,
558 unsigned long flags);
559static bool rcu_nocb_need_deferred_wakeup(struct rcu_data *rdp);
560static void do_nocb_deferred_wakeup(struct rcu_data *rdp);
554static void rcu_boot_init_nocb_percpu_data(struct rcu_data *rdp); 561static void rcu_boot_init_nocb_percpu_data(struct rcu_data *rdp);
555static void rcu_spawn_nocb_kthreads(struct rcu_state *rsp); 562static void rcu_spawn_nocb_kthreads(struct rcu_state *rsp);
556static void rcu_kick_nohz_cpu(int cpu); 563static void rcu_kick_nohz_cpu(int cpu);
@@ -564,6 +571,7 @@ static void rcu_sysidle_report_gp(struct rcu_state *rsp, int isidle,
564 unsigned long maxj); 571 unsigned long maxj);
565static void rcu_bind_gp_kthread(void); 572static void rcu_bind_gp_kthread(void);
566static void rcu_sysidle_init_percpu_data(struct rcu_dynticks *rdtp); 573static void rcu_sysidle_init_percpu_data(struct rcu_dynticks *rdtp);
574static bool rcu_nohz_full_cpu(struct rcu_state *rsp);
567 575
568#endif /* #ifndef RCU_TREE_NONCORE */ 576#endif /* #ifndef RCU_TREE_NONCORE */
569 577
diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h
index 6abb03dff5c0..6e2ef4b2b920 100644
--- a/kernel/rcu/tree_plugin.h
+++ b/kernel/rcu/tree_plugin.h
@@ -204,6 +204,7 @@ static void rcu_preempt_note_context_switch(int cpu)
204 rdp = per_cpu_ptr(rcu_preempt_state.rda, cpu); 204 rdp = per_cpu_ptr(rcu_preempt_state.rda, cpu);
205 rnp = rdp->mynode; 205 rnp = rdp->mynode;
206 raw_spin_lock_irqsave(&rnp->lock, flags); 206 raw_spin_lock_irqsave(&rnp->lock, flags);
207 smp_mb__after_unlock_lock();
207 t->rcu_read_unlock_special |= RCU_READ_UNLOCK_BLOCKED; 208 t->rcu_read_unlock_special |= RCU_READ_UNLOCK_BLOCKED;
208 t->rcu_blocked_node = rnp; 209 t->rcu_blocked_node = rnp;
209 210
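
Throughout these tree_plugin.h hunks an smp_mb__after_unlock_lock() is added immediately after each raw_spin_lock() of an rcu_node lock: on architectures where an UNLOCK followed by a LOCK is not already a full memory barrier (powerpc is the usual example), the extra primitive upgrades the pair into one. A minimal userspace-flavoured sketch of the idea, using POSIX locks and a C11 fence instead of the kernel primitives (the helper name is only a stand-in):

#include <pthread.h>
#include <stdatomic.h>

static pthread_mutex_t node_lock = PTHREAD_MUTEX_INITIALIZER;
static int node_state;

/* Illustrative stand-in for smp_mb__after_unlock_lock(); on platforms where
 * unlock+lock is already fully ordered this could compile to nothing. */
static inline void mb_after_unlock_lock(void)
{
	atomic_thread_fence(memory_order_seq_cst);
}

void touch_node(void)
{
	pthread_mutex_lock(&node_lock);
	mb_after_unlock_lock();	/* order prior critical sections against what follows */
	node_state++;		/* e.g. updates to ->qsmask, ->blkd_tasks, ... */
	pthread_mutex_unlock(&node_lock);
}
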
@@ -312,6 +313,7 @@ static void rcu_report_unblock_qs_rnp(struct rcu_node *rnp, unsigned long flags)
312 mask = rnp->grpmask; 313 mask = rnp->grpmask;
313 raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */ 314 raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */
314 raw_spin_lock(&rnp_p->lock); /* irqs already disabled. */ 315 raw_spin_lock(&rnp_p->lock); /* irqs already disabled. */
316 smp_mb__after_unlock_lock();
315 rcu_report_qs_rnp(mask, &rcu_preempt_state, rnp_p, flags); 317 rcu_report_qs_rnp(mask, &rcu_preempt_state, rnp_p, flags);
316} 318}
317 319
@@ -361,10 +363,14 @@ void rcu_read_unlock_special(struct task_struct *t)
361 special = t->rcu_read_unlock_special; 363 special = t->rcu_read_unlock_special;
362 if (special & RCU_READ_UNLOCK_NEED_QS) { 364 if (special & RCU_READ_UNLOCK_NEED_QS) {
363 rcu_preempt_qs(smp_processor_id()); 365 rcu_preempt_qs(smp_processor_id());
366 if (!t->rcu_read_unlock_special) {
367 local_irq_restore(flags);
368 return;
369 }
364 } 370 }
365 371
366 /* Hardware IRQ handlers cannot block. */ 372 /* Hardware IRQ handlers cannot block, complain if they get here. */
367 if (in_irq() || in_serving_softirq()) { 373 if (WARN_ON_ONCE(in_irq() || in_serving_softirq())) {
368 local_irq_restore(flags); 374 local_irq_restore(flags);
369 return; 375 return;
370 } 376 }
@@ -381,6 +387,7 @@ void rcu_read_unlock_special(struct task_struct *t)
381 for (;;) { 387 for (;;) {
382 rnp = t->rcu_blocked_node; 388 rnp = t->rcu_blocked_node;
383 raw_spin_lock(&rnp->lock); /* irqs already disabled. */ 389 raw_spin_lock(&rnp->lock); /* irqs already disabled. */
390 smp_mb__after_unlock_lock();
384 if (rnp == t->rcu_blocked_node) 391 if (rnp == t->rcu_blocked_node)
385 break; 392 break;
386 raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */ 393 raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */
@@ -605,6 +612,7 @@ static int rcu_preempt_offline_tasks(struct rcu_state *rsp,
605 while (!list_empty(lp)) { 612 while (!list_empty(lp)) {
606 t = list_entry(lp->next, typeof(*t), rcu_node_entry); 613 t = list_entry(lp->next, typeof(*t), rcu_node_entry);
607 raw_spin_lock(&rnp_root->lock); /* irqs already disabled */ 614 raw_spin_lock(&rnp_root->lock); /* irqs already disabled */
615 smp_mb__after_unlock_lock();
608 list_del(&t->rcu_node_entry); 616 list_del(&t->rcu_node_entry);
609 t->rcu_blocked_node = rnp_root; 617 t->rcu_blocked_node = rnp_root;
610 list_add(&t->rcu_node_entry, lp_root); 618 list_add(&t->rcu_node_entry, lp_root);
@@ -629,6 +637,7 @@ static int rcu_preempt_offline_tasks(struct rcu_state *rsp,
629 * in this case. 637 * in this case.
630 */ 638 */
631 raw_spin_lock(&rnp_root->lock); /* irqs already disabled */ 639 raw_spin_lock(&rnp_root->lock); /* irqs already disabled */
640 smp_mb__after_unlock_lock();
632 if (rnp_root->boost_tasks != NULL && 641 if (rnp_root->boost_tasks != NULL &&
633 rnp_root->boost_tasks != rnp_root->gp_tasks && 642 rnp_root->boost_tasks != rnp_root->gp_tasks &&
634 rnp_root->boost_tasks != rnp_root->exp_tasks) 643 rnp_root->boost_tasks != rnp_root->exp_tasks)
@@ -772,6 +781,7 @@ static void rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp,
772 unsigned long mask; 781 unsigned long mask;
773 782
774 raw_spin_lock_irqsave(&rnp->lock, flags); 783 raw_spin_lock_irqsave(&rnp->lock, flags);
784 smp_mb__after_unlock_lock();
775 for (;;) { 785 for (;;) {
776 if (!sync_rcu_preempt_exp_done(rnp)) { 786 if (!sync_rcu_preempt_exp_done(rnp)) {
777 raw_spin_unlock_irqrestore(&rnp->lock, flags); 787 raw_spin_unlock_irqrestore(&rnp->lock, flags);
@@ -779,14 +789,17 @@ static void rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp,
779 } 789 }
780 if (rnp->parent == NULL) { 790 if (rnp->parent == NULL) {
781 raw_spin_unlock_irqrestore(&rnp->lock, flags); 791 raw_spin_unlock_irqrestore(&rnp->lock, flags);
782 if (wake) 792 if (wake) {
793 smp_mb(); /* EGP done before wake_up(). */
783 wake_up(&sync_rcu_preempt_exp_wq); 794 wake_up(&sync_rcu_preempt_exp_wq);
795 }
784 break; 796 break;
785 } 797 }
786 mask = rnp->grpmask; 798 mask = rnp->grpmask;
787 raw_spin_unlock(&rnp->lock); /* irqs remain disabled */ 799 raw_spin_unlock(&rnp->lock); /* irqs remain disabled */
788 rnp = rnp->parent; 800 rnp = rnp->parent;
789 raw_spin_lock(&rnp->lock); /* irqs already disabled */ 801 raw_spin_lock(&rnp->lock); /* irqs already disabled */
802 smp_mb__after_unlock_lock();
790 rnp->expmask &= ~mask; 803 rnp->expmask &= ~mask;
791 } 804 }
792} 805}
@@ -806,6 +819,7 @@ sync_rcu_preempt_exp_init(struct rcu_state *rsp, struct rcu_node *rnp)
806 int must_wait = 0; 819 int must_wait = 0;
807 820
808 raw_spin_lock_irqsave(&rnp->lock, flags); 821 raw_spin_lock_irqsave(&rnp->lock, flags);
822 smp_mb__after_unlock_lock();
809 if (list_empty(&rnp->blkd_tasks)) { 823 if (list_empty(&rnp->blkd_tasks)) {
810 raw_spin_unlock_irqrestore(&rnp->lock, flags); 824 raw_spin_unlock_irqrestore(&rnp->lock, flags);
811 } else { 825 } else {
@@ -886,6 +900,7 @@ void synchronize_rcu_expedited(void)
886 /* Initialize ->expmask for all non-leaf rcu_node structures. */ 900 /* Initialize ->expmask for all non-leaf rcu_node structures. */
887 rcu_for_each_nonleaf_node_breadth_first(rsp, rnp) { 901 rcu_for_each_nonleaf_node_breadth_first(rsp, rnp) {
888 raw_spin_lock_irqsave(&rnp->lock, flags); 902 raw_spin_lock_irqsave(&rnp->lock, flags);
903 smp_mb__after_unlock_lock();
889 rnp->expmask = rnp->qsmaskinit; 904 rnp->expmask = rnp->qsmaskinit;
890 raw_spin_unlock_irqrestore(&rnp->lock, flags); 905 raw_spin_unlock_irqrestore(&rnp->lock, flags);
891 } 906 }
@@ -1191,6 +1206,7 @@ static int rcu_boost(struct rcu_node *rnp)
1191 return 0; /* Nothing left to boost. */ 1206 return 0; /* Nothing left to boost. */
1192 1207
1193 raw_spin_lock_irqsave(&rnp->lock, flags); 1208 raw_spin_lock_irqsave(&rnp->lock, flags);
1209 smp_mb__after_unlock_lock();
1194 1210
1195 /* 1211 /*
1196 * Recheck under the lock: all tasks in need of boosting 1212 * Recheck under the lock: all tasks in need of boosting
@@ -1377,6 +1393,7 @@ static int rcu_spawn_one_boost_kthread(struct rcu_state *rsp,
1377 if (IS_ERR(t)) 1393 if (IS_ERR(t))
1378 return PTR_ERR(t); 1394 return PTR_ERR(t);
1379 raw_spin_lock_irqsave(&rnp->lock, flags); 1395 raw_spin_lock_irqsave(&rnp->lock, flags);
1396 smp_mb__after_unlock_lock();
1380 rnp->boost_kthread_task = t; 1397 rnp->boost_kthread_task = t;
1381 raw_spin_unlock_irqrestore(&rnp->lock, flags); 1398 raw_spin_unlock_irqrestore(&rnp->lock, flags);
1382 sp.sched_priority = RCU_BOOST_PRIO; 1399 sp.sched_priority = RCU_BOOST_PRIO;
@@ -1632,7 +1649,7 @@ module_param(rcu_idle_gp_delay, int, 0644);
1632static int rcu_idle_lazy_gp_delay = RCU_IDLE_LAZY_GP_DELAY; 1649static int rcu_idle_lazy_gp_delay = RCU_IDLE_LAZY_GP_DELAY;
1633module_param(rcu_idle_lazy_gp_delay, int, 0644); 1650module_param(rcu_idle_lazy_gp_delay, int, 0644);
1634 1651
1635extern int tick_nohz_enabled; 1652extern int tick_nohz_active;
1636 1653
1637/* 1654/*
1638 * Try to advance callbacks for all flavors of RCU on the current CPU, but 1655 * Try to advance callbacks for all flavors of RCU on the current CPU, but
@@ -1729,7 +1746,7 @@ static void rcu_prepare_for_idle(int cpu)
1729 int tne; 1746 int tne;
1730 1747
1731 /* Handle nohz enablement switches conservatively. */ 1748 /* Handle nohz enablement switches conservatively. */
1732 tne = ACCESS_ONCE(tick_nohz_enabled); 1749 tne = ACCESS_ONCE(tick_nohz_active);
1733 if (tne != rdtp->tick_nohz_enabled_snap) { 1750 if (tne != rdtp->tick_nohz_enabled_snap) {
1734 if (rcu_cpu_has_callbacks(cpu, NULL)) 1751 if (rcu_cpu_has_callbacks(cpu, NULL))
1735 invoke_rcu_core(); /* force nohz to see update. */ 1752 invoke_rcu_core(); /* force nohz to see update. */
@@ -1769,6 +1786,7 @@ static void rcu_prepare_for_idle(int cpu)
1769 continue; 1786 continue;
1770 rnp = rdp->mynode; 1787 rnp = rdp->mynode;
1771 raw_spin_lock(&rnp->lock); /* irqs already disabled. */ 1788 raw_spin_lock(&rnp->lock); /* irqs already disabled. */
1789 smp_mb__after_unlock_lock();
1772 rcu_accelerate_cbs(rsp, rnp, rdp); 1790 rcu_accelerate_cbs(rsp, rnp, rdp);
1773 raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */ 1791 raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */
1774 } 1792 }
@@ -1852,6 +1870,7 @@ static int rcu_oom_notify(struct notifier_block *self,
1852 1870
1853 /* Wait for callbacks from earlier instance to complete. */ 1871 /* Wait for callbacks from earlier instance to complete. */
1854 wait_event(oom_callback_wq, atomic_read(&oom_callback_count) == 0); 1872 wait_event(oom_callback_wq, atomic_read(&oom_callback_count) == 0);
1873 smp_mb(); /* Ensure callback reuse happens after callback invocation. */
1855 1874
1856 /* 1875 /*
1857 * Prevent premature wakeup: ensure that all increments happen 1876 * Prevent premature wakeup: ensure that all increments happen
@@ -2101,7 +2120,8 @@ bool rcu_is_nocb_cpu(int cpu)
2101static void __call_rcu_nocb_enqueue(struct rcu_data *rdp, 2120static void __call_rcu_nocb_enqueue(struct rcu_data *rdp,
2102 struct rcu_head *rhp, 2121 struct rcu_head *rhp,
2103 struct rcu_head **rhtp, 2122 struct rcu_head **rhtp,
2104 int rhcount, int rhcount_lazy) 2123 int rhcount, int rhcount_lazy,
2124 unsigned long flags)
2105{ 2125{
2106 int len; 2126 int len;
2107 struct rcu_head **old_rhpp; 2127 struct rcu_head **old_rhpp;
@@ -2122,9 +2142,16 @@ static void __call_rcu_nocb_enqueue(struct rcu_data *rdp,
2122 } 2142 }
2123 len = atomic_long_read(&rdp->nocb_q_count); 2143 len = atomic_long_read(&rdp->nocb_q_count);
2124 if (old_rhpp == &rdp->nocb_head) { 2144 if (old_rhpp == &rdp->nocb_head) {
2125 wake_up(&rdp->nocb_wq); /* ... only if queue was empty ... */ 2145 if (!irqs_disabled_flags(flags)) {
2146 wake_up(&rdp->nocb_wq); /* ... if queue was empty ... */
2147 trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu,
2148 TPS("WakeEmpty"));
2149 } else {
2150 rdp->nocb_defer_wakeup = true;
2151 trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu,
2152 TPS("WakeEmptyIsDeferred"));
2153 }
2126 rdp->qlen_last_fqs_check = 0; 2154 rdp->qlen_last_fqs_check = 0;
2127 trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, TPS("WakeEmpty"));
2128 } else if (len > rdp->qlen_last_fqs_check + qhimark) { 2155 } else if (len > rdp->qlen_last_fqs_check + qhimark) {
2129 wake_up_process(t); /* ... or if many callbacks queued. */ 2156 wake_up_process(t); /* ... or if many callbacks queued. */
2130 rdp->qlen_last_fqs_check = LONG_MAX / 2; 2157 rdp->qlen_last_fqs_check = LONG_MAX / 2;
@@ -2145,12 +2172,12 @@ static void __call_rcu_nocb_enqueue(struct rcu_data *rdp,
2145 * "rcuo" kthread can find it. 2172 * "rcuo" kthread can find it.
2146 */ 2173 */
2147static bool __call_rcu_nocb(struct rcu_data *rdp, struct rcu_head *rhp, 2174static bool __call_rcu_nocb(struct rcu_data *rdp, struct rcu_head *rhp,
2148 bool lazy) 2175 bool lazy, unsigned long flags)
2149{ 2176{
2150 2177
2151 if (!rcu_is_nocb_cpu(rdp->cpu)) 2178 if (!rcu_is_nocb_cpu(rdp->cpu))
2152 return 0; 2179 return 0;
2153 __call_rcu_nocb_enqueue(rdp, rhp, &rhp->next, 1, lazy); 2180 __call_rcu_nocb_enqueue(rdp, rhp, &rhp->next, 1, lazy, flags);
2154 if (__is_kfree_rcu_offset((unsigned long)rhp->func)) 2181 if (__is_kfree_rcu_offset((unsigned long)rhp->func))
2155 trace_rcu_kfree_callback(rdp->rsp->name, rhp, 2182 trace_rcu_kfree_callback(rdp->rsp->name, rhp,
2156 (unsigned long)rhp->func, 2183 (unsigned long)rhp->func,
@@ -2168,7 +2195,8 @@ static bool __call_rcu_nocb(struct rcu_data *rdp, struct rcu_head *rhp,
2168 * not a no-CBs CPU. 2195 * not a no-CBs CPU.
2169 */ 2196 */
2170static bool __maybe_unused rcu_nocb_adopt_orphan_cbs(struct rcu_state *rsp, 2197static bool __maybe_unused rcu_nocb_adopt_orphan_cbs(struct rcu_state *rsp,
2171 struct rcu_data *rdp) 2198 struct rcu_data *rdp,
2199 unsigned long flags)
2172{ 2200{
2173 long ql = rsp->qlen; 2201 long ql = rsp->qlen;
2174 long qll = rsp->qlen_lazy; 2202 long qll = rsp->qlen_lazy;
@@ -2182,14 +2210,14 @@ static bool __maybe_unused rcu_nocb_adopt_orphan_cbs(struct rcu_state *rsp,
2182 /* First, enqueue the donelist, if any. This preserves CB ordering. */ 2210 /* First, enqueue the donelist, if any. This preserves CB ordering. */
2183 if (rsp->orphan_donelist != NULL) { 2211 if (rsp->orphan_donelist != NULL) {
2184 __call_rcu_nocb_enqueue(rdp, rsp->orphan_donelist, 2212 __call_rcu_nocb_enqueue(rdp, rsp->orphan_donelist,
2185 rsp->orphan_donetail, ql, qll); 2213 rsp->orphan_donetail, ql, qll, flags);
2186 ql = qll = 0; 2214 ql = qll = 0;
2187 rsp->orphan_donelist = NULL; 2215 rsp->orphan_donelist = NULL;
2188 rsp->orphan_donetail = &rsp->orphan_donelist; 2216 rsp->orphan_donetail = &rsp->orphan_donelist;
2189 } 2217 }
2190 if (rsp->orphan_nxtlist != NULL) { 2218 if (rsp->orphan_nxtlist != NULL) {
2191 __call_rcu_nocb_enqueue(rdp, rsp->orphan_nxtlist, 2219 __call_rcu_nocb_enqueue(rdp, rsp->orphan_nxtlist,
2192 rsp->orphan_nxttail, ql, qll); 2220 rsp->orphan_nxttail, ql, qll, flags);
2193 ql = qll = 0; 2221 ql = qll = 0;
2194 rsp->orphan_nxtlist = NULL; 2222 rsp->orphan_nxtlist = NULL;
2195 rsp->orphan_nxttail = &rsp->orphan_nxtlist; 2223 rsp->orphan_nxttail = &rsp->orphan_nxtlist;
@@ -2209,6 +2237,7 @@ static void rcu_nocb_wait_gp(struct rcu_data *rdp)
2209 struct rcu_node *rnp = rdp->mynode; 2237 struct rcu_node *rnp = rdp->mynode;
2210 2238
2211 raw_spin_lock_irqsave(&rnp->lock, flags); 2239 raw_spin_lock_irqsave(&rnp->lock, flags);
2240 smp_mb__after_unlock_lock();
2212 c = rcu_start_future_gp(rnp, rdp); 2241 c = rcu_start_future_gp(rnp, rdp);
2213 raw_spin_unlock_irqrestore(&rnp->lock, flags); 2242 raw_spin_unlock_irqrestore(&rnp->lock, flags);
2214 2243
@@ -2250,6 +2279,7 @@ static int rcu_nocb_kthread(void *arg)
2250 trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, 2279 trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu,
2251 TPS("Sleep")); 2280 TPS("Sleep"));
2252 wait_event_interruptible(rdp->nocb_wq, rdp->nocb_head); 2281 wait_event_interruptible(rdp->nocb_wq, rdp->nocb_head);
 2282 /* Memory barrier provided by xchg() below. */
2253 } else if (firsttime) { 2283 } else if (firsttime) {
2254 firsttime = 0; 2284 firsttime = 0;
2255 trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, 2285 trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu,
@@ -2310,6 +2340,22 @@ static int rcu_nocb_kthread(void *arg)
2310 return 0; 2340 return 0;
2311} 2341}
2312 2342
2343/* Is a deferred wakeup of rcu_nocb_kthread() required? */
2344static bool rcu_nocb_need_deferred_wakeup(struct rcu_data *rdp)
2345{
2346 return ACCESS_ONCE(rdp->nocb_defer_wakeup);
2347}
2348
2349/* Do a deferred wakeup of rcu_nocb_kthread(). */
2350static void do_nocb_deferred_wakeup(struct rcu_data *rdp)
2351{
2352 if (!rcu_nocb_need_deferred_wakeup(rdp))
2353 return;
2354 ACCESS_ONCE(rdp->nocb_defer_wakeup) = false;
2355 wake_up(&rdp->nocb_wq);
2356 trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, TPS("DeferredWakeEmpty"));
2357}
2358
2313/* Initialize per-rcu_data variables for no-CBs CPUs. */ 2359/* Initialize per-rcu_data variables for no-CBs CPUs. */
2314static void __init rcu_boot_init_nocb_percpu_data(struct rcu_data *rdp) 2360static void __init rcu_boot_init_nocb_percpu_data(struct rcu_data *rdp)
2315{ 2361{
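
The two helpers added above complete a defer-and-pay-later scheme: when __call_rcu_nocb_enqueue() runs with interrupts disabled it cannot safely wake the rcuo kthread, so it merely records that a wakeup is owed, and a later, safe context pays it off via do_nocb_deferred_wakeup(). A rough standalone model of the pattern using C11 atomics; the kernel uses ACCESS_ONCE() on a plain flag plus a wait queue instead:

#include <stdatomic.h>
#include <stdbool.h>

struct cb_queue {
	atomic_bool defer_wakeup;	/* models rdp->nocb_defer_wakeup */
	/* ... callback list, wait queue, ... */
};

/* Enqueue path: wake immediately only when it is safe to do so. */
void enqueue_cb(struct cb_queue *q, bool irqs_disabled)
{
	/* ... link the callback onto the queue ... */
	if (!irqs_disabled) {
		/* wake_up(&q->wq); */
	} else {
		atomic_store(&q->defer_wakeup, true);
	}
}

/* Later, from a context where waking is legal, pay off the deferred wakeup. */
void deferred_wakeup(struct cb_queue *q)
{
	if (!atomic_exchange(&q->defer_wakeup, false))
		return;
	/* wake_up(&q->wq); */
}
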
@@ -2365,13 +2411,14 @@ static void rcu_init_one_nocb(struct rcu_node *rnp)
2365} 2411}
2366 2412
2367static bool __call_rcu_nocb(struct rcu_data *rdp, struct rcu_head *rhp, 2413static bool __call_rcu_nocb(struct rcu_data *rdp, struct rcu_head *rhp,
2368 bool lazy) 2414 bool lazy, unsigned long flags)
2369{ 2415{
2370 return 0; 2416 return 0;
2371} 2417}
2372 2418
2373static bool __maybe_unused rcu_nocb_adopt_orphan_cbs(struct rcu_state *rsp, 2419static bool __maybe_unused rcu_nocb_adopt_orphan_cbs(struct rcu_state *rsp,
2374 struct rcu_data *rdp) 2420 struct rcu_data *rdp,
2421 unsigned long flags)
2375{ 2422{
2376 return 0; 2423 return 0;
2377} 2424}
@@ -2380,6 +2427,15 @@ static void __init rcu_boot_init_nocb_percpu_data(struct rcu_data *rdp)
2380{ 2427{
2381} 2428}
2382 2429
2430static bool rcu_nocb_need_deferred_wakeup(struct rcu_data *rdp)
2431{
2432 return false;
2433}
2434
2435static void do_nocb_deferred_wakeup(struct rcu_data *rdp)
2436{
2437}
2438
2383static void __init rcu_spawn_nocb_kthreads(struct rcu_state *rsp) 2439static void __init rcu_spawn_nocb_kthreads(struct rcu_state *rsp)
2384{ 2440{
2385} 2441}
@@ -2829,3 +2885,23 @@ static void rcu_sysidle_init_percpu_data(struct rcu_dynticks *rdtp)
2829} 2885}
2830 2886
2831#endif /* #else #ifdef CONFIG_NO_HZ_FULL_SYSIDLE */ 2887#endif /* #else #ifdef CONFIG_NO_HZ_FULL_SYSIDLE */
2888
2889/*
2890 * Is this CPU a NO_HZ_FULL CPU that should ignore RCU so that the
2891 * grace-period kthread will do force_quiescent_state() processing?
2892 * The idea is to avoid waking up RCU core processing on such a
2893 * CPU unless the grace period has extended for too long.
2894 *
2895 * This code relies on the fact that all NO_HZ_FULL CPUs are also
2896 * CONFIG_RCU_NOCB_CPUs.
2897 */
2898static bool rcu_nohz_full_cpu(struct rcu_state *rsp)
2899{
2900#ifdef CONFIG_NO_HZ_FULL
2901 if (tick_nohz_full_cpu(smp_processor_id()) &&
2902 (!rcu_gp_in_progress(rsp) ||
2903 ULONG_CMP_LT(jiffies, ACCESS_ONCE(rsp->gp_start) + HZ)))
2904 return 1;
2905#endif /* #ifdef CONFIG_NO_HZ_FULL */
2906 return 0;
2907}
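
rcu_nohz_full_cpu() leaves a NO_HZ_FULL CPU alone unless the current grace period has already been running for roughly HZ jiffies, using a wraparound-safe comparison. A small standalone illustration of that comparison style; the macro below is illustrative, standing in for the kernel's ULONG_CMP_LT():

#include <stdio.h>

/* Wraparound-safe "a < b" for free-running counters, in the spirit of the
 * kernel's ULONG_CMP_LT(): look at the sign of the modular difference. */
#define CMP_LT(a, b)	((long)((a) - (b)) < 0)

int main(void)
{
	unsigned long hz = 250;				/* pretend CONFIG_HZ=250 */
	unsigned long jiffies = (unsigned long)-5;	/* counter about to wrap */
	unsigned long gp_start = jiffies - 100;		/* GP began 100 ticks ago */

	/* Still within the first HZ of the grace period, even across the wrap. */
	printf("%d\n", CMP_LT(jiffies, gp_start + hz));	/* prints 1 */
	return 0;
}
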
diff --git a/kernel/rcu/tree_trace.c b/kernel/rcu/tree_trace.c
index 3596797b7e46..4def475336d4 100644
--- a/kernel/rcu/tree_trace.c
+++ b/kernel/rcu/tree_trace.c
@@ -364,9 +364,10 @@ static void print_one_rcu_pending(struct seq_file *m, struct rcu_data *rdp)
364 rdp->n_rp_report_qs, 364 rdp->n_rp_report_qs,
365 rdp->n_rp_cb_ready, 365 rdp->n_rp_cb_ready,
366 rdp->n_rp_cpu_needs_gp); 366 rdp->n_rp_cpu_needs_gp);
367 seq_printf(m, "gpc=%ld gps=%ld nn=%ld\n", 367 seq_printf(m, "gpc=%ld gps=%ld nn=%ld ndw=%ld\n",
368 rdp->n_rp_gp_completed, 368 rdp->n_rp_gp_completed,
369 rdp->n_rp_gp_started, 369 rdp->n_rp_gp_started,
370 rdp->n_rp_nocb_defer_wakeup,
370 rdp->n_rp_need_nothing); 371 rdp->n_rp_need_nothing);
371} 372}
372 373
diff --git a/kernel/rcu/update.c b/kernel/rcu/update.c
index 6cb3dff89e2b..802365ccd591 100644
--- a/kernel/rcu/update.c
+++ b/kernel/rcu/update.c
@@ -128,6 +128,11 @@ struct lockdep_map rcu_sched_lock_map =
128 STATIC_LOCKDEP_MAP_INIT("rcu_read_lock_sched", &rcu_sched_lock_key); 128 STATIC_LOCKDEP_MAP_INIT("rcu_read_lock_sched", &rcu_sched_lock_key);
129EXPORT_SYMBOL_GPL(rcu_sched_lock_map); 129EXPORT_SYMBOL_GPL(rcu_sched_lock_map);
130 130
131static struct lock_class_key rcu_callback_key;
132struct lockdep_map rcu_callback_map =
133 STATIC_LOCKDEP_MAP_INIT("rcu_callback", &rcu_callback_key);
134EXPORT_SYMBOL_GPL(rcu_callback_map);
135
131int notrace debug_lockdep_rcu_enabled(void) 136int notrace debug_lockdep_rcu_enabled(void)
132{ 137{
133 return rcu_scheduler_active && debug_locks && 138 return rcu_scheduler_active && debug_locks &&
diff --git a/kernel/reboot.c b/kernel/reboot.c
index f813b3474646..662c83fc16b7 100644
--- a/kernel/reboot.c
+++ b/kernel/reboot.c
@@ -104,7 +104,7 @@ int unregister_reboot_notifier(struct notifier_block *nb)
104} 104}
105EXPORT_SYMBOL(unregister_reboot_notifier); 105EXPORT_SYMBOL(unregister_reboot_notifier);
106 106
107static void migrate_to_reboot_cpu(void) 107void migrate_to_reboot_cpu(void)
108{ 108{
109 /* The boot cpu is always logical cpu 0 */ 109 /* The boot cpu is always logical cpu 0 */
110 int cpu = reboot_cpu; 110 int cpu = reboot_cpu;
diff --git a/kernel/sched/Makefile b/kernel/sched/Makefile
index 7b621409cf15..9a95c8c2af2a 100644
--- a/kernel/sched/Makefile
+++ b/kernel/sched/Makefile
@@ -11,9 +11,10 @@ ifneq ($(CONFIG_SCHED_OMIT_FRAME_POINTER),y)
11CFLAGS_core.o := $(PROFILING) -fno-omit-frame-pointer 11CFLAGS_core.o := $(PROFILING) -fno-omit-frame-pointer
12endif 12endif
13 13
14obj-y += core.o proc.o clock.o cputime.o idle_task.o fair.o rt.o stop_task.o 14obj-y += core.o proc.o clock.o cputime.o
15obj-y += idle_task.o fair.o rt.o deadline.o stop_task.o
15obj-y += wait.o completion.o 16obj-y += wait.o completion.o
16obj-$(CONFIG_SMP) += cpupri.o 17obj-$(CONFIG_SMP) += cpupri.o cpudeadline.o
17obj-$(CONFIG_SCHED_AUTOGROUP) += auto_group.o 18obj-$(CONFIG_SCHED_AUTOGROUP) += auto_group.o
18obj-$(CONFIG_SCHEDSTATS) += stats.o 19obj-$(CONFIG_SCHEDSTATS) += stats.o
19obj-$(CONFIG_SCHED_DEBUG) += debug.o 20obj-$(CONFIG_SCHED_DEBUG) += debug.o
diff --git a/kernel/sched/clock.c b/kernel/sched/clock.c
index c3ae1446461c..6bd6a6731b21 100644
--- a/kernel/sched/clock.c
+++ b/kernel/sched/clock.c
@@ -26,9 +26,10 @@
26 * at 0 on boot (but people really shouldn't rely on that). 26 * at 0 on boot (but people really shouldn't rely on that).
27 * 27 *
28 * cpu_clock(i) -- can be used from any context, including NMI. 28 * cpu_clock(i) -- can be used from any context, including NMI.
29 * sched_clock_cpu(i) -- must be used with local IRQs disabled (implied by NMI)
30 * local_clock() -- is cpu_clock() on the current cpu. 29 * local_clock() -- is cpu_clock() on the current cpu.
31 * 30 *
31 * sched_clock_cpu(i)
32 *
32 * How: 33 * How:
33 * 34 *
34 * The implementation either uses sched_clock() when 35 * The implementation either uses sched_clock() when
@@ -50,15 +51,6 @@
50 * Furthermore, explicit sleep and wakeup hooks allow us to account for time 51 * Furthermore, explicit sleep and wakeup hooks allow us to account for time
51 * that is otherwise invisible (TSC gets stopped). 52 * that is otherwise invisible (TSC gets stopped).
52 * 53 *
53 *
54 * Notes:
55 *
56 * The !IRQ-safetly of sched_clock() and sched_clock_cpu() comes from things
57 * like cpufreq interrupts that can change the base clock (TSC) multiplier
58 * and cause funny jumps in time -- although the filtering provided by
59 * sched_clock_cpu() should mitigate serious artifacts we cannot rely on it
60 * in general since for !CONFIG_HAVE_UNSTABLE_SCHED_CLOCK we fully rely on
61 * sched_clock().
62 */ 54 */
63#include <linux/spinlock.h> 55#include <linux/spinlock.h>
64#include <linux/hardirq.h> 56#include <linux/hardirq.h>
@@ -66,6 +58,8 @@
66#include <linux/percpu.h> 58#include <linux/percpu.h>
67#include <linux/ktime.h> 59#include <linux/ktime.h>
68#include <linux/sched.h> 60#include <linux/sched.h>
61#include <linux/static_key.h>
62#include <linux/workqueue.h>
69 63
70/* 64/*
71 * Scheduler clock - returns current time in nanosec units. 65 * Scheduler clock - returns current time in nanosec units.
@@ -82,7 +76,37 @@ EXPORT_SYMBOL_GPL(sched_clock);
82__read_mostly int sched_clock_running; 76__read_mostly int sched_clock_running;
83 77
84#ifdef CONFIG_HAVE_UNSTABLE_SCHED_CLOCK 78#ifdef CONFIG_HAVE_UNSTABLE_SCHED_CLOCK
85__read_mostly int sched_clock_stable; 79static struct static_key __sched_clock_stable = STATIC_KEY_INIT;
80
81int sched_clock_stable(void)
82{
83 if (static_key_false(&__sched_clock_stable))
84 return false;
85 return true;
86}
87
88void set_sched_clock_stable(void)
89{
90 if (!sched_clock_stable())
91 static_key_slow_dec(&__sched_clock_stable);
92}
93
94static void __clear_sched_clock_stable(struct work_struct *work)
95{
96 /* XXX worry about clock continuity */
97 if (sched_clock_stable())
98 static_key_slow_inc(&__sched_clock_stable);
99}
100
101static DECLARE_WORK(sched_clock_work, __clear_sched_clock_stable);
102
103void clear_sched_clock_stable(void)
104{
105 if (keventd_up())
106 schedule_work(&sched_clock_work);
107 else
108 __clear_sched_clock_stable(&sched_clock_work);
109}
86 110
87struct sched_clock_data { 111struct sched_clock_data {
88 u64 tick_raw; 112 u64 tick_raw;
@@ -242,20 +266,20 @@ u64 sched_clock_cpu(int cpu)
242 struct sched_clock_data *scd; 266 struct sched_clock_data *scd;
243 u64 clock; 267 u64 clock;
244 268
245 WARN_ON_ONCE(!irqs_disabled()); 269 if (sched_clock_stable())
246
247 if (sched_clock_stable)
248 return sched_clock(); 270 return sched_clock();
249 271
250 if (unlikely(!sched_clock_running)) 272 if (unlikely(!sched_clock_running))
251 return 0ull; 273 return 0ull;
252 274
275 preempt_disable();
253 scd = cpu_sdc(cpu); 276 scd = cpu_sdc(cpu);
254 277
255 if (cpu != smp_processor_id()) 278 if (cpu != smp_processor_id())
256 clock = sched_clock_remote(scd); 279 clock = sched_clock_remote(scd);
257 else 280 else
258 clock = sched_clock_local(scd); 281 clock = sched_clock_local(scd);
282 preempt_enable();
259 283
260 return clock; 284 return clock;
261} 285}
@@ -265,7 +289,7 @@ void sched_clock_tick(void)
265 struct sched_clock_data *scd; 289 struct sched_clock_data *scd;
266 u64 now, now_gtod; 290 u64 now, now_gtod;
267 291
268 if (sched_clock_stable) 292 if (sched_clock_stable())
269 return; 293 return;
270 294
271 if (unlikely(!sched_clock_running)) 295 if (unlikely(!sched_clock_running))
@@ -316,14 +340,10 @@ EXPORT_SYMBOL_GPL(sched_clock_idle_wakeup_event);
316 */ 340 */
317u64 cpu_clock(int cpu) 341u64 cpu_clock(int cpu)
318{ 342{
319 u64 clock; 343 if (static_key_false(&__sched_clock_stable))
320 unsigned long flags; 344 return sched_clock_cpu(cpu);
321
322 local_irq_save(flags);
323 clock = sched_clock_cpu(cpu);
324 local_irq_restore(flags);
325 345
326 return clock; 346 return sched_clock();
327} 347}
328 348
329/* 349/*
@@ -335,14 +355,10 @@ u64 cpu_clock(int cpu)
335 */ 355 */
336u64 local_clock(void) 356u64 local_clock(void)
337{ 357{
338 u64 clock; 358 if (static_key_false(&__sched_clock_stable))
339 unsigned long flags; 359 return sched_clock_cpu(raw_smp_processor_id());
340 360
341 local_irq_save(flags); 361 return sched_clock();
342 clock = sched_clock_cpu(smp_processor_id());
343 local_irq_restore(flags);
344
345 return clock;
346} 362}
347 363
348#else /* CONFIG_HAVE_UNSTABLE_SCHED_CLOCK */ 364#else /* CONFIG_HAVE_UNSTABLE_SCHED_CLOCK */
@@ -362,12 +378,12 @@ u64 sched_clock_cpu(int cpu)
362 378
363u64 cpu_clock(int cpu) 379u64 cpu_clock(int cpu)
364{ 380{
365 return sched_clock_cpu(cpu); 381 return sched_clock();
366} 382}
367 383
368u64 local_clock(void) 384u64 local_clock(void)
369{ 385{
370 return sched_clock_cpu(0); 386 return sched_clock();
371} 387}
372 388
373#endif /* CONFIG_HAVE_UNSTABLE_SCHED_CLOCK */ 389#endif /* CONFIG_HAVE_UNSTABLE_SCHED_CLOCK */
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index c1808606ee5f..4d6964e49711 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -296,8 +296,6 @@ __read_mostly int scheduler_running;
296 */ 296 */
297int sysctl_sched_rt_runtime = 950000; 297int sysctl_sched_rt_runtime = 950000;
298 298
299
300
301/* 299/*
302 * __task_rq_lock - lock the rq @p resides on. 300 * __task_rq_lock - lock the rq @p resides on.
303 */ 301 */
@@ -899,7 +897,9 @@ static inline int normal_prio(struct task_struct *p)
899{ 897{
900 int prio; 898 int prio;
901 899
902 if (task_has_rt_policy(p)) 900 if (task_has_dl_policy(p))
901 prio = MAX_DL_PRIO-1;
902 else if (task_has_rt_policy(p))
903 prio = MAX_RT_PRIO-1 - p->rt_priority; 903 prio = MAX_RT_PRIO-1 - p->rt_priority;
904 else 904 else
905 prio = __normal_prio(p); 905 prio = __normal_prio(p);
@@ -945,7 +945,7 @@ static inline void check_class_changed(struct rq *rq, struct task_struct *p,
945 if (prev_class->switched_from) 945 if (prev_class->switched_from)
946 prev_class->switched_from(rq, p); 946 prev_class->switched_from(rq, p);
947 p->sched_class->switched_to(rq, p); 947 p->sched_class->switched_to(rq, p);
948 } else if (oldprio != p->prio) 948 } else if (oldprio != p->prio || dl_task(p))
949 p->sched_class->prio_changed(rq, p, oldprio); 949 p->sched_class->prio_changed(rq, p, oldprio);
950} 950}
951 951
@@ -1108,6 +1108,7 @@ int migrate_swap(struct task_struct *cur, struct task_struct *p)
1108 if (!cpumask_test_cpu(arg.src_cpu, tsk_cpus_allowed(arg.dst_task))) 1108 if (!cpumask_test_cpu(arg.src_cpu, tsk_cpus_allowed(arg.dst_task)))
1109 goto out; 1109 goto out;
1110 1110
1111 trace_sched_swap_numa(cur, arg.src_cpu, p, arg.dst_cpu);
1111 ret = stop_two_cpus(arg.dst_cpu, arg.src_cpu, migrate_swap_stop, &arg); 1112 ret = stop_two_cpus(arg.dst_cpu, arg.src_cpu, migrate_swap_stop, &arg);
1112 1113
1113out: 1114out:
@@ -1499,8 +1500,7 @@ void scheduler_ipi(void)
1499 * TIF_NEED_RESCHED remotely (for the first time) will also send 1500 * TIF_NEED_RESCHED remotely (for the first time) will also send
1500 * this IPI. 1501 * this IPI.
1501 */ 1502 */
1502 if (tif_need_resched()) 1503 preempt_fold_need_resched();
1503 set_preempt_need_resched();
1504 1504
1505 if (llist_empty(&this_rq()->wake_list) 1505 if (llist_empty(&this_rq()->wake_list)
1506 && !tick_nohz_full_cpu(smp_processor_id()) 1506 && !tick_nohz_full_cpu(smp_processor_id())
@@ -1717,6 +1717,13 @@ static void __sched_fork(unsigned long clone_flags, struct task_struct *p)
1717 memset(&p->se.statistics, 0, sizeof(p->se.statistics)); 1717 memset(&p->se.statistics, 0, sizeof(p->se.statistics));
1718#endif 1718#endif
1719 1719
1720 RB_CLEAR_NODE(&p->dl.rb_node);
1721 hrtimer_init(&p->dl.dl_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
1722 p->dl.dl_runtime = p->dl.runtime = 0;
1723 p->dl.dl_deadline = p->dl.deadline = 0;
1724 p->dl.dl_period = 0;
1725 p->dl.flags = 0;
1726
1720 INIT_LIST_HEAD(&p->rt.run_list); 1727 INIT_LIST_HEAD(&p->rt.run_list);
1721 1728
1722#ifdef CONFIG_PREEMPT_NOTIFIERS 1729#ifdef CONFIG_PREEMPT_NOTIFIERS
@@ -1768,7 +1775,7 @@ void set_numabalancing_state(bool enabled)
1768/* 1775/*
1769 * fork()/clone()-time setup: 1776 * fork()/clone()-time setup:
1770 */ 1777 */
1771void sched_fork(unsigned long clone_flags, struct task_struct *p) 1778int sched_fork(unsigned long clone_flags, struct task_struct *p)
1772{ 1779{
1773 unsigned long flags; 1780 unsigned long flags;
1774 int cpu = get_cpu(); 1781 int cpu = get_cpu();
@@ -1790,7 +1797,7 @@ void sched_fork(unsigned long clone_flags, struct task_struct *p)
1790 * Revert to default priority/policy on fork if requested. 1797 * Revert to default priority/policy on fork if requested.
1791 */ 1798 */
1792 if (unlikely(p->sched_reset_on_fork)) { 1799 if (unlikely(p->sched_reset_on_fork)) {
1793 if (task_has_rt_policy(p)) { 1800 if (task_has_dl_policy(p) || task_has_rt_policy(p)) {
1794 p->policy = SCHED_NORMAL; 1801 p->policy = SCHED_NORMAL;
1795 p->static_prio = NICE_TO_PRIO(0); 1802 p->static_prio = NICE_TO_PRIO(0);
1796 p->rt_priority = 0; 1803 p->rt_priority = 0;
@@ -1807,8 +1814,14 @@ void sched_fork(unsigned long clone_flags, struct task_struct *p)
1807 p->sched_reset_on_fork = 0; 1814 p->sched_reset_on_fork = 0;
1808 } 1815 }
1809 1816
1810 if (!rt_prio(p->prio)) 1817 if (dl_prio(p->prio)) {
1818 put_cpu();
1819 return -EAGAIN;
1820 } else if (rt_prio(p->prio)) {
1821 p->sched_class = &rt_sched_class;
1822 } else {
1811 p->sched_class = &fair_sched_class; 1823 p->sched_class = &fair_sched_class;
1824 }
1812 1825
1813 if (p->sched_class->task_fork) 1826 if (p->sched_class->task_fork)
1814 p->sched_class->task_fork(p); 1827 p->sched_class->task_fork(p);
@@ -1834,11 +1847,124 @@ void sched_fork(unsigned long clone_flags, struct task_struct *p)
1834 init_task_preempt_count(p); 1847 init_task_preempt_count(p);
1835#ifdef CONFIG_SMP 1848#ifdef CONFIG_SMP
1836 plist_node_init(&p->pushable_tasks, MAX_PRIO); 1849 plist_node_init(&p->pushable_tasks, MAX_PRIO);
1850 RB_CLEAR_NODE(&p->pushable_dl_tasks);
1837#endif 1851#endif
1838 1852
1839 put_cpu(); 1853 put_cpu();
1854 return 0;
1855}
1856
1857unsigned long to_ratio(u64 period, u64 runtime)
1858{
1859 if (runtime == RUNTIME_INF)
1860 return 1ULL << 20;
1861
1862 /*
1863 * Doing this here saves a lot of checks in all
1864 * the calling paths, and returning zero seems
1865 * safe for them anyway.
1866 */
1867 if (period == 0)
1868 return 0;
1869
1870 return div64_u64(runtime << 20, period);
1871}
1872
1873#ifdef CONFIG_SMP
1874inline struct dl_bw *dl_bw_of(int i)
1875{
1876 return &cpu_rq(i)->rd->dl_bw;
1877}
1878
1879static inline int dl_bw_cpus(int i)
1880{
1881 struct root_domain *rd = cpu_rq(i)->rd;
1882 int cpus = 0;
1883
1884 for_each_cpu_and(i, rd->span, cpu_active_mask)
1885 cpus++;
1886
1887 return cpus;
1888}
1889#else
1890inline struct dl_bw *dl_bw_of(int i)
1891{
1892 return &cpu_rq(i)->dl.dl_bw;
1893}
1894
1895static inline int dl_bw_cpus(int i)
1896{
1897 return 1;
1898}
1899#endif
1900
1901static inline
1902void __dl_clear(struct dl_bw *dl_b, u64 tsk_bw)
1903{
1904 dl_b->total_bw -= tsk_bw;
1905}
1906
1907static inline
1908void __dl_add(struct dl_bw *dl_b, u64 tsk_bw)
1909{
1910 dl_b->total_bw += tsk_bw;
1911}
1912
1913static inline
1914bool __dl_overflow(struct dl_bw *dl_b, int cpus, u64 old_bw, u64 new_bw)
1915{
1916 return dl_b->bw != -1 &&
1917 dl_b->bw * cpus < dl_b->total_bw - old_bw + new_bw;
1918}
1919
1920/*
1921 * We must be sure that accepting a new task (or allowing changing the
1922 * parameters of an existing one) is consistent with the bandwidth
1923 * constraints. If yes, this function also accordingly updates the currently
1924 * allocated bandwidth to reflect the new situation.
1925 *
1926 * This function is called while holding p's rq->lock.
1927 */
1928static int dl_overflow(struct task_struct *p, int policy,
1929 const struct sched_attr *attr)
1930{
1931
1932 struct dl_bw *dl_b = dl_bw_of(task_cpu(p));
1933 u64 period = attr->sched_period;
1934 u64 runtime = attr->sched_runtime;
1935 u64 new_bw = dl_policy(policy) ? to_ratio(period, runtime) : 0;
1936 int cpus, err = -1;
1937
1938 if (new_bw == p->dl.dl_bw)
1939 return 0;
1940
1941 /*
 1942 * Whether a task enters, leaves, or stays -deadline but changes its
 1943 * parameters, we may need to update the total allocated bandwidth of
 1944 * the container accordingly.
1945 */
1946 raw_spin_lock(&dl_b->lock);
1947 cpus = dl_bw_cpus(task_cpu(p));
1948 if (dl_policy(policy) && !task_has_dl_policy(p) &&
1949 !__dl_overflow(dl_b, cpus, 0, new_bw)) {
1950 __dl_add(dl_b, new_bw);
1951 err = 0;
1952 } else if (dl_policy(policy) && task_has_dl_policy(p) &&
1953 !__dl_overflow(dl_b, cpus, p->dl.dl_bw, new_bw)) {
1954 __dl_clear(dl_b, p->dl.dl_bw);
1955 __dl_add(dl_b, new_bw);
1956 err = 0;
1957 } else if (!dl_policy(policy) && task_has_dl_policy(p)) {
1958 __dl_clear(dl_b, p->dl.dl_bw);
1959 err = 0;
1960 }
1961 raw_spin_unlock(&dl_b->lock);
1962
1963 return err;
1840} 1964}
1841 1965
1966extern void init_dl_bw(struct dl_bw *dl_b);
1967
1842/* 1968/*
1843 * wake_up_new_task - wake up a newly created task for the first time. 1969 * wake_up_new_task - wake up a newly created task for the first time.
1844 * 1970 *
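
dl_overflow() is the SCHED_DEADLINE admission test: each task's utilization is stored by to_ratio() as runtime/period in 20-bit fixed point, summed per root domain, and a request is refused when the sum would exceed the per-CPU cap times the number of CPUs. A simplified standalone sketch of the arithmetic; the 95% cap and the task parameters below are illustrative:

#include <stdint.h>
#include <stdio.h>

/* Simplified to_ratio(): utilization as runtime/period in <<20 fixed point. */
static uint64_t to_ratio(uint64_t period, uint64_t runtime)
{
	if (period == 0)
		return 0;
	return (runtime << 20) / period;
}

int main(void)
{
	uint64_t cap = to_ratio(1000000000ULL, 950000000ULL);	/* 95% per CPU */
	uint64_t task_bw = to_ratio(100000000ULL, 30000000ULL);	/* 30ms every 100ms */
	uint64_t total = 0;
	int cpus = 4;

	/* Admit tasks one by one, mimicking __dl_overflow()/__dl_add(). */
	for (int i = 0; ; i++) {
		if (cap * cpus < total + task_bw) {
			printf("task %d rejected (-EBUSY)\n", i);
			break;
		}
		total += task_bw;
	}
	printf("admitted %llu of %llu fixed-point bandwidth\n",
	       (unsigned long long)total, (unsigned long long)(cap * cpus));
	return 0;
}
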
@@ -2003,6 +2129,9 @@ static void finish_task_switch(struct rq *rq, struct task_struct *prev)
2003 if (unlikely(prev_state == TASK_DEAD)) { 2129 if (unlikely(prev_state == TASK_DEAD)) {
2004 task_numa_free(prev); 2130 task_numa_free(prev);
2005 2131
2132 if (prev->sched_class->task_dead)
2133 prev->sched_class->task_dead(prev);
2134
2006 /* 2135 /*
2007 * Remove function-return probe instances associated with this 2136 * Remove function-return probe instances associated with this
2008 * task and put them back on the free list. 2137 * task and put them back on the free list.
@@ -2296,7 +2425,7 @@ void scheduler_tick(void)
2296 2425
2297#ifdef CONFIG_SMP 2426#ifdef CONFIG_SMP
2298 rq->idle_balance = idle_cpu(cpu); 2427 rq->idle_balance = idle_cpu(cpu);
2299 trigger_load_balance(rq, cpu); 2428 trigger_load_balance(rq);
2300#endif 2429#endif
2301 rq_last_tick_reset(rq); 2430 rq_last_tick_reset(rq);
2302} 2431}
@@ -2414,10 +2543,10 @@ static inline void schedule_debug(struct task_struct *prev)
2414{ 2543{
2415 /* 2544 /*
2416 * Test if we are atomic. Since do_exit() needs to call into 2545 * Test if we are atomic. Since do_exit() needs to call into
2417 * schedule() atomically, we ignore that path for now. 2546 * schedule() atomically, we ignore that path. Otherwise whine
2418 * Otherwise, whine if we are scheduling when we should not be. 2547 * if we are scheduling when we should not.
2419 */ 2548 */
2420 if (unlikely(in_atomic_preempt_off() && !prev->exit_state)) 2549 if (unlikely(in_atomic_preempt_off() && prev->state != TASK_DEAD))
2421 __schedule_bug(prev); 2550 __schedule_bug(prev);
2422 rcu_sleep_check(); 2551 rcu_sleep_check();
2423 2552
@@ -2660,6 +2789,7 @@ asmlinkage void __sched notrace preempt_schedule(void)
2660 } while (need_resched()); 2789 } while (need_resched());
2661} 2790}
2662EXPORT_SYMBOL(preempt_schedule); 2791EXPORT_SYMBOL(preempt_schedule);
2792#endif /* CONFIG_PREEMPT */
2663 2793
2664/* 2794/*
2665 * this is the entry point to schedule() from kernel preemption 2795 * this is the entry point to schedule() from kernel preemption
@@ -2693,8 +2823,6 @@ asmlinkage void __sched preempt_schedule_irq(void)
2693 exception_exit(prev_state); 2823 exception_exit(prev_state);
2694} 2824}
2695 2825
2696#endif /* CONFIG_PREEMPT */
2697
2698int default_wake_function(wait_queue_t *curr, unsigned mode, int wake_flags, 2826int default_wake_function(wait_queue_t *curr, unsigned mode, int wake_flags,
2699 void *key) 2827 void *key)
2700{ 2828{
@@ -2762,11 +2890,11 @@ EXPORT_SYMBOL(sleep_on_timeout);
2762 */ 2890 */
2763void rt_mutex_setprio(struct task_struct *p, int prio) 2891void rt_mutex_setprio(struct task_struct *p, int prio)
2764{ 2892{
2765 int oldprio, on_rq, running; 2893 int oldprio, on_rq, running, enqueue_flag = 0;
2766 struct rq *rq; 2894 struct rq *rq;
2767 const struct sched_class *prev_class; 2895 const struct sched_class *prev_class;
2768 2896
2769 BUG_ON(prio < 0 || prio > MAX_PRIO); 2897 BUG_ON(prio > MAX_PRIO);
2770 2898
2771 rq = __task_rq_lock(p); 2899 rq = __task_rq_lock(p);
2772 2900
@@ -2789,6 +2917,7 @@ void rt_mutex_setprio(struct task_struct *p, int prio)
2789 } 2917 }
2790 2918
2791 trace_sched_pi_setprio(p, prio); 2919 trace_sched_pi_setprio(p, prio);
2920 p->pi_top_task = rt_mutex_get_top_task(p);
2792 oldprio = p->prio; 2921 oldprio = p->prio;
2793 prev_class = p->sched_class; 2922 prev_class = p->sched_class;
2794 on_rq = p->on_rq; 2923 on_rq = p->on_rq;
@@ -2798,23 +2927,49 @@ void rt_mutex_setprio(struct task_struct *p, int prio)
2798 if (running) 2927 if (running)
2799 p->sched_class->put_prev_task(rq, p); 2928 p->sched_class->put_prev_task(rq, p);
2800 2929
2801 if (rt_prio(prio)) 2930 /*
 2931 * Boosting conditions are:
2932 * 1. -rt task is running and holds mutex A
2933 * --> -dl task blocks on mutex A
2934 *
2935 * 2. -dl task is running and holds mutex A
2936 * --> -dl task blocks on mutex A and could preempt the
2937 * running task
2938 */
2939 if (dl_prio(prio)) {
2940 if (!dl_prio(p->normal_prio) || (p->pi_top_task &&
2941 dl_entity_preempt(&p->pi_top_task->dl, &p->dl))) {
2942 p->dl.dl_boosted = 1;
2943 p->dl.dl_throttled = 0;
2944 enqueue_flag = ENQUEUE_REPLENISH;
2945 } else
2946 p->dl.dl_boosted = 0;
2947 p->sched_class = &dl_sched_class;
2948 } else if (rt_prio(prio)) {
2949 if (dl_prio(oldprio))
2950 p->dl.dl_boosted = 0;
2951 if (oldprio < prio)
2952 enqueue_flag = ENQUEUE_HEAD;
2802 p->sched_class = &rt_sched_class; 2953 p->sched_class = &rt_sched_class;
2803 else 2954 } else {
2955 if (dl_prio(oldprio))
2956 p->dl.dl_boosted = 0;
2804 p->sched_class = &fair_sched_class; 2957 p->sched_class = &fair_sched_class;
2958 }
2805 2959
2806 p->prio = prio; 2960 p->prio = prio;
2807 2961
2808 if (running) 2962 if (running)
2809 p->sched_class->set_curr_task(rq); 2963 p->sched_class->set_curr_task(rq);
2810 if (on_rq) 2964 if (on_rq)
2811 enqueue_task(rq, p, oldprio < prio ? ENQUEUE_HEAD : 0); 2965 enqueue_task(rq, p, enqueue_flag);
2812 2966
2813 check_class_changed(rq, p, prev_class, oldprio); 2967 check_class_changed(rq, p, prev_class, oldprio);
2814out_unlock: 2968out_unlock:
2815 __task_rq_unlock(rq); 2969 __task_rq_unlock(rq);
2816} 2970}
2817#endif 2971#endif
2972
2818void set_user_nice(struct task_struct *p, long nice) 2973void set_user_nice(struct task_struct *p, long nice)
2819{ 2974{
2820 int old_prio, delta, on_rq; 2975 int old_prio, delta, on_rq;
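
The rt_mutex_setprio() changes extend priority inheritance to SCHED_DEADLINE: a lock holder is moved to the dl class and marked dl_boosted when its top waiter is a -deadline task that would otherwise preempt it, i.e. one with the earlier absolute deadline. A toy standalone model of that decision only; the real code leaves the parameter juggling to the dl scheduling class and dl_entity_preempt():

#include <stdbool.h>
#include <stdint.h>

struct toy_task {
	bool	 is_dl;
	uint64_t deadline;	/* absolute; only meaningful when is_dl */
};

/* Earlier absolute deadline wins; a -dl task always beats a non-dl one. */
bool toy_dl_preempts(const struct toy_task *a, const struct toy_task *b)
{
	return a->is_dl && (!b->is_dl || (int64_t)(a->deadline - b->deadline) < 0);
}

/* Boost a lock holder with the deadline of its top waiter when needed. */
void toy_boost(struct toy_task *holder, const struct toy_task *top_waiter)
{
	if (toy_dl_preempts(top_waiter, holder)) {
		holder->is_dl = true;
		holder->deadline = top_waiter->deadline;
	}
}
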
@@ -2832,9 +2987,9 @@ void set_user_nice(struct task_struct *p, long nice)
2832 * The RT priorities are set via sched_setscheduler(), but we still 2987 * The RT priorities are set via sched_setscheduler(), but we still
2833 * allow the 'normal' nice value to be set - but as expected 2988 * allow the 'normal' nice value to be set - but as expected
2834 * it wont have any effect on scheduling until the task is 2989 * it wont have any effect on scheduling until the task is
2835 * SCHED_FIFO/SCHED_RR: 2990 * SCHED_DEADLINE, SCHED_FIFO or SCHED_RR:
2836 */ 2991 */
2837 if (task_has_rt_policy(p)) { 2992 if (task_has_dl_policy(p) || task_has_rt_policy(p)) {
2838 p->static_prio = NICE_TO_PRIO(nice); 2993 p->static_prio = NICE_TO_PRIO(nice);
2839 goto out_unlock; 2994 goto out_unlock;
2840 } 2995 }
@@ -2989,22 +3144,95 @@ static struct task_struct *find_process_by_pid(pid_t pid)
2989 return pid ? find_task_by_vpid(pid) : current; 3144 return pid ? find_task_by_vpid(pid) : current;
2990} 3145}
2991 3146
2992/* Actually do priority change: must hold rq lock. */ 3147/*
3148 * This function initializes the sched_dl_entity of a newly becoming
3149 * SCHED_DEADLINE task.
3150 *
3151 * Only the static values are considered here, the actual runtime and the
3152 * absolute deadline will be properly calculated when the task is enqueued
3153 * for the first time with its new policy.
3154 */
2993static void 3155static void
2994__setscheduler(struct rq *rq, struct task_struct *p, int policy, int prio) 3156__setparam_dl(struct task_struct *p, const struct sched_attr *attr)
3157{
3158 struct sched_dl_entity *dl_se = &p->dl;
3159
3160 init_dl_task_timer(dl_se);
3161 dl_se->dl_runtime = attr->sched_runtime;
3162 dl_se->dl_deadline = attr->sched_deadline;
3163 dl_se->dl_period = attr->sched_period ?: dl_se->dl_deadline;
3164 dl_se->flags = attr->sched_flags;
3165 dl_se->dl_bw = to_ratio(dl_se->dl_period, dl_se->dl_runtime);
3166 dl_se->dl_throttled = 0;
3167 dl_se->dl_new = 1;
3168}
3169
3170/* Actually do priority change: must hold pi & rq lock. */
3171static void __setscheduler(struct rq *rq, struct task_struct *p,
3172 const struct sched_attr *attr)
2995{ 3173{
3174 int policy = attr->sched_policy;
3175
3176 if (policy == -1) /* setparam */
3177 policy = p->policy;
3178
2996 p->policy = policy; 3179 p->policy = policy;
2997 p->rt_priority = prio; 3180
3181 if (dl_policy(policy))
3182 __setparam_dl(p, attr);
3183 else if (fair_policy(policy))
3184 p->static_prio = NICE_TO_PRIO(attr->sched_nice);
3185
3186 /*
3187 * __sched_setscheduler() ensures attr->sched_priority == 0 when
3188 * !rt_policy. Always setting this ensures that things like
3189 * getparam()/getattr() don't report silly values for !rt tasks.
3190 */
3191 p->rt_priority = attr->sched_priority;
3192
2998 p->normal_prio = normal_prio(p); 3193 p->normal_prio = normal_prio(p);
2999 /* we are holding p->pi_lock already */
3000 p->prio = rt_mutex_getprio(p); 3194 p->prio = rt_mutex_getprio(p);
3001 if (rt_prio(p->prio)) 3195
3196 if (dl_prio(p->prio))
3197 p->sched_class = &dl_sched_class;
3198 else if (rt_prio(p->prio))
3002 p->sched_class = &rt_sched_class; 3199 p->sched_class = &rt_sched_class;
3003 else 3200 else
3004 p->sched_class = &fair_sched_class; 3201 p->sched_class = &fair_sched_class;
3202
3005 set_load_weight(p); 3203 set_load_weight(p);
3006} 3204}
3007 3205
3206static void
3207__getparam_dl(struct task_struct *p, struct sched_attr *attr)
3208{
3209 struct sched_dl_entity *dl_se = &p->dl;
3210
3211 attr->sched_priority = p->rt_priority;
3212 attr->sched_runtime = dl_se->dl_runtime;
3213 attr->sched_deadline = dl_se->dl_deadline;
3214 attr->sched_period = dl_se->dl_period;
3215 attr->sched_flags = dl_se->flags;
3216}
3217
3218/*
3219 * This function validates the new parameters of a -deadline task.
 3220 * We require the deadline to be non-zero and greater than or equal
 3221 * to the runtime, and the period to be either zero or greater than
 3222 * or equal to the deadline. Furthermore, we have to be sure that
3223 * user parameters are above the internal resolution (1us); we
3224 * check sched_runtime only since it is always the smaller one.
3225 */
3226static bool
3227__checkparam_dl(const struct sched_attr *attr)
3228{
3229 return attr && attr->sched_deadline != 0 &&
3230 (attr->sched_period == 0 ||
3231 (s64)(attr->sched_period - attr->sched_deadline) >= 0) &&
3232 (s64)(attr->sched_deadline - attr->sched_runtime ) >= 0 &&
3233 attr->sched_runtime >= (2 << (DL_SCALE - 1));
3234}
3235
3008/* 3236/*
3009 * check the target process has a UID that matches the current process's 3237 * check the target process has a UID that matches the current process's
3010 */ 3238 */
@@ -3021,10 +3249,12 @@ static bool check_same_owner(struct task_struct *p)
3021 return match; 3249 return match;
3022} 3250}
3023 3251
3024static int __sched_setscheduler(struct task_struct *p, int policy, 3252static int __sched_setscheduler(struct task_struct *p,
3025 const struct sched_param *param, bool user) 3253 const struct sched_attr *attr,
3254 bool user)
3026{ 3255{
3027 int retval, oldprio, oldpolicy = -1, on_rq, running; 3256 int retval, oldprio, oldpolicy = -1, on_rq, running;
3257 int policy = attr->sched_policy;
3028 unsigned long flags; 3258 unsigned long flags;
3029 const struct sched_class *prev_class; 3259 const struct sched_class *prev_class;
3030 struct rq *rq; 3260 struct rq *rq;
@@ -3038,31 +3268,40 @@ recheck:
3038 reset_on_fork = p->sched_reset_on_fork; 3268 reset_on_fork = p->sched_reset_on_fork;
3039 policy = oldpolicy = p->policy; 3269 policy = oldpolicy = p->policy;
3040 } else { 3270 } else {
3041 reset_on_fork = !!(policy & SCHED_RESET_ON_FORK); 3271 reset_on_fork = !!(attr->sched_flags & SCHED_FLAG_RESET_ON_FORK);
3042 policy &= ~SCHED_RESET_ON_FORK;
3043 3272
3044 if (policy != SCHED_FIFO && policy != SCHED_RR && 3273 if (policy != SCHED_DEADLINE &&
3274 policy != SCHED_FIFO && policy != SCHED_RR &&
3045 policy != SCHED_NORMAL && policy != SCHED_BATCH && 3275 policy != SCHED_NORMAL && policy != SCHED_BATCH &&
3046 policy != SCHED_IDLE) 3276 policy != SCHED_IDLE)
3047 return -EINVAL; 3277 return -EINVAL;
3048 } 3278 }
3049 3279
3280 if (attr->sched_flags & ~(SCHED_FLAG_RESET_ON_FORK))
3281 return -EINVAL;
3282
3050 /* 3283 /*
3051 * Valid priorities for SCHED_FIFO and SCHED_RR are 3284 * Valid priorities for SCHED_FIFO and SCHED_RR are
3052 * 1..MAX_USER_RT_PRIO-1, valid priority for SCHED_NORMAL, 3285 * 1..MAX_USER_RT_PRIO-1, valid priority for SCHED_NORMAL,
3053 * SCHED_BATCH and SCHED_IDLE is 0. 3286 * SCHED_BATCH and SCHED_IDLE is 0.
3054 */ 3287 */
3055 if (param->sched_priority < 0 || 3288 if ((p->mm && attr->sched_priority > MAX_USER_RT_PRIO-1) ||
3056 (p->mm && param->sched_priority > MAX_USER_RT_PRIO-1) || 3289 (!p->mm && attr->sched_priority > MAX_RT_PRIO-1))
3057 (!p->mm && param->sched_priority > MAX_RT_PRIO-1))
3058 return -EINVAL; 3290 return -EINVAL;
3059 if (rt_policy(policy) != (param->sched_priority != 0)) 3291 if ((dl_policy(policy) && !__checkparam_dl(attr)) ||
3292 (rt_policy(policy) != (attr->sched_priority != 0)))
3060 return -EINVAL; 3293 return -EINVAL;
3061 3294
3062 /* 3295 /*
3063 * Allow unprivileged RT tasks to decrease priority: 3296 * Allow unprivileged RT tasks to decrease priority:
3064 */ 3297 */
3065 if (user && !capable(CAP_SYS_NICE)) { 3298 if (user && !capable(CAP_SYS_NICE)) {
3299 if (fair_policy(policy)) {
3300 if (attr->sched_nice < TASK_NICE(p) &&
3301 !can_nice(p, attr->sched_nice))
3302 return -EPERM;
3303 }
3304
3066 if (rt_policy(policy)) { 3305 if (rt_policy(policy)) {
3067 unsigned long rlim_rtprio = 3306 unsigned long rlim_rtprio =
3068 task_rlimit(p, RLIMIT_RTPRIO); 3307 task_rlimit(p, RLIMIT_RTPRIO);
@@ -3072,8 +3311,8 @@ recheck:
3072 return -EPERM; 3311 return -EPERM;
3073 3312
3074 /* can't increase priority */ 3313 /* can't increase priority */
3075 if (param->sched_priority > p->rt_priority && 3314 if (attr->sched_priority > p->rt_priority &&
3076 param->sched_priority > rlim_rtprio) 3315 attr->sched_priority > rlim_rtprio)
3077 return -EPERM; 3316 return -EPERM;
3078 } 3317 }
3079 3318
@@ -3121,14 +3360,21 @@ recheck:
3121 /* 3360 /*
3122 * If not changing anything there's no need to proceed further: 3361 * If not changing anything there's no need to proceed further:
3123 */ 3362 */
3124 if (unlikely(policy == p->policy && (!rt_policy(policy) || 3363 if (unlikely(policy == p->policy)) {
3125 param->sched_priority == p->rt_priority))) { 3364 if (fair_policy(policy) && attr->sched_nice != TASK_NICE(p))
3365 goto change;
3366 if (rt_policy(policy) && attr->sched_priority != p->rt_priority)
3367 goto change;
3368 if (dl_policy(policy))
3369 goto change;
3370
3126 task_rq_unlock(rq, p, &flags); 3371 task_rq_unlock(rq, p, &flags);
3127 return 0; 3372 return 0;
3128 } 3373 }
3374change:
3129 3375
3130#ifdef CONFIG_RT_GROUP_SCHED
3131 if (user) { 3376 if (user) {
3377#ifdef CONFIG_RT_GROUP_SCHED
3132 /* 3378 /*
3133 * Do not allow realtime tasks into groups that have no runtime 3379 * Do not allow realtime tasks into groups that have no runtime
3134 * assigned. 3380 * assigned.
@@ -3139,8 +3385,24 @@ recheck:
3139 task_rq_unlock(rq, p, &flags); 3385 task_rq_unlock(rq, p, &flags);
3140 return -EPERM; 3386 return -EPERM;
3141 } 3387 }
3142 }
3143#endif 3388#endif
3389#ifdef CONFIG_SMP
3390 if (dl_bandwidth_enabled() && dl_policy(policy)) {
3391 cpumask_t *span = rq->rd->span;
3392
3393 /*
3394 * Don't allow tasks with an affinity mask smaller than
3395 * the entire root_domain to become SCHED_DEADLINE. We
3396 * will also fail if there's no bandwidth available.
3397 */
3398 if (!cpumask_subset(span, &p->cpus_allowed) ||
3399 rq->rd->dl_bw.bw == 0) {
3400 task_rq_unlock(rq, p, &flags);
3401 return -EPERM;
3402 }
3403 }
3404#endif
3405 }
3144 3406
3145 /* recheck policy now with rq lock held */ 3407 /* recheck policy now with rq lock held */
3146 if (unlikely(oldpolicy != -1 && oldpolicy != p->policy)) { 3408 if (unlikely(oldpolicy != -1 && oldpolicy != p->policy)) {
@@ -3148,6 +3410,17 @@ recheck:
3148 task_rq_unlock(rq, p, &flags); 3410 task_rq_unlock(rq, p, &flags);
3149 goto recheck; 3411 goto recheck;
3150 } 3412 }
3413
3414 /*
3415 * If setscheduling to SCHED_DEADLINE (or changing the parameters
3416 * of a SCHED_DEADLINE task) we need to check if enough bandwidth
3417 * is available.
3418 */
3419 if ((dl_policy(policy) || dl_task(p)) && dl_overflow(p, policy, attr)) {
3420 task_rq_unlock(rq, p, &flags);
3421 return -EBUSY;
3422 }
3423
3151 on_rq = p->on_rq; 3424 on_rq = p->on_rq;
3152 running = task_current(rq, p); 3425 running = task_current(rq, p);
3153 if (on_rq) 3426 if (on_rq)
@@ -3159,7 +3432,7 @@ recheck:
3159 3432
3160 oldprio = p->prio; 3433 oldprio = p->prio;
3161 prev_class = p->sched_class; 3434 prev_class = p->sched_class;
3162 __setscheduler(rq, p, policy, param->sched_priority); 3435 __setscheduler(rq, p, attr);
3163 3436
3164 if (running) 3437 if (running)
3165 p->sched_class->set_curr_task(rq); 3438 p->sched_class->set_curr_task(rq);
@@ -3174,6 +3447,26 @@ recheck:
3174 return 0; 3447 return 0;
3175} 3448}
3176 3449
3450static int _sched_setscheduler(struct task_struct *p, int policy,
3451 const struct sched_param *param, bool check)
3452{
3453 struct sched_attr attr = {
3454 .sched_policy = policy,
3455 .sched_priority = param->sched_priority,
3456 .sched_nice = PRIO_TO_NICE(p->static_prio),
3457 };
3458
3459 /*
3460 * Fixup the legacy SCHED_RESET_ON_FORK hack
3461 */
3462 if (policy & SCHED_RESET_ON_FORK) {
3463 attr.sched_flags |= SCHED_FLAG_RESET_ON_FORK;
3464 policy &= ~SCHED_RESET_ON_FORK;
3465 attr.sched_policy = policy;
3466 }
3467
3468 return __sched_setscheduler(p, &attr, check);
3469}
3177/** 3470/**
3178 * sched_setscheduler - change the scheduling policy and/or RT priority of a thread. 3471 * sched_setscheduler - change the scheduling policy and/or RT priority of a thread.
3179 * @p: the task in question. 3472 * @p: the task in question.
@@ -3187,10 +3480,16 @@ recheck:
3187int sched_setscheduler(struct task_struct *p, int policy, 3480int sched_setscheduler(struct task_struct *p, int policy,
3188 const struct sched_param *param) 3481 const struct sched_param *param)
3189{ 3482{
3190 return __sched_setscheduler(p, policy, param, true); 3483 return _sched_setscheduler(p, policy, param, true);
3191} 3484}
3192EXPORT_SYMBOL_GPL(sched_setscheduler); 3485EXPORT_SYMBOL_GPL(sched_setscheduler);
3193 3486
3487int sched_setattr(struct task_struct *p, const struct sched_attr *attr)
3488{
3489 return __sched_setscheduler(p, attr, true);
3490}
3491EXPORT_SYMBOL_GPL(sched_setattr);
3492
3194/** 3493/**
3195 * sched_setscheduler_nocheck - change the scheduling policy and/or RT priority of a thread from kernelspace. 3494 * sched_setscheduler_nocheck - change the scheduling policy and/or RT priority of a thread from kernelspace.
3196 * @p: the task in question. 3495 * @p: the task in question.
@@ -3207,7 +3506,7 @@ EXPORT_SYMBOL_GPL(sched_setscheduler);
3207int sched_setscheduler_nocheck(struct task_struct *p, int policy, 3506int sched_setscheduler_nocheck(struct task_struct *p, int policy,
3208 const struct sched_param *param) 3507 const struct sched_param *param)
3209{ 3508{
3210 return __sched_setscheduler(p, policy, param, false); 3509 return _sched_setscheduler(p, policy, param, false);
3211} 3510}
3212 3511
3213static int 3512static int
@@ -3232,6 +3531,79 @@ do_sched_setscheduler(pid_t pid, int policy, struct sched_param __user *param)
3232 return retval; 3531 return retval;
3233} 3532}
3234 3533
3534/*
3535 * Mimics kernel/events/core.c perf_copy_attr().
3536 */
3537static int sched_copy_attr(struct sched_attr __user *uattr,
3538 struct sched_attr *attr)
3539{
3540 u32 size;
3541 int ret;
3542
3543 if (!access_ok(VERIFY_WRITE, uattr, SCHED_ATTR_SIZE_VER0))
3544 return -EFAULT;
3545
3546 /*
 3547 * zero the full structure, so that a short copy leaves the rest zeroed.
3548 */
3549 memset(attr, 0, sizeof(*attr));
3550
3551 ret = get_user(size, &uattr->size);
3552 if (ret)
3553 return ret;
3554
3555 if (size > PAGE_SIZE) /* silly large */
3556 goto err_size;
3557
3558 if (!size) /* abi compat */
3559 size = SCHED_ATTR_SIZE_VER0;
3560
3561 if (size < SCHED_ATTR_SIZE_VER0)
3562 goto err_size;
3563
3564 /*
3565 * If we're handed a bigger struct than we know of,
3566 * ensure all the unknown bits are 0 - i.e. new
3567 * user-space does not rely on any kernel feature
 3568 * extensions we don't know about yet.
3569 */
3570 if (size > sizeof(*attr)) {
3571 unsigned char __user *addr;
3572 unsigned char __user *end;
3573 unsigned char val;
3574
3575 addr = (void __user *)uattr + sizeof(*attr);
3576 end = (void __user *)uattr + size;
3577
3578 for (; addr < end; addr++) {
3579 ret = get_user(val, addr);
3580 if (ret)
3581 return ret;
3582 if (val)
3583 goto err_size;
3584 }
3585 size = sizeof(*attr);
3586 }
3587
3588 ret = copy_from_user(attr, uattr, size);
3589 if (ret)
3590 return -EFAULT;
3591
3592 /*
3593 * XXX: do we want to be lenient like existing syscalls; or do we want
3594 * to be strict and return an error on out-of-bounds values?
3595 */
3596 attr->sched_nice = clamp(attr->sched_nice, -20, 19);
3597
3598out:
3599 return ret;
3600
3601err_size:
3602 put_user(sizeof(*attr), &uattr->size);
3603 ret = -E2BIG;
3604 goto out;
3605}
3606
3235/** 3607/**
3236 * sys_sched_setscheduler - set/change the scheduler policy and RT priority 3608 * sys_sched_setscheduler - set/change the scheduler policy and RT priority
3237 * @pid: the pid in question. 3609 * @pid: the pid in question.
@@ -3263,6 +3635,33 @@ SYSCALL_DEFINE2(sched_setparam, pid_t, pid, struct sched_param __user *, param)
3263} 3635}
3264 3636
3265/** 3637/**
3638 * sys_sched_setattr - same as above, but with extended sched_attr
3639 * @pid: the pid in question.
3640 * @uattr: structure containing the extended parameters.
3641 */
3642SYSCALL_DEFINE2(sched_setattr, pid_t, pid, struct sched_attr __user *, uattr)
3643{
3644 struct sched_attr attr;
3645 struct task_struct *p;
3646 int retval;
3647
3648 if (!uattr || pid < 0)
3649 return -EINVAL;
3650
3651 if (sched_copy_attr(uattr, &attr))
3652 return -EFAULT;
3653
3654 rcu_read_lock();
3655 retval = -ESRCH;
3656 p = find_process_by_pid(pid);
3657 if (p != NULL)
3658 retval = sched_setattr(p, &attr);
3659 rcu_read_unlock();
3660
3661 return retval;
3662}
3663
3664/**
3266 * sys_sched_getscheduler - get the policy (scheduling class) of a thread 3665 * sys_sched_getscheduler - get the policy (scheduling class) of a thread
3267 * @pid: the pid in question. 3666 * @pid: the pid in question.
3268 * 3667 *
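
For reference, this is roughly how the new syscall would be exercised from user space. There is no libc wrapper, so the structure layout and the SCHED_DEADLINE policy value are spelled out locally; those values, and the availability of __NR_sched_setattr on a given architecture, are assumptions. Switching to SCHED_DEADLINE normally requires privilege (CAP_SYS_NICE), and later kernels added a flags argument that must be zero, which is why an explicit third 0 is passed below (harmless against the two-argument prototype shown in this hunk).

#define _GNU_SOURCE
#include <stdint.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <sys/syscall.h>

/* Local copy of the layout introduced here (48 bytes = SCHED_ATTR_SIZE_VER0). */
struct sched_attr_user {
	uint32_t size;
	uint32_t sched_policy;
	uint64_t sched_flags;
	int32_t  sched_nice;
	uint32_t sched_priority;
	uint64_t sched_runtime;
	uint64_t sched_deadline;
	uint64_t sched_period;
};

#define SCHED_DEADLINE_POLICY 6		/* assumed value of SCHED_DEADLINE */

int main(void)
{
	struct sched_attr_user attr;

	memset(&attr, 0, sizeof(attr));
	attr.size = sizeof(attr);
	attr.sched_policy = SCHED_DEADLINE_POLICY;
	attr.sched_runtime  = 10 * 1000 * 1000;		/* 10ms of budget ... */
	attr.sched_deadline = 30 * 1000 * 1000;		/* ... within 30ms ... */
	attr.sched_period   = 100 * 1000 * 1000;	/* ... every 100ms */

#ifdef __NR_sched_setattr
	if (syscall(__NR_sched_setattr, 0, &attr, 0))	/* pid 0 = calling thread */
		perror("sched_setattr");
	else
		puts("now running as SCHED_DEADLINE");
#else
	fprintf(stderr, "__NR_sched_setattr not known on this libc/arch\n");
#endif
	return 0;
}
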
@@ -3317,6 +3716,10 @@ SYSCALL_DEFINE2(sched_getparam, pid_t, pid, struct sched_param __user *, param)
3317 if (retval) 3716 if (retval)
3318 goto out_unlock; 3717 goto out_unlock;
3319 3718
3719 if (task_has_dl_policy(p)) {
3720 retval = -EINVAL;
3721 goto out_unlock;
3722 }
3320 lp.sched_priority = p->rt_priority; 3723 lp.sched_priority = p->rt_priority;
3321 rcu_read_unlock(); 3724 rcu_read_unlock();
3322 3725
@@ -3332,6 +3735,96 @@ out_unlock:
3332 return retval; 3735 return retval;
3333} 3736}
3334 3737
3738static int sched_read_attr(struct sched_attr __user *uattr,
3739 struct sched_attr *attr,
3740 unsigned int usize)
3741{
3742 int ret;
3743
3744 if (!access_ok(VERIFY_WRITE, uattr, usize))
3745 return -EFAULT;
3746
3747 /*
3748 * If we're handed a smaller struct than we know of,
3749 * ensure all the unknown bits are 0 - i.e. old
3750 * user-space does not get incomplete information.
3751 */
3752 if (usize < sizeof(*attr)) {
3753 unsigned char *addr;
3754 unsigned char *end;
3755
3756 addr = (void *)attr + usize;
3757 end = (void *)attr + sizeof(*attr);
3758
3759 for (; addr < end; addr++) {
3760 if (*addr)
3761 goto err_size;
3762 }
3763
3764 attr->size = usize;
3765 }
3766
3767 ret = copy_to_user(uattr, attr, usize);
3768 if (ret)
3769 return -EFAULT;
3770
3771out:
3772 return ret;
3773
3774err_size:
3775 ret = -E2BIG;
3776 goto out;
3777}
3778
3779/**
3780 * sys_sched_getattr - similar to sched_getparam, but with sched_attr
3781 * @pid: the pid in question.
3782 * @uattr: structure containing the extended parameters.
3783 * @size: sizeof(attr) for fwd/bwd comp.
3784 */
3785SYSCALL_DEFINE3(sched_getattr, pid_t, pid, struct sched_attr __user *, uattr,
3786 unsigned int, size)
3787{
3788 struct sched_attr attr = {
3789 .size = sizeof(struct sched_attr),
3790 };
3791 struct task_struct *p;
3792 int retval;
3793
3794 if (!uattr || pid < 0 || size > PAGE_SIZE ||
3795 size < SCHED_ATTR_SIZE_VER0)
3796 return -EINVAL;
3797
3798 rcu_read_lock();
3799 p = find_process_by_pid(pid);
3800 retval = -ESRCH;
3801 if (!p)
3802 goto out_unlock;
3803
3804 retval = security_task_getscheduler(p);
3805 if (retval)
3806 goto out_unlock;
3807
3808 attr.sched_policy = p->policy;
3809 if (p->sched_reset_on_fork)
3810 attr.sched_flags |= SCHED_FLAG_RESET_ON_FORK;
3811 if (task_has_dl_policy(p))
3812 __getparam_dl(p, &attr);
3813 else if (task_has_rt_policy(p))
3814 attr.sched_priority = p->rt_priority;
3815 else
3816 attr.sched_nice = TASK_NICE(p);
3817
3818 rcu_read_unlock();
3819
3820 retval = sched_read_attr(uattr, &attr, size);
3821 return retval;
3822
3823out_unlock:
3824 rcu_read_unlock();
3825 return retval;
3826}
3827
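The read side is symmetric: the caller passes the size of its own struct sched_attr and sched_read_attr() above trims or rejects accordingly. A companion sketch, under the same assumptions and helper definitions as the sched_setattr() sketch above:

	static int sched_getattr(pid_t pid, struct sched_attr *attr,
				 unsigned int size)
	{
		return syscall(__NR_sched_getattr, pid, attr, size);
	}

	/* Read back what the kernel currently thinks of the calling task. */
	static void dump_attr(void)
	{
		struct sched_attr attr;

		if (sched_getattr(0, &attr, sizeof(attr)) == 0)
			printf("policy=%u runtime=%llu deadline=%llu period=%llu\n",
			       attr.sched_policy,
			       (unsigned long long)attr.sched_runtime,
			       (unsigned long long)attr.sched_deadline,
			       (unsigned long long)attr.sched_period);
	}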
3335long sched_setaffinity(pid_t pid, const struct cpumask *in_mask) 3828long sched_setaffinity(pid_t pid, const struct cpumask *in_mask)
3336{ 3829{
3337 cpumask_var_t cpus_allowed, new_mask; 3830 cpumask_var_t cpus_allowed, new_mask;
@@ -3376,8 +3869,26 @@ long sched_setaffinity(pid_t pid, const struct cpumask *in_mask)
3376 if (retval) 3869 if (retval)
3377 goto out_unlock; 3870 goto out_unlock;
3378 3871
3872
3379 cpuset_cpus_allowed(p, cpus_allowed); 3873 cpuset_cpus_allowed(p, cpus_allowed);
3380 cpumask_and(new_mask, in_mask, cpus_allowed); 3874 cpumask_and(new_mask, in_mask, cpus_allowed);
3875
3876 /*
3877 * Since bandwidth control happens on root_domain basis,
3878 * if admission test is enabled, we only admit -deadline
3879 * tasks allowed to run on all the CPUs in the task's
3880 * root_domain.
3881 */
3882#ifdef CONFIG_SMP
3883 if (task_has_dl_policy(p)) {
3884 const struct cpumask *span = task_rq(p)->rd->span;
3885
3886 if (dl_bandwidth_enabled() && !cpumask_subset(span, new_mask)) {
3887 retval = -EBUSY;
3888 goto out_unlock;
3889 }
3890 }
3891#endif
3381again: 3892again:
3382 retval = set_cpus_allowed_ptr(p, new_mask); 3893 retval = set_cpus_allowed_ptr(p, new_mask);
3383 3894
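The hunk above ties -deadline tasks into sched_setaffinity(): since bandwidth is accounted per root_domain, an already-admitted SCHED_DEADLINE task may not be confined to a strict subset of that domain while admission control is enabled, and the request is refused with -EBUSY. A short fragment of the expected userspace-visible behaviour (assumes _GNU_SOURCE, <sched.h>, <errno.h>, <stdio.h> and a task already running as SCHED_DEADLINE):

	cpu_set_t set;

	CPU_ZERO(&set);
	CPU_SET(0, &set);	/* a strict subset of the root domain */

	if (sched_setaffinity(0, sizeof(set), &set) == -1 && errno == EBUSY)
		fprintf(stderr,
			"affinity of a -deadline task must span its root domain\n");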
@@ -3654,7 +4165,7 @@ again:
3654 } 4165 }
3655 4166
3656 double_rq_lock(rq, p_rq); 4167 double_rq_lock(rq, p_rq);
3657 while (task_rq(p) != p_rq) { 4168 if (task_rq(p) != p_rq) {
3658 double_rq_unlock(rq, p_rq); 4169 double_rq_unlock(rq, p_rq);
3659 goto again; 4170 goto again;
3660 } 4171 }
@@ -3743,6 +4254,7 @@ SYSCALL_DEFINE1(sched_get_priority_max, int, policy)
3743 case SCHED_RR: 4254 case SCHED_RR:
3744 ret = MAX_USER_RT_PRIO-1; 4255 ret = MAX_USER_RT_PRIO-1;
3745 break; 4256 break;
4257 case SCHED_DEADLINE:
3746 case SCHED_NORMAL: 4258 case SCHED_NORMAL:
3747 case SCHED_BATCH: 4259 case SCHED_BATCH:
3748 case SCHED_IDLE: 4260 case SCHED_IDLE:
@@ -3769,6 +4281,7 @@ SYSCALL_DEFINE1(sched_get_priority_min, int, policy)
3769 case SCHED_RR: 4281 case SCHED_RR:
3770 ret = 1; 4282 ret = 1;
3771 break; 4283 break;
4284 case SCHED_DEADLINE:
3772 case SCHED_NORMAL: 4285 case SCHED_NORMAL:
3773 case SCHED_BATCH: 4286 case SCHED_BATCH:
3774 case SCHED_IDLE: 4287 case SCHED_IDLE:
@@ -4091,6 +4604,7 @@ int migrate_task_to(struct task_struct *p, int target_cpu)
4091 4604
4092 /* TODO: This is not properly updating schedstats */ 4605 /* TODO: This is not properly updating schedstats */
4093 4606
4607 trace_sched_move_numa(p, curr_cpu, target_cpu);
4094 return stop_one_cpu(curr_cpu, migration_cpu_stop, &arg); 4608 return stop_one_cpu(curr_cpu, migration_cpu_stop, &arg);
4095} 4609}
4096 4610
@@ -4515,13 +5029,31 @@ static int sched_cpu_active(struct notifier_block *nfb,
4515static int sched_cpu_inactive(struct notifier_block *nfb, 5029static int sched_cpu_inactive(struct notifier_block *nfb,
4516 unsigned long action, void *hcpu) 5030 unsigned long action, void *hcpu)
4517{ 5031{
5032 unsigned long flags;
5033 long cpu = (long)hcpu;
5034
4518 switch (action & ~CPU_TASKS_FROZEN) { 5035 switch (action & ~CPU_TASKS_FROZEN) {
4519 case CPU_DOWN_PREPARE: 5036 case CPU_DOWN_PREPARE:
4520 set_cpu_active((long)hcpu, false); 5037 set_cpu_active(cpu, false);
5038
5039 /* explicitly allow suspend */
5040 if (!(action & CPU_TASKS_FROZEN)) {
5041 struct dl_bw *dl_b = dl_bw_of(cpu);
5042 bool overflow;
5043 int cpus;
5044
5045 raw_spin_lock_irqsave(&dl_b->lock, flags);
5046 cpus = dl_bw_cpus(cpu);
5047 overflow = __dl_overflow(dl_b, cpus, 0, 0);
5048 raw_spin_unlock_irqrestore(&dl_b->lock, flags);
5049
5050 if (overflow)
5051 return notifier_from_errno(-EBUSY);
5052 }
4521 return NOTIFY_OK; 5053 return NOTIFY_OK;
4522 default:
4523 return NOTIFY_DONE;
4524 } 5054 }
5055
5056 return NOTIFY_DONE;
4525} 5057}
4526 5058
4527static int __init migration_init(void) 5059static int __init migration_init(void)
@@ -4740,6 +5272,8 @@ static void free_rootdomain(struct rcu_head *rcu)
4740 struct root_domain *rd = container_of(rcu, struct root_domain, rcu); 5272 struct root_domain *rd = container_of(rcu, struct root_domain, rcu);
4741 5273
4742 cpupri_cleanup(&rd->cpupri); 5274 cpupri_cleanup(&rd->cpupri);
5275 cpudl_cleanup(&rd->cpudl);
5276 free_cpumask_var(rd->dlo_mask);
4743 free_cpumask_var(rd->rto_mask); 5277 free_cpumask_var(rd->rto_mask);
4744 free_cpumask_var(rd->online); 5278 free_cpumask_var(rd->online);
4745 free_cpumask_var(rd->span); 5279 free_cpumask_var(rd->span);
@@ -4762,7 +5296,7 @@ static void rq_attach_root(struct rq *rq, struct root_domain *rd)
4762 cpumask_clear_cpu(rq->cpu, old_rd->span); 5296 cpumask_clear_cpu(rq->cpu, old_rd->span);
4763 5297
4764 /* 5298 /*
4765 * If we dont want to free the old_rt yet then 5299 * If we dont want to free the old_rd yet then
4766 * set old_rd to NULL to skip the freeing later 5300 * set old_rd to NULL to skip the freeing later
4767 * in this function: 5301 * in this function:
4768 */ 5302 */
@@ -4791,8 +5325,14 @@ static int init_rootdomain(struct root_domain *rd)
4791 goto out; 5325 goto out;
4792 if (!alloc_cpumask_var(&rd->online, GFP_KERNEL)) 5326 if (!alloc_cpumask_var(&rd->online, GFP_KERNEL))
4793 goto free_span; 5327 goto free_span;
4794 if (!alloc_cpumask_var(&rd->rto_mask, GFP_KERNEL)) 5328 if (!alloc_cpumask_var(&rd->dlo_mask, GFP_KERNEL))
4795 goto free_online; 5329 goto free_online;
5330 if (!alloc_cpumask_var(&rd->rto_mask, GFP_KERNEL))
5331 goto free_dlo_mask;
5332
5333 init_dl_bw(&rd->dl_bw);
5334 if (cpudl_init(&rd->cpudl) != 0)
5335 goto free_dlo_mask;
4796 5336
4797 if (cpupri_init(&rd->cpupri) != 0) 5337 if (cpupri_init(&rd->cpupri) != 0)
4798 goto free_rto_mask; 5338 goto free_rto_mask;
@@ -4800,6 +5340,8 @@ static int init_rootdomain(struct root_domain *rd)
4800 5340
4801free_rto_mask: 5341free_rto_mask:
4802 free_cpumask_var(rd->rto_mask); 5342 free_cpumask_var(rd->rto_mask);
5343free_dlo_mask:
5344 free_cpumask_var(rd->dlo_mask);
4803free_online: 5345free_online:
4804 free_cpumask_var(rd->online); 5346 free_cpumask_var(rd->online);
4805free_span: 5347free_span:
@@ -4903,6 +5445,7 @@ DEFINE_PER_CPU(struct sched_domain *, sd_asym);
4903static void update_top_cache_domain(int cpu) 5445static void update_top_cache_domain(int cpu)
4904{ 5446{
4905 struct sched_domain *sd; 5447 struct sched_domain *sd;
5448 struct sched_domain *busy_sd = NULL;
4906 int id = cpu; 5449 int id = cpu;
4907 int size = 1; 5450 int size = 1;
4908 5451
@@ -4910,8 +5453,9 @@ static void update_top_cache_domain(int cpu)
4910 if (sd) { 5453 if (sd) {
4911 id = cpumask_first(sched_domain_span(sd)); 5454 id = cpumask_first(sched_domain_span(sd));
4912 size = cpumask_weight(sched_domain_span(sd)); 5455 size = cpumask_weight(sched_domain_span(sd));
4913 rcu_assign_pointer(per_cpu(sd_busy, cpu), sd->parent); 5456 busy_sd = sd->parent; /* sd_busy */
4914 } 5457 }
5458 rcu_assign_pointer(per_cpu(sd_busy, cpu), busy_sd);
4915 5459
4916 rcu_assign_pointer(per_cpu(sd_llc, cpu), sd); 5460 rcu_assign_pointer(per_cpu(sd_llc, cpu), sd);
4917 per_cpu(sd_llc_size, cpu) = size; 5461 per_cpu(sd_llc_size, cpu) = size;
@@ -5112,6 +5656,7 @@ build_overlap_sched_groups(struct sched_domain *sd, int cpu)
5112 * die on a /0 trap. 5656 * die on a /0 trap.
5113 */ 5657 */
5114 sg->sgp->power = SCHED_POWER_SCALE * cpumask_weight(sg_span); 5658 sg->sgp->power = SCHED_POWER_SCALE * cpumask_weight(sg_span);
5659 sg->sgp->power_orig = sg->sgp->power;
5115 5660
5116 /* 5661 /*
5117 * Make sure the first group of this domain contains the 5662 * Make sure the first group of this domain contains the
@@ -6148,6 +6693,7 @@ void __init sched_init_smp(void)
6148 free_cpumask_var(non_isolated_cpus); 6693 free_cpumask_var(non_isolated_cpus);
6149 6694
6150 init_sched_rt_class(); 6695 init_sched_rt_class();
6696 init_sched_dl_class();
6151} 6697}
6152#else 6698#else
6153void __init sched_init_smp(void) 6699void __init sched_init_smp(void)
@@ -6217,13 +6763,15 @@ void __init sched_init(void)
6217#endif /* CONFIG_CPUMASK_OFFSTACK */ 6763#endif /* CONFIG_CPUMASK_OFFSTACK */
6218 } 6764 }
6219 6765
6766 init_rt_bandwidth(&def_rt_bandwidth,
6767 global_rt_period(), global_rt_runtime());
6768 init_dl_bandwidth(&def_dl_bandwidth,
6769 global_rt_period(), global_rt_runtime());
6770
6220#ifdef CONFIG_SMP 6771#ifdef CONFIG_SMP
6221 init_defrootdomain(); 6772 init_defrootdomain();
6222#endif 6773#endif
6223 6774
6224 init_rt_bandwidth(&def_rt_bandwidth,
6225 global_rt_period(), global_rt_runtime());
6226
6227#ifdef CONFIG_RT_GROUP_SCHED 6775#ifdef CONFIG_RT_GROUP_SCHED
6228 init_rt_bandwidth(&root_task_group.rt_bandwidth, 6776 init_rt_bandwidth(&root_task_group.rt_bandwidth,
6229 global_rt_period(), global_rt_runtime()); 6777 global_rt_period(), global_rt_runtime());
@@ -6247,6 +6795,7 @@ void __init sched_init(void)
6247 rq->calc_load_update = jiffies + LOAD_FREQ; 6795 rq->calc_load_update = jiffies + LOAD_FREQ;
6248 init_cfs_rq(&rq->cfs); 6796 init_cfs_rq(&rq->cfs);
6249 init_rt_rq(&rq->rt, rq); 6797 init_rt_rq(&rq->rt, rq);
6798 init_dl_rq(&rq->dl, rq);
6250#ifdef CONFIG_FAIR_GROUP_SCHED 6799#ifdef CONFIG_FAIR_GROUP_SCHED
6251 root_task_group.shares = ROOT_TASK_GROUP_LOAD; 6800 root_task_group.shares = ROOT_TASK_GROUP_LOAD;
6252 INIT_LIST_HEAD(&rq->leaf_cfs_rq_list); 6801 INIT_LIST_HEAD(&rq->leaf_cfs_rq_list);
@@ -6318,10 +6867,6 @@ void __init sched_init(void)
6318 INIT_HLIST_HEAD(&init_task.preempt_notifiers); 6867 INIT_HLIST_HEAD(&init_task.preempt_notifiers);
6319#endif 6868#endif
6320 6869
6321#ifdef CONFIG_RT_MUTEXES
6322 plist_head_init(&init_task.pi_waiters);
6323#endif
6324
6325 /* 6870 /*
6326 * The boot idle thread does lazy MMU switching as well: 6871 * The boot idle thread does lazy MMU switching as well:
6327 */ 6872 */
@@ -6395,13 +6940,16 @@ EXPORT_SYMBOL(__might_sleep);
6395static void normalize_task(struct rq *rq, struct task_struct *p) 6940static void normalize_task(struct rq *rq, struct task_struct *p)
6396{ 6941{
6397 const struct sched_class *prev_class = p->sched_class; 6942 const struct sched_class *prev_class = p->sched_class;
6943 struct sched_attr attr = {
6944 .sched_policy = SCHED_NORMAL,
6945 };
6398 int old_prio = p->prio; 6946 int old_prio = p->prio;
6399 int on_rq; 6947 int on_rq;
6400 6948
6401 on_rq = p->on_rq; 6949 on_rq = p->on_rq;
6402 if (on_rq) 6950 if (on_rq)
6403 dequeue_task(rq, p, 0); 6951 dequeue_task(rq, p, 0);
6404 __setscheduler(rq, p, SCHED_NORMAL, 0); 6952 __setscheduler(rq, p, &attr);
6405 if (on_rq) { 6953 if (on_rq) {
6406 enqueue_task(rq, p, 0); 6954 enqueue_task(rq, p, 0);
6407 resched_task(rq->curr); 6955 resched_task(rq->curr);
@@ -6431,7 +6979,7 @@ void normalize_rt_tasks(void)
6431 p->se.statistics.block_start = 0; 6979 p->se.statistics.block_start = 0;
6432#endif 6980#endif
6433 6981
6434 if (!rt_task(p)) { 6982 if (!dl_task(p) && !rt_task(p)) {
6435 /* 6983 /*
6436 * Renice negative nice level userspace 6984 * Renice negative nice level userspace
6437 * tasks back to 0: 6985 * tasks back to 0:
@@ -6626,16 +7174,6 @@ void sched_move_task(struct task_struct *tsk)
6626} 7174}
6627#endif /* CONFIG_CGROUP_SCHED */ 7175#endif /* CONFIG_CGROUP_SCHED */
6628 7176
6629#if defined(CONFIG_RT_GROUP_SCHED) || defined(CONFIG_CFS_BANDWIDTH)
6630static unsigned long to_ratio(u64 period, u64 runtime)
6631{
6632 if (runtime == RUNTIME_INF)
6633 return 1ULL << 20;
6634
6635 return div64_u64(runtime << 20, period);
6636}
6637#endif
6638
6639#ifdef CONFIG_RT_GROUP_SCHED 7177#ifdef CONFIG_RT_GROUP_SCHED
6640/* 7178/*
6641 * Ensure that the real time constraints are schedulable. 7179 * Ensure that the real time constraints are schedulable.
@@ -6809,24 +7347,13 @@ static long sched_group_rt_period(struct task_group *tg)
6809 do_div(rt_period_us, NSEC_PER_USEC); 7347 do_div(rt_period_us, NSEC_PER_USEC);
6810 return rt_period_us; 7348 return rt_period_us;
6811} 7349}
7350#endif /* CONFIG_RT_GROUP_SCHED */
6812 7351
7352#ifdef CONFIG_RT_GROUP_SCHED
6813static int sched_rt_global_constraints(void) 7353static int sched_rt_global_constraints(void)
6814{ 7354{
6815 u64 runtime, period;
6816 int ret = 0; 7355 int ret = 0;
6817 7356
6818 if (sysctl_sched_rt_period <= 0)
6819 return -EINVAL;
6820
6821 runtime = global_rt_runtime();
6822 period = global_rt_period();
6823
6824 /*
6825 * Sanity check on the sysctl variables.
6826 */
6827 if (runtime > period && runtime != RUNTIME_INF)
6828 return -EINVAL;
6829
6830 mutex_lock(&rt_constraints_mutex); 7357 mutex_lock(&rt_constraints_mutex);
6831 read_lock(&tasklist_lock); 7358 read_lock(&tasklist_lock);
6832 ret = __rt_schedulable(NULL, 0, 0); 7359 ret = __rt_schedulable(NULL, 0, 0);
@@ -6849,17 +7376,7 @@ static int sched_rt_can_attach(struct task_group *tg, struct task_struct *tsk)
6849static int sched_rt_global_constraints(void) 7376static int sched_rt_global_constraints(void)
6850{ 7377{
6851 unsigned long flags; 7378 unsigned long flags;
6852 int i; 7379 int i, ret = 0;
6853
6854 if (sysctl_sched_rt_period <= 0)
6855 return -EINVAL;
6856
6857 /*
6858 * There's always some RT tasks in the root group
6859 * -- migration, kstopmachine etc..
6860 */
6861 if (sysctl_sched_rt_runtime == 0)
6862 return -EBUSY;
6863 7380
6864 raw_spin_lock_irqsave(&def_rt_bandwidth.rt_runtime_lock, flags); 7381 raw_spin_lock_irqsave(&def_rt_bandwidth.rt_runtime_lock, flags);
6865 for_each_possible_cpu(i) { 7382 for_each_possible_cpu(i) {
@@ -6871,36 +7388,88 @@ static int sched_rt_global_constraints(void)
6871 } 7388 }
6872 raw_spin_unlock_irqrestore(&def_rt_bandwidth.rt_runtime_lock, flags); 7389 raw_spin_unlock_irqrestore(&def_rt_bandwidth.rt_runtime_lock, flags);
6873 7390
6874 return 0; 7391 return ret;
6875} 7392}
6876#endif /* CONFIG_RT_GROUP_SCHED */ 7393#endif /* CONFIG_RT_GROUP_SCHED */
6877 7394
6878int sched_rr_handler(struct ctl_table *table, int write, 7395static int sched_dl_global_constraints(void)
6879 void __user *buffer, size_t *lenp,
6880 loff_t *ppos)
6881{ 7396{
6882 int ret; 7397 u64 runtime = global_rt_runtime();
6883 static DEFINE_MUTEX(mutex); 7398 u64 period = global_rt_period();
7399 u64 new_bw = to_ratio(period, runtime);
7400 int cpu, ret = 0;
6884 7401
6885 mutex_lock(&mutex); 7402 /*
6886 ret = proc_dointvec(table, write, buffer, lenp, ppos); 7403 * Here we want to check the bandwidth not being set to some
6887 /* make sure that internally we keep jiffies */ 7404 * value smaller than the currently allocated bandwidth in
6888 /* also, writing zero resets timeslice to default */ 7405 * any of the root_domains.
6889 if (!ret && write) { 7406 *
6890 sched_rr_timeslice = sched_rr_timeslice <= 0 ? 7407 * FIXME: Cycling on all the CPUs is overdoing, but simpler than
6891 RR_TIMESLICE : msecs_to_jiffies(sched_rr_timeslice); 7408 * cycling on root_domains... Discussion on different/better
7409 * solutions is welcome!
7410 */
7411 for_each_possible_cpu(cpu) {
7412 struct dl_bw *dl_b = dl_bw_of(cpu);
7413
7414 raw_spin_lock(&dl_b->lock);
7415 if (new_bw < dl_b->total_bw)
7416 ret = -EBUSY;
7417 raw_spin_unlock(&dl_b->lock);
7418
7419 if (ret)
7420 break;
6892 } 7421 }
6893 mutex_unlock(&mutex); 7422
6894 return ret; 7423 return ret;
6895} 7424}
6896 7425
7426static void sched_dl_do_global(void)
7427{
7428 u64 new_bw = -1;
7429 int cpu;
7430
7431 def_dl_bandwidth.dl_period = global_rt_period();
7432 def_dl_bandwidth.dl_runtime = global_rt_runtime();
7433
7434 if (global_rt_runtime() != RUNTIME_INF)
7435 new_bw = to_ratio(global_rt_period(), global_rt_runtime());
7436
7437 /*
7438 * FIXME: As above...
7439 */
7440 for_each_possible_cpu(cpu) {
7441 struct dl_bw *dl_b = dl_bw_of(cpu);
7442
7443 raw_spin_lock(&dl_b->lock);
7444 dl_b->bw = new_bw;
7445 raw_spin_unlock(&dl_b->lock);
7446 }
7447}
7448
7449static int sched_rt_global_validate(void)
7450{
7451 if (sysctl_sched_rt_period <= 0)
7452 return -EINVAL;
7453
7454 if (sysctl_sched_rt_runtime > sysctl_sched_rt_period)
7455 return -EINVAL;
7456
7457 return 0;
7458}
7459
7460static void sched_rt_do_global(void)
7461{
7462 def_rt_bandwidth.rt_runtime = global_rt_runtime();
7463 def_rt_bandwidth.rt_period = ns_to_ktime(global_rt_period());
7464}
7465
6897int sched_rt_handler(struct ctl_table *table, int write, 7466int sched_rt_handler(struct ctl_table *table, int write,
6898 void __user *buffer, size_t *lenp, 7467 void __user *buffer, size_t *lenp,
6899 loff_t *ppos) 7468 loff_t *ppos)
6900{ 7469{
6901 int ret;
6902 int old_period, old_runtime; 7470 int old_period, old_runtime;
6903 static DEFINE_MUTEX(mutex); 7471 static DEFINE_MUTEX(mutex);
7472 int ret;
6904 7473
6905 mutex_lock(&mutex); 7474 mutex_lock(&mutex);
6906 old_period = sysctl_sched_rt_period; 7475 old_period = sysctl_sched_rt_period;
@@ -6909,21 +7478,50 @@ int sched_rt_handler(struct ctl_table *table, int write,
6909 ret = proc_dointvec(table, write, buffer, lenp, ppos); 7478 ret = proc_dointvec(table, write, buffer, lenp, ppos);
6910 7479
6911 if (!ret && write) { 7480 if (!ret && write) {
7481 ret = sched_rt_global_validate();
7482 if (ret)
7483 goto undo;
7484
6912 ret = sched_rt_global_constraints(); 7485 ret = sched_rt_global_constraints();
6913 if (ret) { 7486 if (ret)
6914 sysctl_sched_rt_period = old_period; 7487 goto undo;
6915 sysctl_sched_rt_runtime = old_runtime; 7488
6916 } else { 7489 ret = sched_dl_global_constraints();
6917 def_rt_bandwidth.rt_runtime = global_rt_runtime(); 7490 if (ret)
6918 def_rt_bandwidth.rt_period = 7491 goto undo;
6919 ns_to_ktime(global_rt_period()); 7492
6920 } 7493 sched_rt_do_global();
7494 sched_dl_do_global();
7495 }
7496 if (0) {
7497undo:
7498 sysctl_sched_rt_period = old_period;
7499 sysctl_sched_rt_runtime = old_runtime;
6921 } 7500 }
6922 mutex_unlock(&mutex); 7501 mutex_unlock(&mutex);
6923 7502
6924 return ret; 7503 return ret;
6925} 7504}
6926 7505
7506int sched_rr_handler(struct ctl_table *table, int write,
7507 void __user *buffer, size_t *lenp,
7508 loff_t *ppos)
7509{
7510 int ret;
7511 static DEFINE_MUTEX(mutex);
7512
7513 mutex_lock(&mutex);
7514 ret = proc_dointvec(table, write, buffer, lenp, ppos);
7515 /* make sure that internally we keep jiffies */
7516 /* also, writing zero resets timeslice to default */
7517 if (!ret && write) {
7518 sched_rr_timeslice = sched_rr_timeslice <= 0 ?
7519 RR_TIMESLICE : msecs_to_jiffies(sched_rr_timeslice);
7520 }
7521 mutex_unlock(&mutex);
7522 return ret;
7523}
7524
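sched_dl_global_constraints() above compares the would-be global cap, to_ratio(period, runtime) = (runtime << 20) / period, against the -deadline bandwidth already allocated in every root domain, and sched_dl_do_global() then propagates the new cap. A small worked example of that fixed-point ratio, as a userspace restatement with the usual 950000/1000000 us sysctl defaults (the units cancel, so us vs. ns makes no difference):

	#include <stdint.h>
	#include <stdio.h>

	/* Userspace restatement of kernel/sched/core.c:to_ratio(). */
	static uint64_t to_ratio(uint64_t period, uint64_t runtime)
	{
		return (runtime << 20) / period;
	}

	int main(void)
	{
		/* sched_rt_runtime_us = 950000 out of sched_rt_period_us = 1000000 */
		uint64_t new_bw = to_ratio(1000000, 950000);

		/* 996147 / 1048576, i.e. ~0.95: a root_domain whose dl_b->total_bw
		 * already exceeds this makes the sysctl write fail with -EBUSY. */
		printf("global -deadline cap = %llu / %u\n",
		       (unsigned long long)new_bw, 1 << 20);
		return 0;
	}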
6927#ifdef CONFIG_CGROUP_SCHED 7525#ifdef CONFIG_CGROUP_SCHED
6928 7526
6929static inline struct task_group *css_tg(struct cgroup_subsys_state *css) 7527static inline struct task_group *css_tg(struct cgroup_subsys_state *css)
@@ -7256,15 +7854,14 @@ static int __cfs_schedulable(struct task_group *tg, u64 period, u64 quota)
7256 return ret; 7854 return ret;
7257} 7855}
7258 7856
7259static int cpu_stats_show(struct cgroup_subsys_state *css, struct cftype *cft, 7857static int cpu_stats_show(struct seq_file *sf, void *v)
7260 struct cgroup_map_cb *cb)
7261{ 7858{
7262 struct task_group *tg = css_tg(css); 7859 struct task_group *tg = css_tg(seq_css(sf));
7263 struct cfs_bandwidth *cfs_b = &tg->cfs_bandwidth; 7860 struct cfs_bandwidth *cfs_b = &tg->cfs_bandwidth;
7264 7861
7265 cb->fill(cb, "nr_periods", cfs_b->nr_periods); 7862 seq_printf(sf, "nr_periods %d\n", cfs_b->nr_periods);
7266 cb->fill(cb, "nr_throttled", cfs_b->nr_throttled); 7863 seq_printf(sf, "nr_throttled %d\n", cfs_b->nr_throttled);
7267 cb->fill(cb, "throttled_time", cfs_b->throttled_time); 7864 seq_printf(sf, "throttled_time %llu\n", cfs_b->throttled_time);
7268 7865
7269 return 0; 7866 return 0;
7270} 7867}
@@ -7318,7 +7915,7 @@ static struct cftype cpu_files[] = {
7318 }, 7915 },
7319 { 7916 {
7320 .name = "stat", 7917 .name = "stat",
7321 .read_map = cpu_stats_show, 7918 .seq_show = cpu_stats_show,
7322 }, 7919 },
7323#endif 7920#endif
7324#ifdef CONFIG_RT_GROUP_SCHED 7921#ifdef CONFIG_RT_GROUP_SCHED
diff --git a/kernel/sched/cpuacct.c b/kernel/sched/cpuacct.c
index f64722ff0299..622e0818f905 100644
--- a/kernel/sched/cpuacct.c
+++ b/kernel/sched/cpuacct.c
@@ -163,10 +163,9 @@ out:
163 return err; 163 return err;
164} 164}
165 165
166static int cpuacct_percpu_seq_read(struct cgroup_subsys_state *css, 166static int cpuacct_percpu_seq_show(struct seq_file *m, void *V)
167 struct cftype *cft, struct seq_file *m)
168{ 167{
169 struct cpuacct *ca = css_ca(css); 168 struct cpuacct *ca = css_ca(seq_css(m));
170 u64 percpu; 169 u64 percpu;
171 int i; 170 int i;
172 171
@@ -183,10 +182,9 @@ static const char * const cpuacct_stat_desc[] = {
183 [CPUACCT_STAT_SYSTEM] = "system", 182 [CPUACCT_STAT_SYSTEM] = "system",
184}; 183};
185 184
186static int cpuacct_stats_show(struct cgroup_subsys_state *css, 185static int cpuacct_stats_show(struct seq_file *sf, void *v)
187 struct cftype *cft, struct cgroup_map_cb *cb)
188{ 186{
189 struct cpuacct *ca = css_ca(css); 187 struct cpuacct *ca = css_ca(seq_css(sf));
190 int cpu; 188 int cpu;
191 s64 val = 0; 189 s64 val = 0;
192 190
@@ -196,7 +194,7 @@ static int cpuacct_stats_show(struct cgroup_subsys_state *css,
196 val += kcpustat->cpustat[CPUTIME_NICE]; 194 val += kcpustat->cpustat[CPUTIME_NICE];
197 } 195 }
198 val = cputime64_to_clock_t(val); 196 val = cputime64_to_clock_t(val);
199 cb->fill(cb, cpuacct_stat_desc[CPUACCT_STAT_USER], val); 197 seq_printf(sf, "%s %lld\n", cpuacct_stat_desc[CPUACCT_STAT_USER], val);
200 198
201 val = 0; 199 val = 0;
202 for_each_online_cpu(cpu) { 200 for_each_online_cpu(cpu) {
@@ -207,7 +205,7 @@ static int cpuacct_stats_show(struct cgroup_subsys_state *css,
207 } 205 }
208 206
209 val = cputime64_to_clock_t(val); 207 val = cputime64_to_clock_t(val);
210 cb->fill(cb, cpuacct_stat_desc[CPUACCT_STAT_SYSTEM], val); 208 seq_printf(sf, "%s %lld\n", cpuacct_stat_desc[CPUACCT_STAT_SYSTEM], val);
211 209
212 return 0; 210 return 0;
213} 211}
@@ -220,11 +218,11 @@ static struct cftype files[] = {
220 }, 218 },
221 { 219 {
222 .name = "usage_percpu", 220 .name = "usage_percpu",
223 .read_seq_string = cpuacct_percpu_seq_read, 221 .seq_show = cpuacct_percpu_seq_show,
224 }, 222 },
225 { 223 {
226 .name = "stat", 224 .name = "stat",
227 .read_map = cpuacct_stats_show, 225 .seq_show = cpuacct_stats_show,
228 }, 226 },
229 { } /* terminate */ 227 { } /* terminate */
230}; 228};
diff --git a/kernel/sched/cpudeadline.c b/kernel/sched/cpudeadline.c
new file mode 100644
index 000000000000..045fc74e3f09
--- /dev/null
+++ b/kernel/sched/cpudeadline.c
@@ -0,0 +1,216 @@
1/*
2 * kernel/sched/cpudl.c
3 *
4 * Global CPU deadline management
5 *
6 * Author: Juri Lelli <j.lelli@sssup.it>
7 *
8 * This program is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU General Public License
10 * as published by the Free Software Foundation; version 2
11 * of the License.
12 */
13
14#include <linux/gfp.h>
15#include <linux/kernel.h>
16#include "cpudeadline.h"
17
18static inline int parent(int i)
19{
20 return (i - 1) >> 1;
21}
22
23static inline int left_child(int i)
24{
25 return (i << 1) + 1;
26}
27
28static inline int right_child(int i)
29{
30 return (i << 1) + 2;
31}
32
33static inline int dl_time_before(u64 a, u64 b)
34{
35 return (s64)(a - b) < 0;
36}
37
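dl_time_before() above is the usual wraparound-safe ordering test: the unsigned difference is reinterpreted as signed, so the comparison stays correct across a u64 wrap as long as the two values are less than 2^63 apart. A quick standalone check of the edge case (illustration only):

	#include <stdint.h>
	#include <stdio.h>

	static int dl_time_before(uint64_t a, uint64_t b)
	{
		return (int64_t)(a - b) < 0;
	}

	int main(void)
	{
		uint64_t near_wrap = UINT64_MAX - 5;

		/* A plain "<" would get the first case wrong once the clock wraps. */
		printf("%d %d\n",
		       dl_time_before(near_wrap, near_wrap + 10),	/* prints 1 */
		       dl_time_before(near_wrap + 10, near_wrap));	/* prints 0 */
		return 0;
	}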
38static void cpudl_exchange(struct cpudl *cp, int a, int b)
39{
40 int cpu_a = cp->elements[a].cpu, cpu_b = cp->elements[b].cpu;
41
42 swap(cp->elements[a], cp->elements[b]);
43 swap(cp->cpu_to_idx[cpu_a], cp->cpu_to_idx[cpu_b]);
44}
45
46static void cpudl_heapify(struct cpudl *cp, int idx)
47{
48 int l, r, largest;
49
50 /* adapted from lib/prio_heap.c */
51 while (1) {
52 l = left_child(idx);
53 r = right_child(idx);
54 largest = idx;
55
56 if ((l < cp->size) && dl_time_before(cp->elements[idx].dl,
57 cp->elements[l].dl))
58 largest = l;
59 if ((r < cp->size) && dl_time_before(cp->elements[largest].dl,
60 cp->elements[r].dl))
61 largest = r;
62 if (largest == idx)
63 break;
64
65 /* Push idx down the heap one level and bump one up */
66 cpudl_exchange(cp, largest, idx);
67 idx = largest;
68 }
69}
70
71static void cpudl_change_key(struct cpudl *cp, int idx, u64 new_dl)
72{
73 WARN_ON(idx > num_present_cpus() || idx == IDX_INVALID);
74
75 if (dl_time_before(new_dl, cp->elements[idx].dl)) {
76 cp->elements[idx].dl = new_dl;
77 cpudl_heapify(cp, idx);
78 } else {
79 cp->elements[idx].dl = new_dl;
80 while (idx > 0 && dl_time_before(cp->elements[parent(idx)].dl,
81 cp->elements[idx].dl)) {
82 cpudl_exchange(cp, idx, parent(idx));
83 idx = parent(idx);
84 }
85 }
86}
87
88static inline int cpudl_maximum(struct cpudl *cp)
89{
90 return cp->elements[0].cpu;
91}
92
93/*
94 * cpudl_find - find the best (later-dl) CPU in the system
95 * @cp: the cpudl max-heap context
96 * @p: the task
97 * @later_mask: a mask to fill in with the selected CPUs (or NULL)
98 *
99 * Returns: int - best CPU (heap maximum if suitable)
100 */
101int cpudl_find(struct cpudl *cp, struct task_struct *p,
102 struct cpumask *later_mask)
103{
104 int best_cpu = -1;
105 const struct sched_dl_entity *dl_se = &p->dl;
106
107 if (later_mask && cpumask_and(later_mask, cp->free_cpus,
108 &p->cpus_allowed) && cpumask_and(later_mask,
109 later_mask, cpu_active_mask)) {
110 best_cpu = cpumask_any(later_mask);
111 goto out;
112 } else if (cpumask_test_cpu(cpudl_maximum(cp), &p->cpus_allowed) &&
113 dl_time_before(dl_se->deadline, cp->elements[0].dl)) {
114 best_cpu = cpudl_maximum(cp);
115 if (later_mask)
116 cpumask_set_cpu(best_cpu, later_mask);
117 }
118
119out:
120 WARN_ON(best_cpu > num_present_cpus() && best_cpu != -1);
121
122 return best_cpu;
123}
124
125/*
126 * cpudl_set - update the cpudl max-heap
127 * @cp: the cpudl max-heap context
128 * @cpu: the target cpu
129 * @dl: the new earliest deadline for this cpu
130 *
131 * Notes: assumes cpu_rq(cpu)->lock is locked
132 *
133 * Returns: (void)
134 */
135void cpudl_set(struct cpudl *cp, int cpu, u64 dl, int is_valid)
136{
137 int old_idx, new_cpu;
138 unsigned long flags;
139
140 WARN_ON(cpu > num_present_cpus());
141
142 raw_spin_lock_irqsave(&cp->lock, flags);
143 old_idx = cp->cpu_to_idx[cpu];
144 if (!is_valid) {
145 /* remove item */
146 if (old_idx == IDX_INVALID) {
147 /*
148 * Nothing to remove if old_idx was invalid.
149 * This could happen if a rq_offline_dl is
150 * called for a CPU without -dl tasks running.
151 */
152 goto out;
153 }
154 new_cpu = cp->elements[cp->size - 1].cpu;
155 cp->elements[old_idx].dl = cp->elements[cp->size - 1].dl;
156 cp->elements[old_idx].cpu = new_cpu;
157 cp->size--;
158 cp->cpu_to_idx[new_cpu] = old_idx;
159 cp->cpu_to_idx[cpu] = IDX_INVALID;
160 while (old_idx > 0 && dl_time_before(
161 cp->elements[parent(old_idx)].dl,
162 cp->elements[old_idx].dl)) {
163 cpudl_exchange(cp, old_idx, parent(old_idx));
164 old_idx = parent(old_idx);
165 }
166 cpumask_set_cpu(cpu, cp->free_cpus);
167 cpudl_heapify(cp, old_idx);
168
169 goto out;
170 }
171
172 if (old_idx == IDX_INVALID) {
173 cp->size++;
174 cp->elements[cp->size - 1].dl = 0;
175 cp->elements[cp->size - 1].cpu = cpu;
176 cp->cpu_to_idx[cpu] = cp->size - 1;
177 cpudl_change_key(cp, cp->size - 1, dl);
178 cpumask_clear_cpu(cpu, cp->free_cpus);
179 } else {
180 cpudl_change_key(cp, old_idx, dl);
181 }
182
183out:
184 raw_spin_unlock_irqrestore(&cp->lock, flags);
185}
186
187/*
188 * cpudl_init - initialize the cpudl structure
189 * @cp: the cpudl max-heap context
190 */
191int cpudl_init(struct cpudl *cp)
192{
193 int i;
194
195 memset(cp, 0, sizeof(*cp));
196 raw_spin_lock_init(&cp->lock);
197 cp->size = 0;
198 for (i = 0; i < NR_CPUS; i++)
199 cp->cpu_to_idx[i] = IDX_INVALID;
200 if (!alloc_cpumask_var(&cp->free_cpus, GFP_KERNEL))
201 return -ENOMEM;
202 cpumask_setall(cp->free_cpus);
203
204 return 0;
205}
206
207/*
208 * cpudl_cleanup - clean up the cpudl structure
209 * @cp: the cpudl max-heap context
210 */
211void cpudl_cleanup(struct cpudl *cp)
212{
213 /*
214 * nothing to do for the moment
215 */
216}
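cpudl is an array-backed binary max-heap keyed by each CPU's earliest deadline, with the cpu_to_idx[] reverse map letting cpudl_set() locate and fix up a CPU's slot; the root is always the CPU whose earliest deadline is latest, which is exactly what cpudl_find() hands to the push path. A standalone sketch of the same index arithmetic and sift-down, with the reverse map omitted (illustration only, not kernel code):

	#include <stdint.h>
	#include <stdio.h>

	struct item { uint64_t dl; int cpu; };

	static int left_child(int i)	{ return (i << 1) + 1; }
	static int right_child(int i)	{ return (i << 1) + 2; }

	/* Same wraparound-safe "a is earlier than b" test as dl_time_before(). */
	static int dl_before(uint64_t a, uint64_t b)
	{
		return (int64_t)(a - b) < 0;
	}

	/* Sift the element at idx down until the latest deadline sits on top,
	 * mirroring cpudl_heapify() above. */
	static void heapify(struct item *e, int size, int idx)
	{
		for (;;) {
			int l = left_child(idx), r = right_child(idx), largest = idx;

			if (l < size && dl_before(e[largest].dl, e[l].dl))
				largest = l;
			if (r < size && dl_before(e[largest].dl, e[r].dl))
				largest = r;
			if (largest == idx)
				break;

			struct item tmp = e[idx];
			e[idx] = e[largest];
			e[largest] = tmp;
			idx = largest;
		}
	}

	int main(void)
	{
		struct item e[] = { { 100, 0 }, { 400, 1 }, { 250, 2 } };

		heapify(e, 3, 0);
		printf("latest earliest-deadline on cpu %d (dl=%llu)\n",
		       e[0].cpu, (unsigned long long)e[0].dl);	/* cpu 1, dl=400 */
		return 0;
	}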
diff --git a/kernel/sched/cpudeadline.h b/kernel/sched/cpudeadline.h
new file mode 100644
index 000000000000..a202789a412c
--- /dev/null
+++ b/kernel/sched/cpudeadline.h
@@ -0,0 +1,33 @@
1#ifndef _LINUX_CPUDL_H
2#define _LINUX_CPUDL_H
3
4#include <linux/sched.h>
5
6#define IDX_INVALID -1
7
8struct array_item {
9 u64 dl;
10 int cpu;
11};
12
13struct cpudl {
14 raw_spinlock_t lock;
15 int size;
16 int cpu_to_idx[NR_CPUS];
17 struct array_item elements[NR_CPUS];
18 cpumask_var_t free_cpus;
19};
20
21
22#ifdef CONFIG_SMP
23int cpudl_find(struct cpudl *cp, struct task_struct *p,
24 struct cpumask *later_mask);
25void cpudl_set(struct cpudl *cp, int cpu, u64 dl, int is_valid);
26int cpudl_init(struct cpudl *cp);
27void cpudl_cleanup(struct cpudl *cp);
28#else
29#define cpudl_set(cp, cpu, dl) do { } while (0)
30#define cpudl_init() do { } while (0)
31#endif /* CONFIG_SMP */
32
33#endif /* _LINUX_CPUDL_H */
diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
new file mode 100644
index 000000000000..0de248202879
--- /dev/null
+++ b/kernel/sched/deadline.c
@@ -0,0 +1,1640 @@
1/*
2 * Deadline Scheduling Class (SCHED_DEADLINE)
3 *
4 * Earliest Deadline First (EDF) + Constant Bandwidth Server (CBS).
5 *
6 * Tasks that periodically execute their instances for less than their
7 * runtime won't miss any of their deadlines.
8 * Tasks that are not periodic or sporadic or that try to execute more
9 * than their reserved bandwidth will be slowed down (and may potentially
10 * miss some of their deadlines), and won't affect any other task.
11 *
12 * Copyright (C) 2012 Dario Faggioli <raistlin@linux.it>,
13 * Juri Lelli <juri.lelli@gmail.com>,
14 * Michael Trimarchi <michael@amarulasolutions.com>,
15 * Fabio Checconi <fchecconi@gmail.com>
16 */
17#include "sched.h"
18
19#include <linux/slab.h>
20
21struct dl_bandwidth def_dl_bandwidth;
22
23static inline struct task_struct *dl_task_of(struct sched_dl_entity *dl_se)
24{
25 return container_of(dl_se, struct task_struct, dl);
26}
27
28static inline struct rq *rq_of_dl_rq(struct dl_rq *dl_rq)
29{
30 return container_of(dl_rq, struct rq, dl);
31}
32
33static inline struct dl_rq *dl_rq_of_se(struct sched_dl_entity *dl_se)
34{
35 struct task_struct *p = dl_task_of(dl_se);
36 struct rq *rq = task_rq(p);
37
38 return &rq->dl;
39}
40
41static inline int on_dl_rq(struct sched_dl_entity *dl_se)
42{
43 return !RB_EMPTY_NODE(&dl_se->rb_node);
44}
45
46static inline int is_leftmost(struct task_struct *p, struct dl_rq *dl_rq)
47{
48 struct sched_dl_entity *dl_se = &p->dl;
49
50 return dl_rq->rb_leftmost == &dl_se->rb_node;
51}
52
53void init_dl_bandwidth(struct dl_bandwidth *dl_b, u64 period, u64 runtime)
54{
55 raw_spin_lock_init(&dl_b->dl_runtime_lock);
56 dl_b->dl_period = period;
57 dl_b->dl_runtime = runtime;
58}
59
60extern unsigned long to_ratio(u64 period, u64 runtime);
61
62void init_dl_bw(struct dl_bw *dl_b)
63{
64 raw_spin_lock_init(&dl_b->lock);
65 raw_spin_lock(&def_dl_bandwidth.dl_runtime_lock);
66 if (global_rt_runtime() == RUNTIME_INF)
67 dl_b->bw = -1;
68 else
69 dl_b->bw = to_ratio(global_rt_period(), global_rt_runtime());
70 raw_spin_unlock(&def_dl_bandwidth.dl_runtime_lock);
71 dl_b->total_bw = 0;
72}
73
74void init_dl_rq(struct dl_rq *dl_rq, struct rq *rq)
75{
76 dl_rq->rb_root = RB_ROOT;
77
78#ifdef CONFIG_SMP
79 /* zero means no -deadline tasks */
80 dl_rq->earliest_dl.curr = dl_rq->earliest_dl.next = 0;
81
82 dl_rq->dl_nr_migratory = 0;
83 dl_rq->overloaded = 0;
84 dl_rq->pushable_dl_tasks_root = RB_ROOT;
85#else
86 init_dl_bw(&dl_rq->dl_bw);
87#endif
88}
89
90#ifdef CONFIG_SMP
91
92static inline int dl_overloaded(struct rq *rq)
93{
94 return atomic_read(&rq->rd->dlo_count);
95}
96
97static inline void dl_set_overload(struct rq *rq)
98{
99 if (!rq->online)
100 return;
101
102 cpumask_set_cpu(rq->cpu, rq->rd->dlo_mask);
103 /*
104 * Must be visible before the overload count is
105 * set (as in sched_rt.c).
106 *
107 * Matched by the barrier in pull_dl_task().
108 */
109 smp_wmb();
110 atomic_inc(&rq->rd->dlo_count);
111}
112
113static inline void dl_clear_overload(struct rq *rq)
114{
115 if (!rq->online)
116 return;
117
118 atomic_dec(&rq->rd->dlo_count);
119 cpumask_clear_cpu(rq->cpu, rq->rd->dlo_mask);
120}
121
122static void update_dl_migration(struct dl_rq *dl_rq)
123{
124 if (dl_rq->dl_nr_migratory && dl_rq->dl_nr_total > 1) {
125 if (!dl_rq->overloaded) {
126 dl_set_overload(rq_of_dl_rq(dl_rq));
127 dl_rq->overloaded = 1;
128 }
129 } else if (dl_rq->overloaded) {
130 dl_clear_overload(rq_of_dl_rq(dl_rq));
131 dl_rq->overloaded = 0;
132 }
133}
134
135static void inc_dl_migration(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq)
136{
137 struct task_struct *p = dl_task_of(dl_se);
138 dl_rq = &rq_of_dl_rq(dl_rq)->dl;
139
140 dl_rq->dl_nr_total++;
141 if (p->nr_cpus_allowed > 1)
142 dl_rq->dl_nr_migratory++;
143
144 update_dl_migration(dl_rq);
145}
146
147static void dec_dl_migration(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq)
148{
149 struct task_struct *p = dl_task_of(dl_se);
150 dl_rq = &rq_of_dl_rq(dl_rq)->dl;
151
152 dl_rq->dl_nr_total--;
153 if (p->nr_cpus_allowed > 1)
154 dl_rq->dl_nr_migratory--;
155
156 update_dl_migration(dl_rq);
157}
158
159/*
160 * The list of pushable -deadline task is not a plist, like in
161 * sched_rt.c, it is an rb-tree with tasks ordered by deadline.
162 */
163static void enqueue_pushable_dl_task(struct rq *rq, struct task_struct *p)
164{
165 struct dl_rq *dl_rq = &rq->dl;
166 struct rb_node **link = &dl_rq->pushable_dl_tasks_root.rb_node;
167 struct rb_node *parent = NULL;
168 struct task_struct *entry;
169 int leftmost = 1;
170
171 BUG_ON(!RB_EMPTY_NODE(&p->pushable_dl_tasks));
172
173 while (*link) {
174 parent = *link;
175 entry = rb_entry(parent, struct task_struct,
176 pushable_dl_tasks);
177 if (dl_entity_preempt(&p->dl, &entry->dl))
178 link = &parent->rb_left;
179 else {
180 link = &parent->rb_right;
181 leftmost = 0;
182 }
183 }
184
185 if (leftmost)
186 dl_rq->pushable_dl_tasks_leftmost = &p->pushable_dl_tasks;
187
188 rb_link_node(&p->pushable_dl_tasks, parent, link);
189 rb_insert_color(&p->pushable_dl_tasks, &dl_rq->pushable_dl_tasks_root);
190}
191
192static void dequeue_pushable_dl_task(struct rq *rq, struct task_struct *p)
193{
194 struct dl_rq *dl_rq = &rq->dl;
195
196 if (RB_EMPTY_NODE(&p->pushable_dl_tasks))
197 return;
198
199 if (dl_rq->pushable_dl_tasks_leftmost == &p->pushable_dl_tasks) {
200 struct rb_node *next_node;
201
202 next_node = rb_next(&p->pushable_dl_tasks);
203 dl_rq->pushable_dl_tasks_leftmost = next_node;
204 }
205
206 rb_erase(&p->pushable_dl_tasks, &dl_rq->pushable_dl_tasks_root);
207 RB_CLEAR_NODE(&p->pushable_dl_tasks);
208}
209
210static inline int has_pushable_dl_tasks(struct rq *rq)
211{
212 return !RB_EMPTY_ROOT(&rq->dl.pushable_dl_tasks_root);
213}
214
215static int push_dl_task(struct rq *rq);
216
217#else
218
219static inline
220void enqueue_pushable_dl_task(struct rq *rq, struct task_struct *p)
221{
222}
223
224static inline
225void dequeue_pushable_dl_task(struct rq *rq, struct task_struct *p)
226{
227}
228
229static inline
230void inc_dl_migration(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq)
231{
232}
233
234static inline
235void dec_dl_migration(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq)
236{
237}
238
239#endif /* CONFIG_SMP */
240
241static void enqueue_task_dl(struct rq *rq, struct task_struct *p, int flags);
242static void __dequeue_task_dl(struct rq *rq, struct task_struct *p, int flags);
243static void check_preempt_curr_dl(struct rq *rq, struct task_struct *p,
244 int flags);
245
246/*
247 * We are being explicitly informed that a new instance is starting,
248 * and this means that:
249 * - the absolute deadline of the entity has to be placed at
250 * current time + relative deadline;
251 * - the runtime of the entity has to be set to the maximum value.
252 *
253 * The capability of specifying such an event is useful whenever a -deadline
254 * entity wants to (try to!) synchronize its behaviour with the scheduler's
255 * one, and to (try to!) reconcile itself with its own scheduling
256 * parameters.
257 */
258static inline void setup_new_dl_entity(struct sched_dl_entity *dl_se,
259 struct sched_dl_entity *pi_se)
260{
261 struct dl_rq *dl_rq = dl_rq_of_se(dl_se);
262 struct rq *rq = rq_of_dl_rq(dl_rq);
263
264 WARN_ON(!dl_se->dl_new || dl_se->dl_throttled);
265
266 /*
267 * We use the regular wall clock time to set deadlines in the
268 * future; in fact, we must consider execution overheads (time
269 * spent on hardirq context, etc.).
270 */
271 dl_se->deadline = rq_clock(rq) + pi_se->dl_deadline;
272 dl_se->runtime = pi_se->dl_runtime;
273 dl_se->dl_new = 0;
274}
275
276/*
277 * Pure Earliest Deadline First (EDF) scheduling does not deal with the
278 * possibility of an entity lasting more than what it declared, and thus
279 * exhausting its runtime.
280 *
281 * Here we are interested in making runtime overrun possible, but we do
282 * not want an entity which is misbehaving to affect the scheduling of all
283 * other entities.
284 * Therefore, a budgeting strategy called Constant Bandwidth Server (CBS)
285 * is used, in order to confine each entity within its own bandwidth.
286 *
287 * This function deals exactly with that, and ensures that when the runtime
288 * of an entity is replenished, its deadline is also postponed. That ensures
289 * the overrunning entity can't interfere with other entity in the system and
290 * can't make them miss their deadlines. Reasons why this kind of overruns
291 * could happen are, typically, an entity voluntarily trying to overcome its
292 * runtime, or it just underestimated it during sched_setscheduler_ex().
293 */
294static void replenish_dl_entity(struct sched_dl_entity *dl_se,
295 struct sched_dl_entity *pi_se)
296{
297 struct dl_rq *dl_rq = dl_rq_of_se(dl_se);
298 struct rq *rq = rq_of_dl_rq(dl_rq);
299
300 BUG_ON(pi_se->dl_runtime <= 0);
301
302 /*
303 * This could be the case for a !-dl task that is boosted.
304 * Just go with full inherited parameters.
305 */
306 if (dl_se->dl_deadline == 0) {
307 dl_se->deadline = rq_clock(rq) + pi_se->dl_deadline;
308 dl_se->runtime = pi_se->dl_runtime;
309 }
310
311 /*
312 * We keep moving the deadline away until we get some
313 * available runtime for the entity. This ensures correct
314 * handling of situations where the runtime overrun is
315 * arbitrarily large.
316 */
317 while (dl_se->runtime <= 0) {
318 dl_se->deadline += pi_se->dl_period;
319 dl_se->runtime += pi_se->dl_runtime;
320 }
321
322 /*
323 * At this point, the deadline really should be "in
324 * the future" with respect to rq->clock. If it's
325 * not, we are, for some reason, lagging too much!
326 * Anyway, after having warned userspace about that,
327 * we still try to keep things running by
328 * resetting the deadline and the budget of the
329 * entity.
330 */
331 if (dl_time_before(dl_se->deadline, rq_clock(rq))) {
332 static bool lag_once = false;
333
334 if (!lag_once) {
335 lag_once = true;
336 printk_sched("sched: DL replenish lagged too much\n");
337 }
338 dl_se->deadline = rq_clock(rq) + pi_se->dl_deadline;
339 dl_se->runtime = pi_se->dl_runtime;
340 }
341}
342
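replenish_dl_entity() walks the deadline forward one period at a time, refunding one runtime's worth of budget per step, so even an arbitrarily large overrun is paid back rather than forgotten. A tiny numeric illustration of that loop, with plain integers standing in for nanosecond values (not kernel code):

	#include <stdint.h>
	#include <stdio.h>

	int main(void)
	{
		/* A (runtime=10, period=100) reservation that overran by 25:
		 * update_curr_dl() left its runtime at -25. */
		int64_t runtime = -25, deadline = 300;
		const int64_t dl_runtime = 10, dl_period = 100;

		while (runtime <= 0) {		/* same loop as replenish_dl_entity() */
			deadline += dl_period;
			runtime += dl_runtime;
		}

		/* Three rounds later runtime=5 and deadline=600: the overrunning
		 * entity has paid for everything it consumed. */
		printf("runtime=%lld deadline=%lld\n",
		       (long long)runtime, (long long)deadline);
		return 0;
	}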
343/*
344 * Here we check if --at time t-- an entity (which is probably being
345 * [re]activated or, in general, enqueued) can use its remaining runtime
346 * and its current deadline _without_ exceeding the bandwidth it is
347 * assigned (function returns true if it can't). We are in fact applying
348 * one of the CBS rules: when a task wakes up, if the residual runtime
349 * over residual deadline fits within the allocated bandwidth, then we
350 * can keep the current (absolute) deadline and residual budget without
351 * disrupting the schedulability of the system. Otherwise, we should
352 * refill the runtime and set the deadline a period in the future,
353 * because keeping the current (absolute) deadline of the task would
354 * result in breaking guarantees promised to other tasks.
355 *
356 * This function returns true if:
357 *
358 * runtime / (deadline - t) > dl_runtime / dl_period ,
359 *
360 * IOW we can't recycle current parameters.
361 *
362 * Notice that the bandwidth check is done against the period. For
363 * tasks with deadline equal to period this is the same as using
364 * dl_deadline instead of dl_period in the equation above.
365 */
366static bool dl_entity_overflow(struct sched_dl_entity *dl_se,
367 struct sched_dl_entity *pi_se, u64 t)
368{
369 u64 left, right;
370
371 /*
372 * left and right are the two sides of the equation above,
373 * after a bit of shuffling to use multiplications instead
374 * of divisions.
375 *
376 * Note that none of the time values involved in the two
377 * multiplications are absolute: dl_deadline and dl_runtime
378 * are the relative deadline and the maximum runtime of each
379 * instance, runtime is the runtime left for the last instance
380 * and (deadline - t), since t is rq->clock, is the time left
381 * to the (absolute) deadline. Even if overflowing the u64 type
382 * is very unlikely to occur in both cases, here we scale down
383 * as we want to avoid that risk at all. Scaling down by 10
384 * means that we reduce granularity to 1us. We are fine with it,
385 * since this is only a true/false check and, anyway, thinking
386 * of anything below microsecond resolution is actually fiction
387 * (but still we want to give the user that illusion >;).
388 */
389 left = (pi_se->dl_period >> DL_SCALE) * (dl_se->runtime >> DL_SCALE);
390 right = ((dl_se->deadline - t) >> DL_SCALE) *
391 (pi_se->dl_runtime >> DL_SCALE);
392
393 return dl_time_before(right, left);
394}
395
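dl_entity_overflow() decides at wakeup whether keeping the current (runtime, deadline) pair would exceed the reserved bandwidth, i.e. whether runtime / (deadline - t) > dl_runtime / dl_period; the comparison is cross-multiplied and both sides are shifted down by the 10 bits mentioned in the comment above (DL_SCALE elsewhere in this series) so the u64 products cannot overflow. A standalone restatement with one worked data point (a sketch under those assumptions, not kernel code):

	#include <stdint.h>
	#include <stdio.h>

	#define DL_SCALE	10	/* "scaling down by 10" per the comment above */

	static int dl_time_before(uint64_t a, uint64_t b)
	{
		return (int64_t)(a - b) < 0;
	}

	/* Returns 1 when the current deadline and leftover runtime cannot be
	 * kept, mirroring dl_entity_overflow(). All times in nanoseconds. */
	static int dl_overflow_check(uint64_t runtime, uint64_t deadline, uint64_t t,
				     uint64_t dl_runtime, uint64_t dl_period)
	{
		uint64_t left  = (dl_period >> DL_SCALE) * (runtime >> DL_SCALE);
		uint64_t right = ((deadline - t) >> DL_SCALE) *
				 (dl_runtime >> DL_SCALE);

		return dl_time_before(right, left);
	}

	int main(void)
	{
		/* 5ms of runtime left with 8ms to the deadline, against a
		 * 10ms/100ms reservation: 5/8 > 10/100, so the parameters
		 * must be reset rather than recycled (prints 1). */
		printf("%d\n", dl_overflow_check(5000000ULL, 8000000ULL, 0,
						 10000000ULL, 100000000ULL));
		return 0;
	}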
396/*
397 * When a -deadline entity is queued back on the runqueue, its runtime and
398 * deadline might need updating.
399 *
400 * The policy here is that we update the deadline of the entity only if:
401 * - the current deadline is in the past,
402 * - using the remaining runtime with the current deadline would make
403 * the entity exceed its bandwidth.
404 */
405static void update_dl_entity(struct sched_dl_entity *dl_se,
406 struct sched_dl_entity *pi_se)
407{
408 struct dl_rq *dl_rq = dl_rq_of_se(dl_se);
409 struct rq *rq = rq_of_dl_rq(dl_rq);
410
411 /*
412 * The arrival of a new instance needs special treatment, i.e.,
413 * the actual scheduling parameters have to be "renewed".
414 */
415 if (dl_se->dl_new) {
416 setup_new_dl_entity(dl_se, pi_se);
417 return;
418 }
419
420 if (dl_time_before(dl_se->deadline, rq_clock(rq)) ||
421 dl_entity_overflow(dl_se, pi_se, rq_clock(rq))) {
422 dl_se->deadline = rq_clock(rq) + pi_se->dl_deadline;
423 dl_se->runtime = pi_se->dl_runtime;
424 }
425}
426
427/*
428 * If the entity depleted all its runtime, and if we want it to sleep
429 * while waiting for some new execution time to become available, we
430 * set the bandwidth enforcement timer to the replenishment instant
431 * and try to activate it.
432 *
433 * Notice that it is important for the caller to know if the timer
434 * actually started or not (i.e., the replenishment instant is in
435 * the future or in the past).
436 */
437static int start_dl_timer(struct sched_dl_entity *dl_se, bool boosted)
438{
439 struct dl_rq *dl_rq = dl_rq_of_se(dl_se);
440 struct rq *rq = rq_of_dl_rq(dl_rq);
441 ktime_t now, act;
442 ktime_t soft, hard;
443 unsigned long range;
444 s64 delta;
445
446 if (boosted)
447 return 0;
448 /*
449 * We want the timer to fire at the deadline, but considering
450 * that it is actually coming from rq->clock and not from
451 * hrtimer's time base reading.
452 */
453 act = ns_to_ktime(dl_se->deadline);
454 now = hrtimer_cb_get_time(&dl_se->dl_timer);
455 delta = ktime_to_ns(now) - rq_clock(rq);
456 act = ktime_add_ns(act, delta);
457
458 /*
459 * If the expiry time already passed, e.g., because the value
460 * chosen as the deadline is too small, don't even try to
461 * start the timer in the past!
462 */
463 if (ktime_us_delta(act, now) < 0)
464 return 0;
465
466 hrtimer_set_expires(&dl_se->dl_timer, act);
467
468 soft = hrtimer_get_softexpires(&dl_se->dl_timer);
469 hard = hrtimer_get_expires(&dl_se->dl_timer);
470 range = ktime_to_ns(ktime_sub(hard, soft));
471 __hrtimer_start_range_ns(&dl_se->dl_timer, soft,
472 range, HRTIMER_MODE_ABS, 0);
473
474 return hrtimer_active(&dl_se->dl_timer);
475}
476
477/*
478 * This is the bandwidth enforcement timer callback. If here, we know
479 * a task is not on its dl_rq, since the fact that the timer was running
480 * means the task is throttled and needs a runtime replenishment.
481 *
482 * However, what we actually do depends on whether the task is active
483 * (it is on its rq) or has been removed from there by a call to
484 * dequeue_task_dl(). In the former case we must issue the runtime
485 * replenishment and add the task back to the dl_rq; in the latter, we just
486 * do nothing but clearing dl_throttled, so that runtime and deadline
487 * updating (and the queueing back to dl_rq) will be done by the
488 * next call to enqueue_task_dl().
489 */
490static enum hrtimer_restart dl_task_timer(struct hrtimer *timer)
491{
492 struct sched_dl_entity *dl_se = container_of(timer,
493 struct sched_dl_entity,
494 dl_timer);
495 struct task_struct *p = dl_task_of(dl_se);
496 struct rq *rq = task_rq(p);
497 raw_spin_lock(&rq->lock);
498
499 /*
500 * We need to take care of possible races here. In fact, the
501 * task might have changed its scheduling policy to something
502 * different from SCHED_DEADLINE or changed its reservation
503 * parameters (through sched_setscheduler()).
504 */
505 if (!dl_task(p) || dl_se->dl_new)
506 goto unlock;
507
508 sched_clock_tick();
509 update_rq_clock(rq);
510 dl_se->dl_throttled = 0;
511 if (p->on_rq) {
512 enqueue_task_dl(rq, p, ENQUEUE_REPLENISH);
513 if (task_has_dl_policy(rq->curr))
514 check_preempt_curr_dl(rq, p, 0);
515 else
516 resched_task(rq->curr);
517#ifdef CONFIG_SMP
518 /*
519 * Queueing this task back might have overloaded rq,
520 * check if we need to kick someone away.
521 */
522 if (has_pushable_dl_tasks(rq))
523 push_dl_task(rq);
524#endif
525 }
526unlock:
527 raw_spin_unlock(&rq->lock);
528
529 return HRTIMER_NORESTART;
530}
531
532void init_dl_task_timer(struct sched_dl_entity *dl_se)
533{
534 struct hrtimer *timer = &dl_se->dl_timer;
535
536 if (hrtimer_active(timer)) {
537 hrtimer_try_to_cancel(timer);
538 return;
539 }
540
541 hrtimer_init(timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
542 timer->function = dl_task_timer;
543}
544
545static
546int dl_runtime_exceeded(struct rq *rq, struct sched_dl_entity *dl_se)
547{
548 int dmiss = dl_time_before(dl_se->deadline, rq_clock(rq));
549 int rorun = dl_se->runtime <= 0;
550
551 if (!rorun && !dmiss)
552 return 0;
553
554 /*
555 * If we are beyond our current deadline and we are still
556 * executing, then we have already used some of the runtime of
557 * the next instance. Thus, if we do not account for that, we are
558 * stealing bandwidth from the system at each deadline miss!
559 */
560 if (dmiss) {
561 dl_se->runtime = rorun ? dl_se->runtime : 0;
562 dl_se->runtime -= rq_clock(rq) - dl_se->deadline;
563 }
564
565 return 1;
566}
567
568/*
569 * Update the current task's runtime statistics (provided it is still
570 * a -deadline task and has not been removed from the dl_rq).
571 */
572static void update_curr_dl(struct rq *rq)
573{
574 struct task_struct *curr = rq->curr;
575 struct sched_dl_entity *dl_se = &curr->dl;
576 u64 delta_exec;
577
578 if (!dl_task(curr) || !on_dl_rq(dl_se))
579 return;
580
581 /*
582 * Consumed budget is computed considering the time as
583 * observed by schedulable tasks (excluding time spent
584 * in hardirq context, etc.). Deadlines are instead
585 * computed using hard walltime. This seems to be the more
586 * natural solution, but the full ramifications of this
587 * approach need further study.
588 */
589 delta_exec = rq_clock_task(rq) - curr->se.exec_start;
590 if (unlikely((s64)delta_exec < 0))
591 delta_exec = 0;
592
593 schedstat_set(curr->se.statistics.exec_max,
594 max(curr->se.statistics.exec_max, delta_exec));
595
596 curr->se.sum_exec_runtime += delta_exec;
597 account_group_exec_runtime(curr, delta_exec);
598
599 curr->se.exec_start = rq_clock_task(rq);
600 cpuacct_charge(curr, delta_exec);
601
602 sched_rt_avg_update(rq, delta_exec);
603
604 dl_se->runtime -= delta_exec;
605 if (dl_runtime_exceeded(rq, dl_se)) {
606 __dequeue_task_dl(rq, curr, 0);
607 if (likely(start_dl_timer(dl_se, curr->dl.dl_boosted)))
608 dl_se->dl_throttled = 1;
609 else
610 enqueue_task_dl(rq, curr, ENQUEUE_REPLENISH);
611
612 if (!is_leftmost(curr, &rq->dl))
613 resched_task(curr);
614 }
615
616 /*
617 * Because -- for now -- we share the rt bandwidth, we need to
618 * account our runtime there too, otherwise actual rt tasks
619 * would be able to exceed the shared quota.
620 *
621 * Account to the root rt group for now.
622 *
623 * The solution we're working towards is having the RT groups scheduled
624 * using deadline servers -- however there's a few nasties to figure
625 * out before that can happen.
626 */
627 if (rt_bandwidth_enabled()) {
628 struct rt_rq *rt_rq = &rq->rt;
629
630 raw_spin_lock(&rt_rq->rt_runtime_lock);
631 rt_rq->rt_time += delta_exec;
632 /*
633 * We'll let actual RT tasks worry about the overflow here, we
634 * have our own CBS to keep us in line -- see above.
635 */
636 raw_spin_unlock(&rt_rq->rt_runtime_lock);
637 }
638}
639
640#ifdef CONFIG_SMP
641
642static struct task_struct *pick_next_earliest_dl_task(struct rq *rq, int cpu);
643
644static inline u64 next_deadline(struct rq *rq)
645{
646 struct task_struct *next = pick_next_earliest_dl_task(rq, rq->cpu);
647
648 if (next && dl_prio(next->prio))
649 return next->dl.deadline;
650 else
651 return 0;
652}
653
654static void inc_dl_deadline(struct dl_rq *dl_rq, u64 deadline)
655{
656 struct rq *rq = rq_of_dl_rq(dl_rq);
657
658 if (dl_rq->earliest_dl.curr == 0 ||
659 dl_time_before(deadline, dl_rq->earliest_dl.curr)) {
660 /*
661 * If the dl_rq had no -deadline tasks, or if the new task
662 * has shorter deadline than the current one on dl_rq, we
663 * know that the previous earliest becomes our next earliest,
664 * as the new task becomes the earliest itself.
665 */
666 dl_rq->earliest_dl.next = dl_rq->earliest_dl.curr;
667 dl_rq->earliest_dl.curr = deadline;
668 cpudl_set(&rq->rd->cpudl, rq->cpu, deadline, 1);
669 } else if (dl_rq->earliest_dl.next == 0 ||
670 dl_time_before(deadline, dl_rq->earliest_dl.next)) {
671 /*
672 * On the other hand, if the new -deadline task has a
673 * later deadline than the earliest one on dl_rq, but
674 * it is earlier than the next (if any), we must
675 * recompute the next-earliest.
676 */
677 dl_rq->earliest_dl.next = next_deadline(rq);
678 }
679}
680
681static void dec_dl_deadline(struct dl_rq *dl_rq, u64 deadline)
682{
683 struct rq *rq = rq_of_dl_rq(dl_rq);
684
685 /*
686 * Since we may have removed our earliest (and/or next earliest)
687 * task we must recompute them.
688 */
689 if (!dl_rq->dl_nr_running) {
690 dl_rq->earliest_dl.curr = 0;
691 dl_rq->earliest_dl.next = 0;
692 cpudl_set(&rq->rd->cpudl, rq->cpu, 0, 0);
693 } else {
694 struct rb_node *leftmost = dl_rq->rb_leftmost;
695 struct sched_dl_entity *entry;
696
697 entry = rb_entry(leftmost, struct sched_dl_entity, rb_node);
698 dl_rq->earliest_dl.curr = entry->deadline;
699 dl_rq->earliest_dl.next = next_deadline(rq);
700 cpudl_set(&rq->rd->cpudl, rq->cpu, entry->deadline, 1);
701 }
702}
703
704#else
705
706static inline void inc_dl_deadline(struct dl_rq *dl_rq, u64 deadline) {}
707static inline void dec_dl_deadline(struct dl_rq *dl_rq, u64 deadline) {}
708
709#endif /* CONFIG_SMP */
710
711static inline
712void inc_dl_tasks(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq)
713{
714 int prio = dl_task_of(dl_se)->prio;
715 u64 deadline = dl_se->deadline;
716
717 WARN_ON(!dl_prio(prio));
718 dl_rq->dl_nr_running++;
719
720 inc_dl_deadline(dl_rq, deadline);
721 inc_dl_migration(dl_se, dl_rq);
722}
723
724static inline
725void dec_dl_tasks(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq)
726{
727 int prio = dl_task_of(dl_se)->prio;
728
729 WARN_ON(!dl_prio(prio));
730 WARN_ON(!dl_rq->dl_nr_running);
731 dl_rq->dl_nr_running--;
732
733 dec_dl_deadline(dl_rq, dl_se->deadline);
734 dec_dl_migration(dl_se, dl_rq);
735}
736
737static void __enqueue_dl_entity(struct sched_dl_entity *dl_se)
738{
739 struct dl_rq *dl_rq = dl_rq_of_se(dl_se);
740 struct rb_node **link = &dl_rq->rb_root.rb_node;
741 struct rb_node *parent = NULL;
742 struct sched_dl_entity *entry;
743 int leftmost = 1;
744
745 BUG_ON(!RB_EMPTY_NODE(&dl_se->rb_node));
746
747 while (*link) {
748 parent = *link;
749 entry = rb_entry(parent, struct sched_dl_entity, rb_node);
750 if (dl_time_before(dl_se->deadline, entry->deadline))
751 link = &parent->rb_left;
752 else {
753 link = &parent->rb_right;
754 leftmost = 0;
755 }
756 }
757
758 if (leftmost)
759 dl_rq->rb_leftmost = &dl_se->rb_node;
760
761 rb_link_node(&dl_se->rb_node, parent, link);
762 rb_insert_color(&dl_se->rb_node, &dl_rq->rb_root);
763
764 inc_dl_tasks(dl_se, dl_rq);
765}
766
767static void __dequeue_dl_entity(struct sched_dl_entity *dl_se)
768{
769 struct dl_rq *dl_rq = dl_rq_of_se(dl_se);
770
771 if (RB_EMPTY_NODE(&dl_se->rb_node))
772 return;
773
774 if (dl_rq->rb_leftmost == &dl_se->rb_node) {
775 struct rb_node *next_node;
776
777 next_node = rb_next(&dl_se->rb_node);
778 dl_rq->rb_leftmost = next_node;
779 }
780
781 rb_erase(&dl_se->rb_node, &dl_rq->rb_root);
782 RB_CLEAR_NODE(&dl_se->rb_node);
783
784 dec_dl_tasks(dl_se, dl_rq);
785}
786
787static void
788enqueue_dl_entity(struct sched_dl_entity *dl_se,
789 struct sched_dl_entity *pi_se, int flags)
790{
791 BUG_ON(on_dl_rq(dl_se));
792
793 /*
794 * If this is a wakeup or a new instance, the scheduling
795 * parameters of the task might need updating. Otherwise,
796 * we want a replenishment of its runtime.
797 */
798 if (!dl_se->dl_new && flags & ENQUEUE_REPLENISH)
799 replenish_dl_entity(dl_se, pi_se);
800 else
801 update_dl_entity(dl_se, pi_se);
802
803 __enqueue_dl_entity(dl_se);
804}
805
806static void dequeue_dl_entity(struct sched_dl_entity *dl_se)
807{
808 __dequeue_dl_entity(dl_se);
809}
810
811static void enqueue_task_dl(struct rq *rq, struct task_struct *p, int flags)
812{
813 struct task_struct *pi_task = rt_mutex_get_top_task(p);
814 struct sched_dl_entity *pi_se = &p->dl;
815
816 /*
817 * Use the scheduling parameters of the top pi-waiter
818 * task if we have one and its (relative) deadline is
819 * smaller than ours... otherwise we keep our runtime and
820 * deadline.
821 */
822 if (pi_task && p->dl.dl_boosted && dl_prio(pi_task->normal_prio))
823 pi_se = &pi_task->dl;
824
825 /*
826 * If p is throttled, we do nothing. In fact, if it exhausted
827 * its budget it needs a replenishment and, since it now is on
828 * its rq, the bandwidth timer callback (which clearly has not
829 * run yet) will take care of this.
830 */
831 if (p->dl.dl_throttled)
832 return;
833
834 enqueue_dl_entity(&p->dl, pi_se, flags);
835
836 if (!task_current(rq, p) && p->nr_cpus_allowed > 1)
837 enqueue_pushable_dl_task(rq, p);
838
839 inc_nr_running(rq);
840}
841
842static void __dequeue_task_dl(struct rq *rq, struct task_struct *p, int flags)
843{
844 dequeue_dl_entity(&p->dl);
845 dequeue_pushable_dl_task(rq, p);
846}
847
848static void dequeue_task_dl(struct rq *rq, struct task_struct *p, int flags)
849{
850 update_curr_dl(rq);
851 __dequeue_task_dl(rq, p, flags);
852
853 dec_nr_running(rq);
854}
855
856/*
857 * Yield task semantic for -deadline tasks is:
858 *
 859 * give up the CPU until our next instance, with
 860 * a fresh runtime. This is of little use now, since we
 861 * don't have a bandwidth reclaiming mechanism yet. Once
 862 * bandwidth reclaiming lands, yield_task_dl() will
 863 * indicate that some spare budget is available for
 864 * other task instances to use.
865 */
866static void yield_task_dl(struct rq *rq)
867{
868 struct task_struct *p = rq->curr;
869
870 /*
871 * We make the task go to sleep until its current deadline by
872 * forcing its runtime to zero. This way, update_curr_dl() stops
873 * it and the bandwidth timer will wake it up and will give it
874 * new scheduling parameters (thanks to dl_new=1).
875 */
876 if (p->dl.runtime > 0) {
877 rq->curr->dl.dl_new = 1;
878 p->dl.runtime = 0;
879 }
880 update_curr_dl(rq);
881}
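
Seen from user space, the semantic described above boils down to this: a -deadline task that finishes an instance early can call sched_yield() to hand back the rest of its budget and sleep until the next replenishment. A minimal sketch, assuming the task was already admitted to SCHED_DEADLINE (e.g. via sched_setattr(), sketched after the end of this file's listing); do_instance_work() is a hypothetical workload, not part of this patch:

#include <sched.h>

extern void do_instance_work(void);     /* hypothetical per-instance job */

static void dl_periodic_loop(void)
{
        for (;;) {
                do_instance_work();
                /*
                 * Instance done early: yield_task_dl() above zeroes the
                 * remaining runtime, so the task is throttled until its
                 * next replenishment instead of spinning on leftover budget.
                 */
                sched_yield();
        }
}
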
882
883#ifdef CONFIG_SMP
884
885static int find_later_rq(struct task_struct *task);
886
887static int
888select_task_rq_dl(struct task_struct *p, int cpu, int sd_flag, int flags)
889{
890 struct task_struct *curr;
891 struct rq *rq;
892
893 if (sd_flag != SD_BALANCE_WAKE && sd_flag != SD_BALANCE_FORK)
894 goto out;
895
896 rq = cpu_rq(cpu);
897
898 rcu_read_lock();
899 curr = ACCESS_ONCE(rq->curr); /* unlocked access */
900
901 /*
902 * If we are dealing with a -deadline task, we must
903 * decide where to wake it up.
904 * If it has a later deadline and the current task
905 * on this rq can't move (provided the waking task
906 * can!) we prefer to send it somewhere else. On the
907 * other hand, if it has a shorter deadline, we
908 * try to make it stay here, it might be important.
909 */
910 if (unlikely(dl_task(curr)) &&
911 (curr->nr_cpus_allowed < 2 ||
912 !dl_entity_preempt(&p->dl, &curr->dl)) &&
913 (p->nr_cpus_allowed > 1)) {
914 int target = find_later_rq(p);
915
916 if (target != -1)
917 cpu = target;
918 }
919 rcu_read_unlock();
920
921out:
922 return cpu;
923}
924
925static void check_preempt_equal_dl(struct rq *rq, struct task_struct *p)
926{
927 /*
928 * Current can't be migrated, useless to reschedule,
929 * let's hope p can move out.
930 */
931 if (rq->curr->nr_cpus_allowed == 1 ||
932 cpudl_find(&rq->rd->cpudl, rq->curr, NULL) == -1)
933 return;
934
935 /*
936 * p is migratable, so let's not schedule it and
937 * see if it is pushed or pulled somewhere else.
938 */
939 if (p->nr_cpus_allowed != 1 &&
940 cpudl_find(&rq->rd->cpudl, p, NULL) != -1)
941 return;
942
943 resched_task(rq->curr);
944}
945
946#endif /* CONFIG_SMP */
947
948/*
949 * Only called when both the current and waking task are -deadline
950 * tasks.
951 */
952static void check_preempt_curr_dl(struct rq *rq, struct task_struct *p,
953 int flags)
954{
955 if (dl_entity_preempt(&p->dl, &rq->curr->dl)) {
956 resched_task(rq->curr);
957 return;
958 }
959
960#ifdef CONFIG_SMP
961 /*
962 * In the unlikely case current and p have the same deadline
963 * let us try to decide what's the best thing to do...
964 */
965 if ((p->dl.deadline == rq->curr->dl.deadline) &&
966 !test_tsk_need_resched(rq->curr))
967 check_preempt_equal_dl(rq, p);
968#endif /* CONFIG_SMP */
969}
970
971#ifdef CONFIG_SCHED_HRTICK
972static void start_hrtick_dl(struct rq *rq, struct task_struct *p)
973{
974 s64 delta = p->dl.dl_runtime - p->dl.runtime;
975
976 if (delta > 10000)
977 hrtick_start(rq, p->dl.runtime);
978}
979#endif
980
981static struct sched_dl_entity *pick_next_dl_entity(struct rq *rq,
982 struct dl_rq *dl_rq)
983{
984 struct rb_node *left = dl_rq->rb_leftmost;
985
986 if (!left)
987 return NULL;
988
989 return rb_entry(left, struct sched_dl_entity, rb_node);
990}
991
992struct task_struct *pick_next_task_dl(struct rq *rq)
993{
994 struct sched_dl_entity *dl_se;
995 struct task_struct *p;
996 struct dl_rq *dl_rq;
997
998 dl_rq = &rq->dl;
999
1000 if (unlikely(!dl_rq->dl_nr_running))
1001 return NULL;
1002
1003 dl_se = pick_next_dl_entity(rq, dl_rq);
1004 BUG_ON(!dl_se);
1005
1006 p = dl_task_of(dl_se);
1007 p->se.exec_start = rq_clock_task(rq);
1008
1009 /* Running task will never be pushed. */
1010 dequeue_pushable_dl_task(rq, p);
1011
1012#ifdef CONFIG_SCHED_HRTICK
1013 if (hrtick_enabled(rq))
1014 start_hrtick_dl(rq, p);
1015#endif
1016
1017#ifdef CONFIG_SMP
1018 rq->post_schedule = has_pushable_dl_tasks(rq);
1019#endif /* CONFIG_SMP */
1020
1021 return p;
1022}
1023
1024static void put_prev_task_dl(struct rq *rq, struct task_struct *p)
1025{
1026 update_curr_dl(rq);
1027
1028 if (on_dl_rq(&p->dl) && p->nr_cpus_allowed > 1)
1029 enqueue_pushable_dl_task(rq, p);
1030}
1031
1032static void task_tick_dl(struct rq *rq, struct task_struct *p, int queued)
1033{
1034 update_curr_dl(rq);
1035
1036#ifdef CONFIG_SCHED_HRTICK
1037 if (hrtick_enabled(rq) && queued && p->dl.runtime > 0)
1038 start_hrtick_dl(rq, p);
1039#endif
1040}
1041
1042static void task_fork_dl(struct task_struct *p)
1043{
1044 /*
1045	 * A SCHED_DEADLINE task cannot fork a -deadline child; sched_fork()
1046	 * enforces that, so there is nothing to do here.
1047 */
1048}
1049
1050static void task_dead_dl(struct task_struct *p)
1051{
1052 struct hrtimer *timer = &p->dl.dl_timer;
1053 struct dl_bw *dl_b = dl_bw_of(task_cpu(p));
1054
1055 /*
1056 * Since we are TASK_DEAD we won't slip out of the domain!
1057 */
1058 raw_spin_lock_irq(&dl_b->lock);
1059 dl_b->total_bw -= p->dl.dl_bw;
1060 raw_spin_unlock_irq(&dl_b->lock);
1061
1062 hrtimer_cancel(timer);
1063}
1064
1065static void set_curr_task_dl(struct rq *rq)
1066{
1067 struct task_struct *p = rq->curr;
1068
1069 p->se.exec_start = rq_clock_task(rq);
1070
1071 /* You can't push away the running task */
1072 dequeue_pushable_dl_task(rq, p);
1073}
1074
1075#ifdef CONFIG_SMP
1076
1077/* Only try algorithms three times */
1078#define DL_MAX_TRIES 3
1079
1080static int pick_dl_task(struct rq *rq, struct task_struct *p, int cpu)
1081{
1082 if (!task_running(rq, p) &&
1083 (cpu < 0 || cpumask_test_cpu(cpu, &p->cpus_allowed)) &&
1084 (p->nr_cpus_allowed > 1))
1085 return 1;
1086
1087 return 0;
1088}
1089
1090/* Return the earliest -deadline task after the leftmost that can run on @cpu, or NULL */
1091static struct task_struct *pick_next_earliest_dl_task(struct rq *rq, int cpu)
1092{
1093 struct rb_node *next_node = rq->dl.rb_leftmost;
1094 struct sched_dl_entity *dl_se;
1095 struct task_struct *p = NULL;
1096
1097next_node:
1098 next_node = rb_next(next_node);
1099 if (next_node) {
1100 dl_se = rb_entry(next_node, struct sched_dl_entity, rb_node);
1101 p = dl_task_of(dl_se);
1102
1103 if (pick_dl_task(rq, p, cpu))
1104 return p;
1105
1106 goto next_node;
1107 }
1108
1109 return NULL;
1110}
1111
1112static DEFINE_PER_CPU(cpumask_var_t, local_cpu_mask_dl);
1113
1114static int find_later_rq(struct task_struct *task)
1115{
1116 struct sched_domain *sd;
1117 struct cpumask *later_mask = __get_cpu_var(local_cpu_mask_dl);
1118 int this_cpu = smp_processor_id();
1119 int best_cpu, cpu = task_cpu(task);
1120
1121 /* Make sure the mask is initialized first */
1122 if (unlikely(!later_mask))
1123 return -1;
1124
1125 if (task->nr_cpus_allowed == 1)
1126 return -1;
1127
1128 best_cpu = cpudl_find(&task_rq(task)->rd->cpudl,
1129 task, later_mask);
1130 if (best_cpu == -1)
1131 return -1;
1132
1133 /*
1134 * If we are here, some target has been found,
1135 * the most suitable of which is cached in best_cpu.
1136	 * That is, among the runqueues whose current tasks have
1137	 * later deadlines than this task's, best_cpu points to
1138	 * the one whose current task has the latest deadline.
1139 *
1140 * Now we check how well this matches with task's
1141 * affinity and system topology.
1142 *
1143	 * The last CPU where the task ran is our first
1144 * guess, since it is most likely cache-hot there.
1145 */
1146 if (cpumask_test_cpu(cpu, later_mask))
1147 return cpu;
1148 /*
1149 * Check if this_cpu is to be skipped (i.e., it is
1150 * not in the mask) or not.
1151 */
1152 if (!cpumask_test_cpu(this_cpu, later_mask))
1153 this_cpu = -1;
1154
1155 rcu_read_lock();
1156 for_each_domain(cpu, sd) {
1157 if (sd->flags & SD_WAKE_AFFINE) {
1158
1159 /*
1160 * If possible, preempting this_cpu is
1161 * cheaper than migrating.
1162 */
1163 if (this_cpu != -1 &&
1164 cpumask_test_cpu(this_cpu, sched_domain_span(sd))) {
1165 rcu_read_unlock();
1166 return this_cpu;
1167 }
1168
1169 /*
1170 * Last chance: if best_cpu is valid and is
1171 * in the mask, that becomes our choice.
1172 */
1173 if (best_cpu < nr_cpu_ids &&
1174 cpumask_test_cpu(best_cpu, sched_domain_span(sd))) {
1175 rcu_read_unlock();
1176 return best_cpu;
1177 }
1178 }
1179 }
1180 rcu_read_unlock();
1181
1182 /*
1183	 * At this point all our guesses have failed: just return
1184	 * some CPU from the mask and let the caller sort things out.
1185 */
1186 if (this_cpu != -1)
1187 return this_cpu;
1188
1189 cpu = cpumask_any(later_mask);
1190 if (cpu < nr_cpu_ids)
1191 return cpu;
1192
1193 return -1;
1194}
1195
1196/* Locks the rq it finds */
1197static struct rq *find_lock_later_rq(struct task_struct *task, struct rq *rq)
1198{
1199 struct rq *later_rq = NULL;
1200 int tries;
1201 int cpu;
1202
1203 for (tries = 0; tries < DL_MAX_TRIES; tries++) {
1204 cpu = find_later_rq(task);
1205
1206 if ((cpu == -1) || (cpu == rq->cpu))
1207 break;
1208
1209 later_rq = cpu_rq(cpu);
1210
1211 /* Retry if something changed. */
1212 if (double_lock_balance(rq, later_rq)) {
1213 if (unlikely(task_rq(task) != rq ||
1214 !cpumask_test_cpu(later_rq->cpu,
1215 &task->cpus_allowed) ||
1216 task_running(rq, task) || !task->on_rq)) {
1217 double_unlock_balance(rq, later_rq);
1218 later_rq = NULL;
1219 break;
1220 }
1221 }
1222
1223 /*
1224 * If the rq we found has no -deadline task, or
1225 * its earliest one has a later deadline than our
1226 * task, the rq is a good one.
1227 */
1228 if (!later_rq->dl.dl_nr_running ||
1229 dl_time_before(task->dl.deadline,
1230 later_rq->dl.earliest_dl.curr))
1231 break;
1232
1233 /* Otherwise we try again. */
1234 double_unlock_balance(rq, later_rq);
1235 later_rq = NULL;
1236 }
1237
1238 return later_rq;
1239}
1240
1241static struct task_struct *pick_next_pushable_dl_task(struct rq *rq)
1242{
1243 struct task_struct *p;
1244
1245 if (!has_pushable_dl_tasks(rq))
1246 return NULL;
1247
1248 p = rb_entry(rq->dl.pushable_dl_tasks_leftmost,
1249 struct task_struct, pushable_dl_tasks);
1250
1251 BUG_ON(rq->cpu != task_cpu(p));
1252 BUG_ON(task_current(rq, p));
1253 BUG_ON(p->nr_cpus_allowed <= 1);
1254
1255 BUG_ON(!p->on_rq);
1256 BUG_ON(!dl_task(p));
1257
1258 return p;
1259}
1260
1261/*
1262 * See if the non-running -deadline tasks on this rq
1263 * can be sent to some other CPU where they can preempt
1264 * and start executing.
1265 */
1266static int push_dl_task(struct rq *rq)
1267{
1268 struct task_struct *next_task;
1269 struct rq *later_rq;
1270
1271 if (!rq->dl.overloaded)
1272 return 0;
1273
1274 next_task = pick_next_pushable_dl_task(rq);
1275 if (!next_task)
1276 return 0;
1277
1278retry:
1279 if (unlikely(next_task == rq->curr)) {
1280 WARN_ON(1);
1281 return 0;
1282 }
1283
1284 /*
1285 * If next_task preempts rq->curr, and rq->curr
1286 * can move away, it makes sense to just reschedule
1287 * without going further in pushing next_task.
1288 */
1289 if (dl_task(rq->curr) &&
1290 dl_time_before(next_task->dl.deadline, rq->curr->dl.deadline) &&
1291 rq->curr->nr_cpus_allowed > 1) {
1292 resched_task(rq->curr);
1293 return 0;
1294 }
1295
1296 /* We might release rq lock */
1297 get_task_struct(next_task);
1298
1299 /* Will lock the rq it'll find */
1300 later_rq = find_lock_later_rq(next_task, rq);
1301 if (!later_rq) {
1302 struct task_struct *task;
1303
1304 /*
1305 * We must check all this again, since
1306 * find_lock_later_rq releases rq->lock and it is
1307 * then possible that next_task has migrated.
1308 */
1309 task = pick_next_pushable_dl_task(rq);
1310 if (task_cpu(next_task) == rq->cpu && task == next_task) {
1311 /*
1312 * The task is still there. We don't try
1313 * again, some other cpu will pull it when ready.
1314 */
1315 dequeue_pushable_dl_task(rq, next_task);
1316 goto out;
1317 }
1318
1319 if (!task)
1320 /* No more tasks */
1321 goto out;
1322
1323 put_task_struct(next_task);
1324 next_task = task;
1325 goto retry;
1326 }
1327
1328 deactivate_task(rq, next_task, 0);
1329 set_task_cpu(next_task, later_rq->cpu);
1330 activate_task(later_rq, next_task, 0);
1331
1332 resched_task(later_rq->curr);
1333
1334 double_unlock_balance(rq, later_rq);
1335
1336out:
1337 put_task_struct(next_task);
1338
1339 return 1;
1340}
1341
1342static void push_dl_tasks(struct rq *rq)
1343{
1344	/* Terminates: each successful push moves one -deadline task away */
1345 while (push_dl_task(rq))
1346 ;
1347}
1348
1349static int pull_dl_task(struct rq *this_rq)
1350{
1351 int this_cpu = this_rq->cpu, ret = 0, cpu;
1352 struct task_struct *p;
1353 struct rq *src_rq;
1354 u64 dmin = LONG_MAX;
1355
1356 if (likely(!dl_overloaded(this_rq)))
1357 return 0;
1358
1359 /*
1360	 * Match the barrier from dl_set_overload(); this guarantees that if we
1361 * see overloaded we must also see the dlo_mask bit.
1362 */
1363 smp_rmb();
1364
1365 for_each_cpu(cpu, this_rq->rd->dlo_mask) {
1366 if (this_cpu == cpu)
1367 continue;
1368
1369 src_rq = cpu_rq(cpu);
1370
1371 /*
1372		 * It looks racy, and it is! However, as in sched_rt.c,
1373 * we are fine with this.
1374 */
1375 if (this_rq->dl.dl_nr_running &&
1376 dl_time_before(this_rq->dl.earliest_dl.curr,
1377 src_rq->dl.earliest_dl.next))
1378 continue;
1379
1380 /* Might drop this_rq->lock */
1381 double_lock_balance(this_rq, src_rq);
1382
1383 /*
1384 * If there are no more pullable tasks on the
1385 * rq, we're done with it.
1386 */
1387 if (src_rq->dl.dl_nr_running <= 1)
1388 goto skip;
1389
1390 p = pick_next_earliest_dl_task(src_rq, this_cpu);
1391
1392 /*
1393 * We found a task to be pulled if:
1394 * - it preempts our current (if there's one),
1395 * - it will preempt the last one we pulled (if any).
1396 */
1397 if (p && dl_time_before(p->dl.deadline, dmin) &&
1398 (!this_rq->dl.dl_nr_running ||
1399 dl_time_before(p->dl.deadline,
1400 this_rq->dl.earliest_dl.curr))) {
1401 WARN_ON(p == src_rq->curr);
1402 WARN_ON(!p->on_rq);
1403
1404 /*
1405			 * Then we pull iff p actually has an earlier
1406 * deadline than the current task of its runqueue.
1407 */
1408 if (dl_time_before(p->dl.deadline,
1409 src_rq->curr->dl.deadline))
1410 goto skip;
1411
1412 ret = 1;
1413
1414 deactivate_task(src_rq, p, 0);
1415 set_task_cpu(p, this_cpu);
1416 activate_task(this_rq, p, 0);
1417 dmin = p->dl.deadline;
1418
1419 /* Is there any other task even earlier? */
1420 }
1421skip:
1422 double_unlock_balance(this_rq, src_rq);
1423 }
1424
1425 return ret;
1426}
1427
1428static void pre_schedule_dl(struct rq *rq, struct task_struct *prev)
1429{
1430 /* Try to pull other tasks here */
1431 if (dl_task(prev))
1432 pull_dl_task(rq);
1433}
1434
1435static void post_schedule_dl(struct rq *rq)
1436{
1437 push_dl_tasks(rq);
1438}
1439
1440/*
1441 * Since the task is not running and a reschedule is not going to happen
1442 * anytime soon on its runqueue, we try pushing it away now.
1443 */
1444static void task_woken_dl(struct rq *rq, struct task_struct *p)
1445{
1446 if (!task_running(rq, p) &&
1447 !test_tsk_need_resched(rq->curr) &&
1448 has_pushable_dl_tasks(rq) &&
1449 p->nr_cpus_allowed > 1 &&
1450 dl_task(rq->curr) &&
1451 (rq->curr->nr_cpus_allowed < 2 ||
1452 dl_entity_preempt(&rq->curr->dl, &p->dl))) {
1453 push_dl_tasks(rq);
1454 }
1455}
1456
1457static void set_cpus_allowed_dl(struct task_struct *p,
1458 const struct cpumask *new_mask)
1459{
1460 struct rq *rq;
1461 int weight;
1462
1463 BUG_ON(!dl_task(p));
1464
1465 /*
1466 * Update only if the task is actually running (i.e.,
1467 * it is on the rq AND it is not throttled).
1468 */
1469 if (!on_dl_rq(&p->dl))
1470 return;
1471
1472 weight = cpumask_weight(new_mask);
1473
1474 /*
1475	 * Only update if the task changes state between being able
1476	 * to migrate and not being able to.
1477 */
1478 if ((p->nr_cpus_allowed > 1) == (weight > 1))
1479 return;
1480
1481 rq = task_rq(p);
1482
1483 /*
1484 * The process used to be able to migrate OR it can now migrate
1485 */
1486 if (weight <= 1) {
1487 if (!task_current(rq, p))
1488 dequeue_pushable_dl_task(rq, p);
1489 BUG_ON(!rq->dl.dl_nr_migratory);
1490 rq->dl.dl_nr_migratory--;
1491 } else {
1492 if (!task_current(rq, p))
1493 enqueue_pushable_dl_task(rq, p);
1494 rq->dl.dl_nr_migratory++;
1495 }
1496
1497 update_dl_migration(&rq->dl);
1498}
1499
1500/* Assumes rq->lock is held */
1501static void rq_online_dl(struct rq *rq)
1502{
1503 if (rq->dl.overloaded)
1504 dl_set_overload(rq);
1505
1506 if (rq->dl.dl_nr_running > 0)
1507 cpudl_set(&rq->rd->cpudl, rq->cpu, rq->dl.earliest_dl.curr, 1);
1508}
1509
1510/* Assumes rq->lock is held */
1511static void rq_offline_dl(struct rq *rq)
1512{
1513 if (rq->dl.overloaded)
1514 dl_clear_overload(rq);
1515
1516 cpudl_set(&rq->rd->cpudl, rq->cpu, 0, 0);
1517}
1518
1519void init_sched_dl_class(void)
1520{
1521 unsigned int i;
1522
1523 for_each_possible_cpu(i)
1524 zalloc_cpumask_var_node(&per_cpu(local_cpu_mask_dl, i),
1525 GFP_KERNEL, cpu_to_node(i));
1526}
1527
1528#endif /* CONFIG_SMP */
1529
1530static void switched_from_dl(struct rq *rq, struct task_struct *p)
1531{
1532 if (hrtimer_active(&p->dl.dl_timer) && !dl_policy(p->policy))
1533 hrtimer_try_to_cancel(&p->dl.dl_timer);
1534
1535#ifdef CONFIG_SMP
1536 /*
1537 * Since this might be the only -deadline task on the rq,
1538 * this is the right place to try to pull some other one
1539 * from an overloaded cpu, if any.
1540 */
1541 if (!rq->dl.dl_nr_running)
1542 pull_dl_task(rq);
1543#endif
1544}
1545
1546/*
1547 * When switching to -deadline, we may overload the rq; if so,
1548 * we try to push some other task away, if possible.
1549 */
1550static void switched_to_dl(struct rq *rq, struct task_struct *p)
1551{
1552 int check_resched = 1;
1553
1554 /*
1555 * If p is throttled, don't consider the possibility
1556	 * of preempting rq->curr; the check will be done right
1557	 * after its runtime gets replenished.
1558 */
1559 if (unlikely(p->dl.dl_throttled))
1560 return;
1561
1562 if (p->on_rq || rq->curr != p) {
1563#ifdef CONFIG_SMP
1564 if (rq->dl.overloaded && push_dl_task(rq) && rq != task_rq(p))
1565 /* Only reschedule if pushing failed */
1566 check_resched = 0;
1567#endif /* CONFIG_SMP */
1568 if (check_resched && task_has_dl_policy(rq->curr))
1569 check_preempt_curr_dl(rq, p, 0);
1570 }
1571}
1572
1573/*
1574 * If the scheduling parameters of a -deadline task changed,
1575 * a push or pull operation might be needed.
1576 */
1577static void prio_changed_dl(struct rq *rq, struct task_struct *p,
1578 int oldprio)
1579{
1580 if (p->on_rq || rq->curr == p) {
1581#ifdef CONFIG_SMP
1582 /*
1583 * This might be too much, but unfortunately
1584 * we don't have the old deadline value, and
1585		 * we can't tell whether the task's deadline got
1586		 * shorter or longer, so...
1587 */
1588 if (!rq->dl.overloaded)
1589 pull_dl_task(rq);
1590
1591 /*
1592		 * If we now have an earlier deadline task than p,
1593 * then reschedule, provided p is still on this
1594 * runqueue.
1595 */
1596 if (dl_time_before(rq->dl.earliest_dl.curr, p->dl.deadline) &&
1597 rq->curr == p)
1598 resched_task(p);
1599#else
1600 /*
1601		 * Again, we don't know if p has an earlier
1602 * or later deadline, so let's blindly set a
1603 * (maybe not needed) rescheduling point.
1604 */
1605 resched_task(p);
1606#endif /* CONFIG_SMP */
1607 } else
1608 switched_to_dl(rq, p);
1609}
1610
1611const struct sched_class dl_sched_class = {
1612 .next = &rt_sched_class,
1613 .enqueue_task = enqueue_task_dl,
1614 .dequeue_task = dequeue_task_dl,
1615 .yield_task = yield_task_dl,
1616
1617 .check_preempt_curr = check_preempt_curr_dl,
1618
1619 .pick_next_task = pick_next_task_dl,
1620 .put_prev_task = put_prev_task_dl,
1621
1622#ifdef CONFIG_SMP
1623 .select_task_rq = select_task_rq_dl,
1624 .set_cpus_allowed = set_cpus_allowed_dl,
1625 .rq_online = rq_online_dl,
1626 .rq_offline = rq_offline_dl,
1627 .pre_schedule = pre_schedule_dl,
1628 .post_schedule = post_schedule_dl,
1629 .task_woken = task_woken_dl,
1630#endif
1631
1632 .set_curr_task = set_curr_task_dl,
1633 .task_tick = task_tick_dl,
1634 .task_fork = task_fork_dl,
1635 .task_dead = task_dead_dl,
1636
1637 .prio_changed = prio_changed_dl,
1638 .switched_from = switched_from_dl,
1639 .switched_to = switched_to_dl,
1640};
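
That completes the new scheduling class. For context, here is a minimal user-space sketch of how a task would request it through the new sched_setattr() syscall this series introduces. The struct mirrors the uapi layout but is renamed to avoid clashing with newer libc headers; SCHED_DEADLINE is assumed to be 6 and __NR_sched_setattr to be provided by the system headers (314 on x86-64), so treat those constants as assumptions rather than as part of this patch:

#define _GNU_SOURCE
#include <stdint.h>
#include <stdio.h>
#include <unistd.h>
#include <sys/syscall.h>

#ifndef SCHED_DEADLINE
#define SCHED_DEADLINE  6       /* uapi value at the time of this series */
#endif

/* Mirrors the kernel's struct sched_attr layout (48 bytes, version 0). */
struct dl_sched_attr {
        uint32_t size;
        uint32_t sched_policy;
        uint64_t sched_flags;
        int32_t  sched_nice;            /* SCHED_NORMAL/BATCH */
        uint32_t sched_priority;        /* SCHED_FIFO/RR */
        uint64_t sched_runtime;         /* SCHED_DEADLINE, in nanoseconds */
        uint64_t sched_deadline;
        uint64_t sched_period;
};

static int dl_setattr(pid_t pid, const struct dl_sched_attr *attr)
{
        /* __NR_sched_setattr is 314 on x86-64; supply it by hand if your
         * headers predate the syscall. */
        return syscall(__NR_sched_setattr, pid, attr, 0);
}

int main(void)
{
        struct dl_sched_attr attr = {
                .size           = sizeof(attr),
                .sched_policy   = SCHED_DEADLINE,
                .sched_runtime  = 10 * 1000 * 1000,     /*  10 ms budget   */
                .sched_deadline = 30 * 1000 * 1000,     /*  30 ms deadline */
                .sched_period   = 100 * 1000 * 1000,    /* 100 ms period   */
        };

        if (dl_setattr(0, &attr)) {
                perror("sched_setattr");
                return 1;
        }

        /* From here on the task is scheduled by dl_sched_class. */
        pause();
        return 0;
}

Running this without sufficient privilege typically fails with EPERM, and it fails with EBUSY when the dl_bw admission control rejects the requested 10ms/100ms bandwidth.
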
diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c
index 5c34d1817e8f..dd52e7ffb10e 100644
--- a/kernel/sched/debug.c
+++ b/kernel/sched/debug.c
@@ -139,7 +139,7 @@ print_task(struct seq_file *m, struct rq *rq, struct task_struct *p)
139 0LL, 0LL, 0LL, 0L, 0LL, 0L, 0LL, 0L); 139 0LL, 0LL, 0LL, 0L, 0LL, 0L, 0LL, 0L);
140#endif 140#endif
141#ifdef CONFIG_NUMA_BALANCING 141#ifdef CONFIG_NUMA_BALANCING
142 SEQ_printf(m, " %d", cpu_to_node(task_cpu(p))); 142 SEQ_printf(m, " %d", task_node(p));
143#endif 143#endif
144#ifdef CONFIG_CGROUP_SCHED 144#ifdef CONFIG_CGROUP_SCHED
145 SEQ_printf(m, " %s", task_group_path(task_group(p))); 145 SEQ_printf(m, " %s", task_group_path(task_group(p)));
@@ -371,7 +371,7 @@ static void sched_debug_header(struct seq_file *m)
371 PN(cpu_clk); 371 PN(cpu_clk);
372 P(jiffies); 372 P(jiffies);
373#ifdef CONFIG_HAVE_UNSTABLE_SCHED_CLOCK 373#ifdef CONFIG_HAVE_UNSTABLE_SCHED_CLOCK
374 P(sched_clock_stable); 374 P(sched_clock_stable());
375#endif 375#endif
376#undef PN 376#undef PN
377#undef P 377#undef P
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index e8b652ebe027..867b0a4b0893 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -178,59 +178,61 @@ void sched_init_granularity(void)
178 update_sysctl(); 178 update_sysctl();
179} 179}
180 180
181#if BITS_PER_LONG == 32 181#define WMULT_CONST (~0U)
182# define WMULT_CONST (~0UL)
183#else
184# define WMULT_CONST (1UL << 32)
185#endif
186
187#define WMULT_SHIFT 32 182#define WMULT_SHIFT 32
188 183
189/* 184static void __update_inv_weight(struct load_weight *lw)
190 * Shift right and round: 185{
191 */ 186 unsigned long w;
192#define SRR(x, y) (((x) + (1UL << ((y) - 1))) >> (y)) 187
188 if (likely(lw->inv_weight))
189 return;
190
191 w = scale_load_down(lw->weight);
192
193 if (BITS_PER_LONG > 32 && unlikely(w >= WMULT_CONST))
194 lw->inv_weight = 1;
195 else if (unlikely(!w))
196 lw->inv_weight = WMULT_CONST;
197 else
198 lw->inv_weight = WMULT_CONST / w;
199}
193 200
194/* 201/*
195 * delta *= weight / lw 202 * delta_exec * weight / lw.weight
203 * OR
204 * (delta_exec * (weight * lw->inv_weight)) >> WMULT_SHIFT
205 *
206 * Either weight := NICE_0_LOAD and lw \e prio_to_wmult[], in which case
207 * we're guaranteed shift stays positive because inv_weight is guaranteed to
208 * fit 32 bits, and NICE_0_LOAD gives another 10 bits; therefore shift >= 22.
209 *
210 * Or, weight <= lw.weight (because lw.weight is the runqueue weight), thus
211 * weight/lw.weight <= 1, and therefore our shift will also be positive.
196 */ 212 */
197static unsigned long 213static u64 __calc_delta(u64 delta_exec, unsigned long weight, struct load_weight *lw)
198calc_delta_mine(unsigned long delta_exec, unsigned long weight,
199 struct load_weight *lw)
200{ 214{
201 u64 tmp; 215 u64 fact = scale_load_down(weight);
216 int shift = WMULT_SHIFT;
202 217
203 /* 218 __update_inv_weight(lw);
204 * weight can be less than 2^SCHED_LOAD_RESOLUTION for task group sched
205 * entities since MIN_SHARES = 2. Treat weight as 1 if less than
206 * 2^SCHED_LOAD_RESOLUTION.
207 */
208 if (likely(weight > (1UL << SCHED_LOAD_RESOLUTION)))
209 tmp = (u64)delta_exec * scale_load_down(weight);
210 else
211 tmp = (u64)delta_exec;
212
213 if (!lw->inv_weight) {
214 unsigned long w = scale_load_down(lw->weight);
215 219
216 if (BITS_PER_LONG > 32 && unlikely(w >= WMULT_CONST)) 220 if (unlikely(fact >> 32)) {
217 lw->inv_weight = 1; 221 while (fact >> 32) {
218 else if (unlikely(!w)) 222 fact >>= 1;
219 lw->inv_weight = WMULT_CONST; 223 shift--;
220 else 224 }
221 lw->inv_weight = WMULT_CONST / w;
222 } 225 }
223 226
224 /* 227 /* hint to use a 32x32->64 mul */
225 * Check whether we'd overflow the 64-bit multiplication: 228 fact = (u64)(u32)fact * lw->inv_weight;
226 */ 229
227 if (unlikely(tmp > WMULT_CONST)) 230 while (fact >> 32) {
228 tmp = SRR(SRR(tmp, WMULT_SHIFT/2) * lw->inv_weight, 231 fact >>= 1;
229 WMULT_SHIFT/2); 232 shift--;
230 else 233 }
231 tmp = SRR(tmp * lw->inv_weight, WMULT_SHIFT);
232 234
233 return (unsigned long)min(tmp, (u64)(unsigned long)LONG_MAX); 235 return mul_u64_u32_shr(delta_exec, fact, shift);
234} 236}
235 237
236 238
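
The block comment in this hunk is easier to follow with the arithmetic spelled out: delta * weight / lw.weight is evaluated as (delta * (weight * inv_weight)) >> 32, where inv_weight is roughly 2^32 / lw.weight, and fact is shifted down until it fits in 32 bits so the final multiply cannot overflow. A stand-alone sketch of the same scheme, simplified under a couple of assumptions: weight fits in 32 bits, lw_weight is non-zero, and the compiler provides __uint128_t (the kernel instead uses mul_u64_u32_shr(), which also works on 32-bit builds):

#include <stdint.h>
#include <stdio.h>

/* delta * weight / lw_weight via the 2^32 fixed-point inverse. */
static uint64_t calc_delta(uint64_t delta, uint32_t weight, uint32_t lw_weight)
{
        uint32_t inv_weight = (uint32_t)(~0U / lw_weight);  /* ~ 2^32 / lw_weight */
        uint64_t fact = (uint64_t)weight * inv_weight;
        int shift = 32;

        /* keep fact within 32 bits so the multiply below cannot overflow */
        while (fact >> 32) {
                fact >>= 1;
                shift--;
        }

        return (uint64_t)(((__uint128_t)delta * fact) >> shift);
}

int main(void)
{
        /* A nice-0 entity (weight 1024) on a runqueue of total weight 3072:
         * 6 ms of wall time becomes about 2 ms of weighted vruntime. */
        printf("%llu\n", (unsigned long long)calc_delta(6000000, 1024, 3072));
        return 0;
}
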
@@ -443,7 +445,7 @@ find_matching_se(struct sched_entity **se, struct sched_entity **pse)
443#endif /* CONFIG_FAIR_GROUP_SCHED */ 445#endif /* CONFIG_FAIR_GROUP_SCHED */
444 446
445static __always_inline 447static __always_inline
446void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, unsigned long delta_exec); 448void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec);
447 449
448/************************************************************** 450/**************************************************************
449 * Scheduling class tree data structure manipulation methods: 451 * Scheduling class tree data structure manipulation methods:
@@ -612,11 +614,10 @@ int sched_proc_update_handler(struct ctl_table *table, int write,
612/* 614/*
613 * delta /= w 615 * delta /= w
614 */ 616 */
615static inline unsigned long 617static inline u64 calc_delta_fair(u64 delta, struct sched_entity *se)
616calc_delta_fair(unsigned long delta, struct sched_entity *se)
617{ 618{
618 if (unlikely(se->load.weight != NICE_0_LOAD)) 619 if (unlikely(se->load.weight != NICE_0_LOAD))
619 delta = calc_delta_mine(delta, NICE_0_LOAD, &se->load); 620 delta = __calc_delta(delta, NICE_0_LOAD, &se->load);
620 621
621 return delta; 622 return delta;
622} 623}
@@ -665,7 +666,7 @@ static u64 sched_slice(struct cfs_rq *cfs_rq, struct sched_entity *se)
665 update_load_add(&lw, se->load.weight); 666 update_load_add(&lw, se->load.weight);
666 load = &lw; 667 load = &lw;
667 } 668 }
668 slice = calc_delta_mine(slice, se->load.weight, load); 669 slice = __calc_delta(slice, se->load.weight, load);
669 } 670 }
670 return slice; 671 return slice;
671} 672}
@@ -703,47 +704,32 @@ void init_task_runnable_average(struct task_struct *p)
703#endif 704#endif
704 705
705/* 706/*
706 * Update the current task's runtime statistics. Skip current tasks that 707 * Update the current task's runtime statistics.
707 * are not in our scheduling class.
708 */ 708 */
709static inline void
710__update_curr(struct cfs_rq *cfs_rq, struct sched_entity *curr,
711 unsigned long delta_exec)
712{
713 unsigned long delta_exec_weighted;
714
715 schedstat_set(curr->statistics.exec_max,
716 max((u64)delta_exec, curr->statistics.exec_max));
717
718 curr->sum_exec_runtime += delta_exec;
719 schedstat_add(cfs_rq, exec_clock, delta_exec);
720 delta_exec_weighted = calc_delta_fair(delta_exec, curr);
721
722 curr->vruntime += delta_exec_weighted;
723 update_min_vruntime(cfs_rq);
724}
725
726static void update_curr(struct cfs_rq *cfs_rq) 709static void update_curr(struct cfs_rq *cfs_rq)
727{ 710{
728 struct sched_entity *curr = cfs_rq->curr; 711 struct sched_entity *curr = cfs_rq->curr;
729 u64 now = rq_clock_task(rq_of(cfs_rq)); 712 u64 now = rq_clock_task(rq_of(cfs_rq));
730 unsigned long delta_exec; 713 u64 delta_exec;
731 714
732 if (unlikely(!curr)) 715 if (unlikely(!curr))
733 return; 716 return;
734 717
735 /* 718 delta_exec = now - curr->exec_start;
736 * Get the amount of time the current task was running 719 if (unlikely((s64)delta_exec <= 0))
737 * since the last time we changed load (this cannot
738 * overflow on 32 bits):
739 */
740 delta_exec = (unsigned long)(now - curr->exec_start);
741 if (!delta_exec)
742 return; 720 return;
743 721
744 __update_curr(cfs_rq, curr, delta_exec);
745 curr->exec_start = now; 722 curr->exec_start = now;
746 723
724 schedstat_set(curr->statistics.exec_max,
725 max(delta_exec, curr->statistics.exec_max));
726
727 curr->sum_exec_runtime += delta_exec;
728 schedstat_add(cfs_rq, exec_clock, delta_exec);
729
730 curr->vruntime += calc_delta_fair(delta_exec, curr);
731 update_min_vruntime(cfs_rq);
732
747 if (entity_is_task(curr)) { 733 if (entity_is_task(curr)) {
748 struct task_struct *curtask = task_of(curr); 734 struct task_struct *curtask = task_of(curr);
749 735
@@ -886,15 +872,6 @@ static unsigned int task_scan_max(struct task_struct *p)
886 return max(smin, smax); 872 return max(smin, smax);
887} 873}
888 874
889/*
890 * Once a preferred node is selected the scheduler balancer will prefer moving
891 * a task to that node for sysctl_numa_balancing_settle_count number of PTE
892 * scans. This will give the process the chance to accumulate more faults on
893 * the preferred node but still allow the scheduler to move the task again if
894 * the nodes CPUs are overloaded.
895 */
896unsigned int sysctl_numa_balancing_settle_count __read_mostly = 4;
897
898static void account_numa_enqueue(struct rq *rq, struct task_struct *p) 875static void account_numa_enqueue(struct rq *rq, struct task_struct *p)
899{ 876{
900 rq->nr_numa_running += (p->numa_preferred_nid != -1); 877 rq->nr_numa_running += (p->numa_preferred_nid != -1);
@@ -944,7 +921,8 @@ static inline unsigned long group_faults(struct task_struct *p, int nid)
944 if (!p->numa_group) 921 if (!p->numa_group)
945 return 0; 922 return 0;
946 923
947 return p->numa_group->faults[2*nid] + p->numa_group->faults[2*nid+1]; 924 return p->numa_group->faults[task_faults_idx(nid, 0)] +
925 p->numa_group->faults[task_faults_idx(nid, 1)];
948} 926}
949 927
950/* 928/*
@@ -1037,7 +1015,7 @@ struct task_numa_env {
1037 1015
1038 struct numa_stats src_stats, dst_stats; 1016 struct numa_stats src_stats, dst_stats;
1039 1017
1040 int imbalance_pct, idx; 1018 int imbalance_pct;
1041 1019
1042 struct task_struct *best_task; 1020 struct task_struct *best_task;
1043 long best_imp; 1021 long best_imp;
@@ -1225,7 +1203,7 @@ static int task_numa_migrate(struct task_struct *p)
1225 * elsewhere, so there is no point in (re)trying. 1203 * elsewhere, so there is no point in (re)trying.
1226 */ 1204 */
1227 if (unlikely(!sd)) { 1205 if (unlikely(!sd)) {
1228 p->numa_preferred_nid = cpu_to_node(task_cpu(p)); 1206 p->numa_preferred_nid = task_node(p);
1229 return -EINVAL; 1207 return -EINVAL;
1230 } 1208 }
1231 1209
@@ -1272,11 +1250,15 @@ static int task_numa_migrate(struct task_struct *p)
1272 p->numa_scan_period = task_scan_min(p); 1250 p->numa_scan_period = task_scan_min(p);
1273 1251
1274 if (env.best_task == NULL) { 1252 if (env.best_task == NULL) {
1275 int ret = migrate_task_to(p, env.best_cpu); 1253 ret = migrate_task_to(p, env.best_cpu);
1254 if (ret != 0)
1255 trace_sched_stick_numa(p, env.src_cpu, env.best_cpu);
1276 return ret; 1256 return ret;
1277 } 1257 }
1278 1258
1279 ret = migrate_swap(p, env.best_task); 1259 ret = migrate_swap(p, env.best_task);
1260 if (ret != 0)
1261 trace_sched_stick_numa(p, env.src_cpu, task_cpu(env.best_task));
1280 put_task_struct(env.best_task); 1262 put_task_struct(env.best_task);
1281 return ret; 1263 return ret;
1282} 1264}
@@ -1292,7 +1274,7 @@ static void numa_migrate_preferred(struct task_struct *p)
1292 p->numa_migrate_retry = jiffies + HZ; 1274 p->numa_migrate_retry = jiffies + HZ;
1293 1275
1294 /* Success if task is already running on preferred CPU */ 1276 /* Success if task is already running on preferred CPU */
1295 if (cpu_to_node(task_cpu(p)) == p->numa_preferred_nid) 1277 if (task_node(p) == p->numa_preferred_nid)
1296 return; 1278 return;
1297 1279
1298 /* Otherwise, try migrate to a CPU on the preferred node */ 1280 /* Otherwise, try migrate to a CPU on the preferred node */
@@ -1364,7 +1346,6 @@ static void update_task_scan_period(struct task_struct *p,
1364 * scanning faster if shared accesses dominate as it may 1346 * scanning faster if shared accesses dominate as it may
1365 * simply bounce migrations uselessly 1347 * simply bounce migrations uselessly
1366 */ 1348 */
1367 period_slot = DIV_ROUND_UP(diff, NUMA_PERIOD_SLOTS);
1368 ratio = DIV_ROUND_UP(private * NUMA_PERIOD_SLOTS, (private + shared)); 1349 ratio = DIV_ROUND_UP(private * NUMA_PERIOD_SLOTS, (private + shared));
1369 diff = (diff * ratio) / NUMA_PERIOD_SLOTS; 1350 diff = (diff * ratio) / NUMA_PERIOD_SLOTS;
1370 } 1351 }
@@ -1752,6 +1733,13 @@ void task_numa_work(struct callback_head *work)
1752 (vma->vm_file && (vma->vm_flags & (VM_READ|VM_WRITE)) == (VM_READ))) 1733 (vma->vm_file && (vma->vm_flags & (VM_READ|VM_WRITE)) == (VM_READ)))
1753 continue; 1734 continue;
1754 1735
1736 /*
1737 * Skip inaccessible VMAs to avoid any confusion between
1738 * PROT_NONE and NUMA hinting ptes
1739 */
1740 if (!(vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE)))
1741 continue;
1742
1755 do { 1743 do {
1756 start = max(start, vma->vm_start); 1744 start = max(start, vma->vm_start);
1757 end = ALIGN(start + (pages << PAGE_SHIFT), HPAGE_SIZE); 1745 end = ALIGN(start + (pages << PAGE_SHIFT), HPAGE_SIZE);
@@ -3015,8 +3003,7 @@ static void expire_cfs_rq_runtime(struct cfs_rq *cfs_rq)
3015 } 3003 }
3016} 3004}
3017 3005
3018static void __account_cfs_rq_runtime(struct cfs_rq *cfs_rq, 3006static void __account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec)
3019 unsigned long delta_exec)
3020{ 3007{
3021 /* dock delta_exec before expiring quota (as it could span periods) */ 3008 /* dock delta_exec before expiring quota (as it could span periods) */
3022 cfs_rq->runtime_remaining -= delta_exec; 3009 cfs_rq->runtime_remaining -= delta_exec;
@@ -3034,7 +3021,7 @@ static void __account_cfs_rq_runtime(struct cfs_rq *cfs_rq,
3034} 3021}
3035 3022
3036static __always_inline 3023static __always_inline
3037void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, unsigned long delta_exec) 3024void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec)
3038{ 3025{
3039 if (!cfs_bandwidth_used() || !cfs_rq->runtime_enabled) 3026 if (!cfs_bandwidth_used() || !cfs_rq->runtime_enabled)
3040 return; 3027 return;
@@ -3574,8 +3561,7 @@ static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq)
3574 return rq_clock_task(rq_of(cfs_rq)); 3561 return rq_clock_task(rq_of(cfs_rq));
3575} 3562}
3576 3563
3577static void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, 3564static void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec) {}
3578 unsigned long delta_exec) {}
3579static void check_cfs_rq_runtime(struct cfs_rq *cfs_rq) {} 3565static void check_cfs_rq_runtime(struct cfs_rq *cfs_rq) {}
3580static void check_enqueue_throttle(struct cfs_rq *cfs_rq) {} 3566static void check_enqueue_throttle(struct cfs_rq *cfs_rq) {}
3581static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq) {} 3567static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq) {}
@@ -3932,7 +3918,7 @@ static long effective_load(struct task_group *tg, int cpu, long wl, long wg)
3932{ 3918{
3933 struct sched_entity *se = tg->se[cpu]; 3919 struct sched_entity *se = tg->se[cpu];
3934 3920
3935 if (!tg->parent || !wl) /* the trivial, non-cgroup case */ 3921 if (!tg->parent) /* the trivial, non-cgroup case */
3936 return wl; 3922 return wl;
3937 3923
3938 for_each_sched_entity(se) { 3924 for_each_sched_entity(se) {
@@ -4110,12 +4096,16 @@ static int wake_affine(struct sched_domain *sd, struct task_struct *p, int sync)
4110 */ 4096 */
4111static struct sched_group * 4097static struct sched_group *
4112find_idlest_group(struct sched_domain *sd, struct task_struct *p, 4098find_idlest_group(struct sched_domain *sd, struct task_struct *p,
4113 int this_cpu, int load_idx) 4099 int this_cpu, int sd_flag)
4114{ 4100{
4115 struct sched_group *idlest = NULL, *group = sd->groups; 4101 struct sched_group *idlest = NULL, *group = sd->groups;
4116 unsigned long min_load = ULONG_MAX, this_load = 0; 4102 unsigned long min_load = ULONG_MAX, this_load = 0;
4103 int load_idx = sd->forkexec_idx;
4117 int imbalance = 100 + (sd->imbalance_pct-100)/2; 4104 int imbalance = 100 + (sd->imbalance_pct-100)/2;
4118 4105
4106 if (sd_flag & SD_BALANCE_WAKE)
4107 load_idx = sd->wake_idx;
4108
4119 do { 4109 do {
4120 unsigned long load, avg_load; 4110 unsigned long load, avg_load;
4121 int local_group; 4111 int local_group;
@@ -4283,7 +4273,6 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f
4283 } 4273 }
4284 4274
4285 while (sd) { 4275 while (sd) {
4286 int load_idx = sd->forkexec_idx;
4287 struct sched_group *group; 4276 struct sched_group *group;
4288 int weight; 4277 int weight;
4289 4278
@@ -4292,10 +4281,7 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f
4292 continue; 4281 continue;
4293 } 4282 }
4294 4283
4295 if (sd_flag & SD_BALANCE_WAKE) 4284 group = find_idlest_group(sd, p, cpu, sd_flag);
4296 load_idx = sd->wake_idx;
4297
4298 group = find_idlest_group(sd, p, cpu, load_idx);
4299 if (!group) { 4285 if (!group) {
4300 sd = sd->child; 4286 sd = sd->child;
4301 continue; 4287 continue;
@@ -5379,10 +5365,31 @@ void update_group_power(struct sched_domain *sd, int cpu)
5379 */ 5365 */
5380 5366
5381 for_each_cpu(cpu, sched_group_cpus(sdg)) { 5367 for_each_cpu(cpu, sched_group_cpus(sdg)) {
5382 struct sched_group *sg = cpu_rq(cpu)->sd->groups; 5368 struct sched_group_power *sgp;
5369 struct rq *rq = cpu_rq(cpu);
5370
5371 /*
5372 * build_sched_domains() -> init_sched_groups_power()
5373 * gets here before we've attached the domains to the
5374 * runqueues.
5375 *
5376 * Use power_of(), which is set irrespective of domains
5377 * in update_cpu_power().
5378 *
5379 * This avoids power/power_orig from being 0 and
5380 * causing divide-by-zero issues on boot.
5381 *
5382 * Runtime updates will correct power_orig.
5383 */
5384 if (unlikely(!rq->sd)) {
5385 power_orig += power_of(cpu);
5386 power += power_of(cpu);
5387 continue;
5388 }
5383 5389
5384 power_orig += sg->sgp->power_orig; 5390 sgp = rq->sd->groups->sgp;
5385 power += sg->sgp->power; 5391 power_orig += sgp->power_orig;
5392 power += sgp->power;
5386 } 5393 }
5387 } else { 5394 } else {
5388 /* 5395 /*
@@ -5500,7 +5507,6 @@ static inline void update_sg_lb_stats(struct lb_env *env,
5500 struct sched_group *group, int load_idx, 5507 struct sched_group *group, int load_idx,
5501 int local_group, struct sg_lb_stats *sgs) 5508 int local_group, struct sg_lb_stats *sgs)
5502{ 5509{
5503 unsigned long nr_running;
5504 unsigned long load; 5510 unsigned long load;
5505 int i; 5511 int i;
5506 5512
@@ -5509,8 +5515,6 @@ static inline void update_sg_lb_stats(struct lb_env *env,
5509 for_each_cpu_and(i, sched_group_cpus(group), env->cpus) { 5515 for_each_cpu_and(i, sched_group_cpus(group), env->cpus) {
5510 struct rq *rq = cpu_rq(i); 5516 struct rq *rq = cpu_rq(i);
5511 5517
5512 nr_running = rq->nr_running;
5513
5514 /* Bias balancing toward cpus of our domain */ 5518 /* Bias balancing toward cpus of our domain */
5515 if (local_group) 5519 if (local_group)
5516 load = target_load(i, load_idx); 5520 load = target_load(i, load_idx);
@@ -5518,7 +5522,7 @@ static inline void update_sg_lb_stats(struct lb_env *env,
5518 load = source_load(i, load_idx); 5522 load = source_load(i, load_idx);
5519 5523
5520 sgs->group_load += load; 5524 sgs->group_load += load;
5521 sgs->sum_nr_running += nr_running; 5525 sgs->sum_nr_running += rq->nr_running;
5522#ifdef CONFIG_NUMA_BALANCING 5526#ifdef CONFIG_NUMA_BALANCING
5523 sgs->nr_numa_running += rq->nr_numa_running; 5527 sgs->nr_numa_running += rq->nr_numa_running;
5524 sgs->nr_preferred_running += rq->nr_preferred_running; 5528 sgs->nr_preferred_running += rq->nr_preferred_running;
@@ -6509,7 +6513,7 @@ static struct {
6509 unsigned long next_balance; /* in jiffy units */ 6513 unsigned long next_balance; /* in jiffy units */
6510} nohz ____cacheline_aligned; 6514} nohz ____cacheline_aligned;
6511 6515
6512static inline int find_new_ilb(int call_cpu) 6516static inline int find_new_ilb(void)
6513{ 6517{
6514 int ilb = cpumask_first(nohz.idle_cpus_mask); 6518 int ilb = cpumask_first(nohz.idle_cpus_mask);
6515 6519
@@ -6524,13 +6528,13 @@ static inline int find_new_ilb(int call_cpu)
6524 * nohz_load_balancer CPU (if there is one) otherwise fallback to any idle 6528 * nohz_load_balancer CPU (if there is one) otherwise fallback to any idle
6525 * CPU (if there is one). 6529 * CPU (if there is one).
6526 */ 6530 */
6527static void nohz_balancer_kick(int cpu) 6531static void nohz_balancer_kick(void)
6528{ 6532{
6529 int ilb_cpu; 6533 int ilb_cpu;
6530 6534
6531 nohz.next_balance++; 6535 nohz.next_balance++;
6532 6536
6533 ilb_cpu = find_new_ilb(cpu); 6537 ilb_cpu = find_new_ilb();
6534 6538
6535 if (ilb_cpu >= nr_cpu_ids) 6539 if (ilb_cpu >= nr_cpu_ids)
6536 return; 6540 return;
@@ -6640,10 +6644,10 @@ void update_max_interval(void)
6640 * 6644 *
6641 * Balancing parameters are set up in init_sched_domains. 6645 * Balancing parameters are set up in init_sched_domains.
6642 */ 6646 */
6643static void rebalance_domains(int cpu, enum cpu_idle_type idle) 6647static void rebalance_domains(struct rq *rq, enum cpu_idle_type idle)
6644{ 6648{
6645 int continue_balancing = 1; 6649 int continue_balancing = 1;
6646 struct rq *rq = cpu_rq(cpu); 6650 int cpu = rq->cpu;
6647 unsigned long interval; 6651 unsigned long interval;
6648 struct sched_domain *sd; 6652 struct sched_domain *sd;
6649 /* Earliest time when we have to do rebalance again */ 6653 /* Earliest time when we have to do rebalance again */
@@ -6740,9 +6744,9 @@ out:
6740 * In CONFIG_NO_HZ_COMMON case, the idle balance kickee will do the 6744 * In CONFIG_NO_HZ_COMMON case, the idle balance kickee will do the
6741 * rebalancing for all the cpus for whom scheduler ticks are stopped. 6745 * rebalancing for all the cpus for whom scheduler ticks are stopped.
6742 */ 6746 */
6743static void nohz_idle_balance(int this_cpu, enum cpu_idle_type idle) 6747static void nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle)
6744{ 6748{
6745 struct rq *this_rq = cpu_rq(this_cpu); 6749 int this_cpu = this_rq->cpu;
6746 struct rq *rq; 6750 struct rq *rq;
6747 int balance_cpu; 6751 int balance_cpu;
6748 6752
@@ -6769,7 +6773,7 @@ static void nohz_idle_balance(int this_cpu, enum cpu_idle_type idle)
6769 update_idle_cpu_load(rq); 6773 update_idle_cpu_load(rq);
6770 raw_spin_unlock_irq(&rq->lock); 6774 raw_spin_unlock_irq(&rq->lock);
6771 6775
6772 rebalance_domains(balance_cpu, CPU_IDLE); 6776 rebalance_domains(rq, CPU_IDLE);
6773 6777
6774 if (time_after(this_rq->next_balance, rq->next_balance)) 6778 if (time_after(this_rq->next_balance, rq->next_balance))
6775 this_rq->next_balance = rq->next_balance; 6779 this_rq->next_balance = rq->next_balance;
@@ -6788,14 +6792,14 @@ end:
6788 * - For SD_ASYM_PACKING, if the lower numbered cpu's in the scheduler 6792 * - For SD_ASYM_PACKING, if the lower numbered cpu's in the scheduler
6789 * domain span are idle. 6793 * domain span are idle.
6790 */ 6794 */
6791static inline int nohz_kick_needed(struct rq *rq, int cpu) 6795static inline int nohz_kick_needed(struct rq *rq)
6792{ 6796{
6793 unsigned long now = jiffies; 6797 unsigned long now = jiffies;
6794 struct sched_domain *sd; 6798 struct sched_domain *sd;
6795 struct sched_group_power *sgp; 6799 struct sched_group_power *sgp;
6796 int nr_busy; 6800 int nr_busy, cpu = rq->cpu;
6797 6801
6798 if (unlikely(idle_cpu(cpu))) 6802 if (unlikely(rq->idle_balance))
6799 return 0; 6803 return 0;
6800 6804
6801 /* 6805 /*
@@ -6844,7 +6848,7 @@ need_kick:
6844 return 1; 6848 return 1;
6845} 6849}
6846#else 6850#else
6847static void nohz_idle_balance(int this_cpu, enum cpu_idle_type idle) { } 6851static void nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle) { }
6848#endif 6852#endif
6849 6853
6850/* 6854/*
@@ -6853,38 +6857,39 @@ static void nohz_idle_balance(int this_cpu, enum cpu_idle_type idle) { }
6853 */ 6857 */
6854static void run_rebalance_domains(struct softirq_action *h) 6858static void run_rebalance_domains(struct softirq_action *h)
6855{ 6859{
6856 int this_cpu = smp_processor_id(); 6860 struct rq *this_rq = this_rq();
6857 struct rq *this_rq = cpu_rq(this_cpu);
6858 enum cpu_idle_type idle = this_rq->idle_balance ? 6861 enum cpu_idle_type idle = this_rq->idle_balance ?
6859 CPU_IDLE : CPU_NOT_IDLE; 6862 CPU_IDLE : CPU_NOT_IDLE;
6860 6863
6861 rebalance_domains(this_cpu, idle); 6864 rebalance_domains(this_rq, idle);
6862 6865
6863 /* 6866 /*
6864 * If this cpu has a pending nohz_balance_kick, then do the 6867 * If this cpu has a pending nohz_balance_kick, then do the
6865 * balancing on behalf of the other idle cpus whose ticks are 6868 * balancing on behalf of the other idle cpus whose ticks are
6866 * stopped. 6869 * stopped.
6867 */ 6870 */
6868 nohz_idle_balance(this_cpu, idle); 6871 nohz_idle_balance(this_rq, idle);
6869} 6872}
6870 6873
6871static inline int on_null_domain(int cpu) 6874static inline int on_null_domain(struct rq *rq)
6872{ 6875{
6873 return !rcu_dereference_sched(cpu_rq(cpu)->sd); 6876 return !rcu_dereference_sched(rq->sd);
6874} 6877}
6875 6878
6876/* 6879/*
6877 * Trigger the SCHED_SOFTIRQ if it is time to do periodic load balancing. 6880 * Trigger the SCHED_SOFTIRQ if it is time to do periodic load balancing.
6878 */ 6881 */
6879void trigger_load_balance(struct rq *rq, int cpu) 6882void trigger_load_balance(struct rq *rq)
6880{ 6883{
6881 /* Don't need to rebalance while attached to NULL domain */ 6884 /* Don't need to rebalance while attached to NULL domain */
6882 if (time_after_eq(jiffies, rq->next_balance) && 6885 if (unlikely(on_null_domain(rq)))
6883 likely(!on_null_domain(cpu))) 6886 return;
6887
6888 if (time_after_eq(jiffies, rq->next_balance))
6884 raise_softirq(SCHED_SOFTIRQ); 6889 raise_softirq(SCHED_SOFTIRQ);
6885#ifdef CONFIG_NO_HZ_COMMON 6890#ifdef CONFIG_NO_HZ_COMMON
6886 if (nohz_kick_needed(rq, cpu) && likely(!on_null_domain(cpu))) 6891 if (nohz_kick_needed(rq))
6887 nohz_balancer_kick(cpu); 6892 nohz_balancer_kick();
6888#endif 6893#endif
6889} 6894}
6890 6895
diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c
index 7d57275fc396..a2740b775b45 100644
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -901,6 +901,13 @@ inc_rt_prio_smp(struct rt_rq *rt_rq, int prio, int prev_prio)
901{ 901{
902 struct rq *rq = rq_of_rt_rq(rt_rq); 902 struct rq *rq = rq_of_rt_rq(rt_rq);
903 903
904#ifdef CONFIG_RT_GROUP_SCHED
905 /*
906 * Change rq's cpupri only if rt_rq is the top queue.
907 */
908 if (&rq->rt != rt_rq)
909 return;
910#endif
904 if (rq->online && prio < prev_prio) 911 if (rq->online && prio < prev_prio)
905 cpupri_set(&rq->rd->cpupri, rq->cpu, prio); 912 cpupri_set(&rq->rd->cpupri, rq->cpu, prio);
906} 913}
@@ -910,6 +917,13 @@ dec_rt_prio_smp(struct rt_rq *rt_rq, int prio, int prev_prio)
910{ 917{
911 struct rq *rq = rq_of_rt_rq(rt_rq); 918 struct rq *rq = rq_of_rt_rq(rt_rq);
912 919
920#ifdef CONFIG_RT_GROUP_SCHED
921 /*
922 * Change rq's cpupri only if rt_rq is the top queue.
923 */
924 if (&rq->rt != rt_rq)
925 return;
926#endif
913 if (rq->online && rt_rq->highest_prio.curr != prev_prio) 927 if (rq->online && rt_rq->highest_prio.curr != prev_prio)
914 cpupri_set(&rq->rd->cpupri, rq->cpu, rt_rq->highest_prio.curr); 928 cpupri_set(&rq->rd->cpupri, rq->cpu, rt_rq->highest_prio.curr);
915} 929}
@@ -1724,7 +1738,7 @@ static void task_woken_rt(struct rq *rq, struct task_struct *p)
1724 !test_tsk_need_resched(rq->curr) && 1738 !test_tsk_need_resched(rq->curr) &&
1725 has_pushable_tasks(rq) && 1739 has_pushable_tasks(rq) &&
1726 p->nr_cpus_allowed > 1 && 1740 p->nr_cpus_allowed > 1 &&
1727 rt_task(rq->curr) && 1741 (dl_task(rq->curr) || rt_task(rq->curr)) &&
1728 (rq->curr->nr_cpus_allowed < 2 || 1742 (rq->curr->nr_cpus_allowed < 2 ||
1729 rq->curr->prio <= p->prio)) 1743 rq->curr->prio <= p->prio))
1730 push_rt_tasks(rq); 1744 push_rt_tasks(rq);
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 88c85b21d633..c2119fd20f8b 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -2,6 +2,7 @@
2#include <linux/sched.h> 2#include <linux/sched.h>
3#include <linux/sched/sysctl.h> 3#include <linux/sched/sysctl.h>
4#include <linux/sched/rt.h> 4#include <linux/sched/rt.h>
5#include <linux/sched/deadline.h>
5#include <linux/mutex.h> 6#include <linux/mutex.h>
6#include <linux/spinlock.h> 7#include <linux/spinlock.h>
7#include <linux/stop_machine.h> 8#include <linux/stop_machine.h>
@@ -9,6 +10,7 @@
9#include <linux/slab.h> 10#include <linux/slab.h>
10 11
11#include "cpupri.h" 12#include "cpupri.h"
13#include "cpudeadline.h"
12#include "cpuacct.h" 14#include "cpuacct.h"
13 15
14struct rq; 16struct rq;
@@ -73,6 +75,13 @@ extern void update_cpu_load_active(struct rq *this_rq);
73#define NICE_0_SHIFT SCHED_LOAD_SHIFT 75#define NICE_0_SHIFT SCHED_LOAD_SHIFT
74 76
75/* 77/*
78 * Single value that decides SCHED_DEADLINE internal math precision.
79 * 10 -> just above 1us
80 * 9 -> just above 0.5us
81 */
82#define DL_SCALE (10)
83
84/*
76 * These are the 'tuning knobs' of the scheduler: 85 * These are the 'tuning knobs' of the scheduler:
77 */ 86 */
78 87
@@ -81,11 +90,19 @@ extern void update_cpu_load_active(struct rq *this_rq);
81 */ 90 */
82#define RUNTIME_INF ((u64)~0ULL) 91#define RUNTIME_INF ((u64)~0ULL)
83 92
93static inline int fair_policy(int policy)
94{
95 return policy == SCHED_NORMAL || policy == SCHED_BATCH;
96}
97
84static inline int rt_policy(int policy) 98static inline int rt_policy(int policy)
85{ 99{
86 if (policy == SCHED_FIFO || policy == SCHED_RR) 100 return policy == SCHED_FIFO || policy == SCHED_RR;
87 return 1; 101}
88 return 0; 102
103static inline int dl_policy(int policy)
104{
105 return policy == SCHED_DEADLINE;
89} 106}
90 107
91static inline int task_has_rt_policy(struct task_struct *p) 108static inline int task_has_rt_policy(struct task_struct *p)
@@ -93,6 +110,25 @@ static inline int task_has_rt_policy(struct task_struct *p)
93 return rt_policy(p->policy); 110 return rt_policy(p->policy);
94} 111}
95 112
113static inline int task_has_dl_policy(struct task_struct *p)
114{
115 return dl_policy(p->policy);
116}
117
118static inline bool dl_time_before(u64 a, u64 b)
119{
120 return (s64)(a - b) < 0;
121}
122
123/*
124 * Tells if entity @a should preempt entity @b.
125 */
126static inline bool
127dl_entity_preempt(struct sched_dl_entity *a, struct sched_dl_entity *b)
128{
129 return dl_time_before(a->deadline, b->deadline);
130}
131
96/* 132/*
97 * This is the priority-queue data structure of the RT scheduling class: 133 * This is the priority-queue data structure of the RT scheduling class:
98 */ 134 */
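
dl_time_before() above uses the classic wrap-safe comparison: subtract in unsigned arithmetic, then look at the sign of the result, much like the jiffies time_before() helpers. A stand-alone check of why the signed cast matters (a sketch, not kernel code):

#include <assert.h>
#include <stdint.h>

static int dl_time_before(uint64_t a, uint64_t b)
{
        return (int64_t)(a - b) < 0;
}

int main(void)
{
        uint64_t near_wrap  = UINT64_MAX - 100; /* deadline just before the clock wraps */
        uint64_t after_wrap = 50;               /* deadline just after the wrap */

        assert(dl_time_before(near_wrap, after_wrap));  /* ordering survives the wrap */
        assert(!(near_wrap < after_wrap));              /* a naive compare gets it wrong */
        return 0;
}
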
@@ -108,6 +144,47 @@ struct rt_bandwidth {
108 u64 rt_runtime; 144 u64 rt_runtime;
109 struct hrtimer rt_period_timer; 145 struct hrtimer rt_period_timer;
110}; 146};
147/*
148 * To keep the bandwidth of -deadline tasks and groups under control
149 * we need some place where:
150 * - store the maximum -deadline bandwidth of the system (the group);
151 * - cache the fraction of that bandwidth that is currently allocated.
152 *
153 * This is all done in the data structure below. It is similar to the
154 * one used for RT-throttling (rt_bandwidth), with the main difference
155 * that, since here we are only interested in admission control, we
156 * do not decrease any runtime while the group "executes", nor do we
157 * need a timer to replenish it.
158 *
159 * With respect to SMP, the bandwidth is given on a per-CPU basis,
160 * meaning that:
161 * - dl_bw (< 100%) is the bandwidth of the system (group) on each CPU;
162 * - the dl_total_bw array contains, in its i-th element, the currently
163 * allocated bandwidth on the i-th CPU.
164 * Moreover, groups consume bandwidth on each CPU, while tasks only
165 * consume bandwidth on the CPU they're running on.
166 * Finally, dl_total_bw_cpu is used to cache the index of dl_total_bw
167 * that will be shown the next time the proc or cgroup controls are
168 * read. In turn, it can be changed by writing to its own
169 * control.
170 */
171struct dl_bandwidth {
172 raw_spinlock_t dl_runtime_lock;
173 u64 dl_runtime;
174 u64 dl_period;
175};
176
177static inline int dl_bandwidth_enabled(void)
178{
179 return sysctl_sched_rt_runtime >= 0;
180}
181
182extern struct dl_bw *dl_bw_of(int i);
183
184struct dl_bw {
185 raw_spinlock_t lock;
186 u64 bw, total_bw;
187};
111 188
112extern struct mutex sched_domains_mutex; 189extern struct mutex sched_domains_mutex;
113 190
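
A stand-alone sketch of the admission arithmetic this dl_bw bookkeeping backs: each task contributes runtime/period worth of a CPU, and a new task is accepted only while the allocated total stays below the cap replicated over the CPUs of the root domain. The 2^20 fixed-point scaling and the cpus multiplier mirror to_ratio() and __dl_overflow() in core.c, which are outside this hunk, so treat them here as assumptions:

#include <stdint.h>
#include <stdio.h>

#define BW_SHIFT        20

/* runtime/period as a 2^20 fixed-point fraction (only the ratio matters) */
static uint64_t to_ratio(uint64_t period, uint64_t runtime)
{
        return (runtime << BW_SHIFT) / period;
}

/* Accept a task iff the allocated bandwidth stays within cpus * cap_bw. */
static int dl_admit(uint64_t *total_bw, uint64_t cap_bw, int cpus,
                    uint64_t runtime, uint64_t period)
{
        uint64_t new_bw = to_ratio(period, runtime);

        if (*total_bw + new_bw > (uint64_t)cpus * cap_bw)
                return 0;
        *total_bw += new_bw;
        return 1;
}

int main(void)
{
        uint64_t total = 0;
        /* default cap: 95% of each CPU (sched_rt_runtime_us / sched_rt_period_us) */
        uint64_t cap = to_ratio(1000000, 950000);
        int i;

        /* 10 ms every 30 ms is ~33% of a CPU: two such tasks fit under the
         * 95% cap on a single CPU, the third is rejected. */
        for (i = 0; i < 3; i++)
                printf("task %d: %s\n", i,
                       dl_admit(&total, cap, 1, 10000000, 30000000) ?
                       "admitted" : "rejected");
        return 0;
}
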
@@ -364,6 +441,42 @@ struct rt_rq {
364#endif 441#endif
365}; 442};
366 443
444/* Deadline class' related fields in a runqueue */
445struct dl_rq {
446 /* runqueue is an rbtree, ordered by deadline */
447 struct rb_root rb_root;
448 struct rb_node *rb_leftmost;
449
450 unsigned long dl_nr_running;
451
452#ifdef CONFIG_SMP
453 /*
454 * Deadline values of the currently executing and the
455 * earliest ready task on this rq. Caching these facilitates
456	 * the decision whether or not a ready but not running task
457 * should migrate somewhere else.
458 */
459 struct {
460 u64 curr;
461 u64 next;
462 } earliest_dl;
463
464 unsigned long dl_nr_migratory;
465 unsigned long dl_nr_total;
466 int overloaded;
467
468 /*
469 * Tasks on this rq that can be pushed away. They are kept in
470 * an rb-tree, ordered by tasks' deadlines, with caching
471 * of the leftmost (earliest deadline) element.
472 */
473 struct rb_root pushable_dl_tasks_root;
474 struct rb_node *pushable_dl_tasks_leftmost;
475#else
476 struct dl_bw dl_bw;
477#endif
478};
479
367#ifdef CONFIG_SMP 480#ifdef CONFIG_SMP
368 481
369/* 482/*
@@ -382,6 +495,15 @@ struct root_domain {
382 cpumask_var_t online; 495 cpumask_var_t online;
383 496
384 /* 497 /*
498 * The bit corresponding to a CPU gets set here if such CPU has more
499 * than one runnable -deadline task (as it is below for RT tasks).
500 */
501 cpumask_var_t dlo_mask;
502 atomic_t dlo_count;
503 struct dl_bw dl_bw;
504 struct cpudl cpudl;
505
506 /*
385 * The "RT overload" flag: it gets set if a CPU has more than 507 * The "RT overload" flag: it gets set if a CPU has more than
386 * one runnable RT task. 508 * one runnable RT task.
387 */ 509 */
@@ -432,6 +554,7 @@ struct rq {
432 554
433 struct cfs_rq cfs; 555 struct cfs_rq cfs;
434 struct rt_rq rt; 556 struct rt_rq rt;
557 struct dl_rq dl;
435 558
436#ifdef CONFIG_FAIR_GROUP_SCHED 559#ifdef CONFIG_FAIR_GROUP_SCHED
437 /* list of leaf cfs_rq on this cpu: */ 560 /* list of leaf cfs_rq on this cpu: */
@@ -827,8 +950,6 @@ static inline u64 global_rt_runtime(void)
827 return (u64)sysctl_sched_rt_runtime * NSEC_PER_USEC; 950 return (u64)sysctl_sched_rt_runtime * NSEC_PER_USEC;
828} 951}
829 952
830
831
832static inline int task_current(struct rq *rq, struct task_struct *p) 953static inline int task_current(struct rq *rq, struct task_struct *p)
833{ 954{
834 return rq->curr == p; 955 return rq->curr == p;
@@ -988,6 +1109,7 @@ static const u32 prio_to_wmult[40] = {
988#else 1109#else
989#define ENQUEUE_WAKING 0 1110#define ENQUEUE_WAKING 0
990#endif 1111#endif
1112#define ENQUEUE_REPLENISH 8
991 1113
992#define DEQUEUE_SLEEP 1 1114#define DEQUEUE_SLEEP 1
993 1115
@@ -1023,6 +1145,7 @@ struct sched_class {
1023 void (*set_curr_task) (struct rq *rq); 1145 void (*set_curr_task) (struct rq *rq);
1024 void (*task_tick) (struct rq *rq, struct task_struct *p, int queued); 1146 void (*task_tick) (struct rq *rq, struct task_struct *p, int queued);
1025 void (*task_fork) (struct task_struct *p); 1147 void (*task_fork) (struct task_struct *p);
1148 void (*task_dead) (struct task_struct *p);
1026 1149
1027 void (*switched_from) (struct rq *this_rq, struct task_struct *task); 1150 void (*switched_from) (struct rq *this_rq, struct task_struct *task);
1028 void (*switched_to) (struct rq *this_rq, struct task_struct *task); 1151 void (*switched_to) (struct rq *this_rq, struct task_struct *task);
@@ -1042,6 +1165,7 @@ struct sched_class {
1042 for (class = sched_class_highest; class; class = class->next) 1165 for (class = sched_class_highest; class; class = class->next)
1043 1166
1044extern const struct sched_class stop_sched_class; 1167extern const struct sched_class stop_sched_class;
1168extern const struct sched_class dl_sched_class;
1045extern const struct sched_class rt_sched_class; 1169extern const struct sched_class rt_sched_class;
1046extern const struct sched_class fair_sched_class; 1170extern const struct sched_class fair_sched_class;
1047extern const struct sched_class idle_sched_class; 1171extern const struct sched_class idle_sched_class;
@@ -1051,7 +1175,7 @@ extern const struct sched_class idle_sched_class;
1051 1175
1052extern void update_group_power(struct sched_domain *sd, int cpu); 1176extern void update_group_power(struct sched_domain *sd, int cpu);
1053 1177
1054extern void trigger_load_balance(struct rq *rq, int cpu); 1178extern void trigger_load_balance(struct rq *rq);
1055extern void idle_balance(int this_cpu, struct rq *this_rq); 1179extern void idle_balance(int this_cpu, struct rq *this_rq);
1056 1180
1057extern void idle_enter_fair(struct rq *this_rq); 1181extern void idle_enter_fair(struct rq *this_rq);
@@ -1068,8 +1192,11 @@ static inline void idle_balance(int cpu, struct rq *rq)
1068extern void sysrq_sched_debug_show(void); 1192extern void sysrq_sched_debug_show(void);
1069extern void sched_init_granularity(void); 1193extern void sched_init_granularity(void);
1070extern void update_max_interval(void); 1194extern void update_max_interval(void);
1195
1196extern void init_sched_dl_class(void);
1071extern void init_sched_rt_class(void); 1197extern void init_sched_rt_class(void);
1072extern void init_sched_fair_class(void); 1198extern void init_sched_fair_class(void);
1199extern void init_sched_dl_class(void);
1073 1200
1074extern void resched_task(struct task_struct *p); 1201extern void resched_task(struct task_struct *p);
1075extern void resched_cpu(int cpu); 1202extern void resched_cpu(int cpu);
@@ -1077,6 +1204,12 @@ extern void resched_cpu(int cpu);
1077extern struct rt_bandwidth def_rt_bandwidth; 1204extern struct rt_bandwidth def_rt_bandwidth;
1078extern void init_rt_bandwidth(struct rt_bandwidth *rt_b, u64 period, u64 runtime); 1205extern void init_rt_bandwidth(struct rt_bandwidth *rt_b, u64 period, u64 runtime);
1079 1206
1207extern struct dl_bandwidth def_dl_bandwidth;
1208extern void init_dl_bandwidth(struct dl_bandwidth *dl_b, u64 period, u64 runtime);
1209extern void init_dl_task_timer(struct sched_dl_entity *dl_se);
1210
1211unsigned long to_ratio(u64 period, u64 runtime);
1212
1080extern void update_idle_cpu_load(struct rq *this_rq); 1213extern void update_idle_cpu_load(struct rq *this_rq);
1081 1214
1082extern void init_task_runnable_average(struct task_struct *p); 1215extern void init_task_runnable_average(struct task_struct *p);
@@ -1353,6 +1486,7 @@ extern void print_rt_stats(struct seq_file *m, int cpu);
1353 1486
1354extern void init_cfs_rq(struct cfs_rq *cfs_rq); 1487extern void init_cfs_rq(struct cfs_rq *cfs_rq);
1355extern void init_rt_rq(struct rt_rq *rt_rq, struct rq *rq); 1488extern void init_rt_rq(struct rt_rq *rt_rq, struct rq *rq);
1489extern void init_dl_rq(struct dl_rq *dl_rq, struct rq *rq);
1356 1490
1357extern void cfs_bandwidth_usage_inc(void); 1491extern void cfs_bandwidth_usage_inc(void);
1358extern void cfs_bandwidth_usage_dec(void); 1492extern void cfs_bandwidth_usage_dec(void);
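Among the declarations added above, to_ratio(period, runtime) is the helper the deadline admission-control code uses to turn a runtime/period pair into a fixed-point utilization that can be summed and compared cheaply. Its body is not part of this hunk; the following is only a sketch of one plausible implementation, and the 20-bit shift and the RUNTIME_INF special case are assumptions rather than something this patch shows.

/* Hedged sketch: runtime/period as a fixed-point ratio (20 fractional bits). */
static unsigned long to_ratio_sketch(u64 period, u64 runtime)
{
	if (runtime == RUNTIME_INF)		/* unlimited runtime: full ratio */
		return 1ULL << 20;

	if (!period)				/* guard against division by zero */
		return 0;

	return div64_u64(runtime << 20, period);	/* from <linux/math64.h> */
}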
diff --git a/kernel/sched/stop_task.c b/kernel/sched/stop_task.c
index 47197de8abd9..fdb6bb0b3356 100644
--- a/kernel/sched/stop_task.c
+++ b/kernel/sched/stop_task.c
@@ -103,7 +103,7 @@ get_rr_interval_stop(struct rq *rq, struct task_struct *task)
103 * Simple, special scheduling class for the per-CPU stop tasks: 103 * Simple, special scheduling class for the per-CPU stop tasks:
104 */ 104 */
105const struct sched_class stop_sched_class = { 105const struct sched_class stop_sched_class = {
106 .next = &rt_sched_class, 106 .next = &dl_sched_class,
107 107
108 .enqueue_task = enqueue_task_stop, 108 .enqueue_task = enqueue_task_stop,
109 .dequeue_task = dequeue_task_stop, 109 .dequeue_task = dequeue_task_stop,
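Chaining dl_sched_class between the stop and rt classes gives the scheduler's class walk the priority order stop > deadline > rt > fair > idle. For orientation, here is a hedged sketch of how that singly linked list is typically walked when picking the next task; it mirrors the for_each_class() loop visible in the sched.h hunk above and assumes the 3.13-era pick_next_task(rq) signature, so treat it as illustrative rather than a quote of core.c.

/* Illustrative walk over the class list, highest priority first. */
static struct task_struct *sketch_pick_next_task(struct rq *rq)
{
	const struct sched_class *class;
	struct task_struct *p;

	for_each_class(class) {		/* stop -> dl -> rt -> fair -> idle */
		p = class->pick_next_task(rq);
		if (p)
			return p;
	}

	/* Not reached: the idle class always returns a task. */
	return NULL;
}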
diff --git a/kernel/softirq.c b/kernel/softirq.c
index 11025ccc06dd..8a1e6e104892 100644
--- a/kernel/softirq.c
+++ b/kernel/softirq.c
@@ -89,7 +89,7 @@ static void wakeup_softirqd(void)
89 * where hardirqs are disabled legitimately: 89 * where hardirqs are disabled legitimately:
90 */ 90 */
91#ifdef CONFIG_TRACE_IRQFLAGS 91#ifdef CONFIG_TRACE_IRQFLAGS
92static void __local_bh_disable(unsigned long ip, unsigned int cnt) 92void __local_bh_disable_ip(unsigned long ip, unsigned int cnt)
93{ 93{
94 unsigned long flags; 94 unsigned long flags;
95 95
@@ -107,33 +107,21 @@ static void __local_bh_disable(unsigned long ip, unsigned int cnt)
107 /* 107 /*
108 * Were softirqs turned off above: 108 * Were softirqs turned off above:
109 */ 109 */
110 if (softirq_count() == cnt) 110 if (softirq_count() == (cnt & SOFTIRQ_MASK))
111 trace_softirqs_off(ip); 111 trace_softirqs_off(ip);
112 raw_local_irq_restore(flags); 112 raw_local_irq_restore(flags);
113 113
114 if (preempt_count() == cnt) 114 if (preempt_count() == cnt)
115 trace_preempt_off(CALLER_ADDR0, get_parent_ip(CALLER_ADDR1)); 115 trace_preempt_off(CALLER_ADDR0, get_parent_ip(CALLER_ADDR1));
116} 116}
117#else /* !CONFIG_TRACE_IRQFLAGS */ 117EXPORT_SYMBOL(__local_bh_disable_ip);
118static inline void __local_bh_disable(unsigned long ip, unsigned int cnt)
119{
120 preempt_count_add(cnt);
121 barrier();
122}
123#endif /* CONFIG_TRACE_IRQFLAGS */ 118#endif /* CONFIG_TRACE_IRQFLAGS */
124 119
125void local_bh_disable(void)
126{
127 __local_bh_disable(_RET_IP_, SOFTIRQ_DISABLE_OFFSET);
128}
129
130EXPORT_SYMBOL(local_bh_disable);
131
132static void __local_bh_enable(unsigned int cnt) 120static void __local_bh_enable(unsigned int cnt)
133{ 121{
134 WARN_ON_ONCE(!irqs_disabled()); 122 WARN_ON_ONCE(!irqs_disabled());
135 123
136 if (softirq_count() == cnt) 124 if (softirq_count() == (cnt & SOFTIRQ_MASK))
137 trace_softirqs_on(_RET_IP_); 125 trace_softirqs_on(_RET_IP_);
138 preempt_count_sub(cnt); 126 preempt_count_sub(cnt);
139} 127}
@@ -151,7 +139,7 @@ void _local_bh_enable(void)
151 139
152EXPORT_SYMBOL(_local_bh_enable); 140EXPORT_SYMBOL(_local_bh_enable);
153 141
154static inline void _local_bh_enable_ip(unsigned long ip) 142void __local_bh_enable_ip(unsigned long ip, unsigned int cnt)
155{ 143{
156 WARN_ON_ONCE(in_irq() || irqs_disabled()); 144 WARN_ON_ONCE(in_irq() || irqs_disabled());
157#ifdef CONFIG_TRACE_IRQFLAGS 145#ifdef CONFIG_TRACE_IRQFLAGS
@@ -166,7 +154,7 @@ static inline void _local_bh_enable_ip(unsigned long ip)
166 * Keep preemption disabled until we are done with 154 * Keep preemption disabled until we are done with
167 * softirq processing: 155 * softirq processing:
168 */ 156 */
169 preempt_count_sub(SOFTIRQ_DISABLE_OFFSET - 1); 157 preempt_count_sub(cnt - 1);
170 158
171 if (unlikely(!in_interrupt() && local_softirq_pending())) { 159 if (unlikely(!in_interrupt() && local_softirq_pending())) {
172 /* 160 /*
@@ -182,18 +170,7 @@ static inline void _local_bh_enable_ip(unsigned long ip)
182#endif 170#endif
183 preempt_check_resched(); 171 preempt_check_resched();
184} 172}
185 173EXPORT_SYMBOL(__local_bh_enable_ip);
186void local_bh_enable(void)
187{
188 _local_bh_enable_ip(_RET_IP_);
189}
190EXPORT_SYMBOL(local_bh_enable);
191
192void local_bh_enable_ip(unsigned long ip)
193{
194 _local_bh_enable_ip(ip);
195}
196EXPORT_SYMBOL(local_bh_enable_ip);
197 174
198/* 175/*
199 * We restart softirq processing for at most MAX_SOFTIRQ_RESTART times, 176 * We restart softirq processing for at most MAX_SOFTIRQ_RESTART times,
@@ -211,14 +188,48 @@ EXPORT_SYMBOL(local_bh_enable_ip);
211#define MAX_SOFTIRQ_TIME msecs_to_jiffies(2) 188#define MAX_SOFTIRQ_TIME msecs_to_jiffies(2)
212#define MAX_SOFTIRQ_RESTART 10 189#define MAX_SOFTIRQ_RESTART 10
213 190
191#ifdef CONFIG_TRACE_IRQFLAGS
192/*
193 * When we run softirqs from irq_exit() and thus on the hardirq stack we need
194 * to keep the lockdep irq context tracking as tight as possible in order to
195 * not mis-qualify lock contexts and miss possible deadlocks.
196 */
197
198static inline bool lockdep_softirq_start(void)
199{
200 bool in_hardirq = false;
201
202 if (trace_hardirq_context(current)) {
203 in_hardirq = true;
204 trace_hardirq_exit();
205 }
206
207 lockdep_softirq_enter();
208
209 return in_hardirq;
210}
211
212static inline void lockdep_softirq_end(bool in_hardirq)
213{
214 lockdep_softirq_exit();
215
216 if (in_hardirq)
217 trace_hardirq_enter();
218}
219#else
220static inline bool lockdep_softirq_start(void) { return false; }
221static inline void lockdep_softirq_end(bool in_hardirq) { }
222#endif
223
214asmlinkage void __do_softirq(void) 224asmlinkage void __do_softirq(void)
215{ 225{
216 struct softirq_action *h;
217 __u32 pending;
218 unsigned long end = jiffies + MAX_SOFTIRQ_TIME; 226 unsigned long end = jiffies + MAX_SOFTIRQ_TIME;
219 int cpu;
220 unsigned long old_flags = current->flags; 227 unsigned long old_flags = current->flags;
221 int max_restart = MAX_SOFTIRQ_RESTART; 228 int max_restart = MAX_SOFTIRQ_RESTART;
229 struct softirq_action *h;
230 bool in_hardirq;
231 __u32 pending;
232 int cpu;
222 233
223 /* 234 /*
224 * Mask out PF_MEMALLOC as current task context is borrowed for the 235 * Mask out PF_MEMALLOC as current task context is borrowed for the
@@ -230,8 +241,8 @@ asmlinkage void __do_softirq(void)
230 pending = local_softirq_pending(); 241 pending = local_softirq_pending();
231 account_irq_enter_time(current); 242 account_irq_enter_time(current);
232 243
233 __local_bh_disable(_RET_IP_, SOFTIRQ_OFFSET); 244 __local_bh_disable_ip(_RET_IP_, SOFTIRQ_OFFSET);
234 lockdep_softirq_enter(); 245 in_hardirq = lockdep_softirq_start();
235 246
236 cpu = smp_processor_id(); 247 cpu = smp_processor_id();
237restart: 248restart:
@@ -278,16 +289,13 @@ restart:
278 wakeup_softirqd(); 289 wakeup_softirqd();
279 } 290 }
280 291
281 lockdep_softirq_exit(); 292 lockdep_softirq_end(in_hardirq);
282
283 account_irq_exit_time(current); 293 account_irq_exit_time(current);
284 __local_bh_enable(SOFTIRQ_OFFSET); 294 __local_bh_enable(SOFTIRQ_OFFSET);
285 WARN_ON_ONCE(in_interrupt()); 295 WARN_ON_ONCE(in_interrupt());
286 tsk_restore_flags(current, old_flags, PF_MEMALLOC); 296 tsk_restore_flags(current, old_flags, PF_MEMALLOC);
287} 297}
288 298
289
290
291asmlinkage void do_softirq(void) 299asmlinkage void do_softirq(void)
292{ 300{
293 __u32 pending; 301 __u32 pending;
@@ -311,8 +319,6 @@ asmlinkage void do_softirq(void)
311 */ 319 */
312void irq_enter(void) 320void irq_enter(void)
313{ 321{
314 int cpu = smp_processor_id();
315
316 rcu_irq_enter(); 322 rcu_irq_enter();
317 if (is_idle_task(current) && !in_interrupt()) { 323 if (is_idle_task(current) && !in_interrupt()) {
318 /* 324 /*
@@ -320,7 +326,7 @@ void irq_enter(void)
320 * here, as softirq will be serviced on return from interrupt. 326 * here, as softirq will be serviced on return from interrupt.
321 */ 327 */
322 local_bh_disable(); 328 local_bh_disable();
323 tick_check_idle(cpu); 329 tick_check_idle();
324 _local_bh_enable(); 330 _local_bh_enable();
325 } 331 }
326 332
@@ -375,13 +381,13 @@ void irq_exit(void)
375#endif 381#endif
376 382
377 account_irq_exit_time(current); 383 account_irq_exit_time(current);
378 trace_hardirq_exit();
379 preempt_count_sub(HARDIRQ_OFFSET); 384 preempt_count_sub(HARDIRQ_OFFSET);
380 if (!in_interrupt() && local_softirq_pending()) 385 if (!in_interrupt() && local_softirq_pending())
381 invoke_softirq(); 386 invoke_softirq();
382 387
383 tick_irq_exit(); 388 tick_irq_exit();
384 rcu_irq_exit(); 389 rcu_irq_exit();
390 trace_hardirq_exit(); /* must be last! */
385} 391}
386 392
387/* 393/*
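With the out-of-line local_bh_disable()/local_bh_enable() wrappers removed above and __local_bh_disable_ip()/__local_bh_enable_ip() exported instead, callers presumably reach these entry points through small inline helpers in a header. The following is a hedged sketch of what such wrappers could look like; the header placement and the use of _THIS_IP_ are assumptions, not something shown in this hunk.

/* Sketch of inline wrappers around the exported *_ip() entry points. */
static inline void local_bh_disable(void)
{
	__local_bh_disable_ip(_THIS_IP_, SOFTIRQ_DISABLE_OFFSET);
}

static inline void local_bh_enable_ip(unsigned long ip)
{
	__local_bh_enable_ip(ip, SOFTIRQ_DISABLE_OFFSET);
}

static inline void local_bh_enable(void)
{
	__local_bh_enable_ip(_THIS_IP_, SOFTIRQ_DISABLE_OFFSET);
}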
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 34a604726d0b..332cefcdb04b 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -95,8 +95,6 @@
95#if defined(CONFIG_SYSCTL) 95#if defined(CONFIG_SYSCTL)
96 96
97/* External variables not in a header file. */ 97/* External variables not in a header file. */
98extern int sysctl_overcommit_memory;
99extern int sysctl_overcommit_ratio;
100extern int max_threads; 98extern int max_threads;
101extern int suid_dumpable; 99extern int suid_dumpable;
102#ifdef CONFIG_COREDUMP 100#ifdef CONFIG_COREDUMP
@@ -385,13 +383,6 @@ static struct ctl_table kern_table[] = {
385 .proc_handler = proc_dointvec, 383 .proc_handler = proc_dointvec,
386 }, 384 },
387 { 385 {
388 .procname = "numa_balancing_settle_count",
389 .data = &sysctl_numa_balancing_settle_count,
390 .maxlen = sizeof(unsigned int),
391 .mode = 0644,
392 .proc_handler = proc_dointvec,
393 },
394 {
395 .procname = "numa_balancing_migrate_deferred", 386 .procname = "numa_balancing_migrate_deferred",
396 .data = &sysctl_numa_balancing_migrate_deferred, 387 .data = &sysctl_numa_balancing_migrate_deferred,
397 .maxlen = sizeof(unsigned int), 388 .maxlen = sizeof(unsigned int),
@@ -1128,7 +1119,14 @@ static struct ctl_table vm_table[] = {
1128 .data = &sysctl_overcommit_ratio, 1119 .data = &sysctl_overcommit_ratio,
1129 .maxlen = sizeof(sysctl_overcommit_ratio), 1120 .maxlen = sizeof(sysctl_overcommit_ratio),
1130 .mode = 0644, 1121 .mode = 0644,
1131 .proc_handler = proc_dointvec, 1122 .proc_handler = overcommit_ratio_handler,
1123 },
1124 {
1125 .procname = "overcommit_kbytes",
1126 .data = &sysctl_overcommit_kbytes,
1127 .maxlen = sizeof(sysctl_overcommit_kbytes),
1128 .mode = 0644,
1129 .proc_handler = overcommit_kbytes_handler,
1132 }, 1130 },
1133 { 1131 {
1134 .procname = "page-cluster", 1132 .procname = "page-cluster",
diff --git a/kernel/system_certificates.S b/kernel/system_certificates.S
index 4aef390671cb..3e9868d47535 100644
--- a/kernel/system_certificates.S
+++ b/kernel/system_certificates.S
@@ -3,8 +3,18 @@
3 3
4 __INITRODATA 4 __INITRODATA
5 5
6 .align 8
6 .globl VMLINUX_SYMBOL(system_certificate_list) 7 .globl VMLINUX_SYMBOL(system_certificate_list)
7VMLINUX_SYMBOL(system_certificate_list): 8VMLINUX_SYMBOL(system_certificate_list):
9__cert_list_start:
8 .incbin "kernel/x509_certificate_list" 10 .incbin "kernel/x509_certificate_list"
9 .globl VMLINUX_SYMBOL(system_certificate_list_end) 11__cert_list_end:
10VMLINUX_SYMBOL(system_certificate_list_end): 12
13 .align 8
14 .globl VMLINUX_SYMBOL(system_certificate_list_size)
15VMLINUX_SYMBOL(system_certificate_list_size):
16#ifdef CONFIG_64BIT
17 .quad __cert_list_end - __cert_list_start
18#else
19 .long __cert_list_end - __cert_list_start
20#endif
diff --git a/kernel/system_keyring.c b/kernel/system_keyring.c
index 564dd93430a2..52ebc70263f4 100644
--- a/kernel/system_keyring.c
+++ b/kernel/system_keyring.c
@@ -22,7 +22,7 @@ struct key *system_trusted_keyring;
22EXPORT_SYMBOL_GPL(system_trusted_keyring); 22EXPORT_SYMBOL_GPL(system_trusted_keyring);
23 23
24extern __initconst const u8 system_certificate_list[]; 24extern __initconst const u8 system_certificate_list[];
25extern __initconst const u8 system_certificate_list_end[]; 25extern __initconst const unsigned long system_certificate_list_size;
26 26
27/* 27/*
28 * Load the compiled-in keys 28 * Load the compiled-in keys
@@ -60,8 +60,8 @@ static __init int load_system_certificate_list(void)
60 60
61 pr_notice("Loading compiled-in X.509 certificates\n"); 61 pr_notice("Loading compiled-in X.509 certificates\n");
62 62
63 end = system_certificate_list_end;
64 p = system_certificate_list; 63 p = system_certificate_list;
64 end = p + system_certificate_list_size;
65 while (p < end) { 65 while (p < end) {
66 /* Each cert begins with an ASN.1 SEQUENCE tag and must be more 66 /* Each cert begins with an ASN.1 SEQUENCE tag and must be more
67 * than 256 bytes in size. 67 * than 256 bytes in size.
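The loader now walks the compiled-in blob from system_certificate_list for system_certificate_list_size bytes instead of stopping at an end label. As a hedged illustration of the step that walk depends on, the sketch below computes the length of one DER certificate; the two-byte length form (0x30 0x82 <hi> <lo>) is inferred from the ">256 bytes" comment above and is an assumption, not a quote of the loader.

/* Illustrative only: length of the DER certificate starting at @p, 0 on error. */
static size_t cert_blob_len(const u8 *p, const u8 *end)
{
	if (end - p < 4 || p[0] != 0x30 || p[1] != 0x82)
		return 0;				/* truncated or not SEQUENCE */

	return (((size_t)p[2] << 8) | p[3]) + 4;	/* 4-byte header + payload */
}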
diff --git a/kernel/time/sched_clock.c b/kernel/time/sched_clock.c
index 68b799375981..0abb36464281 100644
--- a/kernel/time/sched_clock.c
+++ b/kernel/time/sched_clock.c
@@ -74,7 +74,7 @@ unsigned long long notrace sched_clock(void)
74 return cd.epoch_ns; 74 return cd.epoch_ns;
75 75
76 do { 76 do {
77 seq = read_seqcount_begin(&cd.seq); 77 seq = raw_read_seqcount_begin(&cd.seq);
78 epoch_cyc = cd.epoch_cyc; 78 epoch_cyc = cd.epoch_cyc;
79 epoch_ns = cd.epoch_ns; 79 epoch_ns = cd.epoch_ns;
80 } while (read_seqcount_retry(&cd.seq, seq)); 80 } while (read_seqcount_retry(&cd.seq, seq));
@@ -99,10 +99,10 @@ static void notrace update_sched_clock(void)
99 cd.mult, cd.shift); 99 cd.mult, cd.shift);
100 100
101 raw_local_irq_save(flags); 101 raw_local_irq_save(flags);
102 write_seqcount_begin(&cd.seq); 102 raw_write_seqcount_begin(&cd.seq);
103 cd.epoch_ns = ns; 103 cd.epoch_ns = ns;
104 cd.epoch_cyc = cyc; 104 cd.epoch_cyc = cyc;
105 write_seqcount_end(&cd.seq); 105 raw_write_seqcount_end(&cd.seq);
106 raw_local_irq_restore(flags); 106 raw_local_irq_restore(flags);
107} 107}
108 108
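The switch to raw_read_seqcount_begin()/raw_write_seqcount_begin() keeps lockdep out of a path that may be entered from tracing and other contexts where lockdep must not run, while the reader keeps the usual retry loop. A minimal, hedged sketch of that read side, with the struct and field names mirroring this file but the helper itself made up:

/* Illustrative reader: spin until a consistent epoch snapshot is observed. */
static u64 read_epoch_ns(const struct clock_data *cd)
{
	unsigned seq;
	u64 ns;

	do {
		seq = raw_read_seqcount_begin(&cd->seq);	/* no lockdep hooks */
		ns = cd->epoch_ns;
	} while (read_seqcount_retry(&cd->seq, seq));

	return ns;
}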
diff --git a/kernel/time/tick-broadcast.c b/kernel/time/tick-broadcast.c
index 9532690daaa9..43780ab5e279 100644
--- a/kernel/time/tick-broadcast.c
+++ b/kernel/time/tick-broadcast.c
@@ -538,10 +538,10 @@ int tick_resume_broadcast_oneshot(struct clock_event_device *bc)
538 * Called from irq_enter() when idle was interrupted to reenable the 538 * Called from irq_enter() when idle was interrupted to reenable the
539 * per cpu device. 539 * per cpu device.
540 */ 540 */
541void tick_check_oneshot_broadcast(int cpu) 541void tick_check_oneshot_broadcast_this_cpu(void)
542{ 542{
543 if (cpumask_test_cpu(cpu, tick_broadcast_oneshot_mask)) { 543 if (cpumask_test_cpu(smp_processor_id(), tick_broadcast_oneshot_mask)) {
544 struct tick_device *td = &per_cpu(tick_cpu_device, cpu); 544 struct tick_device *td = &__get_cpu_var(tick_cpu_device);
545 545
546 /* 546 /*
547 * We might be in the middle of switching over from 547 * We might be in the middle of switching over from
diff --git a/kernel/time/tick-common.c b/kernel/time/tick-common.c
index 64522ecdfe0e..20b2fe37d105 100644
--- a/kernel/time/tick-common.c
+++ b/kernel/time/tick-common.c
@@ -33,6 +33,21 @@ DEFINE_PER_CPU(struct tick_device, tick_cpu_device);
33 */ 33 */
34ktime_t tick_next_period; 34ktime_t tick_next_period;
35ktime_t tick_period; 35ktime_t tick_period;
36
37/*
38 * tick_do_timer_cpu is a timer core internal variable which holds the CPU NR
39 * which is responsible for calling do_timer(), i.e. the timekeeping stuff. This
40 * variable has two functions:
41 *
42 * 1) Prevent a thundering herd issue of a gazillion CPUs trying to grab the
43 * timekeeping lock all at once. Only the CPU which is assigned to do the
44 * update is handling it.
45 *
46 * 2) Hand off the duty in the NOHZ idle case by setting the value to
47 * TICK_DO_TIMER_NONE, i.e. a non-existing CPU. So the next CPU which looks
48 * at it will take over and keep the timekeeping alive. The handover
49 * procedure also covers CPU hotplug.
50 */
36int tick_do_timer_cpu __read_mostly = TICK_DO_TIMER_BOOT; 51int tick_do_timer_cpu __read_mostly = TICK_DO_TIMER_BOOT;
37 52
38/* 53/*
@@ -70,6 +85,7 @@ static void tick_periodic(int cpu)
70 85
71 do_timer(1); 86 do_timer(1);
72 write_sequnlock(&jiffies_lock); 87 write_sequnlock(&jiffies_lock);
88 update_wall_time();
73 } 89 }
74 90
75 update_process_times(user_mode(get_irq_regs())); 91 update_process_times(user_mode(get_irq_regs()));
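The new comment documents the tick_do_timer_cpu hand-off, but the hand-off itself is not in this hunk. As a hedged sketch only, a CPU noticing the duty is unassigned might reclaim it roughly like this; the function name and call site are illustrative, not taken from the patch.

/* Illustrative sketch: reclaim the do_timer() duty if nobody owns it. */
static void sketch_take_do_timer_duty(int cpu, ktime_t now)
{
	if (unlikely(tick_do_timer_cpu == TICK_DO_TIMER_NONE))
		tick_do_timer_cpu = cpu;	/* previous owner went NOHZ idle */

	if (tick_do_timer_cpu == cpu)
		tick_do_update_jiffies64(now);	/* keep jiffies and wall time moving */
}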
diff --git a/kernel/time/tick-internal.h b/kernel/time/tick-internal.h
index 18e71f7fbc2a..8329669b51ec 100644
--- a/kernel/time/tick-internal.h
+++ b/kernel/time/tick-internal.h
@@ -51,7 +51,7 @@ extern void tick_broadcast_switch_to_oneshot(void);
51extern void tick_shutdown_broadcast_oneshot(unsigned int *cpup); 51extern void tick_shutdown_broadcast_oneshot(unsigned int *cpup);
52extern int tick_resume_broadcast_oneshot(struct clock_event_device *bc); 52extern int tick_resume_broadcast_oneshot(struct clock_event_device *bc);
53extern int tick_broadcast_oneshot_active(void); 53extern int tick_broadcast_oneshot_active(void);
54extern void tick_check_oneshot_broadcast(int cpu); 54extern void tick_check_oneshot_broadcast_this_cpu(void);
55bool tick_broadcast_oneshot_available(void); 55bool tick_broadcast_oneshot_available(void);
56# else /* BROADCAST */ 56# else /* BROADCAST */
57static inline void tick_broadcast_setup_oneshot(struct clock_event_device *bc) 57static inline void tick_broadcast_setup_oneshot(struct clock_event_device *bc)
@@ -62,7 +62,7 @@ static inline void tick_broadcast_oneshot_control(unsigned long reason) { }
62static inline void tick_broadcast_switch_to_oneshot(void) { } 62static inline void tick_broadcast_switch_to_oneshot(void) { }
63static inline void tick_shutdown_broadcast_oneshot(unsigned int *cpup) { } 63static inline void tick_shutdown_broadcast_oneshot(unsigned int *cpup) { }
64static inline int tick_broadcast_oneshot_active(void) { return 0; } 64static inline int tick_broadcast_oneshot_active(void) { return 0; }
65static inline void tick_check_oneshot_broadcast(int cpu) { } 65static inline void tick_check_oneshot_broadcast_this_cpu(void) { }
66static inline bool tick_broadcast_oneshot_available(void) { return true; } 66static inline bool tick_broadcast_oneshot_available(void) { return true; }
67# endif /* !BROADCAST */ 67# endif /* !BROADCAST */
68 68
@@ -155,3 +155,4 @@ static inline int tick_device_is_functional(struct clock_event_device *dev)
155#endif 155#endif
156 156
157extern void do_timer(unsigned long ticks); 157extern void do_timer(unsigned long ticks);
158extern void update_wall_time(void);
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index 3612fc77f834..08cb0c3b8ccb 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -86,6 +86,7 @@ static void tick_do_update_jiffies64(ktime_t now)
86 tick_next_period = ktime_add(last_jiffies_update, tick_period); 86 tick_next_period = ktime_add(last_jiffies_update, tick_period);
87 } 87 }
88 write_sequnlock(&jiffies_lock); 88 write_sequnlock(&jiffies_lock);
89 update_wall_time();
89} 90}
90 91
91/* 92/*
@@ -177,7 +178,7 @@ static bool can_stop_full_tick(void)
177 * TODO: kick full dynticks CPUs when 178 * TODO: kick full dynticks CPUs when
178 * sched_clock_stable is set. 179 * sched_clock_stable is set.
179 */ 180 */
180 if (!sched_clock_stable) { 181 if (!sched_clock_stable()) {
181 trace_tick_stop(0, "unstable sched clock\n"); 182 trace_tick_stop(0, "unstable sched clock\n");
182 /* 183 /*
183 * Don't allow the user to think they can get 184 * Don't allow the user to think they can get
@@ -361,8 +362,8 @@ void __init tick_nohz_init(void)
361/* 362/*
362 * NO HZ enabled ? 363 * NO HZ enabled ?
363 */ 364 */
364int tick_nohz_enabled __read_mostly = 1; 365static int tick_nohz_enabled __read_mostly = 1;
365 366int tick_nohz_active __read_mostly;
366/* 367/*
367 * Enable / Disable tickless mode 368 * Enable / Disable tickless mode
368 */ 369 */
@@ -391,11 +392,9 @@ __setup("nohz=", setup_tick_nohz);
391 */ 392 */
392static void tick_nohz_update_jiffies(ktime_t now) 393static void tick_nohz_update_jiffies(ktime_t now)
393{ 394{
394 int cpu = smp_processor_id();
395 struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu);
396 unsigned long flags; 395 unsigned long flags;
397 396
398 ts->idle_waketime = now; 397 __this_cpu_write(tick_cpu_sched.idle_waketime, now);
399 398
400 local_irq_save(flags); 399 local_irq_save(flags);
401 tick_do_update_jiffies64(now); 400 tick_do_update_jiffies64(now);
@@ -426,17 +425,15 @@ update_ts_time_stats(int cpu, struct tick_sched *ts, ktime_t now, u64 *last_upda
426 425
427} 426}
428 427
429static void tick_nohz_stop_idle(int cpu, ktime_t now) 428static void tick_nohz_stop_idle(struct tick_sched *ts, ktime_t now)
430{ 429{
431 struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu); 430 update_ts_time_stats(smp_processor_id(), ts, now, NULL);
432
433 update_ts_time_stats(cpu, ts, now, NULL);
434 ts->idle_active = 0; 431 ts->idle_active = 0;
435 432
436 sched_clock_idle_wakeup_event(0); 433 sched_clock_idle_wakeup_event(0);
437} 434}
438 435
439static ktime_t tick_nohz_start_idle(int cpu, struct tick_sched *ts) 436static ktime_t tick_nohz_start_idle(struct tick_sched *ts)
440{ 437{
441 ktime_t now = ktime_get(); 438 ktime_t now = ktime_get();
442 439
@@ -465,7 +462,7 @@ u64 get_cpu_idle_time_us(int cpu, u64 *last_update_time)
465 struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu); 462 struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu);
466 ktime_t now, idle; 463 ktime_t now, idle;
467 464
468 if (!tick_nohz_enabled) 465 if (!tick_nohz_active)
469 return -1; 466 return -1;
470 467
471 now = ktime_get(); 468 now = ktime_get();
@@ -506,7 +503,7 @@ u64 get_cpu_iowait_time_us(int cpu, u64 *last_update_time)
506 struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu); 503 struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu);
507 ktime_t now, iowait; 504 ktime_t now, iowait;
508 505
509 if (!tick_nohz_enabled) 506 if (!tick_nohz_active)
510 return -1; 507 return -1;
511 508
512 now = ktime_get(); 509 now = ktime_get();
@@ -711,8 +708,10 @@ static bool can_stop_idle_tick(int cpu, struct tick_sched *ts)
711 return false; 708 return false;
712 } 709 }
713 710
714 if (unlikely(ts->nohz_mode == NOHZ_MODE_INACTIVE)) 711 if (unlikely(ts->nohz_mode == NOHZ_MODE_INACTIVE)) {
712 ts->sleep_length = (ktime_t) { .tv64 = NSEC_PER_SEC/HZ };
715 return false; 713 return false;
714 }
716 715
717 if (need_resched()) 716 if (need_resched())
718 return false; 717 return false;
@@ -752,7 +751,7 @@ static void __tick_nohz_idle_enter(struct tick_sched *ts)
752 ktime_t now, expires; 751 ktime_t now, expires;
753 int cpu = smp_processor_id(); 752 int cpu = smp_processor_id();
754 753
755 now = tick_nohz_start_idle(cpu, ts); 754 now = tick_nohz_start_idle(ts);
756 755
757 if (can_stop_idle_tick(cpu, ts)) { 756 if (can_stop_idle_tick(cpu, ts)) {
758 int was_stopped = ts->tick_stopped; 757 int was_stopped = ts->tick_stopped;
@@ -799,11 +798,6 @@ void tick_nohz_idle_enter(void)
799 local_irq_disable(); 798 local_irq_disable();
800 799
801 ts = &__get_cpu_var(tick_cpu_sched); 800 ts = &__get_cpu_var(tick_cpu_sched);
802 /*
803 * set ts->inidle unconditionally. even if the system did not
804 * switch to nohz mode the cpu frequency governers rely on the
805 * update of the idle time accounting in tick_nohz_start_idle().
806 */
807 ts->inidle = 1; 801 ts->inidle = 1;
808 __tick_nohz_idle_enter(ts); 802 __tick_nohz_idle_enter(ts);
809 803
@@ -914,8 +908,7 @@ static void tick_nohz_account_idle_ticks(struct tick_sched *ts)
914 */ 908 */
915void tick_nohz_idle_exit(void) 909void tick_nohz_idle_exit(void)
916{ 910{
917 int cpu = smp_processor_id(); 911 struct tick_sched *ts = &__get_cpu_var(tick_cpu_sched);
918 struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu);
919 ktime_t now; 912 ktime_t now;
920 913
921 local_irq_disable(); 914 local_irq_disable();
@@ -928,7 +921,7 @@ void tick_nohz_idle_exit(void)
928 now = ktime_get(); 921 now = ktime_get();
929 922
930 if (ts->idle_active) 923 if (ts->idle_active)
931 tick_nohz_stop_idle(cpu, now); 924 tick_nohz_stop_idle(ts, now);
932 925
933 if (ts->tick_stopped) { 926 if (ts->tick_stopped) {
934 tick_nohz_restart_sched_tick(ts, now); 927 tick_nohz_restart_sched_tick(ts, now);
@@ -973,7 +966,7 @@ static void tick_nohz_switch_to_nohz(void)
973 struct tick_sched *ts = &__get_cpu_var(tick_cpu_sched); 966 struct tick_sched *ts = &__get_cpu_var(tick_cpu_sched);
974 ktime_t next; 967 ktime_t next;
975 968
976 if (!tick_nohz_enabled) 969 if (!tick_nohz_active)
977 return; 970 return;
978 971
979 local_irq_disable(); 972 local_irq_disable();
@@ -981,7 +974,7 @@ static void tick_nohz_switch_to_nohz(void)
981 local_irq_enable(); 974 local_irq_enable();
982 return; 975 return;
983 } 976 }
984 977 tick_nohz_active = 1;
985 ts->nohz_mode = NOHZ_MODE_LOWRES; 978 ts->nohz_mode = NOHZ_MODE_LOWRES;
986 979
987 /* 980 /*
@@ -1012,12 +1005,10 @@ static void tick_nohz_switch_to_nohz(void)
1012 * timer and do not touch the other magic bits which need to be done 1005 * timer and do not touch the other magic bits which need to be done
1013 * when idle is left. 1006 * when idle is left.
1014 */ 1007 */
1015static void tick_nohz_kick_tick(int cpu, ktime_t now) 1008static void tick_nohz_kick_tick(struct tick_sched *ts, ktime_t now)
1016{ 1009{
1017#if 0 1010#if 0
1018 /* Switch back to 2.6.27 behaviour */ 1011 /* Switch back to 2.6.27 behaviour */
1019
1020 struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu);
1021 ktime_t delta; 1012 ktime_t delta;
1022 1013
1023 /* 1014 /*
@@ -1032,36 +1023,36 @@ static void tick_nohz_kick_tick(int cpu, ktime_t now)
1032#endif 1023#endif
1033} 1024}
1034 1025
1035static inline void tick_check_nohz(int cpu) 1026static inline void tick_check_nohz_this_cpu(void)
1036{ 1027{
1037 struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu); 1028 struct tick_sched *ts = &__get_cpu_var(tick_cpu_sched);
1038 ktime_t now; 1029 ktime_t now;
1039 1030
1040 if (!ts->idle_active && !ts->tick_stopped) 1031 if (!ts->idle_active && !ts->tick_stopped)
1041 return; 1032 return;
1042 now = ktime_get(); 1033 now = ktime_get();
1043 if (ts->idle_active) 1034 if (ts->idle_active)
1044 tick_nohz_stop_idle(cpu, now); 1035 tick_nohz_stop_idle(ts, now);
1045 if (ts->tick_stopped) { 1036 if (ts->tick_stopped) {
1046 tick_nohz_update_jiffies(now); 1037 tick_nohz_update_jiffies(now);
1047 tick_nohz_kick_tick(cpu, now); 1038 tick_nohz_kick_tick(ts, now);
1048 } 1039 }
1049} 1040}
1050 1041
1051#else 1042#else
1052 1043
1053static inline void tick_nohz_switch_to_nohz(void) { } 1044static inline void tick_nohz_switch_to_nohz(void) { }
1054static inline void tick_check_nohz(int cpu) { } 1045static inline void tick_check_nohz_this_cpu(void) { }
1055 1046
1056#endif /* CONFIG_NO_HZ_COMMON */ 1047#endif /* CONFIG_NO_HZ_COMMON */
1057 1048
1058/* 1049/*
1059 * Called from irq_enter to notify about the possible interruption of idle() 1050 * Called from irq_enter to notify about the possible interruption of idle()
1060 */ 1051 */
1061void tick_check_idle(int cpu) 1052void tick_check_idle(void)
1062{ 1053{
1063 tick_check_oneshot_broadcast(cpu); 1054 tick_check_oneshot_broadcast_this_cpu();
1064 tick_check_nohz(cpu); 1055 tick_check_nohz_this_cpu();
1065} 1056}
1066 1057
1067/* 1058/*
@@ -1139,8 +1130,10 @@ void tick_setup_sched_timer(void)
1139 } 1130 }
1140 1131
1141#ifdef CONFIG_NO_HZ_COMMON 1132#ifdef CONFIG_NO_HZ_COMMON
1142 if (tick_nohz_enabled) 1133 if (tick_nohz_enabled) {
1143 ts->nohz_mode = NOHZ_MODE_HIGHRES; 1134 ts->nohz_mode = NOHZ_MODE_HIGHRES;
1135 tick_nohz_active = 1;
1136 }
1144#endif 1137#endif
1145} 1138}
1146#endif /* HIGH_RES_TIMERS */ 1139#endif /* HIGH_RES_TIMERS */
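Several hunks above replace per_cpu(tick_cpu_sched, smp_processor_id()) lookups with __get_cpu_var()/__this_cpu_write(), reaching the current CPU's state without recomputing the CPU id. A small, hedged illustration of the two equivalent forms; the per-cpu variable here is a made-up copy, not the real tick_cpu_sched.

DEFINE_PER_CPU(struct tick_sched, demo_tick_cpu_sched);	/* illustrative only */

/* Caller must have preemption disabled, as in the interrupt paths above. */
static void demo_mark_waketime(ktime_t now)
{
	/* Old form: compute the CPU id, then index the per-cpu area. */
	per_cpu(demo_tick_cpu_sched, smp_processor_id()).idle_waketime = now;

	/* New form: address this CPU's instance directly. */
	__this_cpu_write(demo_tick_cpu_sched.idle_waketime, now);
}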
diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c
index 3abf53418b67..0aa4ce81bc16 100644
--- a/kernel/time/timekeeping.c
+++ b/kernel/time/timekeeping.c
@@ -77,7 +77,7 @@ static void tk_set_wall_to_mono(struct timekeeper *tk, struct timespec wtm)
77 tk->wall_to_monotonic = wtm; 77 tk->wall_to_monotonic = wtm;
78 set_normalized_timespec(&tmp, -wtm.tv_sec, -wtm.tv_nsec); 78 set_normalized_timespec(&tmp, -wtm.tv_sec, -wtm.tv_nsec);
79 tk->offs_real = timespec_to_ktime(tmp); 79 tk->offs_real = timespec_to_ktime(tmp);
80 tk->offs_tai = ktime_sub(tk->offs_real, ktime_set(tk->tai_offset, 0)); 80 tk->offs_tai = ktime_add(tk->offs_real, ktime_set(tk->tai_offset, 0));
81} 81}
82 82
83static void tk_set_sleep_time(struct timekeeper *tk, struct timespec t) 83static void tk_set_sleep_time(struct timekeeper *tk, struct timespec t)
@@ -90,8 +90,9 @@ static void tk_set_sleep_time(struct timekeeper *tk, struct timespec t)
90} 90}
91 91
92/** 92/**
93 * timekeeper_setup_internals - Set up internals to use clocksource clock. 93 * tk_setup_internals - Set up internals to use clocksource clock.
94 * 94 *
95 * @tk: The target timekeeper to setup.
95 * @clock: Pointer to clocksource. 96 * @clock: Pointer to clocksource.
96 * 97 *
97 * Calculates a fixed cycle/nsec interval for a given clocksource/adjustment 98 * Calculates a fixed cycle/nsec interval for a given clocksource/adjustment
@@ -595,7 +596,7 @@ s32 timekeeping_get_tai_offset(void)
595static void __timekeeping_set_tai_offset(struct timekeeper *tk, s32 tai_offset) 596static void __timekeeping_set_tai_offset(struct timekeeper *tk, s32 tai_offset)
596{ 597{
597 tk->tai_offset = tai_offset; 598 tk->tai_offset = tai_offset;
598 tk->offs_tai = ktime_sub(tk->offs_real, ktime_set(tai_offset, 0)); 599 tk->offs_tai = ktime_add(tk->offs_real, ktime_set(tai_offset, 0));
599} 600}
600 601
601/** 602/**
@@ -610,6 +611,7 @@ void timekeeping_set_tai_offset(s32 tai_offset)
610 raw_spin_lock_irqsave(&timekeeper_lock, flags); 611 raw_spin_lock_irqsave(&timekeeper_lock, flags);
611 write_seqcount_begin(&timekeeper_seq); 612 write_seqcount_begin(&timekeeper_seq);
612 __timekeeping_set_tai_offset(tk, tai_offset); 613 __timekeeping_set_tai_offset(tk, tai_offset);
614 timekeeping_update(tk, TK_MIRROR | TK_CLOCK_WAS_SET);
613 write_seqcount_end(&timekeeper_seq); 615 write_seqcount_end(&timekeeper_seq);
614 raw_spin_unlock_irqrestore(&timekeeper_lock, flags); 616 raw_spin_unlock_irqrestore(&timekeeper_lock, flags);
615 clock_was_set(); 617 clock_was_set();
@@ -1023,6 +1025,8 @@ static int timekeeping_suspend(void)
1023 timekeeping_suspend_time = 1025 timekeeping_suspend_time =
1024 timespec_add(timekeeping_suspend_time, delta_delta); 1026 timespec_add(timekeeping_suspend_time, delta_delta);
1025 } 1027 }
1028
1029 timekeeping_update(tk, TK_MIRROR);
1026 write_seqcount_end(&timekeeper_seq); 1030 write_seqcount_end(&timekeeper_seq);
1027 raw_spin_unlock_irqrestore(&timekeeper_lock, flags); 1031 raw_spin_unlock_irqrestore(&timekeeper_lock, flags);
1028 1032
@@ -1130,16 +1134,6 @@ static void timekeeping_adjust(struct timekeeper *tk, s64 offset)
1130 * we can adjust by 1. 1134 * we can adjust by 1.
1131 */ 1135 */
1132 error >>= 2; 1136 error >>= 2;
1133 /*
1134 * XXX - In update_wall_time, we round up to the next
1135 * nanosecond, and store the amount rounded up into
1136 * the error. This causes the likely below to be unlikely.
1137 *
1138 * The proper fix is to avoid rounding up by using
1139 * the high precision tk->xtime_nsec instead of
1140 * xtime.tv_nsec everywhere. Fixing this will take some
1141 * time.
1142 */
1143 if (likely(error <= interval)) 1137 if (likely(error <= interval))
1144 adj = 1; 1138 adj = 1;
1145 else 1139 else
@@ -1255,7 +1249,7 @@ out_adjust:
1255static inline unsigned int accumulate_nsecs_to_secs(struct timekeeper *tk) 1249static inline unsigned int accumulate_nsecs_to_secs(struct timekeeper *tk)
1256{ 1250{
1257 u64 nsecps = (u64)NSEC_PER_SEC << tk->shift; 1251 u64 nsecps = (u64)NSEC_PER_SEC << tk->shift;
1258 unsigned int action = 0; 1252 unsigned int clock_set = 0;
1259 1253
1260 while (tk->xtime_nsec >= nsecps) { 1254 while (tk->xtime_nsec >= nsecps) {
1261 int leap; 1255 int leap;
@@ -1277,11 +1271,10 @@ static inline unsigned int accumulate_nsecs_to_secs(struct timekeeper *tk)
1277 1271
1278 __timekeeping_set_tai_offset(tk, tk->tai_offset - leap); 1272 __timekeeping_set_tai_offset(tk, tk->tai_offset - leap);
1279 1273
1280 clock_was_set_delayed(); 1274 clock_set = TK_CLOCK_WAS_SET;
1281 action = TK_CLOCK_WAS_SET;
1282 } 1275 }
1283 } 1276 }
1284 return action; 1277 return clock_set;
1285} 1278}
1286 1279
1287/** 1280/**
@@ -1294,7 +1287,8 @@ static inline unsigned int accumulate_nsecs_to_secs(struct timekeeper *tk)
1294 * Returns the unconsumed cycles. 1287 * Returns the unconsumed cycles.
1295 */ 1288 */
1296static cycle_t logarithmic_accumulation(struct timekeeper *tk, cycle_t offset, 1289static cycle_t logarithmic_accumulation(struct timekeeper *tk, cycle_t offset,
1297 u32 shift) 1290 u32 shift,
1291 unsigned int *clock_set)
1298{ 1292{
1299 cycle_t interval = tk->cycle_interval << shift; 1293 cycle_t interval = tk->cycle_interval << shift;
1300 u64 raw_nsecs; 1294 u64 raw_nsecs;
@@ -1308,7 +1302,7 @@ static cycle_t logarithmic_accumulation(struct timekeeper *tk, cycle_t offset,
1308 tk->cycle_last += interval; 1302 tk->cycle_last += interval;
1309 1303
1310 tk->xtime_nsec += tk->xtime_interval << shift; 1304 tk->xtime_nsec += tk->xtime_interval << shift;
1311 accumulate_nsecs_to_secs(tk); 1305 *clock_set |= accumulate_nsecs_to_secs(tk);
1312 1306
1313 /* Accumulate raw time */ 1307 /* Accumulate raw time */
1314 raw_nsecs = (u64)tk->raw_interval << shift; 1308 raw_nsecs = (u64)tk->raw_interval << shift;
@@ -1347,7 +1341,7 @@ static inline void old_vsyscall_fixup(struct timekeeper *tk)
1347 tk->xtime_nsec -= remainder; 1341 tk->xtime_nsec -= remainder;
1348 tk->xtime_nsec += 1ULL << tk->shift; 1342 tk->xtime_nsec += 1ULL << tk->shift;
1349 tk->ntp_error += remainder << tk->ntp_error_shift; 1343 tk->ntp_error += remainder << tk->ntp_error_shift;
1350 1344 tk->ntp_error -= (1ULL << tk->shift) << tk->ntp_error_shift;
1351} 1345}
1352#else 1346#else
1353#define old_vsyscall_fixup(tk) 1347#define old_vsyscall_fixup(tk)
@@ -1359,14 +1353,14 @@ static inline void old_vsyscall_fixup(struct timekeeper *tk)
1359 * update_wall_time - Uses the current clocksource to increment the wall time 1353 * update_wall_time - Uses the current clocksource to increment the wall time
1360 * 1354 *
1361 */ 1355 */
1362static void update_wall_time(void) 1356void update_wall_time(void)
1363{ 1357{
1364 struct clocksource *clock; 1358 struct clocksource *clock;
1365 struct timekeeper *real_tk = &timekeeper; 1359 struct timekeeper *real_tk = &timekeeper;
1366 struct timekeeper *tk = &shadow_timekeeper; 1360 struct timekeeper *tk = &shadow_timekeeper;
1367 cycle_t offset; 1361 cycle_t offset;
1368 int shift = 0, maxshift; 1362 int shift = 0, maxshift;
1369 unsigned int action; 1363 unsigned int clock_set = 0;
1370 unsigned long flags; 1364 unsigned long flags;
1371 1365
1372 raw_spin_lock_irqsave(&timekeeper_lock, flags); 1366 raw_spin_lock_irqsave(&timekeeper_lock, flags);
@@ -1401,7 +1395,8 @@ static void update_wall_time(void)
1401 maxshift = (64 - (ilog2(ntp_tick_length())+1)) - 1; 1395 maxshift = (64 - (ilog2(ntp_tick_length())+1)) - 1;
1402 shift = min(shift, maxshift); 1396 shift = min(shift, maxshift);
1403 while (offset >= tk->cycle_interval) { 1397 while (offset >= tk->cycle_interval) {
1404 offset = logarithmic_accumulation(tk, offset, shift); 1398 offset = logarithmic_accumulation(tk, offset, shift,
1399 &clock_set);
1405 if (offset < tk->cycle_interval<<shift) 1400 if (offset < tk->cycle_interval<<shift)
1406 shift--; 1401 shift--;
1407 } 1402 }
@@ -1419,7 +1414,7 @@ static void update_wall_time(void)
1419 * Finally, make sure that after the rounding 1414 * Finally, make sure that after the rounding
1420 * xtime_nsec isn't larger than NSEC_PER_SEC 1415 * xtime_nsec isn't larger than NSEC_PER_SEC
1421 */ 1416 */
1422 action = accumulate_nsecs_to_secs(tk); 1417 clock_set |= accumulate_nsecs_to_secs(tk);
1423 1418
1424 write_seqcount_begin(&timekeeper_seq); 1419 write_seqcount_begin(&timekeeper_seq);
1425 /* Update clock->cycle_last with the new value */ 1420 /* Update clock->cycle_last with the new value */
@@ -1435,10 +1430,12 @@ static void update_wall_time(void)
1435 * updating. 1430 * updating.
1436 */ 1431 */
1437 memcpy(real_tk, tk, sizeof(*tk)); 1432 memcpy(real_tk, tk, sizeof(*tk));
1438 timekeeping_update(real_tk, action); 1433 timekeeping_update(real_tk, clock_set);
1439 write_seqcount_end(&timekeeper_seq); 1434 write_seqcount_end(&timekeeper_seq);
1440out: 1435out:
1441 raw_spin_unlock_irqrestore(&timekeeper_lock, flags); 1436 raw_spin_unlock_irqrestore(&timekeeper_lock, flags);
1437 if (clock_set)
1438 clock_was_set();
1442} 1439}
1443 1440
1444/** 1441/**
@@ -1583,7 +1580,6 @@ struct timespec get_monotonic_coarse(void)
1583void do_timer(unsigned long ticks) 1580void do_timer(unsigned long ticks)
1584{ 1581{
1585 jiffies_64 += ticks; 1582 jiffies_64 += ticks;
1586 update_wall_time();
1587 calc_global_load(ticks); 1583 calc_global_load(ticks);
1588} 1584}
1589 1585
@@ -1698,12 +1694,14 @@ int do_adjtimex(struct timex *txc)
1698 1694
1699 if (tai != orig_tai) { 1695 if (tai != orig_tai) {
1700 __timekeeping_set_tai_offset(tk, tai); 1696 __timekeeping_set_tai_offset(tk, tai);
1701 update_pvclock_gtod(tk, true); 1697 timekeeping_update(tk, TK_MIRROR | TK_CLOCK_WAS_SET);
1702 clock_was_set_delayed();
1703 } 1698 }
1704 write_seqcount_end(&timekeeper_seq); 1699 write_seqcount_end(&timekeeper_seq);
1705 raw_spin_unlock_irqrestore(&timekeeper_lock, flags); 1700 raw_spin_unlock_irqrestore(&timekeeper_lock, flags);
1706 1701
1702 if (tai != orig_tai)
1703 clock_was_set();
1704
1707 ntp_notify_cmos_timer(); 1705 ntp_notify_cmos_timer();
1708 1706
1709 return ret; 1707 return ret;
@@ -1739,4 +1737,5 @@ void xtime_update(unsigned long ticks)
1739 write_seqlock(&jiffies_lock); 1737 write_seqlock(&jiffies_lock);
1740 do_timer(ticks); 1738 do_timer(ticks);
1741 write_sequnlock(&jiffies_lock); 1739 write_sequnlock(&jiffies_lock);
1740 update_wall_time();
1742} 1741}
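Taken together, the timekeeping hunks converge on one pattern: accumulate TK_CLOCK_WAS_SET indications into a local clock_set flag while timekeeper_lock and the write seqcount are held, and only call clock_was_set() once both are dropped. A condensed, hedged sketch of that shape, abbreviated and not a drop-in replacement for update_wall_time():

/* Sketch: collect the flag under the locks, notify after releasing them. */
static void sketch_update_and_notify(void)
{
	unsigned int clock_set = 0;
	unsigned long flags;

	raw_spin_lock_irqsave(&timekeeper_lock, flags);
	write_seqcount_begin(&timekeeper_seq);

	clock_set |= accumulate_nsecs_to_secs(&shadow_timekeeper);

	write_seqcount_end(&timekeeper_seq);
	raw_spin_unlock_irqrestore(&timekeeper_lock, flags);

	if (clock_set)
		clock_was_set();	/* must run outside the write seqcount */
}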
diff --git a/kernel/timer.c b/kernel/timer.c
index 6582b82fa966..accfd241b9e5 100644
--- a/kernel/timer.c
+++ b/kernel/timer.c
@@ -1518,9 +1518,8 @@ static int init_timers_cpu(int cpu)
1518 /* 1518 /*
1519 * The APs use this path later in boot 1519 * The APs use this path later in boot
1520 */ 1520 */
1521 base = kmalloc_node(sizeof(*base), 1521 base = kzalloc_node(sizeof(*base), GFP_KERNEL,
1522 GFP_KERNEL | __GFP_ZERO, 1522 cpu_to_node(cpu));
1523 cpu_to_node(cpu));
1524 if (!base) 1523 if (!base)
1525 return -ENOMEM; 1524 return -ENOMEM;
1526 1525
diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile
index d7e2068e4b71..1378e84fbe39 100644
--- a/kernel/trace/Makefile
+++ b/kernel/trace/Makefile
@@ -50,6 +50,7 @@ ifeq ($(CONFIG_PERF_EVENTS),y)
50obj-$(CONFIG_EVENT_TRACING) += trace_event_perf.o 50obj-$(CONFIG_EVENT_TRACING) += trace_event_perf.o
51endif 51endif
52obj-$(CONFIG_EVENT_TRACING) += trace_events_filter.o 52obj-$(CONFIG_EVENT_TRACING) += trace_events_filter.o
53obj-$(CONFIG_EVENT_TRACING) += trace_events_trigger.o
53obj-$(CONFIG_KPROBE_EVENT) += trace_kprobe.o 54obj-$(CONFIG_KPROBE_EVENT) += trace_kprobe.o
54obj-$(CONFIG_TRACEPOINTS) += power-traces.o 55obj-$(CONFIG_TRACEPOINTS) += power-traces.o
55ifeq ($(CONFIG_PM_RUNTIME),y) 56ifeq ($(CONFIG_PM_RUNTIME),y)
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index 22fa55696760..cd7f76d1eb86 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -85,6 +85,8 @@ int function_trace_stop __read_mostly;
85 85
86/* Current function tracing op */ 86/* Current function tracing op */
87struct ftrace_ops *function_trace_op __read_mostly = &ftrace_list_end; 87struct ftrace_ops *function_trace_op __read_mostly = &ftrace_list_end;
88/* What to set function_trace_op to */
89static struct ftrace_ops *set_function_trace_op;
88 90
89/* List for set_ftrace_pid's pids. */ 91/* List for set_ftrace_pid's pids. */
90LIST_HEAD(ftrace_pids); 92LIST_HEAD(ftrace_pids);
@@ -278,6 +280,29 @@ static void update_global_ops(void)
278 global_ops.func = func; 280 global_ops.func = func;
279} 281}
280 282
283static void ftrace_sync(struct work_struct *work)
284{
285 /*
286 * This function is just a stub to implement a hard force
287 * of synchronize_sched(). This requires synchronizing
288 * tasks even in userspace and idle.
289 *
290 * Yes, function tracing is rude.
291 */
292}
293
294static void ftrace_sync_ipi(void *data)
295{
296 /* Probably not needed, but do it anyway */
297 smp_rmb();
298}
299
300#ifdef CONFIG_FUNCTION_GRAPH_TRACER
301static void update_function_graph_func(void);
302#else
303static inline void update_function_graph_func(void) { }
304#endif
305
281static void update_ftrace_function(void) 306static void update_ftrace_function(void)
282{ 307{
283 ftrace_func_t func; 308 ftrace_func_t func;
@@ -296,16 +321,61 @@ static void update_ftrace_function(void)
296 !FTRACE_FORCE_LIST_FUNC)) { 321 !FTRACE_FORCE_LIST_FUNC)) {
297 /* Set the ftrace_ops that the arch callback uses */ 322 /* Set the ftrace_ops that the arch callback uses */
298 if (ftrace_ops_list == &global_ops) 323 if (ftrace_ops_list == &global_ops)
299 function_trace_op = ftrace_global_list; 324 set_function_trace_op = ftrace_global_list;
300 else 325 else
301 function_trace_op = ftrace_ops_list; 326 set_function_trace_op = ftrace_ops_list;
302 func = ftrace_ops_list->func; 327 func = ftrace_ops_list->func;
303 } else { 328 } else {
304 /* Just use the default ftrace_ops */ 329 /* Just use the default ftrace_ops */
305 function_trace_op = &ftrace_list_end; 330 set_function_trace_op = &ftrace_list_end;
306 func = ftrace_ops_list_func; 331 func = ftrace_ops_list_func;
307 } 332 }
308 333
334 /* If there's no change, then do nothing more here */
335 if (ftrace_trace_function == func)
336 return;
337
338 update_function_graph_func();
339
340 /*
341 * If we are using the list function, it doesn't care
342 * about the function_trace_ops.
343 */
344 if (func == ftrace_ops_list_func) {
345 ftrace_trace_function = func;
346 /*
347 * Don't even bother setting function_trace_ops,
348 * it would be racy to do so anyway.
349 */
350 return;
351 }
352
353#ifndef CONFIG_DYNAMIC_FTRACE
354 /*
355 * For static tracing, we need to be a bit more careful.
356 * The function change takes effect immediately. Thus,
357 * we need to coordinate the setting of the function_trace_ops
358 * with the setting of the ftrace_trace_function.
359 *
360 * Set the function to the list ops, which will call the
361 * function we want, albeit indirectly, but it handles the
362 * ftrace_ops and doesn't depend on function_trace_op.
363 */
364 ftrace_trace_function = ftrace_ops_list_func;
365 /*
366 * Make sure all CPUs see this. Yes this is slow, but static
367 * tracing is slow and nasty to have enabled.
368 */
369 schedule_on_each_cpu(ftrace_sync);
370 /* Now all cpus are using the list ops. */
371 function_trace_op = set_function_trace_op;
372 /* Make sure the function_trace_op is visible on all CPUs */
373 smp_wmb();
374 /* Nasty way to force a rmb on all cpus */
375 smp_call_function(ftrace_sync_ipi, NULL, 1);
376 /* OK, we are all set to update the ftrace_trace_function now! */
377#endif /* !CONFIG_DYNAMIC_FTRACE */
378
309 ftrace_trace_function = func; 379 ftrace_trace_function = func;
310} 380}
311 381
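The #ifndef CONFIG_DYNAMIC_FTRACE branch above publishes the new function_trace_op with an smp_wmb() and then forces every CPU through an IPI so the store is globally visible before the final function pointer flip. Stripped of the ftrace specifics, the hedged generic shape of that publish-then-kick step looks like this; the helper name and the void-pointer slot are illustrative.

/* Generic sketch of the publish-then-kick ordering used above. */
static void publish_pointer(void **slot, void *new_val)
{
	*slot = new_val;	/* 1. store the new pointer              */
	smp_wmb();		/* 2. order the store before the kick    */

	/* 3. IPI every other CPU; the interrupt acts as a barrier there.
	 *    Must not be called with interrupts disabled. */
	smp_call_function(ftrace_sync_ipi, NULL, 1);
}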
@@ -367,9 +437,6 @@ static int remove_ftrace_list_ops(struct ftrace_ops **list,
367 437
368static int __register_ftrace_function(struct ftrace_ops *ops) 438static int __register_ftrace_function(struct ftrace_ops *ops)
369{ 439{
370 if (unlikely(ftrace_disabled))
371 return -ENODEV;
372
373 if (FTRACE_WARN_ON(ops == &global_ops)) 440 if (FTRACE_WARN_ON(ops == &global_ops))
374 return -EINVAL; 441 return -EINVAL;
375 442
@@ -413,24 +480,10 @@ static int __register_ftrace_function(struct ftrace_ops *ops)
413 return 0; 480 return 0;
414} 481}
415 482
416static void ftrace_sync(struct work_struct *work)
417{
418 /*
419 * This function is just a stub to implement a hard force
420 * of synchronize_sched(). This requires synchronizing
421 * tasks even in userspace and idle.
422 *
423 * Yes, function tracing is rude.
424 */
425}
426
427static int __unregister_ftrace_function(struct ftrace_ops *ops) 483static int __unregister_ftrace_function(struct ftrace_ops *ops)
428{ 484{
429 int ret; 485 int ret;
430 486
431 if (ftrace_disabled)
432 return -ENODEV;
433
434 if (WARN_ON(!(ops->flags & FTRACE_OPS_FL_ENABLED))) 487 if (WARN_ON(!(ops->flags & FTRACE_OPS_FL_ENABLED)))
435 return -EBUSY; 488 return -EBUSY;
436 489
@@ -445,20 +498,6 @@ static int __unregister_ftrace_function(struct ftrace_ops *ops)
445 } else if (ops->flags & FTRACE_OPS_FL_CONTROL) { 498 } else if (ops->flags & FTRACE_OPS_FL_CONTROL) {
446 ret = remove_ftrace_list_ops(&ftrace_control_list, 499 ret = remove_ftrace_list_ops(&ftrace_control_list,
447 &control_ops, ops); 500 &control_ops, ops);
448 if (!ret) {
449 /*
450 * The ftrace_ops is now removed from the list,
451 * so there'll be no new users. We must ensure
452 * all current users are done before we free
453 * the control data.
454 * Note synchronize_sched() is not enough, as we
455 * use preempt_disable() to do RCU, but the function
456 * tracer can be called where RCU is not active
457 * (before user_exit()).
458 */
459 schedule_on_each_cpu(ftrace_sync);
460 control_ops_free(ops);
461 }
462 } else 501 } else
463 ret = remove_ftrace_ops(&ftrace_ops_list, ops); 502 ret = remove_ftrace_ops(&ftrace_ops_list, ops);
464 503
@@ -468,17 +507,6 @@ static int __unregister_ftrace_function(struct ftrace_ops *ops)
468 if (ftrace_enabled) 507 if (ftrace_enabled)
469 update_ftrace_function(); 508 update_ftrace_function();
470 509
471 /*
472 * Dynamic ops may be freed, we must make sure that all
473 * callers are done before leaving this function.
474 *
475 * Again, normal synchronize_sched() is not good enough.
476 * We need to do a hard force of sched synchronization.
477 */
478 if (ops->flags & FTRACE_OPS_FL_DYNAMIC)
479 schedule_on_each_cpu(ftrace_sync);
480
481
482 return 0; 510 return 0;
483} 511}
484 512
@@ -781,7 +809,7 @@ static int ftrace_profile_init(void)
781 int cpu; 809 int cpu;
782 int ret = 0; 810 int ret = 0;
783 811
784 for_each_online_cpu(cpu) { 812 for_each_possible_cpu(cpu) {
785 ret = ftrace_profile_init_cpu(cpu); 813 ret = ftrace_profile_init_cpu(cpu);
786 if (ret) 814 if (ret)
787 break; 815 break;
@@ -1088,19 +1116,6 @@ static __init void ftrace_profile_debugfs(struct dentry *d_tracer)
1088 1116
1089static struct pid * const ftrace_swapper_pid = &init_struct_pid; 1117static struct pid * const ftrace_swapper_pid = &init_struct_pid;
1090 1118
1091loff_t
1092ftrace_filter_lseek(struct file *file, loff_t offset, int whence)
1093{
1094 loff_t ret;
1095
1096 if (file->f_mode & FMODE_READ)
1097 ret = seq_lseek(file, offset, whence);
1098 else
1099 file->f_pos = ret = 1;
1100
1101 return ret;
1102}
1103
1104#ifdef CONFIG_DYNAMIC_FTRACE 1119#ifdef CONFIG_DYNAMIC_FTRACE
1105 1120
1106#ifndef CONFIG_FTRACE_MCOUNT_RECORD 1121#ifndef CONFIG_FTRACE_MCOUNT_RECORD
@@ -1998,8 +2013,14 @@ void ftrace_modify_all_code(int command)
1998 else if (command & FTRACE_DISABLE_CALLS) 2013 else if (command & FTRACE_DISABLE_CALLS)
1999 ftrace_replace_code(0); 2014 ftrace_replace_code(0);
2000 2015
2001 if (update && ftrace_trace_function != ftrace_ops_list_func) 2016 if (update && ftrace_trace_function != ftrace_ops_list_func) {
2017 function_trace_op = set_function_trace_op;
2018 smp_wmb();
2019 /* If irqs are disabled, we are in stop machine */
2020 if (!irqs_disabled())
2021 smp_call_function(ftrace_sync_ipi, NULL, 1);
2002 ftrace_update_ftrace_func(ftrace_trace_function); 2022 ftrace_update_ftrace_func(ftrace_trace_function);
2023 }
2003 2024
2004 if (command & FTRACE_START_FUNC_RET) 2025 if (command & FTRACE_START_FUNC_RET)
2005 ftrace_enable_ftrace_graph_caller(); 2026 ftrace_enable_ftrace_graph_caller();
@@ -2088,10 +2109,15 @@ static void ftrace_startup_enable(int command)
2088static int ftrace_startup(struct ftrace_ops *ops, int command) 2109static int ftrace_startup(struct ftrace_ops *ops, int command)
2089{ 2110{
2090 bool hash_enable = true; 2111 bool hash_enable = true;
2112 int ret;
2091 2113
2092 if (unlikely(ftrace_disabled)) 2114 if (unlikely(ftrace_disabled))
2093 return -ENODEV; 2115 return -ENODEV;
2094 2116
2117 ret = __register_ftrace_function(ops);
2118 if (ret)
2119 return ret;
2120
2095 ftrace_start_up++; 2121 ftrace_start_up++;
2096 command |= FTRACE_UPDATE_CALLS; 2122 command |= FTRACE_UPDATE_CALLS;
2097 2123
@@ -2113,12 +2139,17 @@ static int ftrace_startup(struct ftrace_ops *ops, int command)
2113 return 0; 2139 return 0;
2114} 2140}
2115 2141
2116static void ftrace_shutdown(struct ftrace_ops *ops, int command) 2142static int ftrace_shutdown(struct ftrace_ops *ops, int command)
2117{ 2143{
2118 bool hash_disable = true; 2144 bool hash_disable = true;
2145 int ret;
2119 2146
2120 if (unlikely(ftrace_disabled)) 2147 if (unlikely(ftrace_disabled))
2121 return; 2148 return -ENODEV;
2149
2150 ret = __unregister_ftrace_function(ops);
2151 if (ret)
2152 return ret;
2122 2153
2123 ftrace_start_up--; 2154 ftrace_start_up--;
2124 /* 2155 /*
@@ -2152,10 +2183,42 @@ static void ftrace_shutdown(struct ftrace_ops *ops, int command)
2152 command |= FTRACE_UPDATE_TRACE_FUNC; 2183 command |= FTRACE_UPDATE_TRACE_FUNC;
2153 } 2184 }
2154 2185
2155 if (!command || !ftrace_enabled) 2186 if (!command || !ftrace_enabled) {
2156 return; 2187 /*
2188 * If these are control ops, they still need their
2189 * per_cpu field freed. Since, function tracing is
2190 * not currently active, we can just free them
2191 * without synchronizing all CPUs.
2192 */
2193 if (ops->flags & FTRACE_OPS_FL_CONTROL)
2194 control_ops_free(ops);
2195 return 0;
2196 }
2157 2197
2158 ftrace_run_update_code(command); 2198 ftrace_run_update_code(command);
2199
2200 /*
2201 * Dynamic ops may be freed, we must make sure that all
2202 * callers are done before leaving this function.
2203 * The same goes for freeing the per_cpu data of the control
2204 * ops.
2205 *
2206 * Again, normal synchronize_sched() is not good enough.
2207 * We need to do a hard force of sched synchronization.
2208 * This is because we use preempt_disable() to do RCU, but
2209 * the function tracers can be called where RCU is not watching
2210 * (like before user_exit()). We cannot rely on the RCU
2211 * infrastructure to do the synchronization, thus we must do it
2212 * ourselves.
2213 */
2214 if (ops->flags & (FTRACE_OPS_FL_DYNAMIC | FTRACE_OPS_FL_CONTROL)) {
2215 schedule_on_each_cpu(ftrace_sync);
2216
2217 if (ops->flags & FTRACE_OPS_FL_CONTROL)
2218 control_ops_free(ops);
2219 }
2220
2221 return 0;
2159} 2222}
2160 2223
2161static void ftrace_startup_sysctl(void) 2224static void ftrace_startup_sysctl(void)
@@ -2734,7 +2797,7 @@ static void ftrace_filter_reset(struct ftrace_hash *hash)
2734 * routine, you can use ftrace_filter_write() for the write 2797 * routine, you can use ftrace_filter_write() for the write
2735 * routine if @flag has FTRACE_ITER_FILTER set, or 2798 * routine if @flag has FTRACE_ITER_FILTER set, or
2736 * ftrace_notrace_write() if @flag has FTRACE_ITER_NOTRACE set. 2799 * ftrace_notrace_write() if @flag has FTRACE_ITER_NOTRACE set.
2737 * ftrace_filter_lseek() should be used as the lseek routine, and 2800 * tracing_lseek() should be used as the lseek routine, and
2738 * release must call ftrace_regex_release(). 2801 * release must call ftrace_regex_release().
2739 */ 2802 */
2740int 2803int
@@ -3060,16 +3123,13 @@ static void __enable_ftrace_function_probe(void)
3060 if (i == FTRACE_FUNC_HASHSIZE) 3123 if (i == FTRACE_FUNC_HASHSIZE)
3061 return; 3124 return;
3062 3125
3063 ret = __register_ftrace_function(&trace_probe_ops); 3126 ret = ftrace_startup(&trace_probe_ops, 0);
3064 if (!ret)
3065 ret = ftrace_startup(&trace_probe_ops, 0);
3066 3127
3067 ftrace_probe_registered = 1; 3128 ftrace_probe_registered = 1;
3068} 3129}
3069 3130
3070static void __disable_ftrace_function_probe(void) 3131static void __disable_ftrace_function_probe(void)
3071{ 3132{
3072 int ret;
3073 int i; 3133 int i;
3074 3134
3075 if (!ftrace_probe_registered) 3135 if (!ftrace_probe_registered)
@@ -3082,9 +3142,7 @@ static void __disable_ftrace_function_probe(void)
3082 } 3142 }
3083 3143
3084 /* no more funcs left */ 3144 /* no more funcs left */
3085 ret = __unregister_ftrace_function(&trace_probe_ops); 3145 ftrace_shutdown(&trace_probe_ops, 0);
3086 if (!ret)
3087 ftrace_shutdown(&trace_probe_ops, 0);
3088 3146
3089 ftrace_probe_registered = 0; 3147 ftrace_probe_registered = 0;
3090} 3148}
@@ -3767,7 +3825,7 @@ static const struct file_operations ftrace_filter_fops = {
3767 .open = ftrace_filter_open, 3825 .open = ftrace_filter_open,
3768 .read = seq_read, 3826 .read = seq_read,
3769 .write = ftrace_filter_write, 3827 .write = ftrace_filter_write,
3770 .llseek = ftrace_filter_lseek, 3828 .llseek = tracing_lseek,
3771 .release = ftrace_regex_release, 3829 .release = ftrace_regex_release,
3772}; 3830};
3773 3831
@@ -3775,7 +3833,7 @@ static const struct file_operations ftrace_notrace_fops = {
3775 .open = ftrace_notrace_open, 3833 .open = ftrace_notrace_open,
3776 .read = seq_read, 3834 .read = seq_read,
3777 .write = ftrace_notrace_write, 3835 .write = ftrace_notrace_write,
3778 .llseek = ftrace_filter_lseek, 3836 .llseek = tracing_lseek,
3779 .release = ftrace_regex_release, 3837 .release = ftrace_regex_release,
3780}; 3838};
3781 3839
@@ -4038,7 +4096,7 @@ static const struct file_operations ftrace_graph_fops = {
4038 .open = ftrace_graph_open, 4096 .open = ftrace_graph_open,
4039 .read = seq_read, 4097 .read = seq_read,
4040 .write = ftrace_graph_write, 4098 .write = ftrace_graph_write,
4041 .llseek = ftrace_filter_lseek, 4099 .llseek = tracing_lseek,
4042 .release = ftrace_graph_release, 4100 .release = ftrace_graph_release,
4043}; 4101};
4044 4102
@@ -4046,7 +4104,7 @@ static const struct file_operations ftrace_graph_notrace_fops = {
4046 .open = ftrace_graph_notrace_open, 4104 .open = ftrace_graph_notrace_open,
4047 .read = seq_read, 4105 .read = seq_read,
4048 .write = ftrace_graph_write, 4106 .write = ftrace_graph_write,
4049 .llseek = ftrace_filter_lseek, 4107 .llseek = tracing_lseek,
4050 .release = ftrace_graph_release, 4108 .release = ftrace_graph_release,
4051}; 4109};
4052#endif /* CONFIG_FUNCTION_GRAPH_TRACER */ 4110#endif /* CONFIG_FUNCTION_GRAPH_TRACER */
@@ -4366,12 +4424,15 @@ core_initcall(ftrace_nodyn_init);
4366static inline int ftrace_init_dyn_debugfs(struct dentry *d_tracer) { return 0; } 4424static inline int ftrace_init_dyn_debugfs(struct dentry *d_tracer) { return 0; }
4367static inline void ftrace_startup_enable(int command) { } 4425static inline void ftrace_startup_enable(int command) { }
4368/* Keep as macros so we do not need to define the commands */ 4426/* Keep as macros so we do not need to define the commands */
4369# define ftrace_startup(ops, command) \ 4427# define ftrace_startup(ops, command) \
4370 ({ \ 4428 ({ \
4371 (ops)->flags |= FTRACE_OPS_FL_ENABLED; \ 4429 int ___ret = __register_ftrace_function(ops); \
4372 0; \ 4430 if (!___ret) \
4431 (ops)->flags |= FTRACE_OPS_FL_ENABLED; \
4432 ___ret; \
4373 }) 4433 })
4374# define ftrace_shutdown(ops, command) do { } while (0) 4434# define ftrace_shutdown(ops, command) __unregister_ftrace_function(ops)
4435
4375# define ftrace_startup_sysctl() do { } while (0) 4436# define ftrace_startup_sysctl() do { } while (0)
4376# define ftrace_shutdown_sysctl() do { } while (0) 4437# define ftrace_shutdown_sysctl() do { } while (0)
4377 4438
@@ -4716,7 +4777,7 @@ static const struct file_operations ftrace_pid_fops = {
4716 .open = ftrace_pid_open, 4777 .open = ftrace_pid_open,
4717 .write = ftrace_pid_write, 4778 .write = ftrace_pid_write,
4718 .read = seq_read, 4779 .read = seq_read,
4719 .llseek = ftrace_filter_lseek, 4780 .llseek = tracing_lseek,
4720 .release = ftrace_pid_release, 4781 .release = ftrace_pid_release,
4721}; 4782};
4722 4783
@@ -4780,9 +4841,7 @@ int register_ftrace_function(struct ftrace_ops *ops)
4780 4841
4781 mutex_lock(&ftrace_lock); 4842 mutex_lock(&ftrace_lock);
4782 4843
4783 ret = __register_ftrace_function(ops); 4844 ret = ftrace_startup(ops, 0);
4784 if (!ret)
4785 ret = ftrace_startup(ops, 0);
4786 4845
4787 mutex_unlock(&ftrace_lock); 4846 mutex_unlock(&ftrace_lock);
4788 4847
@@ -4801,9 +4860,7 @@ int unregister_ftrace_function(struct ftrace_ops *ops)
4801 int ret; 4860 int ret;
4802 4861
4803 mutex_lock(&ftrace_lock); 4862 mutex_lock(&ftrace_lock);
4804 ret = __unregister_ftrace_function(ops); 4863 ret = ftrace_shutdown(ops, 0);
4805 if (!ret)
4806 ftrace_shutdown(ops, 0);
4807 mutex_unlock(&ftrace_lock); 4864 mutex_unlock(&ftrace_lock);
4808 4865
4809 return ret; 4866 return ret;
@@ -4863,6 +4920,7 @@ int ftrace_graph_entry_stub(struct ftrace_graph_ent *trace)
4863trace_func_graph_ret_t ftrace_graph_return = 4920trace_func_graph_ret_t ftrace_graph_return =
4864 (trace_func_graph_ret_t)ftrace_stub; 4921 (trace_func_graph_ret_t)ftrace_stub;
4865trace_func_graph_ent_t ftrace_graph_entry = ftrace_graph_entry_stub; 4922trace_func_graph_ent_t ftrace_graph_entry = ftrace_graph_entry_stub;
4923static trace_func_graph_ent_t __ftrace_graph_entry = ftrace_graph_entry_stub;
4866 4924
4867/* Try to assign a return stack array on FTRACE_RETSTACK_ALLOC_SIZE tasks. */ 4925/* Try to assign a return stack array on FTRACE_RETSTACK_ALLOC_SIZE tasks. */
4868static int alloc_retstack_tasklist(struct ftrace_ret_stack **ret_stack_list) 4926static int alloc_retstack_tasklist(struct ftrace_ret_stack **ret_stack_list)
@@ -4997,6 +5055,37 @@ ftrace_suspend_notifier_call(struct notifier_block *bl, unsigned long state,
4997 return NOTIFY_DONE; 5055 return NOTIFY_DONE;
4998} 5056}
4999 5057
 5058/* Just a placeholder for function graph */
5059static struct ftrace_ops fgraph_ops __read_mostly = {
5060 .func = ftrace_stub,
5061 .flags = FTRACE_OPS_FL_STUB | FTRACE_OPS_FL_GLOBAL |
5062 FTRACE_OPS_FL_RECURSION_SAFE,
5063};
5064
5065static int ftrace_graph_entry_test(struct ftrace_graph_ent *trace)
5066{
5067 if (!ftrace_ops_test(&global_ops, trace->func, NULL))
5068 return 0;
5069 return __ftrace_graph_entry(trace);
5070}
5071
5072/*
5073 * The function graph tracer should only trace the functions defined
5074 * by set_ftrace_filter and set_ftrace_notrace. If another function
5075 * tracer ops is registered, the graph tracer requires testing the
 5076 * function against the global ops, rather than tracing any function
 5077 * that any ftrace_ops has registered.
5078 */
5079static void update_function_graph_func(void)
5080{
5081 if (ftrace_ops_list == &ftrace_list_end ||
5082 (ftrace_ops_list == &global_ops &&
5083 global_ops.next == &ftrace_list_end))
5084 ftrace_graph_entry = __ftrace_graph_entry;
5085 else
5086 ftrace_graph_entry = ftrace_graph_entry_test;
5087}
5088
5000int register_ftrace_graph(trace_func_graph_ret_t retfunc, 5089int register_ftrace_graph(trace_func_graph_ret_t retfunc,
5001 trace_func_graph_ent_t entryfunc) 5090 trace_func_graph_ent_t entryfunc)
5002{ 5091{
@@ -5021,9 +5110,18 @@ int register_ftrace_graph(trace_func_graph_ret_t retfunc,
5021 } 5110 }
5022 5111
5023 ftrace_graph_return = retfunc; 5112 ftrace_graph_return = retfunc;
5024 ftrace_graph_entry = entryfunc;
5025 5113
5026 ret = ftrace_startup(&global_ops, FTRACE_START_FUNC_RET); 5114 /*
5115 * Update the indirect function to the entryfunc, and the
5116 * function that gets called to the entry_test first. Then
5117 * call the update fgraph entry function to determine if
5118 * the entryfunc should be called directly or not.
5119 */
5120 __ftrace_graph_entry = entryfunc;
5121 ftrace_graph_entry = ftrace_graph_entry_test;
5122 update_function_graph_func();
5123
5124 ret = ftrace_startup(&fgraph_ops, FTRACE_START_FUNC_RET);
5027 5125
5028out: 5126out:
5029 mutex_unlock(&ftrace_lock); 5127 mutex_unlock(&ftrace_lock);
@@ -5040,7 +5138,8 @@ void unregister_ftrace_graph(void)
5040 ftrace_graph_active--; 5138 ftrace_graph_active--;
5041 ftrace_graph_return = (trace_func_graph_ret_t)ftrace_stub; 5139 ftrace_graph_return = (trace_func_graph_ret_t)ftrace_stub;
5042 ftrace_graph_entry = ftrace_graph_entry_stub; 5140 ftrace_graph_entry = ftrace_graph_entry_stub;
5043 ftrace_shutdown(&global_ops, FTRACE_STOP_FUNC_RET); 5141 __ftrace_graph_entry = ftrace_graph_entry_stub;
5142 ftrace_shutdown(&fgraph_ops, FTRACE_STOP_FUNC_RET);
5044 unregister_pm_notifier(&ftrace_suspend_notifier); 5143 unregister_pm_notifier(&ftrace_suspend_notifier);
5045 unregister_trace_sched_switch(ftrace_graph_probe_sched_switch, NULL); 5144 unregister_trace_sched_switch(ftrace_graph_probe_sched_switch, NULL);
5046 5145
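
The ftrace.c hunks above fold the old __register_ftrace_function()/__unregister_ftrace_function() calls into ftrace_startup()/ftrace_shutdown(), and move the function-graph tracer off global_ops onto a dedicated fgraph_ops stub, with ftrace_graph_entry_test() consulting the global filter only while other ftrace_ops are registered. As a reading aid, here is a minimal sketch of a caller of the consolidated API; it is not part of the patch, the my_* names are invented, and the callback signature is the one this kernel series uses (ip, parent_ip, ops, regs):

#include <linux/ftrace.h>
#include <linux/module.h>

/* Hypothetical callback: invoked for every traced function. */
static void my_trace_func(unsigned long ip, unsigned long parent_ip,
			  struct ftrace_ops *ops, struct pt_regs *regs)
{
}

static struct ftrace_ops my_ops = {
	.func	= my_trace_func,
	.flags	= FTRACE_OPS_FL_RECURSION_SAFE,
};

static int __init my_init(void)
{
	/*
	 * register_ftrace_function() now ends up in ftrace_startup(),
	 * which also performs the list insertion that
	 * __register_ftrace_function() used to do as a separate step.
	 */
	return register_ftrace_function(&my_ops);
}

static void __exit my_exit(void)
{
	/* ftrace_shutdown() likewise unregisters and disables in one step. */
	unregister_ftrace_function(&my_ops);
}

module_init(my_init);
module_exit(my_exit);
MODULE_LICENSE("GPL");
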
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index cc2f66f68dc5..294b8a271a04 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -2558,7 +2558,7 @@ rb_reserve_next_event(struct ring_buffer *buffer,
2558 if (unlikely(test_time_stamp(delta))) { 2558 if (unlikely(test_time_stamp(delta))) {
2559 int local_clock_stable = 1; 2559 int local_clock_stable = 1;
2560#ifdef CONFIG_HAVE_UNSTABLE_SCHED_CLOCK 2560#ifdef CONFIG_HAVE_UNSTABLE_SCHED_CLOCK
2561 local_clock_stable = sched_clock_stable; 2561 local_clock_stable = sched_clock_stable();
2562#endif 2562#endif
2563 WARN_ONCE(delta > (1ULL << 59), 2563 WARN_ONCE(delta > (1ULL << 59),
2564 KERN_WARNING "Delta way too big! %llu ts=%llu write stamp = %llu\n%s", 2564 KERN_WARNING "Delta way too big! %llu ts=%llu write stamp = %llu\n%s",
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 9d20cd9743ef..20c755e018ca 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -595,6 +595,28 @@ void free_snapshot(struct trace_array *tr)
595} 595}
596 596
597/** 597/**
598 * tracing_alloc_snapshot - allocate snapshot buffer.
599 *
600 * This only allocates the snapshot buffer if it isn't already
601 * allocated - it doesn't also take a snapshot.
602 *
603 * This is meant to be used in cases where the snapshot buffer needs
604 * to be set up for events that can't sleep but need to be able to
605 * trigger a snapshot.
606 */
607int tracing_alloc_snapshot(void)
608{
609 struct trace_array *tr = &global_trace;
610 int ret;
611
612 ret = alloc_snapshot(tr);
613 WARN_ON(ret < 0);
614
615 return ret;
616}
617EXPORT_SYMBOL_GPL(tracing_alloc_snapshot);
618
619/**
598 * trace_snapshot_alloc - allocate and take a snapshot of the current buffer. 620 * trace_snapshot_alloc - allocate and take a snapshot of the current buffer.
599 * 621 *
600 * This is similar to trace_snapshot(), but it will allocate the 622 * This is similar to trace_snapshot(), but it will allocate the
@@ -607,11 +629,10 @@ void free_snapshot(struct trace_array *tr)
607 */ 629 */
608void tracing_snapshot_alloc(void) 630void tracing_snapshot_alloc(void)
609{ 631{
610 struct trace_array *tr = &global_trace;
611 int ret; 632 int ret;
612 633
613 ret = alloc_snapshot(tr); 634 ret = tracing_alloc_snapshot();
614 if (WARN_ON(ret < 0)) 635 if (ret < 0)
615 return; 636 return;
616 637
617 tracing_snapshot(); 638 tracing_snapshot();
@@ -623,6 +644,12 @@ void tracing_snapshot(void)
623 WARN_ONCE(1, "Snapshot feature not enabled, but internal snapshot used"); 644 WARN_ONCE(1, "Snapshot feature not enabled, but internal snapshot used");
624} 645}
625EXPORT_SYMBOL_GPL(tracing_snapshot); 646EXPORT_SYMBOL_GPL(tracing_snapshot);
647int tracing_alloc_snapshot(void)
648{
649 WARN_ONCE(1, "Snapshot feature not enabled, but snapshot allocation used");
650 return -ENODEV;
651}
652EXPORT_SYMBOL_GPL(tracing_alloc_snapshot);
626void tracing_snapshot_alloc(void) 653void tracing_snapshot_alloc(void)
627{ 654{
628 /* Give warning */ 655 /* Give warning */
@@ -3156,19 +3183,23 @@ tracing_write_stub(struct file *filp, const char __user *ubuf,
3156 return count; 3183 return count;
3157} 3184}
3158 3185
3159static loff_t tracing_seek(struct file *file, loff_t offset, int origin) 3186loff_t tracing_lseek(struct file *file, loff_t offset, int whence)
3160{ 3187{
3188 int ret;
3189
3161 if (file->f_mode & FMODE_READ) 3190 if (file->f_mode & FMODE_READ)
3162 return seq_lseek(file, offset, origin); 3191 ret = seq_lseek(file, offset, whence);
3163 else 3192 else
3164 return 0; 3193 file->f_pos = ret = 0;
3194
3195 return ret;
3165} 3196}
3166 3197
3167static const struct file_operations tracing_fops = { 3198static const struct file_operations tracing_fops = {
3168 .open = tracing_open, 3199 .open = tracing_open,
3169 .read = seq_read, 3200 .read = seq_read,
3170 .write = tracing_write_stub, 3201 .write = tracing_write_stub,
3171 .llseek = tracing_seek, 3202 .llseek = tracing_lseek,
3172 .release = tracing_release, 3203 .release = tracing_release,
3173}; 3204};
3174 3205
@@ -4212,12 +4243,6 @@ out:
4212 return sret; 4243 return sret;
4213} 4244}
4214 4245
4215static void tracing_pipe_buf_release(struct pipe_inode_info *pipe,
4216 struct pipe_buffer *buf)
4217{
4218 __free_page(buf->page);
4219}
4220
4221static void tracing_spd_release_pipe(struct splice_pipe_desc *spd, 4246static void tracing_spd_release_pipe(struct splice_pipe_desc *spd,
4222 unsigned int idx) 4247 unsigned int idx)
4223{ 4248{
@@ -4229,7 +4254,7 @@ static const struct pipe_buf_operations tracing_pipe_buf_ops = {
4229 .map = generic_pipe_buf_map, 4254 .map = generic_pipe_buf_map,
4230 .unmap = generic_pipe_buf_unmap, 4255 .unmap = generic_pipe_buf_unmap,
4231 .confirm = generic_pipe_buf_confirm, 4256 .confirm = generic_pipe_buf_confirm,
4232 .release = tracing_pipe_buf_release, 4257 .release = generic_pipe_buf_release,
4233 .steal = generic_pipe_buf_steal, 4258 .steal = generic_pipe_buf_steal,
4234 .get = generic_pipe_buf_get, 4259 .get = generic_pipe_buf_get,
4235}; 4260};
@@ -4913,7 +4938,7 @@ static const struct file_operations snapshot_fops = {
4913 .open = tracing_snapshot_open, 4938 .open = tracing_snapshot_open,
4914 .read = seq_read, 4939 .read = seq_read,
4915 .write = tracing_snapshot_write, 4940 .write = tracing_snapshot_write,
4916 .llseek = tracing_seek, 4941 .llseek = tracing_lseek,
4917 .release = tracing_snapshot_release, 4942 .release = tracing_snapshot_release,
4918}; 4943};
4919 4944
@@ -5883,6 +5908,8 @@ allocate_trace_buffer(struct trace_array *tr, struct trace_buffer *buf, int size
5883 5908
5884 rb_flags = trace_flags & TRACE_ITER_OVERWRITE ? RB_FL_OVERWRITE : 0; 5909 rb_flags = trace_flags & TRACE_ITER_OVERWRITE ? RB_FL_OVERWRITE : 0;
5885 5910
5911 buf->tr = tr;
5912
5886 buf->buffer = ring_buffer_alloc(size, rb_flags); 5913 buf->buffer = ring_buffer_alloc(size, rb_flags);
5887 if (!buf->buffer) 5914 if (!buf->buffer)
5888 return -ENOMEM; 5915 return -ENOMEM;
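
The trace.c changes above split snapshot allocation out of tracing_snapshot_alloc() into the new tracing_alloc_snapshot(), so code that cannot sleep can have the snapshot buffer set up ahead of time and only swap buffers later; they also promote the private tracing_seek() into the shared tracing_lseek() helper used by the filter, graph, pid, snapshot and trigger files. A short hedged sketch of the intended alloc-then-snapshot split (the my_* functions are invented; the prototypes come from the hunks above):

#include <linux/kernel.h>	/* tracing_snapshot(); tracing_alloc_snapshot() assumed visible here too */

/* Hypothetical setup path: process context, allowed to allocate. */
static int my_setup(void)
{
	return tracing_alloc_snapshot();
}

/* Hypothetical fast path: may run where sleeping is not allowed. */
static void my_hot_path(void)
{
	/* No allocation here - just swap in the already-allocated buffer. */
	tracing_snapshot();
}
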
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index ea189e027b80..02b592f2d4b7 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -1,3 +1,4 @@
1
1#ifndef _LINUX_KERNEL_TRACE_H 2#ifndef _LINUX_KERNEL_TRACE_H
2#define _LINUX_KERNEL_TRACE_H 3#define _LINUX_KERNEL_TRACE_H
3 4
@@ -587,6 +588,8 @@ void tracing_start_sched_switch_record(void);
587int register_tracer(struct tracer *type); 588int register_tracer(struct tracer *type);
588int is_tracing_stopped(void); 589int is_tracing_stopped(void);
589 590
591loff_t tracing_lseek(struct file *file, loff_t offset, int whence);
592
590extern cpumask_var_t __read_mostly tracing_buffer_mask; 593extern cpumask_var_t __read_mostly tracing_buffer_mask;
591 594
592#define for_each_tracing_cpu(cpu) \ 595#define for_each_tracing_cpu(cpu) \
@@ -1020,6 +1023,10 @@ extern int apply_subsystem_event_filter(struct ftrace_subsystem_dir *dir,
1020extern void print_subsystem_event_filter(struct event_subsystem *system, 1023extern void print_subsystem_event_filter(struct event_subsystem *system,
1021 struct trace_seq *s); 1024 struct trace_seq *s);
1022extern int filter_assign_type(const char *type); 1025extern int filter_assign_type(const char *type);
1026extern int create_event_filter(struct ftrace_event_call *call,
1027 char *filter_str, bool set_str,
1028 struct event_filter **filterp);
1029extern void free_event_filter(struct event_filter *filter);
1023 1030
1024struct ftrace_event_field * 1031struct ftrace_event_field *
1025trace_find_event_field(struct ftrace_event_call *call, char *name); 1032trace_find_event_field(struct ftrace_event_call *call, char *name);
@@ -1028,9 +1035,195 @@ extern void trace_event_enable_cmd_record(bool enable);
1028extern int event_trace_add_tracer(struct dentry *parent, struct trace_array *tr); 1035extern int event_trace_add_tracer(struct dentry *parent, struct trace_array *tr);
1029extern int event_trace_del_tracer(struct trace_array *tr); 1036extern int event_trace_del_tracer(struct trace_array *tr);
1030 1037
1038extern struct ftrace_event_file *find_event_file(struct trace_array *tr,
1039 const char *system,
1040 const char *event);
1041
1042static inline void *event_file_data(struct file *filp)
1043{
1044 return ACCESS_ONCE(file_inode(filp)->i_private);
1045}
1046
1031extern struct mutex event_mutex; 1047extern struct mutex event_mutex;
1032extern struct list_head ftrace_events; 1048extern struct list_head ftrace_events;
1033 1049
1050extern const struct file_operations event_trigger_fops;
1051
1052extern int register_trigger_cmds(void);
1053extern void clear_event_triggers(struct trace_array *tr);
1054
1055struct event_trigger_data {
1056 unsigned long count;
1057 int ref;
1058 struct event_trigger_ops *ops;
1059 struct event_command *cmd_ops;
1060 struct event_filter __rcu *filter;
1061 char *filter_str;
1062 void *private_data;
1063 struct list_head list;
1064};
1065
1066/**
1067 * struct event_trigger_ops - callbacks for trace event triggers
1068 *
1069 * The methods in this structure provide per-event trigger hooks for
1070 * various trigger operations.
1071 *
1072 * All the methods below, except for @init() and @free(), must be
1073 * implemented.
1074 *
1075 * @func: The trigger 'probe' function called when the triggering
1076 * event occurs. The data passed into this callback is the data
1077 * that was supplied to the event_command @reg() function that
1078 * registered the trigger (see struct event_command).
1079 *
1080 * @init: An optional initialization function called for the trigger
1081 * when the trigger is registered (via the event_command reg()
1082 * function). This can be used to perform per-trigger
1083 * initialization such as incrementing a per-trigger reference
1084 * count, for instance. This is usually implemented by the
1085 * generic utility function @event_trigger_init() (see
1086 * trace_event_triggers.c).
1087 *
1088 * @free: An optional de-initialization function called for the
1089 * trigger when the trigger is unregistered (via the
1090 * event_command @reg() function). This can be used to perform
1091 * per-trigger de-initialization such as decrementing a
1092 * per-trigger reference count and freeing corresponding trigger
1093 * data, for instance. This is usually implemented by the
1094 * generic utility function @event_trigger_free() (see
1095 * trace_event_triggers.c).
1096 *
1097 * @print: The callback function invoked to have the trigger print
1098 * itself. This is usually implemented by a wrapper function
1099 * that calls the generic utility function @event_trigger_print()
1100 * (see trace_event_triggers.c).
1101 */
1102struct event_trigger_ops {
1103 void (*func)(struct event_trigger_data *data);
1104 int (*init)(struct event_trigger_ops *ops,
1105 struct event_trigger_data *data);
1106 void (*free)(struct event_trigger_ops *ops,
1107 struct event_trigger_data *data);
1108 int (*print)(struct seq_file *m,
1109 struct event_trigger_ops *ops,
1110 struct event_trigger_data *data);
1111};
1112
1113/**
1114 * struct event_command - callbacks and data members for event commands
1115 *
1116 * Event commands are invoked by users by writing the command name
1117 * into the 'trigger' file associated with a trace event. The
1118 * parameters associated with a specific invocation of an event
1119 * command are used to create an event trigger instance, which is
1120 * added to the list of trigger instances associated with that trace
1121 * event. When the event is hit, the set of triggers associated with
1122 * that event is invoked.
1123 *
1124 * The data members in this structure provide per-event command data
1125 * for various event commands.
1126 *
1127 * All the data members below, except for @post_trigger, must be set
1128 * for each event command.
1129 *
1130 * @name: The unique name that identifies the event command. This is
1131 * the name used when setting triggers via trigger files.
1132 *
1133 * @trigger_type: A unique id that identifies the event command
1134 * 'type'. This value has two purposes, the first to ensure that
1135 * only one trigger of the same type can be set at a given time
1136 * for a particular event e.g. it doesn't make sense to have both
1137 * a traceon and traceoff trigger attached to a single event at
1138 * the same time, so traceon and traceoff have the same type
1139 * though they have different names. The @trigger_type value is
1140 * also used as a bit value for deferring the actual trigger
1141 * action until after the current event is finished. Some
1142 * commands need to do this if they themselves log to the trace
1143 * buffer (see the @post_trigger() member below). @trigger_type
1144 * values are defined by adding new values to the trigger_type
1145 * enum in include/linux/ftrace_event.h.
1146 *
1147 * @post_trigger: A flag that says whether or not this command needs
1148 * to have its action delayed until after the current event has
1149 * been closed. Some triggers need to avoid being invoked while
1150 * an event is currently in the process of being logged, since
1151 * the trigger may itself log data into the trace buffer. Thus
1152 * we make sure the current event is committed before invoking
1153 * those triggers. To do that, the trigger invocation is split
1154 * in two - the first part checks the filter using the current
1155 * trace record; if a command has the @post_trigger flag set, it
1156 * sets a bit for itself in the return value, otherwise it
1157 * directly invokes the trigger. Once all commands have been
1158 * either invoked or set their return flag, the current record is
1159 * either committed or discarded. At that point, if any commands
1160 * have deferred their triggers, those commands are finally
1161 * invoked following the close of the current event. In other
1162 * words, if the event_trigger_ops @func() probe implementation
1163 * itself logs to the trace buffer, this flag should be set,
1164 * otherwise it can be left unspecified.
1165 *
1166 * All the methods below, except for @set_filter(), must be
1167 * implemented.
1168 *
1169 * @func: The callback function responsible for parsing and
1170 * registering the trigger written to the 'trigger' file by the
1171 * user. It allocates the trigger instance and registers it with
1172 * the appropriate trace event. It makes use of the other
1173 * event_command callback functions to orchestrate this, and is
1174 * usually implemented by the generic utility function
1175 * @event_trigger_callback() (see trace_event_triggers.c).
1176 *
1177 * @reg: Adds the trigger to the list of triggers associated with the
1178 * event, and enables the event trigger itself, after
1179 * initializing it (via the event_trigger_ops @init() function).
1180 * This is also where commands can use the @trigger_type value to
1181 * make the decision as to whether or not multiple instances of
1182 * the trigger should be allowed. This is usually implemented by
1183 * the generic utility function @register_trigger() (see
1184 * trace_event_triggers.c).
1185 *
1186 * @unreg: Removes the trigger from the list of triggers associated
1187 * with the event, and disables the event trigger itself, after
 1188 * de-initializing it (via the event_trigger_ops @free() function).
1189 * This is usually implemented by the generic utility function
1190 * @unregister_trigger() (see trace_event_triggers.c).
1191 *
1192 * @set_filter: An optional function called to parse and set a filter
1193 * for the trigger. If no @set_filter() method is set for the
1194 * event command, filters set by the user for the command will be
1195 * ignored. This is usually implemented by the generic utility
1196 * function @set_trigger_filter() (see trace_event_triggers.c).
1197 *
1198 * @get_trigger_ops: The callback function invoked to retrieve the
1199 * event_trigger_ops implementation associated with the command.
1200 */
1201struct event_command {
1202 struct list_head list;
1203 char *name;
1204 enum event_trigger_type trigger_type;
1205 bool post_trigger;
1206 int (*func)(struct event_command *cmd_ops,
1207 struct ftrace_event_file *file,
1208 char *glob, char *cmd, char *params);
1209 int (*reg)(char *glob,
1210 struct event_trigger_ops *ops,
1211 struct event_trigger_data *data,
1212 struct ftrace_event_file *file);
1213 void (*unreg)(char *glob,
1214 struct event_trigger_ops *ops,
1215 struct event_trigger_data *data,
1216 struct ftrace_event_file *file);
1217 int (*set_filter)(char *filter_str,
1218 struct event_trigger_data *data,
1219 struct ftrace_event_file *file);
1220 struct event_trigger_ops *(*get_trigger_ops)(char *cmd, char *param);
1221};
1222
1223extern int trace_event_enable_disable(struct ftrace_event_file *file,
1224 int enable, int soft_disable);
1225extern int tracing_alloc_snapshot(void);
1226
1034extern const char *__start___trace_bprintk_fmt[]; 1227extern const char *__start___trace_bprintk_fmt[];
1035extern const char *__stop___trace_bprintk_fmt[]; 1228extern const char *__stop___trace_bprintk_fmt[];
1036 1229
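
The trace.h additions above are the trigger framework's data model: event_trigger_data instances hang off each ftrace_event_file, event_trigger_ops supplies the per-trigger callbacks, and event_command describes a named command written to an event's 'trigger' file. As a reading aid, here is a hedged sketch of how an invented "myhook" command could be wired together with the generic helpers the comments refer to; it would have to live in trace_events_trigger.c, where those helpers are static, and the ETT_TRACE_ONOFF type bit is reused purely for illustration:

/* Editor's sketch only; "myhook" and the my_* names are invented. */
static void my_trigger(struct event_trigger_data *data)
{
	/* @func: runs when the triggering event fires. */
}

static int my_trigger_print(struct seq_file *m, struct event_trigger_ops *ops,
			    struct event_trigger_data *data)
{
	return event_trigger_print("myhook", m, (void *)data->count,
				   data->filter_str);
}

static struct event_trigger_ops my_trigger_ops = {
	.func	= my_trigger,
	.print	= my_trigger_print,
	.init	= event_trigger_init,	/* generic refcounting */
	.free	= event_trigger_free,
};

static struct event_trigger_ops *
my_get_trigger_ops(char *cmd, char *param)
{
	return &my_trigger_ops;
}

static struct event_command my_cmd = {
	.name		 = "myhook",
	.trigger_type	 = ETT_TRACE_ONOFF,		/* illustration only */
	.func		 = event_trigger_callback,	/* generic parse + register */
	.reg		 = register_trigger,
	.unreg		 = unregister_trigger,
	.get_trigger_ops = my_get_trigger_ops,
	.set_filter	 = set_trigger_filter,
};

Registration would then go through register_event_command(&my_cmd) from an __init function, mirroring the traceon/traceoff and snapshot commands later in this series.
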
diff --git a/kernel/trace/trace_event_perf.c b/kernel/trace/trace_event_perf.c
index 78e27e3b52ac..e854f420e033 100644
--- a/kernel/trace/trace_event_perf.c
+++ b/kernel/trace/trace_event_perf.c
@@ -24,6 +24,12 @@ static int total_ref_count;
24static int perf_trace_event_perm(struct ftrace_event_call *tp_event, 24static int perf_trace_event_perm(struct ftrace_event_call *tp_event,
25 struct perf_event *p_event) 25 struct perf_event *p_event)
26{ 26{
27 if (tp_event->perf_perm) {
28 int ret = tp_event->perf_perm(tp_event, p_event);
29 if (ret)
30 return ret;
31 }
32
27 /* The ftrace function trace is allowed only for root. */ 33 /* The ftrace function trace is allowed only for root. */
28 if (ftrace_event_is_function(tp_event) && 34 if (ftrace_event_is_function(tp_event) &&
29 perf_paranoid_tracepoint_raw() && !capable(CAP_SYS_ADMIN)) 35 perf_paranoid_tracepoint_raw() && !capable(CAP_SYS_ADMIN))
@@ -173,7 +179,7 @@ static int perf_trace_event_init(struct ftrace_event_call *tp_event,
173int perf_trace_init(struct perf_event *p_event) 179int perf_trace_init(struct perf_event *p_event)
174{ 180{
175 struct ftrace_event_call *tp_event; 181 struct ftrace_event_call *tp_event;
176 int event_id = p_event->attr.config; 182 u64 event_id = p_event->attr.config;
177 int ret = -EINVAL; 183 int ret = -EINVAL;
178 184
179 mutex_lock(&event_mutex); 185 mutex_lock(&event_mutex);
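
The trace_event_perf.c hunk adds an optional per-event perf_perm() hook, consulted before the existing root-only checks, and widens event_id to u64 so it matches perf_event_attr::config. A hedged sketch of a callback with that shape (the function is hypothetical; the signature is inferred from the call site above):

/* Hypothetical permission hook: restrict one event to CAP_SYS_ADMIN. */
static int my_event_perf_perm(struct ftrace_event_call *call,
			      struct perf_event *p_event)
{
	if (!capable(CAP_SYS_ADMIN))
		return -EPERM;
	return 0;
}
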
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index f919a2e21bf3..e71ffd4eccb5 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -342,6 +342,12 @@ static int __ftrace_event_enable_disable(struct ftrace_event_file *file,
342 return ret; 342 return ret;
343} 343}
344 344
345int trace_event_enable_disable(struct ftrace_event_file *file,
346 int enable, int soft_disable)
347{
348 return __ftrace_event_enable_disable(file, enable, soft_disable);
349}
350
345static int ftrace_event_enable_disable(struct ftrace_event_file *file, 351static int ftrace_event_enable_disable(struct ftrace_event_file *file,
346 int enable) 352 int enable)
347{ 353{
@@ -421,11 +427,6 @@ static void remove_subsystem(struct ftrace_subsystem_dir *dir)
421 } 427 }
422} 428}
423 429
424static void *event_file_data(struct file *filp)
425{
426 return ACCESS_ONCE(file_inode(filp)->i_private);
427}
428
429static void remove_event_file_dir(struct ftrace_event_file *file) 430static void remove_event_file_dir(struct ftrace_event_file *file)
430{ 431{
431 struct dentry *dir = file->dir; 432 struct dentry *dir = file->dir;
@@ -1549,6 +1550,9 @@ event_create_dir(struct dentry *parent, struct ftrace_event_file *file)
1549 trace_create_file("filter", 0644, file->dir, file, 1550 trace_create_file("filter", 0644, file->dir, file,
1550 &ftrace_event_filter_fops); 1551 &ftrace_event_filter_fops);
1551 1552
1553 trace_create_file("trigger", 0644, file->dir, file,
1554 &event_trigger_fops);
1555
1552 trace_create_file("format", 0444, file->dir, call, 1556 trace_create_file("format", 0444, file->dir, call,
1553 &ftrace_event_format_fops); 1557 &ftrace_event_format_fops);
1554 1558
@@ -1645,6 +1649,8 @@ trace_create_new_event(struct ftrace_event_call *call,
1645 file->event_call = call; 1649 file->event_call = call;
1646 file->tr = tr; 1650 file->tr = tr;
1647 atomic_set(&file->sm_ref, 0); 1651 atomic_set(&file->sm_ref, 0);
1652 atomic_set(&file->tm_ref, 0);
1653 INIT_LIST_HEAD(&file->triggers);
1648 list_add(&file->list, &tr->events); 1654 list_add(&file->list, &tr->events);
1649 1655
1650 return file; 1656 return file;
@@ -1849,20 +1855,7 @@ __trace_add_event_dirs(struct trace_array *tr)
1849 } 1855 }
1850} 1856}
1851 1857
1852#ifdef CONFIG_DYNAMIC_FTRACE 1858struct ftrace_event_file *
1853
1854/* Avoid typos */
1855#define ENABLE_EVENT_STR "enable_event"
1856#define DISABLE_EVENT_STR "disable_event"
1857
1858struct event_probe_data {
1859 struct ftrace_event_file *file;
1860 unsigned long count;
1861 int ref;
1862 bool enable;
1863};
1864
1865static struct ftrace_event_file *
1866find_event_file(struct trace_array *tr, const char *system, const char *event) 1859find_event_file(struct trace_array *tr, const char *system, const char *event)
1867{ 1860{
1868 struct ftrace_event_file *file; 1861 struct ftrace_event_file *file;
@@ -1885,6 +1878,19 @@ find_event_file(struct trace_array *tr, const char *system, const char *event)
1885 return NULL; 1878 return NULL;
1886} 1879}
1887 1880
1881#ifdef CONFIG_DYNAMIC_FTRACE
1882
1883/* Avoid typos */
1884#define ENABLE_EVENT_STR "enable_event"
1885#define DISABLE_EVENT_STR "disable_event"
1886
1887struct event_probe_data {
1888 struct ftrace_event_file *file;
1889 unsigned long count;
1890 int ref;
1891 bool enable;
1892};
1893
1888static void 1894static void
1889event_enable_probe(unsigned long ip, unsigned long parent_ip, void **_data) 1895event_enable_probe(unsigned long ip, unsigned long parent_ip, void **_data)
1890{ 1896{
@@ -2311,9 +2317,15 @@ int event_trace_del_tracer(struct trace_array *tr)
2311{ 2317{
2312 mutex_lock(&event_mutex); 2318 mutex_lock(&event_mutex);
2313 2319
2320 /* Disable any event triggers and associated soft-disabled events */
2321 clear_event_triggers(tr);
2322
2314 /* Disable any running events */ 2323 /* Disable any running events */
2315 __ftrace_set_clr_event_nolock(tr, NULL, NULL, NULL, 0); 2324 __ftrace_set_clr_event_nolock(tr, NULL, NULL, NULL, 0);
2316 2325
2326 /* Access to events are within rcu_read_lock_sched() */
2327 synchronize_sched();
2328
2317 down_write(&trace_event_sem); 2329 down_write(&trace_event_sem);
2318 __trace_remove_event_dirs(tr); 2330 __trace_remove_event_dirs(tr);
2319 debugfs_remove_recursive(tr->event_dir); 2331 debugfs_remove_recursive(tr->event_dir);
@@ -2374,6 +2386,8 @@ static __init int event_trace_enable(void)
2374 2386
2375 register_event_cmds(); 2387 register_event_cmds();
2376 2388
2389 register_trigger_cmds();
2390
2377 return 0; 2391 return 0;
2378} 2392}
2379 2393
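
The trace_events.c changes above export find_event_file() and move event_file_data() into the header for the trigger code, create a per-event 'trigger' file, initialize the new tm_ref/triggers members, and make tracer teardown clear triggers (and wait a sched RCU grace period) before the event directories are removed. A hedged sketch of the lookup-and-soft-enable pattern the trigger code relies on; the wrapper and the sched/sched_switch names are purely illustrative:

/* Editor's sketch; not part of the patch. */
static int my_soft_enable(struct trace_array *tr)
{
	struct ftrace_event_file *file;
	int ret = -ENODEV;

	mutex_lock(&event_mutex);
	file = find_event_file(tr, "sched", "sched_switch");
	if (file)
		/* enable=1, soft_disable=1: the same soft mode triggers use */
		ret = trace_event_enable_disable(file, 1, 1);
	mutex_unlock(&event_mutex);

	return ret;
}
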
diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c
index 2468f56dc5db..8a8631926a07 100644
--- a/kernel/trace/trace_events_filter.c
+++ b/kernel/trace/trace_events_filter.c
@@ -799,6 +799,11 @@ static void __free_filter(struct event_filter *filter)
799 kfree(filter); 799 kfree(filter);
800} 800}
801 801
802void free_event_filter(struct event_filter *filter)
803{
804 __free_filter(filter);
805}
806
802void destroy_call_preds(struct ftrace_event_call *call) 807void destroy_call_preds(struct ftrace_event_call *call)
803{ 808{
804 __free_filter(call->filter); 809 __free_filter(call->filter);
@@ -1938,6 +1943,13 @@ static int create_filter(struct ftrace_event_call *call,
1938 return err; 1943 return err;
1939} 1944}
1940 1945
1946int create_event_filter(struct ftrace_event_call *call,
1947 char *filter_str, bool set_str,
1948 struct event_filter **filterp)
1949{
1950 return create_filter(call, filter_str, set_str, filterp);
1951}
1952
1941/** 1953/**
1942 * create_system_filter - create a filter for an event_subsystem 1954 * create_system_filter - create a filter for an event_subsystem
1943 * @system: event_subsystem to create a filter for 1955 * @system: event_subsystem to create a filter for
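
The filter hunks add two thin wrappers, create_event_filter() and free_event_filter(), so the trigger code can parse an "if ..." clause against the triggering event without touching filter internals. A hedged sketch of the expected parse/use/free pattern (essentially what set_trigger_filter() in the new file below does; the wrapper name is invented):

/* Editor's sketch; not part of the patch. */
static int my_parse_if_clause(struct ftrace_event_call *call, char *filter_str)
{
	struct event_filter *filter = NULL;
	int ret;

	/* set_str=false: don't keep a copy of the string in the filter */
	ret = create_event_filter(call, filter_str, false, &filter);
	if (ret)
		return ret;

	/* ... match records with filter_match_preds(filter, rec) ... */

	free_event_filter(filter);
	return 0;
}
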
diff --git a/kernel/trace/trace_events_trigger.c b/kernel/trace/trace_events_trigger.c
new file mode 100644
index 000000000000..8efbb69b04f0
--- /dev/null
+++ b/kernel/trace/trace_events_trigger.c
@@ -0,0 +1,1437 @@
1/*
2 * trace_events_trigger - trace event triggers
3 *
4 * This program is free software; you can redistribute it and/or modify
5 * it under the terms of the GNU General Public License as published by
6 * the Free Software Foundation; either version 2 of the License, or
7 * (at your option) any later version.
8 *
9 * This program is distributed in the hope that it will be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public License
15 * along with this program; if not, write to the Free Software
16 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
17 *
18 * Copyright (C) 2013 Tom Zanussi <tom.zanussi@linux.intel.com>
19 */
20
21#include <linux/module.h>
22#include <linux/ctype.h>
23#include <linux/mutex.h>
24#include <linux/slab.h>
25
26#include "trace.h"
27
28static LIST_HEAD(trigger_commands);
29static DEFINE_MUTEX(trigger_cmd_mutex);
30
31static void
32trigger_data_free(struct event_trigger_data *data)
33{
34 if (data->cmd_ops->set_filter)
35 data->cmd_ops->set_filter(NULL, data, NULL);
36
37 synchronize_sched(); /* make sure current triggers exit before free */
38 kfree(data);
39}
40
41/**
42 * event_triggers_call - Call triggers associated with a trace event
43 * @file: The ftrace_event_file associated with the event
44 * @rec: The trace entry for the event, NULL for unconditional invocation
45 *
46 * For each trigger associated with an event, invoke the trigger
47 * function registered with the associated trigger command. If rec is
48 * non-NULL, it means that the trigger requires further processing and
49 * shouldn't be unconditionally invoked. If rec is non-NULL and the
 50 * trigger has a filter associated with it, rec will be checked against
 51 * the filter, and if the record matches, the trigger will be invoked.
52 * If the trigger is a 'post_trigger', meaning it shouldn't be invoked
53 * in any case until the current event is written, the trigger
54 * function isn't invoked but the bit associated with the deferred
55 * trigger is set in the return value.
56 *
57 * Returns an enum event_trigger_type value containing a set bit for
58 * any trigger that should be deferred, ETT_NONE if nothing to defer.
59 *
60 * Called from tracepoint handlers (with rcu_read_lock_sched() held).
61 *
62 * Return: an enum event_trigger_type value containing a set bit for
63 * any trigger that should be deferred, ETT_NONE if nothing to defer.
64 */
65enum event_trigger_type
66event_triggers_call(struct ftrace_event_file *file, void *rec)
67{
68 struct event_trigger_data *data;
69 enum event_trigger_type tt = ETT_NONE;
70 struct event_filter *filter;
71
72 if (list_empty(&file->triggers))
73 return tt;
74
75 list_for_each_entry_rcu(data, &file->triggers, list) {
76 if (!rec) {
77 data->ops->func(data);
78 continue;
79 }
80 filter = rcu_dereference(data->filter);
81 if (filter && !filter_match_preds(filter, rec))
82 continue;
83 if (data->cmd_ops->post_trigger) {
84 tt |= data->cmd_ops->trigger_type;
85 continue;
86 }
87 data->ops->func(data);
88 }
89 return tt;
90}
91EXPORT_SYMBOL_GPL(event_triggers_call);
92
93/**
94 * event_triggers_post_call - Call 'post_triggers' for a trace event
95 * @file: The ftrace_event_file associated with the event
96 * @tt: enum event_trigger_type containing a set bit for each trigger to invoke
97 *
98 * For each trigger associated with an event, invoke the trigger
99 * function registered with the associated trigger command, if the
100 * corresponding bit is set in the tt enum passed into this function.
101 * See @event_triggers_call for details on how those bits are set.
102 *
103 * Called from tracepoint handlers (with rcu_read_lock_sched() held).
104 */
105void
106event_triggers_post_call(struct ftrace_event_file *file,
107 enum event_trigger_type tt)
108{
109 struct event_trigger_data *data;
110
111 list_for_each_entry_rcu(data, &file->triggers, list) {
112 if (data->cmd_ops->trigger_type & tt)
113 data->ops->func(data);
114 }
115}
116EXPORT_SYMBOL_GPL(event_triggers_post_call);
117
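/*
 * Editor's note - an illustrative sketch, not part of this patch: the two
 * helpers above are meant to bracket the record commit in a tracepoint
 * handler.  Roughly (variable names invented, record setup elided):
 *
 *	enum event_trigger_type tt;
 *
 *	tt = event_triggers_call(ftrace_file, entry);
 *	... write and commit the trace record ...
 *	if (tt != ETT_NONE)
 *		event_triggers_post_call(ftrace_file, tt);
 */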
118#define SHOW_AVAILABLE_TRIGGERS (void *)(1UL)
119
120static void *trigger_next(struct seq_file *m, void *t, loff_t *pos)
121{
122 struct ftrace_event_file *event_file = event_file_data(m->private);
123
124 if (t == SHOW_AVAILABLE_TRIGGERS)
125 return NULL;
126
127 return seq_list_next(t, &event_file->triggers, pos);
128}
129
130static void *trigger_start(struct seq_file *m, loff_t *pos)
131{
132 struct ftrace_event_file *event_file;
133
134 /* ->stop() is called even if ->start() fails */
135 mutex_lock(&event_mutex);
136 event_file = event_file_data(m->private);
137 if (unlikely(!event_file))
138 return ERR_PTR(-ENODEV);
139
140 if (list_empty(&event_file->triggers))
141 return *pos == 0 ? SHOW_AVAILABLE_TRIGGERS : NULL;
142
143 return seq_list_start(&event_file->triggers, *pos);
144}
145
146static void trigger_stop(struct seq_file *m, void *t)
147{
148 mutex_unlock(&event_mutex);
149}
150
151static int trigger_show(struct seq_file *m, void *v)
152{
153 struct event_trigger_data *data;
154 struct event_command *p;
155
156 if (v == SHOW_AVAILABLE_TRIGGERS) {
157 seq_puts(m, "# Available triggers:\n");
158 seq_putc(m, '#');
159 mutex_lock(&trigger_cmd_mutex);
160 list_for_each_entry_reverse(p, &trigger_commands, list)
161 seq_printf(m, " %s", p->name);
162 seq_putc(m, '\n');
163 mutex_unlock(&trigger_cmd_mutex);
164 return 0;
165 }
166
167 data = list_entry(v, struct event_trigger_data, list);
168 data->ops->print(m, data->ops, data);
169
170 return 0;
171}
172
173static const struct seq_operations event_triggers_seq_ops = {
174 .start = trigger_start,
175 .next = trigger_next,
176 .stop = trigger_stop,
177 .show = trigger_show,
178};
179
180static int event_trigger_regex_open(struct inode *inode, struct file *file)
181{
182 int ret = 0;
183
184 mutex_lock(&event_mutex);
185
186 if (unlikely(!event_file_data(file))) {
187 mutex_unlock(&event_mutex);
188 return -ENODEV;
189 }
190
191 if (file->f_mode & FMODE_READ) {
192 ret = seq_open(file, &event_triggers_seq_ops);
193 if (!ret) {
194 struct seq_file *m = file->private_data;
195 m->private = file;
196 }
197 }
198
199 mutex_unlock(&event_mutex);
200
201 return ret;
202}
203
204static int trigger_process_regex(struct ftrace_event_file *file, char *buff)
205{
206 char *command, *next = buff;
207 struct event_command *p;
208 int ret = -EINVAL;
209
210 command = strsep(&next, ": \t");
211 command = (command[0] != '!') ? command : command + 1;
212
213 mutex_lock(&trigger_cmd_mutex);
214 list_for_each_entry(p, &trigger_commands, list) {
215 if (strcmp(p->name, command) == 0) {
216 ret = p->func(p, file, buff, command, next);
217 goto out_unlock;
218 }
219 }
220 out_unlock:
221 mutex_unlock(&trigger_cmd_mutex);
222
223 return ret;
224}
225
226static ssize_t event_trigger_regex_write(struct file *file,
227 const char __user *ubuf,
228 size_t cnt, loff_t *ppos)
229{
230 struct ftrace_event_file *event_file;
231 ssize_t ret;
232 char *buf;
233
234 if (!cnt)
235 return 0;
236
237 if (cnt >= PAGE_SIZE)
238 return -EINVAL;
239
240 buf = (char *)__get_free_page(GFP_TEMPORARY);
241 if (!buf)
242 return -ENOMEM;
243
244 if (copy_from_user(buf, ubuf, cnt)) {
245 free_page((unsigned long)buf);
246 return -EFAULT;
247 }
248 buf[cnt] = '\0';
249 strim(buf);
250
251 mutex_lock(&event_mutex);
252 event_file = event_file_data(file);
253 if (unlikely(!event_file)) {
254 mutex_unlock(&event_mutex);
255 free_page((unsigned long)buf);
256 return -ENODEV;
257 }
258 ret = trigger_process_regex(event_file, buf);
259 mutex_unlock(&event_mutex);
260
261 free_page((unsigned long)buf);
262 if (ret < 0)
263 goto out;
264
265 *ppos += cnt;
266 ret = cnt;
267 out:
268 return ret;
269}
270
271static int event_trigger_regex_release(struct inode *inode, struct file *file)
272{
273 mutex_lock(&event_mutex);
274
275 if (file->f_mode & FMODE_READ)
276 seq_release(inode, file);
277
278 mutex_unlock(&event_mutex);
279
280 return 0;
281}
282
283static ssize_t
284event_trigger_write(struct file *filp, const char __user *ubuf,
285 size_t cnt, loff_t *ppos)
286{
287 return event_trigger_regex_write(filp, ubuf, cnt, ppos);
288}
289
290static int
291event_trigger_open(struct inode *inode, struct file *filp)
292{
293 return event_trigger_regex_open(inode, filp);
294}
295
296static int
297event_trigger_release(struct inode *inode, struct file *file)
298{
299 return event_trigger_regex_release(inode, file);
300}
301
302const struct file_operations event_trigger_fops = {
303 .open = event_trigger_open,
304 .read = seq_read,
305 .write = event_trigger_write,
306 .llseek = tracing_lseek,
307 .release = event_trigger_release,
308};
309
310/*
311 * Currently we only register event commands from __init, so mark this
312 * __init too.
313 */
314static __init int register_event_command(struct event_command *cmd)
315{
316 struct event_command *p;
317 int ret = 0;
318
319 mutex_lock(&trigger_cmd_mutex);
320 list_for_each_entry(p, &trigger_commands, list) {
321 if (strcmp(cmd->name, p->name) == 0) {
322 ret = -EBUSY;
323 goto out_unlock;
324 }
325 }
326 list_add(&cmd->list, &trigger_commands);
327 out_unlock:
328 mutex_unlock(&trigger_cmd_mutex);
329
330 return ret;
331}
332
333/*
334 * Currently we only unregister event commands from __init, so mark
335 * this __init too.
336 */
337static __init int unregister_event_command(struct event_command *cmd)
338{
339 struct event_command *p, *n;
340 int ret = -ENODEV;
341
342 mutex_lock(&trigger_cmd_mutex);
343 list_for_each_entry_safe(p, n, &trigger_commands, list) {
344 if (strcmp(cmd->name, p->name) == 0) {
345 ret = 0;
346 list_del_init(&p->list);
347 goto out_unlock;
348 }
349 }
350 out_unlock:
351 mutex_unlock(&trigger_cmd_mutex);
352
353 return ret;
354}
355
356/**
357 * event_trigger_print - Generic event_trigger_ops @print implementation
358 * @name: The name of the event trigger
359 * @m: The seq_file being printed to
360 * @data: Trigger-specific data
361 * @filter_str: filter_str to print, if present
362 *
363 * Common implementation for event triggers to print themselves.
364 *
365 * Usually wrapped by a function that simply sets the @name of the
366 * trigger command and then invokes this.
367 *
368 * Return: 0 on success, errno otherwise
369 */
370static int
371event_trigger_print(const char *name, struct seq_file *m,
372 void *data, char *filter_str)
373{
374 long count = (long)data;
375
376 seq_printf(m, "%s", name);
377
378 if (count == -1)
379 seq_puts(m, ":unlimited");
380 else
381 seq_printf(m, ":count=%ld", count);
382
383 if (filter_str)
384 seq_printf(m, " if %s\n", filter_str);
385 else
386 seq_puts(m, "\n");
387
388 return 0;
389}
390
391/**
392 * event_trigger_init - Generic event_trigger_ops @init implementation
393 * @ops: The trigger ops associated with the trigger
394 * @data: Trigger-specific data
395 *
396 * Common implementation of event trigger initialization.
397 *
398 * Usually used directly as the @init method in event trigger
399 * implementations.
400 *
401 * Return: 0 on success, errno otherwise
402 */
403static int
404event_trigger_init(struct event_trigger_ops *ops,
405 struct event_trigger_data *data)
406{
407 data->ref++;
408 return 0;
409}
410
411/**
412 * event_trigger_free - Generic event_trigger_ops @free implementation
413 * @ops: The trigger ops associated with the trigger
414 * @data: Trigger-specific data
415 *
416 * Common implementation of event trigger de-initialization.
417 *
418 * Usually used directly as the @free method in event trigger
419 * implementations.
420 */
421static void
422event_trigger_free(struct event_trigger_ops *ops,
423 struct event_trigger_data *data)
424{
425 if (WARN_ON_ONCE(data->ref <= 0))
426 return;
427
428 data->ref--;
429 if (!data->ref)
430 trigger_data_free(data);
431}
432
433static int trace_event_trigger_enable_disable(struct ftrace_event_file *file,
434 int trigger_enable)
435{
436 int ret = 0;
437
438 if (trigger_enable) {
439 if (atomic_inc_return(&file->tm_ref) > 1)
440 return ret;
441 set_bit(FTRACE_EVENT_FL_TRIGGER_MODE_BIT, &file->flags);
442 ret = trace_event_enable_disable(file, 1, 1);
443 } else {
444 if (atomic_dec_return(&file->tm_ref) > 0)
445 return ret;
446 clear_bit(FTRACE_EVENT_FL_TRIGGER_MODE_BIT, &file->flags);
447 ret = trace_event_enable_disable(file, 0, 1);
448 }
449
450 return ret;
451}
452
453/**
454 * clear_event_triggers - Clear all triggers associated with a trace array
455 * @tr: The trace array to clear
456 *
457 * For each trigger, the triggering event has its tm_ref decremented
458 * via trace_event_trigger_enable_disable(), and any associated event
459 * (in the case of enable/disable_event triggers) will have its sm_ref
460 * decremented via free()->trace_event_enable_disable(). That
461 * combination effectively reverses the soft-mode/trigger state added
462 * by trigger registration.
463 *
464 * Must be called with event_mutex held.
465 */
466void
467clear_event_triggers(struct trace_array *tr)
468{
469 struct ftrace_event_file *file;
470
471 list_for_each_entry(file, &tr->events, list) {
472 struct event_trigger_data *data;
473 list_for_each_entry_rcu(data, &file->triggers, list) {
474 trace_event_trigger_enable_disable(file, 0);
475 if (data->ops->free)
476 data->ops->free(data->ops, data);
477 }
478 }
479}
480
481/**
482 * update_cond_flag - Set or reset the TRIGGER_COND bit
483 * @file: The ftrace_event_file associated with the event
484 *
485 * If an event has triggers and any of those triggers has a filter or
486 * a post_trigger, trigger invocation needs to be deferred until after
487 * the current event has logged its data, and the event should have
488 * its TRIGGER_COND bit set, otherwise the TRIGGER_COND bit should be
489 * cleared.
490 */
491static void update_cond_flag(struct ftrace_event_file *file)
492{
493 struct event_trigger_data *data;
494 bool set_cond = false;
495
496 list_for_each_entry_rcu(data, &file->triggers, list) {
497 if (data->filter || data->cmd_ops->post_trigger) {
498 set_cond = true;
499 break;
500 }
501 }
502
503 if (set_cond)
504 set_bit(FTRACE_EVENT_FL_TRIGGER_COND_BIT, &file->flags);
505 else
506 clear_bit(FTRACE_EVENT_FL_TRIGGER_COND_BIT, &file->flags);
507}
508
509/**
510 * register_trigger - Generic event_command @reg implementation
511 * @glob: The raw string used to register the trigger
512 * @ops: The trigger ops associated with the trigger
513 * @data: Trigger-specific data to associate with the trigger
514 * @file: The ftrace_event_file associated with the event
515 *
516 * Common implementation for event trigger registration.
517 *
518 * Usually used directly as the @reg method in event command
519 * implementations.
520 *
521 * Return: 0 on success, errno otherwise
522 */
523static int register_trigger(char *glob, struct event_trigger_ops *ops,
524 struct event_trigger_data *data,
525 struct ftrace_event_file *file)
526{
527 struct event_trigger_data *test;
528 int ret = 0;
529
530 list_for_each_entry_rcu(test, &file->triggers, list) {
531 if (test->cmd_ops->trigger_type == data->cmd_ops->trigger_type) {
532 ret = -EEXIST;
533 goto out;
534 }
535 }
536
537 if (data->ops->init) {
538 ret = data->ops->init(data->ops, data);
539 if (ret < 0)
540 goto out;
541 }
542
543 list_add_rcu(&data->list, &file->triggers);
544 ret++;
545
546 if (trace_event_trigger_enable_disable(file, 1) < 0) {
547 list_del_rcu(&data->list);
548 ret--;
549 }
550 update_cond_flag(file);
551out:
552 return ret;
553}
554
555/**
556 * unregister_trigger - Generic event_command @unreg implementation
557 * @glob: The raw string used to register the trigger
558 * @ops: The trigger ops associated with the trigger
559 * @test: Trigger-specific data used to find the trigger to remove
560 * @file: The ftrace_event_file associated with the event
561 *
562 * Common implementation for event trigger unregistration.
563 *
564 * Usually used directly as the @unreg method in event command
565 * implementations.
566 */
567static void unregister_trigger(char *glob, struct event_trigger_ops *ops,
568 struct event_trigger_data *test,
569 struct ftrace_event_file *file)
570{
571 struct event_trigger_data *data;
572 bool unregistered = false;
573
574 list_for_each_entry_rcu(data, &file->triggers, list) {
575 if (data->cmd_ops->trigger_type == test->cmd_ops->trigger_type) {
576 unregistered = true;
577 list_del_rcu(&data->list);
578 update_cond_flag(file);
579 trace_event_trigger_enable_disable(file, 0);
580 break;
581 }
582 }
583
584 if (unregistered && data->ops->free)
585 data->ops->free(data->ops, data);
586}
587
588/**
589 * event_trigger_callback - Generic event_command @func implementation
590 * @cmd_ops: The command ops, used for trigger registration
591 * @file: The ftrace_event_file associated with the event
592 * @glob: The raw string used to register the trigger
593 * @cmd: The cmd portion of the string used to register the trigger
594 * @param: The params portion of the string used to register the trigger
595 *
596 * Common implementation for event command parsing and trigger
597 * instantiation.
598 *
599 * Usually used directly as the @func method in event command
600 * implementations.
601 *
602 * Return: 0 on success, errno otherwise
603 */
604static int
605event_trigger_callback(struct event_command *cmd_ops,
606 struct ftrace_event_file *file,
607 char *glob, char *cmd, char *param)
608{
609 struct event_trigger_data *trigger_data;
610 struct event_trigger_ops *trigger_ops;
611 char *trigger = NULL;
612 char *number;
613 int ret;
614
615 /* separate the trigger from the filter (t:n [if filter]) */
616 if (param && isdigit(param[0]))
617 trigger = strsep(&param, " \t");
618
619 trigger_ops = cmd_ops->get_trigger_ops(cmd, trigger);
620
621 ret = -ENOMEM;
622 trigger_data = kzalloc(sizeof(*trigger_data), GFP_KERNEL);
623 if (!trigger_data)
624 goto out;
625
626 trigger_data->count = -1;
627 trigger_data->ops = trigger_ops;
628 trigger_data->cmd_ops = cmd_ops;
629 INIT_LIST_HEAD(&trigger_data->list);
630
631 if (glob[0] == '!') {
632 cmd_ops->unreg(glob+1, trigger_ops, trigger_data, file);
633 kfree(trigger_data);
634 ret = 0;
635 goto out;
636 }
637
638 if (trigger) {
639 number = strsep(&trigger, ":");
640
641 ret = -EINVAL;
642 if (!strlen(number))
643 goto out_free;
644
645 /*
646 * We use the callback data field (which is a pointer)
647 * as our counter.
648 */
649 ret = kstrtoul(number, 0, &trigger_data->count);
650 if (ret)
651 goto out_free;
652 }
653
654 if (!param) /* if param is non-empty, it's supposed to be a filter */
655 goto out_reg;
656
657 if (!cmd_ops->set_filter)
658 goto out_reg;
659
660 ret = cmd_ops->set_filter(param, trigger_data, file);
661 if (ret < 0)
662 goto out_free;
663
664 out_reg:
665 ret = cmd_ops->reg(glob, trigger_ops, trigger_data, file);
666 /*
667 * The above returns on success the # of functions enabled,
668 * but if it didn't find any functions it returns zero.
669 * Consider no functions a failure too.
670 */
671 if (!ret) {
672 ret = -ENOENT;
673 goto out_free;
674 } else if (ret < 0)
675 goto out_free;
676 ret = 0;
677 out:
678 return ret;
679
680 out_free:
681 if (cmd_ops->set_filter)
682 cmd_ops->set_filter(NULL, trigger_data, NULL);
683 kfree(trigger_data);
684 goto out;
685}
686
687/**
688 * set_trigger_filter - Generic event_command @set_filter implementation
689 * @filter_str: The filter string for the trigger, NULL to remove filter
690 * @trigger_data: Trigger-specific data
691 * @file: The ftrace_event_file associated with the event
692 *
693 * Common implementation for event command filter parsing and filter
694 * instantiation.
695 *
696 * Usually used directly as the @set_filter method in event command
697 * implementations.
698 *
699 * Also used to remove a filter (if filter_str = NULL).
700 *
701 * Return: 0 on success, errno otherwise
702 */
703static int set_trigger_filter(char *filter_str,
704 struct event_trigger_data *trigger_data,
705 struct ftrace_event_file *file)
706{
707 struct event_trigger_data *data = trigger_data;
708 struct event_filter *filter = NULL, *tmp;
709 int ret = -EINVAL;
710 char *s;
711
712 if (!filter_str) /* clear the current filter */
713 goto assign;
714
715 s = strsep(&filter_str, " \t");
716
717 if (!strlen(s) || strcmp(s, "if") != 0)
718 goto out;
719
720 if (!filter_str)
721 goto out;
722
723 /* The filter is for the 'trigger' event, not the triggered event */
724 ret = create_event_filter(file->event_call, filter_str, false, &filter);
725 if (ret)
726 goto out;
727 assign:
728 tmp = rcu_access_pointer(data->filter);
729
730 rcu_assign_pointer(data->filter, filter);
731
732 if (tmp) {
733 /* Make sure the call is done with the filter */
734 synchronize_sched();
735 free_event_filter(tmp);
736 }
737
738 kfree(data->filter_str);
739 data->filter_str = NULL;
740
741 if (filter_str) {
742 data->filter_str = kstrdup(filter_str, GFP_KERNEL);
743 if (!data->filter_str) {
744 free_event_filter(rcu_access_pointer(data->filter));
745 data->filter = NULL;
746 ret = -ENOMEM;
747 }
748 }
749 out:
750 return ret;
751}
752
753static void
754traceon_trigger(struct event_trigger_data *data)
755{
756 if (tracing_is_on())
757 return;
758
759 tracing_on();
760}
761
762static void
763traceon_count_trigger(struct event_trigger_data *data)
764{
765 if (tracing_is_on())
766 return;
767
768 if (!data->count)
769 return;
770
771 if (data->count != -1)
772 (data->count)--;
773
774 tracing_on();
775}
776
777static void
778traceoff_trigger(struct event_trigger_data *data)
779{
780 if (!tracing_is_on())
781 return;
782
783 tracing_off();
784}
785
786static void
787traceoff_count_trigger(struct event_trigger_data *data)
788{
789 if (!tracing_is_on())
790 return;
791
792 if (!data->count)
793 return;
794
795 if (data->count != -1)
796 (data->count)--;
797
798 tracing_off();
799}
800
801static int
802traceon_trigger_print(struct seq_file *m, struct event_trigger_ops *ops,
803 struct event_trigger_data *data)
804{
805 return event_trigger_print("traceon", m, (void *)data->count,
806 data->filter_str);
807}
808
809static int
810traceoff_trigger_print(struct seq_file *m, struct event_trigger_ops *ops,
811 struct event_trigger_data *data)
812{
813 return event_trigger_print("traceoff", m, (void *)data->count,
814 data->filter_str);
815}
816
817static struct event_trigger_ops traceon_trigger_ops = {
818 .func = traceon_trigger,
819 .print = traceon_trigger_print,
820 .init = event_trigger_init,
821 .free = event_trigger_free,
822};
823
824static struct event_trigger_ops traceon_count_trigger_ops = {
825 .func = traceon_count_trigger,
826 .print = traceon_trigger_print,
827 .init = event_trigger_init,
828 .free = event_trigger_free,
829};
830
831static struct event_trigger_ops traceoff_trigger_ops = {
832 .func = traceoff_trigger,
833 .print = traceoff_trigger_print,
834 .init = event_trigger_init,
835 .free = event_trigger_free,
836};
837
838static struct event_trigger_ops traceoff_count_trigger_ops = {
839 .func = traceoff_count_trigger,
840 .print = traceoff_trigger_print,
841 .init = event_trigger_init,
842 .free = event_trigger_free,
843};
844
845static struct event_trigger_ops *
846onoff_get_trigger_ops(char *cmd, char *param)
847{
848 struct event_trigger_ops *ops;
849
850 /* we register both traceon and traceoff to this callback */
851 if (strcmp(cmd, "traceon") == 0)
852 ops = param ? &traceon_count_trigger_ops :
853 &traceon_trigger_ops;
854 else
855 ops = param ? &traceoff_count_trigger_ops :
856 &traceoff_trigger_ops;
857
858 return ops;
859}
860
861static struct event_command trigger_traceon_cmd = {
862 .name = "traceon",
863 .trigger_type = ETT_TRACE_ONOFF,
864 .func = event_trigger_callback,
865 .reg = register_trigger,
866 .unreg = unregister_trigger,
867 .get_trigger_ops = onoff_get_trigger_ops,
868 .set_filter = set_trigger_filter,
869};
870
871static struct event_command trigger_traceoff_cmd = {
872 .name = "traceoff",
873 .trigger_type = ETT_TRACE_ONOFF,
874 .func = event_trigger_callback,
875 .reg = register_trigger,
876 .unreg = unregister_trigger,
877 .get_trigger_ops = onoff_get_trigger_ops,
878 .set_filter = set_trigger_filter,
879};
880
881#ifdef CONFIG_TRACER_SNAPSHOT
882static void
883snapshot_trigger(struct event_trigger_data *data)
884{
885 tracing_snapshot();
886}
887
888static void
889snapshot_count_trigger(struct event_trigger_data *data)
890{
891 if (!data->count)
892 return;
893
894 if (data->count != -1)
895 (data->count)--;
896
897 snapshot_trigger(data);
898}
899
900static int
901register_snapshot_trigger(char *glob, struct event_trigger_ops *ops,
902 struct event_trigger_data *data,
903 struct ftrace_event_file *file)
904{
905 int ret = register_trigger(glob, ops, data, file);
906
907 if (ret > 0 && tracing_alloc_snapshot() != 0) {
908 unregister_trigger(glob, ops, data, file);
909 ret = 0;
910 }
911
912 return ret;
913}
914
915static int
916snapshot_trigger_print(struct seq_file *m, struct event_trigger_ops *ops,
917 struct event_trigger_data *data)
918{
919 return event_trigger_print("snapshot", m, (void *)data->count,
920 data->filter_str);
921}
922
923static struct event_trigger_ops snapshot_trigger_ops = {
924 .func = snapshot_trigger,
925 .print = snapshot_trigger_print,
926 .init = event_trigger_init,
927 .free = event_trigger_free,
928};
929
930static struct event_trigger_ops snapshot_count_trigger_ops = {
931 .func = snapshot_count_trigger,
932 .print = snapshot_trigger_print,
933 .init = event_trigger_init,
934 .free = event_trigger_free,
935};
936
937static struct event_trigger_ops *
938snapshot_get_trigger_ops(char *cmd, char *param)
939{
940 return param ? &snapshot_count_trigger_ops : &snapshot_trigger_ops;
941}
942
943static struct event_command trigger_snapshot_cmd = {
944 .name = "snapshot",
945 .trigger_type = ETT_SNAPSHOT,
946 .func = event_trigger_callback,
947 .reg = register_snapshot_trigger,
948 .unreg = unregister_trigger,
949 .get_trigger_ops = snapshot_get_trigger_ops,
950 .set_filter = set_trigger_filter,
951};
952
953static __init int register_trigger_snapshot_cmd(void)
954{
955 int ret;
956
957 ret = register_event_command(&trigger_snapshot_cmd);
958 WARN_ON(ret < 0);
959
960 return ret;
961}
962#else
963static __init int register_trigger_snapshot_cmd(void) { return 0; }
964#endif /* CONFIG_TRACER_SNAPSHOT */
965
966#ifdef CONFIG_STACKTRACE
967/*
968 * Skip 3:
969 * stacktrace_trigger()
970 * event_triggers_post_call()
971 * ftrace_raw_event_xxx()
972 */
973#define STACK_SKIP 3
974
975static void
976stacktrace_trigger(struct event_trigger_data *data)
977{
978 trace_dump_stack(STACK_SKIP);
979}
980
981static void
982stacktrace_count_trigger(struct event_trigger_data *data)
983{
984 if (!data->count)
985 return;
986
987 if (data->count != -1)
988 (data->count)--;
989
990 stacktrace_trigger(data);
991}
992
993static int
994stacktrace_trigger_print(struct seq_file *m, struct event_trigger_ops *ops,
995 struct event_trigger_data *data)
996{
997 return event_trigger_print("stacktrace", m, (void *)data->count,
998 data->filter_str);
999}
1000
1001static struct event_trigger_ops stacktrace_trigger_ops = {
1002 .func = stacktrace_trigger,
1003 .print = stacktrace_trigger_print,
1004 .init = event_trigger_init,
1005 .free = event_trigger_free,
1006};
1007
1008static struct event_trigger_ops stacktrace_count_trigger_ops = {
1009 .func = stacktrace_count_trigger,
1010 .print = stacktrace_trigger_print,
1011 .init = event_trigger_init,
1012 .free = event_trigger_free,
1013};
1014
1015static struct event_trigger_ops *
1016stacktrace_get_trigger_ops(char *cmd, char *param)
1017{
1018 return param ? &stacktrace_count_trigger_ops : &stacktrace_trigger_ops;
1019}
1020
1021static struct event_command trigger_stacktrace_cmd = {
1022 .name = "stacktrace",
1023 .trigger_type = ETT_STACKTRACE,
1024 .post_trigger = true,
1025 .func = event_trigger_callback,
1026 .reg = register_trigger,
1027 .unreg = unregister_trigger,
1028 .get_trigger_ops = stacktrace_get_trigger_ops,
1029 .set_filter = set_trigger_filter,
1030};
1031
1032static __init int register_trigger_stacktrace_cmd(void)
1033{
1034 int ret;
1035
1036 ret = register_event_command(&trigger_stacktrace_cmd);
1037 WARN_ON(ret < 0);
1038
1039 return ret;
1040}
1041#else
1042static __init int register_trigger_stacktrace_cmd(void) { return 0; }
1043#endif /* CONFIG_STACKTRACE */
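/*
 * "stacktrace" works the same way; writing "stacktrace:5" to an event's
 * trigger file (illustrative) dumps the kernel stack for the next five hits.
 * Because .post_trigger is set, the dump happens after the event itself has
 * been recorded, which is why STACK_SKIP above skips the post-call frames.
 */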
1044
1045static __init void unregister_trigger_traceon_traceoff_cmds(void)
1046{
1047 unregister_event_command(&trigger_traceon_cmd);
1048 unregister_event_command(&trigger_traceoff_cmd);
1049}
1050
1051/* Avoid typos */
1052#define ENABLE_EVENT_STR "enable_event"
1053#define DISABLE_EVENT_STR "disable_event"
1054
1055struct enable_trigger_data {
1056 struct ftrace_event_file *file;
1057 bool enable;
1058};
1059
1060static void
1061event_enable_trigger(struct event_trigger_data *data)
1062{
1063 struct enable_trigger_data *enable_data = data->private_data;
1064
1065 if (enable_data->enable)
1066 clear_bit(FTRACE_EVENT_FL_SOFT_DISABLED_BIT, &enable_data->file->flags);
1067 else
1068 set_bit(FTRACE_EVENT_FL_SOFT_DISABLED_BIT, &enable_data->file->flags);
1069}
1070
1071static void
1072event_enable_count_trigger(struct event_trigger_data *data)
1073{
1074 struct enable_trigger_data *enable_data = data->private_data;
1075
1076 if (!data->count)
1077 return;
1078
1079 /* Skip if the event is in a state we want to switch to */
1080 if (enable_data->enable == !(enable_data->file->flags & FTRACE_EVENT_FL_SOFT_DISABLED))
1081 return;
1082
1083 if (data->count != -1)
1084 (data->count)--;
1085
1086 event_enable_trigger(data);
1087}
1088
1089static int
1090event_enable_trigger_print(struct seq_file *m, struct event_trigger_ops *ops,
1091 struct event_trigger_data *data)
1092{
1093 struct enable_trigger_data *enable_data = data->private_data;
1094
1095 seq_printf(m, "%s:%s:%s",
1096 enable_data->enable ? ENABLE_EVENT_STR : DISABLE_EVENT_STR,
1097 enable_data->file->event_call->class->system,
1098 enable_data->file->event_call->name);
1099
1100 if (data->count == -1)
1101 seq_puts(m, ":unlimited");
1102 else
1103 seq_printf(m, ":count=%ld", data->count);
1104
1105 if (data->filter_str)
1106 seq_printf(m, " if %s\n", data->filter_str);
1107 else
1108 seq_puts(m, "\n");
1109
1110 return 0;
1111}
1112
1113static void
1114event_enable_trigger_free(struct event_trigger_ops *ops,
1115 struct event_trigger_data *data)
1116{
1117 struct enable_trigger_data *enable_data = data->private_data;
1118
1119 if (WARN_ON_ONCE(data->ref <= 0))
1120 return;
1121
1122 data->ref--;
1123 if (!data->ref) {
1124 /* Remove the SOFT_MODE flag */
1125 trace_event_enable_disable(enable_data->file, 0, 1);
1126 module_put(enable_data->file->event_call->mod);
1127 trigger_data_free(data);
1128 kfree(enable_data);
1129 }
1130}
1131
1132static struct event_trigger_ops event_enable_trigger_ops = {
1133 .func = event_enable_trigger,
1134 .print = event_enable_trigger_print,
1135 .init = event_trigger_init,
1136 .free = event_enable_trigger_free,
1137};
1138
1139static struct event_trigger_ops event_enable_count_trigger_ops = {
1140 .func = event_enable_count_trigger,
1141 .print = event_enable_trigger_print,
1142 .init = event_trigger_init,
1143 .free = event_enable_trigger_free,
1144};
1145
1146static struct event_trigger_ops event_disable_trigger_ops = {
1147 .func = event_enable_trigger,
1148 .print = event_enable_trigger_print,
1149 .init = event_trigger_init,
1150 .free = event_enable_trigger_free,
1151};
1152
1153static struct event_trigger_ops event_disable_count_trigger_ops = {
1154 .func = event_enable_count_trigger,
1155 .print = event_enable_trigger_print,
1156 .init = event_trigger_init,
1157 .free = event_enable_trigger_free,
1158};
1159
1160static int
1161event_enable_trigger_func(struct event_command *cmd_ops,
1162 struct ftrace_event_file *file,
1163 char *glob, char *cmd, char *param)
1164{
1165 struct ftrace_event_file *event_enable_file;
1166 struct enable_trigger_data *enable_data;
1167 struct event_trigger_data *trigger_data;
1168 struct event_trigger_ops *trigger_ops;
1169 struct trace_array *tr = file->tr;
1170 const char *system;
1171 const char *event;
1172 char *trigger;
1173 char *number;
1174 bool enable;
1175 int ret;
1176
1177 if (!param)
1178 return -EINVAL;
1179
1180 /* separate the trigger from the filter (s:e:n [if filter]) */
1181 trigger = strsep(&param, " \t");
1182 if (!trigger)
1183 return -EINVAL;
1184
1185 system = strsep(&trigger, ":");
1186 if (!trigger)
1187 return -EINVAL;
1188
1189 event = strsep(&trigger, ":");
1190
1191 ret = -EINVAL;
1192 event_enable_file = find_event_file(tr, system, event);
1193 if (!event_enable_file)
1194 goto out;
1195
1196 enable = strcmp(cmd, ENABLE_EVENT_STR) == 0;
1197
1198 trigger_ops = cmd_ops->get_trigger_ops(cmd, trigger);
1199
1200 ret = -ENOMEM;
1201 trigger_data = kzalloc(sizeof(*trigger_data), GFP_KERNEL);
1202 if (!trigger_data)
1203 goto out;
1204
1205 enable_data = kzalloc(sizeof(*enable_data), GFP_KERNEL);
1206 if (!enable_data) {
1207 kfree(trigger_data);
1208 goto out;
1209 }
1210
1211 trigger_data->count = -1;
1212 trigger_data->ops = trigger_ops;
1213 trigger_data->cmd_ops = cmd_ops;
1214 INIT_LIST_HEAD(&trigger_data->list);
1215 RCU_INIT_POINTER(trigger_data->filter, NULL);
1216
1217 enable_data->enable = enable;
1218 enable_data->file = event_enable_file;
1219 trigger_data->private_data = enable_data;
1220
1221 if (glob[0] == '!') {
1222 cmd_ops->unreg(glob+1, trigger_ops, trigger_data, file);
1223 kfree(trigger_data);
1224 kfree(enable_data);
1225 ret = 0;
1226 goto out;
1227 }
1228
1229 if (trigger) {
1230 number = strsep(&trigger, ":");
1231
1232 ret = -EINVAL;
1233 if (!strlen(number))
1234 goto out_free;
1235
1236 /*
1237 * We use the callback data field (which is a pointer)
1238 * as our counter.
1239 */
1240 ret = kstrtoul(number, 0, &trigger_data->count);
1241 if (ret)
1242 goto out_free;
1243 }
1244
1245 if (!param) /* if param is non-empty, it's supposed to be a filter */
1246 goto out_reg;
1247
1248 if (!cmd_ops->set_filter)
1249 goto out_reg;
1250
1251 ret = cmd_ops->set_filter(param, trigger_data, file);
1252 if (ret < 0)
1253 goto out_free;
1254
1255 out_reg:
1256 /* Don't let the target event's module unload while this trigger is registered */
1257 ret = try_module_get(event_enable_file->event_call->mod);
1258 if (!ret) {
1259 ret = -EBUSY;
1260 goto out_free;
1261 }
1262
1263 ret = trace_event_enable_disable(event_enable_file, 1, 1);
1264 if (ret < 0)
1265 goto out_put;
1266 ret = cmd_ops->reg(glob, trigger_ops, trigger_data, file);
1267 /*
1268 * On success, the above returns the number of triggers registered,
1269 * but if it registered none it returns zero.
1270 * Consider registering no triggers a failure too.
1271 */
1272 if (!ret) {
1273 ret = -ENOENT;
1274 goto out_disable;
1275 } else if (ret < 0)
1276 goto out_disable;
1277 /* Just return zero, not the number of registered triggers */
1278 ret = 0;
1279 out:
1280 return ret;
1281
1282 out_disable:
1283 trace_event_enable_disable(event_enable_file, 0, 1);
1284 out_put:
1285 module_put(event_enable_file->event_call->mod);
1286 out_free:
1287 if (cmd_ops->set_filter)
1288 cmd_ops->set_filter(NULL, trigger_data, NULL);
1289 kfree(trigger_data);
1290 kfree(enable_data);
1291 goto out;
1292}
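/*
 * Putting the parsing above together, an enable_event trigger takes the form
 * "<cmd>:<system>:<event>[:count] [if filter]". For example, writing
 * "enable_event:kmem:kmalloc:3" to another event's trigger file
 * (illustrative, e.g. via the write_trigger() sketch earlier) soft-enables
 * kmem:kmalloc for the next three hits of that event, and prefixing the same
 * string with '!' removes the trigger again.
 */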
1293
1294static int event_enable_register_trigger(char *glob,
1295 struct event_trigger_ops *ops,
1296 struct event_trigger_data *data,
1297 struct ftrace_event_file *file)
1298{
1299 struct enable_trigger_data *enable_data = data->private_data;
1300 struct enable_trigger_data *test_enable_data;
1301 struct event_trigger_data *test;
1302 int ret = 0;
1303
1304 list_for_each_entry_rcu(test, &file->triggers, list) {
1305 test_enable_data = test->private_data;
1306 if (test_enable_data &&
1307 (test_enable_data->file == enable_data->file)) {
1308 ret = -EEXIST;
1309 goto out;
1310 }
1311 }
1312
1313 if (data->ops->init) {
1314 ret = data->ops->init(data->ops, data);
1315 if (ret < 0)
1316 goto out;
1317 }
1318
1319 list_add_rcu(&data->list, &file->triggers);
1320 ret++;
1321
1322 if (trace_event_trigger_enable_disable(file, 1) < 0) {
1323 list_del_rcu(&data->list);
1324 ret--;
1325 }
1326 update_cond_flag(file);
1327out:
1328 return ret;
1329}
1330
1331static void event_enable_unregister_trigger(char *glob,
1332 struct event_trigger_ops *ops,
1333 struct event_trigger_data *test,
1334 struct ftrace_event_file *file)
1335{
1336 struct enable_trigger_data *test_enable_data = test->private_data;
1337 struct enable_trigger_data *enable_data;
1338 struct event_trigger_data *data;
1339 bool unregistered = false;
1340
1341 list_for_each_entry_rcu(data, &file->triggers, list) {
1342 enable_data = data->private_data;
1343 if (enable_data &&
1344 (enable_data->file == test_enable_data->file)) {
1345 unregistered = true;
1346 list_del_rcu(&data->list);
1347 update_cond_flag(file);
1348 trace_event_trigger_enable_disable(file, 0);
1349 break;
1350 }
1351 }
1352
1353 if (unregistered && data->ops->free)
1354 data->ops->free(data->ops, data);
1355}
1356
1357static struct event_trigger_ops *
1358event_enable_get_trigger_ops(char *cmd, char *param)
1359{
1360 struct event_trigger_ops *ops;
1361 bool enable;
1362
1363 enable = strcmp(cmd, ENABLE_EVENT_STR) == 0;
1364
1365 if (enable)
1366 ops = param ? &event_enable_count_trigger_ops :
1367 &event_enable_trigger_ops;
1368 else
1369 ops = param ? &event_disable_count_trigger_ops :
1370 &event_disable_trigger_ops;
1371
1372 return ops;
1373}
1374
1375static struct event_command trigger_enable_cmd = {
1376 .name = ENABLE_EVENT_STR,
1377 .trigger_type = ETT_EVENT_ENABLE,
1378 .func = event_enable_trigger_func,
1379 .reg = event_enable_register_trigger,
1380 .unreg = event_enable_unregister_trigger,
1381 .get_trigger_ops = event_enable_get_trigger_ops,
1382 .set_filter = set_trigger_filter,
1383};
1384
1385static struct event_command trigger_disable_cmd = {
1386 .name = DISABLE_EVENT_STR,
1387 .trigger_type = ETT_EVENT_ENABLE,
1388 .func = event_enable_trigger_func,
1389 .reg = event_enable_register_trigger,
1390 .unreg = event_enable_unregister_trigger,
1391 .get_trigger_ops = event_enable_get_trigger_ops,
1392 .set_filter = set_trigger_filter,
1393};
1394
1395static __init void unregister_trigger_enable_disable_cmds(void)
1396{
1397 unregister_event_command(&trigger_enable_cmd);
1398 unregister_event_command(&trigger_disable_cmd);
1399}
1400
1401static __init int register_trigger_enable_disable_cmds(void)
1402{
1403 int ret;
1404
1405 ret = register_event_command(&trigger_enable_cmd);
1406 if (WARN_ON(ret < 0))
1407 return ret;
1408 ret = register_event_command(&trigger_disable_cmd);
1409 if (WARN_ON(ret < 0))
1410 unregister_trigger_enable_disable_cmds();
1411
1412 return ret;
1413}
1414
1415static __init int register_trigger_traceon_traceoff_cmds(void)
1416{
1417 int ret;
1418
1419 ret = register_event_command(&trigger_traceon_cmd);
1420 if (WARN_ON(ret < 0))
1421 return ret;
1422 ret = register_event_command(&trigger_traceoff_cmd);
1423 if (WARN_ON(ret < 0))
1424 unregister_trigger_traceon_traceoff_cmds();
1425
1426 return ret;
1427}
1428
1429__init int register_trigger_cmds(void)
1430{
1431 register_trigger_traceon_traceoff_cmds();
1432 register_trigger_snapshot_cmd();
1433 register_trigger_stacktrace_cmd();
1434 register_trigger_enable_disable_cmds();
1435
1436 return 0;
1437}
diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c
index dae9541ada9e..bdbae450c13e 100644
--- a/kernel/trace/trace_kprobe.c
+++ b/kernel/trace/trace_kprobe.c
@@ -27,18 +27,12 @@
27/** 27/**
28 * Kprobe event core functions 28 * Kprobe event core functions
29 */ 29 */
30struct trace_probe { 30struct trace_kprobe {
31 struct list_head list; 31 struct list_head list;
32 struct kretprobe rp; /* Use rp.kp for kprobe use */ 32 struct kretprobe rp; /* Use rp.kp for kprobe use */
33 unsigned long nhit; 33 unsigned long nhit;
34 unsigned int flags; /* For TP_FLAG_* */
35 const char *symbol; /* symbol name */ 34 const char *symbol; /* symbol name */
36 struct ftrace_event_class class; 35 struct trace_probe tp;
37 struct ftrace_event_call call;
38 struct list_head files;
39 ssize_t size; /* trace entry size */
40 unsigned int nr_args;
41 struct probe_arg args[];
42}; 36};
43 37
44struct event_file_link { 38struct event_file_link {
@@ -46,56 +40,46 @@ struct event_file_link {
46 struct list_head list; 40 struct list_head list;
47}; 41};
48 42
49#define SIZEOF_TRACE_PROBE(n) \ 43#define SIZEOF_TRACE_KPROBE(n) \
50 (offsetof(struct trace_probe, args) + \ 44 (offsetof(struct trace_kprobe, tp.args) + \
51 (sizeof(struct probe_arg) * (n))) 45 (sizeof(struct probe_arg) * (n)))
52 46
53 47
54static __kprobes bool trace_probe_is_return(struct trace_probe *tp) 48static __kprobes bool trace_kprobe_is_return(struct trace_kprobe *tk)
55{ 49{
56 return tp->rp.handler != NULL; 50 return tk->rp.handler != NULL;
57} 51}
58 52
59static __kprobes const char *trace_probe_symbol(struct trace_probe *tp) 53static __kprobes const char *trace_kprobe_symbol(struct trace_kprobe *tk)
60{ 54{
61 return tp->symbol ? tp->symbol : "unknown"; 55 return tk->symbol ? tk->symbol : "unknown";
62} 56}
63 57
64static __kprobes unsigned long trace_probe_offset(struct trace_probe *tp) 58static __kprobes unsigned long trace_kprobe_offset(struct trace_kprobe *tk)
65{ 59{
66 return tp->rp.kp.offset; 60 return tk->rp.kp.offset;
67} 61}
68 62
69static __kprobes bool trace_probe_is_enabled(struct trace_probe *tp) 63static __kprobes bool trace_kprobe_has_gone(struct trace_kprobe *tk)
70{ 64{
71 return !!(tp->flags & (TP_FLAG_TRACE | TP_FLAG_PROFILE)); 65 return !!(kprobe_gone(&tk->rp.kp));
72} 66}
73 67
74static __kprobes bool trace_probe_is_registered(struct trace_probe *tp) 68static __kprobes bool trace_kprobe_within_module(struct trace_kprobe *tk,
75{ 69 struct module *mod)
76 return !!(tp->flags & TP_FLAG_REGISTERED);
77}
78
79static __kprobes bool trace_probe_has_gone(struct trace_probe *tp)
80{
81 return !!(kprobe_gone(&tp->rp.kp));
82}
83
84static __kprobes bool trace_probe_within_module(struct trace_probe *tp,
85 struct module *mod)
86{ 70{
87 int len = strlen(mod->name); 71 int len = strlen(mod->name);
88 const char *name = trace_probe_symbol(tp); 72 const char *name = trace_kprobe_symbol(tk);
89 return strncmp(mod->name, name, len) == 0 && name[len] == ':'; 73 return strncmp(mod->name, name, len) == 0 && name[len] == ':';
90} 74}
91 75
92static __kprobes bool trace_probe_is_on_module(struct trace_probe *tp) 76static __kprobes bool trace_kprobe_is_on_module(struct trace_kprobe *tk)
93{ 77{
94 return !!strchr(trace_probe_symbol(tp), ':'); 78 return !!strchr(trace_kprobe_symbol(tk), ':');
95} 79}
96 80
97static int register_probe_event(struct trace_probe *tp); 81static int register_kprobe_event(struct trace_kprobe *tk);
98static int unregister_probe_event(struct trace_probe *tp); 82static int unregister_kprobe_event(struct trace_kprobe *tk);
99 83
100static DEFINE_MUTEX(probe_lock); 84static DEFINE_MUTEX(probe_lock);
101static LIST_HEAD(probe_list); 85static LIST_HEAD(probe_list);
@@ -104,45 +88,224 @@ static int kprobe_dispatcher(struct kprobe *kp, struct pt_regs *regs);
104static int kretprobe_dispatcher(struct kretprobe_instance *ri, 88static int kretprobe_dispatcher(struct kretprobe_instance *ri,
105 struct pt_regs *regs); 89 struct pt_regs *regs);
106 90
91/* Memory fetching by symbol */
92struct symbol_cache {
93 char *symbol;
94 long offset;
95 unsigned long addr;
96};
97
98unsigned long update_symbol_cache(struct symbol_cache *sc)
99{
100 sc->addr = (unsigned long)kallsyms_lookup_name(sc->symbol);
101
102 if (sc->addr)
103 sc->addr += sc->offset;
104
105 return sc->addr;
106}
107
108void free_symbol_cache(struct symbol_cache *sc)
109{
110 kfree(sc->symbol);
111 kfree(sc);
112}
113
114struct symbol_cache *alloc_symbol_cache(const char *sym, long offset)
115{
116 struct symbol_cache *sc;
117
118 if (!sym || strlen(sym) == 0)
119 return NULL;
120
121 sc = kzalloc(sizeof(struct symbol_cache), GFP_KERNEL);
122 if (!sc)
123 return NULL;
124
125 sc->symbol = kstrdup(sym, GFP_KERNEL);
126 if (!sc->symbol) {
127 kfree(sc);
128 return NULL;
129 }
130 sc->offset = offset;
131 update_symbol_cache(sc);
132
133 return sc;
134}
135
136/*
137 * Kprobes-specific fetch functions
138 */
139#define DEFINE_FETCH_stack(type) \
140static __kprobes void FETCH_FUNC_NAME(stack, type)(struct pt_regs *regs,\
141 void *offset, void *dest) \
142{ \
143 *(type *)dest = (type)regs_get_kernel_stack_nth(regs, \
144 (unsigned int)((unsigned long)offset)); \
145}
146DEFINE_BASIC_FETCH_FUNCS(stack)
147/* No string on the stack entry */
148#define fetch_stack_string NULL
149#define fetch_stack_string_size NULL
150
151#define DEFINE_FETCH_memory(type) \
152static __kprobes void FETCH_FUNC_NAME(memory, type)(struct pt_regs *regs,\
153 void *addr, void *dest) \
154{ \
155 type retval; \
156 if (probe_kernel_address(addr, retval)) \
157 *(type *)dest = 0; \
158 else \
159 *(type *)dest = retval; \
160}
161DEFINE_BASIC_FETCH_FUNCS(memory)
162/*
163 * Fetch a null-terminated string. Caller MUST set *(u32 *)dest with max
164 * length and relative data location.
165 */
166static __kprobes void FETCH_FUNC_NAME(memory, string)(struct pt_regs *regs,
167 void *addr, void *dest)
168{
169 long ret;
170 int maxlen = get_rloc_len(*(u32 *)dest);
171 u8 *dst = get_rloc_data(dest);
172 u8 *src = addr;
173 mm_segment_t old_fs = get_fs();
174
175 if (!maxlen)
176 return;
177
178 /*
179 * Try to get string again, since the string can be changed while
180 * probing.
181 */
182 set_fs(KERNEL_DS);
183 pagefault_disable();
184
185 do
186 ret = __copy_from_user_inatomic(dst++, src++, 1);
187 while (dst[-1] && ret == 0 && src - (u8 *)addr < maxlen);
188
189 dst[-1] = '\0';
190 pagefault_enable();
191 set_fs(old_fs);
192
193 if (ret < 0) { /* Failed to fetch string */
194 ((u8 *)get_rloc_data(dest))[0] = '\0';
195 *(u32 *)dest = make_data_rloc(0, get_rloc_offs(*(u32 *)dest));
196 } else {
197 *(u32 *)dest = make_data_rloc(src - (u8 *)addr,
198 get_rloc_offs(*(u32 *)dest));
199 }
200}
201
202/* Return the length of the string, including the terminating null byte */
203static __kprobes void FETCH_FUNC_NAME(memory, string_size)(struct pt_regs *regs,
204 void *addr, void *dest)
205{
206 mm_segment_t old_fs;
207 int ret, len = 0;
208 u8 c;
209
210 old_fs = get_fs();
211 set_fs(KERNEL_DS);
212 pagefault_disable();
213
214 do {
215 ret = __copy_from_user_inatomic(&c, (u8 *)addr + len, 1);
216 len++;
217 } while (c && ret == 0 && len < MAX_STRING_SIZE);
218
219 pagefault_enable();
220 set_fs(old_fs);
221
222 if (ret < 0) /* Failed to check the length */
223 *(u32 *)dest = 0;
224 else
225 *(u32 *)dest = len;
226}
227
228#define DEFINE_FETCH_symbol(type) \
229__kprobes void FETCH_FUNC_NAME(symbol, type)(struct pt_regs *regs, \
230 void *data, void *dest) \
231{ \
232 struct symbol_cache *sc = data; \
233 if (sc->addr) \
234 fetch_memory_##type(regs, (void *)sc->addr, dest); \
235 else \
236 *(type *)dest = 0; \
237}
238DEFINE_BASIC_FETCH_FUNCS(symbol)
239DEFINE_FETCH_symbol(string)
240DEFINE_FETCH_symbol(string_size)
241
242/* kprobes don't support file_offset fetch methods */
243#define fetch_file_offset_u8 NULL
244#define fetch_file_offset_u16 NULL
245#define fetch_file_offset_u32 NULL
246#define fetch_file_offset_u64 NULL
247#define fetch_file_offset_string NULL
248#define fetch_file_offset_string_size NULL
249
250/* Fetch type information table */
251const struct fetch_type kprobes_fetch_type_table[] = {
252 /* Special types */
253 [FETCH_TYPE_STRING] = __ASSIGN_FETCH_TYPE("string", string, string,
254 sizeof(u32), 1, "__data_loc char[]"),
255 [FETCH_TYPE_STRSIZE] = __ASSIGN_FETCH_TYPE("string_size", u32,
256 string_size, sizeof(u32), 0, "u32"),
257 /* Basic types */
258 ASSIGN_FETCH_TYPE(u8, u8, 0),
259 ASSIGN_FETCH_TYPE(u16, u16, 0),
260 ASSIGN_FETCH_TYPE(u32, u32, 0),
261 ASSIGN_FETCH_TYPE(u64, u64, 0),
262 ASSIGN_FETCH_TYPE(s8, u8, 1),
263 ASSIGN_FETCH_TYPE(s16, u16, 1),
264 ASSIGN_FETCH_TYPE(s32, u32, 1),
265 ASSIGN_FETCH_TYPE(s64, u64, 1),
266
267 ASSIGN_FETCH_TYPE_END
268};
269
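/*
 * kprobes_fetch_type_table is what gives a kprobe argument its optional
 * ":TYPE" suffix: plain integers map to the basic u8..s64 entries above,
 * while ":string" pairs the string fetch with the string_size fetch so the
 * dynamic data length can be reserved before the copy. The file_offset
 * methods are stubbed out to NULL because they only make sense for uprobes.
 */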
107/* 270/*
108 * Allocate new trace_probe and initialize it (including kprobes). 271 * Allocate new trace_probe and initialize it (including kprobes).
109 */ 272 */
110static struct trace_probe *alloc_trace_probe(const char *group, 273static struct trace_kprobe *alloc_trace_kprobe(const char *group,
111 const char *event, 274 const char *event,
112 void *addr, 275 void *addr,
113 const char *symbol, 276 const char *symbol,
114 unsigned long offs, 277 unsigned long offs,
115 int nargs, bool is_return) 278 int nargs, bool is_return)
116{ 279{
117 struct trace_probe *tp; 280 struct trace_kprobe *tk;
118 int ret = -ENOMEM; 281 int ret = -ENOMEM;
119 282
120 tp = kzalloc(SIZEOF_TRACE_PROBE(nargs), GFP_KERNEL); 283 tk = kzalloc(SIZEOF_TRACE_KPROBE(nargs), GFP_KERNEL);
121 if (!tp) 284 if (!tk)
122 return ERR_PTR(ret); 285 return ERR_PTR(ret);
123 286
124 if (symbol) { 287 if (symbol) {
125 tp->symbol = kstrdup(symbol, GFP_KERNEL); 288 tk->symbol = kstrdup(symbol, GFP_KERNEL);
126 if (!tp->symbol) 289 if (!tk->symbol)
127 goto error; 290 goto error;
128 tp->rp.kp.symbol_name = tp->symbol; 291 tk->rp.kp.symbol_name = tk->symbol;
129 tp->rp.kp.offset = offs; 292 tk->rp.kp.offset = offs;
130 } else 293 } else
131 tp->rp.kp.addr = addr; 294 tk->rp.kp.addr = addr;
132 295
133 if (is_return) 296 if (is_return)
134 tp->rp.handler = kretprobe_dispatcher; 297 tk->rp.handler = kretprobe_dispatcher;
135 else 298 else
136 tp->rp.kp.pre_handler = kprobe_dispatcher; 299 tk->rp.kp.pre_handler = kprobe_dispatcher;
137 300
138 if (!event || !is_good_name(event)) { 301 if (!event || !is_good_name(event)) {
139 ret = -EINVAL; 302 ret = -EINVAL;
140 goto error; 303 goto error;
141 } 304 }
142 305
143 tp->call.class = &tp->class; 306 tk->tp.call.class = &tk->tp.class;
144 tp->call.name = kstrdup(event, GFP_KERNEL); 307 tk->tp.call.name = kstrdup(event, GFP_KERNEL);
145 if (!tp->call.name) 308 if (!tk->tp.call.name)
146 goto error; 309 goto error;
147 310
148 if (!group || !is_good_name(group)) { 311 if (!group || !is_good_name(group)) {
@@ -150,42 +313,42 @@ static struct trace_probe *alloc_trace_probe(const char *group,
150 goto error; 313 goto error;
151 } 314 }
152 315
153 tp->class.system = kstrdup(group, GFP_KERNEL); 316 tk->tp.class.system = kstrdup(group, GFP_KERNEL);
154 if (!tp->class.system) 317 if (!tk->tp.class.system)
155 goto error; 318 goto error;
156 319
157 INIT_LIST_HEAD(&tp->list); 320 INIT_LIST_HEAD(&tk->list);
158 INIT_LIST_HEAD(&tp->files); 321 INIT_LIST_HEAD(&tk->tp.files);
159 return tp; 322 return tk;
160error: 323error:
161 kfree(tp->call.name); 324 kfree(tk->tp.call.name);
162 kfree(tp->symbol); 325 kfree(tk->symbol);
163 kfree(tp); 326 kfree(tk);
164 return ERR_PTR(ret); 327 return ERR_PTR(ret);
165} 328}
166 329
167static void free_trace_probe(struct trace_probe *tp) 330static void free_trace_kprobe(struct trace_kprobe *tk)
168{ 331{
169 int i; 332 int i;
170 333
171 for (i = 0; i < tp->nr_args; i++) 334 for (i = 0; i < tk->tp.nr_args; i++)
172 traceprobe_free_probe_arg(&tp->args[i]); 335 traceprobe_free_probe_arg(&tk->tp.args[i]);
173 336
174 kfree(tp->call.class->system); 337 kfree(tk->tp.call.class->system);
175 kfree(tp->call.name); 338 kfree(tk->tp.call.name);
176 kfree(tp->symbol); 339 kfree(tk->symbol);
177 kfree(tp); 340 kfree(tk);
178} 341}
179 342
180static struct trace_probe *find_trace_probe(const char *event, 343static struct trace_kprobe *find_trace_kprobe(const char *event,
181 const char *group) 344 const char *group)
182{ 345{
183 struct trace_probe *tp; 346 struct trace_kprobe *tk;
184 347
185 list_for_each_entry(tp, &probe_list, list) 348 list_for_each_entry(tk, &probe_list, list)
186 if (strcmp(tp->call.name, event) == 0 && 349 if (strcmp(tk->tp.call.name, event) == 0 &&
187 strcmp(tp->call.class->system, group) == 0) 350 strcmp(tk->tp.call.class->system, group) == 0)
188 return tp; 351 return tk;
189 return NULL; 352 return NULL;
190} 353}
191 354
@@ -194,7 +357,7 @@ static struct trace_probe *find_trace_probe(const char *event,
194 * if the file is NULL, enable "perf" handler, or enable "trace" handler. 357 * if the file is NULL, enable "perf" handler, or enable "trace" handler.
195 */ 358 */
196static int 359static int
197enable_trace_probe(struct trace_probe *tp, struct ftrace_event_file *file) 360enable_trace_kprobe(struct trace_kprobe *tk, struct ftrace_event_file *file)
198{ 361{
199 int ret = 0; 362 int ret = 0;
200 363
@@ -208,17 +371,17 @@ enable_trace_probe(struct trace_probe *tp, struct ftrace_event_file *file)
208 } 371 }
209 372
210 link->file = file; 373 link->file = file;
211 list_add_tail_rcu(&link->list, &tp->files); 374 list_add_tail_rcu(&link->list, &tk->tp.files);
212 375
213 tp->flags |= TP_FLAG_TRACE; 376 tk->tp.flags |= TP_FLAG_TRACE;
214 } else 377 } else
215 tp->flags |= TP_FLAG_PROFILE; 378 tk->tp.flags |= TP_FLAG_PROFILE;
216 379
217 if (trace_probe_is_registered(tp) && !trace_probe_has_gone(tp)) { 380 if (trace_probe_is_registered(&tk->tp) && !trace_kprobe_has_gone(tk)) {
218 if (trace_probe_is_return(tp)) 381 if (trace_kprobe_is_return(tk))
219 ret = enable_kretprobe(&tp->rp); 382 ret = enable_kretprobe(&tk->rp);
220 else 383 else
221 ret = enable_kprobe(&tp->rp.kp); 384 ret = enable_kprobe(&tk->rp.kp);
222 } 385 }
223 out: 386 out:
224 return ret; 387 return ret;
@@ -241,14 +404,14 @@ find_event_file_link(struct trace_probe *tp, struct ftrace_event_file *file)
241 * if the file is NULL, disable "perf" handler, or disable "trace" handler. 404 * if the file is NULL, disable "perf" handler, or disable "trace" handler.
242 */ 405 */
243static int 406static int
244disable_trace_probe(struct trace_probe *tp, struct ftrace_event_file *file) 407disable_trace_kprobe(struct trace_kprobe *tk, struct ftrace_event_file *file)
245{ 408{
246 struct event_file_link *link = NULL; 409 struct event_file_link *link = NULL;
247 int wait = 0; 410 int wait = 0;
248 int ret = 0; 411 int ret = 0;
249 412
250 if (file) { 413 if (file) {
251 link = find_event_file_link(tp, file); 414 link = find_event_file_link(&tk->tp, file);
252 if (!link) { 415 if (!link) {
253 ret = -EINVAL; 416 ret = -EINVAL;
254 goto out; 417 goto out;
@@ -256,18 +419,18 @@ disable_trace_probe(struct trace_probe *tp, struct ftrace_event_file *file)
256 419
257 list_del_rcu(&link->list); 420 list_del_rcu(&link->list);
258 wait = 1; 421 wait = 1;
259 if (!list_empty(&tp->files)) 422 if (!list_empty(&tk->tp.files))
260 goto out; 423 goto out;
261 424
262 tp->flags &= ~TP_FLAG_TRACE; 425 tk->tp.flags &= ~TP_FLAG_TRACE;
263 } else 426 } else
264 tp->flags &= ~TP_FLAG_PROFILE; 427 tk->tp.flags &= ~TP_FLAG_PROFILE;
265 428
266 if (!trace_probe_is_enabled(tp) && trace_probe_is_registered(tp)) { 429 if (!trace_probe_is_enabled(&tk->tp) && trace_probe_is_registered(&tk->tp)) {
267 if (trace_probe_is_return(tp)) 430 if (trace_kprobe_is_return(tk))
268 disable_kretprobe(&tp->rp); 431 disable_kretprobe(&tk->rp);
269 else 432 else
270 disable_kprobe(&tp->rp.kp); 433 disable_kprobe(&tk->rp.kp);
271 wait = 1; 434 wait = 1;
272 } 435 }
273 out: 436 out:
@@ -288,40 +451,40 @@ disable_trace_probe(struct trace_probe *tp, struct ftrace_event_file *file)
288} 451}
289 452
290/* Internal register function - just handle k*probes and flags */ 453/* Internal register function - just handle k*probes and flags */
291static int __register_trace_probe(struct trace_probe *tp) 454static int __register_trace_kprobe(struct trace_kprobe *tk)
292{ 455{
293 int i, ret; 456 int i, ret;
294 457
295 if (trace_probe_is_registered(tp)) 458 if (trace_probe_is_registered(&tk->tp))
296 return -EINVAL; 459 return -EINVAL;
297 460
298 for (i = 0; i < tp->nr_args; i++) 461 for (i = 0; i < tk->tp.nr_args; i++)
299 traceprobe_update_arg(&tp->args[i]); 462 traceprobe_update_arg(&tk->tp.args[i]);
300 463
301 /* Set/clear disabled flag according to tp->flag */ 464 /* Set/clear disabled flag according to tp->flag */
302 if (trace_probe_is_enabled(tp)) 465 if (trace_probe_is_enabled(&tk->tp))
303 tp->rp.kp.flags &= ~KPROBE_FLAG_DISABLED; 466 tk->rp.kp.flags &= ~KPROBE_FLAG_DISABLED;
304 else 467 else
305 tp->rp.kp.flags |= KPROBE_FLAG_DISABLED; 468 tk->rp.kp.flags |= KPROBE_FLAG_DISABLED;
306 469
307 if (trace_probe_is_return(tp)) 470 if (trace_kprobe_is_return(tk))
308 ret = register_kretprobe(&tp->rp); 471 ret = register_kretprobe(&tk->rp);
309 else 472 else
310 ret = register_kprobe(&tp->rp.kp); 473 ret = register_kprobe(&tk->rp.kp);
311 474
312 if (ret == 0) 475 if (ret == 0)
313 tp->flags |= TP_FLAG_REGISTERED; 476 tk->tp.flags |= TP_FLAG_REGISTERED;
314 else { 477 else {
315 pr_warning("Could not insert probe at %s+%lu: %d\n", 478 pr_warning("Could not insert probe at %s+%lu: %d\n",
316 trace_probe_symbol(tp), trace_probe_offset(tp), ret); 479 trace_kprobe_symbol(tk), trace_kprobe_offset(tk), ret);
317 if (ret == -ENOENT && trace_probe_is_on_module(tp)) { 480 if (ret == -ENOENT && trace_kprobe_is_on_module(tk)) {
318 pr_warning("This probe might be able to register after" 481 pr_warning("This probe might be able to register after"
319 "target module is loaded. Continue.\n"); 482 "target module is loaded. Continue.\n");
320 ret = 0; 483 ret = 0;
321 } else if (ret == -EILSEQ) { 484 } else if (ret == -EILSEQ) {
322 pr_warning("Probing address(0x%p) is not an " 485 pr_warning("Probing address(0x%p) is not an "
323 "instruction boundary.\n", 486 "instruction boundary.\n",
324 tp->rp.kp.addr); 487 tk->rp.kp.addr);
325 ret = -EINVAL; 488 ret = -EINVAL;
326 } 489 }
327 } 490 }
@@ -330,67 +493,67 @@ static int __register_trace_probe(struct trace_probe *tp)
330} 493}
331 494
332/* Internal unregister function - just handle k*probes and flags */ 495/* Internal unregister function - just handle k*probes and flags */
333static void __unregister_trace_probe(struct trace_probe *tp) 496static void __unregister_trace_kprobe(struct trace_kprobe *tk)
334{ 497{
335 if (trace_probe_is_registered(tp)) { 498 if (trace_probe_is_registered(&tk->tp)) {
336 if (trace_probe_is_return(tp)) 499 if (trace_kprobe_is_return(tk))
337 unregister_kretprobe(&tp->rp); 500 unregister_kretprobe(&tk->rp);
338 else 501 else
339 unregister_kprobe(&tp->rp.kp); 502 unregister_kprobe(&tk->rp.kp);
340 tp->flags &= ~TP_FLAG_REGISTERED; 503 tk->tp.flags &= ~TP_FLAG_REGISTERED;
341 /* Cleanup kprobe for reuse */ 504 /* Cleanup kprobe for reuse */
342 if (tp->rp.kp.symbol_name) 505 if (tk->rp.kp.symbol_name)
343 tp->rp.kp.addr = NULL; 506 tk->rp.kp.addr = NULL;
344 } 507 }
345} 508}
346 509
347/* Unregister a trace_probe and probe_event: call with locking probe_lock */ 510/* Unregister a trace_probe and probe_event: call with locking probe_lock */
348static int unregister_trace_probe(struct trace_probe *tp) 511static int unregister_trace_kprobe(struct trace_kprobe *tk)
349{ 512{
350 /* Enabled event can not be unregistered */ 513 /* Enabled event can not be unregistered */
351 if (trace_probe_is_enabled(tp)) 514 if (trace_probe_is_enabled(&tk->tp))
352 return -EBUSY; 515 return -EBUSY;
353 516
354 /* Will fail if probe is being used by ftrace or perf */ 517 /* Will fail if probe is being used by ftrace or perf */
355 if (unregister_probe_event(tp)) 518 if (unregister_kprobe_event(tk))
356 return -EBUSY; 519 return -EBUSY;
357 520
358 __unregister_trace_probe(tp); 521 __unregister_trace_kprobe(tk);
359 list_del(&tp->list); 522 list_del(&tk->list);
360 523
361 return 0; 524 return 0;
362} 525}
363 526
364/* Register a trace_probe and probe_event */ 527/* Register a trace_probe and probe_event */
365static int register_trace_probe(struct trace_probe *tp) 528static int register_trace_kprobe(struct trace_kprobe *tk)
366{ 529{
367 struct trace_probe *old_tp; 530 struct trace_kprobe *old_tk;
368 int ret; 531 int ret;
369 532
370 mutex_lock(&probe_lock); 533 mutex_lock(&probe_lock);
371 534
372 /* Delete old (same name) event if exist */ 535 /* Delete old (same name) event if exist */
373 old_tp = find_trace_probe(tp->call.name, tp->call.class->system); 536 old_tk = find_trace_kprobe(tk->tp.call.name, tk->tp.call.class->system);
374 if (old_tp) { 537 if (old_tk) {
375 ret = unregister_trace_probe(old_tp); 538 ret = unregister_trace_kprobe(old_tk);
376 if (ret < 0) 539 if (ret < 0)
377 goto end; 540 goto end;
378 free_trace_probe(old_tp); 541 free_trace_kprobe(old_tk);
379 } 542 }
380 543
381 /* Register new event */ 544 /* Register new event */
382 ret = register_probe_event(tp); 545 ret = register_kprobe_event(tk);
383 if (ret) { 546 if (ret) {
384 pr_warning("Failed to register probe event(%d)\n", ret); 547 pr_warning("Failed to register probe event(%d)\n", ret);
385 goto end; 548 goto end;
386 } 549 }
387 550
388 /* Register k*probe */ 551 /* Register k*probe */
389 ret = __register_trace_probe(tp); 552 ret = __register_trace_kprobe(tk);
390 if (ret < 0) 553 if (ret < 0)
391 unregister_probe_event(tp); 554 unregister_kprobe_event(tk);
392 else 555 else
393 list_add_tail(&tp->list, &probe_list); 556 list_add_tail(&tk->list, &probe_list);
394 557
395end: 558end:
396 mutex_unlock(&probe_lock); 559 mutex_unlock(&probe_lock);
@@ -398,11 +561,11 @@ end:
398} 561}
399 562
400/* Module notifier call back, checking event on the module */ 563/* Module notifier call back, checking event on the module */
401static int trace_probe_module_callback(struct notifier_block *nb, 564static int trace_kprobe_module_callback(struct notifier_block *nb,
402 unsigned long val, void *data) 565 unsigned long val, void *data)
403{ 566{
404 struct module *mod = data; 567 struct module *mod = data;
405 struct trace_probe *tp; 568 struct trace_kprobe *tk;
406 int ret; 569 int ret;
407 570
408 if (val != MODULE_STATE_COMING) 571 if (val != MODULE_STATE_COMING)
@@ -410,15 +573,15 @@ static int trace_probe_module_callback(struct notifier_block *nb,
410 573
411 /* Update probes on coming module */ 574 /* Update probes on coming module */
412 mutex_lock(&probe_lock); 575 mutex_lock(&probe_lock);
413 list_for_each_entry(tp, &probe_list, list) { 576 list_for_each_entry(tk, &probe_list, list) {
414 if (trace_probe_within_module(tp, mod)) { 577 if (trace_kprobe_within_module(tk, mod)) {
415 /* Don't need to check busy - this should have gone. */ 578 /* Don't need to check busy - this should have gone. */
416 __unregister_trace_probe(tp); 579 __unregister_trace_kprobe(tk);
417 ret = __register_trace_probe(tp); 580 ret = __register_trace_kprobe(tk);
418 if (ret) 581 if (ret)
419 pr_warning("Failed to re-register probe %s on" 582 pr_warning("Failed to re-register probe %s on"
420 "%s: %d\n", 583 "%s: %d\n",
421 tp->call.name, mod->name, ret); 584 tk->tp.call.name, mod->name, ret);
422 } 585 }
423 } 586 }
424 mutex_unlock(&probe_lock); 587 mutex_unlock(&probe_lock);
@@ -426,12 +589,12 @@ static int trace_probe_module_callback(struct notifier_block *nb,
426 return NOTIFY_DONE; 589 return NOTIFY_DONE;
427} 590}
428 591
429static struct notifier_block trace_probe_module_nb = { 592static struct notifier_block trace_kprobe_module_nb = {
430 .notifier_call = trace_probe_module_callback, 593 .notifier_call = trace_kprobe_module_callback,
431 .priority = 1 /* Invoked after kprobe module callback */ 594 .priority = 1 /* Invoked after kprobe module callback */
432}; 595};
433 596
434static int create_trace_probe(int argc, char **argv) 597static int create_trace_kprobe(int argc, char **argv)
435{ 598{
436 /* 599 /*
437 * Argument syntax: 600 * Argument syntax:
@@ -451,7 +614,7 @@ static int create_trace_probe(int argc, char **argv)
451 * Type of args: 614 * Type of args:
452 * FETCHARG:TYPE : use TYPE instead of unsigned long. 615 * FETCHARG:TYPE : use TYPE instead of unsigned long.
453 */ 616 */
454 struct trace_probe *tp; 617 struct trace_kprobe *tk;
455 int i, ret = 0; 618 int i, ret = 0;
456 bool is_return = false, is_delete = false; 619 bool is_return = false, is_delete = false;
457 char *symbol = NULL, *event = NULL, *group = NULL; 620 char *symbol = NULL, *event = NULL, *group = NULL;
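/*
 * create_trace_kprobe() is reached by writing a probe definition to the
 * tracefs "kprobe_events" file. A minimal userspace sketch, assuming an
 * x86-64 machine with tracing mounted at /sys/kernel/debug/tracing; the
 * event name "myprobe" and the register choices are illustrative only.
 */
#include <fcntl.h>
#include <string.h>
#include <unistd.h>

int main(void)
{
	/* "+0(%si):string" uses the string fetch type from
	 * kprobes_fetch_type_table to copy the filename argument. */
	const char *def =
		"p:myprobe do_sys_open dfd=%di filename=+0(%si):string\n";
	int fd;

	fd = open("/sys/kernel/debug/tracing/kprobe_events",
		  O_WRONLY | O_APPEND);
	if (fd < 0)
		return 1;
	if (write(fd, def, strlen(def)) < 0) {
		close(fd);
		return 1;
	}
	close(fd);

	/* Enabling the new event ends up in enable_trace_kprobe(). */
	fd = open("/sys/kernel/debug/tracing/events/kprobes/myprobe/enable",
		  O_WRONLY);
	if (fd < 0)
		return 1;
	write(fd, "1", 1);
	close(fd);
	return 0;
}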
@@ -498,16 +661,16 @@ static int create_trace_probe(int argc, char **argv)
498 return -EINVAL; 661 return -EINVAL;
499 } 662 }
500 mutex_lock(&probe_lock); 663 mutex_lock(&probe_lock);
501 tp = find_trace_probe(event, group); 664 tk = find_trace_kprobe(event, group);
502 if (!tp) { 665 if (!tk) {
503 mutex_unlock(&probe_lock); 666 mutex_unlock(&probe_lock);
504 pr_info("Event %s/%s doesn't exist.\n", group, event); 667 pr_info("Event %s/%s doesn't exist.\n", group, event);
505 return -ENOENT; 668 return -ENOENT;
506 } 669 }
507 /* delete an event */ 670 /* delete an event */
508 ret = unregister_trace_probe(tp); 671 ret = unregister_trace_kprobe(tk);
509 if (ret == 0) 672 if (ret == 0)
510 free_trace_probe(tp); 673 free_trace_kprobe(tk);
511 mutex_unlock(&probe_lock); 674 mutex_unlock(&probe_lock);
512 return ret; 675 return ret;
513 } 676 }
@@ -554,47 +717,49 @@ static int create_trace_probe(int argc, char **argv)
554 is_return ? 'r' : 'p', addr); 717 is_return ? 'r' : 'p', addr);
555 event = buf; 718 event = buf;
556 } 719 }
557 tp = alloc_trace_probe(group, event, addr, symbol, offset, argc, 720 tk = alloc_trace_kprobe(group, event, addr, symbol, offset, argc,
558 is_return); 721 is_return);
559 if (IS_ERR(tp)) { 722 if (IS_ERR(tk)) {
560 pr_info("Failed to allocate trace_probe.(%d)\n", 723 pr_info("Failed to allocate trace_probe.(%d)\n",
561 (int)PTR_ERR(tp)); 724 (int)PTR_ERR(tk));
562 return PTR_ERR(tp); 725 return PTR_ERR(tk);
563 } 726 }
564 727
565 /* parse arguments */ 728 /* parse arguments */
566 ret = 0; 729 ret = 0;
567 for (i = 0; i < argc && i < MAX_TRACE_ARGS; i++) { 730 for (i = 0; i < argc && i < MAX_TRACE_ARGS; i++) {
731 struct probe_arg *parg = &tk->tp.args[i];
732
568 /* Increment count for freeing args in error case */ 733 /* Increment count for freeing args in error case */
569 tp->nr_args++; 734 tk->tp.nr_args++;
570 735
571 /* Parse argument name */ 736 /* Parse argument name */
572 arg = strchr(argv[i], '='); 737 arg = strchr(argv[i], '=');
573 if (arg) { 738 if (arg) {
574 *arg++ = '\0'; 739 *arg++ = '\0';
575 tp->args[i].name = kstrdup(argv[i], GFP_KERNEL); 740 parg->name = kstrdup(argv[i], GFP_KERNEL);
576 } else { 741 } else {
577 arg = argv[i]; 742 arg = argv[i];
578 /* If argument name is omitted, set "argN" */ 743 /* If argument name is omitted, set "argN" */
579 snprintf(buf, MAX_EVENT_NAME_LEN, "arg%d", i + 1); 744 snprintf(buf, MAX_EVENT_NAME_LEN, "arg%d", i + 1);
580 tp->args[i].name = kstrdup(buf, GFP_KERNEL); 745 parg->name = kstrdup(buf, GFP_KERNEL);
581 } 746 }
582 747
583 if (!tp->args[i].name) { 748 if (!parg->name) {
584 pr_info("Failed to allocate argument[%d] name.\n", i); 749 pr_info("Failed to allocate argument[%d] name.\n", i);
585 ret = -ENOMEM; 750 ret = -ENOMEM;
586 goto error; 751 goto error;
587 } 752 }
588 753
589 if (!is_good_name(tp->args[i].name)) { 754 if (!is_good_name(parg->name)) {
590 pr_info("Invalid argument[%d] name: %s\n", 755 pr_info("Invalid argument[%d] name: %s\n",
591 i, tp->args[i].name); 756 i, parg->name);
592 ret = -EINVAL; 757 ret = -EINVAL;
593 goto error; 758 goto error;
594 } 759 }
595 760
596 if (traceprobe_conflict_field_name(tp->args[i].name, 761 if (traceprobe_conflict_field_name(parg->name,
597 tp->args, i)) { 762 tk->tp.args, i)) {
598 pr_info("Argument[%d] name '%s' conflicts with " 763 pr_info("Argument[%d] name '%s' conflicts with "
599 "another field.\n", i, argv[i]); 764 "another field.\n", i, argv[i]);
600 ret = -EINVAL; 765 ret = -EINVAL;
@@ -602,7 +767,7 @@ static int create_trace_probe(int argc, char **argv)
602 } 767 }
603 768
604 /* Parse fetch argument */ 769 /* Parse fetch argument */
605 ret = traceprobe_parse_probe_arg(arg, &tp->size, &tp->args[i], 770 ret = traceprobe_parse_probe_arg(arg, &tk->tp.size, parg,
606 is_return, true); 771 is_return, true);
607 if (ret) { 772 if (ret) {
608 pr_info("Parse error at argument[%d]. (%d)\n", i, ret); 773 pr_info("Parse error at argument[%d]. (%d)\n", i, ret);
@@ -610,35 +775,35 @@ static int create_trace_probe(int argc, char **argv)
610 } 775 }
611 } 776 }
612 777
613 ret = register_trace_probe(tp); 778 ret = register_trace_kprobe(tk);
614 if (ret) 779 if (ret)
615 goto error; 780 goto error;
616 return 0; 781 return 0;
617 782
618error: 783error:
619 free_trace_probe(tp); 784 free_trace_kprobe(tk);
620 return ret; 785 return ret;
621} 786}
622 787
623static int release_all_trace_probes(void) 788static int release_all_trace_kprobes(void)
624{ 789{
625 struct trace_probe *tp; 790 struct trace_kprobe *tk;
626 int ret = 0; 791 int ret = 0;
627 792
628 mutex_lock(&probe_lock); 793 mutex_lock(&probe_lock);
629 /* Ensure no probe is in use. */ 794 /* Ensure no probe is in use. */
630 list_for_each_entry(tp, &probe_list, list) 795 list_for_each_entry(tk, &probe_list, list)
631 if (trace_probe_is_enabled(tp)) { 796 if (trace_probe_is_enabled(&tk->tp)) {
632 ret = -EBUSY; 797 ret = -EBUSY;
633 goto end; 798 goto end;
634 } 799 }
635 /* TODO: Use batch unregistration */ 800 /* TODO: Use batch unregistration */
636 while (!list_empty(&probe_list)) { 801 while (!list_empty(&probe_list)) {
637 tp = list_entry(probe_list.next, struct trace_probe, list); 802 tk = list_entry(probe_list.next, struct trace_kprobe, list);
638 ret = unregister_trace_probe(tp); 803 ret = unregister_trace_kprobe(tk);
639 if (ret) 804 if (ret)
640 goto end; 805 goto end;
641 free_trace_probe(tp); 806 free_trace_kprobe(tk);
642 } 807 }
643 808
644end: 809end:
@@ -666,22 +831,22 @@ static void probes_seq_stop(struct seq_file *m, void *v)
666 831
667static int probes_seq_show(struct seq_file *m, void *v) 832static int probes_seq_show(struct seq_file *m, void *v)
668{ 833{
669 struct trace_probe *tp = v; 834 struct trace_kprobe *tk = v;
670 int i; 835 int i;
671 836
672 seq_printf(m, "%c", trace_probe_is_return(tp) ? 'r' : 'p'); 837 seq_printf(m, "%c", trace_kprobe_is_return(tk) ? 'r' : 'p');
673 seq_printf(m, ":%s/%s", tp->call.class->system, tp->call.name); 838 seq_printf(m, ":%s/%s", tk->tp.call.class->system, tk->tp.call.name);
674 839
675 if (!tp->symbol) 840 if (!tk->symbol)
676 seq_printf(m, " 0x%p", tp->rp.kp.addr); 841 seq_printf(m, " 0x%p", tk->rp.kp.addr);
677 else if (tp->rp.kp.offset) 842 else if (tk->rp.kp.offset)
678 seq_printf(m, " %s+%u", trace_probe_symbol(tp), 843 seq_printf(m, " %s+%u", trace_kprobe_symbol(tk),
679 tp->rp.kp.offset); 844 tk->rp.kp.offset);
680 else 845 else
681 seq_printf(m, " %s", trace_probe_symbol(tp)); 846 seq_printf(m, " %s", trace_kprobe_symbol(tk));
682 847
683 for (i = 0; i < tp->nr_args; i++) 848 for (i = 0; i < tk->tp.nr_args; i++)
684 seq_printf(m, " %s=%s", tp->args[i].name, tp->args[i].comm); 849 seq_printf(m, " %s=%s", tk->tp.args[i].name, tk->tp.args[i].comm);
685 seq_printf(m, "\n"); 850 seq_printf(m, "\n");
686 851
687 return 0; 852 return 0;
@@ -699,7 +864,7 @@ static int probes_open(struct inode *inode, struct file *file)
699 int ret; 864 int ret;
700 865
701 if ((file->f_mode & FMODE_WRITE) && (file->f_flags & O_TRUNC)) { 866 if ((file->f_mode & FMODE_WRITE) && (file->f_flags & O_TRUNC)) {
702 ret = release_all_trace_probes(); 867 ret = release_all_trace_kprobes();
703 if (ret < 0) 868 if (ret < 0)
704 return ret; 869 return ret;
705 } 870 }
@@ -711,7 +876,7 @@ static ssize_t probes_write(struct file *file, const char __user *buffer,
711 size_t count, loff_t *ppos) 876 size_t count, loff_t *ppos)
712{ 877{
713 return traceprobe_probes_write(file, buffer, count, ppos, 878 return traceprobe_probes_write(file, buffer, count, ppos,
714 create_trace_probe); 879 create_trace_kprobe);
715} 880}
716 881
717static const struct file_operations kprobe_events_ops = { 882static const struct file_operations kprobe_events_ops = {
@@ -726,10 +891,10 @@ static const struct file_operations kprobe_events_ops = {
726/* Probes profiling interfaces */ 891/* Probes profiling interfaces */
727static int probes_profile_seq_show(struct seq_file *m, void *v) 892static int probes_profile_seq_show(struct seq_file *m, void *v)
728{ 893{
729 struct trace_probe *tp = v; 894 struct trace_kprobe *tk = v;
730 895
731 seq_printf(m, " %-44s %15lu %15lu\n", tp->call.name, tp->nhit, 896 seq_printf(m, " %-44s %15lu %15lu\n", tk->tp.call.name, tk->nhit,
732 tp->rp.kp.nmissed); 897 tk->rp.kp.nmissed);
733 898
734 return 0; 899 return 0;
735} 900}
@@ -754,57 +919,9 @@ static const struct file_operations kprobe_profile_ops = {
754 .release = seq_release, 919 .release = seq_release,
755}; 920};
756 921
757/* Sum up total data length for dynamic arrays (strings) */
758static __kprobes int __get_data_size(struct trace_probe *tp,
759 struct pt_regs *regs)
760{
761 int i, ret = 0;
762 u32 len;
763
764 for (i = 0; i < tp->nr_args; i++)
765 if (unlikely(tp->args[i].fetch_size.fn)) {
766 call_fetch(&tp->args[i].fetch_size, regs, &len);
767 ret += len;
768 }
769
770 return ret;
771}
772
773/* Store the value of each argument */
774static __kprobes void store_trace_args(int ent_size, struct trace_probe *tp,
775 struct pt_regs *regs,
776 u8 *data, int maxlen)
777{
778 int i;
779 u32 end = tp->size;
780 u32 *dl; /* Data (relative) location */
781
782 for (i = 0; i < tp->nr_args; i++) {
783 if (unlikely(tp->args[i].fetch_size.fn)) {
784 /*
785 * First, we set the relative location and
786 * maximum data length to *dl
787 */
788 dl = (u32 *)(data + tp->args[i].offset);
789 *dl = make_data_rloc(maxlen, end - tp->args[i].offset);
790 /* Then try to fetch string or dynamic array data */
791 call_fetch(&tp->args[i].fetch, regs, dl);
792 /* Reduce maximum length */
793 end += get_rloc_len(*dl);
794 maxlen -= get_rloc_len(*dl);
795 /* Trick here, convert data_rloc to data_loc */
796 *dl = convert_rloc_to_loc(*dl,
797 ent_size + tp->args[i].offset);
798 } else
799 /* Just fetching data normally */
800 call_fetch(&tp->args[i].fetch, regs,
801 data + tp->args[i].offset);
802 }
803}
804
805/* Kprobe handler */ 922/* Kprobe handler */
806static __kprobes void 923static __kprobes void
807__kprobe_trace_func(struct trace_probe *tp, struct pt_regs *regs, 924__kprobe_trace_func(struct trace_kprobe *tk, struct pt_regs *regs,
808 struct ftrace_event_file *ftrace_file) 925 struct ftrace_event_file *ftrace_file)
809{ 926{
810 struct kprobe_trace_entry_head *entry; 927 struct kprobe_trace_entry_head *entry;
@@ -812,18 +929,18 @@ __kprobe_trace_func(struct trace_probe *tp, struct pt_regs *regs,
812 struct ring_buffer *buffer; 929 struct ring_buffer *buffer;
813 int size, dsize, pc; 930 int size, dsize, pc;
814 unsigned long irq_flags; 931 unsigned long irq_flags;
815 struct ftrace_event_call *call = &tp->call; 932 struct ftrace_event_call *call = &tk->tp.call;
816 933
817 WARN_ON(call != ftrace_file->event_call); 934 WARN_ON(call != ftrace_file->event_call);
818 935
819 if (test_bit(FTRACE_EVENT_FL_SOFT_DISABLED_BIT, &ftrace_file->flags)) 936 if (ftrace_trigger_soft_disabled(ftrace_file))
820 return; 937 return;
821 938
822 local_save_flags(irq_flags); 939 local_save_flags(irq_flags);
823 pc = preempt_count(); 940 pc = preempt_count();
824 941
825 dsize = __get_data_size(tp, regs); 942 dsize = __get_data_size(&tk->tp, regs);
826 size = sizeof(*entry) + tp->size + dsize; 943 size = sizeof(*entry) + tk->tp.size + dsize;
827 944
828 event = trace_event_buffer_lock_reserve(&buffer, ftrace_file, 945 event = trace_event_buffer_lock_reserve(&buffer, ftrace_file,
829 call->event.type, 946 call->event.type,
@@ -832,26 +949,25 @@ __kprobe_trace_func(struct trace_probe *tp, struct pt_regs *regs,
832 return; 949 return;
833 950
834 entry = ring_buffer_event_data(event); 951 entry = ring_buffer_event_data(event);
835 entry->ip = (unsigned long)tp->rp.kp.addr; 952 entry->ip = (unsigned long)tk->rp.kp.addr;
836 store_trace_args(sizeof(*entry), tp, regs, (u8 *)&entry[1], dsize); 953 store_trace_args(sizeof(*entry), &tk->tp, regs, (u8 *)&entry[1], dsize);
837 954
838 if (!filter_check_discard(ftrace_file, entry, buffer, event)) 955 event_trigger_unlock_commit_regs(ftrace_file, buffer, event,
839 trace_buffer_unlock_commit_regs(buffer, event, 956 entry, irq_flags, pc, regs);
840 irq_flags, pc, regs);
841} 957}
842 958
843static __kprobes void 959static __kprobes void
844kprobe_trace_func(struct trace_probe *tp, struct pt_regs *regs) 960kprobe_trace_func(struct trace_kprobe *tk, struct pt_regs *regs)
845{ 961{
846 struct event_file_link *link; 962 struct event_file_link *link;
847 963
848 list_for_each_entry_rcu(link, &tp->files, list) 964 list_for_each_entry_rcu(link, &tk->tp.files, list)
849 __kprobe_trace_func(tp, regs, link->file); 965 __kprobe_trace_func(tk, regs, link->file);
850} 966}
851 967
852/* Kretprobe handler */ 968/* Kretprobe handler */
853static __kprobes void 969static __kprobes void
854__kretprobe_trace_func(struct trace_probe *tp, struct kretprobe_instance *ri, 970__kretprobe_trace_func(struct trace_kprobe *tk, struct kretprobe_instance *ri,
855 struct pt_regs *regs, 971 struct pt_regs *regs,
856 struct ftrace_event_file *ftrace_file) 972 struct ftrace_event_file *ftrace_file)
857{ 973{
@@ -860,18 +976,18 @@ __kretprobe_trace_func(struct trace_probe *tp, struct kretprobe_instance *ri,
860 struct ring_buffer *buffer; 976 struct ring_buffer *buffer;
861 int size, pc, dsize; 977 int size, pc, dsize;
862 unsigned long irq_flags; 978 unsigned long irq_flags;
863 struct ftrace_event_call *call = &tp->call; 979 struct ftrace_event_call *call = &tk->tp.call;
864 980
865 WARN_ON(call != ftrace_file->event_call); 981 WARN_ON(call != ftrace_file->event_call);
866 982
867 if (test_bit(FTRACE_EVENT_FL_SOFT_DISABLED_BIT, &ftrace_file->flags)) 983 if (ftrace_trigger_soft_disabled(ftrace_file))
868 return; 984 return;
869 985
870 local_save_flags(irq_flags); 986 local_save_flags(irq_flags);
871 pc = preempt_count(); 987 pc = preempt_count();
872 988
873 dsize = __get_data_size(tp, regs); 989 dsize = __get_data_size(&tk->tp, regs);
874 size = sizeof(*entry) + tp->size + dsize; 990 size = sizeof(*entry) + tk->tp.size + dsize;
875 991
876 event = trace_event_buffer_lock_reserve(&buffer, ftrace_file, 992 event = trace_event_buffer_lock_reserve(&buffer, ftrace_file,
877 call->event.type, 993 call->event.type,
@@ -880,23 +996,22 @@ __kretprobe_trace_func(struct trace_probe *tp, struct kretprobe_instance *ri,
880 return; 996 return;
881 997
882 entry = ring_buffer_event_data(event); 998 entry = ring_buffer_event_data(event);
883 entry->func = (unsigned long)tp->rp.kp.addr; 999 entry->func = (unsigned long)tk->rp.kp.addr;
884 entry->ret_ip = (unsigned long)ri->ret_addr; 1000 entry->ret_ip = (unsigned long)ri->ret_addr;
885 store_trace_args(sizeof(*entry), tp, regs, (u8 *)&entry[1], dsize); 1001 store_trace_args(sizeof(*entry), &tk->tp, regs, (u8 *)&entry[1], dsize);
886 1002
887 if (!filter_check_discard(ftrace_file, entry, buffer, event)) 1003 event_trigger_unlock_commit_regs(ftrace_file, buffer, event,
888 trace_buffer_unlock_commit_regs(buffer, event, 1004 entry, irq_flags, pc, regs);
889 irq_flags, pc, regs);
890} 1005}
891 1006
892static __kprobes void 1007static __kprobes void
893kretprobe_trace_func(struct trace_probe *tp, struct kretprobe_instance *ri, 1008kretprobe_trace_func(struct trace_kprobe *tk, struct kretprobe_instance *ri,
894 struct pt_regs *regs) 1009 struct pt_regs *regs)
895{ 1010{
896 struct event_file_link *link; 1011 struct event_file_link *link;
897 1012
898 list_for_each_entry_rcu(link, &tp->files, list) 1013 list_for_each_entry_rcu(link, &tk->tp.files, list)
899 __kretprobe_trace_func(tp, ri, regs, link->file); 1014 __kretprobe_trace_func(tk, ri, regs, link->file);
900} 1015}
901 1016
902/* Event entry printers */ 1017/* Event entry printers */
@@ -983,16 +1098,18 @@ static int kprobe_event_define_fields(struct ftrace_event_call *event_call)
983{ 1098{
984 int ret, i; 1099 int ret, i;
985 struct kprobe_trace_entry_head field; 1100 struct kprobe_trace_entry_head field;
986 struct trace_probe *tp = (struct trace_probe *)event_call->data; 1101 struct trace_kprobe *tk = (struct trace_kprobe *)event_call->data;
987 1102
988 DEFINE_FIELD(unsigned long, ip, FIELD_STRING_IP, 0); 1103 DEFINE_FIELD(unsigned long, ip, FIELD_STRING_IP, 0);
989 /* Set argument names as fields */ 1104 /* Set argument names as fields */
990 for (i = 0; i < tp->nr_args; i++) { 1105 for (i = 0; i < tk->tp.nr_args; i++) {
991 ret = trace_define_field(event_call, tp->args[i].type->fmttype, 1106 struct probe_arg *parg = &tk->tp.args[i];
992 tp->args[i].name, 1107
993 sizeof(field) + tp->args[i].offset, 1108 ret = trace_define_field(event_call, parg->type->fmttype,
994 tp->args[i].type->size, 1109 parg->name,
995 tp->args[i].type->is_signed, 1110 sizeof(field) + parg->offset,
1111 parg->type->size,
1112 parg->type->is_signed,
996 FILTER_OTHER); 1113 FILTER_OTHER);
997 if (ret) 1114 if (ret)
998 return ret; 1115 return ret;
@@ -1004,17 +1121,19 @@ static int kretprobe_event_define_fields(struct ftrace_event_call *event_call)
1004{ 1121{
1005 int ret, i; 1122 int ret, i;
1006 struct kretprobe_trace_entry_head field; 1123 struct kretprobe_trace_entry_head field;
1007 struct trace_probe *tp = (struct trace_probe *)event_call->data; 1124 struct trace_kprobe *tk = (struct trace_kprobe *)event_call->data;
1008 1125
1009 DEFINE_FIELD(unsigned long, func, FIELD_STRING_FUNC, 0); 1126 DEFINE_FIELD(unsigned long, func, FIELD_STRING_FUNC, 0);
1010 DEFINE_FIELD(unsigned long, ret_ip, FIELD_STRING_RETIP, 0); 1127 DEFINE_FIELD(unsigned long, ret_ip, FIELD_STRING_RETIP, 0);
1011 /* Set argument names as fields */ 1128 /* Set argument names as fields */
1012 for (i = 0; i < tp->nr_args; i++) { 1129 for (i = 0; i < tk->tp.nr_args; i++) {
1013 ret = trace_define_field(event_call, tp->args[i].type->fmttype, 1130 struct probe_arg *parg = &tk->tp.args[i];
1014 tp->args[i].name, 1131
1015 sizeof(field) + tp->args[i].offset, 1132 ret = trace_define_field(event_call, parg->type->fmttype,
1016 tp->args[i].type->size, 1133 parg->name,
1017 tp->args[i].type->is_signed, 1134 sizeof(field) + parg->offset,
1135 parg->type->size,
1136 parg->type->is_signed,
1018 FILTER_OTHER); 1137 FILTER_OTHER);
1019 if (ret) 1138 if (ret)
1020 return ret; 1139 return ret;
@@ -1022,74 +1141,13 @@ static int kretprobe_event_define_fields(struct ftrace_event_call *event_call)
1022 return 0; 1141 return 0;
1023} 1142}
1024 1143
1025static int __set_print_fmt(struct trace_probe *tp, char *buf, int len)
1026{
1027 int i;
1028 int pos = 0;
1029
1030 const char *fmt, *arg;
1031
1032 if (!trace_probe_is_return(tp)) {
1033 fmt = "(%lx)";
1034 arg = "REC->" FIELD_STRING_IP;
1035 } else {
1036 fmt = "(%lx <- %lx)";
1037 arg = "REC->" FIELD_STRING_FUNC ", REC->" FIELD_STRING_RETIP;
1038 }
1039
1040 /* When len=0, we just calculate the needed length */
1041#define LEN_OR_ZERO (len ? len - pos : 0)
1042
1043 pos += snprintf(buf + pos, LEN_OR_ZERO, "\"%s", fmt);
1044
1045 for (i = 0; i < tp->nr_args; i++) {
1046 pos += snprintf(buf + pos, LEN_OR_ZERO, " %s=%s",
1047 tp->args[i].name, tp->args[i].type->fmt);
1048 }
1049
1050 pos += snprintf(buf + pos, LEN_OR_ZERO, "\", %s", arg);
1051
1052 for (i = 0; i < tp->nr_args; i++) {
1053 if (strcmp(tp->args[i].type->name, "string") == 0)
1054 pos += snprintf(buf + pos, LEN_OR_ZERO,
1055 ", __get_str(%s)",
1056 tp->args[i].name);
1057 else
1058 pos += snprintf(buf + pos, LEN_OR_ZERO, ", REC->%s",
1059 tp->args[i].name);
1060 }
1061
1062#undef LEN_OR_ZERO
1063
1064 /* return the length of print_fmt */
1065 return pos;
1066}
1067
1068static int set_print_fmt(struct trace_probe *tp)
1069{
1070 int len;
1071 char *print_fmt;
1072
1073 /* First: called with 0 length to calculate the needed length */
1074 len = __set_print_fmt(tp, NULL, 0);
1075 print_fmt = kmalloc(len + 1, GFP_KERNEL);
1076 if (!print_fmt)
1077 return -ENOMEM;
1078
1079 /* Second: actually write the @print_fmt */
1080 __set_print_fmt(tp, print_fmt, len + 1);
1081 tp->call.print_fmt = print_fmt;
1082
1083 return 0;
1084}
1085
1086#ifdef CONFIG_PERF_EVENTS 1144#ifdef CONFIG_PERF_EVENTS
1087 1145
1088/* Kprobe profile handler */ 1146/* Kprobe profile handler */
1089static __kprobes void 1147static __kprobes void
1090kprobe_perf_func(struct trace_probe *tp, struct pt_regs *regs) 1148kprobe_perf_func(struct trace_kprobe *tk, struct pt_regs *regs)
1091{ 1149{
1092 struct ftrace_event_call *call = &tp->call; 1150 struct ftrace_event_call *call = &tk->tp.call;
1093 struct kprobe_trace_entry_head *entry; 1151 struct kprobe_trace_entry_head *entry;
1094 struct hlist_head *head; 1152 struct hlist_head *head;
1095 int size, __size, dsize; 1153 int size, __size, dsize;
@@ -1099,8 +1157,8 @@ kprobe_perf_func(struct trace_probe *tp, struct pt_regs *regs)
1099 if (hlist_empty(head)) 1157 if (hlist_empty(head))
1100 return; 1158 return;
1101 1159
1102 dsize = __get_data_size(tp, regs); 1160 dsize = __get_data_size(&tk->tp, regs);
1103 __size = sizeof(*entry) + tp->size + dsize; 1161 __size = sizeof(*entry) + tk->tp.size + dsize;
1104 size = ALIGN(__size + sizeof(u32), sizeof(u64)); 1162 size = ALIGN(__size + sizeof(u32), sizeof(u64));
1105 size -= sizeof(u32); 1163 size -= sizeof(u32);
1106 1164
@@ -1108,18 +1166,18 @@ kprobe_perf_func(struct trace_probe *tp, struct pt_regs *regs)
1108 if (!entry) 1166 if (!entry)
1109 return; 1167 return;
1110 1168
1111 entry->ip = (unsigned long)tp->rp.kp.addr; 1169 entry->ip = (unsigned long)tk->rp.kp.addr;
1112 memset(&entry[1], 0, dsize); 1170 memset(&entry[1], 0, dsize);
1113 store_trace_args(sizeof(*entry), tp, regs, (u8 *)&entry[1], dsize); 1171 store_trace_args(sizeof(*entry), &tk->tp, regs, (u8 *)&entry[1], dsize);
1114 perf_trace_buf_submit(entry, size, rctx, 0, 1, regs, head, NULL); 1172 perf_trace_buf_submit(entry, size, rctx, 0, 1, regs, head, NULL);
1115} 1173}
1116 1174
1117/* Kretprobe profile handler */ 1175/* Kretprobe profile handler */
1118static __kprobes void 1176static __kprobes void
1119kretprobe_perf_func(struct trace_probe *tp, struct kretprobe_instance *ri, 1177kretprobe_perf_func(struct trace_kprobe *tk, struct kretprobe_instance *ri,
1120 struct pt_regs *regs) 1178 struct pt_regs *regs)
1121{ 1179{
1122 struct ftrace_event_call *call = &tp->call; 1180 struct ftrace_event_call *call = &tk->tp.call;
1123 struct kretprobe_trace_entry_head *entry; 1181 struct kretprobe_trace_entry_head *entry;
1124 struct hlist_head *head; 1182 struct hlist_head *head;
1125 int size, __size, dsize; 1183 int size, __size, dsize;
@@ -1129,8 +1187,8 @@ kretprobe_perf_func(struct trace_probe *tp, struct kretprobe_instance *ri,
1129 if (hlist_empty(head)) 1187 if (hlist_empty(head))
1130 return; 1188 return;
1131 1189
1132 dsize = __get_data_size(tp, regs); 1190 dsize = __get_data_size(&tk->tp, regs);
1133 __size = sizeof(*entry) + tp->size + dsize; 1191 __size = sizeof(*entry) + tk->tp.size + dsize;
1134 size = ALIGN(__size + sizeof(u32), sizeof(u64)); 1192 size = ALIGN(__size + sizeof(u32), sizeof(u64));
1135 size -= sizeof(u32); 1193 size -= sizeof(u32);
1136 1194
@@ -1138,9 +1196,9 @@ kretprobe_perf_func(struct trace_probe *tp, struct kretprobe_instance *ri,
1138 if (!entry) 1196 if (!entry)
1139 return; 1197 return;
1140 1198
1141 entry->func = (unsigned long)tp->rp.kp.addr; 1199 entry->func = (unsigned long)tk->rp.kp.addr;
1142 entry->ret_ip = (unsigned long)ri->ret_addr; 1200 entry->ret_ip = (unsigned long)ri->ret_addr;
1143 store_trace_args(sizeof(*entry), tp, regs, (u8 *)&entry[1], dsize); 1201 store_trace_args(sizeof(*entry), &tk->tp, regs, (u8 *)&entry[1], dsize);
1144 perf_trace_buf_submit(entry, size, rctx, 0, 1, regs, head, NULL); 1202 perf_trace_buf_submit(entry, size, rctx, 0, 1, regs, head, NULL);
1145} 1203}
1146#endif /* CONFIG_PERF_EVENTS */ 1204#endif /* CONFIG_PERF_EVENTS */
@@ -1155,20 +1213,20 @@ static __kprobes
1155int kprobe_register(struct ftrace_event_call *event, 1213int kprobe_register(struct ftrace_event_call *event,
1156 enum trace_reg type, void *data) 1214 enum trace_reg type, void *data)
1157{ 1215{
1158 struct trace_probe *tp = (struct trace_probe *)event->data; 1216 struct trace_kprobe *tk = (struct trace_kprobe *)event->data;
1159 struct ftrace_event_file *file = data; 1217 struct ftrace_event_file *file = data;
1160 1218
1161 switch (type) { 1219 switch (type) {
1162 case TRACE_REG_REGISTER: 1220 case TRACE_REG_REGISTER:
1163 return enable_trace_probe(tp, file); 1221 return enable_trace_kprobe(tk, file);
1164 case TRACE_REG_UNREGISTER: 1222 case TRACE_REG_UNREGISTER:
1165 return disable_trace_probe(tp, file); 1223 return disable_trace_kprobe(tk, file);
1166 1224
1167#ifdef CONFIG_PERF_EVENTS 1225#ifdef CONFIG_PERF_EVENTS
1168 case TRACE_REG_PERF_REGISTER: 1226 case TRACE_REG_PERF_REGISTER:
1169 return enable_trace_probe(tp, NULL); 1227 return enable_trace_kprobe(tk, NULL);
1170 case TRACE_REG_PERF_UNREGISTER: 1228 case TRACE_REG_PERF_UNREGISTER:
1171 return disable_trace_probe(tp, NULL); 1229 return disable_trace_kprobe(tk, NULL);
1172 case TRACE_REG_PERF_OPEN: 1230 case TRACE_REG_PERF_OPEN:
1173 case TRACE_REG_PERF_CLOSE: 1231 case TRACE_REG_PERF_CLOSE:
1174 case TRACE_REG_PERF_ADD: 1232 case TRACE_REG_PERF_ADD:
@@ -1182,15 +1240,15 @@ int kprobe_register(struct ftrace_event_call *event,
1182static __kprobes 1240static __kprobes
1183int kprobe_dispatcher(struct kprobe *kp, struct pt_regs *regs) 1241int kprobe_dispatcher(struct kprobe *kp, struct pt_regs *regs)
1184{ 1242{
1185 struct trace_probe *tp = container_of(kp, struct trace_probe, rp.kp); 1243 struct trace_kprobe *tk = container_of(kp, struct trace_kprobe, rp.kp);
1186 1244
1187 tp->nhit++; 1245 tk->nhit++;
1188 1246
1189 if (tp->flags & TP_FLAG_TRACE) 1247 if (tk->tp.flags & TP_FLAG_TRACE)
1190 kprobe_trace_func(tp, regs); 1248 kprobe_trace_func(tk, regs);
1191#ifdef CONFIG_PERF_EVENTS 1249#ifdef CONFIG_PERF_EVENTS
1192 if (tp->flags & TP_FLAG_PROFILE) 1250 if (tk->tp.flags & TP_FLAG_PROFILE)
1193 kprobe_perf_func(tp, regs); 1251 kprobe_perf_func(tk, regs);
1194#endif 1252#endif
1195 return 0; /* We don't tweek kernel, so just return 0 */ 1253 return 0; /* We don't tweek kernel, so just return 0 */
1196} 1254}
@@ -1198,15 +1256,15 @@ int kprobe_dispatcher(struct kprobe *kp, struct pt_regs *regs)
1198static __kprobes 1256static __kprobes
1199int kretprobe_dispatcher(struct kretprobe_instance *ri, struct pt_regs *regs) 1257int kretprobe_dispatcher(struct kretprobe_instance *ri, struct pt_regs *regs)
1200{ 1258{
1201 struct trace_probe *tp = container_of(ri->rp, struct trace_probe, rp); 1259 struct trace_kprobe *tk = container_of(ri->rp, struct trace_kprobe, rp);
1202 1260
1203 tp->nhit++; 1261 tk->nhit++;
1204 1262
1205 if (tp->flags & TP_FLAG_TRACE) 1263 if (tk->tp.flags & TP_FLAG_TRACE)
1206 kretprobe_trace_func(tp, ri, regs); 1264 kretprobe_trace_func(tk, ri, regs);
1207#ifdef CONFIG_PERF_EVENTS 1265#ifdef CONFIG_PERF_EVENTS
1208 if (tp->flags & TP_FLAG_PROFILE) 1266 if (tk->tp.flags & TP_FLAG_PROFILE)
1209 kretprobe_perf_func(tp, ri, regs); 1267 kretprobe_perf_func(tk, ri, regs);
1210#endif 1268#endif
1211 return 0; /* We don't tweek kernel, so just return 0 */ 1269 return 0; /* We don't tweek kernel, so just return 0 */
1212} 1270}
@@ -1219,21 +1277,21 @@ static struct trace_event_functions kprobe_funcs = {
1219 .trace = print_kprobe_event 1277 .trace = print_kprobe_event
1220}; 1278};
1221 1279
1222static int register_probe_event(struct trace_probe *tp) 1280static int register_kprobe_event(struct trace_kprobe *tk)
1223{ 1281{
1224 struct ftrace_event_call *call = &tp->call; 1282 struct ftrace_event_call *call = &tk->tp.call;
1225 int ret; 1283 int ret;
1226 1284
1227 /* Initialize ftrace_event_call */ 1285 /* Initialize ftrace_event_call */
1228 INIT_LIST_HEAD(&call->class->fields); 1286 INIT_LIST_HEAD(&call->class->fields);
1229 if (trace_probe_is_return(tp)) { 1287 if (trace_kprobe_is_return(tk)) {
1230 call->event.funcs = &kretprobe_funcs; 1288 call->event.funcs = &kretprobe_funcs;
1231 call->class->define_fields = kretprobe_event_define_fields; 1289 call->class->define_fields = kretprobe_event_define_fields;
1232 } else { 1290 } else {
1233 call->event.funcs = &kprobe_funcs; 1291 call->event.funcs = &kprobe_funcs;
1234 call->class->define_fields = kprobe_event_define_fields; 1292 call->class->define_fields = kprobe_event_define_fields;
1235 } 1293 }
1236 if (set_print_fmt(tp) < 0) 1294 if (set_print_fmt(&tk->tp, trace_kprobe_is_return(tk)) < 0)
1237 return -ENOMEM; 1295 return -ENOMEM;
1238 ret = register_ftrace_event(&call->event); 1296 ret = register_ftrace_event(&call->event);
1239 if (!ret) { 1297 if (!ret) {
@@ -1242,7 +1300,7 @@ static int register_probe_event(struct trace_probe *tp)
1242 } 1300 }
1243 call->flags = 0; 1301 call->flags = 0;
1244 call->class->reg = kprobe_register; 1302 call->class->reg = kprobe_register;
1245 call->data = tp; 1303 call->data = tk;
1246 ret = trace_add_event_call(call); 1304 ret = trace_add_event_call(call);
1247 if (ret) { 1305 if (ret) {
1248 pr_info("Failed to register kprobe event: %s\n", call->name); 1306 pr_info("Failed to register kprobe event: %s\n", call->name);
@@ -1252,14 +1310,14 @@ static int register_probe_event(struct trace_probe *tp)
1252 return ret; 1310 return ret;
1253} 1311}
1254 1312
1255static int unregister_probe_event(struct trace_probe *tp) 1313static int unregister_kprobe_event(struct trace_kprobe *tk)
1256{ 1314{
1257 int ret; 1315 int ret;
1258 1316
1259 /* tp->event is unregistered in trace_remove_event_call() */ 1317 /* tp->event is unregistered in trace_remove_event_call() */
1260 ret = trace_remove_event_call(&tp->call); 1318 ret = trace_remove_event_call(&tk->tp.call);
1261 if (!ret) 1319 if (!ret)
1262 kfree(tp->call.print_fmt); 1320 kfree(tk->tp.call.print_fmt);
1263 return ret; 1321 return ret;
1264} 1322}
1265 1323
@@ -1269,7 +1327,7 @@ static __init int init_kprobe_trace(void)
1269 struct dentry *d_tracer; 1327 struct dentry *d_tracer;
1270 struct dentry *entry; 1328 struct dentry *entry;
1271 1329
1272 if (register_module_notifier(&trace_probe_module_nb)) 1330 if (register_module_notifier(&trace_kprobe_module_nb))
1273 return -EINVAL; 1331 return -EINVAL;
1274 1332
1275 d_tracer = tracing_init_dentry(); 1333 d_tracer = tracing_init_dentry();
@@ -1309,26 +1367,26 @@ static __used int kprobe_trace_selftest_target(int a1, int a2, int a3,
1309} 1367}
1310 1368
1311static struct ftrace_event_file * 1369static struct ftrace_event_file *
1312find_trace_probe_file(struct trace_probe *tp, struct trace_array *tr) 1370find_trace_probe_file(struct trace_kprobe *tk, struct trace_array *tr)
1313{ 1371{
1314 struct ftrace_event_file *file; 1372 struct ftrace_event_file *file;
1315 1373
1316 list_for_each_entry(file, &tr->events, list) 1374 list_for_each_entry(file, &tr->events, list)
1317 if (file->event_call == &tp->call) 1375 if (file->event_call == &tk->tp.call)
1318 return file; 1376 return file;
1319 1377
1320 return NULL; 1378 return NULL;
1321} 1379}
1322 1380
1323/* 1381/*
1324 * Nobody but us can call enable_trace_probe/disable_trace_probe at this 1382 * Nobody but us can call enable_trace_kprobe/disable_trace_kprobe at this
1325 * stage, we can do this lockless. 1383 * stage, we can do this lockless.
1326 */ 1384 */
1327static __init int kprobe_trace_self_tests_init(void) 1385static __init int kprobe_trace_self_tests_init(void)
1328{ 1386{
1329 int ret, warn = 0; 1387 int ret, warn = 0;
1330 int (*target)(int, int, int, int, int, int); 1388 int (*target)(int, int, int, int, int, int);
1331 struct trace_probe *tp; 1389 struct trace_kprobe *tk;
1332 struct ftrace_event_file *file; 1390 struct ftrace_event_file *file;
1333 1391
1334 target = kprobe_trace_selftest_target; 1392 target = kprobe_trace_selftest_target;
@@ -1337,44 +1395,44 @@ static __init int kprobe_trace_self_tests_init(void)
1337 1395
1338 ret = traceprobe_command("p:testprobe kprobe_trace_selftest_target " 1396 ret = traceprobe_command("p:testprobe kprobe_trace_selftest_target "
1339 "$stack $stack0 +0($stack)", 1397 "$stack $stack0 +0($stack)",
1340 create_trace_probe); 1398 create_trace_kprobe);
1341 if (WARN_ON_ONCE(ret)) { 1399 if (WARN_ON_ONCE(ret)) {
1342 pr_warn("error on probing function entry.\n"); 1400 pr_warn("error on probing function entry.\n");
1343 warn++; 1401 warn++;
1344 } else { 1402 } else {
1345 /* Enable trace point */ 1403 /* Enable trace point */
1346 tp = find_trace_probe("testprobe", KPROBE_EVENT_SYSTEM); 1404 tk = find_trace_kprobe("testprobe", KPROBE_EVENT_SYSTEM);
1347 if (WARN_ON_ONCE(tp == NULL)) { 1405 if (WARN_ON_ONCE(tk == NULL)) {
1348 pr_warn("error on getting new probe.\n"); 1406 pr_warn("error on getting new probe.\n");
1349 warn++; 1407 warn++;
1350 } else { 1408 } else {
1351 file = find_trace_probe_file(tp, top_trace_array()); 1409 file = find_trace_probe_file(tk, top_trace_array());
1352 if (WARN_ON_ONCE(file == NULL)) { 1410 if (WARN_ON_ONCE(file == NULL)) {
1353 pr_warn("error on getting probe file.\n"); 1411 pr_warn("error on getting probe file.\n");
1354 warn++; 1412 warn++;
1355 } else 1413 } else
1356 enable_trace_probe(tp, file); 1414 enable_trace_kprobe(tk, file);
1357 } 1415 }
1358 } 1416 }
1359 1417
1360 ret = traceprobe_command("r:testprobe2 kprobe_trace_selftest_target " 1418 ret = traceprobe_command("r:testprobe2 kprobe_trace_selftest_target "
1361 "$retval", create_trace_probe); 1419 "$retval", create_trace_kprobe);
1362 if (WARN_ON_ONCE(ret)) { 1420 if (WARN_ON_ONCE(ret)) {
1363 pr_warn("error on probing function return.\n"); 1421 pr_warn("error on probing function return.\n");
1364 warn++; 1422 warn++;
1365 } else { 1423 } else {
1366 /* Enable trace point */ 1424 /* Enable trace point */
1367 tp = find_trace_probe("testprobe2", KPROBE_EVENT_SYSTEM); 1425 tk = find_trace_kprobe("testprobe2", KPROBE_EVENT_SYSTEM);
1368 if (WARN_ON_ONCE(tp == NULL)) { 1426 if (WARN_ON_ONCE(tk == NULL)) {
1369 pr_warn("error on getting 2nd new probe.\n"); 1427 pr_warn("error on getting 2nd new probe.\n");
1370 warn++; 1428 warn++;
1371 } else { 1429 } else {
1372 file = find_trace_probe_file(tp, top_trace_array()); 1430 file = find_trace_probe_file(tk, top_trace_array());
1373 if (WARN_ON_ONCE(file == NULL)) { 1431 if (WARN_ON_ONCE(file == NULL)) {
1374 pr_warn("error on getting probe file.\n"); 1432 pr_warn("error on getting probe file.\n");
1375 warn++; 1433 warn++;
1376 } else 1434 } else
1377 enable_trace_probe(tp, file); 1435 enable_trace_kprobe(tk, file);
1378 } 1436 }
1379 } 1437 }
1380 1438
@@ -1384,46 +1442,46 @@ static __init int kprobe_trace_self_tests_init(void)
1384 ret = target(1, 2, 3, 4, 5, 6); 1442 ret = target(1, 2, 3, 4, 5, 6);
1385 1443
1386 /* Disable trace points before removing it */ 1444 /* Disable trace points before removing it */
1387 tp = find_trace_probe("testprobe", KPROBE_EVENT_SYSTEM); 1445 tk = find_trace_kprobe("testprobe", KPROBE_EVENT_SYSTEM);
1388 if (WARN_ON_ONCE(tp == NULL)) { 1446 if (WARN_ON_ONCE(tk == NULL)) {
1389 pr_warn("error on getting test probe.\n"); 1447 pr_warn("error on getting test probe.\n");
1390 warn++; 1448 warn++;
1391 } else { 1449 } else {
1392 file = find_trace_probe_file(tp, top_trace_array()); 1450 file = find_trace_probe_file(tk, top_trace_array());
1393 if (WARN_ON_ONCE(file == NULL)) { 1451 if (WARN_ON_ONCE(file == NULL)) {
1394 pr_warn("error on getting probe file.\n"); 1452 pr_warn("error on getting probe file.\n");
1395 warn++; 1453 warn++;
1396 } else 1454 } else
1397 disable_trace_probe(tp, file); 1455 disable_trace_kprobe(tk, file);
1398 } 1456 }
1399 1457
1400 tp = find_trace_probe("testprobe2", KPROBE_EVENT_SYSTEM); 1458 tk = find_trace_kprobe("testprobe2", KPROBE_EVENT_SYSTEM);
1401 if (WARN_ON_ONCE(tp == NULL)) { 1459 if (WARN_ON_ONCE(tk == NULL)) {
1402 pr_warn("error on getting 2nd test probe.\n"); 1460 pr_warn("error on getting 2nd test probe.\n");
1403 warn++; 1461 warn++;
1404 } else { 1462 } else {
1405 file = find_trace_probe_file(tp, top_trace_array()); 1463 file = find_trace_probe_file(tk, top_trace_array());
1406 if (WARN_ON_ONCE(file == NULL)) { 1464 if (WARN_ON_ONCE(file == NULL)) {
1407 pr_warn("error on getting probe file.\n"); 1465 pr_warn("error on getting probe file.\n");
1408 warn++; 1466 warn++;
1409 } else 1467 } else
1410 disable_trace_probe(tp, file); 1468 disable_trace_kprobe(tk, file);
1411 } 1469 }
1412 1470
1413 ret = traceprobe_command("-:testprobe", create_trace_probe); 1471 ret = traceprobe_command("-:testprobe", create_trace_kprobe);
1414 if (WARN_ON_ONCE(ret)) { 1472 if (WARN_ON_ONCE(ret)) {
1415 pr_warn("error on deleting a probe.\n"); 1473 pr_warn("error on deleting a probe.\n");
1416 warn++; 1474 warn++;
1417 } 1475 }
1418 1476
1419 ret = traceprobe_command("-:testprobe2", create_trace_probe); 1477 ret = traceprobe_command("-:testprobe2", create_trace_kprobe);
1420 if (WARN_ON_ONCE(ret)) { 1478 if (WARN_ON_ONCE(ret)) {
1421 pr_warn("error on deleting a probe.\n"); 1479 pr_warn("error on deleting a probe.\n");
1422 warn++; 1480 warn++;
1423 } 1481 }
1424 1482
1425end: 1483end:
1426 release_all_trace_probes(); 1484 release_all_trace_kprobes();
1427 if (warn) 1485 if (warn)
1428 pr_cont("NG: Some tests are failed. Please check them.\n"); 1486 pr_cont("NG: Some tests are failed. Please check them.\n");
1429 else 1487 else
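The trace_kprobe.c hunks above fold the generic struct trace_probe into a new struct trace_kprobe, so the dispatchers recover the enclosing object from the embedded kprobe with container_of(), as in container_of(kp, struct trace_kprobe, rp.kp). Below is a minimal stand-alone C sketch of that recovery pattern; the struct layouts and field names are simplified stand-ins, not the kernel's definitions.

#include <stddef.h>
#include <stdio.h>

#define container_of(ptr, type, member) \
        ((type *)((char *)(ptr) - offsetof(type, member)))

struct kprobe       { unsigned long addr; };
struct kretprobe    { struct kprobe kp; };
struct trace_probe  { unsigned int nr_args; };
struct trace_kprobe {
        unsigned long nhit;
        struct kretprobe rp;    /* rp.kp is what the kprobe core passes back */
        struct trace_probe tp;  /* shared probe state, now embedded */
};

static int dispatcher(struct kprobe *kp)
{
        /* Same step as kprobe_dispatcher(): member pointer -> enclosing object */
        struct trace_kprobe *tk = container_of(kp, struct trace_kprobe, rp.kp);

        tk->nhit++;
        return 0;
}

int main(void)
{
        struct trace_kprobe tk = { .nhit = 0 };

        dispatcher(&tk.rp.kp);
        printf("nhit = %lu\n", tk.nhit);        /* prints nhit = 1 */
        return 0;
}

Because the kprobe core only hands back a pointer to the registered struct kprobe, this offset arithmetic is what lets the dispatchers reach per-probe state such as nhit and tp.flags without any lookup table.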
diff --git a/kernel/trace/trace_probe.c b/kernel/trace/trace_probe.c
index 412e959709b4..8364a421b4df 100644
--- a/kernel/trace/trace_probe.c
+++ b/kernel/trace/trace_probe.c
@@ -35,46 +35,27 @@ const char *reserved_field_names[] = {
35 FIELD_STRING_FUNC, 35 FIELD_STRING_FUNC,
36}; 36};
37 37
38/* Printing function type */
39#define PRINT_TYPE_FUNC_NAME(type) print_type_##type
40#define PRINT_TYPE_FMT_NAME(type) print_type_format_##type
41
42/* Printing in basic type function template */ 38/* Printing in basic type function template */
43#define DEFINE_BASIC_PRINT_TYPE_FUNC(type, fmt, cast) \ 39#define DEFINE_BASIC_PRINT_TYPE_FUNC(type, fmt) \
44static __kprobes int PRINT_TYPE_FUNC_NAME(type)(struct trace_seq *s, \ 40__kprobes int PRINT_TYPE_FUNC_NAME(type)(struct trace_seq *s, \
45 const char *name, \ 41 const char *name, \
46 void *data, void *ent)\ 42 void *data, void *ent) \
47{ \ 43{ \
48 return trace_seq_printf(s, " %s=" fmt, name, (cast)*(type *)data);\ 44 return trace_seq_printf(s, " %s=" fmt, name, *(type *)data); \
49} \ 45} \
50static const char PRINT_TYPE_FMT_NAME(type)[] = fmt; 46const char PRINT_TYPE_FMT_NAME(type)[] = fmt;
51
52DEFINE_BASIC_PRINT_TYPE_FUNC(u8, "%x", unsigned int)
53DEFINE_BASIC_PRINT_TYPE_FUNC(u16, "%x", unsigned int)
54DEFINE_BASIC_PRINT_TYPE_FUNC(u32, "%lx", unsigned long)
55DEFINE_BASIC_PRINT_TYPE_FUNC(u64, "%llx", unsigned long long)
56DEFINE_BASIC_PRINT_TYPE_FUNC(s8, "%d", int)
57DEFINE_BASIC_PRINT_TYPE_FUNC(s16, "%d", int)
58DEFINE_BASIC_PRINT_TYPE_FUNC(s32, "%ld", long)
59DEFINE_BASIC_PRINT_TYPE_FUNC(s64, "%lld", long long)
60
61static inline void *get_rloc_data(u32 *dl)
62{
63 return (u8 *)dl + get_rloc_offs(*dl);
64}
65 47
66/* For data_loc conversion */ 48DEFINE_BASIC_PRINT_TYPE_FUNC(u8 , "0x%x")
67static inline void *get_loc_data(u32 *dl, void *ent) 49DEFINE_BASIC_PRINT_TYPE_FUNC(u16, "0x%x")
68{ 50DEFINE_BASIC_PRINT_TYPE_FUNC(u32, "0x%x")
69 return (u8 *)ent + get_rloc_offs(*dl); 51DEFINE_BASIC_PRINT_TYPE_FUNC(u64, "0x%Lx")
70} 52DEFINE_BASIC_PRINT_TYPE_FUNC(s8, "%d")
71 53DEFINE_BASIC_PRINT_TYPE_FUNC(s16, "%d")
72/* For defining macros, define string/string_size types */ 54DEFINE_BASIC_PRINT_TYPE_FUNC(s32, "%d")
73typedef u32 string; 55DEFINE_BASIC_PRINT_TYPE_FUNC(s64, "%Ld")
74typedef u32 string_size;
75 56
76/* Print type function for string type */ 57/* Print type function for string type */
77static __kprobes int PRINT_TYPE_FUNC_NAME(string)(struct trace_seq *s, 58__kprobes int PRINT_TYPE_FUNC_NAME(string)(struct trace_seq *s,
78 const char *name, 59 const char *name,
79 void *data, void *ent) 60 void *data, void *ent)
80{ 61{
@@ -87,18 +68,7 @@ static __kprobes int PRINT_TYPE_FUNC_NAME(string)(struct trace_seq *s,
87 (const char *)get_loc_data(data, ent)); 68 (const char *)get_loc_data(data, ent));
88} 69}
89 70
90static const char PRINT_TYPE_FMT_NAME(string)[] = "\\\"%s\\\""; 71const char PRINT_TYPE_FMT_NAME(string)[] = "\\\"%s\\\"";
91
92#define FETCH_FUNC_NAME(method, type) fetch_##method##_##type
93/*
94 * Define macro for basic types - we don't need to define s* types, because
95 * we have to care only about bitwidth at recording time.
96 */
97#define DEFINE_BASIC_FETCH_FUNCS(method) \
98DEFINE_FETCH_##method(u8) \
99DEFINE_FETCH_##method(u16) \
100DEFINE_FETCH_##method(u32) \
101DEFINE_FETCH_##method(u64)
102 72
103#define CHECK_FETCH_FUNCS(method, fn) \ 73#define CHECK_FETCH_FUNCS(method, fn) \
104 (((FETCH_FUNC_NAME(method, u8) == fn) || \ 74 (((FETCH_FUNC_NAME(method, u8) == fn) || \
@@ -111,7 +81,7 @@ DEFINE_FETCH_##method(u64)
111 81
112/* Data fetch function templates */ 82/* Data fetch function templates */
113#define DEFINE_FETCH_reg(type) \ 83#define DEFINE_FETCH_reg(type) \
114static __kprobes void FETCH_FUNC_NAME(reg, type)(struct pt_regs *regs, \ 84__kprobes void FETCH_FUNC_NAME(reg, type)(struct pt_regs *regs, \
115 void *offset, void *dest) \ 85 void *offset, void *dest) \
116{ \ 86{ \
117 *(type *)dest = (type)regs_get_register(regs, \ 87 *(type *)dest = (type)regs_get_register(regs, \
@@ -122,20 +92,8 @@ DEFINE_BASIC_FETCH_FUNCS(reg)
122#define fetch_reg_string NULL 92#define fetch_reg_string NULL
123#define fetch_reg_string_size NULL 93#define fetch_reg_string_size NULL
124 94
125#define DEFINE_FETCH_stack(type) \
126static __kprobes void FETCH_FUNC_NAME(stack, type)(struct pt_regs *regs,\
127 void *offset, void *dest) \
128{ \
129 *(type *)dest = (type)regs_get_kernel_stack_nth(regs, \
130 (unsigned int)((unsigned long)offset)); \
131}
132DEFINE_BASIC_FETCH_FUNCS(stack)
133/* No string on the stack entry */
134#define fetch_stack_string NULL
135#define fetch_stack_string_size NULL
136
137#define DEFINE_FETCH_retval(type) \ 95#define DEFINE_FETCH_retval(type) \
138static __kprobes void FETCH_FUNC_NAME(retval, type)(struct pt_regs *regs,\ 96__kprobes void FETCH_FUNC_NAME(retval, type)(struct pt_regs *regs, \
139 void *dummy, void *dest) \ 97 void *dummy, void *dest) \
140{ \ 98{ \
141 *(type *)dest = (type)regs_return_value(regs); \ 99 *(type *)dest = (type)regs_return_value(regs); \
@@ -145,150 +103,16 @@ DEFINE_BASIC_FETCH_FUNCS(retval)
145#define fetch_retval_string NULL 103#define fetch_retval_string NULL
146#define fetch_retval_string_size NULL 104#define fetch_retval_string_size NULL
147 105
148#define DEFINE_FETCH_memory(type) \
149static __kprobes void FETCH_FUNC_NAME(memory, type)(struct pt_regs *regs,\
150 void *addr, void *dest) \
151{ \
152 type retval; \
153 if (probe_kernel_address(addr, retval)) \
154 *(type *)dest = 0; \
155 else \
156 *(type *)dest = retval; \
157}
158DEFINE_BASIC_FETCH_FUNCS(memory)
159/*
160 * Fetch a null-terminated string. Caller MUST set *(u32 *)dest with max
161 * length and relative data location.
162 */
163static __kprobes void FETCH_FUNC_NAME(memory, string)(struct pt_regs *regs,
164 void *addr, void *dest)
165{
166 long ret;
167 int maxlen = get_rloc_len(*(u32 *)dest);
168 u8 *dst = get_rloc_data(dest);
169 u8 *src = addr;
170 mm_segment_t old_fs = get_fs();
171
172 if (!maxlen)
173 return;
174
175 /*
176 * Try to get string again, since the string can be changed while
177 * probing.
178 */
179 set_fs(KERNEL_DS);
180 pagefault_disable();
181
182 do
183 ret = __copy_from_user_inatomic(dst++, src++, 1);
184 while (dst[-1] && ret == 0 && src - (u8 *)addr < maxlen);
185
186 dst[-1] = '\0';
187 pagefault_enable();
188 set_fs(old_fs);
189
190 if (ret < 0) { /* Failed to fetch string */
191 ((u8 *)get_rloc_data(dest))[0] = '\0';
192 *(u32 *)dest = make_data_rloc(0, get_rloc_offs(*(u32 *)dest));
193 } else {
194 *(u32 *)dest = make_data_rloc(src - (u8 *)addr,
195 get_rloc_offs(*(u32 *)dest));
196 }
197}
198
199/* Return the length of string -- including null terminal byte */
200static __kprobes void FETCH_FUNC_NAME(memory, string_size)(struct pt_regs *regs,
201 void *addr, void *dest)
202{
203 mm_segment_t old_fs;
204 int ret, len = 0;
205 u8 c;
206
207 old_fs = get_fs();
208 set_fs(KERNEL_DS);
209 pagefault_disable();
210
211 do {
212 ret = __copy_from_user_inatomic(&c, (u8 *)addr + len, 1);
213 len++;
214 } while (c && ret == 0 && len < MAX_STRING_SIZE);
215
216 pagefault_enable();
217 set_fs(old_fs);
218
219 if (ret < 0) /* Failed to check the length */
220 *(u32 *)dest = 0;
221 else
222 *(u32 *)dest = len;
223}
224
225/* Memory fetching by symbol */
226struct symbol_cache {
227 char *symbol;
228 long offset;
229 unsigned long addr;
230};
231
232static unsigned long update_symbol_cache(struct symbol_cache *sc)
233{
234 sc->addr = (unsigned long)kallsyms_lookup_name(sc->symbol);
235
236 if (sc->addr)
237 sc->addr += sc->offset;
238
239 return sc->addr;
240}
241
242static void free_symbol_cache(struct symbol_cache *sc)
243{
244 kfree(sc->symbol);
245 kfree(sc);
246}
247
248static struct symbol_cache *alloc_symbol_cache(const char *sym, long offset)
249{
250 struct symbol_cache *sc;
251
252 if (!sym || strlen(sym) == 0)
253 return NULL;
254
255 sc = kzalloc(sizeof(struct symbol_cache), GFP_KERNEL);
256 if (!sc)
257 return NULL;
258
259 sc->symbol = kstrdup(sym, GFP_KERNEL);
260 if (!sc->symbol) {
261 kfree(sc);
262 return NULL;
263 }
264 sc->offset = offset;
265 update_symbol_cache(sc);
266
267 return sc;
268}
269
270#define DEFINE_FETCH_symbol(type) \
271static __kprobes void FETCH_FUNC_NAME(symbol, type)(struct pt_regs *regs,\
272 void *data, void *dest) \
273{ \
274 struct symbol_cache *sc = data; \
275 if (sc->addr) \
276 fetch_memory_##type(regs, (void *)sc->addr, dest); \
277 else \
278 *(type *)dest = 0; \
279}
280DEFINE_BASIC_FETCH_FUNCS(symbol)
281DEFINE_FETCH_symbol(string)
282DEFINE_FETCH_symbol(string_size)
283
284/* Dereference memory access function */ 106/* Dereference memory access function */
285struct deref_fetch_param { 107struct deref_fetch_param {
286 struct fetch_param orig; 108 struct fetch_param orig;
287 long offset; 109 long offset;
110 fetch_func_t fetch;
111 fetch_func_t fetch_size;
288}; 112};
289 113
290#define DEFINE_FETCH_deref(type) \ 114#define DEFINE_FETCH_deref(type) \
291static __kprobes void FETCH_FUNC_NAME(deref, type)(struct pt_regs *regs,\ 115__kprobes void FETCH_FUNC_NAME(deref, type)(struct pt_regs *regs, \
292 void *data, void *dest) \ 116 void *data, void *dest) \
293{ \ 117{ \
294 struct deref_fetch_param *dprm = data; \ 118 struct deref_fetch_param *dprm = data; \
@@ -296,13 +120,26 @@ static __kprobes void FETCH_FUNC_NAME(deref, type)(struct pt_regs *regs,\
296 call_fetch(&dprm->orig, regs, &addr); \ 120 call_fetch(&dprm->orig, regs, &addr); \
297 if (addr) { \ 121 if (addr) { \
298 addr += dprm->offset; \ 122 addr += dprm->offset; \
299 fetch_memory_##type(regs, (void *)addr, dest); \ 123 dprm->fetch(regs, (void *)addr, dest); \
300 } else \ 124 } else \
301 *(type *)dest = 0; \ 125 *(type *)dest = 0; \
302} 126}
303DEFINE_BASIC_FETCH_FUNCS(deref) 127DEFINE_BASIC_FETCH_FUNCS(deref)
304DEFINE_FETCH_deref(string) 128DEFINE_FETCH_deref(string)
305DEFINE_FETCH_deref(string_size) 129
130__kprobes void FETCH_FUNC_NAME(deref, string_size)(struct pt_regs *regs,
131 void *data, void *dest)
132{
133 struct deref_fetch_param *dprm = data;
134 unsigned long addr;
135
136 call_fetch(&dprm->orig, regs, &addr);
137 if (addr && dprm->fetch_size) {
138 addr += dprm->offset;
139 dprm->fetch_size(regs, (void *)addr, dest);
140 } else
141 *(string_size *)dest = 0;
142}
306 143
307static __kprobes void update_deref_fetch_param(struct deref_fetch_param *data) 144static __kprobes void update_deref_fetch_param(struct deref_fetch_param *data)
308{ 145{
@@ -329,7 +166,7 @@ struct bitfield_fetch_param {
329}; 166};
330 167
331#define DEFINE_FETCH_bitfield(type) \ 168#define DEFINE_FETCH_bitfield(type) \
332static __kprobes void FETCH_FUNC_NAME(bitfield, type)(struct pt_regs *regs,\ 169__kprobes void FETCH_FUNC_NAME(bitfield, type)(struct pt_regs *regs, \
333 void *data, void *dest) \ 170 void *data, void *dest) \
334{ \ 171{ \
335 struct bitfield_fetch_param *bprm = data; \ 172 struct bitfield_fetch_param *bprm = data; \
@@ -374,58 +211,8 @@ free_bitfield_fetch_param(struct bitfield_fetch_param *data)
374 kfree(data); 211 kfree(data);
375} 212}
376 213
377/* Default (unsigned long) fetch type */ 214static const struct fetch_type *find_fetch_type(const char *type,
378#define __DEFAULT_FETCH_TYPE(t) u##t 215 const struct fetch_type *ftbl)
379#define _DEFAULT_FETCH_TYPE(t) __DEFAULT_FETCH_TYPE(t)
380#define DEFAULT_FETCH_TYPE _DEFAULT_FETCH_TYPE(BITS_PER_LONG)
381#define DEFAULT_FETCH_TYPE_STR __stringify(DEFAULT_FETCH_TYPE)
382
383#define ASSIGN_FETCH_FUNC(method, type) \
384 [FETCH_MTD_##method] = FETCH_FUNC_NAME(method, type)
385
386#define __ASSIGN_FETCH_TYPE(_name, ptype, ftype, _size, sign, _fmttype) \
387 {.name = _name, \
388 .size = _size, \
389 .is_signed = sign, \
390 .print = PRINT_TYPE_FUNC_NAME(ptype), \
391 .fmt = PRINT_TYPE_FMT_NAME(ptype), \
392 .fmttype = _fmttype, \
393 .fetch = { \
394ASSIGN_FETCH_FUNC(reg, ftype), \
395ASSIGN_FETCH_FUNC(stack, ftype), \
396ASSIGN_FETCH_FUNC(retval, ftype), \
397ASSIGN_FETCH_FUNC(memory, ftype), \
398ASSIGN_FETCH_FUNC(symbol, ftype), \
399ASSIGN_FETCH_FUNC(deref, ftype), \
400ASSIGN_FETCH_FUNC(bitfield, ftype), \
401 } \
402 }
403
404#define ASSIGN_FETCH_TYPE(ptype, ftype, sign) \
405 __ASSIGN_FETCH_TYPE(#ptype, ptype, ftype, sizeof(ftype), sign, #ptype)
406
407#define FETCH_TYPE_STRING 0
408#define FETCH_TYPE_STRSIZE 1
409
410/* Fetch type information table */
411static const struct fetch_type fetch_type_table[] = {
412 /* Special types */
413 [FETCH_TYPE_STRING] = __ASSIGN_FETCH_TYPE("string", string, string,
414 sizeof(u32), 1, "__data_loc char[]"),
415 [FETCH_TYPE_STRSIZE] = __ASSIGN_FETCH_TYPE("string_size", u32,
416 string_size, sizeof(u32), 0, "u32"),
417 /* Basic types */
418 ASSIGN_FETCH_TYPE(u8, u8, 0),
419 ASSIGN_FETCH_TYPE(u16, u16, 0),
420 ASSIGN_FETCH_TYPE(u32, u32, 0),
421 ASSIGN_FETCH_TYPE(u64, u64, 0),
422 ASSIGN_FETCH_TYPE(s8, u8, 1),
423 ASSIGN_FETCH_TYPE(s16, u16, 1),
424 ASSIGN_FETCH_TYPE(s32, u32, 1),
425 ASSIGN_FETCH_TYPE(s64, u64, 1),
426};
427
428static const struct fetch_type *find_fetch_type(const char *type)
429{ 216{
430 int i; 217 int i;
431 218
@@ -446,44 +233,52 @@ static const struct fetch_type *find_fetch_type(const char *type)
446 233
447 switch (bs) { 234 switch (bs) {
448 case 8: 235 case 8:
449 return find_fetch_type("u8"); 236 return find_fetch_type("u8", ftbl);
450 case 16: 237 case 16:
451 return find_fetch_type("u16"); 238 return find_fetch_type("u16", ftbl);
452 case 32: 239 case 32:
453 return find_fetch_type("u32"); 240 return find_fetch_type("u32", ftbl);
454 case 64: 241 case 64:
455 return find_fetch_type("u64"); 242 return find_fetch_type("u64", ftbl);
456 default: 243 default:
457 goto fail; 244 goto fail;
458 } 245 }
459 } 246 }
460 247
461 for (i = 0; i < ARRAY_SIZE(fetch_type_table); i++) 248 for (i = 0; ftbl[i].name; i++) {
462 if (strcmp(type, fetch_type_table[i].name) == 0) 249 if (strcmp(type, ftbl[i].name) == 0)
463 return &fetch_type_table[i]; 250 return &ftbl[i];
251 }
464 252
465fail: 253fail:
466 return NULL; 254 return NULL;
467} 255}
468 256
469/* Special function : only accept unsigned long */ 257/* Special function : only accept unsigned long */
470static __kprobes void fetch_stack_address(struct pt_regs *regs, 258static __kprobes void fetch_kernel_stack_address(struct pt_regs *regs,
471 void *dummy, void *dest) 259 void *dummy, void *dest)
472{ 260{
473 *(unsigned long *)dest = kernel_stack_pointer(regs); 261 *(unsigned long *)dest = kernel_stack_pointer(regs);
474} 262}
475 263
264static __kprobes void fetch_user_stack_address(struct pt_regs *regs,
265 void *dummy, void *dest)
266{
267 *(unsigned long *)dest = user_stack_pointer(regs);
268}
269
476static fetch_func_t get_fetch_size_function(const struct fetch_type *type, 270static fetch_func_t get_fetch_size_function(const struct fetch_type *type,
477 fetch_func_t orig_fn) 271 fetch_func_t orig_fn,
272 const struct fetch_type *ftbl)
478{ 273{
479 int i; 274 int i;
480 275
481 if (type != &fetch_type_table[FETCH_TYPE_STRING]) 276 if (type != &ftbl[FETCH_TYPE_STRING])
482 return NULL; /* Only string type needs size function */ 277 return NULL; /* Only string type needs size function */
483 278
484 for (i = 0; i < FETCH_MTD_END; i++) 279 for (i = 0; i < FETCH_MTD_END; i++)
485 if (type->fetch[i] == orig_fn) 280 if (type->fetch[i] == orig_fn)
486 return fetch_type_table[FETCH_TYPE_STRSIZE].fetch[i]; 281 return ftbl[FETCH_TYPE_STRSIZE].fetch[i];
487 282
488 WARN_ON(1); /* This should not happen */ 283 WARN_ON(1); /* This should not happen */
489 284
@@ -516,7 +311,8 @@ int traceprobe_split_symbol_offset(char *symbol, unsigned long *offset)
516#define PARAM_MAX_STACK (THREAD_SIZE / sizeof(unsigned long)) 311#define PARAM_MAX_STACK (THREAD_SIZE / sizeof(unsigned long))
517 312
518static int parse_probe_vars(char *arg, const struct fetch_type *t, 313static int parse_probe_vars(char *arg, const struct fetch_type *t,
519 struct fetch_param *f, bool is_return) 314 struct fetch_param *f, bool is_return,
315 bool is_kprobe)
520{ 316{
521 int ret = 0; 317 int ret = 0;
522 unsigned long param; 318 unsigned long param;
@@ -528,13 +324,16 @@ static int parse_probe_vars(char *arg, const struct fetch_type *t,
528 ret = -EINVAL; 324 ret = -EINVAL;
529 } else if (strncmp(arg, "stack", 5) == 0) { 325 } else if (strncmp(arg, "stack", 5) == 0) {
530 if (arg[5] == '\0') { 326 if (arg[5] == '\0') {
531 if (strcmp(t->name, DEFAULT_FETCH_TYPE_STR) == 0) 327 if (strcmp(t->name, DEFAULT_FETCH_TYPE_STR))
532 f->fn = fetch_stack_address; 328 return -EINVAL;
329
330 if (is_kprobe)
331 f->fn = fetch_kernel_stack_address;
533 else 332 else
534 ret = -EINVAL; 333 f->fn = fetch_user_stack_address;
535 } else if (isdigit(arg[5])) { 334 } else if (isdigit(arg[5])) {
536 ret = kstrtoul(arg + 5, 10, &param); 335 ret = kstrtoul(arg + 5, 10, &param);
537 if (ret || param > PARAM_MAX_STACK) 336 if (ret || (is_kprobe && param > PARAM_MAX_STACK))
538 ret = -EINVAL; 337 ret = -EINVAL;
539 else { 338 else {
540 f->fn = t->fetch[FETCH_MTD_stack]; 339 f->fn = t->fetch[FETCH_MTD_stack];
@@ -552,20 +351,18 @@ static int parse_probe_vars(char *arg, const struct fetch_type *t,
552static int parse_probe_arg(char *arg, const struct fetch_type *t, 351static int parse_probe_arg(char *arg, const struct fetch_type *t,
553 struct fetch_param *f, bool is_return, bool is_kprobe) 352 struct fetch_param *f, bool is_return, bool is_kprobe)
554{ 353{
354 const struct fetch_type *ftbl;
555 unsigned long param; 355 unsigned long param;
556 long offset; 356 long offset;
557 char *tmp; 357 char *tmp;
558 int ret; 358 int ret = 0;
559
560 ret = 0;
561 359
562 /* Until uprobe_events supports only reg arguments */ 360 ftbl = is_kprobe ? kprobes_fetch_type_table : uprobes_fetch_type_table;
563 if (!is_kprobe && arg[0] != '%') 361 BUG_ON(ftbl == NULL);
564 return -EINVAL;
565 362
566 switch (arg[0]) { 363 switch (arg[0]) {
567 case '$': 364 case '$':
568 ret = parse_probe_vars(arg + 1, t, f, is_return); 365 ret = parse_probe_vars(arg + 1, t, f, is_return, is_kprobe);
569 break; 366 break;
570 367
571 case '%': /* named register */ 368 case '%': /* named register */
@@ -577,7 +374,7 @@ static int parse_probe_arg(char *arg, const struct fetch_type *t,
577 } 374 }
578 break; 375 break;
579 376
580 case '@': /* memory or symbol */ 377 case '@': /* memory, file-offset or symbol */
581 if (isdigit(arg[1])) { 378 if (isdigit(arg[1])) {
582 ret = kstrtoul(arg + 1, 0, &param); 379 ret = kstrtoul(arg + 1, 0, &param);
583 if (ret) 380 if (ret)
@@ -585,7 +382,22 @@ static int parse_probe_arg(char *arg, const struct fetch_type *t,
585 382
586 f->fn = t->fetch[FETCH_MTD_memory]; 383 f->fn = t->fetch[FETCH_MTD_memory];
587 f->data = (void *)param; 384 f->data = (void *)param;
385 } else if (arg[1] == '+') {
386 /* kprobes don't support file offsets */
387 if (is_kprobe)
388 return -EINVAL;
389
390 ret = kstrtol(arg + 2, 0, &offset);
391 if (ret)
392 break;
393
394 f->fn = t->fetch[FETCH_MTD_file_offset];
395 f->data = (void *)offset;
588 } else { 396 } else {
397 /* uprobes don't support symbols */
398 if (!is_kprobe)
399 return -EINVAL;
400
589 ret = traceprobe_split_symbol_offset(arg + 1, &offset); 401 ret = traceprobe_split_symbol_offset(arg + 1, &offset);
590 if (ret) 402 if (ret)
591 break; 403 break;
@@ -616,7 +428,7 @@ static int parse_probe_arg(char *arg, const struct fetch_type *t,
616 struct deref_fetch_param *dprm; 428 struct deref_fetch_param *dprm;
617 const struct fetch_type *t2; 429 const struct fetch_type *t2;
618 430
619 t2 = find_fetch_type(NULL); 431 t2 = find_fetch_type(NULL, ftbl);
620 *tmp = '\0'; 432 *tmp = '\0';
621 dprm = kzalloc(sizeof(struct deref_fetch_param), GFP_KERNEL); 433 dprm = kzalloc(sizeof(struct deref_fetch_param), GFP_KERNEL);
622 434
@@ -624,6 +436,9 @@ static int parse_probe_arg(char *arg, const struct fetch_type *t,
624 return -ENOMEM; 436 return -ENOMEM;
625 437
626 dprm->offset = offset; 438 dprm->offset = offset;
439 dprm->fetch = t->fetch[FETCH_MTD_memory];
440 dprm->fetch_size = get_fetch_size_function(t,
441 dprm->fetch, ftbl);
627 ret = parse_probe_arg(arg, t2, &dprm->orig, is_return, 442 ret = parse_probe_arg(arg, t2, &dprm->orig, is_return,
628 is_kprobe); 443 is_kprobe);
629 if (ret) 444 if (ret)
@@ -685,9 +500,13 @@ static int __parse_bitfield_probe_arg(const char *bf,
685int traceprobe_parse_probe_arg(char *arg, ssize_t *size, 500int traceprobe_parse_probe_arg(char *arg, ssize_t *size,
686 struct probe_arg *parg, bool is_return, bool is_kprobe) 501 struct probe_arg *parg, bool is_return, bool is_kprobe)
687{ 502{
503 const struct fetch_type *ftbl;
688 const char *t; 504 const char *t;
689 int ret; 505 int ret;
690 506
507 ftbl = is_kprobe ? kprobes_fetch_type_table : uprobes_fetch_type_table;
508 BUG_ON(ftbl == NULL);
509
691 if (strlen(arg) > MAX_ARGSTR_LEN) { 510 if (strlen(arg) > MAX_ARGSTR_LEN) {
692 pr_info("Argument is too long.: %s\n", arg); 511 pr_info("Argument is too long.: %s\n", arg);
693 return -ENOSPC; 512 return -ENOSPC;
@@ -702,7 +521,7 @@ int traceprobe_parse_probe_arg(char *arg, ssize_t *size,
702 arg[t - parg->comm] = '\0'; 521 arg[t - parg->comm] = '\0';
703 t++; 522 t++;
704 } 523 }
705 parg->type = find_fetch_type(t); 524 parg->type = find_fetch_type(t, ftbl);
706 if (!parg->type) { 525 if (!parg->type) {
707 pr_info("Unsupported type: %s\n", t); 526 pr_info("Unsupported type: %s\n", t);
708 return -EINVAL; 527 return -EINVAL;
@@ -716,7 +535,8 @@ int traceprobe_parse_probe_arg(char *arg, ssize_t *size,
716 535
717 if (ret >= 0) { 536 if (ret >= 0) {
718 parg->fetch_size.fn = get_fetch_size_function(parg->type, 537 parg->fetch_size.fn = get_fetch_size_function(parg->type,
719 parg->fetch.fn); 538 parg->fetch.fn,
539 ftbl);
720 parg->fetch_size.data = parg->fetch.data; 540 parg->fetch_size.data = parg->fetch.data;
721 } 541 }
722 542
@@ -837,3 +657,65 @@ out:
837 657
838 return ret; 658 return ret;
839} 659}
660
661static int __set_print_fmt(struct trace_probe *tp, char *buf, int len,
662 bool is_return)
663{
664 int i;
665 int pos = 0;
666
667 const char *fmt, *arg;
668
669 if (!is_return) {
670 fmt = "(%lx)";
671 arg = "REC->" FIELD_STRING_IP;
672 } else {
673 fmt = "(%lx <- %lx)";
674 arg = "REC->" FIELD_STRING_FUNC ", REC->" FIELD_STRING_RETIP;
675 }
676
677 /* When len=0, we just calculate the needed length */
678#define LEN_OR_ZERO (len ? len - pos : 0)
679
680 pos += snprintf(buf + pos, LEN_OR_ZERO, "\"%s", fmt);
681
682 for (i = 0; i < tp->nr_args; i++) {
683 pos += snprintf(buf + pos, LEN_OR_ZERO, " %s=%s",
684 tp->args[i].name, tp->args[i].type->fmt);
685 }
686
687 pos += snprintf(buf + pos, LEN_OR_ZERO, "\", %s", arg);
688
689 for (i = 0; i < tp->nr_args; i++) {
690 if (strcmp(tp->args[i].type->name, "string") == 0)
691 pos += snprintf(buf + pos, LEN_OR_ZERO,
692 ", __get_str(%s)",
693 tp->args[i].name);
694 else
695 pos += snprintf(buf + pos, LEN_OR_ZERO, ", REC->%s",
696 tp->args[i].name);
697 }
698
699#undef LEN_OR_ZERO
700
701 /* return the length of print_fmt */
702 return pos;
703}
704
705int set_print_fmt(struct trace_probe *tp, bool is_return)
706{
707 int len;
708 char *print_fmt;
709
710 /* First: called with 0 length to calculate the needed length */
711 len = __set_print_fmt(tp, NULL, 0, is_return);
712 print_fmt = kmalloc(len + 1, GFP_KERNEL);
713 if (!print_fmt)
714 return -ENOMEM;
715
716 /* Second: actually write the @print_fmt */
717 __set_print_fmt(tp, print_fmt, len + 1, is_return);
718 tp->call.print_fmt = print_fmt;
719
720 return 0;
721}
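set_print_fmt(), now shared through trace_probe.c, uses the common two-pass snprintf idiom: the first call runs with a zero length only to measure, the buffer is then allocated with that size, and the second call writes the format for real. A small user-space sketch of the same idiom follows; build_fmt() and its arguments are invented for the example and only mirror the shape of __set_print_fmt().

#include <stdio.h>
#include <stdlib.h>

/* With len == 0 nothing is written; snprintf() still reports the length. */
#define LEN_OR_ZERO (len ? len - pos : 0)

static int build_fmt(char *buf, int len, const char *name, const char *fmt)
{
        int pos = 0;

        pos += snprintf(buf + pos, LEN_OR_ZERO, "\"(%%lx)");
        pos += snprintf(buf + pos, LEN_OR_ZERO, " %s=%s", name, fmt);
        pos += snprintf(buf + pos, LEN_OR_ZERO, "\", REC->ip");

        return pos;     /* number of bytes a real write would need */
}

int main(void)
{
        int len = build_fmt(NULL, 0, "arg1", "%d");     /* pass 1: measure */
        char *print_fmt = malloc(len + 1);

        if (!print_fmt)
                return 1;

        build_fmt(print_fmt, len + 1, "arg1", "%d");    /* pass 2: write */
        printf("print_fmt: %s\n", print_fmt);
        free(print_fmt);
        return 0;
}

The payoff is a single formatting routine that never needs a guessed buffer size, which is why the same helper can serve both kprobe and uprobe events once the is_return flag is passed in explicitly.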
diff --git a/kernel/trace/trace_probe.h b/kernel/trace/trace_probe.h
index 5c7e09d10d74..b73574a5f429 100644
--- a/kernel/trace/trace_probe.h
+++ b/kernel/trace/trace_probe.h
@@ -81,6 +81,17 @@
81 */ 81 */
82#define convert_rloc_to_loc(dl, offs) ((u32)(dl) + (offs)) 82#define convert_rloc_to_loc(dl, offs) ((u32)(dl) + (offs))
83 83
84static inline void *get_rloc_data(u32 *dl)
85{
86 return (u8 *)dl + get_rloc_offs(*dl);
87}
88
89/* For data_loc conversion */
90static inline void *get_loc_data(u32 *dl, void *ent)
91{
92 return (u8 *)ent + get_rloc_offs(*dl);
93}
94
84/* Data fetch function type */ 95/* Data fetch function type */
85typedef void (*fetch_func_t)(struct pt_regs *, void *, void *); 96typedef void (*fetch_func_t)(struct pt_regs *, void *, void *);
86/* Printing function type */ 97/* Printing function type */
@@ -95,6 +106,7 @@ enum {
95 FETCH_MTD_symbol, 106 FETCH_MTD_symbol,
96 FETCH_MTD_deref, 107 FETCH_MTD_deref,
97 FETCH_MTD_bitfield, 108 FETCH_MTD_bitfield,
109 FETCH_MTD_file_offset,
98 FETCH_MTD_END, 110 FETCH_MTD_END,
99}; 111};
100 112
@@ -115,6 +127,148 @@ struct fetch_param {
115 void *data; 127 void *data;
116}; 128};
117 129
130/* For defining macros, define string/string_size types */
131typedef u32 string;
132typedef u32 string_size;
133
134#define PRINT_TYPE_FUNC_NAME(type) print_type_##type
135#define PRINT_TYPE_FMT_NAME(type) print_type_format_##type
136
137/* Printing in basic type function template */
138#define DECLARE_BASIC_PRINT_TYPE_FUNC(type) \
139__kprobes int PRINT_TYPE_FUNC_NAME(type)(struct trace_seq *s, \
140 const char *name, \
141 void *data, void *ent); \
142extern const char PRINT_TYPE_FMT_NAME(type)[]
143
144DECLARE_BASIC_PRINT_TYPE_FUNC(u8);
145DECLARE_BASIC_PRINT_TYPE_FUNC(u16);
146DECLARE_BASIC_PRINT_TYPE_FUNC(u32);
147DECLARE_BASIC_PRINT_TYPE_FUNC(u64);
148DECLARE_BASIC_PRINT_TYPE_FUNC(s8);
149DECLARE_BASIC_PRINT_TYPE_FUNC(s16);
150DECLARE_BASIC_PRINT_TYPE_FUNC(s32);
151DECLARE_BASIC_PRINT_TYPE_FUNC(s64);
152DECLARE_BASIC_PRINT_TYPE_FUNC(string);
153
154#define FETCH_FUNC_NAME(method, type) fetch_##method##_##type
155
156/* Declare macro for basic types */
157#define DECLARE_FETCH_FUNC(method, type) \
158extern void FETCH_FUNC_NAME(method, type)(struct pt_regs *regs, \
159 void *data, void *dest)
160
161#define DECLARE_BASIC_FETCH_FUNCS(method) \
162DECLARE_FETCH_FUNC(method, u8); \
163DECLARE_FETCH_FUNC(method, u16); \
164DECLARE_FETCH_FUNC(method, u32); \
165DECLARE_FETCH_FUNC(method, u64)
166
167DECLARE_BASIC_FETCH_FUNCS(reg);
168#define fetch_reg_string NULL
169#define fetch_reg_string_size NULL
170
171DECLARE_BASIC_FETCH_FUNCS(retval);
172#define fetch_retval_string NULL
173#define fetch_retval_string_size NULL
174
175DECLARE_BASIC_FETCH_FUNCS(symbol);
176DECLARE_FETCH_FUNC(symbol, string);
177DECLARE_FETCH_FUNC(symbol, string_size);
178
179DECLARE_BASIC_FETCH_FUNCS(deref);
180DECLARE_FETCH_FUNC(deref, string);
181DECLARE_FETCH_FUNC(deref, string_size);
182
183DECLARE_BASIC_FETCH_FUNCS(bitfield);
184#define fetch_bitfield_string NULL
185#define fetch_bitfield_string_size NULL
186
187/*
188 * Define macro for basic types - we don't need to define s* types, because
189 * we have to care only about bitwidth at recording time.
190 */
191#define DEFINE_BASIC_FETCH_FUNCS(method) \
192DEFINE_FETCH_##method(u8) \
193DEFINE_FETCH_##method(u16) \
194DEFINE_FETCH_##method(u32) \
195DEFINE_FETCH_##method(u64)
196
197/* Default (unsigned long) fetch type */
198#define __DEFAULT_FETCH_TYPE(t) u##t
199#define _DEFAULT_FETCH_TYPE(t) __DEFAULT_FETCH_TYPE(t)
200#define DEFAULT_FETCH_TYPE _DEFAULT_FETCH_TYPE(BITS_PER_LONG)
201#define DEFAULT_FETCH_TYPE_STR __stringify(DEFAULT_FETCH_TYPE)
202
203#define ASSIGN_FETCH_FUNC(method, type) \
204 [FETCH_MTD_##method] = FETCH_FUNC_NAME(method, type)
205
206#define __ASSIGN_FETCH_TYPE(_name, ptype, ftype, _size, sign, _fmttype) \
207 {.name = _name, \
208 .size = _size, \
209 .is_signed = sign, \
210 .print = PRINT_TYPE_FUNC_NAME(ptype), \
211 .fmt = PRINT_TYPE_FMT_NAME(ptype), \
212 .fmttype = _fmttype, \
213 .fetch = { \
214ASSIGN_FETCH_FUNC(reg, ftype), \
215ASSIGN_FETCH_FUNC(stack, ftype), \
216ASSIGN_FETCH_FUNC(retval, ftype), \
217ASSIGN_FETCH_FUNC(memory, ftype), \
218ASSIGN_FETCH_FUNC(symbol, ftype), \
219ASSIGN_FETCH_FUNC(deref, ftype), \
220ASSIGN_FETCH_FUNC(bitfield, ftype), \
221ASSIGN_FETCH_FUNC(file_offset, ftype), \
222 } \
223 }
224
225#define ASSIGN_FETCH_TYPE(ptype, ftype, sign) \
226 __ASSIGN_FETCH_TYPE(#ptype, ptype, ftype, sizeof(ftype), sign, #ptype)
227
228#define ASSIGN_FETCH_TYPE_END {}
229
230#define FETCH_TYPE_STRING 0
231#define FETCH_TYPE_STRSIZE 1
232
233/*
234 * Fetch type information table.
235 * It's declared as a weak symbol due to conditional compilation.
236 */
237extern __weak const struct fetch_type kprobes_fetch_type_table[];
238extern __weak const struct fetch_type uprobes_fetch_type_table[];
239
240#ifdef CONFIG_KPROBE_EVENT
241struct symbol_cache;
242unsigned long update_symbol_cache(struct symbol_cache *sc);
243void free_symbol_cache(struct symbol_cache *sc);
244struct symbol_cache *alloc_symbol_cache(const char *sym, long offset);
245#else
246/* uprobes do not support symbol fetch methods */
247#define fetch_symbol_u8 NULL
248#define fetch_symbol_u16 NULL
249#define fetch_symbol_u32 NULL
250#define fetch_symbol_u64 NULL
251#define fetch_symbol_string NULL
252#define fetch_symbol_string_size NULL
253
254struct symbol_cache {
255};
256static inline unsigned long __used update_symbol_cache(struct symbol_cache *sc)
257{
258 return 0;
259}
260
261static inline void __used free_symbol_cache(struct symbol_cache *sc)
262{
263}
264
265static inline struct symbol_cache * __used
266alloc_symbol_cache(const char *sym, long offset)
267{
268 return NULL;
269}
270#endif /* CONFIG_KPROBE_EVENT */
271
118struct probe_arg { 272struct probe_arg {
119 struct fetch_param fetch; 273 struct fetch_param fetch;
120 struct fetch_param fetch_size; 274 struct fetch_param fetch_size;
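The kprobes_fetch_type_table/uprobes_fetch_type_table declarations a few lines above rely on weak linkage: whichever table belongs to a probe flavour that is compiled out simply resolves to a NULL address instead of breaking the link, and the BUG_ON(ftbl == NULL) checks in trace_probe.c guard against calling into a missing one. The stand-alone sketch below shows that behaviour on a typical GCC/Clang ELF toolchain; the table contents are made up.

#include <stdio.h>

struct fetch_type { const char *name; };

/* Defined below, so this weak symbol resolves normally. */
extern const struct fetch_type kprobes_table[] __attribute__((weak));
/* Never defined anywhere: the weak reference resolves to a NULL address. */
extern const struct fetch_type uprobes_table[] __attribute__((weak));

const struct fetch_type kprobes_table[] = {
        { .name = "u8" },
        { .name = NULL },       /* table terminator, like ASSIGN_FETCH_TYPE_END */
};

int main(void)
{
        printf("kprobes table: %s\n", kprobes_table ? "present" : "absent");
        printf("uprobes table: %s\n", uprobes_table ? "present" : "absent");
        return 0;
}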
@@ -124,6 +278,26 @@ struct probe_arg {
124 const struct fetch_type *type; /* Type of this argument */ 278 const struct fetch_type *type; /* Type of this argument */
125}; 279};
126 280
281struct trace_probe {
282 unsigned int flags; /* For TP_FLAG_* */
283 struct ftrace_event_class class;
284 struct ftrace_event_call call;
285 struct list_head files;
286 ssize_t size; /* trace entry size */
287 unsigned int nr_args;
288 struct probe_arg args[];
289};
290
291static inline bool trace_probe_is_enabled(struct trace_probe *tp)
292{
293 return !!(tp->flags & (TP_FLAG_TRACE | TP_FLAG_PROFILE));
294}
295
296static inline bool trace_probe_is_registered(struct trace_probe *tp)
297{
298 return !!(tp->flags & TP_FLAG_REGISTERED);
299}
300
127static inline __kprobes void call_fetch(struct fetch_param *fprm, 301static inline __kprobes void call_fetch(struct fetch_param *fprm,
128 struct pt_regs *regs, void *dest) 302 struct pt_regs *regs, void *dest)
129{ 303{
@@ -158,3 +332,53 @@ extern ssize_t traceprobe_probes_write(struct file *file,
158 int (*createfn)(int, char**)); 332 int (*createfn)(int, char**));
159 333
160extern int traceprobe_command(const char *buf, int (*createfn)(int, char**)); 334extern int traceprobe_command(const char *buf, int (*createfn)(int, char**));
335
336/* Sum up total data length for dynamic arraies (strings) */
337static inline __kprobes int
338__get_data_size(struct trace_probe *tp, struct pt_regs *regs)
339{
340 int i, ret = 0;
341 u32 len;
342
343 for (i = 0; i < tp->nr_args; i++)
344 if (unlikely(tp->args[i].fetch_size.fn)) {
345 call_fetch(&tp->args[i].fetch_size, regs, &len);
346 ret += len;
347 }
348
349 return ret;
350}
351
352/* Store the value of each argument */
353static inline __kprobes void
354store_trace_args(int ent_size, struct trace_probe *tp, struct pt_regs *regs,
355 u8 *data, int maxlen)
356{
357 int i;
358 u32 end = tp->size;
359 u32 *dl; /* Data (relative) location */
360
361 for (i = 0; i < tp->nr_args; i++) {
362 if (unlikely(tp->args[i].fetch_size.fn)) {
363 /*
364 * First, we set the relative location and
365 * maximum data length to *dl
366 */
367 dl = (u32 *)(data + tp->args[i].offset);
368 *dl = make_data_rloc(maxlen, end - tp->args[i].offset);
369 /* Then try to fetch string or dynamic array data */
370 call_fetch(&tp->args[i].fetch, regs, dl);
371 /* Reduce maximum length */
372 end += get_rloc_len(*dl);
373 maxlen -= get_rloc_len(*dl);
374 /* Trick here, convert data_rloc to data_loc */
375 *dl = convert_rloc_to_loc(*dl,
376 ent_size + tp->args[i].offset);
377 } else
378 /* Just fetching data normally */
379 call_fetch(&tp->args[i].fetch, regs,
380 data + tp->args[i].offset);
381 }
382}
383
384extern int set_print_fmt(struct trace_probe *tp, bool is_return);
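The struct trace_probe added above ends with a flexible array member, struct probe_arg args[], so a probe and all of its argument descriptors come from a single allocation sized by nr_args. A minimal user-space sketch of that allocation pattern, with the other fields trimmed away:

#include <stdio.h>
#include <stdlib.h>

struct probe_arg { const char *name; };

struct trace_probe_like {
        unsigned int nr_args;
        struct probe_arg args[];        /* flexible array member */
};

static struct trace_probe_like *alloc_probe(unsigned int nr_args)
{
        /* One allocation covers the header plus every argument slot. */
        struct trace_probe_like *tp =
                calloc(1, sizeof(*tp) + nr_args * sizeof(struct probe_arg));

        if (tp)
                tp->nr_args = nr_args;
        return tp;
}

int main(void)
{
        struct trace_probe_like *tp = alloc_probe(3);
        unsigned int i;

        if (!tp)
                return 1;
        for (i = 0; i < tp->nr_args; i++)
                tp->args[i].name = "argN";
        printf("nr_args = %u\n", tp->nr_args);
        free(tp);
        return 0;
}

The same layout is what lets __get_data_size() and store_trace_args() above iterate args[0..nr_args) directly off the probe object, whether it is embedded in a trace_kprobe or a trace_uprobe.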
diff --git a/kernel/trace/trace_sched_wakeup.c b/kernel/trace/trace_sched_wakeup.c
index fee77e15d815..6e32635e5e57 100644
--- a/kernel/trace/trace_sched_wakeup.c
+++ b/kernel/trace/trace_sched_wakeup.c
@@ -16,6 +16,7 @@
16#include <linux/uaccess.h> 16#include <linux/uaccess.h>
17#include <linux/ftrace.h> 17#include <linux/ftrace.h>
18#include <linux/sched/rt.h> 18#include <linux/sched/rt.h>
19#include <linux/sched/deadline.h>
19#include <trace/events/sched.h> 20#include <trace/events/sched.h>
20#include "trace.h" 21#include "trace.h"
21 22
@@ -27,6 +28,8 @@ static int wakeup_cpu;
27static int wakeup_current_cpu; 28static int wakeup_current_cpu;
28static unsigned wakeup_prio = -1; 29static unsigned wakeup_prio = -1;
29static int wakeup_rt; 30static int wakeup_rt;
31static int wakeup_dl;
32static int tracing_dl = 0;
30 33
31static arch_spinlock_t wakeup_lock = 34static arch_spinlock_t wakeup_lock =
32 (arch_spinlock_t)__ARCH_SPIN_LOCK_UNLOCKED; 35 (arch_spinlock_t)__ARCH_SPIN_LOCK_UNLOCKED;
@@ -437,6 +440,7 @@ static void __wakeup_reset(struct trace_array *tr)
437{ 440{
438 wakeup_cpu = -1; 441 wakeup_cpu = -1;
439 wakeup_prio = -1; 442 wakeup_prio = -1;
443 tracing_dl = 0;
440 444
441 if (wakeup_task) 445 if (wakeup_task)
442 put_task_struct(wakeup_task); 446 put_task_struct(wakeup_task);
@@ -472,9 +476,17 @@ probe_wakeup(void *ignore, struct task_struct *p, int success)
472 tracing_record_cmdline(p); 476 tracing_record_cmdline(p);
473 tracing_record_cmdline(current); 477 tracing_record_cmdline(current);
474 478
475 if ((wakeup_rt && !rt_task(p)) || 479 /*
476 p->prio >= wakeup_prio || 480 * Semantic is like this:
477 p->prio >= current->prio) 481 * - wakeup tracer handles all tasks in the system, independently
482 * from their scheduling class;
483 * - wakeup_rt tracer handles tasks belonging to sched_dl and
484 * sched_rt class;
485 * - wakeup_dl handles tasks belonging to sched_dl class only.
486 */
487 if (tracing_dl || (wakeup_dl && !dl_task(p)) ||
488 (wakeup_rt && !dl_task(p) && !rt_task(p)) ||
489 (!dl_task(p) && (p->prio >= wakeup_prio || p->prio >= current->prio)))
478 return; 490 return;
479 491
480 pc = preempt_count(); 492 pc = preempt_count();
@@ -486,7 +498,8 @@ probe_wakeup(void *ignore, struct task_struct *p, int success)
486 arch_spin_lock(&wakeup_lock); 498 arch_spin_lock(&wakeup_lock);
487 499
488 /* check for races. */ 500 /* check for races. */
489 if (!tracer_enabled || p->prio >= wakeup_prio) 501 if (!tracer_enabled || tracing_dl ||
502 (!dl_task(p) && p->prio >= wakeup_prio))
490 goto out_locked; 503 goto out_locked;
491 504
492 /* reset the trace */ 505 /* reset the trace */
@@ -496,6 +509,15 @@ probe_wakeup(void *ignore, struct task_struct *p, int success)
496 wakeup_current_cpu = wakeup_cpu; 509 wakeup_current_cpu = wakeup_cpu;
497 wakeup_prio = p->prio; 510 wakeup_prio = p->prio;
498 511
512 /*
513 * Once you start tracing a -deadline task, don't bother tracing
514 * another task until the first one wakes up.
515 */
516 if (dl_task(p))
517 tracing_dl = 1;
518 else
519 tracing_dl = 0;
520
499 wakeup_task = p; 521 wakeup_task = p;
500 get_task_struct(wakeup_task); 522 get_task_struct(wakeup_task);
501 523
@@ -597,16 +619,25 @@ static int __wakeup_tracer_init(struct trace_array *tr)
597 619
598static int wakeup_tracer_init(struct trace_array *tr) 620static int wakeup_tracer_init(struct trace_array *tr)
599{ 621{
622 wakeup_dl = 0;
600 wakeup_rt = 0; 623 wakeup_rt = 0;
601 return __wakeup_tracer_init(tr); 624 return __wakeup_tracer_init(tr);
602} 625}
603 626
604static int wakeup_rt_tracer_init(struct trace_array *tr) 627static int wakeup_rt_tracer_init(struct trace_array *tr)
605{ 628{
629 wakeup_dl = 0;
606 wakeup_rt = 1; 630 wakeup_rt = 1;
607 return __wakeup_tracer_init(tr); 631 return __wakeup_tracer_init(tr);
608} 632}
609 633
634static int wakeup_dl_tracer_init(struct trace_array *tr)
635{
636 wakeup_dl = 1;
637 wakeup_rt = 0;
638 return __wakeup_tracer_init(tr);
639}
640
610static void wakeup_tracer_reset(struct trace_array *tr) 641static void wakeup_tracer_reset(struct trace_array *tr)
611{ 642{
612 int lat_flag = save_flags & TRACE_ITER_LATENCY_FMT; 643 int lat_flag = save_flags & TRACE_ITER_LATENCY_FMT;
@@ -674,6 +705,28 @@ static struct tracer wakeup_rt_tracer __read_mostly =
674 .use_max_tr = true, 705 .use_max_tr = true,
675}; 706};
676 707
708static struct tracer wakeup_dl_tracer __read_mostly =
709{
710 .name = "wakeup_dl",
711 .init = wakeup_dl_tracer_init,
712 .reset = wakeup_tracer_reset,
713 .start = wakeup_tracer_start,
714 .stop = wakeup_tracer_stop,
715 .wait_pipe = poll_wait_pipe,
716 .print_max = true,
717 .print_header = wakeup_print_header,
718 .print_line = wakeup_print_line,
719 .flags = &tracer_flags,
720 .set_flag = wakeup_set_flag,
721 .flag_changed = wakeup_flag_changed,
722#ifdef CONFIG_FTRACE_SELFTEST
723 .selftest = trace_selftest_startup_wakeup,
724#endif
725 .open = wakeup_trace_open,
726 .close = wakeup_trace_close,
727 .use_max_tr = true,
728};
729
677__init static int init_wakeup_tracer(void) 730__init static int init_wakeup_tracer(void)
678{ 731{
679 int ret; 732 int ret;
@@ -686,6 +739,10 @@ __init static int init_wakeup_tracer(void)
686 if (ret) 739 if (ret)
687 return ret; 740 return ret;
688 741
742 ret = register_tracer(&wakeup_dl_tracer);
743 if (ret)
744 return ret;
745
689 return 0; 746 return 0;
690} 747}
691core_initcall(init_wakeup_tracer); 748core_initcall(init_wakeup_tracer);
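The comment block added to probe_wakeup() spells out the filtering semantics: plain wakeup traces every task, wakeup_rt traces -deadline and RT tasks, and wakeup_dl traces -deadline tasks only, with -deadline tasks also bypassing the priority comparison. The stand-alone sketch below condenses that condition into one predicate; the task fields and helpers are stand-ins for dl_task(), rt_task() and p->prio.

#include <stdbool.h>
#include <stdio.h>

struct task { bool dl; bool rt; int prio; };    /* lower prio value = higher priority */

/* Returns true when this wakeup should be ignored by the active tracer. */
static bool ignore_wakeup(const struct task *p, const struct task *curr,
                          bool wakeup_dl, bool wakeup_rt,
                          bool tracing_dl, int wakeup_prio)
{
        if (tracing_dl)                         /* already following a -deadline task */
                return true;
        if (wakeup_dl && !p->dl)                /* wakeup_dl: -deadline tasks only */
                return true;
        if (wakeup_rt && !p->dl && !p->rt)      /* wakeup_rt: -deadline or RT tasks */
                return true;
        if (!p->dl && (p->prio >= wakeup_prio || p->prio >= curr->prio))
                return true;                    /* plain wakeup: priority filter */
        return false;
}

int main(void)
{
        struct task dl  = { .dl = true,  .rt = false, .prio = -1  };
        struct task rt  = { .dl = false, .rt = true,  .prio = 10  };
        struct task cur = { .dl = false, .rt = false, .prio = 120 };

        /* With the wakeup_dl tracer active, only the -deadline task is traced. */
        printf("deadline task ignored: %d\n",
               ignore_wakeup(&dl, &cur, true, false, false, 50));
        printf("rt task ignored:       %d\n",
               ignore_wakeup(&rt, &cur, true, false, false, 50));
        return 0;
}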
diff --git a/kernel/trace/trace_selftest.c b/kernel/trace/trace_selftest.c
index a7329b7902f8..e98fca60974f 100644
--- a/kernel/trace/trace_selftest.c
+++ b/kernel/trace/trace_selftest.c
@@ -1022,11 +1022,16 @@ trace_selftest_startup_nop(struct tracer *trace, struct trace_array *tr)
1022#ifdef CONFIG_SCHED_TRACER 1022#ifdef CONFIG_SCHED_TRACER
1023static int trace_wakeup_test_thread(void *data) 1023static int trace_wakeup_test_thread(void *data)
1024{ 1024{
1025 /* Make this a RT thread, doesn't need to be too high */ 1025 /* Make this a -deadline thread */
1026 static const struct sched_param param = { .sched_priority = 5 }; 1026 static const struct sched_attr attr = {
1027 .sched_policy = SCHED_DEADLINE,
1028 .sched_runtime = 100000ULL,
1029 .sched_deadline = 10000000ULL,
1030 .sched_period = 10000000ULL
1031 };
1027 struct completion *x = data; 1032 struct completion *x = data;
1028 1033
1029 sched_setscheduler(current, SCHED_FIFO, &param); 1034 sched_setattr(current, &attr);
1030 1035
1031 /* Make it know we have a new prio */ 1036 /* Make it know we have a new prio */
1032 complete(x); 1037 complete(x);
@@ -1040,8 +1045,8 @@ static int trace_wakeup_test_thread(void *data)
1040 /* we are awake, now wait to disappear */ 1045 /* we are awake, now wait to disappear */
1041 while (!kthread_should_stop()) { 1046 while (!kthread_should_stop()) {
1042 /* 1047 /*
1043 * This is an RT task, do short sleeps to let 1048 * This will likely be the system top priority
1044 * others run. 1049 * task, do short sleeps to let others run.
1045 */ 1050 */
1046 msleep(100); 1051 msleep(100);
1047 } 1052 }
@@ -1054,21 +1059,21 @@ trace_selftest_startup_wakeup(struct tracer *trace, struct trace_array *tr)
1054{ 1059{
1055 unsigned long save_max = tracing_max_latency; 1060 unsigned long save_max = tracing_max_latency;
1056 struct task_struct *p; 1061 struct task_struct *p;
1057 struct completion isrt; 1062 struct completion is_ready;
1058 unsigned long count; 1063 unsigned long count;
1059 int ret; 1064 int ret;
1060 1065
1061 init_completion(&isrt); 1066 init_completion(&is_ready);
1062 1067
1063 /* create a high prio thread */ 1068 /* create a -deadline thread */
1064 p = kthread_run(trace_wakeup_test_thread, &isrt, "ftrace-test"); 1069 p = kthread_run(trace_wakeup_test_thread, &is_ready, "ftrace-test");
1065 if (IS_ERR(p)) { 1070 if (IS_ERR(p)) {
1066 printk(KERN_CONT "Failed to create ftrace wakeup test thread "); 1071 printk(KERN_CONT "Failed to create ftrace wakeup test thread ");
1067 return -1; 1072 return -1;
1068 } 1073 }
1069 1074
1070 /* make sure the thread is running at an RT prio */ 1075 /* make sure the thread is running at -deadline policy */
1071 wait_for_completion(&isrt); 1076 wait_for_completion(&is_ready);
1072 1077
1073 /* start the tracing */ 1078 /* start the tracing */
1074 ret = tracer_init(trace, tr); 1079 ret = tracer_init(trace, tr);
@@ -1082,19 +1087,19 @@ trace_selftest_startup_wakeup(struct tracer *trace, struct trace_array *tr)
1082 1087
1083 while (p->on_rq) { 1088 while (p->on_rq) {
1084 /* 1089 /*
1085 * Sleep to make sure the RT thread is asleep too. 1090 * Sleep to make sure the -deadline thread is asleep too.
1086 * On virtual machines we can't rely on timings, 1091 * On virtual machines we can't rely on timings,
1087 * but we want to make sure this test still works. 1092 * but we want to make sure this test still works.
1088 */ 1093 */
1089 msleep(100); 1094 msleep(100);
1090 } 1095 }
1091 1096
1092 init_completion(&isrt); 1097 init_completion(&is_ready);
1093 1098
1094 wake_up_process(p); 1099 wake_up_process(p);
1095 1100
1096 /* Wait for the task to wake up */ 1101 /* Wait for the task to wake up */
1097 wait_for_completion(&isrt); 1102 wait_for_completion(&is_ready);
1098 1103
1099 /* stop the tracing. */ 1104 /* stop the tracing. */
1100 tracing_stop(); 1105 tracing_stop();
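The selftest thread above now puts itself into SCHED_DEADLINE with sched_setattr(). Here is a minimal userspace sketch of the same call through the raw syscall, reusing the 100us runtime / 10ms deadline / 10ms period values from the selftest; the struct layout, the SCHED_DEADLINE fallback define and the availability of SYS_sched_setattr are assumptions about the target headers and libc.

/*
 * Hedged sketch: put the calling thread into SCHED_DEADLINE from userspace,
 * mirroring the parameters used by the selftest above.  glibc has no
 * wrapper for sched_setattr(), so the raw syscall is used; the struct
 * layout follows the uapi definition (an assumption about the headers).
 */
#define _GNU_SOURCE
#include <stdint.h>
#include <stdio.h>
#include <unistd.h>
#include <sys/syscall.h>

#ifndef SCHED_DEADLINE
#define SCHED_DEADLINE  6
#endif

struct sched_attr {
        uint32_t size;
        uint32_t sched_policy;
        uint64_t sched_flags;
        int32_t  sched_nice;
        uint32_t sched_priority;
        uint64_t sched_runtime;
        uint64_t sched_deadline;
        uint64_t sched_period;
};

int main(void)
{
        struct sched_attr attr = {
                .size           = sizeof(attr),
                .sched_policy   = SCHED_DEADLINE,
                .sched_runtime  = 100000ULL,    /* 100 us, as in the selftest */
                .sched_deadline = 10000000ULL,  /*  10 ms */
                .sched_period   = 10000000ULL,  /*  10 ms */
        };

        if (syscall(SYS_sched_setattr, 0, &attr, 0)) {
                perror("sched_setattr");
                return 1;
        }
        printf("now running as SCHED_DEADLINE\n");
        return 0;
}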
diff --git a/kernel/trace/trace_stack.c b/kernel/trace/trace_stack.c
index b20428c5efe2..e6be585cf06a 100644
--- a/kernel/trace/trace_stack.c
+++ b/kernel/trace/trace_stack.c
@@ -382,7 +382,7 @@ static const struct file_operations stack_trace_filter_fops = {
382 .open = stack_trace_filter_open, 382 .open = stack_trace_filter_open,
383 .read = seq_read, 383 .read = seq_read,
384 .write = ftrace_filter_write, 384 .write = ftrace_filter_write,
385 .llseek = ftrace_filter_lseek, 385 .llseek = tracing_lseek,
386 .release = ftrace_regex_release, 386 .release = ftrace_regex_release,
387}; 387};
388 388
diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c
index e4b6d11bdf78..759d5e004517 100644
--- a/kernel/trace/trace_syscalls.c
+++ b/kernel/trace/trace_syscalls.c
@@ -321,7 +321,7 @@ static void ftrace_syscall_enter(void *data, struct pt_regs *regs, long id)
321 if (!ftrace_file) 321 if (!ftrace_file)
322 return; 322 return;
323 323
324 if (test_bit(FTRACE_EVENT_FL_SOFT_DISABLED_BIT, &ftrace_file->flags)) 324 if (ftrace_trigger_soft_disabled(ftrace_file))
325 return; 325 return;
326 326
327 sys_data = syscall_nr_to_meta(syscall_nr); 327 sys_data = syscall_nr_to_meta(syscall_nr);
@@ -343,9 +343,8 @@ static void ftrace_syscall_enter(void *data, struct pt_regs *regs, long id)
343 entry->nr = syscall_nr; 343 entry->nr = syscall_nr;
344 syscall_get_arguments(current, regs, 0, sys_data->nb_args, entry->args); 344 syscall_get_arguments(current, regs, 0, sys_data->nb_args, entry->args);
345 345
346 if (!filter_check_discard(ftrace_file, entry, buffer, event)) 346 event_trigger_unlock_commit(ftrace_file, buffer, event, entry,
347 trace_current_buffer_unlock_commit(buffer, event, 347 irq_flags, pc);
348 irq_flags, pc);
349} 348}
350 349
351static void ftrace_syscall_exit(void *data, struct pt_regs *regs, long ret) 350static void ftrace_syscall_exit(void *data, struct pt_regs *regs, long ret)
@@ -369,7 +368,7 @@ static void ftrace_syscall_exit(void *data, struct pt_regs *regs, long ret)
369 if (!ftrace_file) 368 if (!ftrace_file)
370 return; 369 return;
371 370
372 if (test_bit(FTRACE_EVENT_FL_SOFT_DISABLED_BIT, &ftrace_file->flags)) 371 if (ftrace_trigger_soft_disabled(ftrace_file))
373 return; 372 return;
374 373
375 sys_data = syscall_nr_to_meta(syscall_nr); 374 sys_data = syscall_nr_to_meta(syscall_nr);
@@ -390,9 +389,8 @@ static void ftrace_syscall_exit(void *data, struct pt_regs *regs, long ret)
390 entry->nr = syscall_nr; 389 entry->nr = syscall_nr;
391 entry->ret = syscall_get_return_value(current, regs); 390 entry->ret = syscall_get_return_value(current, regs);
392 391
393 if (!filter_check_discard(ftrace_file, entry, buffer, event)) 392 event_trigger_unlock_commit(ftrace_file, buffer, event, entry,
394 trace_current_buffer_unlock_commit(buffer, event, 393 irq_flags, pc);
395 irq_flags, pc);
396} 394}
397 395
398static int reg_event_syscall_enter(struct ftrace_event_file *file, 396static int reg_event_syscall_enter(struct ftrace_event_file *file,
@@ -431,11 +429,6 @@ static void unreg_event_syscall_enter(struct ftrace_event_file *file,
431 if (!tr->sys_refcount_enter) 429 if (!tr->sys_refcount_enter)
432 unregister_trace_sys_enter(ftrace_syscall_enter, tr); 430 unregister_trace_sys_enter(ftrace_syscall_enter, tr);
433 mutex_unlock(&syscall_trace_lock); 431 mutex_unlock(&syscall_trace_lock);
434 /*
435 * Callers expect the event to be completely disabled on
436 * return, so wait for current handlers to finish.
437 */
438 synchronize_sched();
439} 432}
440 433
441static int reg_event_syscall_exit(struct ftrace_event_file *file, 434static int reg_event_syscall_exit(struct ftrace_event_file *file,
@@ -474,11 +467,6 @@ static void unreg_event_syscall_exit(struct ftrace_event_file *file,
474 if (!tr->sys_refcount_exit) 467 if (!tr->sys_refcount_exit)
475 unregister_trace_sys_exit(ftrace_syscall_exit, tr); 468 unregister_trace_sys_exit(ftrace_syscall_exit, tr);
476 mutex_unlock(&syscall_trace_lock); 469 mutex_unlock(&syscall_trace_lock);
477 /*
478 * Callers expect the event to be completely disabled on
479 * return, so wait for current handlers to finish.
480 */
481 synchronize_sched();
482} 470}
483 471
484static int __init init_syscall_trace(struct ftrace_event_call *call) 472static int __init init_syscall_trace(struct ftrace_event_call *call)
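Both syscall event handlers above now gate on ftrace_trigger_soft_disabled() and commit through event_trigger_unlock_commit(), wiring syscall events into the new trigger machinery (trace_events_trigger.c in this series). A hedged sketch of the resulting user-visible interface follows; the event name, the trigger command and the tracefs path are assumptions, so adjust them to whatever the running kernel exposes under events/syscalls.

/*
 * Hedged sketch of the user-visible side of the trigger hookup above:
 * enable a syscall event and attach a "traceoff" trigger to it.  Event
 * name, trigger command and directory layout are assumptions about a
 * kernel that carries this patch set.
 */
#include <stdio.h>

#define EVT_DIR "/sys/kernel/debug/tracing/events/syscalls/sys_enter_openat/"

static int echo_to(const char *path, const char *val)
{
        FILE *f = fopen(path, "w");

        if (!f) {
                perror(path);
                return -1;
        }
        fputs(val, f);
        fclose(f);
        return 0;
}

int main(void)
{
        /* turn the event on ... */
        if (echo_to(EVT_DIR "enable", "1"))
                return 1;
        /* ... and stop tracing after 5 hits via the new trigger file */
        return echo_to(EVT_DIR "trigger", "traceoff:5") ? 1 : 0;
}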
diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c
index b6dcc42ef7f5..79e52d93860b 100644
--- a/kernel/trace/trace_uprobe.c
+++ b/kernel/trace/trace_uprobe.c
@@ -51,22 +51,17 @@ struct trace_uprobe_filter {
51 */ 51 */
52struct trace_uprobe { 52struct trace_uprobe {
53 struct list_head list; 53 struct list_head list;
54 struct ftrace_event_class class;
55 struct ftrace_event_call call;
56 struct trace_uprobe_filter filter; 54 struct trace_uprobe_filter filter;
57 struct uprobe_consumer consumer; 55 struct uprobe_consumer consumer;
58 struct inode *inode; 56 struct inode *inode;
59 char *filename; 57 char *filename;
60 unsigned long offset; 58 unsigned long offset;
61 unsigned long nhit; 59 unsigned long nhit;
62 unsigned int flags; /* For TP_FLAG_* */ 60 struct trace_probe tp;
63 ssize_t size; /* trace entry size */
64 unsigned int nr_args;
65 struct probe_arg args[];
66}; 61};
67 62
68#define SIZEOF_TRACE_UPROBE(n) \ 63#define SIZEOF_TRACE_UPROBE(n) \
69 (offsetof(struct trace_uprobe, args) + \ 64 (offsetof(struct trace_uprobe, tp.args) + \
70 (sizeof(struct probe_arg) * (n))) 65 (sizeof(struct probe_arg) * (n)))
71 66
72static int register_uprobe_event(struct trace_uprobe *tu); 67static int register_uprobe_event(struct trace_uprobe *tu);
@@ -75,10 +70,151 @@ static int unregister_uprobe_event(struct trace_uprobe *tu);
75static DEFINE_MUTEX(uprobe_lock); 70static DEFINE_MUTEX(uprobe_lock);
76static LIST_HEAD(uprobe_list); 71static LIST_HEAD(uprobe_list);
77 72
73struct uprobe_dispatch_data {
74 struct trace_uprobe *tu;
75 unsigned long bp_addr;
76};
77
78static int uprobe_dispatcher(struct uprobe_consumer *con, struct pt_regs *regs); 78static int uprobe_dispatcher(struct uprobe_consumer *con, struct pt_regs *regs);
79static int uretprobe_dispatcher(struct uprobe_consumer *con, 79static int uretprobe_dispatcher(struct uprobe_consumer *con,
80 unsigned long func, struct pt_regs *regs); 80 unsigned long func, struct pt_regs *regs);
81 81
82#ifdef CONFIG_STACK_GROWSUP
83static unsigned long adjust_stack_addr(unsigned long addr, unsigned int n)
84{
85 return addr - (n * sizeof(long));
86}
87#else
88static unsigned long adjust_stack_addr(unsigned long addr, unsigned int n)
89{
90 return addr + (n * sizeof(long));
91}
92#endif
93
94static unsigned long get_user_stack_nth(struct pt_regs *regs, unsigned int n)
95{
96 unsigned long ret;
97 unsigned long addr = user_stack_pointer(regs);
98
99 addr = adjust_stack_addr(addr, n);
100
101 if (copy_from_user(&ret, (void __force __user *) addr, sizeof(ret)))
102 return 0;
103
104 return ret;
105}
106
107/*
108 * Uprobes-specific fetch functions
109 */
110#define DEFINE_FETCH_stack(type) \
111static __kprobes void FETCH_FUNC_NAME(stack, type)(struct pt_regs *regs,\
112 void *offset, void *dest) \
113{ \
114 *(type *)dest = (type)get_user_stack_nth(regs, \
115 ((unsigned long)offset)); \
116}
117DEFINE_BASIC_FETCH_FUNCS(stack)
118/* No string on the stack entry */
119#define fetch_stack_string NULL
120#define fetch_stack_string_size NULL
121
122#define DEFINE_FETCH_memory(type) \
123static __kprobes void FETCH_FUNC_NAME(memory, type)(struct pt_regs *regs,\
124 void *addr, void *dest) \
125{ \
126 type retval; \
127 void __user *vaddr = (void __force __user *) addr; \
128 \
129 if (copy_from_user(&retval, vaddr, sizeof(type))) \
130 *(type *)dest = 0; \
131 else \
132 *(type *) dest = retval; \
133}
134DEFINE_BASIC_FETCH_FUNCS(memory)
135/*
136 * Fetch a null-terminated string. Caller MUST set *(u32 *)dest with max
137 * length and relative data location.
138 */
139static __kprobes void FETCH_FUNC_NAME(memory, string)(struct pt_regs *regs,
140 void *addr, void *dest)
141{
142 long ret;
143 u32 rloc = *(u32 *)dest;
144 int maxlen = get_rloc_len(rloc);
145 u8 *dst = get_rloc_data(dest);
146 void __user *src = (void __force __user *) addr;
147
148 if (!maxlen)
149 return;
150
151 ret = strncpy_from_user(dst, src, maxlen);
152
153 if (ret < 0) { /* Failed to fetch string */
154 ((u8 *)get_rloc_data(dest))[0] = '\0';
155 *(u32 *)dest = make_data_rloc(0, get_rloc_offs(rloc));
156 } else {
157 *(u32 *)dest = make_data_rloc(ret, get_rloc_offs(rloc));
158 }
159}
160
161static __kprobes void FETCH_FUNC_NAME(memory, string_size)(struct pt_regs *regs,
162 void *addr, void *dest)
163{
164 int len;
165 void __user *vaddr = (void __force __user *) addr;
166
167 len = strnlen_user(vaddr, MAX_STRING_SIZE);
168
169 if (len == 0 || len > MAX_STRING_SIZE) /* Failed to check length */
170 *(u32 *)dest = 0;
171 else
172 *(u32 *)dest = len;
173}
174
175static unsigned long translate_user_vaddr(void *file_offset)
176{
177 unsigned long base_addr;
178 struct uprobe_dispatch_data *udd;
179
180 udd = (void *) current->utask->vaddr;
181
182 base_addr = udd->bp_addr - udd->tu->offset;
183 return base_addr + (unsigned long)file_offset;
184}
185
186#define DEFINE_FETCH_file_offset(type) \
187static __kprobes void FETCH_FUNC_NAME(file_offset, type)(struct pt_regs *regs,\
188 void *offset, void *dest) \
189{ \
190 void *vaddr = (void *)translate_user_vaddr(offset); \
191 \
192 FETCH_FUNC_NAME(memory, type)(regs, vaddr, dest); \
193}
194DEFINE_BASIC_FETCH_FUNCS(file_offset)
195DEFINE_FETCH_file_offset(string)
196DEFINE_FETCH_file_offset(string_size)
197
198/* Fetch type information table */
199const struct fetch_type uprobes_fetch_type_table[] = {
200 /* Special types */
201 [FETCH_TYPE_STRING] = __ASSIGN_FETCH_TYPE("string", string, string,
202 sizeof(u32), 1, "__data_loc char[]"),
203 [FETCH_TYPE_STRSIZE] = __ASSIGN_FETCH_TYPE("string_size", u32,
204 string_size, sizeof(u32), 0, "u32"),
205 /* Basic types */
206 ASSIGN_FETCH_TYPE(u8, u8, 0),
207 ASSIGN_FETCH_TYPE(u16, u16, 0),
208 ASSIGN_FETCH_TYPE(u32, u32, 0),
209 ASSIGN_FETCH_TYPE(u64, u64, 0),
210 ASSIGN_FETCH_TYPE(s8, u8, 1),
211 ASSIGN_FETCH_TYPE(s16, u16, 1),
212 ASSIGN_FETCH_TYPE(s32, u32, 1),
213 ASSIGN_FETCH_TYPE(s64, u64, 1),
214
215 ASSIGN_FETCH_TYPE_END
216};
217
82static inline void init_trace_uprobe_filter(struct trace_uprobe_filter *filter) 218static inline void init_trace_uprobe_filter(struct trace_uprobe_filter *filter)
83{ 219{
84 rwlock_init(&filter->rwlock); 220 rwlock_init(&filter->rwlock);
@@ -114,13 +250,13 @@ alloc_trace_uprobe(const char *group, const char *event, int nargs, bool is_ret)
114 if (!tu) 250 if (!tu)
115 return ERR_PTR(-ENOMEM); 251 return ERR_PTR(-ENOMEM);
116 252
117 tu->call.class = &tu->class; 253 tu->tp.call.class = &tu->tp.class;
118 tu->call.name = kstrdup(event, GFP_KERNEL); 254 tu->tp.call.name = kstrdup(event, GFP_KERNEL);
119 if (!tu->call.name) 255 if (!tu->tp.call.name)
120 goto error; 256 goto error;
121 257
122 tu->class.system = kstrdup(group, GFP_KERNEL); 258 tu->tp.class.system = kstrdup(group, GFP_KERNEL);
123 if (!tu->class.system) 259 if (!tu->tp.class.system)
124 goto error; 260 goto error;
125 261
126 INIT_LIST_HEAD(&tu->list); 262 INIT_LIST_HEAD(&tu->list);
@@ -128,11 +264,11 @@ alloc_trace_uprobe(const char *group, const char *event, int nargs, bool is_ret)
128 if (is_ret) 264 if (is_ret)
129 tu->consumer.ret_handler = uretprobe_dispatcher; 265 tu->consumer.ret_handler = uretprobe_dispatcher;
130 init_trace_uprobe_filter(&tu->filter); 266 init_trace_uprobe_filter(&tu->filter);
131 tu->call.flags |= TRACE_EVENT_FL_USE_CALL_FILTER; 267 tu->tp.call.flags |= TRACE_EVENT_FL_USE_CALL_FILTER;
132 return tu; 268 return tu;
133 269
134error: 270error:
135 kfree(tu->call.name); 271 kfree(tu->tp.call.name);
136 kfree(tu); 272 kfree(tu);
137 273
138 return ERR_PTR(-ENOMEM); 274 return ERR_PTR(-ENOMEM);
@@ -142,12 +278,12 @@ static void free_trace_uprobe(struct trace_uprobe *tu)
142{ 278{
143 int i; 279 int i;
144 280
145 for (i = 0; i < tu->nr_args; i++) 281 for (i = 0; i < tu->tp.nr_args; i++)
146 traceprobe_free_probe_arg(&tu->args[i]); 282 traceprobe_free_probe_arg(&tu->tp.args[i]);
147 283
148 iput(tu->inode); 284 iput(tu->inode);
149 kfree(tu->call.class->system); 285 kfree(tu->tp.call.class->system);
150 kfree(tu->call.name); 286 kfree(tu->tp.call.name);
151 kfree(tu->filename); 287 kfree(tu->filename);
152 kfree(tu); 288 kfree(tu);
153} 289}
@@ -157,8 +293,8 @@ static struct trace_uprobe *find_probe_event(const char *event, const char *grou
157 struct trace_uprobe *tu; 293 struct trace_uprobe *tu;
158 294
159 list_for_each_entry(tu, &uprobe_list, list) 295 list_for_each_entry(tu, &uprobe_list, list)
160 if (strcmp(tu->call.name, event) == 0 && 296 if (strcmp(tu->tp.call.name, event) == 0 &&
161 strcmp(tu->call.class->system, group) == 0) 297 strcmp(tu->tp.call.class->system, group) == 0)
162 return tu; 298 return tu;
163 299
164 return NULL; 300 return NULL;
@@ -181,16 +317,16 @@ static int unregister_trace_uprobe(struct trace_uprobe *tu)
181/* Register a trace_uprobe and probe_event */ 317/* Register a trace_uprobe and probe_event */
182static int register_trace_uprobe(struct trace_uprobe *tu) 318static int register_trace_uprobe(struct trace_uprobe *tu)
183{ 319{
184 struct trace_uprobe *old_tp; 320 struct trace_uprobe *old_tu;
185 int ret; 321 int ret;
186 322
187 mutex_lock(&uprobe_lock); 323 mutex_lock(&uprobe_lock);
188 324
189 /* register as an event */ 325 /* register as an event */
190 old_tp = find_probe_event(tu->call.name, tu->call.class->system); 326 old_tu = find_probe_event(tu->tp.call.name, tu->tp.call.class->system);
191 if (old_tp) { 327 if (old_tu) {
192 /* delete old event */ 328 /* delete old event */
193 ret = unregister_trace_uprobe(old_tp); 329 ret = unregister_trace_uprobe(old_tu);
194 if (ret) 330 if (ret)
195 goto end; 331 goto end;
196 } 332 }
@@ -211,7 +347,7 @@ end:
211 347
212/* 348/*
213 * Argument syntax: 349 * Argument syntax:
214 * - Add uprobe: p|r[:[GRP/]EVENT] PATH:SYMBOL [FETCHARGS] 350 * - Add uprobe: p|r[:[GRP/]EVENT] PATH:OFFSET [FETCHARGS]
215 * 351 *
216 * - Remove uprobe: -:[GRP/]EVENT 352 * - Remove uprobe: -:[GRP/]EVENT
217 */ 353 */
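The comment above now documents probes as PATH:OFFSET, and the fetch functions added earlier in this file accept $stackN, @ADDR, @+file_offset and typed arguments for uprobes. A hedged sketch of defining such a probe from userspace; the binary, the 0x4710 offset and the argument specs are placeholders, not values taken from the patch, and the uprobe_events location is an assumption.

/*
 * Hedged sketch: create a uprobe event using the syntax documented above.
 * The binary, the offset and the fetch args are made-up placeholders;
 * pick a real PATH:OFFSET (e.g. from "objdump -d") before running.
 */
#include <stdio.h>

#define UPROBE_EVENTS "/sys/kernel/debug/tracing/uprobe_events"  /* assumed */

int main(void)
{
        FILE *f = fopen(UPROBE_EVENTS, "w");

        if (!f) {
                perror(UPROBE_EVENTS);
                return 1;
        }
        /* event "myprobe": first stack slot plus a u32 at file offset 0x600 */
        fputs("p:uprobes/myprobe /bin/true:0x4710 slot0=$stack0 word=@+0x600:u32\n", f);
        fclose(f);

        /* events/uprobes/myprobe/enable can now be set to 1 to arm the probe */
        return 0;
}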
@@ -360,34 +496,36 @@ static int create_trace_uprobe(int argc, char **argv)
360 /* parse arguments */ 496 /* parse arguments */
361 ret = 0; 497 ret = 0;
362 for (i = 0; i < argc && i < MAX_TRACE_ARGS; i++) { 498 for (i = 0; i < argc && i < MAX_TRACE_ARGS; i++) {
499 struct probe_arg *parg = &tu->tp.args[i];
500
363 /* Increment count for freeing args in error case */ 501 /* Increment count for freeing args in error case */
364 tu->nr_args++; 502 tu->tp.nr_args++;
365 503
366 /* Parse argument name */ 504 /* Parse argument name */
367 arg = strchr(argv[i], '='); 505 arg = strchr(argv[i], '=');
368 if (arg) { 506 if (arg) {
369 *arg++ = '\0'; 507 *arg++ = '\0';
370 tu->args[i].name = kstrdup(argv[i], GFP_KERNEL); 508 parg->name = kstrdup(argv[i], GFP_KERNEL);
371 } else { 509 } else {
372 arg = argv[i]; 510 arg = argv[i];
373 /* If argument name is omitted, set "argN" */ 511 /* If argument name is omitted, set "argN" */
374 snprintf(buf, MAX_EVENT_NAME_LEN, "arg%d", i + 1); 512 snprintf(buf, MAX_EVENT_NAME_LEN, "arg%d", i + 1);
375 tu->args[i].name = kstrdup(buf, GFP_KERNEL); 513 parg->name = kstrdup(buf, GFP_KERNEL);
376 } 514 }
377 515
378 if (!tu->args[i].name) { 516 if (!parg->name) {
379 pr_info("Failed to allocate argument[%d] name.\n", i); 517 pr_info("Failed to allocate argument[%d] name.\n", i);
380 ret = -ENOMEM; 518 ret = -ENOMEM;
381 goto error; 519 goto error;
382 } 520 }
383 521
384 if (!is_good_name(tu->args[i].name)) { 522 if (!is_good_name(parg->name)) {
385 pr_info("Invalid argument[%d] name: %s\n", i, tu->args[i].name); 523 pr_info("Invalid argument[%d] name: %s\n", i, parg->name);
386 ret = -EINVAL; 524 ret = -EINVAL;
387 goto error; 525 goto error;
388 } 526 }
389 527
390 if (traceprobe_conflict_field_name(tu->args[i].name, tu->args, i)) { 528 if (traceprobe_conflict_field_name(parg->name, tu->tp.args, i)) {
391 pr_info("Argument[%d] name '%s' conflicts with " 529 pr_info("Argument[%d] name '%s' conflicts with "
392 "another field.\n", i, argv[i]); 530 "another field.\n", i, argv[i]);
393 ret = -EINVAL; 531 ret = -EINVAL;
@@ -395,7 +533,8 @@ static int create_trace_uprobe(int argc, char **argv)
395 } 533 }
396 534
397 /* Parse fetch argument */ 535 /* Parse fetch argument */
398 ret = traceprobe_parse_probe_arg(arg, &tu->size, &tu->args[i], false, false); 536 ret = traceprobe_parse_probe_arg(arg, &tu->tp.size, parg,
537 is_return, false);
399 if (ret) { 538 if (ret) {
400 pr_info("Parse error at argument[%d]. (%d)\n", i, ret); 539 pr_info("Parse error at argument[%d]. (%d)\n", i, ret);
401 goto error; 540 goto error;
@@ -459,11 +598,11 @@ static int probes_seq_show(struct seq_file *m, void *v)
459 char c = is_ret_probe(tu) ? 'r' : 'p'; 598 char c = is_ret_probe(tu) ? 'r' : 'p';
460 int i; 599 int i;
461 600
462 seq_printf(m, "%c:%s/%s", c, tu->call.class->system, tu->call.name); 601 seq_printf(m, "%c:%s/%s", c, tu->tp.call.class->system, tu->tp.call.name);
463 seq_printf(m, " %s:0x%p", tu->filename, (void *)tu->offset); 602 seq_printf(m, " %s:0x%p", tu->filename, (void *)tu->offset);
464 603
465 for (i = 0; i < tu->nr_args; i++) 604 for (i = 0; i < tu->tp.nr_args; i++)
466 seq_printf(m, " %s=%s", tu->args[i].name, tu->args[i].comm); 605 seq_printf(m, " %s=%s", tu->tp.args[i].name, tu->tp.args[i].comm);
467 606
468 seq_printf(m, "\n"); 607 seq_printf(m, "\n");
469 return 0; 608 return 0;
@@ -509,7 +648,7 @@ static int probes_profile_seq_show(struct seq_file *m, void *v)
509{ 648{
510 struct trace_uprobe *tu = v; 649 struct trace_uprobe *tu = v;
511 650
512 seq_printf(m, " %s %-44s %15lu\n", tu->filename, tu->call.name, tu->nhit); 651 seq_printf(m, " %s %-44s %15lu\n", tu->filename, tu->tp.call.name, tu->nhit);
513 return 0; 652 return 0;
514} 653}
515 654
@@ -533,21 +672,117 @@ static const struct file_operations uprobe_profile_ops = {
533 .release = seq_release, 672 .release = seq_release,
534}; 673};
535 674
675struct uprobe_cpu_buffer {
676 struct mutex mutex;
677 void *buf;
678};
679static struct uprobe_cpu_buffer __percpu *uprobe_cpu_buffer;
680static int uprobe_buffer_refcnt;
681
682static int uprobe_buffer_init(void)
683{
684 int cpu, err_cpu;
685
686 uprobe_cpu_buffer = alloc_percpu(struct uprobe_cpu_buffer);
687 if (uprobe_cpu_buffer == NULL)
688 return -ENOMEM;
689
690 for_each_possible_cpu(cpu) {
691 struct page *p = alloc_pages_node(cpu_to_node(cpu),
692 GFP_KERNEL, 0);
693 if (p == NULL) {
694 err_cpu = cpu;
695 goto err;
696 }
697 per_cpu_ptr(uprobe_cpu_buffer, cpu)->buf = page_address(p);
698 mutex_init(&per_cpu_ptr(uprobe_cpu_buffer, cpu)->mutex);
699 }
700
701 return 0;
702
703err:
704 for_each_possible_cpu(cpu) {
705 if (cpu == err_cpu)
706 break;
707 free_page((unsigned long)per_cpu_ptr(uprobe_cpu_buffer, cpu)->buf);
708 }
709
710 free_percpu(uprobe_cpu_buffer);
711 return -ENOMEM;
712}
713
714static int uprobe_buffer_enable(void)
715{
716 int ret = 0;
717
718 BUG_ON(!mutex_is_locked(&event_mutex));
719
720 if (uprobe_buffer_refcnt++ == 0) {
721 ret = uprobe_buffer_init();
722 if (ret < 0)
723 uprobe_buffer_refcnt--;
724 }
725
726 return ret;
727}
728
729static void uprobe_buffer_disable(void)
730{
731 BUG_ON(!mutex_is_locked(&event_mutex));
732
733 if (--uprobe_buffer_refcnt == 0) {
734 free_percpu(uprobe_cpu_buffer);
735 uprobe_cpu_buffer = NULL;
736 }
737}
738
739static struct uprobe_cpu_buffer *uprobe_buffer_get(void)
740{
741 struct uprobe_cpu_buffer *ucb;
742 int cpu;
743
744 cpu = raw_smp_processor_id();
745 ucb = per_cpu_ptr(uprobe_cpu_buffer, cpu);
746
747 /*
748 * Use per-cpu buffers for fastest access, but we might migrate
749 * so the mutex makes sure we have sole access to it.
750 */
751 mutex_lock(&ucb->mutex);
752
753 return ucb;
754}
755
756static void uprobe_buffer_put(struct uprobe_cpu_buffer *ucb)
757{
758 mutex_unlock(&ucb->mutex);
759}
760
536static void uprobe_trace_print(struct trace_uprobe *tu, 761static void uprobe_trace_print(struct trace_uprobe *tu,
537 unsigned long func, struct pt_regs *regs) 762 unsigned long func, struct pt_regs *regs)
538{ 763{
539 struct uprobe_trace_entry_head *entry; 764 struct uprobe_trace_entry_head *entry;
540 struct ring_buffer_event *event; 765 struct ring_buffer_event *event;
541 struct ring_buffer *buffer; 766 struct ring_buffer *buffer;
767 struct uprobe_cpu_buffer *ucb;
542 void *data; 768 void *data;
543 int size, i; 769 int size, dsize, esize;
544 struct ftrace_event_call *call = &tu->call; 770 struct ftrace_event_call *call = &tu->tp.call;
771
772 dsize = __get_data_size(&tu->tp, regs);
773 esize = SIZEOF_TRACE_ENTRY(is_ret_probe(tu));
545 774
546 size = SIZEOF_TRACE_ENTRY(is_ret_probe(tu)); 775 if (WARN_ON_ONCE(!uprobe_cpu_buffer || tu->tp.size + dsize > PAGE_SIZE))
776 return;
777
778 ucb = uprobe_buffer_get();
779 store_trace_args(esize, &tu->tp, regs, ucb->buf, dsize);
780
781 size = esize + tu->tp.size + dsize;
547 event = trace_current_buffer_lock_reserve(&buffer, call->event.type, 782 event = trace_current_buffer_lock_reserve(&buffer, call->event.type,
548 size + tu->size, 0, 0); 783 size, 0, 0);
549 if (!event) 784 if (!event)
550 return; 785 goto out;
551 786
552 entry = ring_buffer_event_data(event); 787 entry = ring_buffer_event_data(event);
553 if (is_ret_probe(tu)) { 788 if (is_ret_probe(tu)) {
@@ -559,11 +794,13 @@ static void uprobe_trace_print(struct trace_uprobe *tu,
559 data = DATAOF_TRACE_ENTRY(entry, false); 794 data = DATAOF_TRACE_ENTRY(entry, false);
560 } 795 }
561 796
562 for (i = 0; i < tu->nr_args; i++) 797 memcpy(data, ucb->buf, tu->tp.size + dsize);
563 call_fetch(&tu->args[i].fetch, regs, data + tu->args[i].offset);
564 798
565 if (!call_filter_check_discard(call, entry, buffer, event)) 799 if (!call_filter_check_discard(call, entry, buffer, event))
566 trace_buffer_unlock_commit(buffer, event, 0, 0); 800 trace_buffer_unlock_commit(buffer, event, 0, 0);
801
802out:
803 uprobe_buffer_put(ucb);
567} 804}
568 805
569/* uprobe handler */ 806/* uprobe handler */
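uprobe_trace_print() above now stages its arguments through uprobe_buffer_get()/uprobe_buffer_put(): one scratch page per CPU, each guarded by its own mutex so a handler that migrates mid-way keeps exclusive use of the buffer it picked. Below is a stand-alone module sketch of that allocation pattern; the struct and symbol names are invented for illustration and are not part of the patch.

/*
 * Hedged sketch of the per-CPU "buffer + mutex" pattern used above.
 * Names (pcpu_scratch, scratch_demo_*) are invented for illustration.
 */
#include <linux/module.h>
#include <linux/percpu.h>
#include <linux/cpumask.h>
#include <linux/mutex.h>
#include <linux/gfp.h>

struct pcpu_scratch {
        struct mutex lock;
        void *buf;
};

static struct pcpu_scratch __percpu *scratch;

static int __init scratch_demo_init(void)
{
        int cpu;

        scratch = alloc_percpu(struct pcpu_scratch);
        if (!scratch)
                return -ENOMEM;

        for_each_possible_cpu(cpu) {
                struct pcpu_scratch *s = per_cpu_ptr(scratch, cpu);

                mutex_init(&s->lock);
                s->buf = (void *)__get_free_page(GFP_KERNEL);
                if (!s->buf)
                        goto err;
        }

        /*
         * A user would mirror uprobe_buffer_get()/put(): pick the buffer of
         * the current CPU, take its mutex, use s->buf, then unlock.
         */
        return 0;

err:
        for_each_possible_cpu(cpu)
                free_page((unsigned long)per_cpu_ptr(scratch, cpu)->buf);
        free_percpu(scratch);
        return -ENOMEM;
}

static void __exit scratch_demo_exit(void)
{
        int cpu;

        for_each_possible_cpu(cpu)
                free_page((unsigned long)per_cpu_ptr(scratch, cpu)->buf);
        free_percpu(scratch);
}

module_init(scratch_demo_init);
module_exit(scratch_demo_exit);
MODULE_LICENSE("GPL");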
@@ -591,23 +828,24 @@ print_uprobe_event(struct trace_iterator *iter, int flags, struct trace_event *e
591 int i; 828 int i;
592 829
593 entry = (struct uprobe_trace_entry_head *)iter->ent; 830 entry = (struct uprobe_trace_entry_head *)iter->ent;
594 tu = container_of(event, struct trace_uprobe, call.event); 831 tu = container_of(event, struct trace_uprobe, tp.call.event);
595 832
596 if (is_ret_probe(tu)) { 833 if (is_ret_probe(tu)) {
597 if (!trace_seq_printf(s, "%s: (0x%lx <- 0x%lx)", tu->call.name, 834 if (!trace_seq_printf(s, "%s: (0x%lx <- 0x%lx)", tu->tp.call.name,
598 entry->vaddr[1], entry->vaddr[0])) 835 entry->vaddr[1], entry->vaddr[0]))
599 goto partial; 836 goto partial;
600 data = DATAOF_TRACE_ENTRY(entry, true); 837 data = DATAOF_TRACE_ENTRY(entry, true);
601 } else { 838 } else {
602 if (!trace_seq_printf(s, "%s: (0x%lx)", tu->call.name, 839 if (!trace_seq_printf(s, "%s: (0x%lx)", tu->tp.call.name,
603 entry->vaddr[0])) 840 entry->vaddr[0]))
604 goto partial; 841 goto partial;
605 data = DATAOF_TRACE_ENTRY(entry, false); 842 data = DATAOF_TRACE_ENTRY(entry, false);
606 } 843 }
607 844
608 for (i = 0; i < tu->nr_args; i++) { 845 for (i = 0; i < tu->tp.nr_args; i++) {
609 if (!tu->args[i].type->print(s, tu->args[i].name, 846 struct probe_arg *parg = &tu->tp.args[i];
610 data + tu->args[i].offset, entry)) 847
848 if (!parg->type->print(s, parg->name, data + parg->offset, entry))
611 goto partial; 849 goto partial;
612 } 850 }
613 851
@@ -618,11 +856,6 @@ partial:
618 return TRACE_TYPE_PARTIAL_LINE; 856 return TRACE_TYPE_PARTIAL_LINE;
619} 857}
620 858
621static inline bool is_trace_uprobe_enabled(struct trace_uprobe *tu)
622{
623 return tu->flags & (TP_FLAG_TRACE | TP_FLAG_PROFILE);
624}
625
626typedef bool (*filter_func_t)(struct uprobe_consumer *self, 859typedef bool (*filter_func_t)(struct uprobe_consumer *self,
627 enum uprobe_filter_ctx ctx, 860 enum uprobe_filter_ctx ctx,
628 struct mm_struct *mm); 861 struct mm_struct *mm);
@@ -632,29 +865,35 @@ probe_event_enable(struct trace_uprobe *tu, int flag, filter_func_t filter)
632{ 865{
633 int ret = 0; 866 int ret = 0;
634 867
635 if (is_trace_uprobe_enabled(tu)) 868 if (trace_probe_is_enabled(&tu->tp))
636 return -EINTR; 869 return -EINTR;
637 870
871 ret = uprobe_buffer_enable();
872 if (ret < 0)
873 return ret;
874
638 WARN_ON(!uprobe_filter_is_empty(&tu->filter)); 875 WARN_ON(!uprobe_filter_is_empty(&tu->filter));
639 876
640 tu->flags |= flag; 877 tu->tp.flags |= flag;
641 tu->consumer.filter = filter; 878 tu->consumer.filter = filter;
642 ret = uprobe_register(tu->inode, tu->offset, &tu->consumer); 879 ret = uprobe_register(tu->inode, tu->offset, &tu->consumer);
643 if (ret) 880 if (ret)
644 tu->flags &= ~flag; 881 tu->tp.flags &= ~flag;
645 882
646 return ret; 883 return ret;
647} 884}
648 885
649static void probe_event_disable(struct trace_uprobe *tu, int flag) 886static void probe_event_disable(struct trace_uprobe *tu, int flag)
650{ 887{
651 if (!is_trace_uprobe_enabled(tu)) 888 if (!trace_probe_is_enabled(&tu->tp))
652 return; 889 return;
653 890
654 WARN_ON(!uprobe_filter_is_empty(&tu->filter)); 891 WARN_ON(!uprobe_filter_is_empty(&tu->filter));
655 892
656 uprobe_unregister(tu->inode, tu->offset, &tu->consumer); 893 uprobe_unregister(tu->inode, tu->offset, &tu->consumer);
657 tu->flags &= ~flag; 894 tu->tp.flags &= ~flag;
895
896 uprobe_buffer_disable();
658} 897}
659 898
660static int uprobe_event_define_fields(struct ftrace_event_call *event_call) 899static int uprobe_event_define_fields(struct ftrace_event_call *event_call)
@@ -672,12 +911,12 @@ static int uprobe_event_define_fields(struct ftrace_event_call *event_call)
672 size = SIZEOF_TRACE_ENTRY(false); 911 size = SIZEOF_TRACE_ENTRY(false);
673 } 912 }
674 /* Set argument names as fields */ 913 /* Set argument names as fields */
675 for (i = 0; i < tu->nr_args; i++) { 914 for (i = 0; i < tu->tp.nr_args; i++) {
676 ret = trace_define_field(event_call, tu->args[i].type->fmttype, 915 struct probe_arg *parg = &tu->tp.args[i];
677 tu->args[i].name, 916
678 size + tu->args[i].offset, 917 ret = trace_define_field(event_call, parg->type->fmttype,
679 tu->args[i].type->size, 918 parg->name, size + parg->offset,
680 tu->args[i].type->is_signed, 919 parg->type->size, parg->type->is_signed,
681 FILTER_OTHER); 920 FILTER_OTHER);
682 921
683 if (ret) 922 if (ret)
@@ -686,59 +925,6 @@ static int uprobe_event_define_fields(struct ftrace_event_call *event_call)
686 return 0; 925 return 0;
687} 926}
688 927
689#define LEN_OR_ZERO (len ? len - pos : 0)
690static int __set_print_fmt(struct trace_uprobe *tu, char *buf, int len)
691{
692 const char *fmt, *arg;
693 int i;
694 int pos = 0;
695
696 if (is_ret_probe(tu)) {
697 fmt = "(%lx <- %lx)";
698 arg = "REC->" FIELD_STRING_FUNC ", REC->" FIELD_STRING_RETIP;
699 } else {
700 fmt = "(%lx)";
701 arg = "REC->" FIELD_STRING_IP;
702 }
703
704 /* When len=0, we just calculate the needed length */
705
706 pos += snprintf(buf + pos, LEN_OR_ZERO, "\"%s", fmt);
707
708 for (i = 0; i < tu->nr_args; i++) {
709 pos += snprintf(buf + pos, LEN_OR_ZERO, " %s=%s",
710 tu->args[i].name, tu->args[i].type->fmt);
711 }
712
713 pos += snprintf(buf + pos, LEN_OR_ZERO, "\", %s", arg);
714
715 for (i = 0; i < tu->nr_args; i++) {
716 pos += snprintf(buf + pos, LEN_OR_ZERO, ", REC->%s",
717 tu->args[i].name);
718 }
719
720 return pos; /* return the length of print_fmt */
721}
722#undef LEN_OR_ZERO
723
724static int set_print_fmt(struct trace_uprobe *tu)
725{
726 char *print_fmt;
727 int len;
728
729 /* First: called with 0 length to calculate the needed length */
730 len = __set_print_fmt(tu, NULL, 0);
731 print_fmt = kmalloc(len + 1, GFP_KERNEL);
732 if (!print_fmt)
733 return -ENOMEM;
734
735 /* Second: actually write the @print_fmt */
736 __set_print_fmt(tu, print_fmt, len + 1);
737 tu->call.print_fmt = print_fmt;
738
739 return 0;
740}
741
742#ifdef CONFIG_PERF_EVENTS 928#ifdef CONFIG_PERF_EVENTS
743static bool 929static bool
744__uprobe_perf_filter(struct trace_uprobe_filter *filter, struct mm_struct *mm) 930__uprobe_perf_filter(struct trace_uprobe_filter *filter, struct mm_struct *mm)
@@ -831,14 +1017,27 @@ static bool uprobe_perf_filter(struct uprobe_consumer *uc,
831static void uprobe_perf_print(struct trace_uprobe *tu, 1017static void uprobe_perf_print(struct trace_uprobe *tu,
832 unsigned long func, struct pt_regs *regs) 1018 unsigned long func, struct pt_regs *regs)
833{ 1019{
834 struct ftrace_event_call *call = &tu->call; 1020 struct ftrace_event_call *call = &tu->tp.call;
835 struct uprobe_trace_entry_head *entry; 1021 struct uprobe_trace_entry_head *entry;
836 struct hlist_head *head; 1022 struct hlist_head *head;
1023 struct uprobe_cpu_buffer *ucb;
837 void *data; 1024 void *data;
838 int size, rctx, i; 1025 int size, dsize, esize;
1026 int rctx;
839 1027
840 size = SIZEOF_TRACE_ENTRY(is_ret_probe(tu)); 1028 dsize = __get_data_size(&tu->tp, regs);
841 size = ALIGN(size + tu->size + sizeof(u32), sizeof(u64)) - sizeof(u32); 1029 esize = SIZEOF_TRACE_ENTRY(is_ret_probe(tu));
1030
1031 if (WARN_ON_ONCE(!uprobe_cpu_buffer))
1032 return;
1033
1034 size = esize + tu->tp.size + dsize;
1035 size = ALIGN(size + sizeof(u32), sizeof(u64)) - sizeof(u32);
1036 if (WARN_ONCE(size > PERF_MAX_TRACE_SIZE, "profile buffer not large enough"))
1037 return;
1038
1039 ucb = uprobe_buffer_get();
1040 store_trace_args(esize, &tu->tp, regs, ucb->buf, dsize);
842 1041
843 preempt_disable(); 1042 preempt_disable();
844 head = this_cpu_ptr(call->perf_events); 1043 head = this_cpu_ptr(call->perf_events);
@@ -858,12 +1057,18 @@ static void uprobe_perf_print(struct trace_uprobe *tu,
858 data = DATAOF_TRACE_ENTRY(entry, false); 1057 data = DATAOF_TRACE_ENTRY(entry, false);
859 } 1058 }
860 1059
861 for (i = 0; i < tu->nr_args; i++) 1060 memcpy(data, ucb->buf, tu->tp.size + dsize);
862 call_fetch(&tu->args[i].fetch, regs, data + tu->args[i].offset); 1061
1062 if (size - esize > tu->tp.size + dsize) {
1063 int len = tu->tp.size + dsize;
1064
1065 memset(data + len, 0, size - esize - len);
1066 }
863 1067
864 perf_trace_buf_submit(entry, size, rctx, 0, 1, regs, head, NULL); 1068 perf_trace_buf_submit(entry, size, rctx, 0, 1, regs, head, NULL);
865 out: 1069 out:
866 preempt_enable(); 1070 preempt_enable();
1071 uprobe_buffer_put(ucb);
867} 1072}
868 1073
869/* uprobe profile handler */ 1074/* uprobe profile handler */
@@ -921,16 +1126,22 @@ int trace_uprobe_register(struct ftrace_event_call *event, enum trace_reg type,
921static int uprobe_dispatcher(struct uprobe_consumer *con, struct pt_regs *regs) 1126static int uprobe_dispatcher(struct uprobe_consumer *con, struct pt_regs *regs)
922{ 1127{
923 struct trace_uprobe *tu; 1128 struct trace_uprobe *tu;
1129 struct uprobe_dispatch_data udd;
924 int ret = 0; 1130 int ret = 0;
925 1131
926 tu = container_of(con, struct trace_uprobe, consumer); 1132 tu = container_of(con, struct trace_uprobe, consumer);
927 tu->nhit++; 1133 tu->nhit++;
928 1134
929 if (tu->flags & TP_FLAG_TRACE) 1135 udd.tu = tu;
1136 udd.bp_addr = instruction_pointer(regs);
1137
1138 current->utask->vaddr = (unsigned long) &udd;
1139
1140 if (tu->tp.flags & TP_FLAG_TRACE)
930 ret |= uprobe_trace_func(tu, regs); 1141 ret |= uprobe_trace_func(tu, regs);
931 1142
932#ifdef CONFIG_PERF_EVENTS 1143#ifdef CONFIG_PERF_EVENTS
933 if (tu->flags & TP_FLAG_PROFILE) 1144 if (tu->tp.flags & TP_FLAG_PROFILE)
934 ret |= uprobe_perf_func(tu, regs); 1145 ret |= uprobe_perf_func(tu, regs);
935#endif 1146#endif
936 return ret; 1147 return ret;
@@ -940,14 +1151,20 @@ static int uretprobe_dispatcher(struct uprobe_consumer *con,
940 unsigned long func, struct pt_regs *regs) 1151 unsigned long func, struct pt_regs *regs)
941{ 1152{
942 struct trace_uprobe *tu; 1153 struct trace_uprobe *tu;
1154 struct uprobe_dispatch_data udd;
943 1155
944 tu = container_of(con, struct trace_uprobe, consumer); 1156 tu = container_of(con, struct trace_uprobe, consumer);
945 1157
946 if (tu->flags & TP_FLAG_TRACE) 1158 udd.tu = tu;
1159 udd.bp_addr = func;
1160
1161 current->utask->vaddr = (unsigned long) &udd;
1162
1163 if (tu->tp.flags & TP_FLAG_TRACE)
947 uretprobe_trace_func(tu, func, regs); 1164 uretprobe_trace_func(tu, func, regs);
948 1165
949#ifdef CONFIG_PERF_EVENTS 1166#ifdef CONFIG_PERF_EVENTS
950 if (tu->flags & TP_FLAG_PROFILE) 1167 if (tu->tp.flags & TP_FLAG_PROFILE)
951 uretprobe_perf_func(tu, func, regs); 1168 uretprobe_perf_func(tu, func, regs);
952#endif 1169#endif
953 return 0; 1170 return 0;
@@ -959,7 +1176,7 @@ static struct trace_event_functions uprobe_funcs = {
959 1176
960static int register_uprobe_event(struct trace_uprobe *tu) 1177static int register_uprobe_event(struct trace_uprobe *tu)
961{ 1178{
962 struct ftrace_event_call *call = &tu->call; 1179 struct ftrace_event_call *call = &tu->tp.call;
963 int ret; 1180 int ret;
964 1181
965 /* Initialize ftrace_event_call */ 1182 /* Initialize ftrace_event_call */
@@ -967,7 +1184,7 @@ static int register_uprobe_event(struct trace_uprobe *tu)
967 call->event.funcs = &uprobe_funcs; 1184 call->event.funcs = &uprobe_funcs;
968 call->class->define_fields = uprobe_event_define_fields; 1185 call->class->define_fields = uprobe_event_define_fields;
969 1186
970 if (set_print_fmt(tu) < 0) 1187 if (set_print_fmt(&tu->tp, is_ret_probe(tu)) < 0)
971 return -ENOMEM; 1188 return -ENOMEM;
972 1189
973 ret = register_ftrace_event(&call->event); 1190 ret = register_ftrace_event(&call->event);
@@ -994,11 +1211,11 @@ static int unregister_uprobe_event(struct trace_uprobe *tu)
994 int ret; 1211 int ret;
995 1212
996 /* tu->event is unregistered in trace_remove_event_call() */ 1213 /* tu->event is unregistered in trace_remove_event_call() */
997 ret = trace_remove_event_call(&tu->call); 1214 ret = trace_remove_event_call(&tu->tp.call);
998 if (ret) 1215 if (ret)
999 return ret; 1216 return ret;
1000 kfree(tu->call.print_fmt); 1217 kfree(tu->tp.call.print_fmt);
1001 tu->call.print_fmt = NULL; 1218 tu->tp.call.print_fmt = NULL;
1002 return 0; 1219 return 0;
1003} 1220}
1004 1221
diff --git a/kernel/user.c b/kernel/user.c
index a3a0dbfda329..c006131beb77 100644
--- a/kernel/user.c
+++ b/kernel/user.c
@@ -51,9 +51,9 @@ struct user_namespace init_user_ns = {
51 .owner = GLOBAL_ROOT_UID, 51 .owner = GLOBAL_ROOT_UID,
52 .group = GLOBAL_ROOT_GID, 52 .group = GLOBAL_ROOT_GID,
53 .proc_inum = PROC_USER_INIT_INO, 53 .proc_inum = PROC_USER_INIT_INO,
54#ifdef CONFIG_KEYS_KERBEROS_CACHE 54#ifdef CONFIG_PERSISTENT_KEYRINGS
55 .krb_cache_register_sem = 55 .persistent_keyring_register_sem =
56 __RWSEM_INITIALIZER(init_user_ns.krb_cache_register_sem), 56 __RWSEM_INITIALIZER(init_user_ns.persistent_keyring_register_sem),
57#endif 57#endif
58}; 58};
59EXPORT_SYMBOL_GPL(init_user_ns); 59EXPORT_SYMBOL_GPL(init_user_ns);
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 987293d03ebc..82ef9f3b7473 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -305,6 +305,9 @@ static DEFINE_HASHTABLE(unbound_pool_hash, UNBOUND_POOL_HASH_ORDER);
305/* I: attributes used when instantiating standard unbound pools on demand */ 305/* I: attributes used when instantiating standard unbound pools on demand */
306static struct workqueue_attrs *unbound_std_wq_attrs[NR_STD_WORKER_POOLS]; 306static struct workqueue_attrs *unbound_std_wq_attrs[NR_STD_WORKER_POOLS];
307 307
308/* I: attributes used when instantiating ordered pools on demand */
309static struct workqueue_attrs *ordered_wq_attrs[NR_STD_WORKER_POOLS];
310
308struct workqueue_struct *system_wq __read_mostly; 311struct workqueue_struct *system_wq __read_mostly;
309EXPORT_SYMBOL(system_wq); 312EXPORT_SYMBOL(system_wq);
310struct workqueue_struct *system_highpri_wq __read_mostly; 313struct workqueue_struct *system_highpri_wq __read_mostly;
@@ -518,14 +521,21 @@ static inline void debug_work_activate(struct work_struct *work) { }
518static inline void debug_work_deactivate(struct work_struct *work) { } 521static inline void debug_work_deactivate(struct work_struct *work) { }
519#endif 522#endif
520 523
521/* allocate ID and assign it to @pool */ 524/**
525 * worker_pool_assign_id - allocate ID and assign it to @pool
526 * @pool: the pool pointer of interest
527 *
528 * Returns 0 if ID in [0, WORK_OFFQ_POOL_NONE) is allocated and assigned
529 * successfully, -errno on failure.
530 */
522static int worker_pool_assign_id(struct worker_pool *pool) 531static int worker_pool_assign_id(struct worker_pool *pool)
523{ 532{
524 int ret; 533 int ret;
525 534
526 lockdep_assert_held(&wq_pool_mutex); 535 lockdep_assert_held(&wq_pool_mutex);
527 536
528 ret = idr_alloc(&worker_pool_idr, pool, 0, 0, GFP_KERNEL); 537 ret = idr_alloc(&worker_pool_idr, pool, 0, WORK_OFFQ_POOL_NONE,
538 GFP_KERNEL);
529 if (ret >= 0) { 539 if (ret >= 0) {
530 pool->id = ret; 540 pool->id = ret;
531 return 0; 541 return 0;
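worker_pool_assign_id() above now passes WORK_OFFQ_POOL_NONE as the exclusive upper bound to idr_alloc(), so pool IDs can never exceed what the off-queue encoding can store (which is why the BUILD_BUG_ON is dropped later in this patch). A minimal module sketch of bounded idr_alloc() follows; the idr name and dummy object are invented for illustration.

/*
 * Hedged sketch of bounded ID allocation with idr_alloc(), mirroring the
 * [0, WORK_OFFQ_POOL_NONE) range used above.
 */
#include <linux/module.h>
#include <linux/idr.h>

static DEFINE_IDR(demo_idr);
static int demo_id = -1;

static int __init idr_demo_init(void)
{
        static int dummy;       /* any non-NULL pointer will do */
        int id;

        /* IDs are handed out from 0 up to, but not including, 256 */
        id = idr_alloc(&demo_idr, &dummy, 0, 256, GFP_KERNEL);
        if (id < 0)
                return id;

        demo_id = id;
        pr_info("idr_demo: got id %d\n", id);
        return 0;
}

static void __exit idr_demo_exit(void)
{
        if (demo_id >= 0)
                idr_remove(&demo_idr, demo_id);
        idr_destroy(&demo_idr);
}

module_init(idr_demo_init);
module_exit(idr_demo_exit);
MODULE_LICENSE("GPL");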
@@ -1320,7 +1330,7 @@ static void __queue_work(int cpu, struct workqueue_struct *wq,
1320 1330
1321 debug_work_activate(work); 1331 debug_work_activate(work);
1322 1332
1323 /* if dying, only works from the same workqueue are allowed */ 1333 /* if draining, only works from the same workqueue are allowed */
1324 if (unlikely(wq->flags & __WQ_DRAINING) && 1334 if (unlikely(wq->flags & __WQ_DRAINING) &&
1325 WARN_ON_ONCE(!is_chained_work(wq))) 1335 WARN_ON_ONCE(!is_chained_work(wq)))
1326 return; 1336 return;
@@ -1736,16 +1746,17 @@ static struct worker *create_worker(struct worker_pool *pool)
1736 if (IS_ERR(worker->task)) 1746 if (IS_ERR(worker->task))
1737 goto fail; 1747 goto fail;
1738 1748
1749 set_user_nice(worker->task, pool->attrs->nice);
1750
1751 /* prevent userland from meddling with cpumask of workqueue workers */
1752 worker->task->flags |= PF_NO_SETAFFINITY;
1753
1739 /* 1754 /*
1740 * set_cpus_allowed_ptr() will fail if the cpumask doesn't have any 1755 * set_cpus_allowed_ptr() will fail if the cpumask doesn't have any
1741 * online CPUs. It'll be re-applied when any of the CPUs come up. 1756 * online CPUs. It'll be re-applied when any of the CPUs come up.
1742 */ 1757 */
1743 set_user_nice(worker->task, pool->attrs->nice);
1744 set_cpus_allowed_ptr(worker->task, pool->attrs->cpumask); 1758 set_cpus_allowed_ptr(worker->task, pool->attrs->cpumask);
1745 1759
1746 /* prevent userland from meddling with cpumask of workqueue workers */
1747 worker->task->flags |= PF_NO_SETAFFINITY;
1748
1749 /* 1760 /*
1750 * The caller is responsible for ensuring %POOL_DISASSOCIATED 1761 * The caller is responsible for ensuring %POOL_DISASSOCIATED
1751 * remains stable across this function. See the comments above the 1762 * remains stable across this function. See the comments above the
@@ -2840,19 +2851,6 @@ already_gone:
2840 return false; 2851 return false;
2841} 2852}
2842 2853
2843static bool __flush_work(struct work_struct *work)
2844{
2845 struct wq_barrier barr;
2846
2847 if (start_flush_work(work, &barr)) {
2848 wait_for_completion(&barr.done);
2849 destroy_work_on_stack(&barr.work);
2850 return true;
2851 } else {
2852 return false;
2853 }
2854}
2855
2856/** 2854/**
2857 * flush_work - wait for a work to finish executing the last queueing instance 2855 * flush_work - wait for a work to finish executing the last queueing instance
2858 * @work: the work to flush 2856 * @work: the work to flush
@@ -2866,10 +2864,18 @@ static bool __flush_work(struct work_struct *work)
2866 */ 2864 */
2867bool flush_work(struct work_struct *work) 2865bool flush_work(struct work_struct *work)
2868{ 2866{
2867 struct wq_barrier barr;
2868
2869 lock_map_acquire(&work->lockdep_map); 2869 lock_map_acquire(&work->lockdep_map);
2870 lock_map_release(&work->lockdep_map); 2870 lock_map_release(&work->lockdep_map);
2871 2871
2872 return __flush_work(work); 2872 if (start_flush_work(work, &barr)) {
2873 wait_for_completion(&barr.done);
2874 destroy_work_on_stack(&barr.work);
2875 return true;
2876 } else {
2877 return false;
2878 }
2873} 2879}
2874EXPORT_SYMBOL_GPL(flush_work); 2880EXPORT_SYMBOL_GPL(flush_work);
2875 2881
@@ -4106,7 +4112,7 @@ out_unlock:
4106static int alloc_and_link_pwqs(struct workqueue_struct *wq) 4112static int alloc_and_link_pwqs(struct workqueue_struct *wq)
4107{ 4113{
4108 bool highpri = wq->flags & WQ_HIGHPRI; 4114 bool highpri = wq->flags & WQ_HIGHPRI;
4109 int cpu; 4115 int cpu, ret;
4110 4116
4111 if (!(wq->flags & WQ_UNBOUND)) { 4117 if (!(wq->flags & WQ_UNBOUND)) {
4112 wq->cpu_pwqs = alloc_percpu(struct pool_workqueue); 4118 wq->cpu_pwqs = alloc_percpu(struct pool_workqueue);
@@ -4126,6 +4132,13 @@ static int alloc_and_link_pwqs(struct workqueue_struct *wq)
4126 mutex_unlock(&wq->mutex); 4132 mutex_unlock(&wq->mutex);
4127 } 4133 }
4128 return 0; 4134 return 0;
4135 } else if (wq->flags & __WQ_ORDERED) {
4136 ret = apply_workqueue_attrs(wq, ordered_wq_attrs[highpri]);
4137 /* there should only be single pwq for ordering guarantee */
4138 WARN(!ret && (wq->pwqs.next != &wq->dfl_pwq->pwqs_node ||
4139 wq->pwqs.prev != &wq->dfl_pwq->pwqs_node),
4140 "ordering guarantee broken for workqueue %s\n", wq->name);
4141 return ret;
4129 } else { 4142 } else {
4130 return apply_workqueue_attrs(wq, unbound_std_wq_attrs[highpri]); 4143 return apply_workqueue_attrs(wq, unbound_std_wq_attrs[highpri]);
4131 } 4144 }
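The new __WQ_ORDERED branch above applies ordered_wq_attrs so an ordered workqueue ends up with exactly one pool_workqueue, which is what makes its works execute strictly one at a time and in queueing order. A module sketch that consumes that guarantee through alloc_ordered_workqueue(); the workqueue name and work functions are invented for illustration.

/*
 * Hedged sketch: an ordered workqueue executes items one at a time and in
 * queueing order, the guarantee the branch above preserves.
 */
#include <linux/module.h>
#include <linux/workqueue.h>

static struct workqueue_struct *demo_ordered_wq;

static void demo_first(struct work_struct *work)
{
        pr_info("ordered-demo: first\n");
}

static void demo_second(struct work_struct *work)
{
        pr_info("ordered-demo: second (always after first)\n");
}

static DECLARE_WORK(first_work, demo_first);
static DECLARE_WORK(second_work, demo_second);

static int __init ordered_demo_init(void)
{
        demo_ordered_wq = alloc_ordered_workqueue("ordered-demo", 0);
        if (!demo_ordered_wq)
                return -ENOMEM;

        /* max_active == 1 plus the single pwq keep these strictly serialized */
        queue_work(demo_ordered_wq, &first_work);
        queue_work(demo_ordered_wq, &second_work);
        return 0;
}

static void __exit ordered_demo_exit(void)
{
        flush_workqueue(demo_ordered_wq);
        destroy_workqueue(demo_ordered_wq);
}

module_init(ordered_demo_init);
module_exit(ordered_demo_exit);
MODULE_LICENSE("GPL");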
@@ -4776,6 +4789,7 @@ static int workqueue_cpu_down_callback(struct notifier_block *nfb,
4776 4789
4777 /* wait for per-cpu unbinding to finish */ 4790 /* wait for per-cpu unbinding to finish */
4778 flush_work(&unbind_work); 4791 flush_work(&unbind_work);
4792 destroy_work_on_stack(&unbind_work);
4779 break; 4793 break;
4780 } 4794 }
4781 return NOTIFY_OK; 4795 return NOTIFY_OK;
@@ -4814,14 +4828,8 @@ long work_on_cpu(int cpu, long (*fn)(void *), void *arg)
4814 4828
4815 INIT_WORK_ONSTACK(&wfc.work, work_for_cpu_fn); 4829 INIT_WORK_ONSTACK(&wfc.work, work_for_cpu_fn);
4816 schedule_work_on(cpu, &wfc.work); 4830 schedule_work_on(cpu, &wfc.work);
4817 4831 flush_work(&wfc.work);
4818 /* 4832 destroy_work_on_stack(&wfc.work);
4819 * The work item is on-stack and can't lead to deadlock through
4820 * flushing. Use __flush_work() to avoid spurious lockdep warnings
4821 * when work_on_cpu()s are nested.
4822 */
4823 __flush_work(&wfc.work);
4824
4825 return wfc.ret; 4833 return wfc.ret;
4826} 4834}
4827EXPORT_SYMBOL_GPL(work_on_cpu); 4835EXPORT_SYMBOL_GPL(work_on_cpu);
@@ -5009,10 +5017,6 @@ static int __init init_workqueues(void)
5009 int std_nice[NR_STD_WORKER_POOLS] = { 0, HIGHPRI_NICE_LEVEL }; 5017 int std_nice[NR_STD_WORKER_POOLS] = { 0, HIGHPRI_NICE_LEVEL };
5010 int i, cpu; 5018 int i, cpu;
5011 5019
5012 /* make sure we have enough bits for OFFQ pool ID */
5013 BUILD_BUG_ON((1LU << (BITS_PER_LONG - WORK_OFFQ_POOL_SHIFT)) <
5014 WORK_CPU_END * NR_STD_WORKER_POOLS);
5015
5016 WARN_ON(__alignof__(struct pool_workqueue) < __alignof__(long long)); 5020 WARN_ON(__alignof__(struct pool_workqueue) < __alignof__(long long));
5017 5021
5018 pwq_cache = KMEM_CACHE(pool_workqueue, SLAB_PANIC); 5022 pwq_cache = KMEM_CACHE(pool_workqueue, SLAB_PANIC);
@@ -5051,13 +5055,23 @@ static int __init init_workqueues(void)
5051 } 5055 }
5052 } 5056 }
5053 5057
5054 /* create default unbound wq attrs */ 5058 /* create default unbound and ordered wq attrs */
5055 for (i = 0; i < NR_STD_WORKER_POOLS; i++) { 5059 for (i = 0; i < NR_STD_WORKER_POOLS; i++) {
5056 struct workqueue_attrs *attrs; 5060 struct workqueue_attrs *attrs;
5057 5061
5058 BUG_ON(!(attrs = alloc_workqueue_attrs(GFP_KERNEL))); 5062 BUG_ON(!(attrs = alloc_workqueue_attrs(GFP_KERNEL)));
5059 attrs->nice = std_nice[i]; 5063 attrs->nice = std_nice[i];
5060 unbound_std_wq_attrs[i] = attrs; 5064 unbound_std_wq_attrs[i] = attrs;
5065
5066 /*
5067 * An ordered wq should have only one pwq as ordering is
5068 * guaranteed by max_active which is enforced by pwqs.
5069 * Turn off NUMA so that dfl_pwq is used for all nodes.
5070 */
5071 BUG_ON(!(attrs = alloc_workqueue_attrs(GFP_KERNEL)));
5072 attrs->nice = std_nice[i];
5073 attrs->no_numa = true;
5074 ordered_wq_attrs[i] = attrs;
5061 } 5075 }
5062 5076
5063 system_wq = alloc_workqueue("events", 0, 0); 5077 system_wq = alloc_workqueue("events", 0, 0);