about | summary | refs | log | tree | commit | diff | stats
path: root/kernel
diff options
context:
space:
mode:
author: Ingo Molnar <mingo@kernel.org> 2014-05-07 07:15:46 -0400
committer: Ingo Molnar <mingo@kernel.org> 2014-05-07 07:15:46 -0400
commit: 2fe5de9ce7d57498abc14b375cad2fcf8c3ee6cc (patch)
tree: 9478e8cf470c1d5bdb2d89b57a7e35919ab95e72 /kernel
parent: 08f8aeb55d7727d644dbbbbfb798fe937d47751d (diff)
parent: 2b4cfe64dee0d84506b951d81bf55d9891744d25 (diff)
Merge branch 'sched/urgent' into sched/core, to avoid conflicts
Signed-off-by: Ingo Molnar <mingo@kernel.org>
Diffstat (limited to 'kernel')
-rw-r--r-- kernel/audit.c 27
-rw-r--r-- kernel/audit.h 6
-rw-r--r-- kernel/auditfilter.c 33
-rw-r--r-- kernel/auditsc.c 133
-rw-r--r-- kernel/capability.c 29
-rw-r--r-- kernel/cgroup.c 3725
-rw-r--r-- kernel/cgroup_freezer.c 40
-rw-r--r-- kernel/cpu.c 38
-rw-r--r-- kernel/cpuset.c 264
-rw-r--r-- kernel/debug/debug_core.c 14
-rw-r--r-- kernel/events/core.c 25
-rw-r--r-- kernel/events/uprobes.c 9
-rw-r--r-- kernel/exit.c 112
-rw-r--r-- kernel/fork.c 39
-rw-r--r-- kernel/futex.c 37
-rw-r--r-- kernel/groups.c 14
-rw-r--r-- kernel/hung_task.c 3
-rw-r--r-- kernel/kallsyms.c 11
-rw-r--r-- kernel/kexec.c 9
-rw-r--r-- kernel/ksysfs.c 5
-rw-r--r-- kernel/kthread.c 4
-rw-r--r-- kernel/locking/Makefile 3
-rw-r--r-- kernel/locking/mutex-debug.c 19
-rw-r--r-- kernel/module.c 12
-rw-r--r-- kernel/panic.c 15
-rw-r--r-- kernel/pid_namespace.c 4
-rw-r--r-- kernel/power/power.h 3
-rw-r--r-- kernel/power/snapshot.c 3
-rw-r--r-- kernel/power/suspend.c 5
-rw-r--r-- kernel/power/swap.c 2
-rw-r--r-- kernel/printk/printk.c 15
-rw-r--r-- kernel/profile.c 22
-rw-r--r-- kernel/relay.c 6
-rw-r--r-- kernel/res_counter.c 23
-rw-r--r-- kernel/resource.c 2
-rw-r--r-- kernel/sched/clock.c 3
-rw-r--r-- kernel/sched/core.c 74
-rw-r--r-- kernel/sched/cpuacct.c 6
-rw-r--r-- kernel/sched/cpudeadline.c 4
-rw-r--r-- kernel/sched/cpupri.c 3
-rw-r--r-- kernel/sched/cputime.c 32
-rw-r--r-- kernel/sched/deadline.c 5
-rw-r--r-- kernel/sched/debug.c 3
-rw-r--r-- kernel/sched/fair.c 16
-rw-r--r-- kernel/sched/idle.c 150
-rw-r--r-- kernel/sched/stats.c 2
-rw-r--r-- kernel/seccomp.c 126
-rw-r--r-- kernel/signal.c 6
-rw-r--r-- kernel/sys.c 15
-rw-r--r-- kernel/sys_ni.c 2
-rw-r--r-- kernel/sysctl.c 10
-rw-r--r-- kernel/time/tick-common.c 2
-rw-r--r-- kernel/time/tick-sched.c 5
-rw-r--r-- kernel/time/timekeeping.c 5
-rw-r--r-- kernel/trace/Kconfig 1
-rw-r--r-- kernel/trace/blktrace.c 3
-rw-r--r-- kernel/trace/ftrace.c 162
-rw-r--r-- kernel/trace/ring_buffer.c 19
-rw-r--r-- kernel/trace/trace.c 197
-rw-r--r-- kernel/trace/trace.h 41
-rw-r--r-- kernel/trace/trace_events.c 85
-rw-r--r-- kernel/trace/trace_events_trigger.c 2
-rw-r--r-- kernel/trace/trace_export.c 6
-rw-r--r-- kernel/trace/trace_functions.c 147
-rw-r--r-- kernel/trace/trace_functions_graph.c 3
-rw-r--r-- kernel/trace/trace_irqsoff.c 10
-rw-r--r-- kernel/trace/trace_kprobe.c 38
-rw-r--r-- kernel/trace/trace_nop.c 5
-rw-r--r-- kernel/trace/trace_output.c 33
-rw-r--r-- kernel/trace/trace_probe.h 17
-rw-r--r-- kernel/trace/trace_sched_wakeup.c 10
-rw-r--r-- kernel/trace/trace_stack.c 3
-rw-r--r-- kernel/trace/trace_uprobe.c 217
-rw-r--r-- kernel/tracepoint.c 686
-rw-r--r-- kernel/user.c 3
-rw-r--r-- kernel/user_namespace.c 13
-rw-r--r-- kernel/watchdog.c 22
77 files changed, 3318 insertions, 3585 deletions
diff --git a/kernel/audit.c b/kernel/audit.c
index 95a20f3f52f1..7c2893602d06 100644
--- a/kernel/audit.c
+++ b/kernel/audit.c
@@ -182,7 +182,7 @@ struct audit_buffer {
182 182
183struct audit_reply { 183struct audit_reply {
184 __u32 portid; 184 __u32 portid;
185 struct net *net; 185 struct net *net;
186 struct sk_buff *skb; 186 struct sk_buff *skb;
187}; 187};
188 188
@@ -396,7 +396,7 @@ static void audit_printk_skb(struct sk_buff *skb)
396 if (printk_ratelimit()) 396 if (printk_ratelimit())
397 pr_notice("type=%d %s\n", nlh->nlmsg_type, data); 397 pr_notice("type=%d %s\n", nlh->nlmsg_type, data);
398 else 398 else
399 audit_log_lost("printk limit exceeded\n"); 399 audit_log_lost("printk limit exceeded");
400 } 400 }
401 401
402 audit_hold_skb(skb); 402 audit_hold_skb(skb);
@@ -412,7 +412,7 @@ static void kauditd_send_skb(struct sk_buff *skb)
412 BUG_ON(err != -ECONNREFUSED); /* Shouldn't happen */ 412 BUG_ON(err != -ECONNREFUSED); /* Shouldn't happen */
413 if (audit_pid) { 413 if (audit_pid) {
414 pr_err("*NO* daemon at audit_pid=%d\n", audit_pid); 414 pr_err("*NO* daemon at audit_pid=%d\n", audit_pid);
415 audit_log_lost("auditd disappeared\n"); 415 audit_log_lost("auditd disappeared");
416 audit_pid = 0; 416 audit_pid = 0;
417 audit_sock = NULL; 417 audit_sock = NULL;
418 } 418 }
@@ -607,7 +607,7 @@ static int audit_netlink_ok(struct sk_buff *skb, u16 msg_type)
607{ 607{
608 int err = 0; 608 int err = 0;
609 609
610 /* Only support the initial namespaces for now. */ 610 /* Only support initial user namespace for now. */
611 /* 611 /*
612 * We return ECONNREFUSED because it tricks userspace into thinking 612 * We return ECONNREFUSED because it tricks userspace into thinking
613 * that audit was not configured into the kernel. Lots of users 613 * that audit was not configured into the kernel. Lots of users
@@ -618,8 +618,7 @@ static int audit_netlink_ok(struct sk_buff *skb, u16 msg_type)
618 * userspace will reject all logins. This should be removed when we 618 * userspace will reject all logins. This should be removed when we
619 * support non init namespaces!! 619 * support non init namespaces!!
620 */ 620 */
621 if ((current_user_ns() != &init_user_ns) || 621 if (current_user_ns() != &init_user_ns)
622 (task_active_pid_ns(current) != &init_pid_ns))
623 return -ECONNREFUSED; 622 return -ECONNREFUSED;
624 623
625 switch (msg_type) { 624 switch (msg_type) {
@@ -639,6 +638,11 @@ static int audit_netlink_ok(struct sk_buff *skb, u16 msg_type)
639 case AUDIT_TTY_SET: 638 case AUDIT_TTY_SET:
640 case AUDIT_TRIM: 639 case AUDIT_TRIM:
641 case AUDIT_MAKE_EQUIV: 640 case AUDIT_MAKE_EQUIV:
641 /* Only support auditd and auditctl in initial pid namespace
642 * for now. */
643 if ((task_active_pid_ns(current) != &init_pid_ns))
644 return -EPERM;
645
642 if (!capable(CAP_AUDIT_CONTROL)) 646 if (!capable(CAP_AUDIT_CONTROL))
643 err = -EPERM; 647 err = -EPERM;
644 break; 648 break;
@@ -659,6 +663,7 @@ static int audit_log_common_recv_msg(struct audit_buffer **ab, u16 msg_type)
659{ 663{
660 int rc = 0; 664 int rc = 0;
661 uid_t uid = from_kuid(&init_user_ns, current_uid()); 665 uid_t uid = from_kuid(&init_user_ns, current_uid());
666 pid_t pid = task_tgid_nr(current);
662 667
663 if (!audit_enabled && msg_type != AUDIT_USER_AVC) { 668 if (!audit_enabled && msg_type != AUDIT_USER_AVC) {
664 *ab = NULL; 669 *ab = NULL;
@@ -668,7 +673,7 @@ static int audit_log_common_recv_msg(struct audit_buffer **ab, u16 msg_type)
668 *ab = audit_log_start(NULL, GFP_KERNEL, msg_type); 673 *ab = audit_log_start(NULL, GFP_KERNEL, msg_type);
669 if (unlikely(!*ab)) 674 if (unlikely(!*ab))
670 return rc; 675 return rc;
671 audit_log_format(*ab, "pid=%d uid=%u", task_tgid_vnr(current), uid); 676 audit_log_format(*ab, "pid=%d uid=%u", pid, uid);
672 audit_log_session_info(*ab); 677 audit_log_session_info(*ab);
673 audit_log_task_context(*ab); 678 audit_log_task_context(*ab);
674 679
@@ -1097,7 +1102,7 @@ static void __net_exit audit_net_exit(struct net *net)
1097 audit_sock = NULL; 1102 audit_sock = NULL;
1098 } 1103 }
1099 1104
1100 rcu_assign_pointer(aunet->nlsk, NULL); 1105 RCU_INIT_POINTER(aunet->nlsk, NULL);
1101 synchronize_net(); 1106 synchronize_net();
1102 netlink_kernel_release(sock); 1107 netlink_kernel_release(sock);
1103} 1108}
@@ -1829,11 +1834,11 @@ void audit_log_task_info(struct audit_buffer *ab, struct task_struct *tsk)
1829 spin_unlock_irq(&tsk->sighand->siglock); 1834 spin_unlock_irq(&tsk->sighand->siglock);
1830 1835
1831 audit_log_format(ab, 1836 audit_log_format(ab,
1832 " ppid=%ld pid=%d auid=%u uid=%u gid=%u" 1837 " ppid=%d pid=%d auid=%u uid=%u gid=%u"
1833 " euid=%u suid=%u fsuid=%u" 1838 " euid=%u suid=%u fsuid=%u"
1834 " egid=%u sgid=%u fsgid=%u tty=%s ses=%u", 1839 " egid=%u sgid=%u fsgid=%u tty=%s ses=%u",
1835 sys_getppid(), 1840 task_ppid_nr(tsk),
1836 tsk->pid, 1841 task_pid_nr(tsk),
1837 from_kuid(&init_user_ns, audit_get_loginuid(tsk)), 1842 from_kuid(&init_user_ns, audit_get_loginuid(tsk)),
1838 from_kuid(&init_user_ns, cred->uid), 1843 from_kuid(&init_user_ns, cred->uid),
1839 from_kgid(&init_user_ns, cred->gid), 1844 from_kgid(&init_user_ns, cred->gid),
diff --git a/kernel/audit.h b/kernel/audit.h
index 8df132214606..7bb65730c890 100644
--- a/kernel/audit.h
+++ b/kernel/audit.h
@@ -106,6 +106,11 @@ struct audit_names {
106 bool should_free; 106 bool should_free;
107}; 107};
108 108
109struct audit_proctitle {
110 int len; /* length of the cmdline field. */
111 char *value; /* the cmdline field */
112};
113
109/* The per-task audit context. */ 114/* The per-task audit context. */
110struct audit_context { 115struct audit_context {
111 int dummy; /* must be the first element */ 116 int dummy; /* must be the first element */
@@ -202,6 +207,7 @@ struct audit_context {
202 } execve; 207 } execve;
203 }; 208 };
204 int fds[2]; 209 int fds[2];
210 struct audit_proctitle proctitle;
205 211
206#if AUDIT_DEBUG 212#if AUDIT_DEBUG
207 int put_count; 213 int put_count;
diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c
index 92062fd6cc8c..8e9bc9c3dbb7 100644
--- a/kernel/auditfilter.c
+++ b/kernel/auditfilter.c
@@ -19,6 +19,8 @@
19 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA 19 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
20 */ 20 */
21 21
22#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
23
22#include <linux/kernel.h> 24#include <linux/kernel.h>
23#include <linux/audit.h> 25#include <linux/audit.h>
24#include <linux/kthread.h> 26#include <linux/kthread.h>
@@ -226,7 +228,7 @@ static int audit_match_signal(struct audit_entry *entry)
226#endif 228#endif
227 229
228/* Common user-space to kernel rule translation. */ 230/* Common user-space to kernel rule translation. */
229static inline struct audit_entry *audit_to_entry_common(struct audit_rule *rule) 231static inline struct audit_entry *audit_to_entry_common(struct audit_rule_data *rule)
230{ 232{
231 unsigned listnr; 233 unsigned listnr;
232 struct audit_entry *entry; 234 struct audit_entry *entry;
@@ -249,7 +251,7 @@ static inline struct audit_entry *audit_to_entry_common(struct audit_rule *rule)
249 ; 251 ;
250 } 252 }
251 if (unlikely(rule->action == AUDIT_POSSIBLE)) { 253 if (unlikely(rule->action == AUDIT_POSSIBLE)) {
252 printk(KERN_ERR "AUDIT_POSSIBLE is deprecated\n"); 254 pr_err("AUDIT_POSSIBLE is deprecated\n");
253 goto exit_err; 255 goto exit_err;
254 } 256 }
255 if (rule->action != AUDIT_NEVER && rule->action != AUDIT_ALWAYS) 257 if (rule->action != AUDIT_NEVER && rule->action != AUDIT_ALWAYS)
@@ -403,7 +405,7 @@ static struct audit_entry *audit_data_to_entry(struct audit_rule_data *data,
403 int i; 405 int i;
404 char *str; 406 char *str;
405 407
406 entry = audit_to_entry_common((struct audit_rule *)data); 408 entry = audit_to_entry_common(data);
407 if (IS_ERR(entry)) 409 if (IS_ERR(entry))
408 goto exit_nofree; 410 goto exit_nofree;
409 411
@@ -431,6 +433,19 @@ static struct audit_entry *audit_data_to_entry(struct audit_rule_data *data,
431 f->val = 0; 433 f->val = 0;
432 } 434 }
433 435
436 if ((f->type == AUDIT_PID) || (f->type == AUDIT_PPID)) {
437 struct pid *pid;
438 rcu_read_lock();
439 pid = find_vpid(f->val);
440 if (!pid) {
441 rcu_read_unlock();
442 err = -ESRCH;
443 goto exit_free;
444 }
445 f->val = pid_nr(pid);
446 rcu_read_unlock();
447 }
448
434 err = audit_field_valid(entry, f); 449 err = audit_field_valid(entry, f);
435 if (err) 450 if (err)
436 goto exit_free; 451 goto exit_free;
@@ -479,8 +494,8 @@ static struct audit_entry *audit_data_to_entry(struct audit_rule_data *data,
479 /* Keep currently invalid fields around in case they 494 /* Keep currently invalid fields around in case they
480 * become valid after a policy reload. */ 495 * become valid after a policy reload. */
481 if (err == -EINVAL) { 496 if (err == -EINVAL) {
482 printk(KERN_WARNING "audit rule for LSM " 497 pr_warn("audit rule for LSM \'%s\' is invalid\n",
483 "\'%s\' is invalid\n", str); 498 str);
484 err = 0; 499 err = 0;
485 } 500 }
486 if (err) { 501 if (err) {
@@ -709,8 +724,8 @@ static inline int audit_dupe_lsm_field(struct audit_field *df,
709 /* Keep currently invalid fields around in case they 724 /* Keep currently invalid fields around in case they
710 * become valid after a policy reload. */ 725 * become valid after a policy reload. */
711 if (ret == -EINVAL) { 726 if (ret == -EINVAL) {
712 printk(KERN_WARNING "audit rule for LSM \'%s\' is " 727 pr_warn("audit rule for LSM \'%s\' is invalid\n",
713 "invalid\n", df->lsm_str); 728 df->lsm_str);
714 ret = 0; 729 ret = 0;
715 } 730 }
716 731
@@ -1240,12 +1255,14 @@ static int audit_filter_user_rules(struct audit_krule *rule, int type,
1240 1255
1241 for (i = 0; i < rule->field_count; i++) { 1256 for (i = 0; i < rule->field_count; i++) {
1242 struct audit_field *f = &rule->fields[i]; 1257 struct audit_field *f = &rule->fields[i];
1258 pid_t pid;
1243 int result = 0; 1259 int result = 0;
1244 u32 sid; 1260 u32 sid;
1245 1261
1246 switch (f->type) { 1262 switch (f->type) {
1247 case AUDIT_PID: 1263 case AUDIT_PID:
1248 result = audit_comparator(task_pid_vnr(current), f->op, f->val); 1264 pid = task_pid_nr(current);
1265 result = audit_comparator(pid, f->op, f->val);
1249 break; 1266 break;
1250 case AUDIT_UID: 1267 case AUDIT_UID:
1251 result = audit_uid_comparator(current_uid(), f->op, f->uid); 1268 result = audit_uid_comparator(current_uid(), f->op, f->uid);
diff --git a/kernel/auditsc.c b/kernel/auditsc.c
index 7aef2f4b6c64..f251a5e8d17a 100644
--- a/kernel/auditsc.c
+++ b/kernel/auditsc.c
@@ -42,6 +42,8 @@
42 * and <dustin.kirkland@us.ibm.com> for LSPP certification compliance. 42 * and <dustin.kirkland@us.ibm.com> for LSPP certification compliance.
43 */ 43 */
44 44
45#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
46
45#include <linux/init.h> 47#include <linux/init.h>
46#include <asm/types.h> 48#include <asm/types.h>
47#include <linux/atomic.h> 49#include <linux/atomic.h>
@@ -68,6 +70,7 @@
68#include <linux/capability.h> 70#include <linux/capability.h>
69#include <linux/fs_struct.h> 71#include <linux/fs_struct.h>
70#include <linux/compat.h> 72#include <linux/compat.h>
73#include <linux/ctype.h>
71 74
72#include "audit.h" 75#include "audit.h"
73 76
@@ -79,6 +82,9 @@
79/* no execve audit message should be longer than this (userspace limits) */ 82/* no execve audit message should be longer than this (userspace limits) */
80#define MAX_EXECVE_AUDIT_LEN 7500 83#define MAX_EXECVE_AUDIT_LEN 7500
81 84
85/* max length to print of cmdline/proctitle value during audit */
86#define MAX_PROCTITLE_AUDIT_LEN 128
87
82/* number of audit rules */ 88/* number of audit rules */
83int audit_n_rules; 89int audit_n_rules;
84 90
@@ -451,15 +457,17 @@ static int audit_filter_rules(struct task_struct *tsk,
451 struct audit_field *f = &rule->fields[i]; 457 struct audit_field *f = &rule->fields[i];
452 struct audit_names *n; 458 struct audit_names *n;
453 int result = 0; 459 int result = 0;
460 pid_t pid;
454 461
455 switch (f->type) { 462 switch (f->type) {
456 case AUDIT_PID: 463 case AUDIT_PID:
457 result = audit_comparator(tsk->pid, f->op, f->val); 464 pid = task_pid_nr(tsk);
465 result = audit_comparator(pid, f->op, f->val);
458 break; 466 break;
459 case AUDIT_PPID: 467 case AUDIT_PPID:
460 if (ctx) { 468 if (ctx) {
461 if (!ctx->ppid) 469 if (!ctx->ppid)
462 ctx->ppid = sys_getppid(); 470 ctx->ppid = task_ppid_nr(tsk);
463 result = audit_comparator(ctx->ppid, f->op, f->val); 471 result = audit_comparator(ctx->ppid, f->op, f->val);
464 } 472 }
465 break; 473 break;
@@ -805,7 +813,8 @@ void audit_filter_inodes(struct task_struct *tsk, struct audit_context *ctx)
805 rcu_read_unlock(); 813 rcu_read_unlock();
806} 814}
807 815
808static inline struct audit_context *audit_get_context(struct task_struct *tsk, 816/* Transfer the audit context pointer to the caller, clearing it in the tsk's struct */
817static inline struct audit_context *audit_take_context(struct task_struct *tsk,
809 int return_valid, 818 int return_valid,
810 long return_code) 819 long return_code)
811{ 820{
@@ -842,6 +851,13 @@ static inline struct audit_context *audit_get_context(struct task_struct *tsk,
842 return context; 851 return context;
843} 852}
844 853
854static inline void audit_proctitle_free(struct audit_context *context)
855{
856 kfree(context->proctitle.value);
857 context->proctitle.value = NULL;
858 context->proctitle.len = 0;
859}
860
845static inline void audit_free_names(struct audit_context *context) 861static inline void audit_free_names(struct audit_context *context)
846{ 862{
847 struct audit_names *n, *next; 863 struct audit_names *n, *next;
@@ -850,16 +866,15 @@ static inline void audit_free_names(struct audit_context *context)
850 if (context->put_count + context->ino_count != context->name_count) { 866 if (context->put_count + context->ino_count != context->name_count) {
851 int i = 0; 867 int i = 0;
852 868
853 printk(KERN_ERR "%s:%d(:%d): major=%d in_syscall=%d" 869 pr_err("%s:%d(:%d): major=%d in_syscall=%d"
854 " name_count=%d put_count=%d" 870 " name_count=%d put_count=%d ino_count=%d"
855 " ino_count=%d [NOT freeing]\n", 871 " [NOT freeing]\n", __FILE__, __LINE__,
856 __FILE__, __LINE__,
857 context->serial, context->major, context->in_syscall, 872 context->serial, context->major, context->in_syscall,
858 context->name_count, context->put_count, 873 context->name_count, context->put_count,
859 context->ino_count); 874 context->ino_count);
860 list_for_each_entry(n, &context->names_list, list) { 875 list_for_each_entry(n, &context->names_list, list) {
861 printk(KERN_ERR "names[%d] = %p = %s\n", i++, 876 pr_err("names[%d] = %p = %s\n", i++, n->name,
862 n->name, n->name->name ?: "(null)"); 877 n->name->name ?: "(null)");
863 } 878 }
864 dump_stack(); 879 dump_stack();
865 return; 880 return;
@@ -955,6 +970,7 @@ static inline void audit_free_context(struct audit_context *context)
955 audit_free_aux(context); 970 audit_free_aux(context);
956 kfree(context->filterkey); 971 kfree(context->filterkey);
957 kfree(context->sockaddr); 972 kfree(context->sockaddr);
973 audit_proctitle_free(context);
958 kfree(context); 974 kfree(context);
959} 975}
960 976
@@ -1157,7 +1173,7 @@ static void audit_log_execve_info(struct audit_context *context,
1157 */ 1173 */
1158 buf = kmalloc(MAX_EXECVE_AUDIT_LEN + 1, GFP_KERNEL); 1174 buf = kmalloc(MAX_EXECVE_AUDIT_LEN + 1, GFP_KERNEL);
1159 if (!buf) { 1175 if (!buf) {
1160 audit_panic("out of memory for argv string\n"); 1176 audit_panic("out of memory for argv string");
1161 return; 1177 return;
1162 } 1178 }
1163 1179
@@ -1271,6 +1287,59 @@ static void show_special(struct audit_context *context, int *call_panic)
1271 audit_log_end(ab); 1287 audit_log_end(ab);
1272} 1288}
1273 1289
1290static inline int audit_proctitle_rtrim(char *proctitle, int len)
1291{
1292 char *end = proctitle + len - 1;
1293 while (end > proctitle && !isprint(*end))
1294 end--;
1295
1296 /* catch the case where proctitle is only 1 non-print character */
1297 len = end - proctitle + 1;
1298 len -= isprint(proctitle[len-1]) == 0;
1299 return len;
1300}
1301
1302static void audit_log_proctitle(struct task_struct *tsk,
1303 struct audit_context *context)
1304{
1305 int res;
1306 char *buf;
1307 char *msg = "(null)";
1308 int len = strlen(msg);
1309 struct audit_buffer *ab;
1310
1311 ab = audit_log_start(context, GFP_KERNEL, AUDIT_PROCTITLE);
1312 if (!ab)
1313 return; /* audit_panic or being filtered */
1314
1315 audit_log_format(ab, "proctitle=");
1316
1317 /* Not cached */
1318 if (!context->proctitle.value) {
1319 buf = kmalloc(MAX_PROCTITLE_AUDIT_LEN, GFP_KERNEL);
1320 if (!buf)
1321 goto out;
1322 /* Historically called this from procfs naming */
1323 res = get_cmdline(tsk, buf, MAX_PROCTITLE_AUDIT_LEN);
1324 if (res == 0) {
1325 kfree(buf);
1326 goto out;
1327 }
1328 res = audit_proctitle_rtrim(buf, res);
1329 if (res == 0) {
1330 kfree(buf);
1331 goto out;
1332 }
1333 context->proctitle.value = buf;
1334 context->proctitle.len = res;
1335 }
1336 msg = context->proctitle.value;
1337 len = context->proctitle.len;
1338out:
1339 audit_log_n_untrustedstring(ab, msg, len);
1340 audit_log_end(ab);
1341}
1342
1274static void audit_log_exit(struct audit_context *context, struct task_struct *tsk) 1343static void audit_log_exit(struct audit_context *context, struct task_struct *tsk)
1275{ 1344{
1276 int i, call_panic = 0; 1345 int i, call_panic = 0;
@@ -1388,6 +1457,8 @@ static void audit_log_exit(struct audit_context *context, struct task_struct *ts
1388 audit_log_name(context, n, NULL, i++, &call_panic); 1457 audit_log_name(context, n, NULL, i++, &call_panic);
1389 } 1458 }
1390 1459
1460 audit_log_proctitle(tsk, context);
1461
1391 /* Send end of event record to help user space know we are finished */ 1462 /* Send end of event record to help user space know we are finished */
1392 ab = audit_log_start(context, GFP_KERNEL, AUDIT_EOE); 1463 ab = audit_log_start(context, GFP_KERNEL, AUDIT_EOE);
1393 if (ab) 1464 if (ab)
@@ -1406,7 +1477,7 @@ void __audit_free(struct task_struct *tsk)
1406{ 1477{
1407 struct audit_context *context; 1478 struct audit_context *context;
1408 1479
1409 context = audit_get_context(tsk, 0, 0); 1480 context = audit_take_context(tsk, 0, 0);
1410 if (!context) 1481 if (!context)
1411 return; 1482 return;
1412 1483
@@ -1500,7 +1571,7 @@ void __audit_syscall_exit(int success, long return_code)
1500 else 1571 else
1501 success = AUDITSC_FAILURE; 1572 success = AUDITSC_FAILURE;
1502 1573
1503 context = audit_get_context(tsk, success, return_code); 1574 context = audit_take_context(tsk, success, return_code);
1504 if (!context) 1575 if (!context)
1505 return; 1576 return;
1506 1577
@@ -1550,7 +1621,7 @@ static inline void handle_one(const struct inode *inode)
1550 if (likely(put_tree_ref(context, chunk))) 1621 if (likely(put_tree_ref(context, chunk)))
1551 return; 1622 return;
1552 if (unlikely(!grow_tree_refs(context))) { 1623 if (unlikely(!grow_tree_refs(context))) {
1553 printk(KERN_WARNING "out of memory, audit has lost a tree reference\n"); 1624 pr_warn("out of memory, audit has lost a tree reference\n");
1554 audit_set_auditable(context); 1625 audit_set_auditable(context);
1555 audit_put_chunk(chunk); 1626 audit_put_chunk(chunk);
1556 unroll_tree_refs(context, p, count); 1627 unroll_tree_refs(context, p, count);
@@ -1609,8 +1680,7 @@ retry:
1609 goto retry; 1680 goto retry;
1610 } 1681 }
1611 /* too bad */ 1682 /* too bad */
1612 printk(KERN_WARNING 1683 pr_warn("out of memory, audit has lost a tree reference\n");
1613 "out of memory, audit has lost a tree reference\n");
1614 unroll_tree_refs(context, p, count); 1684 unroll_tree_refs(context, p, count);
1615 audit_set_auditable(context); 1685 audit_set_auditable(context);
1616 return; 1686 return;
@@ -1682,7 +1752,7 @@ void __audit_getname(struct filename *name)
1682 1752
1683 if (!context->in_syscall) { 1753 if (!context->in_syscall) {
1684#if AUDIT_DEBUG == 2 1754#if AUDIT_DEBUG == 2
1685 printk(KERN_ERR "%s:%d(:%d): ignoring getname(%p)\n", 1755 pr_err("%s:%d(:%d): ignoring getname(%p)\n",
1686 __FILE__, __LINE__, context->serial, name); 1756 __FILE__, __LINE__, context->serial, name);
1687 dump_stack(); 1757 dump_stack();
1688#endif 1758#endif
@@ -1721,15 +1791,15 @@ void audit_putname(struct filename *name)
1721 BUG_ON(!context); 1791 BUG_ON(!context);
1722 if (!name->aname || !context->in_syscall) { 1792 if (!name->aname || !context->in_syscall) {
1723#if AUDIT_DEBUG == 2 1793#if AUDIT_DEBUG == 2
1724 printk(KERN_ERR "%s:%d(:%d): final_putname(%p)\n", 1794 pr_err("%s:%d(:%d): final_putname(%p)\n",
1725 __FILE__, __LINE__, context->serial, name); 1795 __FILE__, __LINE__, context->serial, name);
1726 if (context->name_count) { 1796 if (context->name_count) {
1727 struct audit_names *n; 1797 struct audit_names *n;
1728 int i = 0; 1798 int i = 0;
1729 1799
1730 list_for_each_entry(n, &context->names_list, list) 1800 list_for_each_entry(n, &context->names_list, list)
1731 printk(KERN_ERR "name[%d] = %p = %s\n", i++, 1801 pr_err("name[%d] = %p = %s\n", i++, n->name,
1732 n->name, n->name->name ?: "(null)"); 1802 n->name->name ?: "(null)");
1733 } 1803 }
1734#endif 1804#endif
1735 final_putname(name); 1805 final_putname(name);
@@ -1738,9 +1808,8 @@ void audit_putname(struct filename *name)
1738 else { 1808 else {
1739 ++context->put_count; 1809 ++context->put_count;
1740 if (context->put_count > context->name_count) { 1810 if (context->put_count > context->name_count) {
1741 printk(KERN_ERR "%s:%d(:%d): major=%d" 1811 pr_err("%s:%d(:%d): major=%d in_syscall=%d putname(%p)"
1742 " in_syscall=%d putname(%p) name_count=%d" 1812 " name_count=%d put_count=%d\n",
1743 " put_count=%d\n",
1744 __FILE__, __LINE__, 1813 __FILE__, __LINE__,
1745 context->serial, context->major, 1814 context->serial, context->major,
1746 context->in_syscall, name->name, 1815 context->in_syscall, name->name,
@@ -1981,12 +2050,10 @@ static void audit_log_set_loginuid(kuid_t koldloginuid, kuid_t kloginuid,
1981 ab = audit_log_start(NULL, GFP_KERNEL, AUDIT_LOGIN); 2050 ab = audit_log_start(NULL, GFP_KERNEL, AUDIT_LOGIN);
1982 if (!ab) 2051 if (!ab)
1983 return; 2052 return;
1984 audit_log_format(ab, "pid=%d uid=%u" 2053 audit_log_format(ab, "pid=%d uid=%u", task_pid_nr(current), uid);
1985 " old-auid=%u new-auid=%u old-ses=%u new-ses=%u" 2054 audit_log_task_context(ab);
1986 " res=%d", 2055 audit_log_format(ab, " old-auid=%u auid=%u old-ses=%u ses=%u res=%d",
1987 current->pid, uid, 2056 oldloginuid, loginuid, oldsessionid, sessionid, !rc);
1988 oldloginuid, loginuid, oldsessionid, sessionid,
1989 !rc);
1990 audit_log_end(ab); 2057 audit_log_end(ab);
1991} 2058}
1992 2059
@@ -2208,7 +2275,7 @@ void __audit_ptrace(struct task_struct *t)
2208{ 2275{
2209 struct audit_context *context = current->audit_context; 2276 struct audit_context *context = current->audit_context;
2210 2277
2211 context->target_pid = t->pid; 2278 context->target_pid = task_pid_nr(t);
2212 context->target_auid = audit_get_loginuid(t); 2279 context->target_auid = audit_get_loginuid(t);
2213 context->target_uid = task_uid(t); 2280 context->target_uid = task_uid(t);
2214 context->target_sessionid = audit_get_sessionid(t); 2281 context->target_sessionid = audit_get_sessionid(t);
@@ -2233,7 +2300,7 @@ int __audit_signal_info(int sig, struct task_struct *t)
2233 2300
2234 if (audit_pid && t->tgid == audit_pid) { 2301 if (audit_pid && t->tgid == audit_pid) {
2235 if (sig == SIGTERM || sig == SIGHUP || sig == SIGUSR1 || sig == SIGUSR2) { 2302 if (sig == SIGTERM || sig == SIGHUP || sig == SIGUSR1 || sig == SIGUSR2) {
2236 audit_sig_pid = tsk->pid; 2303 audit_sig_pid = task_pid_nr(tsk);
2237 if (uid_valid(tsk->loginuid)) 2304 if (uid_valid(tsk->loginuid))
2238 audit_sig_uid = tsk->loginuid; 2305 audit_sig_uid = tsk->loginuid;
2239 else 2306 else
@@ -2247,7 +2314,7 @@ int __audit_signal_info(int sig, struct task_struct *t)
2247 /* optimize the common case by putting first signal recipient directly 2314 /* optimize the common case by putting first signal recipient directly
2248 * in audit_context */ 2315 * in audit_context */
2249 if (!ctx->target_pid) { 2316 if (!ctx->target_pid) {
2250 ctx->target_pid = t->tgid; 2317 ctx->target_pid = task_tgid_nr(t);
2251 ctx->target_auid = audit_get_loginuid(t); 2318 ctx->target_auid = audit_get_loginuid(t);
2252 ctx->target_uid = t_uid; 2319 ctx->target_uid = t_uid;
2253 ctx->target_sessionid = audit_get_sessionid(t); 2320 ctx->target_sessionid = audit_get_sessionid(t);
@@ -2268,7 +2335,7 @@ int __audit_signal_info(int sig, struct task_struct *t)
2268 } 2335 }
2269 BUG_ON(axp->pid_count >= AUDIT_AUX_PIDS); 2336 BUG_ON(axp->pid_count >= AUDIT_AUX_PIDS);
2270 2337
2271 axp->target_pid[axp->pid_count] = t->tgid; 2338 axp->target_pid[axp->pid_count] = task_tgid_nr(t);
2272 axp->target_auid[axp->pid_count] = audit_get_loginuid(t); 2339 axp->target_auid[axp->pid_count] = audit_get_loginuid(t);
2273 axp->target_uid[axp->pid_count] = t_uid; 2340 axp->target_uid[axp->pid_count] = t_uid;
2274 axp->target_sessionid[axp->pid_count] = audit_get_sessionid(t); 2341 axp->target_sessionid[axp->pid_count] = audit_get_sessionid(t);
@@ -2368,7 +2435,7 @@ static void audit_log_task(struct audit_buffer *ab)
2368 from_kgid(&init_user_ns, gid), 2435 from_kgid(&init_user_ns, gid),
2369 sessionid); 2436 sessionid);
2370 audit_log_task_context(ab); 2437 audit_log_task_context(ab);
2371 audit_log_format(ab, " pid=%d comm=", current->pid); 2438 audit_log_format(ab, " pid=%d comm=", task_pid_nr(current));
2372 audit_log_untrustedstring(ab, current->comm); 2439 audit_log_untrustedstring(ab, current->comm);
2373 if (mm) { 2440 if (mm) {
2374 down_read(&mm->mmap_sem); 2441 down_read(&mm->mmap_sem);
diff --git a/kernel/capability.c b/kernel/capability.c
index 34019c57888d..a8d63df0c322 100644
--- a/kernel/capability.c
+++ b/kernel/capability.c
@@ -7,6 +7,8 @@
7 * 30 May 2002: Cleanup, Robert M. Love <rml@tech9.net> 7 * 30 May 2002: Cleanup, Robert M. Love <rml@tech9.net>
8 */ 8 */
9 9
10#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
11
10#include <linux/audit.h> 12#include <linux/audit.h>
11#include <linux/capability.h> 13#include <linux/capability.h>
12#include <linux/mm.h> 14#include <linux/mm.h>
@@ -42,15 +44,10 @@ __setup("no_file_caps", file_caps_disable);
42 44
43static void warn_legacy_capability_use(void) 45static void warn_legacy_capability_use(void)
44{ 46{
45 static int warned; 47 char name[sizeof(current->comm)];
46 if (!warned) { 48
47 char name[sizeof(current->comm)]; 49 pr_info_once("warning: `%s' uses 32-bit capabilities (legacy support in use)\n",
48 50 get_task_comm(name, current));
49 printk(KERN_INFO "warning: `%s' uses 32-bit capabilities"
50 " (legacy support in use)\n",
51 get_task_comm(name, current));
52 warned = 1;
53 }
54} 51}
55 52
56/* 53/*
@@ -71,16 +68,10 @@ static void warn_legacy_capability_use(void)
71 68
72static void warn_deprecated_v2(void) 69static void warn_deprecated_v2(void)
73{ 70{
74 static int warned; 71 char name[sizeof(current->comm)];
75 72
76 if (!warned) { 73 pr_info_once("warning: `%s' uses deprecated v2 capabilities in a way that may be insecure\n",
77 char name[sizeof(current->comm)]; 74 get_task_comm(name, current));
78
79 printk(KERN_INFO "warning: `%s' uses deprecated v2"
80 " capabilities in a way that may be insecure.\n",
81 get_task_comm(name, current));
82 warned = 1;
83 }
84} 75}
85 76
86/* 77/*
@@ -380,7 +371,7 @@ bool has_capability_noaudit(struct task_struct *t, int cap)
380bool ns_capable(struct user_namespace *ns, int cap) 371bool ns_capable(struct user_namespace *ns, int cap)
381{ 372{
382 if (unlikely(!cap_valid(cap))) { 373 if (unlikely(!cap_valid(cap))) {
383 printk(KERN_CRIT "capable() called with invalid cap=%u\n", cap); 374 pr_crit("capable() called with invalid cap=%u\n", cap);
384 BUG(); 375 BUG();
385 } 376 }
386 377
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 0c753ddd223b..9fcdaa705b6c 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -40,23 +40,20 @@
40#include <linux/proc_fs.h> 40#include <linux/proc_fs.h>
41#include <linux/rcupdate.h> 41#include <linux/rcupdate.h>
42#include <linux/sched.h> 42#include <linux/sched.h>
43#include <linux/backing-dev.h>
44#include <linux/slab.h> 43#include <linux/slab.h>
45#include <linux/magic.h>
46#include <linux/spinlock.h> 44#include <linux/spinlock.h>
45#include <linux/rwsem.h>
47#include <linux/string.h> 46#include <linux/string.h>
48#include <linux/sort.h> 47#include <linux/sort.h>
49#include <linux/kmod.h> 48#include <linux/kmod.h>
50#include <linux/module.h>
51#include <linux/delayacct.h> 49#include <linux/delayacct.h>
52#include <linux/cgroupstats.h> 50#include <linux/cgroupstats.h>
53#include <linux/hashtable.h> 51#include <linux/hashtable.h>
54#include <linux/namei.h>
55#include <linux/pid_namespace.h> 52#include <linux/pid_namespace.h>
56#include <linux/idr.h> 53#include <linux/idr.h>
57#include <linux/vmalloc.h> /* TODO: replace with more sophisticated array */ 54#include <linux/vmalloc.h> /* TODO: replace with more sophisticated array */
58#include <linux/flex_array.h> /* used in cgroup_attach_task */
59#include <linux/kthread.h> 55#include <linux/kthread.h>
56#include <linux/delay.h>
60 57
61#include <linux/atomic.h> 58#include <linux/atomic.h>
62 59
@@ -68,43 +65,49 @@
68 */ 65 */
69#define CGROUP_PIDLIST_DESTROY_DELAY HZ 66#define CGROUP_PIDLIST_DESTROY_DELAY HZ
70 67
68#define CGROUP_FILE_NAME_MAX (MAX_CGROUP_TYPE_NAMELEN + \
69 MAX_CFTYPE_NAME + 2)
70
71/*
72 * cgroup_tree_mutex nests above cgroup_mutex and protects cftypes, file
73 * creation/removal and hierarchy changing operations including cgroup
74 * creation, removal, css association and controller rebinding. This outer
75 * lock is needed mainly to resolve the circular dependency between kernfs
76 * active ref and cgroup_mutex. cgroup_tree_mutex nests above both.
77 */
78static DEFINE_MUTEX(cgroup_tree_mutex);
79
71/* 80/*
72 * cgroup_mutex is the master lock. Any modification to cgroup or its 81 * cgroup_mutex is the master lock. Any modification to cgroup or its
73 * hierarchy must be performed while holding it. 82 * hierarchy must be performed while holding it.
74 * 83 *
75 * cgroup_root_mutex nests inside cgroup_mutex and should be held to modify 84 * css_set_rwsem protects task->cgroups pointer, the list of css_set
76 * cgroupfs_root of any cgroup hierarchy - subsys list, flags, 85 * objects, and the chain of tasks off each css_set.
77 * release_agent_path and so on. Modifying requires both cgroup_mutex and
78 * cgroup_root_mutex. Readers can acquire either of the two. This is to
79 * break the following locking order cycle.
80 *
81 * A. cgroup_mutex -> cred_guard_mutex -> s_type->i_mutex_key -> namespace_sem
82 * B. namespace_sem -> cgroup_mutex
83 * 86 *
84 * B happens only through cgroup_show_options() and using cgroup_root_mutex 87 * These locks are exported if CONFIG_PROVE_RCU so that accessors in
85 * breaks it. 88 * cgroup.h can use them for lockdep annotations.
86 */ 89 */
87#ifdef CONFIG_PROVE_RCU 90#ifdef CONFIG_PROVE_RCU
88DEFINE_MUTEX(cgroup_mutex); 91DEFINE_MUTEX(cgroup_mutex);
89EXPORT_SYMBOL_GPL(cgroup_mutex); /* only for lockdep */ 92DECLARE_RWSEM(css_set_rwsem);
93EXPORT_SYMBOL_GPL(cgroup_mutex);
94EXPORT_SYMBOL_GPL(css_set_rwsem);
90#else 95#else
91static DEFINE_MUTEX(cgroup_mutex); 96static DEFINE_MUTEX(cgroup_mutex);
97static DECLARE_RWSEM(css_set_rwsem);
92#endif 98#endif
93 99
94static DEFINE_MUTEX(cgroup_root_mutex); 100/*
101 * Protects cgroup_subsys->release_agent_path. Modifying it also requires
102 * cgroup_mutex. Reading requires either cgroup_mutex or this spinlock.
103 */
104static DEFINE_SPINLOCK(release_agent_path_lock);
95 105
96#define cgroup_assert_mutex_or_rcu_locked() \ 106#define cgroup_assert_mutexes_or_rcu_locked() \
97 rcu_lockdep_assert(rcu_read_lock_held() || \ 107 rcu_lockdep_assert(rcu_read_lock_held() || \
108 lockdep_is_held(&cgroup_tree_mutex) || \
98 lockdep_is_held(&cgroup_mutex), \ 109 lockdep_is_held(&cgroup_mutex), \
99 "cgroup_mutex or RCU read lock required"); 110 "cgroup_[tree_]mutex or RCU read lock required");
100
101#ifdef CONFIG_LOCKDEP
102#define cgroup_assert_mutex_or_root_locked() \
103 WARN_ON_ONCE(debug_locks && (!lockdep_is_held(&cgroup_mutex) && \
104 !lockdep_is_held(&cgroup_root_mutex)))
105#else
106#define cgroup_assert_mutex_or_root_locked() do { } while (0)
107#endif
108 111
109/* 112/*
110 * cgroup destruction makes heavy use of work items and there can be a lot 113 * cgroup destruction makes heavy use of work items and there can be a lot
@@ -120,42 +123,41 @@ static struct workqueue_struct *cgroup_destroy_wq;
120 */ 123 */
121static struct workqueue_struct *cgroup_pidlist_destroy_wq; 124static struct workqueue_struct *cgroup_pidlist_destroy_wq;
122 125
123/* 126/* generate an array of cgroup subsystem pointers */
124 * Generate an array of cgroup subsystem pointers. At boot time, this is 127#define SUBSYS(_x) [_x ## _cgrp_id] = &_x ## _cgrp_subsys,
125 * populated with the built in subsystems, and modular subsystems are 128static struct cgroup_subsys *cgroup_subsys[] = {
126 * registered after that. The mutable section of this array is protected by 129#include <linux/cgroup_subsys.h>
127 * cgroup_mutex. 130};
128 */ 131#undef SUBSYS
129#define SUBSYS(_x) [_x ## _subsys_id] = &_x ## _subsys, 132
130#define IS_SUBSYS_ENABLED(option) IS_BUILTIN(option) 133/* array of cgroup subsystem names */
131static struct cgroup_subsys *cgroup_subsys[CGROUP_SUBSYS_COUNT] = { 134#define SUBSYS(_x) [_x ## _cgrp_id] = #_x,
135static const char *cgroup_subsys_name[] = {
132#include <linux/cgroup_subsys.h> 136#include <linux/cgroup_subsys.h>
133}; 137};
138#undef SUBSYS
134 139
135/* 140/*
136 * The dummy hierarchy, reserved for the subsystems that are otherwise 141 * The default hierarchy, reserved for the subsystems that are otherwise
137 * unattached - it never has more than a single cgroup, and all tasks are 142 * unattached - it never has more than a single cgroup, and all tasks are
138 * part of that cgroup. 143 * part of that cgroup.
139 */ 144 */
140static struct cgroupfs_root cgroup_dummy_root; 145struct cgroup_root cgrp_dfl_root;
141 146
142/* dummy_top is a shorthand for the dummy hierarchy's top cgroup */ 147/*
143static struct cgroup * const cgroup_dummy_top = &cgroup_dummy_root.top_cgroup; 148 * The default hierarchy always exists but is hidden until mounted for the
149 * first time. This is for backward compatibility.
150 */
151static bool cgrp_dfl_root_visible;
144 152
145/* The list of hierarchy roots */ 153/* The list of hierarchy roots */
146 154
147static LIST_HEAD(cgroup_roots); 155static LIST_HEAD(cgroup_roots);
148static int cgroup_root_count; 156static int cgroup_root_count;
149 157
150/* 158/* hierarchy ID allocation and mapping, protected by cgroup_mutex */
151 * Hierarchy ID allocation and mapping. It follows the same exclusion
152 * rules as other root ops - both cgroup_mutex and cgroup_root_mutex for
153 * writes, either for reads.
154 */
155static DEFINE_IDR(cgroup_hierarchy_idr); 159static DEFINE_IDR(cgroup_hierarchy_idr);
156 160
157static struct cgroup_name root_cgroup_name = { .name = "/" };
158
159/* 161/*
160 * Assign a monotonically increasing serial number to cgroups. It 162 * Assign a monotonically increasing serial number to cgroups. It
161 * guarantees cgroups with bigger numbers are newer than those with smaller 163 * guarantees cgroups with bigger numbers are newer than those with smaller
@@ -175,11 +177,13 @@ static int need_forkexit_callback __read_mostly;
175 177
176static struct cftype cgroup_base_files[]; 178static struct cftype cgroup_base_files[];
177 179
180static void cgroup_put(struct cgroup *cgrp);
181static int rebind_subsystems(struct cgroup_root *dst_root,
182 unsigned long ss_mask);
178static void cgroup_destroy_css_killed(struct cgroup *cgrp); 183static void cgroup_destroy_css_killed(struct cgroup *cgrp);
179static int cgroup_destroy_locked(struct cgroup *cgrp); 184static int cgroup_destroy_locked(struct cgroup *cgrp);
180static int cgroup_addrm_files(struct cgroup *cgrp, struct cftype cfts[], 185static int cgroup_addrm_files(struct cgroup *cgrp, struct cftype cfts[],
181 bool is_add); 186 bool is_add);
182static int cgroup_file_release(struct inode *inode, struct file *file);
183static void cgroup_pidlist_destroy_all(struct cgroup *cgrp); 187static void cgroup_pidlist_destroy_all(struct cgroup *cgrp);
184 188
185/** 189/**
@@ -197,8 +201,9 @@ static struct cgroup_subsys_state *cgroup_css(struct cgroup *cgrp,
197 struct cgroup_subsys *ss) 201 struct cgroup_subsys *ss)
198{ 202{
199 if (ss) 203 if (ss)
200 return rcu_dereference_check(cgrp->subsys[ss->subsys_id], 204 return rcu_dereference_check(cgrp->subsys[ss->id],
201 lockdep_is_held(&cgroup_mutex)); 205 lockdep_is_held(&cgroup_tree_mutex) ||
206 lockdep_is_held(&cgroup_mutex));
202 else 207 else
203 return &cgrp->dummy_css; 208 return &cgrp->dummy_css;
204} 209}
@@ -209,6 +214,27 @@ static inline bool cgroup_is_dead(const struct cgroup *cgrp)
209 return test_bit(CGRP_DEAD, &cgrp->flags); 214 return test_bit(CGRP_DEAD, &cgrp->flags);
210} 215}
211 216
217struct cgroup_subsys_state *seq_css(struct seq_file *seq)
218{
219 struct kernfs_open_file *of = seq->private;
220 struct cgroup *cgrp = of->kn->parent->priv;
221 struct cftype *cft = seq_cft(seq);
222
223 /*
224 * This is open and unprotected implementation of cgroup_css().
225 * seq_css() is only called from a kernfs file operation which has
226 * an active reference on the file. Because all the subsystem
227 * files are drained before a css is disassociated with a cgroup,
228 * the matching css from the cgroup's subsys table is guaranteed to
229 * be and stay valid until the enclosing operation is complete.
230 */
231 if (cft->ss)
232 return rcu_dereference_raw(cgrp->subsys[cft->ss->id]);
233 else
234 return &cgrp->dummy_css;
235}
236EXPORT_SYMBOL_GPL(seq_css);
237
212/** 238/**
213 * cgroup_is_descendant - test ancestry 239 * cgroup_is_descendant - test ancestry
214 * @cgrp: the cgroup to be tested 240 * @cgrp: the cgroup to be tested
@@ -227,7 +253,6 @@ bool cgroup_is_descendant(struct cgroup *cgrp, struct cgroup *ancestor)
227 } 253 }
228 return false; 254 return false;
229} 255}
230EXPORT_SYMBOL_GPL(cgroup_is_descendant);
231 256
232static int cgroup_is_releasable(const struct cgroup *cgrp) 257static int cgroup_is_releasable(const struct cgroup *cgrp)
233{ 258{
@@ -254,54 +279,23 @@ static int notify_on_release(const struct cgroup *cgrp)
254 for ((ssid) = 0; (ssid) < CGROUP_SUBSYS_COUNT; (ssid)++) \ 279 for ((ssid) = 0; (ssid) < CGROUP_SUBSYS_COUNT; (ssid)++) \
255 if (!((css) = rcu_dereference_check( \ 280 if (!((css) = rcu_dereference_check( \
256 (cgrp)->subsys[(ssid)], \ 281 (cgrp)->subsys[(ssid)], \
282 lockdep_is_held(&cgroup_tree_mutex) || \
257 lockdep_is_held(&cgroup_mutex)))) { } \ 283 lockdep_is_held(&cgroup_mutex)))) { } \
258 else 284 else
259 285
260/** 286/**
261 * for_each_subsys - iterate all loaded cgroup subsystems 287 * for_each_subsys - iterate all enabled cgroup subsystems
262 * @ss: the iteration cursor 288 * @ss: the iteration cursor
263 * @ssid: the index of @ss, CGROUP_SUBSYS_COUNT after reaching the end 289 * @ssid: the index of @ss, CGROUP_SUBSYS_COUNT after reaching the end
264 *
265 * Iterates through all loaded subsystems. Should be called under
266 * cgroup_mutex or cgroup_root_mutex.
267 */ 290 */
268#define for_each_subsys(ss, ssid) \ 291#define for_each_subsys(ss, ssid) \
269 for (({ cgroup_assert_mutex_or_root_locked(); (ssid) = 0; }); \ 292 for ((ssid) = 0; (ssid) < CGROUP_SUBSYS_COUNT && \
270 (ssid) < CGROUP_SUBSYS_COUNT; (ssid)++) \ 293 (((ss) = cgroup_subsys[ssid]) || true); (ssid)++)
271 if (!((ss) = cgroup_subsys[(ssid)])) { } \
272 else
273 294
274/** 295/* iterate across the hierarchies */
275 * for_each_builtin_subsys - iterate all built-in cgroup subsystems 296#define for_each_root(root) \
276 * @ss: the iteration cursor
277 * @i: the index of @ss, CGROUP_BUILTIN_SUBSYS_COUNT after reaching the end
278 *
279 * Bulit-in subsystems are always present and iteration itself doesn't
280 * require any synchronization.
281 */
282#define for_each_builtin_subsys(ss, i) \
283 for ((i) = 0; (i) < CGROUP_BUILTIN_SUBSYS_COUNT && \
284 (((ss) = cgroup_subsys[i]) || true); (i)++)
285
286/* iterate across the active hierarchies */
287#define for_each_active_root(root) \
288 list_for_each_entry((root), &cgroup_roots, root_list) 297 list_for_each_entry((root), &cgroup_roots, root_list)
289 298
290static inline struct cgroup *__d_cgrp(struct dentry *dentry)
291{
292 return dentry->d_fsdata;
293}
294
295static inline struct cfent *__d_cfe(struct dentry *dentry)
296{
297 return dentry->d_fsdata;
298}
299
300static inline struct cftype *__d_cft(struct dentry *dentry)
301{
302 return __d_cfe(dentry)->type;
303}
304
305/** 299/**
306 * cgroup_lock_live_group - take cgroup_mutex and check that cgrp is alive. 300 * cgroup_lock_live_group - take cgroup_mutex and check that cgrp is alive.
307 * @cgrp: the cgroup to be checked for liveness 301 * @cgrp: the cgroup to be checked for liveness
@@ -347,23 +341,23 @@ struct cgrp_cset_link {
347 struct list_head cgrp_link; 341 struct list_head cgrp_link;
348}; 342};
349 343
350/* The default css_set - used by init and its children prior to any 344/*
345 * The default css_set - used by init and its children prior to any
351 * hierarchies being mounted. It contains a pointer to the root state 346 * hierarchies being mounted. It contains a pointer to the root state
352 * for each subsystem. Also used to anchor the list of css_sets. Not 347 * for each subsystem. Also used to anchor the list of css_sets. Not
353 * reference-counted, to improve performance when child cgroups 348 * reference-counted, to improve performance when child cgroups
354 * haven't been created. 349 * haven't been created.
355 */ 350 */
351static struct css_set init_css_set = {
352 .refcount = ATOMIC_INIT(1),
353 .cgrp_links = LIST_HEAD_INIT(init_css_set.cgrp_links),
354 .tasks = LIST_HEAD_INIT(init_css_set.tasks),
355 .mg_tasks = LIST_HEAD_INIT(init_css_set.mg_tasks),
356 .mg_preload_node = LIST_HEAD_INIT(init_css_set.mg_preload_node),
357 .mg_node = LIST_HEAD_INIT(init_css_set.mg_node),
358};
356 359
357static struct css_set init_css_set; 360static int css_set_count = 1; /* 1 for init_css_set */
358static struct cgrp_cset_link init_cgrp_cset_link;
359
360/*
361 * css_set_lock protects the list of css_set objects, and the chain of
362 * tasks off each css_set. Nests outside task->alloc_lock due to
363 * css_task_iter_start().
364 */
365static DEFINE_RWLOCK(css_set_lock);
366static int css_set_count;
367 361
368/* 362/*
369 * hash table for cgroup groups. This improves the performance to find 363 * hash table for cgroup groups. This improves the performance to find
@@ -386,30 +380,14 @@ static unsigned long css_set_hash(struct cgroup_subsys_state *css[])
386 return key; 380 return key;
387} 381}
388 382
389/* 383static void put_css_set_locked(struct css_set *cset, bool taskexit)
390 * We don't maintain the lists running through each css_set to its task
391 * until after the first call to css_task_iter_start(). This reduces the
392 * fork()/exit() overhead for people who have cgroups compiled into their
393 * kernel but not actually in use.
394 */
395static int use_task_css_set_links __read_mostly;
396
397static void __put_css_set(struct css_set *cset, int taskexit)
398{ 384{
399 struct cgrp_cset_link *link, *tmp_link; 385 struct cgrp_cset_link *link, *tmp_link;
400 386
401 /* 387 lockdep_assert_held(&css_set_rwsem);
402 * Ensure that the refcount doesn't hit zero while any readers 388
403 * can see it. Similar to atomic_dec_and_lock(), but for an 389 if (!atomic_dec_and_test(&cset->refcount))
404 * rwlock
405 */
406 if (atomic_add_unless(&cset->refcount, -1, 1))
407 return;
408 write_lock(&css_set_lock);
409 if (!atomic_dec_and_test(&cset->refcount)) {
410 write_unlock(&css_set_lock);
411 return; 390 return;
412 }
413 391
414 /* This css_set is dead. unlink it and release cgroup refcounts */ 392 /* This css_set is dead. unlink it and release cgroup refcounts */
415 hash_del(&cset->hlist); 393 hash_del(&cset->hlist);
@@ -421,7 +399,7 @@ static void __put_css_set(struct css_set *cset, int taskexit)
421 list_del(&link->cset_link); 399 list_del(&link->cset_link);
422 list_del(&link->cgrp_link); 400 list_del(&link->cgrp_link);
423 401
424 /* @cgrp can't go away while we're holding css_set_lock */ 402 /* @cgrp can't go away while we're holding css_set_rwsem */
425 if (list_empty(&cgrp->cset_links) && notify_on_release(cgrp)) { 403 if (list_empty(&cgrp->cset_links) && notify_on_release(cgrp)) {
426 if (taskexit) 404 if (taskexit)
427 set_bit(CGRP_RELEASABLE, &cgrp->flags); 405 set_bit(CGRP_RELEASABLE, &cgrp->flags);
@@ -431,10 +409,24 @@ static void __put_css_set(struct css_set *cset, int taskexit)
431 kfree(link); 409 kfree(link);
432 } 410 }
433 411
434 write_unlock(&css_set_lock);
435 kfree_rcu(cset, rcu_head); 412 kfree_rcu(cset, rcu_head);
436} 413}
437 414
415static void put_css_set(struct css_set *cset, bool taskexit)
416{
417 /*
418 * Ensure that the refcount doesn't hit zero while any readers
419 * can see it. Similar to atomic_dec_and_lock(), but for an
420 * rwlock
421 */
422 if (atomic_add_unless(&cset->refcount, -1, 1))
423 return;
424
425 down_write(&css_set_rwsem);
426 put_css_set_locked(cset, taskexit);
427 up_write(&css_set_rwsem);
428}
429
438/* 430/*
439 * refcounted get/put for css_set objects 431 * refcounted get/put for css_set objects
440 */ 432 */
@@ -443,16 +435,6 @@ static inline void get_css_set(struct css_set *cset)
443 atomic_inc(&cset->refcount); 435 atomic_inc(&cset->refcount);
444} 436}
445 437
446static inline void put_css_set(struct css_set *cset)
447{
448 __put_css_set(cset, 0);
449}
450
451static inline void put_css_set_taskexit(struct css_set *cset)
452{
453 __put_css_set(cset, 1);
454}
455
456/** 438/**
457 * compare_css_sets - helper function for find_existing_css_set(). 439 * compare_css_sets - helper function for find_existing_css_set().
458 * @cset: candidate css_set being tested 440 * @cset: candidate css_set being tested
@@ -535,7 +517,7 @@ static struct css_set *find_existing_css_set(struct css_set *old_cset,
535 struct cgroup *cgrp, 517 struct cgroup *cgrp,
536 struct cgroup_subsys_state *template[]) 518 struct cgroup_subsys_state *template[])
537{ 519{
538 struct cgroupfs_root *root = cgrp->root; 520 struct cgroup_root *root = cgrp->root;
539 struct cgroup_subsys *ss; 521 struct cgroup_subsys *ss;
540 struct css_set *cset; 522 struct css_set *cset;
541 unsigned long key; 523 unsigned long key;
@@ -547,7 +529,7 @@ static struct css_set *find_existing_css_set(struct css_set *old_cset,
547 * won't change, so no need for locking. 529 * won't change, so no need for locking.
548 */ 530 */
549 for_each_subsys(ss, i) { 531 for_each_subsys(ss, i) {
550 if (root->subsys_mask & (1UL << i)) { 532 if (root->cgrp.subsys_mask & (1UL << i)) {
551 /* Subsystem is in this hierarchy. So we want 533 /* Subsystem is in this hierarchy. So we want
552 * the subsystem state from the new 534 * the subsystem state from the new
553 * cgroup */ 535 * cgroup */
@@ -652,11 +634,11 @@ static struct css_set *find_css_set(struct css_set *old_cset,
652 634
653 /* First see if we already have a cgroup group that matches 635 /* First see if we already have a cgroup group that matches
654 * the desired set */ 636 * the desired set */
655 read_lock(&css_set_lock); 637 down_read(&css_set_rwsem);
656 cset = find_existing_css_set(old_cset, cgrp, template); 638 cset = find_existing_css_set(old_cset, cgrp, template);
657 if (cset) 639 if (cset)
658 get_css_set(cset); 640 get_css_set(cset);
659 read_unlock(&css_set_lock); 641 up_read(&css_set_rwsem);
660 642
661 if (cset) 643 if (cset)
662 return cset; 644 return cset;
@@ -674,13 +656,16 @@ static struct css_set *find_css_set(struct css_set *old_cset,
674 atomic_set(&cset->refcount, 1); 656 atomic_set(&cset->refcount, 1);
675 INIT_LIST_HEAD(&cset->cgrp_links); 657 INIT_LIST_HEAD(&cset->cgrp_links);
676 INIT_LIST_HEAD(&cset->tasks); 658 INIT_LIST_HEAD(&cset->tasks);
659 INIT_LIST_HEAD(&cset->mg_tasks);
660 INIT_LIST_HEAD(&cset->mg_preload_node);
661 INIT_LIST_HEAD(&cset->mg_node);
677 INIT_HLIST_NODE(&cset->hlist); 662 INIT_HLIST_NODE(&cset->hlist);
678 663
679 /* Copy the set of subsystem state objects generated in 664 /* Copy the set of subsystem state objects generated in
680 * find_existing_css_set() */ 665 * find_existing_css_set() */
681 memcpy(cset->subsys, template, sizeof(cset->subsys)); 666 memcpy(cset->subsys, template, sizeof(cset->subsys));
682 667
683 write_lock(&css_set_lock); 668 down_write(&css_set_rwsem);
684 /* Add reference counts and links from the new css_set. */ 669 /* Add reference counts and links from the new css_set. */
685 list_for_each_entry(link, &old_cset->cgrp_links, cgrp_link) { 670 list_for_each_entry(link, &old_cset->cgrp_links, cgrp_link) {
686 struct cgroup *c = link->cgrp; 671 struct cgroup *c = link->cgrp;
@@ -698,31 +683,105 @@ static struct css_set *find_css_set(struct css_set *old_cset,
698 key = css_set_hash(cset->subsys); 683 key = css_set_hash(cset->subsys);
699 hash_add(css_set_table, &cset->hlist, key); 684 hash_add(css_set_table, &cset->hlist, key);
700 685
701 write_unlock(&css_set_lock); 686 up_write(&css_set_rwsem);
702 687
703 return cset; 688 return cset;
704} 689}
705 690
706/* 691static struct cgroup_root *cgroup_root_from_kf(struct kernfs_root *kf_root)
707 * Return the cgroup for "task" from the given hierarchy. Must be
708 * called with cgroup_mutex held.
709 */
710static struct cgroup *task_cgroup_from_root(struct task_struct *task,
711 struct cgroupfs_root *root)
712{ 692{
713 struct css_set *cset; 693 struct cgroup *root_cgrp = kf_root->kn->priv;
714 struct cgroup *res = NULL; 694
695 return root_cgrp->root;
696}
697
698static int cgroup_init_root_id(struct cgroup_root *root)
699{
700 int id;
701
702 lockdep_assert_held(&cgroup_mutex);
703
704 id = idr_alloc_cyclic(&cgroup_hierarchy_idr, root, 0, 0, GFP_KERNEL);
705 if (id < 0)
706 return id;
707
708 root->hierarchy_id = id;
709 return 0;
710}
711
712static void cgroup_exit_root_id(struct cgroup_root *root)
713{
714 lockdep_assert_held(&cgroup_mutex);
715
716 if (root->hierarchy_id) {
717 idr_remove(&cgroup_hierarchy_idr, root->hierarchy_id);
718 root->hierarchy_id = 0;
719 }
720}
721
722static void cgroup_free_root(struct cgroup_root *root)
723{
724 if (root) {
725 /* hierarchy ID should already have been released */
726 WARN_ON_ONCE(root->hierarchy_id);
727
728 idr_destroy(&root->cgroup_idr);
729 kfree(root);
730 }
731}
732
733static void cgroup_destroy_root(struct cgroup_root *root)
734{
735 struct cgroup *cgrp = &root->cgrp;
736 struct cgrp_cset_link *link, *tmp_link;
737
738 mutex_lock(&cgroup_tree_mutex);
739 mutex_lock(&cgroup_mutex);
740
741 BUG_ON(atomic_read(&root->nr_cgrps));
742 BUG_ON(!list_empty(&cgrp->children));
743
744 /* Rebind all subsystems back to the default hierarchy */
745 rebind_subsystems(&cgrp_dfl_root, cgrp->subsys_mask);
715 746
716 BUG_ON(!mutex_is_locked(&cgroup_mutex));
717 read_lock(&css_set_lock);
718 /* 747 /*
719 * No need to lock the task - since we hold cgroup_mutex the 748 * Release all the links from cset_links to this hierarchy's
720 * task can't change groups, so the only thing that can happen 749 * root cgroup
721 * is that it exits and its css is set back to init_css_set.
722 */ 750 */
723 cset = task_css_set(task); 751 down_write(&css_set_rwsem);
752
753 list_for_each_entry_safe(link, tmp_link, &cgrp->cset_links, cset_link) {
754 list_del(&link->cset_link);
755 list_del(&link->cgrp_link);
756 kfree(link);
757 }
758 up_write(&css_set_rwsem);
759
760 if (!list_empty(&root->root_list)) {
761 list_del(&root->root_list);
762 cgroup_root_count--;
763 }
764
765 cgroup_exit_root_id(root);
766
767 mutex_unlock(&cgroup_mutex);
768 mutex_unlock(&cgroup_tree_mutex);
769
770 kernfs_destroy_root(root->kf_root);
771 cgroup_free_root(root);
772}
773
774/* look up cgroup associated with given css_set on the specified hierarchy */
775static struct cgroup *cset_cgroup_from_root(struct css_set *cset,
776 struct cgroup_root *root)
777{
778 struct cgroup *res = NULL;
779
780 lockdep_assert_held(&cgroup_mutex);
781 lockdep_assert_held(&css_set_rwsem);
782
724 if (cset == &init_css_set) { 783 if (cset == &init_css_set) {
725 res = &root->top_cgroup; 784 res = &root->cgrp;
726 } else { 785 } else {
727 struct cgrp_cset_link *link; 786 struct cgrp_cset_link *link;
728 787
@@ -735,16 +794,27 @@ static struct cgroup *task_cgroup_from_root(struct task_struct *task,
735 } 794 }
736 } 795 }
737 } 796 }
738 read_unlock(&css_set_lock); 797
739 BUG_ON(!res); 798 BUG_ON(!res);
740 return res; 799 return res;
741} 800}
742 801
743/* 802/*
744 * There is one global cgroup mutex. We also require taking 803 * Return the cgroup for "task" from the given hierarchy. Must be
745 * task_lock() when dereferencing a task's cgroup subsys pointers. 804 * called with cgroup_mutex and css_set_rwsem held.
746 * See "The task_lock() exception", at the end of this comment. 805 */
747 * 806static struct cgroup *task_cgroup_from_root(struct task_struct *task,
807 struct cgroup_root *root)
808{
809 /*
810 * No need to lock the task - since we hold cgroup_mutex the
811 * task can't change groups, so the only thing that can happen
812 * is that it exits and its css is set back to init_css_set.
813 */
814 return cset_cgroup_from_root(task_css_set(task), root);
815}
816
817/*
748 * A task must hold cgroup_mutex to modify cgroups. 818 * A task must hold cgroup_mutex to modify cgroups.
749 * 819 *
750 * Any task can increment and decrement the count field without lock. 820 * Any task can increment and decrement the count field without lock.
@@ -770,98 +840,79 @@ static struct cgroup *task_cgroup_from_root(struct task_struct *task,
770 * A cgroup can only be deleted if both its 'count' of using tasks 840 * A cgroup can only be deleted if both its 'count' of using tasks
771 * is zero, and its list of 'children' cgroups is empty. Since all 841 * is zero, and its list of 'children' cgroups is empty. Since all
772 * tasks in the system use _some_ cgroup, and since there is always at 842 * tasks in the system use _some_ cgroup, and since there is always at
773 * least one task in the system (init, pid == 1), therefore, top_cgroup 843 * least one task in the system (init, pid == 1), therefore, root cgroup
774 * always has either children cgroups and/or using tasks. So we don't 844 * always has either children cgroups and/or using tasks. So we don't
775 * need a special hack to ensure that top_cgroup cannot be deleted. 845 * need a special hack to ensure that root cgroup cannot be deleted.
776 *
777 * The task_lock() exception
778 *
779 * The need for this exception arises from the action of
780 * cgroup_attach_task(), which overwrites one task's cgroup pointer with
781 * another. It does so using cgroup_mutex, however there are
782 * several performance critical places that need to reference
783 * task->cgroup without the expense of grabbing a system global
784 * mutex. Therefore except as noted below, when dereferencing or, as
785 * in cgroup_attach_task(), modifying a task's cgroup pointer we use
786 * task_lock(), which acts on a spinlock (task->alloc_lock) already in
787 * the task_struct routinely used for such matters.
788 * 846 *
789 * P.S. One more locking exception. RCU is used to guard the 847 * P.S. One more locking exception. RCU is used to guard the
790 * update of a tasks cgroup pointer by cgroup_attach_task() 848 * update of a tasks cgroup pointer by cgroup_attach_task()
791 */ 849 */
792 850
793/*
794 * A couple of forward declarations required, due to cyclic reference loop:
795 * cgroup_mkdir -> cgroup_create -> cgroup_populate_dir ->
796 * cgroup_add_file -> cgroup_create_file -> cgroup_dir_inode_operations
797 * -> cgroup_mkdir.
798 */
799
800static int cgroup_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode);
801static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry);
802static int cgroup_populate_dir(struct cgroup *cgrp, unsigned long subsys_mask); 851static int cgroup_populate_dir(struct cgroup *cgrp, unsigned long subsys_mask);
803static const struct inode_operations cgroup_dir_inode_operations; 852static struct kernfs_syscall_ops cgroup_kf_syscall_ops;
804static const struct file_operations proc_cgroupstats_operations; 853static const struct file_operations proc_cgroupstats_operations;
805 854
806static struct backing_dev_info cgroup_backing_dev_info = { 855static char *cgroup_file_name(struct cgroup *cgrp, const struct cftype *cft,
807 .name = "cgroup", 856 char *buf)
808 .capabilities = BDI_CAP_NO_ACCT_AND_WRITEBACK,
809};
810
811static struct inode *cgroup_new_inode(umode_t mode, struct super_block *sb)
812{ 857{
813 struct inode *inode = new_inode(sb); 858 if (cft->ss && !(cft->flags & CFTYPE_NO_PREFIX) &&
814 859 !(cgrp->root->flags & CGRP_ROOT_NOPREFIX))
815 if (inode) { 860 snprintf(buf, CGROUP_FILE_NAME_MAX, "%s.%s",
816 inode->i_ino = get_next_ino(); 861 cft->ss->name, cft->name);
817 inode->i_mode = mode; 862 else
818 inode->i_uid = current_fsuid(); 863 strncpy(buf, cft->name, CGROUP_FILE_NAME_MAX);
819 inode->i_gid = current_fsgid(); 864 return buf;
820 inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
821 inode->i_mapping->backing_dev_info = &cgroup_backing_dev_info;
822 }
823 return inode;
824} 865}
825 866
826static struct cgroup_name *cgroup_alloc_name(struct dentry *dentry) 867/**
868 * cgroup_file_mode - deduce file mode of a control file
869 * @cft: the control file in question
870 *
871 * returns cft->mode if ->mode is not 0
872 * returns S_IRUGO|S_IWUSR if it has both a read and a write handler
873 * returns S_IRUGO if it has only a read handler
874 * returns S_IWUSR if it has only a write handler
875 */
876static umode_t cgroup_file_mode(const struct cftype *cft)
827{ 877{
828 struct cgroup_name *name; 878 umode_t mode = 0;
829 879
830 name = kmalloc(sizeof(*name) + dentry->d_name.len + 1, GFP_KERNEL); 880 if (cft->mode)
831 if (!name) 881 return cft->mode;
832 return NULL; 882
833 strcpy(name->name, dentry->d_name.name); 883 if (cft->read_u64 || cft->read_s64 || cft->seq_show)
834 return name; 884 mode |= S_IRUGO;
885
886 if (cft->write_u64 || cft->write_s64 || cft->write_string ||
887 cft->trigger)
888 mode |= S_IWUSR;
889
890 return mode;
835} 891}
836 892
837static void cgroup_free_fn(struct work_struct *work) 893static void cgroup_free_fn(struct work_struct *work)
838{ 894{
839 struct cgroup *cgrp = container_of(work, struct cgroup, destroy_work); 895 struct cgroup *cgrp = container_of(work, struct cgroup, destroy_work);
840 896
841 mutex_lock(&cgroup_mutex); 897 atomic_dec(&cgrp->root->nr_cgrps);
842 cgrp->root->number_of_cgroups--;
843 mutex_unlock(&cgroup_mutex);
844
845 /*
846 * We get a ref to the parent's dentry, and put the ref when
847 * this cgroup is being freed, so it's guaranteed that the
848 * parent won't be destroyed before its children.
849 */
850 dput(cgrp->parent->dentry);
851
852 /*
853 * Drop the active superblock reference that we took when we
854 * created the cgroup. This will free cgrp->root, if we are
855 * holding the last reference to @sb.
856 */
857 deactivate_super(cgrp->root->sb);
858
859 cgroup_pidlist_destroy_all(cgrp); 898 cgroup_pidlist_destroy_all(cgrp);
860 899
861 simple_xattrs_free(&cgrp->xattrs); 900 if (cgrp->parent) {
862 901 /*
863 kfree(rcu_dereference_raw(cgrp->name)); 902 * We get a ref to the parent, and put the ref when this
864 kfree(cgrp); 903 * cgroup is being freed, so it's guaranteed that the
904 * parent won't be destroyed before its children.
905 */
906 cgroup_put(cgrp->parent);
907 kernfs_put(cgrp->kn);
908 kfree(cgrp);
909 } else {
910 /*
911 * This is root cgroup's refcnt reaching zero, which
912 * indicates that the root should be released.
913 */
914 cgroup_destroy_root(cgrp->root);
915 }
865} 916}
866 917
867static void cgroup_free_rcu(struct rcu_head *head) 918static void cgroup_free_rcu(struct rcu_head *head)
@@ -872,73 +923,40 @@ static void cgroup_free_rcu(struct rcu_head *head)
872 queue_work(cgroup_destroy_wq, &cgrp->destroy_work); 923 queue_work(cgroup_destroy_wq, &cgrp->destroy_work);
873} 924}
874 925
875static void cgroup_diput(struct dentry *dentry, struct inode *inode) 926static void cgroup_get(struct cgroup *cgrp)
876{
877 /* is dentry a directory ? if so, kfree() associated cgroup */
878 if (S_ISDIR(inode->i_mode)) {
879 struct cgroup *cgrp = dentry->d_fsdata;
880
881 BUG_ON(!(cgroup_is_dead(cgrp)));
882
883 /*
884 * XXX: cgrp->id is only used to look up css's. As cgroup
885 * and css's lifetimes will be decoupled, it should be made
886 * per-subsystem and moved to css->id so that lookups are
887 * successful until the target css is released.
888 */
889 mutex_lock(&cgroup_mutex);
890 idr_remove(&cgrp->root->cgroup_idr, cgrp->id);
891 mutex_unlock(&cgroup_mutex);
892 cgrp->id = -1;
893
894 call_rcu(&cgrp->rcu_head, cgroup_free_rcu);
895 } else {
896 struct cfent *cfe = __d_cfe(dentry);
897 struct cgroup *cgrp = dentry->d_parent->d_fsdata;
898
899 WARN_ONCE(!list_empty(&cfe->node) &&
900 cgrp != &cgrp->root->top_cgroup,
901 "cfe still linked for %s\n", cfe->type->name);
902 simple_xattrs_free(&cfe->xattrs);
903 kfree(cfe);
904 }
905 iput(inode);
906}
907
908static void remove_dir(struct dentry *d)
909{ 927{
910 struct dentry *parent = dget(d->d_parent); 928 WARN_ON_ONCE(cgroup_is_dead(cgrp));
911 929 WARN_ON_ONCE(atomic_read(&cgrp->refcnt) <= 0);
912 d_delete(d); 930 atomic_inc(&cgrp->refcnt);
913 simple_rmdir(parent->d_inode, d);
914 dput(parent);
915} 931}
916 932
917static void cgroup_rm_file(struct cgroup *cgrp, const struct cftype *cft) 933static void cgroup_put(struct cgroup *cgrp)
918{ 934{
919 struct cfent *cfe; 935 if (!atomic_dec_and_test(&cgrp->refcnt))
920 936 return;
921 lockdep_assert_held(&cgrp->dentry->d_inode->i_mutex); 937 if (WARN_ON_ONCE(cgrp->parent && !cgroup_is_dead(cgrp)))
922 lockdep_assert_held(&cgroup_mutex); 938 return;
923 939
924 /* 940 /*
925 * If we're doing cleanup due to failure of cgroup_create(), 941 * XXX: cgrp->id is only used to look up css's. As cgroup and
926 * the corresponding @cfe may not exist. 942 * css's lifetimes will be decoupled, it should be made
943 * per-subsystem and moved to css->id so that lookups are
944 * successful until the target css is released.
927 */ 945 */
928 list_for_each_entry(cfe, &cgrp->files, node) { 946 mutex_lock(&cgroup_mutex);
929 struct dentry *d = cfe->dentry; 947 idr_remove(&cgrp->root->cgroup_idr, cgrp->id);
948 mutex_unlock(&cgroup_mutex);
949 cgrp->id = -1;
930 950
931 if (cft && cfe->type != cft) 951 call_rcu(&cgrp->rcu_head, cgroup_free_rcu);
932 continue; 952}
933 953
934 dget(d); 954static void cgroup_rm_file(struct cgroup *cgrp, const struct cftype *cft)
935 d_delete(d); 955{
936 simple_unlink(cgrp->dentry->d_inode, d); 956 char name[CGROUP_FILE_NAME_MAX];
937 list_del_init(&cfe->node);
938 dput(d);
939 957
940 break; 958 lockdep_assert_held(&cgroup_tree_mutex);
941 } 959 kernfs_remove_by_name(cgrp->kn, cgroup_file_name(cgrp, cft, name));
942} 960}
943 961
944/** 962/**
@@ -952,144 +970,106 @@ static void cgroup_clear_dir(struct cgroup *cgrp, unsigned long subsys_mask)
952 int i; 970 int i;
953 971
954 for_each_subsys(ss, i) { 972 for_each_subsys(ss, i) {
955 struct cftype_set *set; 973 struct cftype *cfts;
956 974
957 if (!test_bit(i, &subsys_mask)) 975 if (!test_bit(i, &subsys_mask))
958 continue; 976 continue;
959 list_for_each_entry(set, &ss->cftsets, node) 977 list_for_each_entry(cfts, &ss->cfts, node)
960 cgroup_addrm_files(cgrp, set->cfts, false); 978 cgroup_addrm_files(cgrp, cfts, false);
961 } 979 }
962} 980}
963 981
964/* 982static int rebind_subsystems(struct cgroup_root *dst_root,
965 * NOTE : the dentry must have been dget()'ed 983 unsigned long ss_mask)
966 */
967static void cgroup_d_remove_dir(struct dentry *dentry)
968{
969 struct dentry *parent;
970
971 parent = dentry->d_parent;
972 spin_lock(&parent->d_lock);
973 spin_lock_nested(&dentry->d_lock, DENTRY_D_LOCK_NESTED);
974 list_del_init(&dentry->d_u.d_child);
975 spin_unlock(&dentry->d_lock);
976 spin_unlock(&parent->d_lock);
977 remove_dir(dentry);
978}
979
980/*
981 * Call with cgroup_mutex held. Drops reference counts on modules, including
982 * any duplicate ones that parse_cgroupfs_options took. If this function
983 * returns an error, no reference counts are touched.
984 */
985static int rebind_subsystems(struct cgroupfs_root *root,
986 unsigned long added_mask, unsigned removed_mask)
987{ 984{
988 struct cgroup *cgrp = &root->top_cgroup;
989 struct cgroup_subsys *ss; 985 struct cgroup_subsys *ss;
990 unsigned long pinned = 0; 986 int ssid, ret;
991 int i, ret;
992 987
993 BUG_ON(!mutex_is_locked(&cgroup_mutex)); 988 lockdep_assert_held(&cgroup_tree_mutex);
994 BUG_ON(!mutex_is_locked(&cgroup_root_mutex)); 989 lockdep_assert_held(&cgroup_mutex);
995 990
996 /* Check that any added subsystems are currently free */ 991 for_each_subsys(ss, ssid) {
997 for_each_subsys(ss, i) { 992 if (!(ss_mask & (1 << ssid)))
998 if (!(added_mask & (1 << i)))
999 continue; 993 continue;
1000 994
1001 /* is the subsystem mounted elsewhere? */ 995 /* if @ss is on the dummy_root, we can always move it */
1002 if (ss->root != &cgroup_dummy_root) { 996 if (ss->root == &cgrp_dfl_root)
1003 ret = -EBUSY; 997 continue;
1004 goto out_put;
1005 }
1006 998
1007 /* pin the module */ 999 /* if @ss has non-root cgroups attached to it, can't move */
1008 if (!try_module_get(ss->module)) { 1000 if (!list_empty(&ss->root->cgrp.children))
1009 ret = -ENOENT; 1001 return -EBUSY;
1010 goto out_put;
1011 }
1012 pinned |= 1 << i;
1013 }
1014 1002
1015 /* subsys could be missing if unloaded between parsing and here */ 1003 /* can't move between two non-dummy roots either */
1016 if (added_mask != pinned) { 1004 if (dst_root != &cgrp_dfl_root)
1017 ret = -ENOENT; 1005 return -EBUSY;
1018 goto out_put;
1019 } 1006 }
1020 1007
1021 ret = cgroup_populate_dir(cgrp, added_mask); 1008 ret = cgroup_populate_dir(&dst_root->cgrp, ss_mask);
1022 if (ret) 1009 if (ret) {
1023 goto out_put; 1010 if (dst_root != &cgrp_dfl_root)
1011 return ret;
1012
1013 /*
1014 * Rebinding back to the default root is not allowed to
1015 * fail. Using both default and non-default roots should
1016 * be rare. Moving subsystems back and forth even more so.
1017 * Just warn about it and continue.
1018 */
1019 if (cgrp_dfl_root_visible) {
1020 pr_warning("cgroup: failed to create files (%d) while rebinding 0x%lx to default root\n",
1021 ret, ss_mask);
1022 pr_warning("cgroup: you may retry by moving them to a different hierarchy and unbinding\n");
1023 }
1024 }
1024 1025
1025 /* 1026 /*
1026 * Nothing can fail from this point on. Remove files for the 1027 * Nothing can fail from this point on. Remove files for the
1027 * removed subsystems and rebind each subsystem. 1028 * removed subsystems and rebind each subsystem.
1028 */ 1029 */
1029 cgroup_clear_dir(cgrp, removed_mask); 1030 mutex_unlock(&cgroup_mutex);
1030 1031 for_each_subsys(ss, ssid)
1031 for_each_subsys(ss, i) { 1032 if (ss_mask & (1 << ssid))
1032 unsigned long bit = 1UL << i; 1033 cgroup_clear_dir(&ss->root->cgrp, 1 << ssid);
1033 1034 mutex_lock(&cgroup_mutex);
1034 if (bit & added_mask) {
1035 /* We're binding this subsystem to this hierarchy */
1036 BUG_ON(cgroup_css(cgrp, ss));
1037 BUG_ON(!cgroup_css(cgroup_dummy_top, ss));
1038 BUG_ON(cgroup_css(cgroup_dummy_top, ss)->cgroup != cgroup_dummy_top);
1039 1035
1040 rcu_assign_pointer(cgrp->subsys[i], 1036 for_each_subsys(ss, ssid) {
1041 cgroup_css(cgroup_dummy_top, ss)); 1037 struct cgroup_root *src_root;
1042 cgroup_css(cgrp, ss)->cgroup = cgrp; 1038 struct cgroup_subsys_state *css;
1043 1039
1044 ss->root = root; 1040 if (!(ss_mask & (1 << ssid)))
1045 if (ss->bind) 1041 continue;
1046 ss->bind(cgroup_css(cgrp, ss));
1047 1042
1048 /* refcount was already taken, and we're keeping it */ 1043 src_root = ss->root;
1049 root->subsys_mask |= bit; 1044 css = cgroup_css(&src_root->cgrp, ss);
1050 } else if (bit & removed_mask) {
1051 /* We're removing this subsystem */
1052 BUG_ON(cgroup_css(cgrp, ss) != cgroup_css(cgroup_dummy_top, ss));
1053 BUG_ON(cgroup_css(cgrp, ss)->cgroup != cgrp);
1054 1045
1055 if (ss->bind) 1046 WARN_ON(!css || cgroup_css(&dst_root->cgrp, ss));
1056 ss->bind(cgroup_css(cgroup_dummy_top, ss));
1057 1047
1058 cgroup_css(cgroup_dummy_top, ss)->cgroup = cgroup_dummy_top; 1048 RCU_INIT_POINTER(src_root->cgrp.subsys[ssid], NULL);
1059 RCU_INIT_POINTER(cgrp->subsys[i], NULL); 1049 rcu_assign_pointer(dst_root->cgrp.subsys[ssid], css);
1050 ss->root = dst_root;
1051 css->cgroup = &dst_root->cgrp;
1060 1052
1061 cgroup_subsys[i]->root = &cgroup_dummy_root; 1053 src_root->cgrp.subsys_mask &= ~(1 << ssid);
1054 dst_root->cgrp.subsys_mask |= 1 << ssid;
1062 1055
1063 /* subsystem is now free - drop reference on module */ 1056 if (ss->bind)
1064 module_put(ss->module); 1057 ss->bind(css);
1065 root->subsys_mask &= ~bit;
1066 }
1067 } 1058 }
1068 1059
1069 /* 1060 kernfs_activate(dst_root->cgrp.kn);
1070 * Mark @root has finished binding subsystems. @root->subsys_mask
1071 * now matches the bound subsystems.
1072 */
1073 root->flags |= CGRP_ROOT_SUBSYS_BOUND;
1074
1075 return 0; 1061 return 0;
1076
1077out_put:
1078 for_each_subsys(ss, i)
1079 if (pinned & (1 << i))
1080 module_put(ss->module);
1081 return ret;
1082} 1062}
1083 1063
1084static int cgroup_show_options(struct seq_file *seq, struct dentry *dentry) 1064static int cgroup_show_options(struct seq_file *seq,
1065 struct kernfs_root *kf_root)
1085{ 1066{
1086 struct cgroupfs_root *root = dentry->d_sb->s_fs_info; 1067 struct cgroup_root *root = cgroup_root_from_kf(kf_root);
1087 struct cgroup_subsys *ss; 1068 struct cgroup_subsys *ss;
1088 int ssid; 1069 int ssid;
1089 1070
1090 mutex_lock(&cgroup_root_mutex);
1091 for_each_subsys(ss, ssid) 1071 for_each_subsys(ss, ssid)
1092 if (root->subsys_mask & (1 << ssid)) 1072 if (root->cgrp.subsys_mask & (1 << ssid))
1093 seq_printf(seq, ",%s", ss->name); 1073 seq_printf(seq, ",%s", ss->name);
1094 if (root->flags & CGRP_ROOT_SANE_BEHAVIOR) 1074 if (root->flags & CGRP_ROOT_SANE_BEHAVIOR)
1095 seq_puts(seq, ",sane_behavior"); 1075 seq_puts(seq, ",sane_behavior");
@@ -1097,13 +1077,16 @@ static int cgroup_show_options(struct seq_file *seq, struct dentry *dentry)
1097 seq_puts(seq, ",noprefix"); 1077 seq_puts(seq, ",noprefix");
1098 if (root->flags & CGRP_ROOT_XATTR) 1078 if (root->flags & CGRP_ROOT_XATTR)
1099 seq_puts(seq, ",xattr"); 1079 seq_puts(seq, ",xattr");
1080
1081 spin_lock(&release_agent_path_lock);
1100 if (strlen(root->release_agent_path)) 1082 if (strlen(root->release_agent_path))
1101 seq_printf(seq, ",release_agent=%s", root->release_agent_path); 1083 seq_printf(seq, ",release_agent=%s", root->release_agent_path);
1102 if (test_bit(CGRP_CPUSET_CLONE_CHILDREN, &root->top_cgroup.flags)) 1084 spin_unlock(&release_agent_path_lock);
1085
1086 if (test_bit(CGRP_CPUSET_CLONE_CHILDREN, &root->cgrp.flags))
1103 seq_puts(seq, ",clone_children"); 1087 seq_puts(seq, ",clone_children");
1104 if (strlen(root->name)) 1088 if (strlen(root->name))
1105 seq_printf(seq, ",name=%s", root->name); 1089 seq_printf(seq, ",name=%s", root->name);
1106 mutex_unlock(&cgroup_root_mutex);
1107 return 0; 1090 return 0;
1108} 1091}
1109 1092
@@ -1115,9 +1098,6 @@ struct cgroup_sb_opts {
1115 char *name; 1098 char *name;
1116 /* User explicitly requested empty subsystem */ 1099 /* User explicitly requested empty subsystem */
1117 bool none; 1100 bool none;
1118
1119 struct cgroupfs_root *new_root;
1120
1121}; 1101};
1122 1102
1123/* 1103/*
@@ -1137,7 +1117,7 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts)
1137 BUG_ON(!mutex_is_locked(&cgroup_mutex)); 1117 BUG_ON(!mutex_is_locked(&cgroup_mutex));
1138 1118
1139#ifdef CONFIG_CPUSETS 1119#ifdef CONFIG_CPUSETS
1140 mask = ~(1UL << cpuset_subsys_id); 1120 mask = ~(1UL << cpuset_cgrp_id);
1141#endif 1121#endif
1142 1122
1143 memset(opts, 0, sizeof(*opts)); 1123 memset(opts, 0, sizeof(*opts));
@@ -1227,30 +1207,34 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts)
1227 return -ENOENT; 1207 return -ENOENT;
1228 } 1208 }
1229 1209
1230 /*
1231 * If the 'all' option was specified select all the subsystems,
1232 * otherwise if 'none', 'name=' and a subsystem name options
1233 * were not specified, let's default to 'all'
1234 */
1235 if (all_ss || (!one_ss && !opts->none && !opts->name))
1236 for_each_subsys(ss, i)
1237 if (!ss->disabled)
1238 set_bit(i, &opts->subsys_mask);
1239
1240 /* Consistency checks */ 1210 /* Consistency checks */
1241 1211
1242 if (opts->flags & CGRP_ROOT_SANE_BEHAVIOR) { 1212 if (opts->flags & CGRP_ROOT_SANE_BEHAVIOR) {
1243 pr_warning("cgroup: sane_behavior: this is still under development and its behaviors will change, proceed at your own risk\n"); 1213 pr_warning("cgroup: sane_behavior: this is still under development and its behaviors will change, proceed at your own risk\n");
1244 1214
1245 if (opts->flags & CGRP_ROOT_NOPREFIX) { 1215 if ((opts->flags & (CGRP_ROOT_NOPREFIX | CGRP_ROOT_XATTR)) ||
1246 pr_err("cgroup: sane_behavior: noprefix is not allowed\n"); 1216 opts->cpuset_clone_children || opts->release_agent ||
1217 opts->name) {
1218 pr_err("cgroup: sane_behavior: noprefix, xattr, clone_children, release_agent and name are not allowed\n");
1247 return -EINVAL; 1219 return -EINVAL;
1248 } 1220 }
1221 } else {
1222 /*
1223 * If the 'all' option was specified select all the
1224 * subsystems, otherwise if 'none', 'name=' and a subsystem
1225 * name options were not specified, let's default to 'all'
1226 */
1227 if (all_ss || (!one_ss && !opts->none && !opts->name))
1228 for_each_subsys(ss, i)
1229 if (!ss->disabled)
1230 set_bit(i, &opts->subsys_mask);
1249 1231
1250 if (opts->cpuset_clone_children) { 1232 /*
1251 pr_err("cgroup: sane_behavior: clone_children is not allowed\n"); 1233 * We either have to specify by name or by subsystems. (So
1234 * all empty hierarchies must have a name).
1235 */
1236 if (!opts->subsys_mask && !opts->name)
1252 return -EINVAL; 1237 return -EINVAL;
1253 }
1254 } 1238 }
1255 1239
1256 /* 1240 /*
@@ -1266,21 +1250,13 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts)
1266 if (opts->subsys_mask && opts->none) 1250 if (opts->subsys_mask && opts->none)
1267 return -EINVAL; 1251 return -EINVAL;
1268 1252
1269 /*
1270 * We either have to specify by name or by subsystems. (So all
1271 * empty hierarchies must have a name).
1272 */
1273 if (!opts->subsys_mask && !opts->name)
1274 return -EINVAL;
1275
1276 return 0; 1253 return 0;
1277} 1254}
1278 1255
1279static int cgroup_remount(struct super_block *sb, int *flags, char *data) 1256static int cgroup_remount(struct kernfs_root *kf_root, int *flags, char *data)
1280{ 1257{
1281 int ret = 0; 1258 int ret = 0;
1282 struct cgroupfs_root *root = sb->s_fs_info; 1259 struct cgroup_root *root = cgroup_root_from_kf(kf_root);
1283 struct cgroup *cgrp = &root->top_cgroup;
1284 struct cgroup_sb_opts opts; 1260 struct cgroup_sb_opts opts;
1285 unsigned long added_mask, removed_mask; 1261 unsigned long added_mask, removed_mask;
1286 1262
@@ -1289,21 +1265,20 @@ static int cgroup_remount(struct super_block *sb, int *flags, char *data)
1289 return -EINVAL; 1265 return -EINVAL;
1290 } 1266 }
1291 1267
1292 mutex_lock(&cgrp->dentry->d_inode->i_mutex); 1268 mutex_lock(&cgroup_tree_mutex);
1293 mutex_lock(&cgroup_mutex); 1269 mutex_lock(&cgroup_mutex);
1294 mutex_lock(&cgroup_root_mutex);
1295 1270
1296 /* See what subsystems are wanted */ 1271 /* See what subsystems are wanted */
1297 ret = parse_cgroupfs_options(data, &opts); 1272 ret = parse_cgroupfs_options(data, &opts);
1298 if (ret) 1273 if (ret)
1299 goto out_unlock; 1274 goto out_unlock;
1300 1275
1301 if (opts.subsys_mask != root->subsys_mask || opts.release_agent) 1276 if (opts.subsys_mask != root->cgrp.subsys_mask || opts.release_agent)
1302 pr_warning("cgroup: option changes via remount are deprecated (pid=%d comm=%s)\n", 1277 pr_warning("cgroup: option changes via remount are deprecated (pid=%d comm=%s)\n",
1303 task_tgid_nr(current), current->comm); 1278 task_tgid_nr(current), current->comm);
1304 1279
1305 added_mask = opts.subsys_mask & ~root->subsys_mask; 1280 added_mask = opts.subsys_mask & ~root->cgrp.subsys_mask;
1306 removed_mask = root->subsys_mask & ~opts.subsys_mask; 1281 removed_mask = root->cgrp.subsys_mask & ~opts.subsys_mask;
1307 1282
1308 /* Don't allow flags or name to change at remount */ 1283 /* Don't allow flags or name to change at remount */
1309 if (((opts.flags ^ root->flags) & CGRP_ROOT_OPTION_MASK) || 1284 if (((opts.flags ^ root->flags) & CGRP_ROOT_OPTION_MASK) ||
@@ -1316,422 +1291,332 @@ static int cgroup_remount(struct super_block *sb, int *flags, char *data)
1316 } 1291 }
1317 1292
1318 /* remounting is not allowed for populated hierarchies */ 1293 /* remounting is not allowed for populated hierarchies */
1319 if (root->number_of_cgroups > 1) { 1294 if (!list_empty(&root->cgrp.children)) {
1320 ret = -EBUSY; 1295 ret = -EBUSY;
1321 goto out_unlock; 1296 goto out_unlock;
1322 } 1297 }
1323 1298
1324 ret = rebind_subsystems(root, added_mask, removed_mask); 1299 ret = rebind_subsystems(root, added_mask);
1325 if (ret) 1300 if (ret)
1326 goto out_unlock; 1301 goto out_unlock;
1327 1302
1328 if (opts.release_agent) 1303 rebind_subsystems(&cgrp_dfl_root, removed_mask);
1304
1305 if (opts.release_agent) {
1306 spin_lock(&release_agent_path_lock);
1329 strcpy(root->release_agent_path, opts.release_agent); 1307 strcpy(root->release_agent_path, opts.release_agent);
1308 spin_unlock(&release_agent_path_lock);
1309 }
1330 out_unlock: 1310 out_unlock:
1331 kfree(opts.release_agent); 1311 kfree(opts.release_agent);
1332 kfree(opts.name); 1312 kfree(opts.name);
1333 mutex_unlock(&cgroup_root_mutex);
1334 mutex_unlock(&cgroup_mutex); 1313 mutex_unlock(&cgroup_mutex);
1335 mutex_unlock(&cgrp->dentry->d_inode->i_mutex); 1314 mutex_unlock(&cgroup_tree_mutex);
1336 return ret; 1315 return ret;
1337} 1316}
1338 1317
1339static const struct super_operations cgroup_ops = { 1318/*
1340 .statfs = simple_statfs, 1319 * To reduce the fork() overhead for systems that are not actually using
1341 .drop_inode = generic_delete_inode, 1320 * their cgroups capability, we don't maintain the lists running through
1342 .show_options = cgroup_show_options, 1321 * each css_set to its tasks until we see the list actually used - in other
1343 .remount_fs = cgroup_remount, 1322 * words after the first mount.
1344}; 1323 */
1324static bool use_task_css_set_links __read_mostly;
1325
1326static void cgroup_enable_task_cg_lists(void)
1327{
1328 struct task_struct *p, *g;
1329
1330 down_write(&css_set_rwsem);
1331
1332 if (use_task_css_set_links)
1333 goto out_unlock;
1334
1335 use_task_css_set_links = true;
1336
1337 /*
1338 * We need tasklist_lock because RCU is not safe against
1339 * while_each_thread(). Besides, a forking task that has passed
1340 * cgroup_post_fork() without seeing use_task_css_set_links = 1
1341 * is not guaranteed to have its child immediately visible in the
1342 * tasklist if we walk through it with RCU.
1343 */
1344 read_lock(&tasklist_lock);
1345 do_each_thread(g, p) {
1346 WARN_ON_ONCE(!list_empty(&p->cg_list) ||
1347 task_css_set(p) != &init_css_set);
1348
1349 /*
1350 * We should check if the process is exiting, otherwise
1351 * it will race with cgroup_exit() in that the list
1352 * entry won't be deleted though the process has exited.
1353 * Do it while holding siglock so that we don't end up
1354 * racing against cgroup_exit().
1355 */
1356 spin_lock_irq(&p->sighand->siglock);
1357 if (!(p->flags & PF_EXITING)) {
1358 struct css_set *cset = task_css_set(p);
1359
1360 list_add(&p->cg_list, &cset->tasks);
1361 get_css_set(cset);
1362 }
1363 spin_unlock_irq(&p->sighand->siglock);
1364 } while_each_thread(g, p);
1365 read_unlock(&tasklist_lock);
1366out_unlock:
1367 up_write(&css_set_rwsem);
1368}
1345 1369
1346static void init_cgroup_housekeeping(struct cgroup *cgrp) 1370static void init_cgroup_housekeeping(struct cgroup *cgrp)
1347{ 1371{
1372 atomic_set(&cgrp->refcnt, 1);
1348 INIT_LIST_HEAD(&cgrp->sibling); 1373 INIT_LIST_HEAD(&cgrp->sibling);
1349 INIT_LIST_HEAD(&cgrp->children); 1374 INIT_LIST_HEAD(&cgrp->children);
1350 INIT_LIST_HEAD(&cgrp->files);
1351 INIT_LIST_HEAD(&cgrp->cset_links); 1375 INIT_LIST_HEAD(&cgrp->cset_links);
1352 INIT_LIST_HEAD(&cgrp->release_list); 1376 INIT_LIST_HEAD(&cgrp->release_list);
1353 INIT_LIST_HEAD(&cgrp->pidlists); 1377 INIT_LIST_HEAD(&cgrp->pidlists);
1354 mutex_init(&cgrp->pidlist_mutex); 1378 mutex_init(&cgrp->pidlist_mutex);
1355 cgrp->dummy_css.cgroup = cgrp; 1379 cgrp->dummy_css.cgroup = cgrp;
1356 simple_xattrs_init(&cgrp->xattrs);
1357} 1380}
1358 1381
1359static void init_cgroup_root(struct cgroupfs_root *root) 1382static void init_cgroup_root(struct cgroup_root *root,
1383 struct cgroup_sb_opts *opts)
1360{ 1384{
1361 struct cgroup *cgrp = &root->top_cgroup; 1385 struct cgroup *cgrp = &root->cgrp;
1362 1386
1363 INIT_LIST_HEAD(&root->root_list); 1387 INIT_LIST_HEAD(&root->root_list);
1364 root->number_of_cgroups = 1; 1388 atomic_set(&root->nr_cgrps, 1);
1365 cgrp->root = root; 1389 cgrp->root = root;
1366 RCU_INIT_POINTER(cgrp->name, &root_cgroup_name);
1367 init_cgroup_housekeeping(cgrp); 1390 init_cgroup_housekeeping(cgrp);
1368 idr_init(&root->cgroup_idr); 1391 idr_init(&root->cgroup_idr);
1369}
1370
1371static int cgroup_init_root_id(struct cgroupfs_root *root, int start, int end)
1372{
1373 int id;
1374 1392
1375 lockdep_assert_held(&cgroup_mutex);
1376 lockdep_assert_held(&cgroup_root_mutex);
1377
1378 id = idr_alloc_cyclic(&cgroup_hierarchy_idr, root, start, end,
1379 GFP_KERNEL);
1380 if (id < 0)
1381 return id;
1382
1383 root->hierarchy_id = id;
1384 return 0;
1385}
1386
1387static void cgroup_exit_root_id(struct cgroupfs_root *root)
1388{
1389 lockdep_assert_held(&cgroup_mutex);
1390 lockdep_assert_held(&cgroup_root_mutex);
1391
1392 if (root->hierarchy_id) {
1393 idr_remove(&cgroup_hierarchy_idr, root->hierarchy_id);
1394 root->hierarchy_id = 0;
1395 }
1396}
1397
1398static int cgroup_test_super(struct super_block *sb, void *data)
1399{
1400 struct cgroup_sb_opts *opts = data;
1401 struct cgroupfs_root *root = sb->s_fs_info;
1402
1403 /* If we asked for a name then it must match */
1404 if (opts->name && strcmp(opts->name, root->name))
1405 return 0;
1406
1407 /*
1408 * If we asked for subsystems (or explicitly for no
1409 * subsystems) then they must match
1410 */
1411 if ((opts->subsys_mask || opts->none)
1412 && (opts->subsys_mask != root->subsys_mask))
1413 return 0;
1414
1415 return 1;
1416}
1417
1418static struct cgroupfs_root *cgroup_root_from_opts(struct cgroup_sb_opts *opts)
1419{
1420 struct cgroupfs_root *root;
1421
1422 if (!opts->subsys_mask && !opts->none)
1423 return NULL;
1424
1425 root = kzalloc(sizeof(*root), GFP_KERNEL);
1426 if (!root)
1427 return ERR_PTR(-ENOMEM);
1428
1429 init_cgroup_root(root);
1430
1431 /*
1432 * We need to set @root->subsys_mask now so that @root can be
1433 * matched by cgroup_test_super() before it finishes
1434 * initialization; otherwise, competing mounts with the same
1435 * options may try to bind the same subsystems instead of waiting
1436 * for the first one leading to unexpected mount errors.
1437 * SUBSYS_BOUND will be set once actual binding is complete.
1438 */
1439 root->subsys_mask = opts->subsys_mask;
1440 root->flags = opts->flags; 1393 root->flags = opts->flags;
1441 if (opts->release_agent) 1394 if (opts->release_agent)
1442 strcpy(root->release_agent_path, opts->release_agent); 1395 strcpy(root->release_agent_path, opts->release_agent);
1443 if (opts->name) 1396 if (opts->name)
1444 strcpy(root->name, opts->name); 1397 strcpy(root->name, opts->name);
1445 if (opts->cpuset_clone_children) 1398 if (opts->cpuset_clone_children)
1446 set_bit(CGRP_CPUSET_CLONE_CHILDREN, &root->top_cgroup.flags); 1399 set_bit(CGRP_CPUSET_CLONE_CHILDREN, &root->cgrp.flags);
1447 return root;
1448} 1400}
1449 1401
1450static void cgroup_free_root(struct cgroupfs_root *root) 1402static int cgroup_setup_root(struct cgroup_root *root, unsigned long ss_mask)
1451{ 1403{
1452 if (root) { 1404 LIST_HEAD(tmp_links);
1453 /* hierarhcy ID shoulid already have been released */ 1405 struct cgroup *root_cgrp = &root->cgrp;
1454 WARN_ON_ONCE(root->hierarchy_id); 1406 struct css_set *cset;
1455 1407 int i, ret;
1456 idr_destroy(&root->cgroup_idr);
1457 kfree(root);
1458 }
1459}
1460 1408
1461static int cgroup_set_super(struct super_block *sb, void *data) 1409 lockdep_assert_held(&cgroup_tree_mutex);
1462{ 1410 lockdep_assert_held(&cgroup_mutex);
1463 int ret;
1464 struct cgroup_sb_opts *opts = data;
1465 1411
1466 /* If we don't have a new root, we can't set up a new sb */ 1412 ret = idr_alloc(&root->cgroup_idr, root_cgrp, 0, 1, GFP_KERNEL);
1467 if (!opts->new_root) 1413 if (ret < 0)
1468 return -EINVAL; 1414 goto out;
1415 root_cgrp->id = ret;
1469 1416
1470 BUG_ON(!opts->subsys_mask && !opts->none); 1417 /*
1418 * We're accessing css_set_count without locking css_set_rwsem here,
1419 * but that's OK - it can only be increased by someone holding
1420 * cgroup_lock, and that's us. The worst that can happen is that we
1421 * have some link structures left over
1422 */
1423 ret = allocate_cgrp_cset_links(css_set_count, &tmp_links);
1424 if (ret)
1425 goto out;
1471 1426
1472 ret = set_anon_super(sb, NULL); 1427 ret = cgroup_init_root_id(root);
1473 if (ret) 1428 if (ret)
1474 return ret; 1429 goto out;
1475 1430
1476 sb->s_fs_info = opts->new_root; 1431 root->kf_root = kernfs_create_root(&cgroup_kf_syscall_ops,
1477 opts->new_root->sb = sb; 1432 KERNFS_ROOT_CREATE_DEACTIVATED,
1433 root_cgrp);
1434 if (IS_ERR(root->kf_root)) {
1435 ret = PTR_ERR(root->kf_root);
1436 goto exit_root_id;
1437 }
1438 root_cgrp->kn = root->kf_root->kn;
1478 1439
1479 sb->s_blocksize = PAGE_CACHE_SIZE; 1440 ret = cgroup_addrm_files(root_cgrp, cgroup_base_files, true);
1480 sb->s_blocksize_bits = PAGE_CACHE_SHIFT; 1441 if (ret)
1481 sb->s_magic = CGROUP_SUPER_MAGIC; 1442 goto destroy_root;
1482 sb->s_op = &cgroup_ops;
1483 1443
1484 return 0; 1444 ret = rebind_subsystems(root, ss_mask);
1485} 1445 if (ret)
1446 goto destroy_root;
1486 1447
1487static int cgroup_get_rootdir(struct super_block *sb) 1448 /*
1488{ 1449 * There must be no failure case after here, since rebinding takes
1489 static const struct dentry_operations cgroup_dops = { 1450 * care of subsystems' refcounts, which are explicitly dropped in
1490 .d_iput = cgroup_diput, 1451 * the failure exit path.
1491 .d_delete = always_delete_dentry, 1452 */
1492 }; 1453 list_add(&root->root_list, &cgroup_roots);
1454 cgroup_root_count++;
1493 1455
1494 struct inode *inode = 1456 /*
1495 cgroup_new_inode(S_IFDIR | S_IRUGO | S_IXUGO | S_IWUSR, sb); 1457 * Link the root cgroup in this hierarchy into all the css_set
1458 * objects.
1459 */
1460 down_write(&css_set_rwsem);
1461 hash_for_each(css_set_table, i, cset, hlist)
1462 link_css_set(&tmp_links, cset, root_cgrp);
1463 up_write(&css_set_rwsem);
1496 1464
1497 if (!inode) 1465 BUG_ON(!list_empty(&root_cgrp->children));
1498 return -ENOMEM; 1466 BUG_ON(atomic_read(&root->nr_cgrps) != 1);
1499 1467
1500 inode->i_fop = &simple_dir_operations; 1468 kernfs_activate(root_cgrp->kn);
1501 inode->i_op = &cgroup_dir_inode_operations; 1469 ret = 0;
1502 /* directories start off with i_nlink == 2 (for "." entry) */ 1470 goto out;
1503 inc_nlink(inode); 1471
1504 sb->s_root = d_make_root(inode); 1472destroy_root:
1505 if (!sb->s_root) 1473 kernfs_destroy_root(root->kf_root);
1506 return -ENOMEM; 1474 root->kf_root = NULL;
1507 /* for everything else we want ->d_op set */ 1475exit_root_id:
1508 sb->s_d_op = &cgroup_dops; 1476 cgroup_exit_root_id(root);
1509 return 0; 1477out:
1478 free_cgrp_cset_links(&tmp_links);
1479 return ret;
1510} 1480}
1511 1481
1512static struct dentry *cgroup_mount(struct file_system_type *fs_type, 1482static struct dentry *cgroup_mount(struct file_system_type *fs_type,
1513 int flags, const char *unused_dev_name, 1483 int flags, const char *unused_dev_name,
1514 void *data) 1484 void *data)
1515{ 1485{
1486 struct cgroup_root *root;
1516 struct cgroup_sb_opts opts; 1487 struct cgroup_sb_opts opts;
1517 struct cgroupfs_root *root; 1488 struct dentry *dentry;
1518 int ret = 0; 1489 int ret;
1519 struct super_block *sb; 1490 bool new_sb;
1520 struct cgroupfs_root *new_root;
1521 struct list_head tmp_links;
1522 struct inode *inode;
1523 const struct cred *cred;
1524 1491
1525 /* First find the desired set of subsystems */ 1492 /*
1493 * The first time anyone tries to mount a cgroup, enable the list
1494 * linking each css_set to its tasks and fix up all existing tasks.
1495 */
1496 if (!use_task_css_set_links)
1497 cgroup_enable_task_cg_lists();
1498retry:
1499 mutex_lock(&cgroup_tree_mutex);
1526 mutex_lock(&cgroup_mutex); 1500 mutex_lock(&cgroup_mutex);
1501
1502 /* First find the desired set of subsystems */
1527 ret = parse_cgroupfs_options(data, &opts); 1503 ret = parse_cgroupfs_options(data, &opts);
1528 mutex_unlock(&cgroup_mutex);
1529 if (ret) 1504 if (ret)
1530 goto out_err; 1505 goto out_unlock;
1531
1532 /*
1533 * Allocate a new cgroup root. We may not need it if we're
1534 * reusing an existing hierarchy.
1535 */
1536 new_root = cgroup_root_from_opts(&opts);
1537 if (IS_ERR(new_root)) {
1538 ret = PTR_ERR(new_root);
1539 goto out_err;
1540 }
1541 opts.new_root = new_root;
1542 1506
1543 /* Locate an existing or new sb for this hierarchy */ 1507 /* look for a matching existing root */
1544 sb = sget(fs_type, cgroup_test_super, cgroup_set_super, 0, &opts); 1508 if (!opts.subsys_mask && !opts.none && !opts.name) {
1545 if (IS_ERR(sb)) { 1509 cgrp_dfl_root_visible = true;
1546 ret = PTR_ERR(sb); 1510 root = &cgrp_dfl_root;
1547 cgroup_free_root(opts.new_root); 1511 cgroup_get(&root->cgrp);
1548 goto out_err; 1512 ret = 0;
1513 goto out_unlock;
1549 } 1514 }
1550 1515
1551 root = sb->s_fs_info; 1516 for_each_root(root) {
1552 BUG_ON(!root); 1517 bool name_match = false;
1553 if (root == opts.new_root) {
1554 /* We used the new root structure, so this is a new hierarchy */
1555 struct cgroup *root_cgrp = &root->top_cgroup;
1556 struct cgroupfs_root *existing_root;
1557 int i;
1558 struct css_set *cset;
1559
1560 BUG_ON(sb->s_root != NULL);
1561 1518
1562 ret = cgroup_get_rootdir(sb); 1519 if (root == &cgrp_dfl_root)
1563 if (ret) 1520 continue;
1564 goto drop_new_super;
1565 inode = sb->s_root->d_inode;
1566
1567 mutex_lock(&inode->i_mutex);
1568 mutex_lock(&cgroup_mutex);
1569 mutex_lock(&cgroup_root_mutex);
1570
1571 ret = idr_alloc(&root->cgroup_idr, root_cgrp, 0, 1, GFP_KERNEL);
1572 if (ret < 0)
1573 goto unlock_drop;
1574 root_cgrp->id = ret;
1575
1576 /* Check for name clashes with existing mounts */
1577 ret = -EBUSY;
1578 if (strlen(root->name))
1579 for_each_active_root(existing_root)
1580 if (!strcmp(existing_root->name, root->name))
1581 goto unlock_drop;
1582
1583 /*
1584 * We're accessing css_set_count without locking
1585 * css_set_lock here, but that's OK - it can only be
1586 * increased by someone holding cgroup_lock, and
1587 * that's us. The worst that can happen is that we
1588 * have some link structures left over
1589 */
1590 ret = allocate_cgrp_cset_links(css_set_count, &tmp_links);
1591 if (ret)
1592 goto unlock_drop;
1593
1594 /* ID 0 is reserved for dummy root, 1 for unified hierarchy */
1595 ret = cgroup_init_root_id(root, 2, 0);
1596 if (ret)
1597 goto unlock_drop;
1598
1599 sb->s_root->d_fsdata = root_cgrp;
1600 root_cgrp->dentry = sb->s_root;
1601
1602 /*
1603 * We're inside get_sb() and will call lookup_one_len() to
1604 * create the root files, which doesn't work if SELinux is
1605 * in use. The following cred dancing somehow works around
1606 * it. See 2ce9738ba ("cgroupfs: use init_cred when
1607 * populating new cgroupfs mount") for more details.
1608 */
1609 cred = override_creds(&init_cred);
1610
1611 ret = cgroup_addrm_files(root_cgrp, cgroup_base_files, true);
1612 if (ret)
1613 goto rm_base_files;
1614
1615 ret = rebind_subsystems(root, root->subsys_mask, 0);
1616 if (ret)
1617 goto rm_base_files;
1618
1619 revert_creds(cred);
1620 1521
1621 /* 1522 /*
1622 * There must be no failure case after here, since rebinding 1523 * If we asked for a name then it must match. Also, if
1623 * takes care of subsystems' refcounts, which are explicitly 1524 * name matches but subsys_mask doesn't, we should fail.
1624 * dropped in the failure exit path. 1525 * Remember whether name matched.
1625 */ 1526 */
1527 if (opts.name) {
1528 if (strcmp(opts.name, root->name))
1529 continue;
1530 name_match = true;
1531 }
1626 1532
1627 list_add(&root->root_list, &cgroup_roots);
1628 cgroup_root_count++;
1629
1630 /* Link the top cgroup in this hierarchy into all
1631 * the css_set objects */
1632 write_lock(&css_set_lock);
1633 hash_for_each(css_set_table, i, cset, hlist)
1634 link_css_set(&tmp_links, cset, root_cgrp);
1635 write_unlock(&css_set_lock);
1636
1637 free_cgrp_cset_links(&tmp_links);
1638
1639 BUG_ON(!list_empty(&root_cgrp->children));
1640 BUG_ON(root->number_of_cgroups != 1);
1641
1642 mutex_unlock(&cgroup_root_mutex);
1643 mutex_unlock(&cgroup_mutex);
1644 mutex_unlock(&inode->i_mutex);
1645 } else {
1646 /* 1533 /*
1647 * We re-used an existing hierarchy - the new root (if 1534 * If we asked for subsystems (or explicitly for no
1648 * any) is not needed 1535 * subsystems) then they must match.
1649 */ 1536 */
1650 cgroup_free_root(opts.new_root); 1537 if ((opts.subsys_mask || opts.none) &&
1538 (opts.subsys_mask != root->cgrp.subsys_mask)) {
1539 if (!name_match)
1540 continue;
1541 ret = -EBUSY;
1542 goto out_unlock;
1543 }
1651 1544
1652 if ((root->flags ^ opts.flags) & CGRP_ROOT_OPTION_MASK) { 1545 if ((root->flags ^ opts.flags) & CGRP_ROOT_OPTION_MASK) {
1653 if ((root->flags | opts.flags) & CGRP_ROOT_SANE_BEHAVIOR) { 1546 if ((root->flags | opts.flags) & CGRP_ROOT_SANE_BEHAVIOR) {
1654 pr_err("cgroup: sane_behavior: new mount options should match the existing superblock\n"); 1547 pr_err("cgroup: sane_behavior: new mount options should match the existing superblock\n");
1655 ret = -EINVAL; 1548 ret = -EINVAL;
1656 goto drop_new_super; 1549 goto out_unlock;
1657 } else { 1550 } else {
1658 pr_warning("cgroup: new mount options do not match the existing superblock, will be ignored\n"); 1551 pr_warning("cgroup: new mount options do not match the existing superblock, will be ignored\n");
1659 } 1552 }
1660 } 1553 }
1661 }
1662
1663 kfree(opts.release_agent);
1664 kfree(opts.name);
1665 return dget(sb->s_root);
1666 1554
1667 rm_base_files: 1555 /*
1668 free_cgrp_cset_links(&tmp_links); 1556 * A root's lifetime is governed by its root cgroup. Zero
1669 cgroup_addrm_files(&root->top_cgroup, cgroup_base_files, false); 1557 * ref indicates that the root is being destroyed. Wait for
1670 revert_creds(cred); 1558 * destruction to complete so that the subsystems are free.
1671 unlock_drop: 1559 * We can use wait_queue for the wait but this path is
1672 cgroup_exit_root_id(root); 1560 * super cold. Let's just sleep for a bit and retry.
1673 mutex_unlock(&cgroup_root_mutex); 1561 */
1674 mutex_unlock(&cgroup_mutex); 1562 if (!atomic_inc_not_zero(&root->cgrp.refcnt)) {
1675 mutex_unlock(&inode->i_mutex); 1563 mutex_unlock(&cgroup_mutex);
1676 drop_new_super: 1564 mutex_unlock(&cgroup_tree_mutex);
1677 deactivate_locked_super(sb); 1565 kfree(opts.release_agent);
1678 out_err: 1566 kfree(opts.name);
1679 kfree(opts.release_agent); 1567 msleep(10);
1680 kfree(opts.name); 1568 goto retry;
1681 return ERR_PTR(ret); 1569 }
1682}
1683
1684static void cgroup_kill_sb(struct super_block *sb)
1685{
1686 struct cgroupfs_root *root = sb->s_fs_info;
1687 struct cgroup *cgrp = &root->top_cgroup;
1688 struct cgrp_cset_link *link, *tmp_link;
1689 int ret;
1690
1691 BUG_ON(!root);
1692
1693 BUG_ON(root->number_of_cgroups != 1);
1694 BUG_ON(!list_empty(&cgrp->children));
1695
1696 mutex_lock(&cgrp->dentry->d_inode->i_mutex);
1697 mutex_lock(&cgroup_mutex);
1698 mutex_lock(&cgroup_root_mutex);
1699 1570
1700 /* Rebind all subsystems back to the default hierarchy */ 1571 ret = 0;
1701 if (root->flags & CGRP_ROOT_SUBSYS_BOUND) { 1572 goto out_unlock;
1702 ret = rebind_subsystems(root, 0, root->subsys_mask);
1703 /* Shouldn't be able to fail ... */
1704 BUG_ON(ret);
1705 } 1573 }
1706 1574
1707 /* 1575 /*
1708 * Release all the links from cset_links to this hierarchy's 1576 * No such thing, create a new one. name= matching without subsys
1709 * root cgroup 1577 * specification is allowed for already existing hierarchies but we
1578 * can't create new one without subsys specification.
1710 */ 1579 */
1711 write_lock(&css_set_lock); 1580 if (!opts.subsys_mask && !opts.none) {
1712 1581 ret = -EINVAL;
1713 list_for_each_entry_safe(link, tmp_link, &cgrp->cset_links, cset_link) { 1582 goto out_unlock;
1714 list_del(&link->cset_link);
1715 list_del(&link->cgrp_link);
1716 kfree(link);
1717 } 1583 }
1718 write_unlock(&css_set_lock);
1719 1584
1720 if (!list_empty(&root->root_list)) { 1585 root = kzalloc(sizeof(*root), GFP_KERNEL);
1721 list_del(&root->root_list); 1586 if (!root) {
1722 cgroup_root_count--; 1587 ret = -ENOMEM;
1588 goto out_unlock;
1723 } 1589 }
1724 1590
1725 cgroup_exit_root_id(root); 1591 init_cgroup_root(root, &opts);
1592
1593 ret = cgroup_setup_root(root, opts.subsys_mask);
1594 if (ret)
1595 cgroup_free_root(root);
1726 1596
1727 mutex_unlock(&cgroup_root_mutex); 1597out_unlock:
1728 mutex_unlock(&cgroup_mutex); 1598 mutex_unlock(&cgroup_mutex);
1729 mutex_unlock(&cgrp->dentry->d_inode->i_mutex); 1599 mutex_unlock(&cgroup_tree_mutex);
1600
1601 kfree(opts.release_agent);
1602 kfree(opts.name);
1730 1603
1731 simple_xattrs_free(&cgrp->xattrs); 1604 if (ret)
1605 return ERR_PTR(ret);
1732 1606
1733 kill_litter_super(sb); 1607 dentry = kernfs_mount(fs_type, flags, root->kf_root, &new_sb);
1734 cgroup_free_root(root); 1608 if (IS_ERR(dentry) || !new_sb)
1609 cgroup_put(&root->cgrp);
1610 return dentry;
1611}
1612
1613static void cgroup_kill_sb(struct super_block *sb)
1614{
1615 struct kernfs_root *kf_root = kernfs_root_from_sb(sb);
1616 struct cgroup_root *root = cgroup_root_from_kf(kf_root);
1617
1618 cgroup_put(&root->cgrp);
1619 kernfs_kill_sb(sb);
1735} 1620}
1736 1621
1737static struct file_system_type cgroup_fs_type = { 1622static struct file_system_type cgroup_fs_type = {
@@ -1743,57 +1628,6 @@ static struct file_system_type cgroup_fs_type = {
1743static struct kobject *cgroup_kobj; 1628static struct kobject *cgroup_kobj;
1744 1629
1745/** 1630/**
1746 * cgroup_path - generate the path of a cgroup
1747 * @cgrp: the cgroup in question
1748 * @buf: the buffer to write the path into
1749 * @buflen: the length of the buffer
1750 *
1751 * Writes path of cgroup into buf. Returns 0 on success, -errno on error.
1752 *
1753 * We can't generate cgroup path using dentry->d_name, as accessing
1754 * dentry->name must be protected by irq-unsafe dentry->d_lock or parent
1755 * inode's i_mutex, while on the other hand cgroup_path() can be called
1756 * with some irq-safe spinlocks held.
1757 */
1758int cgroup_path(const struct cgroup *cgrp, char *buf, int buflen)
1759{
1760 int ret = -ENAMETOOLONG;
1761 char *start;
1762
1763 if (!cgrp->parent) {
1764 if (strlcpy(buf, "/", buflen) >= buflen)
1765 return -ENAMETOOLONG;
1766 return 0;
1767 }
1768
1769 start = buf + buflen - 1;
1770 *start = '\0';
1771
1772 rcu_read_lock();
1773 do {
1774 const char *name = cgroup_name(cgrp);
1775 int len;
1776
1777 len = strlen(name);
1778 if ((start -= len) < buf)
1779 goto out;
1780 memcpy(start, name, len);
1781
1782 if (--start < buf)
1783 goto out;
1784 *start = '/';
1785
1786 cgrp = cgrp->parent;
1787 } while (cgrp->parent);
1788 ret = 0;
1789 memmove(buf, start, buf + buflen - start);
1790out:
1791 rcu_read_unlock();
1792 return ret;
1793}
1794EXPORT_SYMBOL_GPL(cgroup_path);
1795
1796/**
1797 * task_cgroup_path - cgroup path of a task in the first cgroup hierarchy 1631 * task_cgroup_path - cgroup path of a task in the first cgroup hierarchy
1798 * @task: target task 1632 * @task: target task
1799 * @buf: the buffer to write the path into 1633 * @buf: the buffer to write the path into
@@ -1804,49 +1638,55 @@ EXPORT_SYMBOL_GPL(cgroup_path);
1804 * function grabs cgroup_mutex and shouldn't be used inside locks used by 1638 * function grabs cgroup_mutex and shouldn't be used inside locks used by
1805 * cgroup controller callbacks. 1639 * cgroup controller callbacks.
1806 * 1640 *
1807 * Returns 0 on success, fails with -%ENAMETOOLONG if @buflen is too short. 1641 * Return value is the same as kernfs_path().
1808 */ 1642 */
1809int task_cgroup_path(struct task_struct *task, char *buf, size_t buflen) 1643char *task_cgroup_path(struct task_struct *task, char *buf, size_t buflen)
1810{ 1644{
1811 struct cgroupfs_root *root; 1645 struct cgroup_root *root;
1812 struct cgroup *cgrp; 1646 struct cgroup *cgrp;
1813 int hierarchy_id = 1, ret = 0; 1647 int hierarchy_id = 1;
1814 1648 char *path = NULL;
1815 if (buflen < 2)
1816 return -ENAMETOOLONG;
1817 1649
1818 mutex_lock(&cgroup_mutex); 1650 mutex_lock(&cgroup_mutex);
1651 down_read(&css_set_rwsem);
1819 1652
1820 root = idr_get_next(&cgroup_hierarchy_idr, &hierarchy_id); 1653 root = idr_get_next(&cgroup_hierarchy_idr, &hierarchy_id);
1821 1654
1822 if (root) { 1655 if (root) {
1823 cgrp = task_cgroup_from_root(task, root); 1656 cgrp = task_cgroup_from_root(task, root);
1824 ret = cgroup_path(cgrp, buf, buflen); 1657 path = cgroup_path(cgrp, buf, buflen);
1825 } else { 1658 } else {
1826 /* if no hierarchy exists, everyone is in "/" */ 1659 /* if no hierarchy exists, everyone is in "/" */
1827 memcpy(buf, "/", 2); 1660 if (strlcpy(buf, "/", buflen) < buflen)
1661 path = buf;
1828 } 1662 }
1829 1663
1664 up_read(&css_set_rwsem);
1830 mutex_unlock(&cgroup_mutex); 1665 mutex_unlock(&cgroup_mutex);
1831 return ret; 1666 return path;
1832} 1667}
1833EXPORT_SYMBOL_GPL(task_cgroup_path); 1668EXPORT_SYMBOL_GPL(task_cgroup_path);
1834 1669
1835/* 1670/* used to track tasks and other necessary states during migration */
1836 * Control Group taskset
1837 */
1838struct task_and_cgroup {
1839 struct task_struct *task;
1840 struct cgroup *cgrp;
1841 struct css_set *cset;
1842};
1843
1844struct cgroup_taskset { 1671struct cgroup_taskset {
1845 struct task_and_cgroup single; 1672 /* the src and dst cset list running through cset->mg_node */
1846 struct flex_array *tc_array; 1673 struct list_head src_csets;
1847 int tc_array_len; 1674 struct list_head dst_csets;
1848 int idx; 1675
1849 struct cgroup *cur_cgrp; 1676 /*
1677 * Fields for cgroup_taskset_*() iteration.
1678 *
1679 * Before migration is committed, the target migration tasks are on
1680 * ->mg_tasks of the csets on ->src_csets. After, on ->mg_tasks of
1681 * the csets on ->dst_csets. ->csets point to either ->src_csets
1682 * or ->dst_csets depending on whether migration is committed.
1683 *
1684 * ->cur_cset and ->cur_task point to the current task position
1685 * during iteration.
1686 */
1687 struct list_head *csets;
1688 struct css_set *cur_cset;
1689 struct task_struct *cur_task;
1850}; 1690};
1851 1691
1852/** 1692/**
@@ -1857,15 +1697,11 @@ struct cgroup_taskset {
1857 */ 1697 */
1858struct task_struct *cgroup_taskset_first(struct cgroup_taskset *tset) 1698struct task_struct *cgroup_taskset_first(struct cgroup_taskset *tset)
1859{ 1699{
1860 if (tset->tc_array) { 1700 tset->cur_cset = list_first_entry(tset->csets, struct css_set, mg_node);
1861 tset->idx = 0; 1701 tset->cur_task = NULL;
1862 return cgroup_taskset_next(tset); 1702
1863 } else { 1703 return cgroup_taskset_next(tset);
1864 tset->cur_cgrp = tset->single.cgrp;
1865 return tset->single.task;
1866 }
1867} 1704}
1868EXPORT_SYMBOL_GPL(cgroup_taskset_first);
1869 1705
1870/** 1706/**
1871 * cgroup_taskset_next - iterate to the next task in taskset 1707 * cgroup_taskset_next - iterate to the next task in taskset
@@ -1876,48 +1712,36 @@ EXPORT_SYMBOL_GPL(cgroup_taskset_first);
1876 */ 1712 */
1877struct task_struct *cgroup_taskset_next(struct cgroup_taskset *tset) 1713struct task_struct *cgroup_taskset_next(struct cgroup_taskset *tset)
1878{ 1714{
1879 struct task_and_cgroup *tc; 1715 struct css_set *cset = tset->cur_cset;
1716 struct task_struct *task = tset->cur_task;
1880 1717
1881 if (!tset->tc_array || tset->idx >= tset->tc_array_len) 1718 while (&cset->mg_node != tset->csets) {
1882 return NULL; 1719 if (!task)
1720 task = list_first_entry(&cset->mg_tasks,
1721 struct task_struct, cg_list);
1722 else
1723 task = list_next_entry(task, cg_list);
1883 1724
1884 tc = flex_array_get(tset->tc_array, tset->idx++); 1725 if (&task->cg_list != &cset->mg_tasks) {
1885 tset->cur_cgrp = tc->cgrp; 1726 tset->cur_cset = cset;
1886 return tc->task; 1727 tset->cur_task = task;
1887} 1728 return task;
1888EXPORT_SYMBOL_GPL(cgroup_taskset_next); 1729 }
1889 1730
1890/** 1731 cset = list_next_entry(cset, mg_node);
1891 * cgroup_taskset_cur_css - return the matching css for the current task 1732 task = NULL;
1892 * @tset: taskset of interest 1733 }
1893 * @subsys_id: the ID of the target subsystem
1894 *
1895 * Return the css for the current (last returned) task of @tset for
1896 * subsystem specified by @subsys_id. This function must be preceded by
1897 * either cgroup_taskset_first() or cgroup_taskset_next().
1898 */
1899struct cgroup_subsys_state *cgroup_taskset_cur_css(struct cgroup_taskset *tset,
1900 int subsys_id)
1901{
1902 return cgroup_css(tset->cur_cgrp, cgroup_subsys[subsys_id]);
1903}
1904EXPORT_SYMBOL_GPL(cgroup_taskset_cur_css);
1905 1734
1906/** 1735 return NULL;
1907 * cgroup_taskset_size - return the number of tasks in taskset
1908 * @tset: taskset of interest
1909 */
1910int cgroup_taskset_size(struct cgroup_taskset *tset)
1911{
1912 return tset->tc_array ? tset->tc_array_len : 1;
1913} 1736}
1914EXPORT_SYMBOL_GPL(cgroup_taskset_size);
1915
1916 1737
1917/* 1738/**
1918 * cgroup_task_migrate - move a task from one cgroup to another. 1739 * cgroup_task_migrate - move a task from one cgroup to another.
1740 * @old_cgrp: the cgroup @tsk is being migrated from
1741 * @tsk: the task being migrated
1742 * @new_cset: the new css_set @tsk is being attached to
1919 * 1743 *
1920 * Must be called with cgroup_mutex and threadgroup locked. 1744 * Must be called with cgroup_mutex, threadgroup and css_set_rwsem locked.
1921 */ 1745 */
1922static void cgroup_task_migrate(struct cgroup *old_cgrp, 1746static void cgroup_task_migrate(struct cgroup *old_cgrp,
1923 struct task_struct *tsk, 1747 struct task_struct *tsk,
@@ -1925,6 +1749,9 @@ static void cgroup_task_migrate(struct cgroup *old_cgrp,
1925{ 1749{
1926 struct css_set *old_cset; 1750 struct css_set *old_cset;
1927 1751
1752 lockdep_assert_held(&cgroup_mutex);
1753 lockdep_assert_held(&css_set_rwsem);
1754
1928 /* 1755 /*
1929 * We are synchronized through threadgroup_lock() against PF_EXITING 1756 * We are synchronized through threadgroup_lock() against PF_EXITING
1930 * setting such that we can't race against cgroup_exit() changing the 1757 * setting such that we can't race against cgroup_exit() changing the
@@ -1933,15 +1760,16 @@ static void cgroup_task_migrate(struct cgroup *old_cgrp,
1933 WARN_ON_ONCE(tsk->flags & PF_EXITING); 1760 WARN_ON_ONCE(tsk->flags & PF_EXITING);
1934 old_cset = task_css_set(tsk); 1761 old_cset = task_css_set(tsk);
1935 1762
1936 task_lock(tsk); 1763 get_css_set(new_cset);
1937 rcu_assign_pointer(tsk->cgroups, new_cset); 1764 rcu_assign_pointer(tsk->cgroups, new_cset);
1938 task_unlock(tsk);
1939 1765
1940 /* Update the css_set linked lists if we're using them */ 1766 /*
1941 write_lock(&css_set_lock); 1767 * Use move_tail so that cgroup_taskset_first() still returns the
1942 if (!list_empty(&tsk->cg_list)) 1768 * leader after migration. This works because cgroup_migrate()
1943 list_move(&tsk->cg_list, &new_cset->tasks); 1769 * ensures that the dst_cset of the leader is the first on the
1944 write_unlock(&css_set_lock); 1770 * tset's dst_csets list.
1771 */
1772 list_move_tail(&tsk->cg_list, &new_cset->mg_tasks);
1945 1773
1946 /* 1774 /*
1947 * We just gained a reference on old_cset by taking it from the 1775 * We just gained a reference on old_cset by taking it from the
@@ -1949,100 +1777,199 @@ static void cgroup_task_migrate(struct cgroup *old_cgrp,
1949 * we're safe to drop it here; it will be freed under RCU. 1777 * we're safe to drop it here; it will be freed under RCU.
1950 */ 1778 */
1951 set_bit(CGRP_RELEASABLE, &old_cgrp->flags); 1779 set_bit(CGRP_RELEASABLE, &old_cgrp->flags);
1952 put_css_set(old_cset); 1780 put_css_set_locked(old_cset, false);
1953} 1781}
1954 1782
1955/** 1783/**
1956 * cgroup_attach_task - attach a task or a whole threadgroup to a cgroup 1784 * cgroup_migrate_finish - cleanup after attach
1957 * @cgrp: the cgroup to attach to 1785 * @preloaded_csets: list of preloaded css_sets
1958 * @tsk: the task or the leader of the threadgroup to be attached
1959 * @threadgroup: attach the whole threadgroup?
1960 * 1786 *
1961 * Call holding cgroup_mutex and the group_rwsem of the leader. Will take 1787 * Undo cgroup_migrate_add_src() and cgroup_migrate_prepare_dst(). See
1962 * task_lock of @tsk or each thread in the threadgroup individually in turn. 1788 * those functions for details.
1963 */ 1789 */
1964static int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk, 1790static void cgroup_migrate_finish(struct list_head *preloaded_csets)
1965 bool threadgroup)
1966{ 1791{
1967 int retval, i, group_size; 1792 struct css_set *cset, *tmp_cset;
1968 struct cgroupfs_root *root = cgrp->root;
1969 struct cgroup_subsys_state *css, *failed_css = NULL;
1970 /* threadgroup list cursor and array */
1971 struct task_struct *leader = tsk;
1972 struct task_and_cgroup *tc;
1973 struct flex_array *group;
1974 struct cgroup_taskset tset = { };
1975 1793
1976 /* 1794 lockdep_assert_held(&cgroup_mutex);
1977 * step 0: in order to do expensive, possibly blocking operations for 1795
1978 * every thread, we cannot iterate the thread group list, since it needs 1796 down_write(&css_set_rwsem);
1979 * rcu or tasklist locked. instead, build an array of all threads in the 1797 list_for_each_entry_safe(cset, tmp_cset, preloaded_csets, mg_preload_node) {
1980 * group - group_rwsem prevents new threads from appearing, and if 1798 cset->mg_src_cgrp = NULL;
1981 * threads exit, this will just be an over-estimate. 1799 cset->mg_dst_cset = NULL;
1982 */ 1800 list_del_init(&cset->mg_preload_node);
1983 if (threadgroup) 1801 put_css_set_locked(cset, false);
1984 group_size = get_nr_threads(tsk); 1802 }
1985 else 1803 up_write(&css_set_rwsem);
1986 group_size = 1; 1804}
1987 /* flex_array supports very large thread-groups better than kmalloc. */ 1805
1988 group = flex_array_alloc(sizeof(*tc), group_size, GFP_KERNEL); 1806/**
1989 if (!group) 1807 * cgroup_migrate_add_src - add a migration source css_set
1990 return -ENOMEM; 1808 * @src_cset: the source css_set to add
1991 /* pre-allocate to guarantee space while iterating in rcu read-side. */ 1809 * @dst_cgrp: the destination cgroup
1992 retval = flex_array_prealloc(group, 0, group_size, GFP_KERNEL); 1810 * @preloaded_csets: list of preloaded css_sets
1993 if (retval) 1811 *
1994 goto out_free_group_list; 1812 * Tasks belonging to @src_cset are about to be migrated to @dst_cgrp. Pin
1813 * @src_cset and add it to @preloaded_csets, which should later be cleaned
1814 * up by cgroup_migrate_finish().
1815 *
1816 * This function may be called without holding threadgroup_lock even if the
1817 * target is a process. Threads may be created and destroyed but as long
1818 * as cgroup_mutex is not dropped, no new css_set can be put into play and
1819 * the preloaded css_sets are guaranteed to cover all migrations.
1820 */
1821static void cgroup_migrate_add_src(struct css_set *src_cset,
1822 struct cgroup *dst_cgrp,
1823 struct list_head *preloaded_csets)
1824{
1825 struct cgroup *src_cgrp;
1826
1827 lockdep_assert_held(&cgroup_mutex);
1828 lockdep_assert_held(&css_set_rwsem);
1829
1830 src_cgrp = cset_cgroup_from_root(src_cset, dst_cgrp->root);
1831
1832 /* nothing to do if this cset already belongs to the cgroup */
1833 if (src_cgrp == dst_cgrp)
1834 return;
1835
1836 if (!list_empty(&src_cset->mg_preload_node))
1837 return;
1838
1839 WARN_ON(src_cset->mg_src_cgrp);
1840 WARN_ON(!list_empty(&src_cset->mg_tasks));
1841 WARN_ON(!list_empty(&src_cset->mg_node));
1842
1843 src_cset->mg_src_cgrp = src_cgrp;
1844 get_css_set(src_cset);
1845 list_add(&src_cset->mg_preload_node, preloaded_csets);
1846}
1847
1848/**
1849 * cgroup_migrate_prepare_dst - prepare destination css_sets for migration
1850 * @dst_cgrp: the destination cgroup
1851 * @preloaded_csets: list of preloaded source css_sets
1852 *
1853 * Tasks are about to be moved to @dst_cgrp and all the source css_sets
1854 * have been preloaded to @preloaded_csets. This function looks up and
1855 * pins all destination css_sets, links each to its source, and put them on
1856 * @preloaded_csets.
1857 *
1858 * This function must be called after cgroup_migrate_add_src() has been
1859 * called on each migration source css_set. After migration is performed
1860 * using cgroup_migrate(), cgroup_migrate_finish() must be called on
1861 * @preloaded_csets.
1862 */
1863static int cgroup_migrate_prepare_dst(struct cgroup *dst_cgrp,
1864 struct list_head *preloaded_csets)
1865{
1866 LIST_HEAD(csets);
1867 struct css_set *src_cset;
1868
1869 lockdep_assert_held(&cgroup_mutex);
1870
1871 /* look up the dst cset for each src cset and link it to src */
1872 list_for_each_entry(src_cset, preloaded_csets, mg_preload_node) {
1873 struct css_set *dst_cset;
1874
1875 dst_cset = find_css_set(src_cset, dst_cgrp);
1876 if (!dst_cset)
1877 goto err;
1878
1879 WARN_ON_ONCE(src_cset->mg_dst_cset || dst_cset->mg_dst_cset);
1880 src_cset->mg_dst_cset = dst_cset;
1881
1882 if (list_empty(&dst_cset->mg_preload_node))
1883 list_add(&dst_cset->mg_preload_node, &csets);
1884 else
1885 put_css_set(dst_cset, false);
1886 }
1887
1888 list_splice(&csets, preloaded_csets);
1889 return 0;
1890err:
1891 cgroup_migrate_finish(&csets);
1892 return -ENOMEM;
1893}
1894
1895/**
1896 * cgroup_migrate - migrate a process or task to a cgroup
1897 * @cgrp: the destination cgroup
1898 * @leader: the leader of the process or the task to migrate
1899 * @threadgroup: whether @leader points to the whole process or a single task
1900 *
1901 * Migrate a process or task denoted by @leader to @cgrp. If migrating a
1902 * process, the caller must be holding threadgroup_lock of @leader. The
1903 * caller is also responsible for invoking cgroup_migrate_add_src() and
1904 * cgroup_migrate_prepare_dst() on the targets before invoking this
1905 * function and following up with cgroup_migrate_finish().
1906 *
1907 * As long as a controller's ->can_attach() doesn't fail, this function is
1908 * guaranteed to succeed. This means that, excluding ->can_attach()
1909 * failure, when migrating multiple targets, the success or failure can be
1910 * decided for all targets by invoking cgroup_migrate_prepare_dst() before
1911 * actually starting migrating.
1912 */
1913static int cgroup_migrate(struct cgroup *cgrp, struct task_struct *leader,
1914 bool threadgroup)
1915{
1916 struct cgroup_taskset tset = {
1917 .src_csets = LIST_HEAD_INIT(tset.src_csets),
1918 .dst_csets = LIST_HEAD_INIT(tset.dst_csets),
1919 .csets = &tset.src_csets,
1920 };
1921 struct cgroup_subsys_state *css, *failed_css = NULL;
1922 struct css_set *cset, *tmp_cset;
1923 struct task_struct *task, *tmp_task;
1924 int i, ret;
1995 1925
1996 i = 0;
1997 /* 1926 /*
1998 * Prevent freeing of tasks while we take a snapshot. Tasks that are 1927 * Prevent freeing of tasks while we take a snapshot. Tasks that are
1999 * already PF_EXITING could be freed from underneath us unless we 1928 * already PF_EXITING could be freed from underneath us unless we
2000 * take an rcu_read_lock. 1929 * take an rcu_read_lock.
2001 */ 1930 */
1931 down_write(&css_set_rwsem);
2002 rcu_read_lock(); 1932 rcu_read_lock();
1933 task = leader;
2003 do { 1934 do {
2004 struct task_and_cgroup ent; 1935 /* @task either already exited or can't exit until the end */
1936 if (task->flags & PF_EXITING)
1937 goto next;
2005 1938
2006 /* @tsk either already exited or can't exit until the end */ 1939 /* leave @task alone if post_fork() hasn't linked it yet */
2007 if (tsk->flags & PF_EXITING) 1940 if (list_empty(&task->cg_list))
2008 goto next; 1941 goto next;
2009 1942
2010 /* as per above, nr_threads may decrease, but not increase. */ 1943 cset = task_css_set(task);
2011 BUG_ON(i >= group_size); 1944 if (!cset->mg_src_cgrp)
2012 ent.task = tsk;
2013 ent.cgrp = task_cgroup_from_root(tsk, root);
2014 /* nothing to do if this task is already in the cgroup */
2015 if (ent.cgrp == cgrp)
2016 goto next; 1945 goto next;
1946
2017 /* 1947 /*
2018 * saying GFP_ATOMIC has no effect here because we did prealloc 1948 * cgroup_taskset_first() must always return the leader.
2019 * earlier, but it's good form to communicate our expectations. 1949 * Take care to avoid disturbing the ordering.
2020 */ 1950 */
2021 retval = flex_array_put(group, i, &ent, GFP_ATOMIC); 1951 list_move_tail(&task->cg_list, &cset->mg_tasks);
2022 BUG_ON(retval != 0); 1952 if (list_empty(&cset->mg_node))
2023 i++; 1953 list_add_tail(&cset->mg_node, &tset.src_csets);
1954 if (list_empty(&cset->mg_dst_cset->mg_node))
1955 list_move_tail(&cset->mg_dst_cset->mg_node,
1956 &tset.dst_csets);
2024 next: 1957 next:
2025 if (!threadgroup) 1958 if (!threadgroup)
2026 break; 1959 break;
2027 } while_each_thread(leader, tsk); 1960 } while_each_thread(leader, task);
2028 rcu_read_unlock(); 1961 rcu_read_unlock();
2029 /* remember the number of threads in the array for later. */ 1962 up_write(&css_set_rwsem);
2030 group_size = i;
2031 tset.tc_array = group;
2032 tset.tc_array_len = group_size;
2033 1963
2034 /* methods shouldn't be called if no task is actually migrating */ 1964 /* methods shouldn't be called if no task is actually migrating */
2035 retval = 0; 1965 if (list_empty(&tset.src_csets))
2036 if (!group_size) 1966 return 0;
2037 goto out_free_group_list;
2038 1967
2039 /* 1968 /* check that we can legitimately attach to the cgroup */
2040 * step 1: check that we can legitimately attach to the cgroup.
2041 */
2042 for_each_css(css, i, cgrp) { 1969 for_each_css(css, i, cgrp) {
2043 if (css->ss->can_attach) { 1970 if (css->ss->can_attach) {
2044 retval = css->ss->can_attach(css, &tset); 1971 ret = css->ss->can_attach(css, &tset);
2045 if (retval) { 1972 if (ret) {
2046 failed_css = css; 1973 failed_css = css;
2047 goto out_cancel_attach; 1974 goto out_cancel_attach;
2048 } 1975 }
@@ -2050,70 +1977,91 @@ static int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk,
2050 } 1977 }
2051 1978
2052 /* 1979 /*
2053 * step 2: make sure css_sets exist for all threads to be migrated. 1980 * Now that we're guaranteed success, proceed to move all tasks to
2054 * we use find_css_set, which allocates a new one if necessary. 1981 * the new cgroup. There are no failure cases after here, so this
1982 * is the commit point.
2055 */ 1983 */
2056 for (i = 0; i < group_size; i++) { 1984 down_write(&css_set_rwsem);
2057 struct css_set *old_cset; 1985 list_for_each_entry(cset, &tset.src_csets, mg_node) {
2058 1986 list_for_each_entry_safe(task, tmp_task, &cset->mg_tasks, cg_list)
2059 tc = flex_array_get(group, i); 1987 cgroup_task_migrate(cset->mg_src_cgrp, task,
2060 old_cset = task_css_set(tc->task); 1988 cset->mg_dst_cset);
2061 tc->cset = find_css_set(old_cset, cgrp);
2062 if (!tc->cset) {
2063 retval = -ENOMEM;
2064 goto out_put_css_set_refs;
2065 }
2066 } 1989 }
1990 up_write(&css_set_rwsem);
2067 1991
2068 /* 1992 /*
2069 * step 3: now that we're guaranteed success wrt the css_sets, 1993 * Migration is committed, all target tasks are now on dst_csets.
2070 * proceed to move all tasks to the new cgroup. There are no 1994 * Nothing is sensitive to fork() after this point. Notify
2071 * failure cases after here, so this is the commit point. 1995 * controllers that migration is complete.
2072 */ 1996 */
2073 for (i = 0; i < group_size; i++) { 1997 tset.csets = &tset.dst_csets;
2074 tc = flex_array_get(group, i);
2075 cgroup_task_migrate(tc->cgrp, tc->task, tc->cset);
2076 }
2077 /* nothing is sensitive to fork() after this point. */
2078 1998
2079 /*
2080 * step 4: do subsystem attach callbacks.
2081 */
2082 for_each_css(css, i, cgrp) 1999 for_each_css(css, i, cgrp)
2083 if (css->ss->attach) 2000 if (css->ss->attach)
2084 css->ss->attach(css, &tset); 2001 css->ss->attach(css, &tset);
2085 2002
2086 /* 2003 ret = 0;
2087 * step 5: success! and cleanup 2004 goto out_release_tset;
2088 */ 2005
2089 retval = 0;
2090out_put_css_set_refs:
2091 if (retval) {
2092 for (i = 0; i < group_size; i++) {
2093 tc = flex_array_get(group, i);
2094 if (!tc->cset)
2095 break;
2096 put_css_set(tc->cset);
2097 }
2098 }
2099out_cancel_attach: 2006out_cancel_attach:
2100 if (retval) { 2007 for_each_css(css, i, cgrp) {
2101 for_each_css(css, i, cgrp) { 2008 if (css == failed_css)
2102 if (css == failed_css) 2009 break;
2103 break; 2010 if (css->ss->cancel_attach)
2104 if (css->ss->cancel_attach) 2011 css->ss->cancel_attach(css, &tset);
2105 css->ss->cancel_attach(css, &tset);
2106 }
2107 } 2012 }
2108out_free_group_list: 2013out_release_tset:
2109 flex_array_free(group); 2014 down_write(&css_set_rwsem);
2110 return retval; 2015 list_splice_init(&tset.dst_csets, &tset.src_csets);
2016 list_for_each_entry_safe(cset, tmp_cset, &tset.src_csets, mg_node) {
2017 list_splice_tail_init(&cset->mg_tasks, &cset->tasks);
2018 list_del_init(&cset->mg_node);
2019 }
2020 up_write(&css_set_rwsem);
2021 return ret;
2022}
2023
2024/**
2025 * cgroup_attach_task - attach a task or a whole threadgroup to a cgroup
2026 * @dst_cgrp: the cgroup to attach to
2027 * @leader: the task or the leader of the threadgroup to be attached
2028 * @threadgroup: attach the whole threadgroup?
2029 *
2030 * Call holding cgroup_mutex and threadgroup_lock of @leader.
2031 */
2032static int cgroup_attach_task(struct cgroup *dst_cgrp,
2033 struct task_struct *leader, bool threadgroup)
2034{
2035 LIST_HEAD(preloaded_csets);
2036 struct task_struct *task;
2037 int ret;
2038
2039 /* look up all src csets */
2040 down_read(&css_set_rwsem);
2041 rcu_read_lock();
2042 task = leader;
2043 do {
2044 cgroup_migrate_add_src(task_css_set(task), dst_cgrp,
2045 &preloaded_csets);
2046 if (!threadgroup)
2047 break;
2048 } while_each_thread(leader, task);
2049 rcu_read_unlock();
2050 up_read(&css_set_rwsem);
2051
2052 /* prepare dst csets and commit */
2053 ret = cgroup_migrate_prepare_dst(dst_cgrp, &preloaded_csets);
2054 if (!ret)
2055 ret = cgroup_migrate(dst_cgrp, leader, threadgroup);
2056
2057 cgroup_migrate_finish(&preloaded_csets);
2058 return ret;
2111} 2059}
2112 2060
2113/* 2061/*
2114 * Find the task_struct of the task to attach by vpid and pass it along to the 2062 * Find the task_struct of the task to attach by vpid and pass it along to the
2115 * function to attach either it or all tasks in its threadgroup. Will lock 2063 * function to attach either it or all tasks in its threadgroup. Will lock
2116 * cgroup_mutex and threadgroup; may take task_lock of task. 2064 * cgroup_mutex and threadgroup.
2117 */ 2065 */
2118static int attach_task_by_pid(struct cgroup *cgrp, u64 pid, bool threadgroup) 2066static int attach_task_by_pid(struct cgroup *cgrp, u64 pid, bool threadgroup)
2119{ 2067{
@@ -2198,12 +2146,19 @@ out_unlock_cgroup:
2198 */ 2146 */
2199int cgroup_attach_task_all(struct task_struct *from, struct task_struct *tsk) 2147int cgroup_attach_task_all(struct task_struct *from, struct task_struct *tsk)
2200{ 2148{
2201 struct cgroupfs_root *root; 2149 struct cgroup_root *root;
2202 int retval = 0; 2150 int retval = 0;
2203 2151
2204 mutex_lock(&cgroup_mutex); 2152 mutex_lock(&cgroup_mutex);
2205 for_each_active_root(root) { 2153 for_each_root(root) {
2206 struct cgroup *from_cgrp = task_cgroup_from_root(from, root); 2154 struct cgroup *from_cgrp;
2155
2156 if (root == &cgrp_dfl_root)
2157 continue;
2158
2159 down_read(&css_set_rwsem);
2160 from_cgrp = task_cgroup_from_root(from, root);
2161 up_read(&css_set_rwsem);
2207 2162
2208 retval = cgroup_attach_task(from_cgrp, tsk, false); 2163 retval = cgroup_attach_task(from_cgrp, tsk, false);
2209 if (retval) 2164 if (retval)
@@ -2228,16 +2183,17 @@ static int cgroup_procs_write(struct cgroup_subsys_state *css,
2228} 2183}
2229 2184
2230static int cgroup_release_agent_write(struct cgroup_subsys_state *css, 2185static int cgroup_release_agent_write(struct cgroup_subsys_state *css,
2231 struct cftype *cft, const char *buffer) 2186 struct cftype *cft, char *buffer)
2232{ 2187{
2233 BUILD_BUG_ON(sizeof(css->cgroup->root->release_agent_path) < PATH_MAX); 2188 struct cgroup_root *root = css->cgroup->root;
2234 if (strlen(buffer) >= PATH_MAX) 2189
2235 return -EINVAL; 2190 BUILD_BUG_ON(sizeof(root->release_agent_path) < PATH_MAX);
2236 if (!cgroup_lock_live_group(css->cgroup)) 2191 if (!cgroup_lock_live_group(css->cgroup))
2237 return -ENODEV; 2192 return -ENODEV;
2238 mutex_lock(&cgroup_root_mutex); 2193 spin_lock(&release_agent_path_lock);
2239 strcpy(css->cgroup->root->release_agent_path, buffer); 2194 strlcpy(root->release_agent_path, buffer,
2240 mutex_unlock(&cgroup_root_mutex); 2195 sizeof(root->release_agent_path));
2196 spin_unlock(&release_agent_path_lock);
2241 mutex_unlock(&cgroup_mutex); 2197 mutex_unlock(&cgroup_mutex);
2242 return 0; 2198 return 0;
2243} 2199}
@@ -2262,32 +2218,23 @@ static int cgroup_sane_behavior_show(struct seq_file *seq, void *v)
2262 return 0; 2218 return 0;
2263} 2219}
2264 2220
2265/* A buffer size big enough for numbers or short strings */ 2221static ssize_t cgroup_file_write(struct kernfs_open_file *of, char *buf,
2266#define CGROUP_LOCAL_BUFFER_SIZE 64 2222 size_t nbytes, loff_t off)
2267
2268static ssize_t cgroup_file_write(struct file *file, const char __user *userbuf,
2269 size_t nbytes, loff_t *ppos)
2270{ 2223{
2271 struct cfent *cfe = __d_cfe(file->f_dentry); 2224 struct cgroup *cgrp = of->kn->parent->priv;
2272 struct cftype *cft = __d_cft(file->f_dentry); 2225 struct cftype *cft = of->kn->priv;
2273 struct cgroup_subsys_state *css = cfe->css; 2226 struct cgroup_subsys_state *css;
2274 size_t max_bytes = cft->max_write_len ?: CGROUP_LOCAL_BUFFER_SIZE - 1;
2275 char *buf;
2276 int ret; 2227 int ret;
2277 2228
2278 if (nbytes >= max_bytes) 2229 /*
2279 return -E2BIG; 2230 * kernfs guarantees that a file isn't deleted with operations in
2280 2231 * flight, which means that the matching css is and stays alive and
2281 buf = kmalloc(nbytes + 1, GFP_KERNEL); 2232 * doesn't need to be pinned. The RCU locking is not necessary
2282 if (!buf) 2233 * either. It's just for the convenience of using cgroup_css().
2283 return -ENOMEM; 2234 */
2284 2235 rcu_read_lock();
2285 if (copy_from_user(buf, userbuf, nbytes)) { 2236 css = cgroup_css(cgrp, cft->ss);
2286 ret = -EFAULT; 2237 rcu_read_unlock();
2287 goto out_free;
2288 }
2289
2290 buf[nbytes] = '\0';
2291 2238
2292 if (cft->write_string) { 2239 if (cft->write_string) {
2293 ret = cft->write_string(css, cft, strstrip(buf)); 2240 ret = cft->write_string(css, cft, strstrip(buf));
@@ -2306,53 +2253,23 @@ static ssize_t cgroup_file_write(struct file *file, const char __user *userbuf,
2306 } else { 2253 } else {
2307 ret = -EINVAL; 2254 ret = -EINVAL;
2308 } 2255 }
2309out_free: 2256
2310 kfree(buf);
2311 return ret ?: nbytes; 2257 return ret ?: nbytes;
2312} 2258}
2313 2259
2314/*
2315 * seqfile ops/methods for returning structured data. Currently just
2316 * supports string->u64 maps, but can be extended in future.
2317 */
2318
2319static void *cgroup_seqfile_start(struct seq_file *seq, loff_t *ppos) 2260static void *cgroup_seqfile_start(struct seq_file *seq, loff_t *ppos)
2320{ 2261{
2321 struct cftype *cft = seq_cft(seq); 2262 return seq_cft(seq)->seq_start(seq, ppos);
2322
2323 if (cft->seq_start) {
2324 return cft->seq_start(seq, ppos);
2325 } else {
2326 /*
2327 * The same behavior and code as single_open(). Returns
2328 * !NULL if pos is at the beginning; otherwise, NULL.
2329 */
2330 return NULL + !*ppos;
2331 }
2332} 2263}
2333 2264
2334static void *cgroup_seqfile_next(struct seq_file *seq, void *v, loff_t *ppos) 2265static void *cgroup_seqfile_next(struct seq_file *seq, void *v, loff_t *ppos)
2335{ 2266{
2336 struct cftype *cft = seq_cft(seq); 2267 return seq_cft(seq)->seq_next(seq, v, ppos);
2337
2338 if (cft->seq_next) {
2339 return cft->seq_next(seq, v, ppos);
2340 } else {
2341 /*
2342 * The same behavior and code as single_open(), always
2343 * terminate after the initial read.
2344 */
2345 ++*ppos;
2346 return NULL;
2347 }
2348} 2268}
2349 2269
2350static void cgroup_seqfile_stop(struct seq_file *seq, void *v) 2270static void cgroup_seqfile_stop(struct seq_file *seq, void *v)
2351{ 2271{
2352 struct cftype *cft = seq_cft(seq); 2272 seq_cft(seq)->seq_stop(seq, v);
2353
2354 if (cft->seq_stop)
2355 cft->seq_stop(seq, v);
2356} 2273}
2357 2274
2358static int cgroup_seqfile_show(struct seq_file *m, void *arg) 2275static int cgroup_seqfile_show(struct seq_file *m, void *arg)
@@ -2372,96 +2289,35 @@ static int cgroup_seqfile_show(struct seq_file *m, void *arg)
2372 return 0; 2289 return 0;
2373} 2290}
2374 2291
2375static struct seq_operations cgroup_seq_operations = { 2292static struct kernfs_ops cgroup_kf_single_ops = {
2376 .start = cgroup_seqfile_start, 2293 .atomic_write_len = PAGE_SIZE,
2377 .next = cgroup_seqfile_next, 2294 .write = cgroup_file_write,
2378 .stop = cgroup_seqfile_stop, 2295 .seq_show = cgroup_seqfile_show,
2379 .show = cgroup_seqfile_show,
2380}; 2296};
2381 2297
2382static int cgroup_file_open(struct inode *inode, struct file *file) 2298static struct kernfs_ops cgroup_kf_ops = {
2383{ 2299 .atomic_write_len = PAGE_SIZE,
2384 struct cfent *cfe = __d_cfe(file->f_dentry); 2300 .write = cgroup_file_write,
2385 struct cftype *cft = __d_cft(file->f_dentry); 2301 .seq_start = cgroup_seqfile_start,
2386 struct cgroup *cgrp = __d_cgrp(cfe->dentry->d_parent); 2302 .seq_next = cgroup_seqfile_next,
2387 struct cgroup_subsys_state *css; 2303 .seq_stop = cgroup_seqfile_stop,
2388 struct cgroup_open_file *of; 2304 .seq_show = cgroup_seqfile_show,
2389 int err; 2305};
2390
2391 err = generic_file_open(inode, file);
2392 if (err)
2393 return err;
2394
2395 /*
2396 * If the file belongs to a subsystem, pin the css. Will be
2397 * unpinned either on open failure or release. This ensures that
2398 * @css stays alive for all file operations.
2399 */
2400 rcu_read_lock();
2401 css = cgroup_css(cgrp, cft->ss);
2402 if (cft->ss && !css_tryget(css))
2403 css = NULL;
2404 rcu_read_unlock();
2405
2406 if (!css)
2407 return -ENODEV;
2408
2409 /*
2410 * @cfe->css is used by read/write/close to determine the
2411 * associated css. @file->private_data would be a better place but
2412 * that's already used by seqfile. Multiple accessors may use it
2413 * simultaneously which is okay as the association never changes.
2414 */
2415 WARN_ON_ONCE(cfe->css && cfe->css != css);
2416 cfe->css = css;
2417
2418 of = __seq_open_private(file, &cgroup_seq_operations,
2419 sizeof(struct cgroup_open_file));
2420 if (of) {
2421 of->cfe = cfe;
2422 return 0;
2423 }
2424
2425 if (css->ss)
2426 css_put(css);
2427 return -ENOMEM;
2428}
2429
2430static int cgroup_file_release(struct inode *inode, struct file *file)
2431{
2432 struct cfent *cfe = __d_cfe(file->f_dentry);
2433 struct cgroup_subsys_state *css = cfe->css;
2434
2435 if (css->ss)
2436 css_put(css);
2437 return seq_release_private(inode, file);
2438}
2439 2306
2440/* 2307/*
2441 * cgroup_rename - Only allow simple rename of directories in place. 2308 * cgroup_rename - Only allow simple rename of directories in place.
2442 */ 2309 */
2443static int cgroup_rename(struct inode *old_dir, struct dentry *old_dentry, 2310static int cgroup_rename(struct kernfs_node *kn, struct kernfs_node *new_parent,
2444 struct inode *new_dir, struct dentry *new_dentry) 2311 const char *new_name_str)
2445{ 2312{
2313 struct cgroup *cgrp = kn->priv;
2446 int ret; 2314 int ret;
2447 struct cgroup_name *name, *old_name;
2448 struct cgroup *cgrp;
2449
2450 /*
2451 * It's convinient to use parent dir's i_mutex to protected
2452 * cgrp->name.
2453 */
2454 lockdep_assert_held(&old_dir->i_mutex);
2455 2315
2456 if (!S_ISDIR(old_dentry->d_inode->i_mode)) 2316 if (kernfs_type(kn) != KERNFS_DIR)
2457 return -ENOTDIR; 2317 return -ENOTDIR;
2458 if (new_dentry->d_inode) 2318 if (kn->parent != new_parent)
2459 return -EEXIST;
2460 if (old_dir != new_dir)
2461 return -EIO; 2319 return -EIO;
2462 2320
2463 cgrp = __d_cgrp(old_dentry);
2464
2465 /* 2321 /*
2466 * This isn't a proper migration and its usefulness is very 2322 * This isn't a proper migration and its usefulness is very
2467 * limited. Disallow if sane_behavior. 2323 * limited. Disallow if sane_behavior.
@@ -2469,218 +2325,61 @@ static int cgroup_rename(struct inode *old_dir, struct dentry *old_dentry,
2469 if (cgroup_sane_behavior(cgrp)) 2325 if (cgroup_sane_behavior(cgrp))
2470 return -EPERM; 2326 return -EPERM;
2471 2327
2472 name = cgroup_alloc_name(new_dentry); 2328 /*
2473 if (!name) 2329 * We're gonna grab cgroup_tree_mutex which nests outside kernfs
2474 return -ENOMEM; 2330 * active_ref. kernfs_rename() doesn't require active_ref
2475 2331 * protection. Break them before grabbing cgroup_tree_mutex.
2476 ret = simple_rename(old_dir, old_dentry, new_dir, new_dentry); 2332 */
2477 if (ret) { 2333 kernfs_break_active_protection(new_parent);
2478 kfree(name); 2334 kernfs_break_active_protection(kn);
2479 return ret;
2480 }
2481
2482 old_name = rcu_dereference_protected(cgrp->name, true);
2483 rcu_assign_pointer(cgrp->name, name);
2484
2485 kfree_rcu(old_name, rcu_head);
2486 return 0;
2487}
2488
2489static struct simple_xattrs *__d_xattrs(struct dentry *dentry)
2490{
2491 if (S_ISDIR(dentry->d_inode->i_mode))
2492 return &__d_cgrp(dentry)->xattrs;
2493 else
2494 return &__d_cfe(dentry)->xattrs;
2495}
2496
2497static inline int xattr_enabled(struct dentry *dentry)
2498{
2499 struct cgroupfs_root *root = dentry->d_sb->s_fs_info;
2500 return root->flags & CGRP_ROOT_XATTR;
2501}
2502
2503static bool is_valid_xattr(const char *name)
2504{
2505 if (!strncmp(name, XATTR_TRUSTED_PREFIX, XATTR_TRUSTED_PREFIX_LEN) ||
2506 !strncmp(name, XATTR_SECURITY_PREFIX, XATTR_SECURITY_PREFIX_LEN))
2507 return true;
2508 return false;
2509}
2510
2511static int cgroup_setxattr(struct dentry *dentry, const char *name,
2512 const void *val, size_t size, int flags)
2513{
2514 if (!xattr_enabled(dentry))
2515 return -EOPNOTSUPP;
2516 if (!is_valid_xattr(name))
2517 return -EINVAL;
2518 return simple_xattr_set(__d_xattrs(dentry), name, val, size, flags);
2519}
2520
2521static int cgroup_removexattr(struct dentry *dentry, const char *name)
2522{
2523 if (!xattr_enabled(dentry))
2524 return -EOPNOTSUPP;
2525 if (!is_valid_xattr(name))
2526 return -EINVAL;
2527 return simple_xattr_remove(__d_xattrs(dentry), name);
2528}
2529
2530static ssize_t cgroup_getxattr(struct dentry *dentry, const char *name,
2531 void *buf, size_t size)
2532{
2533 if (!xattr_enabled(dentry))
2534 return -EOPNOTSUPP;
2535 if (!is_valid_xattr(name))
2536 return -EINVAL;
2537 return simple_xattr_get(__d_xattrs(dentry), name, buf, size);
2538}
2539
2540static ssize_t cgroup_listxattr(struct dentry *dentry, char *buf, size_t size)
2541{
2542 if (!xattr_enabled(dentry))
2543 return -EOPNOTSUPP;
2544 return simple_xattr_list(__d_xattrs(dentry), buf, size);
2545}
2546
2547static const struct file_operations cgroup_file_operations = {
2548 .read = seq_read,
2549 .write = cgroup_file_write,
2550 .llseek = generic_file_llseek,
2551 .open = cgroup_file_open,
2552 .release = cgroup_file_release,
2553};
2554
2555static const struct inode_operations cgroup_file_inode_operations = {
2556 .setxattr = cgroup_setxattr,
2557 .getxattr = cgroup_getxattr,
2558 .listxattr = cgroup_listxattr,
2559 .removexattr = cgroup_removexattr,
2560};
2561
2562static const struct inode_operations cgroup_dir_inode_operations = {
2563 .lookup = simple_lookup,
2564 .mkdir = cgroup_mkdir,
2565 .rmdir = cgroup_rmdir,
2566 .rename = cgroup_rename,
2567 .setxattr = cgroup_setxattr,
2568 .getxattr = cgroup_getxattr,
2569 .listxattr = cgroup_listxattr,
2570 .removexattr = cgroup_removexattr,
2571};
2572
2573static int cgroup_create_file(struct dentry *dentry, umode_t mode,
2574 struct super_block *sb)
2575{
2576 struct inode *inode;
2577
2578 if (!dentry)
2579 return -ENOENT;
2580 if (dentry->d_inode)
2581 return -EEXIST;
2582 2335
2583 inode = cgroup_new_inode(mode, sb); 2336 mutex_lock(&cgroup_tree_mutex);
2584 if (!inode) 2337 mutex_lock(&cgroup_mutex);
2585 return -ENOMEM;
2586 2338
2587 if (S_ISDIR(mode)) { 2339 ret = kernfs_rename(kn, new_parent, new_name_str);
2588 inode->i_op = &cgroup_dir_inode_operations;
2589 inode->i_fop = &simple_dir_operations;
2590 2340
2591 /* start off with i_nlink == 2 (for "." entry) */ 2341 mutex_unlock(&cgroup_mutex);
2592 inc_nlink(inode); 2342 mutex_unlock(&cgroup_tree_mutex);
2593 inc_nlink(dentry->d_parent->d_inode);
2594 2343
2595 /* 2344 kernfs_unbreak_active_protection(kn);
2596 * Control reaches here with cgroup_mutex held. 2345 kernfs_unbreak_active_protection(new_parent);
2597 * @inode->i_mutex should nest outside cgroup_mutex but we 2346 return ret;
2598 * want to populate it immediately without releasing
2599 * cgroup_mutex. As @inode isn't visible to anyone else
2600 * yet, trylock will always succeed without affecting
2601 * lockdep checks.
2602 */
2603 WARN_ON_ONCE(!mutex_trylock(&inode->i_mutex));
2604 } else if (S_ISREG(mode)) {
2605 inode->i_size = 0;
2606 inode->i_fop = &cgroup_file_operations;
2607 inode->i_op = &cgroup_file_inode_operations;
2608 }
2609 d_instantiate(dentry, inode);
2610 dget(dentry); /* Extra count - pin the dentry in core */
2611 return 0;
2612} 2347}
2613 2348
2614/** 2349/* set uid and gid of cgroup dirs and files to that of the creator */
2615 * cgroup_file_mode - deduce file mode of a control file 2350static int cgroup_kn_set_ugid(struct kernfs_node *kn)
2616 * @cft: the control file in question
2617 *
2618 * returns cft->mode if ->mode is not 0
2619 * returns S_IRUGO|S_IWUSR if it has both a read and a write handler
2620 * returns S_IRUGO if it has only a read handler
2621 * returns S_IWUSR if it has only a write hander
2622 */
2623static umode_t cgroup_file_mode(const struct cftype *cft)
2624{ 2351{
2625 umode_t mode = 0; 2352 struct iattr iattr = { .ia_valid = ATTR_UID | ATTR_GID,
2626 2353 .ia_uid = current_fsuid(),
2627 if (cft->mode) 2354 .ia_gid = current_fsgid(), };
2628 return cft->mode;
2629
2630 if (cft->read_u64 || cft->read_s64 || cft->seq_show)
2631 mode |= S_IRUGO;
2632 2355
2633 if (cft->write_u64 || cft->write_s64 || cft->write_string || 2356 if (uid_eq(iattr.ia_uid, GLOBAL_ROOT_UID) &&
2634 cft->trigger) 2357 gid_eq(iattr.ia_gid, GLOBAL_ROOT_GID))
2635 mode |= S_IWUSR; 2358 return 0;
2636 2359
2637 return mode; 2360 return kernfs_setattr(kn, &iattr);
2638} 2361}
2639 2362
2640static int cgroup_add_file(struct cgroup *cgrp, struct cftype *cft) 2363static int cgroup_add_file(struct cgroup *cgrp, struct cftype *cft)
2641{ 2364{
2642 struct dentry *dir = cgrp->dentry; 2365 char name[CGROUP_FILE_NAME_MAX];
2643 struct cgroup *parent = __d_cgrp(dir); 2366 struct kernfs_node *kn;
2644 struct dentry *dentry; 2367 struct lock_class_key *key = NULL;
2645 struct cfent *cfe; 2368 int ret;
2646 int error;
2647 umode_t mode;
2648 char name[MAX_CGROUP_TYPE_NAMELEN + MAX_CFTYPE_NAME + 2] = { 0 };
2649
2650 if (cft->ss && !(cft->flags & CFTYPE_NO_PREFIX) &&
2651 !(cgrp->root->flags & CGRP_ROOT_NOPREFIX)) {
2652 strcpy(name, cft->ss->name);
2653 strcat(name, ".");
2654 }
2655 strcat(name, cft->name);
2656
2657 BUG_ON(!mutex_is_locked(&dir->d_inode->i_mutex));
2658
2659 cfe = kzalloc(sizeof(*cfe), GFP_KERNEL);
2660 if (!cfe)
2661 return -ENOMEM;
2662
2663 dentry = lookup_one_len(name, dir, strlen(name));
2664 if (IS_ERR(dentry)) {
2665 error = PTR_ERR(dentry);
2666 goto out;
2667 }
2668 2369
2669 cfe->type = (void *)cft; 2370#ifdef CONFIG_DEBUG_LOCK_ALLOC
2670 cfe->dentry = dentry; 2371 key = &cft->lockdep_key;
2671 dentry->d_fsdata = cfe; 2372#endif
2672 simple_xattrs_init(&cfe->xattrs); 2373 kn = __kernfs_create_file(cgrp->kn, cgroup_file_name(cgrp, cft, name),
2374 cgroup_file_mode(cft), 0, cft->kf_ops, cft,
2375 NULL, false, key);
2376 if (IS_ERR(kn))
2377 return PTR_ERR(kn);
2673 2378
2674 mode = cgroup_file_mode(cft); 2379 ret = cgroup_kn_set_ugid(kn);
2675 error = cgroup_create_file(dentry, mode | S_IFREG, cgrp->root->sb); 2380 if (ret)
2676 if (!error) { 2381 kernfs_remove(kn);
2677 list_add_tail(&cfe->node, &parent->files); 2382 return ret;
2678 cfe = NULL;
2679 }
2680 dput(dentry);
2681out:
2682 kfree(cfe);
2683 return error;
2684} 2383}
2685 2384
2686/** 2385/**
@@ -2700,11 +2399,12 @@ static int cgroup_addrm_files(struct cgroup *cgrp, struct cftype cfts[],
2700 struct cftype *cft; 2399 struct cftype *cft;
2701 int ret; 2400 int ret;
2702 2401
2703 lockdep_assert_held(&cgrp->dentry->d_inode->i_mutex); 2402 lockdep_assert_held(&cgroup_tree_mutex);
2704 lockdep_assert_held(&cgroup_mutex);
2705 2403
2706 for (cft = cfts; cft->name[0] != '\0'; cft++) { 2404 for (cft = cfts; cft->name[0] != '\0'; cft++) {
2707 /* does cft->flags tell us to skip this file on @cgrp? */ 2405 /* does cft->flags tell us to skip this file on @cgrp? */
2406 if ((cft->flags & CFTYPE_ONLY_ON_DFL) && !cgroup_on_dfl(cgrp))
2407 continue;
2708 if ((cft->flags & CFTYPE_INSANE) && cgroup_sane_behavior(cgrp)) 2408 if ((cft->flags & CFTYPE_INSANE) && cgroup_sane_behavior(cgrp))
2709 continue; 2409 continue;
2710 if ((cft->flags & CFTYPE_NOT_ON_ROOT) && !cgrp->parent) 2410 if ((cft->flags & CFTYPE_NOT_ON_ROOT) && !cgrp->parent)
@@ -2726,44 +2426,19 @@ static int cgroup_addrm_files(struct cgroup *cgrp, struct cftype cfts[],
2726 return 0; 2426 return 0;
2727} 2427}
2728 2428
2729static void cgroup_cfts_prepare(void) 2429static int cgroup_apply_cftypes(struct cftype *cfts, bool is_add)
2730 __acquires(&cgroup_mutex)
2731{
2732 /*
2733 * Thanks to the entanglement with vfs inode locking, we can't walk
2734 * the existing cgroups under cgroup_mutex and create files.
2735 * Instead, we use css_for_each_descendant_pre() and drop RCU read
2736 * lock before calling cgroup_addrm_files().
2737 */
2738 mutex_lock(&cgroup_mutex);
2739}
2740
2741static int cgroup_cfts_commit(struct cftype *cfts, bool is_add)
2742 __releases(&cgroup_mutex)
2743{ 2430{
2744 LIST_HEAD(pending); 2431 LIST_HEAD(pending);
2745 struct cgroup_subsys *ss = cfts[0].ss; 2432 struct cgroup_subsys *ss = cfts[0].ss;
2746 struct cgroup *root = &ss->root->top_cgroup; 2433 struct cgroup *root = &ss->root->cgrp;
2747 struct super_block *sb = ss->root->sb;
2748 struct dentry *prev = NULL;
2749 struct inode *inode;
2750 struct cgroup_subsys_state *css; 2434 struct cgroup_subsys_state *css;
2751 u64 update_before;
2752 int ret = 0; 2435 int ret = 0;
2753 2436
2754 /* %NULL @cfts indicates abort and don't bother if @ss isn't attached */ 2437 lockdep_assert_held(&cgroup_tree_mutex);
2755 if (!cfts || ss->root == &cgroup_dummy_root ||
2756 !atomic_inc_not_zero(&sb->s_active)) {
2757 mutex_unlock(&cgroup_mutex);
2758 return 0;
2759 }
2760 2438
2761 /* 2439 /* don't bother if @ss isn't attached */
2762 * All cgroups which are created after we drop cgroup_mutex will 2440 if (ss->root == &cgrp_dfl_root)
2763 * have the updated set of files, so we only need to update the 2441 return 0;
2764 * cgroups created before the current @cgroup_serial_nr_next.
2765 */
2766 update_before = cgroup_serial_nr_next;
2767 2442
2768 /* add/rm files for all cgroups created before */ 2443 /* add/rm files for all cgroups created before */
2769 css_for_each_descendant_pre(css, cgroup_css(root, ss)) { 2444 css_for_each_descendant_pre(css, cgroup_css(root, ss)) {
@@ -2772,62 +2447,75 @@ static int cgroup_cfts_commit(struct cftype *cfts, bool is_add)
2772 if (cgroup_is_dead(cgrp)) 2447 if (cgroup_is_dead(cgrp))
2773 continue; 2448 continue;
2774 2449
2775 inode = cgrp->dentry->d_inode; 2450 ret = cgroup_addrm_files(cgrp, cfts, is_add);
2776 dget(cgrp->dentry);
2777 dput(prev);
2778 prev = cgrp->dentry;
2779
2780 mutex_unlock(&cgroup_mutex);
2781 mutex_lock(&inode->i_mutex);
2782 mutex_lock(&cgroup_mutex);
2783 if (cgrp->serial_nr < update_before && !cgroup_is_dead(cgrp))
2784 ret = cgroup_addrm_files(cgrp, cfts, is_add);
2785 mutex_unlock(&inode->i_mutex);
2786 if (ret) 2451 if (ret)
2787 break; 2452 break;
2788 } 2453 }
2789 mutex_unlock(&cgroup_mutex); 2454
2790 dput(prev); 2455 if (is_add && !ret)
2791 deactivate_super(sb); 2456 kernfs_activate(root->kn);
2792 return ret; 2457 return ret;
2793} 2458}
2794 2459
2795/** 2460static void cgroup_exit_cftypes(struct cftype *cfts)
2796 * cgroup_add_cftypes - add an array of cftypes to a subsystem
2797 * @ss: target cgroup subsystem
2798 * @cfts: zero-length name terminated array of cftypes
2799 *
2800 * Register @cfts to @ss. Files described by @cfts are created for all
2801 * existing cgroups to which @ss is attached and all future cgroups will
2802 * have them too. This function can be called anytime whether @ss is
2803 * attached or not.
2804 *
2805 * Returns 0 on successful registration, -errno on failure. Note that this
2806 * function currently returns 0 as long as @cfts registration is successful
2807 * even if some file creation attempts on existing cgroups fail.
2808 */
2809int cgroup_add_cftypes(struct cgroup_subsys *ss, struct cftype *cfts)
2810{ 2461{
2811 struct cftype_set *set;
2812 struct cftype *cft; 2462 struct cftype *cft;
2813 int ret;
2814 2463
2815 set = kzalloc(sizeof(*set), GFP_KERNEL); 2464 for (cft = cfts; cft->name[0] != '\0'; cft++) {
2816 if (!set) 2465 /* free copy for custom atomic_write_len, see init_cftypes() */
2817 return -ENOMEM; 2466 if (cft->max_write_len && cft->max_write_len != PAGE_SIZE)
2467 kfree(cft->kf_ops);
2468 cft->kf_ops = NULL;
2469 cft->ss = NULL;
2470 }
2471}
2472
2473static int cgroup_init_cftypes(struct cgroup_subsys *ss, struct cftype *cfts)
2474{
2475 struct cftype *cft;
2476
2477 for (cft = cfts; cft->name[0] != '\0'; cft++) {
2478 struct kernfs_ops *kf_ops;
2818 2479
2819 for (cft = cfts; cft->name[0] != '\0'; cft++) 2480 WARN_ON(cft->ss || cft->kf_ops);
2481
2482 if (cft->seq_start)
2483 kf_ops = &cgroup_kf_ops;
2484 else
2485 kf_ops = &cgroup_kf_single_ops;
2486
2487 /*
2488 * Ugh... if @cft wants a custom max_write_len, we need to
2489 * make a copy of kf_ops to set its atomic_write_len.
2490 */
2491 if (cft->max_write_len && cft->max_write_len != PAGE_SIZE) {
2492 kf_ops = kmemdup(kf_ops, sizeof(*kf_ops), GFP_KERNEL);
2493 if (!kf_ops) {
2494 cgroup_exit_cftypes(cfts);
2495 return -ENOMEM;
2496 }
2497 kf_ops->atomic_write_len = cft->max_write_len;
2498 }
2499
2500 cft->kf_ops = kf_ops;
2820 cft->ss = ss; 2501 cft->ss = ss;
2502 }
2821 2503
2822 cgroup_cfts_prepare(); 2504 return 0;
2823 set->cfts = cfts; 2505}
2824 list_add_tail(&set->node, &ss->cftsets); 2506
2825 ret = cgroup_cfts_commit(cfts, true); 2507static int cgroup_rm_cftypes_locked(struct cftype *cfts)
2826 if (ret) 2508{
2827 cgroup_rm_cftypes(cfts); 2509 lockdep_assert_held(&cgroup_tree_mutex);
2828 return ret; 2510
2511 if (!cfts || !cfts[0].ss)
2512 return -ENOENT;
2513
2514 list_del(&cfts->node);
2515 cgroup_apply_cftypes(cfts, false);
2516 cgroup_exit_cftypes(cfts);
2517 return 0;
2829} 2518}
2830EXPORT_SYMBOL_GPL(cgroup_add_cftypes);
2831 2519
2832/** 2520/**
2833 * cgroup_rm_cftypes - remove an array of cftypes from a subsystem 2521 * cgroup_rm_cftypes - remove an array of cftypes from a subsystem
@@ -2842,24 +2530,48 @@ EXPORT_SYMBOL_GPL(cgroup_add_cftypes);
2842 */ 2530 */
2843int cgroup_rm_cftypes(struct cftype *cfts) 2531int cgroup_rm_cftypes(struct cftype *cfts)
2844{ 2532{
2845 struct cftype_set *set; 2533 int ret;
2846 2534
2847 if (!cfts || !cfts[0].ss) 2535 mutex_lock(&cgroup_tree_mutex);
2848 return -ENOENT; 2536 ret = cgroup_rm_cftypes_locked(cfts);
2537 mutex_unlock(&cgroup_tree_mutex);
2538 return ret;
2539}
2849 2540
2850 cgroup_cfts_prepare(); 2541/**
2542 * cgroup_add_cftypes - add an array of cftypes to a subsystem
2543 * @ss: target cgroup subsystem
2544 * @cfts: zero-length name terminated array of cftypes
2545 *
2546 * Register @cfts to @ss. Files described by @cfts are created for all
2547 * existing cgroups to which @ss is attached and all future cgroups will
2548 * have them too. This function can be called anytime whether @ss is
2549 * attached or not.
2550 *
2551 * Returns 0 on successful registration, -errno on failure. Note that this
2552 * function currently returns 0 as long as @cfts registration is successful
2553 * even if some file creation attempts on existing cgroups fail.
2554 */
2555int cgroup_add_cftypes(struct cgroup_subsys *ss, struct cftype *cfts)
2556{
2557 int ret;
2851 2558
2852 list_for_each_entry(set, &cfts[0].ss->cftsets, node) { 2559 if (!cfts || cfts[0].name[0] == '\0')
2853 if (set->cfts == cfts) { 2560 return 0;
2854 list_del(&set->node);
2855 kfree(set);
2856 cgroup_cfts_commit(cfts, false);
2857 return 0;
2858 }
2859 }
2860 2561
2861 cgroup_cfts_commit(NULL, false); 2562 ret = cgroup_init_cftypes(ss, cfts);
2862 return -ENOENT; 2563 if (ret)
2564 return ret;
2565
2566 mutex_lock(&cgroup_tree_mutex);
2567
2568 list_add_tail(&cfts->node, &ss->cfts);
2569 ret = cgroup_apply_cftypes(cfts, true);
2570 if (ret)
2571 cgroup_rm_cftypes_locked(cfts);
2572
2573 mutex_unlock(&cgroup_tree_mutex);
2574 return ret;
2863} 2575}
2864 2576
2865/** 2577/**
@@ -2868,57 +2580,18 @@ int cgroup_rm_cftypes(struct cftype *cfts)
2868 * 2580 *
2869 * Return the number of tasks in the cgroup. 2581 * Return the number of tasks in the cgroup.
2870 */ 2582 */
2871int cgroup_task_count(const struct cgroup *cgrp) 2583static int cgroup_task_count(const struct cgroup *cgrp)
2872{ 2584{
2873 int count = 0; 2585 int count = 0;
2874 struct cgrp_cset_link *link; 2586 struct cgrp_cset_link *link;
2875 2587
2876 read_lock(&css_set_lock); 2588 down_read(&css_set_rwsem);
2877 list_for_each_entry(link, &cgrp->cset_links, cset_link) 2589 list_for_each_entry(link, &cgrp->cset_links, cset_link)
2878 count += atomic_read(&link->cset->refcount); 2590 count += atomic_read(&link->cset->refcount);
2879 read_unlock(&css_set_lock); 2591 up_read(&css_set_rwsem);
2880 return count; 2592 return count;
2881} 2593}
2882 2594
2883/*
2884 * To reduce the fork() overhead for systems that are not actually using
2885 * their cgroups capability, we don't maintain the lists running through
2886 * each css_set to its tasks until we see the list actually used - in other
2887 * words after the first call to css_task_iter_start().
2888 */
2889static void cgroup_enable_task_cg_lists(void)
2890{
2891 struct task_struct *p, *g;
2892 write_lock(&css_set_lock);
2893 use_task_css_set_links = 1;
2894 /*
2895 * We need tasklist_lock because RCU is not safe against
2896 * while_each_thread(). Besides, a forking task that has passed
2897 * cgroup_post_fork() without seeing use_task_css_set_links = 1
2898 * is not guaranteed to have its child immediately visible in the
2899 * tasklist if we walk through it with RCU.
2900 */
2901 read_lock(&tasklist_lock);
2902 do_each_thread(g, p) {
2903 task_lock(p);
2904 /*
2905 * We should check if the process is exiting, otherwise
2906 * it will race with cgroup_exit() in that the list
2907 * entry won't be deleted though the process has exited.
2908 * Do it while holding siglock so that we don't end up
2909 * racing against cgroup_exit().
2910 */
2911 spin_lock_irq(&p->sighand->siglock);
2912 if (!(p->flags & PF_EXITING) && list_empty(&p->cg_list))
2913 list_add(&p->cg_list, &task_css_set(p)->tasks);
2914 spin_unlock_irq(&p->sighand->siglock);
2915
2916 task_unlock(p);
2917 } while_each_thread(g, p);
2918 read_unlock(&tasklist_lock);
2919 write_unlock(&css_set_lock);
2920}
2921
2922/** 2595/**
2923 * css_next_child - find the next child of a given css 2596 * css_next_child - find the next child of a given css
2924 * @pos_css: the current position (%NULL to initiate traversal) 2597 * @pos_css: the current position (%NULL to initiate traversal)
@@ -2937,7 +2610,7 @@ css_next_child(struct cgroup_subsys_state *pos_css,
2937 struct cgroup *cgrp = parent_css->cgroup; 2610 struct cgroup *cgrp = parent_css->cgroup;
2938 struct cgroup *next; 2611 struct cgroup *next;
2939 2612
2940 cgroup_assert_mutex_or_rcu_locked(); 2613 cgroup_assert_mutexes_or_rcu_locked();
2941 2614
2942 /* 2615 /*
2943 * @pos could already have been removed. Once a cgroup is removed, 2616 * @pos could already have been removed. Once a cgroup is removed,
@@ -2973,7 +2646,6 @@ css_next_child(struct cgroup_subsys_state *pos_css,
2973 2646
2974 return cgroup_css(next, parent_css->ss); 2647 return cgroup_css(next, parent_css->ss);
2975} 2648}
2976EXPORT_SYMBOL_GPL(css_next_child);
2977 2649
2978/** 2650/**
2979 * css_next_descendant_pre - find the next descendant for pre-order walk 2651 * css_next_descendant_pre - find the next descendant for pre-order walk
@@ -2995,7 +2667,7 @@ css_next_descendant_pre(struct cgroup_subsys_state *pos,
2995{ 2667{
2996 struct cgroup_subsys_state *next; 2668 struct cgroup_subsys_state *next;
2997 2669
2998 cgroup_assert_mutex_or_rcu_locked(); 2670 cgroup_assert_mutexes_or_rcu_locked();
2999 2671
3000 /* if first iteration, visit @root */ 2672 /* if first iteration, visit @root */
3001 if (!pos) 2673 if (!pos)
@@ -3016,7 +2688,6 @@ css_next_descendant_pre(struct cgroup_subsys_state *pos,
3016 2688
3017 return NULL; 2689 return NULL;
3018} 2690}
3019EXPORT_SYMBOL_GPL(css_next_descendant_pre);
3020 2691
3021/** 2692/**
3022 * css_rightmost_descendant - return the rightmost descendant of a css 2693 * css_rightmost_descendant - return the rightmost descendant of a css
@@ -3036,7 +2707,7 @@ css_rightmost_descendant(struct cgroup_subsys_state *pos)
3036{ 2707{
3037 struct cgroup_subsys_state *last, *tmp; 2708 struct cgroup_subsys_state *last, *tmp;
3038 2709
3039 cgroup_assert_mutex_or_rcu_locked(); 2710 cgroup_assert_mutexes_or_rcu_locked();
3040 2711
3041 do { 2712 do {
3042 last = pos; 2713 last = pos;
@@ -3048,7 +2719,6 @@ css_rightmost_descendant(struct cgroup_subsys_state *pos)
3048 2719
3049 return last; 2720 return last;
3050} 2721}
3051EXPORT_SYMBOL_GPL(css_rightmost_descendant);
3052 2722
3053static struct cgroup_subsys_state * 2723static struct cgroup_subsys_state *
3054css_leftmost_descendant(struct cgroup_subsys_state *pos) 2724css_leftmost_descendant(struct cgroup_subsys_state *pos)
@@ -3084,7 +2754,7 @@ css_next_descendant_post(struct cgroup_subsys_state *pos,
3084{ 2754{
3085 struct cgroup_subsys_state *next; 2755 struct cgroup_subsys_state *next;
3086 2756
3087 cgroup_assert_mutex_or_rcu_locked(); 2757 cgroup_assert_mutexes_or_rcu_locked();
3088 2758
3089 /* if first iteration, visit leftmost descendant which may be @root */ 2759 /* if first iteration, visit leftmost descendant which may be @root */
3090 if (!pos) 2760 if (!pos)
@@ -3102,7 +2772,6 @@ css_next_descendant_post(struct cgroup_subsys_state *pos,
3102 /* no sibling left, visit parent */ 2772 /* no sibling left, visit parent */
3103 return css_parent(pos); 2773 return css_parent(pos);
3104} 2774}
3105EXPORT_SYMBOL_GPL(css_next_descendant_post);
3106 2775
3107/** 2776/**
3108 * css_advance_task_iter - advance a task itererator to the next css_set 2777 * css_advance_task_iter - advance a task itererator to the next css_set
@@ -3125,9 +2794,14 @@ static void css_advance_task_iter(struct css_task_iter *it)
3125 } 2794 }
3126 link = list_entry(l, struct cgrp_cset_link, cset_link); 2795 link = list_entry(l, struct cgrp_cset_link, cset_link);
3127 cset = link->cset; 2796 cset = link->cset;
3128 } while (list_empty(&cset->tasks)); 2797 } while (list_empty(&cset->tasks) && list_empty(&cset->mg_tasks));
2798
3129 it->cset_link = l; 2799 it->cset_link = l;
3130 it->task = cset->tasks.next; 2800
2801 if (!list_empty(&cset->tasks))
2802 it->task = cset->tasks.next;
2803 else
2804 it->task = cset->mg_tasks.next;
3131} 2805}
3132 2806
3133/** 2807/**
@@ -3146,17 +2820,12 @@ static void css_advance_task_iter(struct css_task_iter *it)
3146 */ 2820 */
3147void css_task_iter_start(struct cgroup_subsys_state *css, 2821void css_task_iter_start(struct cgroup_subsys_state *css,
3148 struct css_task_iter *it) 2822 struct css_task_iter *it)
3149 __acquires(css_set_lock) 2823 __acquires(css_set_rwsem)
3150{ 2824{
3151 /* 2825 /* no one should try to iterate before mounting cgroups */
3152 * The first time anyone tries to iterate across a css, we need to 2826 WARN_ON_ONCE(!use_task_css_set_links);
3153 * enable the list linking each css_set to its tasks, and fix up
3154 * all existing tasks.
3155 */
3156 if (!use_task_css_set_links)
3157 cgroup_enable_task_cg_lists();
3158 2827
3159 read_lock(&css_set_lock); 2828 down_read(&css_set_rwsem);
3160 2829
3161 it->origin_css = css; 2830 it->origin_css = css;
3162 it->cset_link = &css->cgroup->cset_links; 2831 it->cset_link = &css->cgroup->cset_links;
@@ -3176,24 +2845,29 @@ struct task_struct *css_task_iter_next(struct css_task_iter *it)
3176{ 2845{
3177 struct task_struct *res; 2846 struct task_struct *res;
3178 struct list_head *l = it->task; 2847 struct list_head *l = it->task;
3179 struct cgrp_cset_link *link; 2848 struct cgrp_cset_link *link = list_entry(it->cset_link,
2849 struct cgrp_cset_link, cset_link);
3180 2850
3181 /* If the iterator cg is NULL, we have no tasks */ 2851 /* If the iterator cg is NULL, we have no tasks */
3182 if (!it->cset_link) 2852 if (!it->cset_link)
3183 return NULL; 2853 return NULL;
3184 res = list_entry(l, struct task_struct, cg_list); 2854 res = list_entry(l, struct task_struct, cg_list);
3185 /* Advance iterator to find next entry */ 2855
2856 /*
2857 * Advance iterator to find next entry. cset->tasks is consumed
2858 * first and then ->mg_tasks. After ->mg_tasks, we move onto the
2859 * next cset.
2860 */
3186 l = l->next; 2861 l = l->next;
3187 link = list_entry(it->cset_link, struct cgrp_cset_link, cset_link); 2862
3188 if (l == &link->cset->tasks) { 2863 if (l == &link->cset->tasks)
3189 /* 2864 l = link->cset->mg_tasks.next;
3190 * We reached the end of this task list - move on to the 2865
3191 * next cgrp_cset_link. 2866 if (l == &link->cset->mg_tasks)
3192 */
3193 css_advance_task_iter(it); 2867 css_advance_task_iter(it);
3194 } else { 2868 else
3195 it->task = l; 2869 it->task = l;
3196 } 2870
3197 return res; 2871 return res;
3198} 2872}
3199 2873
@@ -3204,191 +2878,62 @@ struct task_struct *css_task_iter_next(struct css_task_iter *it)
3204 * Finish task iteration started by css_task_iter_start(). 2878 * Finish task iteration started by css_task_iter_start().
3205 */ 2879 */
3206void css_task_iter_end(struct css_task_iter *it) 2880void css_task_iter_end(struct css_task_iter *it)
3207 __releases(css_set_lock) 2881 __releases(css_set_rwsem)
3208{
3209 read_unlock(&css_set_lock);
3210}
3211
3212static inline int started_after_time(struct task_struct *t1,
3213 struct timespec *time,
3214 struct task_struct *t2)
3215{
3216 int start_diff = timespec_compare(&t1->start_time, time);
3217 if (start_diff > 0) {
3218 return 1;
3219 } else if (start_diff < 0) {
3220 return 0;
3221 } else {
3222 /*
3223 * Arbitrarily, if two processes started at the same
3224 * time, we'll say that the lower pointer value
3225 * started first. Note that t2 may have exited by now
3226 * so this may not be a valid pointer any longer, but
3227 * that's fine - it still serves to distinguish
3228 * between two tasks started (effectively) simultaneously.
3229 */
3230 return t1 > t2;
3231 }
3232}
3233
3234/*
3235 * This function is a callback from heap_insert() and is used to order
3236 * the heap.
3237 * In this case we order the heap in descending task start time.
3238 */
3239static inline int started_after(void *p1, void *p2)
3240{ 2882{
3241 struct task_struct *t1 = p1; 2883 up_read(&css_set_rwsem);
3242 struct task_struct *t2 = p2;
3243 return started_after_time(t1, &t2->start_time, t2);
3244} 2884}
3245 2885
3246/** 2886/**
3247 * css_scan_tasks - iterate through all the tasks in a css 2887 * cgroup_transfer_tasks - move tasks from one cgroup to another
3248 * @css: the css to iterate tasks of 2888 * @to: cgroup to which the tasks will be moved
3249 * @test: optional test callback 2889 * @from: cgroup in which the tasks currently reside
3250 * @process: process callback
3251 * @data: data passed to @test and @process
3252 * @heap: optional pre-allocated heap used for task iteration
3253 *
3254 * Iterate through all the tasks in @css, calling @test for each, and if it
3255 * returns %true, call @process for it also.
3256 *
3257 * @test may be NULL, meaning always true (select all tasks), which
3258 * effectively duplicates css_task_iter_{start,next,end}() but does not
3259 * lock css_set_lock for the call to @process.
3260 *
3261 * It is guaranteed that @process will act on every task that is a member
3262 * of @css for the duration of this call. This function may or may not
3263 * call @process for tasks that exit or move to a different css during the
3264 * call, or are forked or move into the css during the call.
3265 *
3266 * Note that @test may be called with locks held, and may in some
3267 * situations be called multiple times for the same task, so it should be
3268 * cheap.
3269 * 2890 *
3270 * If @heap is non-NULL, a heap has been pre-allocated and will be used for 2891 * Locking rules between cgroup_post_fork() and the migration path
3271 * heap operations (and its "gt" member will be overwritten), else a 2892 * guarantee that, if a task is forking while being migrated, the new child
3272 * temporary heap will be used (allocation of which may cause this function 2893 * is guaranteed to be either visible in the source cgroup after the
3273 * to fail). 2894 * parent's migration is complete or put into the target cgroup. No task
2895 * can slip out of migration through forking.
3274 */ 2896 */
3275int css_scan_tasks(struct cgroup_subsys_state *css, 2897int cgroup_transfer_tasks(struct cgroup *to, struct cgroup *from)
3276 bool (*test)(struct task_struct *, void *),
3277 void (*process)(struct task_struct *, void *),
3278 void *data, struct ptr_heap *heap)
3279{ 2898{
3280 int retval, i; 2899 LIST_HEAD(preloaded_csets);
2900 struct cgrp_cset_link *link;
3281 struct css_task_iter it; 2901 struct css_task_iter it;
3282 struct task_struct *p, *dropped; 2902 struct task_struct *task;
3283 /* Never dereference latest_task, since it's not refcounted */ 2903 int ret;
3284 struct task_struct *latest_task = NULL;
3285 struct ptr_heap tmp_heap;
3286 struct timespec latest_time = { 0, 0 };
3287
3288 if (heap) {
3289 /* The caller supplied our heap and pre-allocated its memory */
3290 heap->gt = &started_after;
3291 } else {
3292 /* We need to allocate our own heap memory */
3293 heap = &tmp_heap;
3294 retval = heap_init(heap, PAGE_SIZE, GFP_KERNEL, &started_after);
3295 if (retval)
3296 /* cannot allocate the heap */
3297 return retval;
3298 }
3299 2904
3300 again: 2905 mutex_lock(&cgroup_mutex);
3301 /*
3302 * Scan tasks in the css, using the @test callback to determine
3303 * which are of interest, and invoking @process callback on the
3304 * ones which need an update. Since we don't want to hold any
3305 * locks during the task updates, gather tasks to be processed in a
3306 * heap structure. The heap is sorted by descending task start
3307 * time. If the statically-sized heap fills up, we overflow tasks
3308 * that started later, and in future iterations only consider tasks
3309 * that started after the latest task in the previous pass. This
3310 * guarantees forward progress and that we don't miss any tasks.
3311 */
3312 heap->size = 0;
3313 css_task_iter_start(css, &it);
3314 while ((p = css_task_iter_next(&it))) {
3315 /*
3316 * Only affect tasks that qualify per the caller's callback,
3317 * if he provided one
3318 */
3319 if (test && !test(p, data))
3320 continue;
3321 /*
3322 * Only process tasks that started after the last task
3323 * we processed
3324 */
3325 if (!started_after_time(p, &latest_time, latest_task))
3326 continue;
3327 dropped = heap_insert(heap, p);
3328 if (dropped == NULL) {
3329 /*
3330 * The new task was inserted; the heap wasn't
3331 * previously full
3332 */
3333 get_task_struct(p);
3334 } else if (dropped != p) {
3335 /*
3336 * The new task was inserted, and pushed out a
3337 * different task
3338 */
3339 get_task_struct(p);
3340 put_task_struct(dropped);
3341 }
3342 /*
3343 * Else the new task was newer than anything already in
3344 * the heap and wasn't inserted
3345 */
3346 }
3347 css_task_iter_end(&it);
3348 2906
3349 if (heap->size) { 2907 /* all tasks in @from are being moved, all csets are source */
3350 for (i = 0; i < heap->size; i++) { 2908 down_read(&css_set_rwsem);
3351 struct task_struct *q = heap->ptrs[i]; 2909 list_for_each_entry(link, &from->cset_links, cset_link)
3352 if (i == 0) { 2910 cgroup_migrate_add_src(link->cset, to, &preloaded_csets);
3353 latest_time = q->start_time; 2911 up_read(&css_set_rwsem);
3354 latest_task = q;
3355 }
3356 /* Process the task per the caller's callback */
3357 process(q, data);
3358 put_task_struct(q);
3359 }
3360 /*
3361 * If we had to process any tasks at all, scan again
3362 * in case some of them were in the middle of forking
3363 * children that didn't get processed.
3364 * Not the most efficient way to do it, but it avoids
3365 * having to take callback_mutex in the fork path
3366 */
3367 goto again;
3368 }
3369 if (heap == &tmp_heap)
3370 heap_free(&tmp_heap);
3371 return 0;
3372}
3373 2912
3374static void cgroup_transfer_one_task(struct task_struct *task, void *data) 2913 ret = cgroup_migrate_prepare_dst(to, &preloaded_csets);
3375{ 2914 if (ret)
3376 struct cgroup *new_cgroup = data; 2915 goto out_err;
3377 2916
3378 mutex_lock(&cgroup_mutex); 2917 /*
3379 cgroup_attach_task(new_cgroup, task, false); 2918 * Migrate tasks one-by-one until @from is empty. This fails iff
2919 * ->can_attach() fails.
2920 */
2921 do {
2922 css_task_iter_start(&from->dummy_css, &it);
2923 task = css_task_iter_next(&it);
2924 if (task)
2925 get_task_struct(task);
2926 css_task_iter_end(&it);
2927
2928 if (task) {
2929 ret = cgroup_migrate(to, task, false);
2930 put_task_struct(task);
2931 }
2932 } while (task && !ret);
2933out_err:
2934 cgroup_migrate_finish(&preloaded_csets);
3380 mutex_unlock(&cgroup_mutex); 2935 mutex_unlock(&cgroup_mutex);
3381} 2936 return ret;
3382
3383/**
3384 * cgroup_transfer_tasks - move tasks from one cgroup to another
3385 * @to: cgroup to which the tasks will be moved
3386 * @from: cgroup in which the tasks currently reside
3387 */
3388int cgroup_transfer_tasks(struct cgroup *to, struct cgroup *from)
3389{
3390 return css_scan_tasks(&from->dummy_css, NULL, cgroup_transfer_one_task,
3391 to, NULL);
3392} 2937}
3393 2938
3394/* 2939/*
@@ -3687,21 +3232,31 @@ static int pidlist_array_load(struct cgroup *cgrp, enum cgroup_filetype type,
3687 */ 3232 */
3688int cgroupstats_build(struct cgroupstats *stats, struct dentry *dentry) 3233int cgroupstats_build(struct cgroupstats *stats, struct dentry *dentry)
3689{ 3234{
3690 int ret = -EINVAL; 3235 struct kernfs_node *kn = kernfs_node_from_dentry(dentry);
3691 struct cgroup *cgrp; 3236 struct cgroup *cgrp;
3692 struct css_task_iter it; 3237 struct css_task_iter it;
3693 struct task_struct *tsk; 3238 struct task_struct *tsk;
3694 3239
3240 /* it should be kernfs_node belonging to cgroupfs and is a directory */
3241 if (dentry->d_sb->s_type != &cgroup_fs_type || !kn ||
3242 kernfs_type(kn) != KERNFS_DIR)
3243 return -EINVAL;
3244
3245 mutex_lock(&cgroup_mutex);
3246
3695 /* 3247 /*
3696 * Validate dentry by checking the superblock operations, 3248 * We aren't being called from kernfs and there's no guarantee on
3697 * and make sure it's a directory. 3249 * @kn->priv's validity. For this and css_tryget_from_dir(),
3250 * @kn->priv is RCU safe. Let's do the RCU dancing.
3698 */ 3251 */
3699 if (dentry->d_sb->s_op != &cgroup_ops || 3252 rcu_read_lock();
3700 !S_ISDIR(dentry->d_inode->i_mode)) 3253 cgrp = rcu_dereference(kn->priv);
3701 goto err; 3254 if (!cgrp || cgroup_is_dead(cgrp)) {
3702 3255 rcu_read_unlock();
3703 ret = 0; 3256 mutex_unlock(&cgroup_mutex);
3704 cgrp = dentry->d_fsdata; 3257 return -ENOENT;
3258 }
3259 rcu_read_unlock();
3705 3260
3706 css_task_iter_start(&cgrp->dummy_css, &it); 3261 css_task_iter_start(&cgrp->dummy_css, &it);
3707 while ((tsk = css_task_iter_next(&it))) { 3262 while ((tsk = css_task_iter_next(&it))) {
@@ -3726,8 +3281,8 @@ int cgroupstats_build(struct cgroupstats *stats, struct dentry *dentry)
3726 } 3281 }
3727 css_task_iter_end(&it); 3282 css_task_iter_end(&it);
3728 3283
3729err: 3284 mutex_unlock(&cgroup_mutex);
3730 return ret; 3285 return 0;
3731} 3286}
3732 3287
3733 3288
@@ -3745,7 +3300,7 @@ static void *cgroup_pidlist_start(struct seq_file *s, loff_t *pos)
3745 * after a seek to the start). Use a binary-search to find the 3300 * after a seek to the start). Use a binary-search to find the
3746 * next pid to display, if any 3301 * next pid to display, if any
3747 */ 3302 */
3748 struct cgroup_open_file *of = s->private; 3303 struct kernfs_open_file *of = s->private;
3749 struct cgroup *cgrp = seq_css(s)->cgroup; 3304 struct cgroup *cgrp = seq_css(s)->cgroup;
3750 struct cgroup_pidlist *l; 3305 struct cgroup_pidlist *l;
3751 enum cgroup_filetype type = seq_cft(s)->private; 3306 enum cgroup_filetype type = seq_cft(s)->private;
@@ -3800,7 +3355,7 @@ static void *cgroup_pidlist_start(struct seq_file *s, loff_t *pos)
3800 3355
3801static void cgroup_pidlist_stop(struct seq_file *s, void *v) 3356static void cgroup_pidlist_stop(struct seq_file *s, void *v)
3802{ 3357{
3803 struct cgroup_open_file *of = s->private; 3358 struct kernfs_open_file *of = s->private;
3804 struct cgroup_pidlist *l = of->priv; 3359 struct cgroup_pidlist *l = of->priv;
3805 3360
3806 if (l) 3361 if (l)
@@ -3811,7 +3366,7 @@ static void cgroup_pidlist_stop(struct seq_file *s, void *v)
3811 3366
3812static void *cgroup_pidlist_next(struct seq_file *s, void *v, loff_t *pos) 3367static void *cgroup_pidlist_next(struct seq_file *s, void *v, loff_t *pos)
3813{ 3368{
3814 struct cgroup_open_file *of = s->private; 3369 struct kernfs_open_file *of = s->private;
3815 struct cgroup_pidlist *l = of->priv; 3370 struct cgroup_pidlist *l = of->priv;
3816 pid_t *p = v; 3371 pid_t *p = v;
3817 pid_t *end = l->list + l->length; 3372 pid_t *end = l->list + l->length;
@@ -3861,23 +3416,6 @@ static int cgroup_write_notify_on_release(struct cgroup_subsys_state *css,
3861 return 0; 3416 return 0;
3862} 3417}
3863 3418
3864/*
3865 * When dput() is called asynchronously, if umount has been done and
3866 * then deactivate_super() in cgroup_free_fn() kills the superblock,
3867 * there's a small window that vfs will see the root dentry with non-zero
3868 * refcnt and trigger BUG().
3869 *
3870 * That's why we hold a reference before dput() and drop it right after.
3871 */
3872static void cgroup_dput(struct cgroup *cgrp)
3873{
3874 struct super_block *sb = cgrp->root->sb;
3875
3876 atomic_inc(&sb->s_active);
3877 dput(cgrp->dentry);
3878 deactivate_super(sb);
3879}
3880
3881static u64 cgroup_clone_children_read(struct cgroup_subsys_state *css, 3419static u64 cgroup_clone_children_read(struct cgroup_subsys_state *css,
3882 struct cftype *cft) 3420 struct cftype *cft)
3883{ 3421{
@@ -3944,7 +3482,7 @@ static struct cftype cgroup_base_files[] = {
3944 .flags = CFTYPE_INSANE | CFTYPE_ONLY_ON_ROOT, 3482 .flags = CFTYPE_INSANE | CFTYPE_ONLY_ON_ROOT,
3945 .seq_show = cgroup_release_agent_show, 3483 .seq_show = cgroup_release_agent_show,
3946 .write_string = cgroup_release_agent_write, 3484 .write_string = cgroup_release_agent_write,
3947 .max_write_len = PATH_MAX, 3485 .max_write_len = PATH_MAX - 1,
3948 }, 3486 },
3949 { } /* terminate */ 3487 { } /* terminate */
3950}; 3488};
@@ -3963,13 +3501,13 @@ static int cgroup_populate_dir(struct cgroup *cgrp, unsigned long subsys_mask)
3963 3501
3964 /* process cftsets of each subsystem */ 3502 /* process cftsets of each subsystem */
3965 for_each_subsys(ss, i) { 3503 for_each_subsys(ss, i) {
3966 struct cftype_set *set; 3504 struct cftype *cfts;
3967 3505
3968 if (!test_bit(i, &subsys_mask)) 3506 if (!test_bit(i, &subsys_mask))
3969 continue; 3507 continue;
3970 3508
3971 list_for_each_entry(set, &ss->cftsets, node) { 3509 list_for_each_entry(cfts, &ss->cfts, node) {
3972 ret = cgroup_addrm_files(cgrp, set->cfts, true); 3510 ret = cgroup_addrm_files(cgrp, cfts, true);
3973 if (ret < 0) 3511 if (ret < 0)
3974 goto err; 3512 goto err;
3975 } 3513 }
@@ -4012,7 +3550,7 @@ static void css_free_work_fn(struct work_struct *work)
4012 css_put(css->parent); 3550 css_put(css->parent);
4013 3551
4014 css->ss->css_free(css); 3552 css->ss->css_free(css);
4015 cgroup_dput(cgrp); 3553 cgroup_put(cgrp);
4016} 3554}
4017 3555
4018static void css_free_rcu_fn(struct rcu_head *rcu_head) 3556static void css_free_rcu_fn(struct rcu_head *rcu_head)
@@ -4020,10 +3558,6 @@ static void css_free_rcu_fn(struct rcu_head *rcu_head)
4020 struct cgroup_subsys_state *css = 3558 struct cgroup_subsys_state *css =
4021 container_of(rcu_head, struct cgroup_subsys_state, rcu_head); 3559 container_of(rcu_head, struct cgroup_subsys_state, rcu_head);
4022 3560
4023 /*
4024 * css holds an extra ref to @cgrp->dentry which is put on the last
4025 * css_put(). dput() requires process context which we don't have.
4026 */
4027 INIT_WORK(&css->destroy_work, css_free_work_fn); 3561 INIT_WORK(&css->destroy_work, css_free_work_fn);
4028 queue_work(cgroup_destroy_wq, &css->destroy_work); 3562 queue_work(cgroup_destroy_wq, &css->destroy_work);
4029} 3563}
@@ -4033,7 +3567,7 @@ static void css_release(struct percpu_ref *ref)
4033 struct cgroup_subsys_state *css = 3567 struct cgroup_subsys_state *css =
4034 container_of(ref, struct cgroup_subsys_state, refcnt); 3568 container_of(ref, struct cgroup_subsys_state, refcnt);
4035 3569
4036 rcu_assign_pointer(css->cgroup->subsys[css->ss->subsys_id], NULL); 3570 RCU_INIT_POINTER(css->cgroup->subsys[css->ss->id], NULL);
4037 call_rcu(&css->rcu_head, css_free_rcu_fn); 3571 call_rcu(&css->rcu_head, css_free_rcu_fn);
4038} 3572}
4039 3573
@@ -4058,6 +3592,7 @@ static int online_css(struct cgroup_subsys_state *css)
4058 struct cgroup_subsys *ss = css->ss; 3592 struct cgroup_subsys *ss = css->ss;
4059 int ret = 0; 3593 int ret = 0;
4060 3594
3595 lockdep_assert_held(&cgroup_tree_mutex);
4061 lockdep_assert_held(&cgroup_mutex); 3596 lockdep_assert_held(&cgroup_mutex);
4062 3597
4063 if (ss->css_online) 3598 if (ss->css_online)
@@ -4065,7 +3600,7 @@ static int online_css(struct cgroup_subsys_state *css)
4065 if (!ret) { 3600 if (!ret) {
4066 css->flags |= CSS_ONLINE; 3601 css->flags |= CSS_ONLINE;
4067 css->cgroup->nr_css++; 3602 css->cgroup->nr_css++;
4068 rcu_assign_pointer(css->cgroup->subsys[ss->subsys_id], css); 3603 rcu_assign_pointer(css->cgroup->subsys[ss->id], css);
4069 } 3604 }
4070 return ret; 3605 return ret;
4071} 3606}
@@ -4075,6 +3610,7 @@ static void offline_css(struct cgroup_subsys_state *css)
4075{ 3610{
4076 struct cgroup_subsys *ss = css->ss; 3611 struct cgroup_subsys *ss = css->ss;
4077 3612
3613 lockdep_assert_held(&cgroup_tree_mutex);
4078 lockdep_assert_held(&cgroup_mutex); 3614 lockdep_assert_held(&cgroup_mutex);
4079 3615
4080 if (!(css->flags & CSS_ONLINE)) 3616 if (!(css->flags & CSS_ONLINE))
@@ -4085,7 +3621,7 @@ static void offline_css(struct cgroup_subsys_state *css)
4085 3621
4086 css->flags &= ~CSS_ONLINE; 3622 css->flags &= ~CSS_ONLINE;
4087 css->cgroup->nr_css--; 3623 css->cgroup->nr_css--;
4088 RCU_INIT_POINTER(css->cgroup->subsys[ss->subsys_id], css); 3624 RCU_INIT_POINTER(css->cgroup->subsys[ss->id], css);
4089} 3625}
4090 3626
4091/** 3627/**
@@ -4103,7 +3639,6 @@ static int create_css(struct cgroup *cgrp, struct cgroup_subsys *ss)
4103 struct cgroup_subsys_state *css; 3639 struct cgroup_subsys_state *css;
4104 int err; 3640 int err;
4105 3641
4106 lockdep_assert_held(&cgrp->dentry->d_inode->i_mutex);
4107 lockdep_assert_held(&cgroup_mutex); 3642 lockdep_assert_held(&cgroup_mutex);
4108 3643
4109 css = ss->css_alloc(cgroup_css(parent, ss)); 3644 css = ss->css_alloc(cgroup_css(parent, ss));
@@ -4116,7 +3651,7 @@ static int create_css(struct cgroup *cgrp, struct cgroup_subsys *ss)
4116 3651
4117 init_css(css, ss, cgrp); 3652 init_css(css, ss, cgrp);
4118 3653
4119 err = cgroup_populate_dir(cgrp, 1 << ss->subsys_id); 3654 err = cgroup_populate_dir(cgrp, 1 << ss->id);
4120 if (err) 3655 if (err)
4121 goto err_free_percpu_ref; 3656 goto err_free_percpu_ref;
4122 3657
@@ -4124,9 +3659,11 @@ static int create_css(struct cgroup *cgrp, struct cgroup_subsys *ss)
4124 if (err) 3659 if (err)
4125 goto err_clear_dir; 3660 goto err_clear_dir;
4126 3661
4127 dget(cgrp->dentry); 3662 cgroup_get(cgrp);
4128 css_get(css->parent); 3663 css_get(css->parent);
4129 3664
3665 cgrp->subsys_mask |= 1 << ss->id;
3666
4130 if (ss->broken_hierarchy && !ss->warned_broken_hierarchy && 3667 if (ss->broken_hierarchy && !ss->warned_broken_hierarchy &&
4131 parent->parent) { 3668 parent->parent) {
4132 pr_warning("cgroup: %s (%d) created nested cgroup for controller \"%s\" which has incomplete hierarchy support. Nested cgroups may change behavior in the future.\n", 3669 pr_warning("cgroup: %s (%d) created nested cgroup for controller \"%s\" which has incomplete hierarchy support. Nested cgroups may change behavior in the future.\n",
@@ -4139,7 +3676,7 @@ static int create_css(struct cgroup *cgrp, struct cgroup_subsys *ss)
4139 return 0; 3676 return 0;
4140 3677
4141err_clear_dir: 3678err_clear_dir:
4142 cgroup_clear_dir(css->cgroup, 1 << css->ss->subsys_id); 3679 cgroup_clear_dir(css->cgroup, 1 << css->ss->id);
4143err_free_percpu_ref: 3680err_free_percpu_ref:
4144 percpu_ref_cancel_init(&css->refcnt); 3681 percpu_ref_cancel_init(&css->refcnt);
4145err_free_css: 3682err_free_css:
@@ -4147,35 +3684,34 @@ err_free_css:
4147 return err; 3684 return err;
4148} 3685}
4149 3686
4150/* 3687/**
4151 * cgroup_create - create a cgroup 3688 * cgroup_create - create a cgroup
4152 * @parent: cgroup that will be parent of the new cgroup 3689 * @parent: cgroup that will be parent of the new cgroup
4153 * @dentry: dentry of the new cgroup 3690 * @name: name of the new cgroup
4154 * @mode: mode to set on new inode 3691 * @mode: mode to set on new cgroup
4155 *
4156 * Must be called with the mutex on the parent inode held
4157 */ 3692 */
4158static long cgroup_create(struct cgroup *parent, struct dentry *dentry, 3693static long cgroup_create(struct cgroup *parent, const char *name,
4159 umode_t mode) 3694 umode_t mode)
4160{ 3695{
4161 struct cgroup *cgrp; 3696 struct cgroup *cgrp;
4162 struct cgroup_name *name; 3697 struct cgroup_root *root = parent->root;
4163 struct cgroupfs_root *root = parent->root;
4164 int ssid, err; 3698 int ssid, err;
4165 struct cgroup_subsys *ss; 3699 struct cgroup_subsys *ss;
4166 struct super_block *sb = root->sb; 3700 struct kernfs_node *kn;
3701
3702 /*
3703 * XXX: The default hierarchy isn't fully implemented yet. Block
3704 * !root cgroup creation on it for now.
3705 */
3706 if (root == &cgrp_dfl_root)
3707 return -EINVAL;
4167 3708
4168 /* allocate the cgroup and its ID, 0 is reserved for the root */ 3709 /* allocate the cgroup and its ID, 0 is reserved for the root */
4169 cgrp = kzalloc(sizeof(*cgrp), GFP_KERNEL); 3710 cgrp = kzalloc(sizeof(*cgrp), GFP_KERNEL);
4170 if (!cgrp) 3711 if (!cgrp)
4171 return -ENOMEM; 3712 return -ENOMEM;
4172 3713
4173 name = cgroup_alloc_name(dentry); 3714 mutex_lock(&cgroup_tree_mutex);
4174 if (!name) {
4175 err = -ENOMEM;
4176 goto err_free_cgrp;
4177 }
4178 rcu_assign_pointer(cgrp->name, name);
4179 3715
4180 /* 3716 /*
4181 * Only live parents can have children. Note that the liveliness 3717 * Only live parents can have children. Note that the liveliness
@@ -4186,7 +3722,7 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry,
4186 */ 3722 */
4187 if (!cgroup_lock_live_group(parent)) { 3723 if (!cgroup_lock_live_group(parent)) {
4188 err = -ENODEV; 3724 err = -ENODEV;
4189 goto err_free_name; 3725 goto err_unlock_tree;
4190 } 3726 }
4191 3727
4192 /* 3728 /*
@@ -4199,18 +3735,8 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry,
4199 goto err_unlock; 3735 goto err_unlock;
4200 } 3736 }
4201 3737
4202 /* Grab a reference on the superblock so the hierarchy doesn't
4203 * get deleted on unmount if there are child cgroups. This
4204 * can be done outside cgroup_mutex, since the sb can't
4205 * disappear while someone has an open control file on the
4206 * fs */
4207 atomic_inc(&sb->s_active);
4208
4209 init_cgroup_housekeeping(cgrp); 3738 init_cgroup_housekeeping(cgrp);
4210 3739
4211 dentry->d_fsdata = cgrp;
4212 cgrp->dentry = dentry;
4213
4214 cgrp->parent = parent; 3740 cgrp->parent = parent;
4215 cgrp->dummy_css.parent = &parent->dummy_css; 3741 cgrp->dummy_css.parent = &parent->dummy_css;
4216 cgrp->root = parent->root; 3742 cgrp->root = parent->root;
@@ -4221,24 +3747,26 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry,
4221 if (test_bit(CGRP_CPUSET_CLONE_CHILDREN, &parent->flags)) 3747 if (test_bit(CGRP_CPUSET_CLONE_CHILDREN, &parent->flags))
4222 set_bit(CGRP_CPUSET_CLONE_CHILDREN, &cgrp->flags); 3748 set_bit(CGRP_CPUSET_CLONE_CHILDREN, &cgrp->flags);
4223 3749
3750 /* create the directory */
3751 kn = kernfs_create_dir(parent->kn, name, mode, cgrp);
3752 if (IS_ERR(kn)) {
3753 err = PTR_ERR(kn);
3754 goto err_free_id;
3755 }
3756 cgrp->kn = kn;
3757
4224 /* 3758 /*
4225 * Create directory. cgroup_create_file() returns with the new 3759 * This extra ref will be put in cgroup_free_fn() and guarantees
4226 * directory locked on success so that it can be populated without 3760 * that @cgrp->kn is always accessible.
4227 * dropping cgroup_mutex.
4228 */ 3761 */
4229 err = cgroup_create_file(dentry, S_IFDIR | mode, sb); 3762 kernfs_get(kn);
4230 if (err < 0)
4231 goto err_free_id;
4232 lockdep_assert_held(&dentry->d_inode->i_mutex);
4233 3763
4234 cgrp->serial_nr = cgroup_serial_nr_next++; 3764 cgrp->serial_nr = cgroup_serial_nr_next++;
4235 3765
4236 /* allocation complete, commit to creation */ 3766 /* allocation complete, commit to creation */
4237 list_add_tail_rcu(&cgrp->sibling, &cgrp->parent->children); 3767 list_add_tail_rcu(&cgrp->sibling, &cgrp->parent->children);
4238 root->number_of_cgroups++; 3768 atomic_inc(&root->nr_cgrps);
4239 3769 cgroup_get(parent);
4240 /* hold a ref to the parent's dentry */
4241 dget(parent->dentry);
4242 3770
4243 /* 3771 /*
4244 * @cgrp is now fully operational. If something fails after this 3772 * @cgrp is now fully operational. If something fails after this
@@ -4246,49 +3774,66 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry,
4246 */ 3774 */
4247 idr_replace(&root->cgroup_idr, cgrp, cgrp->id); 3775 idr_replace(&root->cgroup_idr, cgrp, cgrp->id);
4248 3776
3777 err = cgroup_kn_set_ugid(kn);
3778 if (err)
3779 goto err_destroy;
3780
4249 err = cgroup_addrm_files(cgrp, cgroup_base_files, true); 3781 err = cgroup_addrm_files(cgrp, cgroup_base_files, true);
4250 if (err) 3782 if (err)
4251 goto err_destroy; 3783 goto err_destroy;
4252 3784
4253 /* let's create and online css's */ 3785 /* let's create and online css's */
4254 for_each_subsys(ss, ssid) { 3786 for_each_subsys(ss, ssid) {
4255 if (root->subsys_mask & (1 << ssid)) { 3787 if (root->cgrp.subsys_mask & (1 << ssid)) {
4256 err = create_css(cgrp, ss); 3788 err = create_css(cgrp, ss);
4257 if (err) 3789 if (err)
4258 goto err_destroy; 3790 goto err_destroy;
4259 } 3791 }
4260 } 3792 }
4261 3793
3794 kernfs_activate(kn);
3795
4262 mutex_unlock(&cgroup_mutex); 3796 mutex_unlock(&cgroup_mutex);
4263 mutex_unlock(&cgrp->dentry->d_inode->i_mutex); 3797 mutex_unlock(&cgroup_tree_mutex);
4264 3798
4265 return 0; 3799 return 0;
4266 3800
4267err_free_id: 3801err_free_id:
4268 idr_remove(&root->cgroup_idr, cgrp->id); 3802 idr_remove(&root->cgroup_idr, cgrp->id);
4269 /* Release the reference count that we took on the superblock */
4270 deactivate_super(sb);
4271err_unlock: 3803err_unlock:
4272 mutex_unlock(&cgroup_mutex); 3804 mutex_unlock(&cgroup_mutex);
4273err_free_name: 3805err_unlock_tree:
4274 kfree(rcu_dereference_raw(cgrp->name)); 3806 mutex_unlock(&cgroup_tree_mutex);
4275err_free_cgrp:
4276 kfree(cgrp); 3807 kfree(cgrp);
4277 return err; 3808 return err;
4278 3809
4279err_destroy: 3810err_destroy:
4280 cgroup_destroy_locked(cgrp); 3811 cgroup_destroy_locked(cgrp);
4281 mutex_unlock(&cgroup_mutex); 3812 mutex_unlock(&cgroup_mutex);
4282 mutex_unlock(&dentry->d_inode->i_mutex); 3813 mutex_unlock(&cgroup_tree_mutex);
4283 return err; 3814 return err;
4284} 3815}
4285 3816
4286static int cgroup_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) 3817static int cgroup_mkdir(struct kernfs_node *parent_kn, const char *name,
3818 umode_t mode)
4287{ 3819{
4288 struct cgroup *c_parent = dentry->d_parent->d_fsdata; 3820 struct cgroup *parent = parent_kn->priv;
3821 int ret;
3822
3823 /*
3824 * cgroup_create() grabs cgroup_tree_mutex which nests outside
3825 * kernfs active_ref and cgroup_create() already synchronizes
3826 * properly against removal through cgroup_lock_live_group().
3827 * Break it before calling cgroup_create().
3828 */
3829 cgroup_get(parent);
3830 kernfs_break_active_protection(parent_kn);
3831
3832 ret = cgroup_create(parent, name, mode);
4289 3833
4290 /* the vfs holds inode->i_mutex already */ 3834 kernfs_unbreak_active_protection(parent_kn);
4291 return cgroup_create(c_parent, dentry, mode | S_IFDIR); 3835 cgroup_put(parent);
3836 return ret;
4292} 3837}
4293 3838
4294/* 3839/*
@@ -4301,6 +3846,7 @@ static void css_killed_work_fn(struct work_struct *work)
4301 container_of(work, struct cgroup_subsys_state, destroy_work); 3846 container_of(work, struct cgroup_subsys_state, destroy_work);
4302 struct cgroup *cgrp = css->cgroup; 3847 struct cgroup *cgrp = css->cgroup;
4303 3848
3849 mutex_lock(&cgroup_tree_mutex);
4304 mutex_lock(&cgroup_mutex); 3850 mutex_lock(&cgroup_mutex);
4305 3851
4306 /* 3852 /*
@@ -4318,6 +3864,7 @@ static void css_killed_work_fn(struct work_struct *work)
4318 cgroup_destroy_css_killed(cgrp); 3864 cgroup_destroy_css_killed(cgrp);
4319 3865
4320 mutex_unlock(&cgroup_mutex); 3866 mutex_unlock(&cgroup_mutex);
3867 mutex_unlock(&cgroup_tree_mutex);
4321 3868
4322 /* 3869 /*
4323 * Put the css refs from kill_css(). Each css holds an extra 3870 * Put the css refs from kill_css(). Each css holds an extra
@@ -4339,18 +3886,15 @@ static void css_killed_ref_fn(struct percpu_ref *ref)
4339 queue_work(cgroup_destroy_wq, &css->destroy_work); 3886 queue_work(cgroup_destroy_wq, &css->destroy_work);
4340} 3887}
4341 3888
4342/** 3889static void __kill_css(struct cgroup_subsys_state *css)
4343 * kill_css - destroy a css
4344 * @css: css to destroy
4345 *
4346 * This function initiates destruction of @css by removing cgroup interface
4347 * files and putting its base reference. ->css_offline() will be invoked
4348 * asynchronously once css_tryget() is guaranteed to fail and when the
4349 * reference count reaches zero, @css will be released.
4350 */
4351static void kill_css(struct cgroup_subsys_state *css)
4352{ 3890{
4353 cgroup_clear_dir(css->cgroup, 1 << css->ss->subsys_id); 3891 lockdep_assert_held(&cgroup_tree_mutex);
3892
3893 /*
3894 * This must happen before css is disassociated with its cgroup.
3895 * See seq_css() for details.
3896 */
3897 cgroup_clear_dir(css->cgroup, 1 << css->ss->id);
4354 3898
4355 /* 3899 /*
4356 * Killing would put the base ref, but we need to keep it alive 3900 * Killing would put the base ref, but we need to keep it alive
@@ -4372,6 +3916,28 @@ static void kill_css(struct cgroup_subsys_state *css)
4372} 3916}
4373 3917
4374/** 3918/**
3919 * kill_css - destroy a css
3920 * @css: css to destroy
3921 *
3922 * This function initiates destruction of @css by removing cgroup interface
3923 * files and putting its base reference. ->css_offline() will be invoked
3924 * asynchronously once css_tryget() is guaranteed to fail and when the
3925 * reference count reaches zero, @css will be released.
3926 */
3927static void kill_css(struct cgroup_subsys_state *css)
3928{
3929 struct cgroup *cgrp = css->cgroup;
3930
3931 lockdep_assert_held(&cgroup_tree_mutex);
3932
3933 /* if already killed, noop */
3934 if (cgrp->subsys_mask & (1 << css->ss->id)) {
3935 cgrp->subsys_mask &= ~(1 << css->ss->id);
3936 __kill_css(css);
3937 }
3938}
3939
3940/**
4375 * cgroup_destroy_locked - the first stage of cgroup destruction 3941 * cgroup_destroy_locked - the first stage of cgroup destruction
4376 * @cgrp: cgroup to be destroyed 3942 * @cgrp: cgroup to be destroyed
4377 * 3943 *
@@ -4398,22 +3964,21 @@ static void kill_css(struct cgroup_subsys_state *css)
4398static int cgroup_destroy_locked(struct cgroup *cgrp) 3964static int cgroup_destroy_locked(struct cgroup *cgrp)
4399 __releases(&cgroup_mutex) __acquires(&cgroup_mutex) 3965 __releases(&cgroup_mutex) __acquires(&cgroup_mutex)
4400{ 3966{
4401 struct dentry *d = cgrp->dentry;
4402 struct cgroup_subsys_state *css;
4403 struct cgroup *child; 3967 struct cgroup *child;
3968 struct cgroup_subsys_state *css;
4404 bool empty; 3969 bool empty;
4405 int ssid; 3970 int ssid;
4406 3971
4407 lockdep_assert_held(&d->d_inode->i_mutex); 3972 lockdep_assert_held(&cgroup_tree_mutex);
4408 lockdep_assert_held(&cgroup_mutex); 3973 lockdep_assert_held(&cgroup_mutex);
4409 3974
4410 /* 3975 /*
4411 * css_set_lock synchronizes access to ->cset_links and prevents 3976 * css_set_rwsem synchronizes access to ->cset_links and prevents
4412 * @cgrp from being removed while __put_css_set() is in progress. 3977 * @cgrp from being removed while put_css_set() is in progress.
4413 */ 3978 */
4414 read_lock(&css_set_lock); 3979 down_read(&css_set_rwsem);
4415 empty = list_empty(&cgrp->cset_links); 3980 empty = list_empty(&cgrp->cset_links);
4416 read_unlock(&css_set_lock); 3981 up_read(&css_set_rwsem);
4417 if (!empty) 3982 if (!empty)
4418 return -EBUSY; 3983 return -EBUSY;
4419 3984
@@ -4434,14 +3999,6 @@ static int cgroup_destroy_locked(struct cgroup *cgrp)
4434 return -EBUSY; 3999 return -EBUSY;
4435 4000
4436 /* 4001 /*
4437 * Initiate massacre of all css's. cgroup_destroy_css_killed()
4438 * will be invoked to perform the rest of destruction once the
4439 * percpu refs of all css's are confirmed to be killed.
4440 */
4441 for_each_css(css, ssid, cgrp)
4442 kill_css(css);
4443
4444 /*
4445 * Mark @cgrp dead. This prevents further task migration and child 4002 * Mark @cgrp dead. This prevents further task migration and child
4446 * creation by disabling cgroup_lock_live_group(). Note that 4003 * creation by disabling cgroup_lock_live_group(). Note that
4447 * CGRP_DEAD assertion is depended upon by css_next_child() to 4004 * CGRP_DEAD assertion is depended upon by css_next_child() to
@@ -4450,6 +4007,17 @@ static int cgroup_destroy_locked(struct cgroup *cgrp)
4450 */ 4007 */
4451 set_bit(CGRP_DEAD, &cgrp->flags); 4008 set_bit(CGRP_DEAD, &cgrp->flags);
4452 4009
4010 /*
4011 * Initiate massacre of all css's. cgroup_destroy_css_killed()
4012 * will be invoked to perform the rest of destruction once the
4013 * percpu refs of all css's are confirmed to be killed. This
4014 * involves removing the subsystem's files, drop cgroup_mutex.
4015 */
4016 mutex_unlock(&cgroup_mutex);
4017 for_each_css(css, ssid, cgrp)
4018 kill_css(css);
4019 mutex_lock(&cgroup_mutex);
4020
4453 /* CGRP_DEAD is set, remove from ->release_list for the last time */ 4021 /* CGRP_DEAD is set, remove from ->release_list for the last time */
4454 raw_spin_lock(&release_list_lock); 4022 raw_spin_lock(&release_list_lock);
4455 if (!list_empty(&cgrp->release_list)) 4023 if (!list_empty(&cgrp->release_list))
@@ -4465,14 +4033,20 @@ static int cgroup_destroy_locked(struct cgroup *cgrp)
4465 if (!cgrp->nr_css) 4033 if (!cgrp->nr_css)
4466 cgroup_destroy_css_killed(cgrp); 4034 cgroup_destroy_css_killed(cgrp);
4467 4035
4036 /* remove @cgrp directory along with the base files */
4037 mutex_unlock(&cgroup_mutex);
4038
4468 /* 4039 /*
4469 * Clear the base files and remove @cgrp directory. The removal 4040 * There are two control paths which try to determine cgroup from
4470 * puts the base ref but we aren't quite done with @cgrp yet, so 4041 * dentry without going through kernfs - cgroupstats_build() and
4471 * hold onto it. 4042 * css_tryget_from_dir(). Those are supported by RCU protecting
4043 * clearing of cgrp->kn->priv backpointer, which should happen
4044 * after all files under it have been removed.
4472 */ 4045 */
4473 cgroup_addrm_files(cgrp, cgroup_base_files, false); 4046 kernfs_remove(cgrp->kn); /* @cgrp has an extra ref on its kn */
4474 dget(d); 4047 RCU_INIT_POINTER(*(void __rcu __force **)&cgrp->kn->priv, NULL);
4475 cgroup_d_remove_dir(d); 4048
4049 mutex_lock(&cgroup_mutex);
4476 4050
4477 return 0; 4051 return 0;
4478}; 4052};
@@ -4489,72 +4063,82 @@ static int cgroup_destroy_locked(struct cgroup *cgrp)
4489static void cgroup_destroy_css_killed(struct cgroup *cgrp) 4063static void cgroup_destroy_css_killed(struct cgroup *cgrp)
4490{ 4064{
4491 struct cgroup *parent = cgrp->parent; 4065 struct cgroup *parent = cgrp->parent;
4492 struct dentry *d = cgrp->dentry;
4493 4066
4067 lockdep_assert_held(&cgroup_tree_mutex);
4494 lockdep_assert_held(&cgroup_mutex); 4068 lockdep_assert_held(&cgroup_mutex);
4495 4069
4496 /* delete this cgroup from parent->children */ 4070 /* delete this cgroup from parent->children */
4497 list_del_rcu(&cgrp->sibling); 4071 list_del_rcu(&cgrp->sibling);
4498 4072
4499 dput(d); 4073 cgroup_put(cgrp);
4500 4074
4501 set_bit(CGRP_RELEASABLE, &parent->flags); 4075 set_bit(CGRP_RELEASABLE, &parent->flags);
4502 check_for_release(parent); 4076 check_for_release(parent);
4503} 4077}
4504 4078
4505static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry) 4079static int cgroup_rmdir(struct kernfs_node *kn)
4506{ 4080{
4507 int ret; 4081 struct cgroup *cgrp = kn->priv;
4508 4082 int ret = 0;
4509 mutex_lock(&cgroup_mutex);
4510 ret = cgroup_destroy_locked(dentry->d_fsdata);
4511 mutex_unlock(&cgroup_mutex);
4512 4083
4513 return ret; 4084 /*
4514} 4085 * This is self-destruction but @kn can't be removed while this
4086 * callback is in progress. Let's break active protection. Once
4087 * the protection is broken, @cgrp can be destroyed at any point.
4088 * Pin it so that it stays accessible.
4089 */
4090 cgroup_get(cgrp);
4091 kernfs_break_active_protection(kn);
4515 4092
4516static void __init_or_module cgroup_init_cftsets(struct cgroup_subsys *ss) 4093 mutex_lock(&cgroup_tree_mutex);
4517{ 4094 mutex_lock(&cgroup_mutex);
4518 INIT_LIST_HEAD(&ss->cftsets);
4519 4095
4520 /* 4096 /*
4521 * base_cftset is embedded in subsys itself, no need to worry about 4097 * @cgrp might already have been destroyed while we're trying to
4522 * deregistration. 4098 * grab the mutexes.
4523 */ 4099 */
4524 if (ss->base_cftypes) { 4100 if (!cgroup_is_dead(cgrp))
4525 struct cftype *cft; 4101 ret = cgroup_destroy_locked(cgrp);
4526 4102
4527 for (cft = ss->base_cftypes; cft->name[0] != '\0'; cft++) 4103 mutex_unlock(&cgroup_mutex);
4528 cft->ss = ss; 4104 mutex_unlock(&cgroup_tree_mutex);
4529 4105
4530 ss->base_cftset.cfts = ss->base_cftypes; 4106 kernfs_unbreak_active_protection(kn);
4531 list_add_tail(&ss->base_cftset.node, &ss->cftsets); 4107 cgroup_put(cgrp);
4532 } 4108 return ret;
4533} 4109}
4534 4110
4111static struct kernfs_syscall_ops cgroup_kf_syscall_ops = {
4112 .remount_fs = cgroup_remount,
4113 .show_options = cgroup_show_options,
4114 .mkdir = cgroup_mkdir,
4115 .rmdir = cgroup_rmdir,
4116 .rename = cgroup_rename,
4117};
4118
4535static void __init cgroup_init_subsys(struct cgroup_subsys *ss) 4119static void __init cgroup_init_subsys(struct cgroup_subsys *ss)
4536{ 4120{
4537 struct cgroup_subsys_state *css; 4121 struct cgroup_subsys_state *css;
4538 4122
4539 printk(KERN_INFO "Initializing cgroup subsys %s\n", ss->name); 4123 printk(KERN_INFO "Initializing cgroup subsys %s\n", ss->name);
4540 4124
4125 mutex_lock(&cgroup_tree_mutex);
4541 mutex_lock(&cgroup_mutex); 4126 mutex_lock(&cgroup_mutex);
4542 4127
4543 /* init base cftset */ 4128 INIT_LIST_HEAD(&ss->cfts);
4544 cgroup_init_cftsets(ss);
4545 4129
4546 /* Create the top cgroup state for this subsystem */ 4130 /* Create the root cgroup state for this subsystem */
4547 ss->root = &cgroup_dummy_root; 4131 ss->root = &cgrp_dfl_root;
4548 css = ss->css_alloc(cgroup_css(cgroup_dummy_top, ss)); 4132 css = ss->css_alloc(cgroup_css(&cgrp_dfl_root.cgrp, ss));
4549 /* We don't handle early failures gracefully */ 4133 /* We don't handle early failures gracefully */
4550 BUG_ON(IS_ERR(css)); 4134 BUG_ON(IS_ERR(css));
4551 init_css(css, ss, cgroup_dummy_top); 4135 init_css(css, ss, &cgrp_dfl_root.cgrp);
4552 4136
4553 /* Update the init_css_set to contain a subsys 4137 /* Update the init_css_set to contain a subsys
4554 * pointer to this state - since the subsystem is 4138 * pointer to this state - since the subsystem is
4555 * newly registered, all tasks and hence the 4139 * newly registered, all tasks and hence the
4556 * init_css_set is in the subsystem's top cgroup. */ 4140 * init_css_set is in the subsystem's root cgroup. */
4557 init_css_set.subsys[ss->subsys_id] = css; 4141 init_css_set.subsys[ss->id] = css;
4558 4142
4559 need_forkexit_callback |= ss->fork || ss->exit; 4143 need_forkexit_callback |= ss->fork || ss->exit;
4560 4144
@@ -4565,185 +4149,11 @@ static void __init cgroup_init_subsys(struct cgroup_subsys *ss)
4565 4149
4566 BUG_ON(online_css(css)); 4150 BUG_ON(online_css(css));
4567 4151
4568 mutex_unlock(&cgroup_mutex); 4152 cgrp_dfl_root.cgrp.subsys_mask |= 1 << ss->id;
4569
4570 /* this function shouldn't be used with modular subsystems, since they
4571 * need to register a subsys_id, among other things */
4572 BUG_ON(ss->module);
4573}
4574
4575/**
4576 * cgroup_load_subsys: load and register a modular subsystem at runtime
4577 * @ss: the subsystem to load
4578 *
4579 * This function should be called in a modular subsystem's initcall. If the
4580 * subsystem is built as a module, it will be assigned a new subsys_id and set
4581 * up for use. If the subsystem is built-in anyway, work is delegated to the
4582 * simpler cgroup_init_subsys.
4583 */
4584int __init_or_module cgroup_load_subsys(struct cgroup_subsys *ss)
4585{
4586 struct cgroup_subsys_state *css;
4587 int i, ret;
4588 struct hlist_node *tmp;
4589 struct css_set *cset;
4590 unsigned long key;
4591
4592 /* check name and function validity */
4593 if (ss->name == NULL || strlen(ss->name) > MAX_CGROUP_TYPE_NAMELEN ||
4594 ss->css_alloc == NULL || ss->css_free == NULL)
4595 return -EINVAL;
4596
4597 /*
4598 * we don't support callbacks in modular subsystems. this check is
4599 * before the ss->module check for consistency; a subsystem that could
4600 * be a module should still have no callbacks even if the user isn't
4601 * compiling it as one.
4602 */
4603 if (ss->fork || ss->exit)
4604 return -EINVAL;
4605
4606 /*
4607 * an optionally modular subsystem is built-in: we want to do nothing,
4608 * since cgroup_init_subsys will have already taken care of it.
4609 */
4610 if (ss->module == NULL) {
4611 /* a sanity check */
4612 BUG_ON(cgroup_subsys[ss->subsys_id] != ss);
4613 return 0;
4614 }
4615
4616 /* init base cftset */
4617 cgroup_init_cftsets(ss);
4618
4619 mutex_lock(&cgroup_mutex);
4620 mutex_lock(&cgroup_root_mutex);
4621 cgroup_subsys[ss->subsys_id] = ss;
4622
4623 /*
4624 * no ss->css_alloc seems to need anything important in the ss
4625 * struct, so this can happen first (i.e. before the dummy root
4626 * attachment).
4627 */
4628 css = ss->css_alloc(cgroup_css(cgroup_dummy_top, ss));
4629 if (IS_ERR(css)) {
4630 /* failure case - need to deassign the cgroup_subsys[] slot. */
4631 cgroup_subsys[ss->subsys_id] = NULL;
4632 mutex_unlock(&cgroup_root_mutex);
4633 mutex_unlock(&cgroup_mutex);
4634 return PTR_ERR(css);
4635 }
4636
4637 ss->root = &cgroup_dummy_root;
4638
4639 /* our new subsystem will be attached to the dummy hierarchy. */
4640 init_css(css, ss, cgroup_dummy_top);
4641
4642 /*
4643 * Now we need to entangle the css into the existing css_sets. unlike
4644 * in cgroup_init_subsys, there are now multiple css_sets, so each one
4645 * will need a new pointer to it; done by iterating the css_set_table.
4646 * furthermore, modifying the existing css_sets will corrupt the hash
4647 * table state, so each changed css_set will need its hash recomputed.
4648 * this is all done under the css_set_lock.
4649 */
4650 write_lock(&css_set_lock);
4651 hash_for_each_safe(css_set_table, i, tmp, cset, hlist) {
4652 /* skip entries that we already rehashed */
4653 if (cset->subsys[ss->subsys_id])
4654 continue;
4655 /* remove existing entry */
4656 hash_del(&cset->hlist);
4657 /* set new value */
4658 cset->subsys[ss->subsys_id] = css;
4659 /* recompute hash and restore entry */
4660 key = css_set_hash(cset->subsys);
4661 hash_add(css_set_table, &cset->hlist, key);
4662 }
4663 write_unlock(&css_set_lock);
4664
4665 ret = online_css(css);
4666 if (ret) {
4667 ss->css_free(css);
4668 goto err_unload;
4669 }
4670
4671 /* success! */
4672 mutex_unlock(&cgroup_root_mutex);
4673 mutex_unlock(&cgroup_mutex);
4674 return 0;
4675
4676err_unload:
4677 mutex_unlock(&cgroup_root_mutex);
4678 mutex_unlock(&cgroup_mutex);
4679 /* @ss can't be mounted here as try_module_get() would fail */
4680 cgroup_unload_subsys(ss);
4681 return ret;
4682}
4683EXPORT_SYMBOL_GPL(cgroup_load_subsys);
4684
4685/**
4686 * cgroup_unload_subsys: unload a modular subsystem
4687 * @ss: the subsystem to unload
4688 *
4689 * This function should be called in a modular subsystem's exitcall. When this
4690 * function is invoked, the refcount on the subsystem's module will be 0, so
4691 * the subsystem will not be attached to any hierarchy.
4692 */
4693void cgroup_unload_subsys(struct cgroup_subsys *ss)
4694{
4695 struct cgrp_cset_link *link;
4696 struct cgroup_subsys_state *css;
4697
4698 BUG_ON(ss->module == NULL);
4699
4700 /*
4701 * we shouldn't be called if the subsystem is in use, and the use of
4702 * try_module_get() in rebind_subsystems() should ensure that it
4703 * doesn't start being used while we're killing it off.
4704 */
4705 BUG_ON(ss->root != &cgroup_dummy_root);
4706
4707 mutex_lock(&cgroup_mutex);
4708 mutex_lock(&cgroup_root_mutex);
4709
4710 css = cgroup_css(cgroup_dummy_top, ss);
4711 if (css)
4712 offline_css(css);
4713 4153
4714 /* deassign the subsys_id */
4715 cgroup_subsys[ss->subsys_id] = NULL;
4716
4717 /*
4718 * disentangle the css from all css_sets attached to the dummy
4719 * top. as in loading, we need to pay our respects to the hashtable
4720 * gods.
4721 */
4722 write_lock(&css_set_lock);
4723 list_for_each_entry(link, &cgroup_dummy_top->cset_links, cset_link) {
4724 struct css_set *cset = link->cset;
4725 unsigned long key;
4726
4727 hash_del(&cset->hlist);
4728 cset->subsys[ss->subsys_id] = NULL;
4729 key = css_set_hash(cset->subsys);
4730 hash_add(css_set_table, &cset->hlist, key);
4731 }
4732 write_unlock(&css_set_lock);
4733
4734 /*
4735 * remove subsystem's css from the cgroup_dummy_top and free it -
4736 * need to free before marking as null because ss->css_free needs
4737 * the cgrp->subsys pointer to find their state.
4738 */
4739 if (css)
4740 ss->css_free(css);
4741 RCU_INIT_POINTER(cgroup_dummy_top->subsys[ss->subsys_id], NULL);
4742
4743 mutex_unlock(&cgroup_root_mutex);
4744 mutex_unlock(&cgroup_mutex); 4154 mutex_unlock(&cgroup_mutex);
4155 mutex_unlock(&cgroup_tree_mutex);
4745} 4156}
4746EXPORT_SYMBOL_GPL(cgroup_unload_subsys);
4747 4157
4748/** 4158/**
4749 * cgroup_init_early - cgroup initialization at system boot 4159 * cgroup_init_early - cgroup initialization at system boot
@@ -4753,34 +4163,24 @@ EXPORT_SYMBOL_GPL(cgroup_unload_subsys);
4753 */ 4163 */
4754int __init cgroup_init_early(void) 4164int __init cgroup_init_early(void)
4755{ 4165{
4166 static struct cgroup_sb_opts __initdata opts =
4167 { .flags = CGRP_ROOT_SANE_BEHAVIOR };
4756 struct cgroup_subsys *ss; 4168 struct cgroup_subsys *ss;
4757 int i; 4169 int i;
4758 4170
4759 atomic_set(&init_css_set.refcount, 1); 4171 init_cgroup_root(&cgrp_dfl_root, &opts);
4760 INIT_LIST_HEAD(&init_css_set.cgrp_links);
4761 INIT_LIST_HEAD(&init_css_set.tasks);
4762 INIT_HLIST_NODE(&init_css_set.hlist);
4763 css_set_count = 1;
4764 init_cgroup_root(&cgroup_dummy_root);
4765 cgroup_root_count = 1;
4766 RCU_INIT_POINTER(init_task.cgroups, &init_css_set); 4172 RCU_INIT_POINTER(init_task.cgroups, &init_css_set);
4767 4173
4768 init_cgrp_cset_link.cset = &init_css_set; 4174 for_each_subsys(ss, i) {
4769 init_cgrp_cset_link.cgrp = cgroup_dummy_top; 4175 WARN(!ss->css_alloc || !ss->css_free || ss->name || ss->id,
4770 list_add(&init_cgrp_cset_link.cset_link, &cgroup_dummy_top->cset_links); 4176 "invalid cgroup_subsys %d:%s css_alloc=%p css_free=%p name:id=%d:%s\n",
4771 list_add(&init_cgrp_cset_link.cgrp_link, &init_css_set.cgrp_links); 4177 i, cgroup_subsys_name[i], ss->css_alloc, ss->css_free,
4772 4178 ss->id, ss->name);
4773 /* at bootup time, we don't worry about modular subsystems */ 4179 WARN(strlen(cgroup_subsys_name[i]) > MAX_CGROUP_TYPE_NAMELEN,
4774 for_each_builtin_subsys(ss, i) { 4180 "cgroup_subsys_name %s too long\n", cgroup_subsys_name[i]);
4775 BUG_ON(!ss->name); 4181
4776 BUG_ON(strlen(ss->name) > MAX_CGROUP_TYPE_NAMELEN); 4182 ss->id = i;
4777 BUG_ON(!ss->css_alloc); 4183 ss->name = cgroup_subsys_name[i];
4778 BUG_ON(!ss->css_free);
4779 if (ss->subsys_id != i) {
4780 printk(KERN_ERR "cgroup: Subsys %s id == %d\n",
4781 ss->name, ss->subsys_id);
4782 BUG();
4783 }
4784 4184
4785 if (ss->early_init) 4185 if (ss->early_init)
4786 cgroup_init_subsys(ss); 4186 cgroup_init_subsys(ss);
@@ -4798,53 +4198,46 @@ int __init cgroup_init(void)
4798{ 4198{
4799 struct cgroup_subsys *ss; 4199 struct cgroup_subsys *ss;
4800 unsigned long key; 4200 unsigned long key;
4801 int i, err; 4201 int ssid, err;
4802 4202
4803 err = bdi_init(&cgroup_backing_dev_info); 4203 BUG_ON(cgroup_init_cftypes(NULL, cgroup_base_files));
4804 if (err)
4805 return err;
4806 4204
4807 for_each_builtin_subsys(ss, i) { 4205 mutex_lock(&cgroup_tree_mutex);
4808 if (!ss->early_init)
4809 cgroup_init_subsys(ss);
4810 }
4811
4812 /* allocate id for the dummy hierarchy */
4813 mutex_lock(&cgroup_mutex); 4206 mutex_lock(&cgroup_mutex);
4814 mutex_lock(&cgroup_root_mutex);
4815 4207
4816 /* Add init_css_set to the hash table */ 4208 /* Add init_css_set to the hash table */
4817 key = css_set_hash(init_css_set.subsys); 4209 key = css_set_hash(init_css_set.subsys);
4818 hash_add(css_set_table, &init_css_set.hlist, key); 4210 hash_add(css_set_table, &init_css_set.hlist, key);
4819 4211
4820 BUG_ON(cgroup_init_root_id(&cgroup_dummy_root, 0, 1)); 4212 BUG_ON(cgroup_setup_root(&cgrp_dfl_root, 0));
4821 4213
4822 err = idr_alloc(&cgroup_dummy_root.cgroup_idr, cgroup_dummy_top,
4823 0, 1, GFP_KERNEL);
4824 BUG_ON(err < 0);
4825
4826 mutex_unlock(&cgroup_root_mutex);
4827 mutex_unlock(&cgroup_mutex); 4214 mutex_unlock(&cgroup_mutex);
4215 mutex_unlock(&cgroup_tree_mutex);
4828 4216
4829 cgroup_kobj = kobject_create_and_add("cgroup", fs_kobj); 4217 for_each_subsys(ss, ssid) {
4830 if (!cgroup_kobj) { 4218 if (!ss->early_init)
4831 err = -ENOMEM; 4219 cgroup_init_subsys(ss);
4832 goto out; 4220
4221 /*
4222 * cftype registration needs kmalloc and can't be done
4223 * during early_init. Register base cftypes separately.
4224 */
4225 if (ss->base_cftypes)
4226 WARN_ON(cgroup_add_cftypes(ss, ss->base_cftypes));
4833 } 4227 }
4834 4228
4229 cgroup_kobj = kobject_create_and_add("cgroup", fs_kobj);
4230 if (!cgroup_kobj)
4231 return -ENOMEM;
4232
4835 err = register_filesystem(&cgroup_fs_type); 4233 err = register_filesystem(&cgroup_fs_type);
4836 if (err < 0) { 4234 if (err < 0) {
4837 kobject_put(cgroup_kobj); 4235 kobject_put(cgroup_kobj);
4838 goto out; 4236 return err;
4839 } 4237 }
4840 4238
4841 proc_create("cgroups", 0, NULL, &proc_cgroupstats_operations); 4239 proc_create("cgroups", 0, NULL, &proc_cgroupstats_operations);
4842 4240 return 0;
4843out:
4844 if (err)
4845 bdi_destroy(&cgroup_backing_dev_info);
4846
4847 return err;
4848} 4241}
4849 4242
4850static int __init cgroup_wq_init(void) 4243static int __init cgroup_wq_init(void)
@@ -4876,12 +4269,6 @@ core_initcall(cgroup_wq_init);
4876 * proc_cgroup_show() 4269 * proc_cgroup_show()
4877 * - Print task's cgroup paths into seq_file, one line for each hierarchy 4270 * - Print task's cgroup paths into seq_file, one line for each hierarchy
4878 * - Used for /proc/<pid>/cgroup. 4271 * - Used for /proc/<pid>/cgroup.
4879 * - No need to task_lock(tsk) on this tsk->cgroup reference, as it
4880 * doesn't really matter if tsk->cgroup changes after we read it,
4881 * and we take cgroup_mutex, keeping cgroup_attach_task() from changing it
4882 * anyway. No need to check that tsk->cgroup != NULL, thanks to
4883 * the_top_cgroup_hack in cgroup_exit(), which sets an exiting tasks
4884 * cgroup to top_cgroup.
4885 */ 4272 */
4886 4273
4887/* TODO: Use a proper seq_file iterator */ 4274/* TODO: Use a proper seq_file iterator */
@@ -4889,12 +4276,12 @@ int proc_cgroup_show(struct seq_file *m, void *v)
4889{ 4276{
4890 struct pid *pid; 4277 struct pid *pid;
4891 struct task_struct *tsk; 4278 struct task_struct *tsk;
4892 char *buf; 4279 char *buf, *path;
4893 int retval; 4280 int retval;
4894 struct cgroupfs_root *root; 4281 struct cgroup_root *root;
4895 4282
4896 retval = -ENOMEM; 4283 retval = -ENOMEM;
4897 buf = kmalloc(PAGE_SIZE, GFP_KERNEL); 4284 buf = kmalloc(PATH_MAX, GFP_KERNEL);
4898 if (!buf) 4285 if (!buf)
4899 goto out; 4286 goto out;
4900 4287
@@ -4907,29 +4294,36 @@ int proc_cgroup_show(struct seq_file *m, void *v)
4907 retval = 0; 4294 retval = 0;
4908 4295
4909 mutex_lock(&cgroup_mutex); 4296 mutex_lock(&cgroup_mutex);
4297 down_read(&css_set_rwsem);
4910 4298
4911 for_each_active_root(root) { 4299 for_each_root(root) {
4912 struct cgroup_subsys *ss; 4300 struct cgroup_subsys *ss;
4913 struct cgroup *cgrp; 4301 struct cgroup *cgrp;
4914 int ssid, count = 0; 4302 int ssid, count = 0;
4915 4303
4304 if (root == &cgrp_dfl_root && !cgrp_dfl_root_visible)
4305 continue;
4306
4916 seq_printf(m, "%d:", root->hierarchy_id); 4307 seq_printf(m, "%d:", root->hierarchy_id);
4917 for_each_subsys(ss, ssid) 4308 for_each_subsys(ss, ssid)
4918 if (root->subsys_mask & (1 << ssid)) 4309 if (root->cgrp.subsys_mask & (1 << ssid))
4919 seq_printf(m, "%s%s", count++ ? "," : "", ss->name); 4310 seq_printf(m, "%s%s", count++ ? "," : "", ss->name);
4920 if (strlen(root->name)) 4311 if (strlen(root->name))
4921 seq_printf(m, "%sname=%s", count ? "," : "", 4312 seq_printf(m, "%sname=%s", count ? "," : "",
4922 root->name); 4313 root->name);
4923 seq_putc(m, ':'); 4314 seq_putc(m, ':');
4924 cgrp = task_cgroup_from_root(tsk, root); 4315 cgrp = task_cgroup_from_root(tsk, root);
4925 retval = cgroup_path(cgrp, buf, PAGE_SIZE); 4316 path = cgroup_path(cgrp, buf, PATH_MAX);
4926 if (retval < 0) 4317 if (!path) {
4318 retval = -ENAMETOOLONG;
4927 goto out_unlock; 4319 goto out_unlock;
4928 seq_puts(m, buf); 4320 }
4321 seq_puts(m, path);
4929 seq_putc(m, '\n'); 4322 seq_putc(m, '\n');
4930 } 4323 }
4931 4324
4932out_unlock: 4325out_unlock:
4326 up_read(&css_set_rwsem);
4933 mutex_unlock(&cgroup_mutex); 4327 mutex_unlock(&cgroup_mutex);
4934 put_task_struct(tsk); 4328 put_task_struct(tsk);
4935out_free: 4329out_free:
@@ -4955,7 +4349,7 @@ static int proc_cgroupstats_show(struct seq_file *m, void *v)
4955 for_each_subsys(ss, i) 4349 for_each_subsys(ss, i)
4956 seq_printf(m, "%s\t%d\t%d\t%d\n", 4350 seq_printf(m, "%s\t%d\t%d\t%d\n",
4957 ss->name, ss->root->hierarchy_id, 4351 ss->name, ss->root->hierarchy_id,
4958 ss->root->number_of_cgroups, !ss->disabled); 4352 atomic_read(&ss->root->nr_cgrps), !ss->disabled);
4959 4353
4960 mutex_unlock(&cgroup_mutex); 4354 mutex_unlock(&cgroup_mutex);
4961 return 0; 4355 return 0;
@@ -4974,27 +4368,16 @@ static const struct file_operations proc_cgroupstats_operations = {
4974}; 4368};
4975 4369
4976/** 4370/**
4977 * cgroup_fork - attach newly forked task to its parents cgroup. 4371 * cgroup_fork - initialize cgroup related fields during copy_process()
4978 * @child: pointer to task_struct of forking parent process. 4372 * @child: pointer to task_struct of forking parent process.
4979 * 4373 *
4980 * Description: A task inherits its parent's cgroup at fork(). 4374 * A task is associated with the init_css_set until cgroup_post_fork()
4981 * 4375 * attaches it to the parent's css_set. Empty cg_list indicates that
4982 * A pointer to the shared css_set was automatically copied in 4376 * @child isn't holding reference to its css_set.
4983 * fork.c by dup_task_struct(). However, we ignore that copy, since
4984 * it was not made under the protection of RCU or cgroup_mutex, so
4985 * might no longer be a valid cgroup pointer. cgroup_attach_task() might
4986 * have already changed current->cgroups, allowing the previously
4987 * referenced cgroup group to be removed and freed.
4988 *
4989 * At the point that cgroup_fork() is called, 'current' is the parent
4990 * task, and the passed argument 'child' points to the child task.
4991 */ 4377 */
4992void cgroup_fork(struct task_struct *child) 4378void cgroup_fork(struct task_struct *child)
4993{ 4379{
4994 task_lock(current); 4380 RCU_INIT_POINTER(child->cgroups, &init_css_set);
4995 get_css_set(task_css_set(current));
4996 child->cgroups = current->cgroups;
4997 task_unlock(current);
4998 INIT_LIST_HEAD(&child->cg_list); 4381 INIT_LIST_HEAD(&child->cg_list);
4999} 4382}
5000 4383
@@ -5014,23 +4397,37 @@ void cgroup_post_fork(struct task_struct *child)
5014 int i; 4397 int i;
5015 4398
5016 /* 4399 /*
5017 * use_task_css_set_links is set to 1 before we walk the tasklist 4400 * This may race against cgroup_enable_task_cg_links(). As that
5018 * under the tasklist_lock and we read it here after we added the child 4401 * function sets use_task_css_set_links before grabbing
5019 * to the tasklist under the tasklist_lock as well. If the child wasn't 4402 * tasklist_lock and we just went through tasklist_lock to add
5020 * yet in the tasklist when we walked through it from 4403 * @child, it's guaranteed that either we see the set
5021 * cgroup_enable_task_cg_lists(), then use_task_css_set_links value 4404 * use_task_css_set_links or cgroup_enable_task_cg_lists() sees
5022 * should be visible now due to the paired locking and barriers implied 4405 * @child during its iteration.
5023 * by LOCK/UNLOCK: it is written before the tasklist_lock unlock 4406 *
5024 * in cgroup_enable_task_cg_lists() and read here after the tasklist_lock 4407 * If we won the race, @child is associated with %current's
5025 * lock on fork. 4408 * css_set. Grabbing css_set_rwsem guarantees both that the
4409 * association is stable, and, on completion of the parent's
4410 * migration, @child is visible in the source of migration or
4411 * already in the destination cgroup. This guarantee is necessary
4412 * when implementing operations which need to migrate all tasks of
4413 * a cgroup to another.
4414 *
4415 * Note that if we lose to cgroup_enable_task_cg_links(), @child
4416 * will remain in init_css_set. This is safe because all tasks are
4417 * in the init_css_set before cg_links is enabled and there's no
4418 * operation which transfers all tasks out of init_css_set.
5026 */ 4419 */
5027 if (use_task_css_set_links) { 4420 if (use_task_css_set_links) {
5028 write_lock(&css_set_lock); 4421 struct css_set *cset;
5029 task_lock(child); 4422
5030 if (list_empty(&child->cg_list)) 4423 down_write(&css_set_rwsem);
5031 list_add(&child->cg_list, &task_css_set(child)->tasks); 4424 cset = task_css_set(current);
5032 task_unlock(child); 4425 if (list_empty(&child->cg_list)) {
5033 write_unlock(&css_set_lock); 4426 rcu_assign_pointer(child->cgroups, cset);
4427 list_add(&child->cg_list, &cset->tasks);
4428 get_css_set(cset);
4429 }
4430 up_write(&css_set_rwsem);
5034 } 4431 }
5035 4432
5036 /* 4433 /*
@@ -5039,15 +4436,7 @@ void cgroup_post_fork(struct task_struct *child)
5039 * and addition to css_set. 4436 * and addition to css_set.
5040 */ 4437 */
5041 if (need_forkexit_callback) { 4438 if (need_forkexit_callback) {
5042 /* 4439 for_each_subsys(ss, i)
5043 * fork/exit callbacks are supported only for builtin
5044 * subsystems, and the builtin section of the subsys
5045 * array is immutable, so we don't need to lock the
5046 * subsys array here. On the other hand, modular section
5047 * of the array can be freed at module unload, so we
5048 * can't touch that.
5049 */
5050 for_each_builtin_subsys(ss, i)
5051 if (ss->fork) 4440 if (ss->fork)
5052 ss->fork(child); 4441 ss->fork(child);
5053 } 4442 }
@@ -5056,7 +4445,6 @@ void cgroup_post_fork(struct task_struct *child)
5056/** 4445/**
5057 * cgroup_exit - detach cgroup from exiting task 4446 * cgroup_exit - detach cgroup from exiting task
5058 * @tsk: pointer to task_struct of exiting process 4447 * @tsk: pointer to task_struct of exiting process
5059 * @run_callback: run exit callbacks?
5060 * 4448 *
5061 * Description: Detach cgroup from @tsk and release it. 4449 * Description: Detach cgroup from @tsk and release it.
5062 * 4450 *
@@ -5066,57 +4454,38 @@ void cgroup_post_fork(struct task_struct *child)
5066 * use notify_on_release cgroups where very high task exit scaling 4454 * use notify_on_release cgroups where very high task exit scaling
5067 * is required on large systems. 4455 * is required on large systems.
5068 * 4456 *
5069 * the_top_cgroup_hack: 4457 * We set the exiting tasks cgroup to the root cgroup (top_cgroup). We
5070 * 4458 * call cgroup_exit() while the task is still competent to handle
5071 * Set the exiting tasks cgroup to the root cgroup (top_cgroup). 4459 * notify_on_release(), then leave the task attached to the root cgroup in
5072 * 4460 * each hierarchy for the remainder of its exit. No need to bother with
5073 * We call cgroup_exit() while the task is still competent to 4461 * init_css_set refcnting. init_css_set never goes away and we can't race
5074 * handle notify_on_release(), then leave the task attached to the 4462 * with migration path - PF_EXITING is visible to migration path.
5075 * root cgroup in each hierarchy for the remainder of its exit.
5076 *
5077 * To do this properly, we would increment the reference count on
5078 * top_cgroup, and near the very end of the kernel/exit.c do_exit()
5079 * code we would add a second cgroup function call, to drop that
5080 * reference. This would just create an unnecessary hot spot on
5081 * the top_cgroup reference count, to no avail.
5082 *
5083 * Normally, holding a reference to a cgroup without bumping its
5084 * count is unsafe. The cgroup could go away, or someone could
5085 * attach us to a different cgroup, decrementing the count on
5086 * the first cgroup that we never incremented. But in this case,
5087 * top_cgroup isn't going away, and either task has PF_EXITING set,
5088 * which wards off any cgroup_attach_task() attempts, or task is a failed
5089 * fork, never visible to cgroup_attach_task.
5090 */ 4463 */
5091void cgroup_exit(struct task_struct *tsk, int run_callbacks) 4464void cgroup_exit(struct task_struct *tsk)
5092{ 4465{
5093 struct cgroup_subsys *ss; 4466 struct cgroup_subsys *ss;
5094 struct css_set *cset; 4467 struct css_set *cset;
4468 bool put_cset = false;
5095 int i; 4469 int i;
5096 4470
5097 /* 4471 /*
5098 * Unlink from the css_set task list if necessary. 4472 * Unlink from @tsk from its css_set. As migration path can't race
5099 * Optimistically check cg_list before taking 4473 * with us, we can check cg_list without grabbing css_set_rwsem.
5100 * css_set_lock
5101 */ 4474 */
5102 if (!list_empty(&tsk->cg_list)) { 4475 if (!list_empty(&tsk->cg_list)) {
5103 write_lock(&css_set_lock); 4476 down_write(&css_set_rwsem);
5104 if (!list_empty(&tsk->cg_list)) 4477 list_del_init(&tsk->cg_list);
5105 list_del_init(&tsk->cg_list); 4478 up_write(&css_set_rwsem);
5106 write_unlock(&css_set_lock); 4479 put_cset = true;
5107 } 4480 }
5108 4481
5109 /* Reassign the task to the init_css_set. */ 4482 /* Reassign the task to the init_css_set. */
5110 task_lock(tsk);
5111 cset = task_css_set(tsk); 4483 cset = task_css_set(tsk);
5112 RCU_INIT_POINTER(tsk->cgroups, &init_css_set); 4484 RCU_INIT_POINTER(tsk->cgroups, &init_css_set);
5113 4485
5114 if (run_callbacks && need_forkexit_callback) { 4486 if (need_forkexit_callback) {
5115 /* 4487 /* see cgroup_post_fork() for details */
5116 * fork/exit callbacks are supported only for builtin 4488 for_each_subsys(ss, i) {
5117 * subsystems, see cgroup_post_fork() for details.
5118 */
5119 for_each_builtin_subsys(ss, i) {
5120 if (ss->exit) { 4489 if (ss->exit) {
5121 struct cgroup_subsys_state *old_css = cset->subsys[i]; 4490 struct cgroup_subsys_state *old_css = cset->subsys[i];
5122 struct cgroup_subsys_state *css = task_css(tsk, i); 4491 struct cgroup_subsys_state *css = task_css(tsk, i);
@@ -5125,9 +4494,9 @@ void cgroup_exit(struct task_struct *tsk, int run_callbacks)
5125 } 4494 }
5126 } 4495 }
5127 } 4496 }
5128 task_unlock(tsk);
5129 4497
5130 put_css_set_taskexit(cset); 4498 if (put_cset)
4499 put_css_set(cset, true);
5131} 4500}
5132 4501
5133static void check_for_release(struct cgroup *cgrp) 4502static void check_for_release(struct cgroup *cgrp)
@@ -5184,16 +4553,17 @@ static void cgroup_release_agent(struct work_struct *work)
5184 while (!list_empty(&release_list)) { 4553 while (!list_empty(&release_list)) {
5185 char *argv[3], *envp[3]; 4554 char *argv[3], *envp[3];
5186 int i; 4555 int i;
5187 char *pathbuf = NULL, *agentbuf = NULL; 4556 char *pathbuf = NULL, *agentbuf = NULL, *path;
5188 struct cgroup *cgrp = list_entry(release_list.next, 4557 struct cgroup *cgrp = list_entry(release_list.next,
5189 struct cgroup, 4558 struct cgroup,
5190 release_list); 4559 release_list);
5191 list_del_init(&cgrp->release_list); 4560 list_del_init(&cgrp->release_list);
5192 raw_spin_unlock(&release_list_lock); 4561 raw_spin_unlock(&release_list_lock);
5193 pathbuf = kmalloc(PAGE_SIZE, GFP_KERNEL); 4562 pathbuf = kmalloc(PATH_MAX, GFP_KERNEL);
5194 if (!pathbuf) 4563 if (!pathbuf)
5195 goto continue_free; 4564 goto continue_free;
5196 if (cgroup_path(cgrp, pathbuf, PAGE_SIZE) < 0) 4565 path = cgroup_path(cgrp, pathbuf, PATH_MAX);
4566 if (!path)
5197 goto continue_free; 4567 goto continue_free;
5198 agentbuf = kstrdup(cgrp->root->release_agent_path, GFP_KERNEL); 4568 agentbuf = kstrdup(cgrp->root->release_agent_path, GFP_KERNEL);
5199 if (!agentbuf) 4569 if (!agentbuf)
@@ -5201,7 +4571,7 @@ static void cgroup_release_agent(struct work_struct *work)
5201 4571
5202 i = 0; 4572 i = 0;
5203 argv[i++] = agentbuf; 4573 argv[i++] = agentbuf;
5204 argv[i++] = pathbuf; 4574 argv[i++] = path;
5205 argv[i] = NULL; 4575 argv[i] = NULL;
5206 4576
5207 i = 0; 4577 i = 0;
@@ -5235,11 +4605,7 @@ static int __init cgroup_disable(char *str)
5235 if (!*token) 4605 if (!*token)
5236 continue; 4606 continue;
5237 4607
5238 /* 4608 for_each_subsys(ss, i) {
5239 * cgroup_disable, being at boot time, can't know about
5240 * module subsystems, so we don't worry about them.
5241 */
5242 for_each_builtin_subsys(ss, i) {
5243 if (!strcmp(token, ss->name)) { 4609 if (!strcmp(token, ss->name)) {
5244 ss->disabled = 1; 4610 ss->disabled = 1;
5245 printk(KERN_INFO "Disabling %s control group" 4611 printk(KERN_INFO "Disabling %s control group"
@@ -5253,28 +4619,42 @@ static int __init cgroup_disable(char *str)
5253__setup("cgroup_disable=", cgroup_disable); 4619__setup("cgroup_disable=", cgroup_disable);
5254 4620
5255/** 4621/**
5256 * css_from_dir - get corresponding css from the dentry of a cgroup dir 4622 * css_tryget_from_dir - get corresponding css from the dentry of a cgroup dir
5257 * @dentry: directory dentry of interest 4623 * @dentry: directory dentry of interest
5258 * @ss: subsystem of interest 4624 * @ss: subsystem of interest
5259 * 4625 *
5260 * Must be called under cgroup_mutex or RCU read lock. The caller is 4626 * If @dentry is a directory for a cgroup which has @ss enabled on it, try
5261 * responsible for pinning the returned css if it needs to be accessed 4627 * to get the corresponding css and return it. If such css doesn't exist
5262 * outside the critical section. 4628 * or can't be pinned, an ERR_PTR value is returned.
5263 */ 4629 */
5264struct cgroup_subsys_state *css_from_dir(struct dentry *dentry, 4630struct cgroup_subsys_state *css_tryget_from_dir(struct dentry *dentry,
5265 struct cgroup_subsys *ss) 4631 struct cgroup_subsys *ss)
5266{ 4632{
4633 struct kernfs_node *kn = kernfs_node_from_dentry(dentry);
4634 struct cgroup_subsys_state *css = NULL;
5267 struct cgroup *cgrp; 4635 struct cgroup *cgrp;
5268 4636
5269 cgroup_assert_mutex_or_rcu_locked();
5270
5271 /* is @dentry a cgroup dir? */ 4637 /* is @dentry a cgroup dir? */
5272 if (!dentry->d_inode || 4638 if (dentry->d_sb->s_type != &cgroup_fs_type || !kn ||
5273 dentry->d_inode->i_op != &cgroup_dir_inode_operations) 4639 kernfs_type(kn) != KERNFS_DIR)
5274 return ERR_PTR(-EBADF); 4640 return ERR_PTR(-EBADF);
5275 4641
5276 cgrp = __d_cgrp(dentry); 4642 rcu_read_lock();
5277 return cgroup_css(cgrp, ss) ?: ERR_PTR(-ENOENT); 4643
4644 /*
4645 * This path doesn't originate from kernfs and @kn could already
4646 * have been or be removed at any point. @kn->priv is RCU
4647 * protected for this access. See destroy_locked() for details.
4648 */
4649 cgrp = rcu_dereference(kn->priv);
4650 if (cgrp)
4651 css = cgroup_css(cgrp, ss);
4652
4653 if (!css || !css_tryget(css))
4654 css = ERR_PTR(-ENOENT);
4655
4656 rcu_read_unlock();
4657 return css;
5278} 4658}
5279 4659
5280/** 4660/**
@@ -5289,7 +4669,7 @@ struct cgroup_subsys_state *css_from_id(int id, struct cgroup_subsys *ss)
5289{ 4669{
5290 struct cgroup *cgrp; 4670 struct cgroup *cgrp;
5291 4671
5292 cgroup_assert_mutex_or_rcu_locked(); 4672 cgroup_assert_mutexes_or_rcu_locked();
5293 4673
5294 cgrp = idr_find(&ss->root->cgroup_idr, id); 4674 cgrp = idr_find(&ss->root->cgroup_idr, id);
5295 if (cgrp) 4675 if (cgrp)
@@ -5341,23 +4721,25 @@ static int current_css_set_cg_links_read(struct seq_file *seq, void *v)
5341{ 4721{
5342 struct cgrp_cset_link *link; 4722 struct cgrp_cset_link *link;
5343 struct css_set *cset; 4723 struct css_set *cset;
4724 char *name_buf;
5344 4725
5345 read_lock(&css_set_lock); 4726 name_buf = kmalloc(NAME_MAX + 1, GFP_KERNEL);
4727 if (!name_buf)
4728 return -ENOMEM;
4729
4730 down_read(&css_set_rwsem);
5346 rcu_read_lock(); 4731 rcu_read_lock();
5347 cset = rcu_dereference(current->cgroups); 4732 cset = rcu_dereference(current->cgroups);
5348 list_for_each_entry(link, &cset->cgrp_links, cgrp_link) { 4733 list_for_each_entry(link, &cset->cgrp_links, cgrp_link) {
5349 struct cgroup *c = link->cgrp; 4734 struct cgroup *c = link->cgrp;
5350 const char *name;
5351 4735
5352 if (c->dentry) 4736 cgroup_name(c, name_buf, NAME_MAX + 1);
5353 name = c->dentry->d_name.name;
5354 else
5355 name = "?";
5356 seq_printf(seq, "Root %d group %s\n", 4737 seq_printf(seq, "Root %d group %s\n",
5357 c->root->hierarchy_id, name); 4738 c->root->hierarchy_id, name_buf);
5358 } 4739 }
5359 rcu_read_unlock(); 4740 rcu_read_unlock();
5360 read_unlock(&css_set_lock); 4741 up_read(&css_set_rwsem);
4742 kfree(name_buf);
5361 return 0; 4743 return 0;
5362} 4744}
5363 4745
@@ -5367,23 +4749,30 @@ static int cgroup_css_links_read(struct seq_file *seq, void *v)
5367 struct cgroup_subsys_state *css = seq_css(seq); 4749 struct cgroup_subsys_state *css = seq_css(seq);
5368 struct cgrp_cset_link *link; 4750 struct cgrp_cset_link *link;
5369 4751
5370 read_lock(&css_set_lock); 4752 down_read(&css_set_rwsem);
5371 list_for_each_entry(link, &css->cgroup->cset_links, cset_link) { 4753 list_for_each_entry(link, &css->cgroup->cset_links, cset_link) {
5372 struct css_set *cset = link->cset; 4754 struct css_set *cset = link->cset;
5373 struct task_struct *task; 4755 struct task_struct *task;
5374 int count = 0; 4756 int count = 0;
4757
5375 seq_printf(seq, "css_set %p\n", cset); 4758 seq_printf(seq, "css_set %p\n", cset);
4759
5376 list_for_each_entry(task, &cset->tasks, cg_list) { 4760 list_for_each_entry(task, &cset->tasks, cg_list) {
5377 if (count++ > MAX_TASKS_SHOWN_PER_CSS) { 4761 if (count++ > MAX_TASKS_SHOWN_PER_CSS)
5378 seq_puts(seq, " ...\n"); 4762 goto overflow;
5379 break; 4763 seq_printf(seq, " task %d\n", task_pid_vnr(task));
5380 } else { 4764 }
5381 seq_printf(seq, " task %d\n", 4765
5382 task_pid_vnr(task)); 4766 list_for_each_entry(task, &cset->mg_tasks, cg_list) {
5383 } 4767 if (count++ > MAX_TASKS_SHOWN_PER_CSS)
4768 goto overflow;
4769 seq_printf(seq, " task %d\n", task_pid_vnr(task));
5384 } 4770 }
4771 continue;
4772 overflow:
4773 seq_puts(seq, " ...\n");
5385 } 4774 }
5386 read_unlock(&css_set_lock); 4775 up_read(&css_set_rwsem);
5387 return 0; 4776 return 0;
5388} 4777}
5389 4778
@@ -5426,11 +4815,9 @@ static struct cftype debug_files[] = {
5426 { } /* terminate */ 4815 { } /* terminate */
5427}; 4816};
5428 4817
5429struct cgroup_subsys debug_subsys = { 4818struct cgroup_subsys debug_cgrp_subsys = {
5430 .name = "debug",
5431 .css_alloc = debug_css_alloc, 4819 .css_alloc = debug_css_alloc,
5432 .css_free = debug_css_free, 4820 .css_free = debug_css_free,
5433 .subsys_id = debug_subsys_id,
5434 .base_cftypes = debug_files, 4821 .base_cftypes = debug_files,
5435}; 4822};
5436#endif /* CONFIG_CGROUP_DEBUG */ 4823#endif /* CONFIG_CGROUP_DEBUG */
diff --git a/kernel/cgroup_freezer.c b/kernel/cgroup_freezer.c
index 6c3154e477f6..2bc4a2256444 100644
--- a/kernel/cgroup_freezer.c
+++ b/kernel/cgroup_freezer.c
@@ -52,7 +52,7 @@ static inline struct freezer *css_freezer(struct cgroup_subsys_state *css)
52 52
53static inline struct freezer *task_freezer(struct task_struct *task) 53static inline struct freezer *task_freezer(struct task_struct *task)
54{ 54{
55 return css_freezer(task_css(task, freezer_subsys_id)); 55 return css_freezer(task_css(task, freezer_cgrp_id));
56} 56}
57 57
58static struct freezer *parent_freezer(struct freezer *freezer) 58static struct freezer *parent_freezer(struct freezer *freezer)
@@ -84,8 +84,6 @@ static const char *freezer_state_strs(unsigned int state)
84 return "THAWED"; 84 return "THAWED";
85}; 85};
86 86
87struct cgroup_subsys freezer_subsys;
88
89static struct cgroup_subsys_state * 87static struct cgroup_subsys_state *
90freezer_css_alloc(struct cgroup_subsys_state *parent_css) 88freezer_css_alloc(struct cgroup_subsys_state *parent_css)
91{ 89{
@@ -189,7 +187,7 @@ static void freezer_attach(struct cgroup_subsys_state *new_css,
189 * current state before executing the following - !frozen tasks may 187 * current state before executing the following - !frozen tasks may
190 * be visible in a FROZEN cgroup and frozen tasks in a THAWED one. 188 * be visible in a FROZEN cgroup and frozen tasks in a THAWED one.
191 */ 189 */
192 cgroup_taskset_for_each(task, new_css, tset) { 190 cgroup_taskset_for_each(task, tset) {
193 if (!(freezer->state & CGROUP_FREEZING)) { 191 if (!(freezer->state & CGROUP_FREEZING)) {
194 __thaw_task(task); 192 __thaw_task(task);
195 } else { 193 } else {
@@ -216,6 +214,16 @@ static void freezer_attach(struct cgroup_subsys_state *new_css,
216 } 214 }
217} 215}
218 216
217/**
218 * freezer_fork - cgroup post fork callback
219 * @task: a task which has just been forked
220 *
221 * @task has just been created and should conform to the current state of
222 * the cgroup_freezer it belongs to. This function may race against
223 * freezer_attach(). Losing to freezer_attach() means that we don't have
224 * to do anything as freezer_attach() will put @task into the appropriate
225 * state.
226 */
219static void freezer_fork(struct task_struct *task) 227static void freezer_fork(struct task_struct *task)
220{ 228{
221 struct freezer *freezer; 229 struct freezer *freezer;
@@ -224,14 +232,26 @@ static void freezer_fork(struct task_struct *task)
224 freezer = task_freezer(task); 232 freezer = task_freezer(task);
225 233
226 /* 234 /*
227 * The root cgroup is non-freezable, so we can skip the 235 * The root cgroup is non-freezable, so we can skip locking the
228 * following check. 236 * freezer. This is safe regardless of race with task migration.
237 * If we didn't race or won, skipping is obviously the right thing
238 * to do. If we lost and root is the new cgroup, noop is still the
239 * right thing to do.
229 */ 240 */
230 if (!parent_freezer(freezer)) 241 if (!parent_freezer(freezer))
231 goto out; 242 goto out;
232 243
244 /*
245 * Grab @freezer->lock and freeze @task after verifying @task still
246 * belongs to @freezer and it's freezing. The former is for the
247 * case where we have raced against task migration and lost and
248 * @task is already in a different cgroup which may not be frozen.
249 * This isn't strictly necessary as freeze_task() is allowed to be
250 * called spuriously but let's do it anyway for, if nothing else,
251 * documentation.
252 */
233 spin_lock_irq(&freezer->lock); 253 spin_lock_irq(&freezer->lock);
234 if (freezer->state & CGROUP_FREEZING) 254 if (freezer == task_freezer(task) && (freezer->state & CGROUP_FREEZING))
235 freeze_task(task); 255 freeze_task(task);
236 spin_unlock_irq(&freezer->lock); 256 spin_unlock_irq(&freezer->lock);
237out: 257out:
@@ -422,7 +442,7 @@ static void freezer_change_state(struct freezer *freezer, bool freeze)
422} 442}
423 443
424static int freezer_write(struct cgroup_subsys_state *css, struct cftype *cft, 444static int freezer_write(struct cgroup_subsys_state *css, struct cftype *cft,
425 const char *buffer) 445 char *buffer)
426{ 446{
427 bool freeze; 447 bool freeze;
428 448
@@ -473,13 +493,11 @@ static struct cftype files[] = {
473 { } /* terminate */ 493 { } /* terminate */
474}; 494};
475 495
476struct cgroup_subsys freezer_subsys = { 496struct cgroup_subsys freezer_cgrp_subsys = {
477 .name = "freezer",
478 .css_alloc = freezer_css_alloc, 497 .css_alloc = freezer_css_alloc,
479 .css_online = freezer_css_online, 498 .css_online = freezer_css_online,
480 .css_offline = freezer_css_offline, 499 .css_offline = freezer_css_offline,
481 .css_free = freezer_css_free, 500 .css_free = freezer_css_free,
482 .subsys_id = freezer_subsys_id,
483 .attach = freezer_attach, 501 .attach = freezer_attach,
484 .fork = freezer_fork, 502 .fork = freezer_fork,
485 .base_cftypes = files, 503 .base_cftypes = files,
diff --git a/kernel/cpu.c b/kernel/cpu.c
index deff2e693766..a9e710eef0e2 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -19,6 +19,7 @@
19#include <linux/mutex.h> 19#include <linux/mutex.h>
20#include <linux/gfp.h> 20#include <linux/gfp.h>
21#include <linux/suspend.h> 21#include <linux/suspend.h>
22#include <linux/lockdep.h>
22 23
23#include "smpboot.h" 24#include "smpboot.h"
24 25
@@ -27,18 +28,23 @@
27static DEFINE_MUTEX(cpu_add_remove_lock); 28static DEFINE_MUTEX(cpu_add_remove_lock);
28 29
29/* 30/*
30 * The following two API's must be used when attempting 31 * The following two APIs (cpu_maps_update_begin/done) must be used when
31 * to serialize the updates to cpu_online_mask, cpu_present_mask. 32 * attempting to serialize the updates to cpu_online_mask & cpu_present_mask.
33 * The APIs cpu_notifier_register_begin/done() must be used to protect CPU
34 * hotplug callback (un)registration performed using __register_cpu_notifier()
35 * or __unregister_cpu_notifier().
32 */ 36 */
33void cpu_maps_update_begin(void) 37void cpu_maps_update_begin(void)
34{ 38{
35 mutex_lock(&cpu_add_remove_lock); 39 mutex_lock(&cpu_add_remove_lock);
36} 40}
41EXPORT_SYMBOL(cpu_notifier_register_begin);
37 42
38void cpu_maps_update_done(void) 43void cpu_maps_update_done(void)
39{ 44{
40 mutex_unlock(&cpu_add_remove_lock); 45 mutex_unlock(&cpu_add_remove_lock);
41} 46}
47EXPORT_SYMBOL(cpu_notifier_register_done);
42 48
43static RAW_NOTIFIER_HEAD(cpu_chain); 49static RAW_NOTIFIER_HEAD(cpu_chain);
44 50
@@ -57,17 +63,30 @@ static struct {
57 * an ongoing cpu hotplug operation. 63 * an ongoing cpu hotplug operation.
58 */ 64 */
59 int refcount; 65 int refcount;
66
67#ifdef CONFIG_DEBUG_LOCK_ALLOC
68 struct lockdep_map dep_map;
69#endif
60} cpu_hotplug = { 70} cpu_hotplug = {
61 .active_writer = NULL, 71 .active_writer = NULL,
62 .lock = __MUTEX_INITIALIZER(cpu_hotplug.lock), 72 .lock = __MUTEX_INITIALIZER(cpu_hotplug.lock),
63 .refcount = 0, 73 .refcount = 0,
74#ifdef CONFIG_DEBUG_LOCK_ALLOC
75 .dep_map = {.name = "cpu_hotplug.lock" },
76#endif
64}; 77};
65 78
79/* Lockdep annotations for get/put_online_cpus() and cpu_hotplug_begin/end() */
80#define cpuhp_lock_acquire_read() lock_map_acquire_read(&cpu_hotplug.dep_map)
81#define cpuhp_lock_acquire() lock_map_acquire(&cpu_hotplug.dep_map)
82#define cpuhp_lock_release() lock_map_release(&cpu_hotplug.dep_map)
83
66void get_online_cpus(void) 84void get_online_cpus(void)
67{ 85{
68 might_sleep(); 86 might_sleep();
69 if (cpu_hotplug.active_writer == current) 87 if (cpu_hotplug.active_writer == current)
70 return; 88 return;
89 cpuhp_lock_acquire_read();
71 mutex_lock(&cpu_hotplug.lock); 90 mutex_lock(&cpu_hotplug.lock);
72 cpu_hotplug.refcount++; 91 cpu_hotplug.refcount++;
73 mutex_unlock(&cpu_hotplug.lock); 92 mutex_unlock(&cpu_hotplug.lock);
@@ -87,6 +106,7 @@ void put_online_cpus(void)
87 if (!--cpu_hotplug.refcount && unlikely(cpu_hotplug.active_writer)) 106 if (!--cpu_hotplug.refcount && unlikely(cpu_hotplug.active_writer))
88 wake_up_process(cpu_hotplug.active_writer); 107 wake_up_process(cpu_hotplug.active_writer);
89 mutex_unlock(&cpu_hotplug.lock); 108 mutex_unlock(&cpu_hotplug.lock);
109 cpuhp_lock_release();
90 110
91} 111}
92EXPORT_SYMBOL_GPL(put_online_cpus); 112EXPORT_SYMBOL_GPL(put_online_cpus);
@@ -117,6 +137,7 @@ void cpu_hotplug_begin(void)
117{ 137{
118 cpu_hotplug.active_writer = current; 138 cpu_hotplug.active_writer = current;
119 139
140 cpuhp_lock_acquire();
120 for (;;) { 141 for (;;) {
121 mutex_lock(&cpu_hotplug.lock); 142 mutex_lock(&cpu_hotplug.lock);
122 if (likely(!cpu_hotplug.refcount)) 143 if (likely(!cpu_hotplug.refcount))
@@ -131,6 +152,7 @@ void cpu_hotplug_done(void)
131{ 152{
132 cpu_hotplug.active_writer = NULL; 153 cpu_hotplug.active_writer = NULL;
133 mutex_unlock(&cpu_hotplug.lock); 154 mutex_unlock(&cpu_hotplug.lock);
155 cpuhp_lock_release();
134} 156}
135 157
136/* 158/*
@@ -166,6 +188,11 @@ int __ref register_cpu_notifier(struct notifier_block *nb)
166 return ret; 188 return ret;
167} 189}
168 190
191int __ref __register_cpu_notifier(struct notifier_block *nb)
192{
193 return raw_notifier_chain_register(&cpu_chain, nb);
194}
195
169static int __cpu_notify(unsigned long val, void *v, int nr_to_call, 196static int __cpu_notify(unsigned long val, void *v, int nr_to_call,
170 int *nr_calls) 197 int *nr_calls)
171{ 198{
@@ -189,6 +216,7 @@ static void cpu_notify_nofail(unsigned long val, void *v)
189 BUG_ON(cpu_notify(val, v)); 216 BUG_ON(cpu_notify(val, v));
190} 217}
191EXPORT_SYMBOL(register_cpu_notifier); 218EXPORT_SYMBOL(register_cpu_notifier);
219EXPORT_SYMBOL(__register_cpu_notifier);
192 220
193void __ref unregister_cpu_notifier(struct notifier_block *nb) 221void __ref unregister_cpu_notifier(struct notifier_block *nb)
194{ 222{
@@ -198,6 +226,12 @@ void __ref unregister_cpu_notifier(struct notifier_block *nb)
198} 226}
199EXPORT_SYMBOL(unregister_cpu_notifier); 227EXPORT_SYMBOL(unregister_cpu_notifier);
200 228
229void __ref __unregister_cpu_notifier(struct notifier_block *nb)
230{
231 raw_notifier_chain_unregister(&cpu_chain, nb);
232}
233EXPORT_SYMBOL(__unregister_cpu_notifier);
234
201/** 235/**
202 * clear_tasks_mm_cpumask - Safely clear tasks' mm_cpumask for a CPU 236 * clear_tasks_mm_cpumask - Safely clear tasks' mm_cpumask for a CPU
203 * @cpu: a CPU id 237 * @cpu: a CPU id
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index e6b1b66afe52..3d54c418bd06 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -119,7 +119,7 @@ static inline struct cpuset *css_cs(struct cgroup_subsys_state *css)
119/* Retrieve the cpuset for a task */ 119/* Retrieve the cpuset for a task */
120static inline struct cpuset *task_cs(struct task_struct *task) 120static inline struct cpuset *task_cs(struct task_struct *task)
121{ 121{
122 return css_cs(task_css(task, cpuset_subsys_id)); 122 return css_cs(task_css(task, cpuset_cgrp_id));
123} 123}
124 124
125static inline struct cpuset *parent_cs(struct cpuset *cs) 125static inline struct cpuset *parent_cs(struct cpuset *cs)
@@ -467,7 +467,7 @@ static int validate_change(struct cpuset *cur, struct cpuset *trial)
467 * be changed to have empty cpus_allowed or mems_allowed. 467 * be changed to have empty cpus_allowed or mems_allowed.
468 */ 468 */
469 ret = -ENOSPC; 469 ret = -ENOSPC;
470 if ((cgroup_task_count(cur->css.cgroup) || cur->attach_in_progress)) { 470 if ((cgroup_has_tasks(cur->css.cgroup) || cur->attach_in_progress)) {
471 if (!cpumask_empty(cur->cpus_allowed) && 471 if (!cpumask_empty(cur->cpus_allowed) &&
472 cpumask_empty(trial->cpus_allowed)) 472 cpumask_empty(trial->cpus_allowed))
473 goto out; 473 goto out;
@@ -829,55 +829,36 @@ static struct cpuset *effective_nodemask_cpuset(struct cpuset *cs)
829} 829}
830 830
831/** 831/**
832 * cpuset_change_cpumask - make a task's cpus_allowed the same as its cpuset's
833 * @tsk: task to test
834 * @data: cpuset to @tsk belongs to
835 *
836 * Called by css_scan_tasks() for each task in a cgroup whose cpus_allowed
837 * mask needs to be changed.
838 *
839 * We don't need to re-check for the cgroup/cpuset membership, since we're
840 * holding cpuset_mutex at this point.
841 */
842static void cpuset_change_cpumask(struct task_struct *tsk, void *data)
843{
844 struct cpuset *cs = data;
845 struct cpuset *cpus_cs = effective_cpumask_cpuset(cs);
846
847 set_cpus_allowed_ptr(tsk, cpus_cs->cpus_allowed);
848}
849
850/**
851 * update_tasks_cpumask - Update the cpumasks of tasks in the cpuset. 832 * update_tasks_cpumask - Update the cpumasks of tasks in the cpuset.
852 * @cs: the cpuset in which each task's cpus_allowed mask needs to be changed 833 * @cs: the cpuset in which each task's cpus_allowed mask needs to be changed
853 * @heap: if NULL, defer allocating heap memory to css_scan_tasks()
854 *
855 * Called with cpuset_mutex held
856 * 834 *
857 * The css_scan_tasks() function will scan all the tasks in a cgroup, 835 * Iterate through each task of @cs updating its cpus_allowed to the
858 * calling callback functions for each. 836 * effective cpuset's. As this function is called with cpuset_mutex held,
859 * 837 * cpuset membership stays stable.
860 * No return value. It's guaranteed that css_scan_tasks() always returns 0
861 * if @heap != NULL.
862 */ 838 */
863static void update_tasks_cpumask(struct cpuset *cs, struct ptr_heap *heap) 839static void update_tasks_cpumask(struct cpuset *cs)
864{ 840{
865 css_scan_tasks(&cs->css, NULL, cpuset_change_cpumask, cs, heap); 841 struct cpuset *cpus_cs = effective_cpumask_cpuset(cs);
842 struct css_task_iter it;
843 struct task_struct *task;
844
845 css_task_iter_start(&cs->css, &it);
846 while ((task = css_task_iter_next(&it)))
847 set_cpus_allowed_ptr(task, cpus_cs->cpus_allowed);
848 css_task_iter_end(&it);
866} 849}
867 850
868/* 851/*
869 * update_tasks_cpumask_hier - Update the cpumasks of tasks in the hierarchy. 852 * update_tasks_cpumask_hier - Update the cpumasks of tasks in the hierarchy.
870 * @root_cs: the root cpuset of the hierarchy 853 * @root_cs: the root cpuset of the hierarchy
871 * @update_root: update root cpuset or not? 854 * @update_root: update root cpuset or not?
872 * @heap: the heap used by css_scan_tasks()
873 * 855 *
874 * This will update cpumasks of tasks in @root_cs and all other empty cpusets 856 * This will update cpumasks of tasks in @root_cs and all other empty cpusets
875 * which take on cpumask of @root_cs. 857 * which take on cpumask of @root_cs.
876 * 858 *
877 * Called with cpuset_mutex held 859 * Called with cpuset_mutex held
878 */ 860 */
879static void update_tasks_cpumask_hier(struct cpuset *root_cs, 861static void update_tasks_cpumask_hier(struct cpuset *root_cs, bool update_root)
880 bool update_root, struct ptr_heap *heap)
881{ 862{
882 struct cpuset *cp; 863 struct cpuset *cp;
883 struct cgroup_subsys_state *pos_css; 864 struct cgroup_subsys_state *pos_css;
@@ -898,7 +879,7 @@ static void update_tasks_cpumask_hier(struct cpuset *root_cs,
898 continue; 879 continue;
899 rcu_read_unlock(); 880 rcu_read_unlock();
900 881
901 update_tasks_cpumask(cp, heap); 882 update_tasks_cpumask(cp);
902 883
903 rcu_read_lock(); 884 rcu_read_lock();
904 css_put(&cp->css); 885 css_put(&cp->css);
@@ -914,7 +895,6 @@ static void update_tasks_cpumask_hier(struct cpuset *root_cs,
914static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs, 895static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs,
915 const char *buf) 896 const char *buf)
916{ 897{
917 struct ptr_heap heap;
918 int retval; 898 int retval;
919 int is_load_balanced; 899 int is_load_balanced;
920 900
@@ -947,19 +927,13 @@ static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs,
947 if (retval < 0) 927 if (retval < 0)
948 return retval; 928 return retval;
949 929
950 retval = heap_init(&heap, PAGE_SIZE, GFP_KERNEL, NULL);
951 if (retval)
952 return retval;
953
954 is_load_balanced = is_sched_load_balance(trialcs); 930 is_load_balanced = is_sched_load_balance(trialcs);
955 931
956 mutex_lock(&callback_mutex); 932 mutex_lock(&callback_mutex);
957 cpumask_copy(cs->cpus_allowed, trialcs->cpus_allowed); 933 cpumask_copy(cs->cpus_allowed, trialcs->cpus_allowed);
958 mutex_unlock(&callback_mutex); 934 mutex_unlock(&callback_mutex);
959 935
960 update_tasks_cpumask_hier(cs, true, &heap); 936 update_tasks_cpumask_hier(cs, true);
961
962 heap_free(&heap);
963 937
964 if (is_load_balanced) 938 if (is_load_balanced)
965 rebuild_sched_domains_locked(); 939 rebuild_sched_domains_locked();
@@ -1022,7 +996,7 @@ static void cpuset_change_task_nodemask(struct task_struct *tsk,
1022 task_lock(tsk); 996 task_lock(tsk);
1023 /* 997 /*
1024 * Determine if a loop is necessary if another thread is doing 998 * Determine if a loop is necessary if another thread is doing
1025 * get_mems_allowed(). If at least one node remains unchanged and 999 * read_mems_allowed_begin(). If at least one node remains unchanged and
1026 * tsk does not have a mempolicy, then an empty nodemask will not be 1000 * tsk does not have a mempolicy, then an empty nodemask will not be
1027 * possible when mems_allowed is larger than a word. 1001 * possible when mems_allowed is larger than a word.
1028 */ 1002 */
@@ -1048,53 +1022,22 @@ static void cpuset_change_task_nodemask(struct task_struct *tsk,
1048 task_unlock(tsk); 1022 task_unlock(tsk);
1049} 1023}
1050 1024
1051struct cpuset_change_nodemask_arg {
1052 struct cpuset *cs;
1053 nodemask_t *newmems;
1054};
1055
1056/*
1057 * Update task's mems_allowed and rebind its mempolicy and vmas' mempolicy
1058 * of it to cpuset's new mems_allowed, and migrate pages to new nodes if
1059 * memory_migrate flag is set. Called with cpuset_mutex held.
1060 */
1061static void cpuset_change_nodemask(struct task_struct *p, void *data)
1062{
1063 struct cpuset_change_nodemask_arg *arg = data;
1064 struct cpuset *cs = arg->cs;
1065 struct mm_struct *mm;
1066 int migrate;
1067
1068 cpuset_change_task_nodemask(p, arg->newmems);
1069
1070 mm = get_task_mm(p);
1071 if (!mm)
1072 return;
1073
1074 migrate = is_memory_migrate(cs);
1075
1076 mpol_rebind_mm(mm, &cs->mems_allowed);
1077 if (migrate)
1078 cpuset_migrate_mm(mm, &cs->old_mems_allowed, arg->newmems);
1079 mmput(mm);
1080}
1081
1082static void *cpuset_being_rebound; 1025static void *cpuset_being_rebound;
1083 1026
1084/** 1027/**
1085 * update_tasks_nodemask - Update the nodemasks of tasks in the cpuset. 1028 * update_tasks_nodemask - Update the nodemasks of tasks in the cpuset.
1086 * @cs: the cpuset in which each task's mems_allowed mask needs to be changed 1029 * @cs: the cpuset in which each task's mems_allowed mask needs to be changed
1087 * @heap: if NULL, defer allocating heap memory to css_scan_tasks()
1088 * 1030 *
1089 * Called with cpuset_mutex held. No return value. It's guaranteed that 1031 * Iterate through each task of @cs updating its mems_allowed to the
1090 * css_scan_tasks() always returns 0 if @heap != NULL. 1032 * effective cpuset's. As this function is called with cpuset_mutex held,
1033 * cpuset membership stays stable.
1091 */ 1034 */
1092static void update_tasks_nodemask(struct cpuset *cs, struct ptr_heap *heap) 1035static void update_tasks_nodemask(struct cpuset *cs)
1093{ 1036{
1094 static nodemask_t newmems; /* protected by cpuset_mutex */ 1037 static nodemask_t newmems; /* protected by cpuset_mutex */
1095 struct cpuset *mems_cs = effective_nodemask_cpuset(cs); 1038 struct cpuset *mems_cs = effective_nodemask_cpuset(cs);
1096 struct cpuset_change_nodemask_arg arg = { .cs = cs, 1039 struct css_task_iter it;
1097 .newmems = &newmems }; 1040 struct task_struct *task;
1098 1041
1099 cpuset_being_rebound = cs; /* causes mpol_dup() rebind */ 1042 cpuset_being_rebound = cs; /* causes mpol_dup() rebind */
1100 1043
@@ -1110,7 +1053,25 @@ static void update_tasks_nodemask(struct cpuset *cs, struct ptr_heap *heap)
1110 * It's ok if we rebind the same mm twice; mpol_rebind_mm() 1053 * It's ok if we rebind the same mm twice; mpol_rebind_mm()
1111 * is idempotent. Also migrate pages in each mm to new nodes. 1054 * is idempotent. Also migrate pages in each mm to new nodes.
1112 */ 1055 */
1113 css_scan_tasks(&cs->css, NULL, cpuset_change_nodemask, &arg, heap); 1056 css_task_iter_start(&cs->css, &it);
1057 while ((task = css_task_iter_next(&it))) {
1058 struct mm_struct *mm;
1059 bool migrate;
1060
1061 cpuset_change_task_nodemask(task, &newmems);
1062
1063 mm = get_task_mm(task);
1064 if (!mm)
1065 continue;
1066
1067 migrate = is_memory_migrate(cs);
1068
1069 mpol_rebind_mm(mm, &cs->mems_allowed);
1070 if (migrate)
1071 cpuset_migrate_mm(mm, &cs->old_mems_allowed, &newmems);
1072 mmput(mm);
1073 }
1074 css_task_iter_end(&it);
1114 1075
1115 /* 1076 /*
1116 * All the tasks' nodemasks have been updated, update 1077 * All the tasks' nodemasks have been updated, update
@@ -1126,15 +1087,13 @@ static void update_tasks_nodemask(struct cpuset *cs, struct ptr_heap *heap)
1126 * update_tasks_nodemask_hier - Update the nodemasks of tasks in the hierarchy. 1087 * update_tasks_nodemask_hier - Update the nodemasks of tasks in the hierarchy.
1127 * @cs: the root cpuset of the hierarchy 1088 * @cs: the root cpuset of the hierarchy
1128 * @update_root: update the root cpuset or not? 1089 * @update_root: update the root cpuset or not?
1129 * @heap: the heap used by css_scan_tasks()
1130 * 1090 *
1131 * This will update nodemasks of tasks in @root_cs and all other empty cpusets 1091 * This will update nodemasks of tasks in @root_cs and all other empty cpusets
1132 * which take on nodemask of @root_cs. 1092 * which take on nodemask of @root_cs.
1133 * 1093 *
1134 * Called with cpuset_mutex held 1094 * Called with cpuset_mutex held
1135 */ 1095 */
1136static void update_tasks_nodemask_hier(struct cpuset *root_cs, 1096static void update_tasks_nodemask_hier(struct cpuset *root_cs, bool update_root)
1137 bool update_root, struct ptr_heap *heap)
1138{ 1097{
1139 struct cpuset *cp; 1098 struct cpuset *cp;
1140 struct cgroup_subsys_state *pos_css; 1099 struct cgroup_subsys_state *pos_css;
@@ -1155,7 +1114,7 @@ static void update_tasks_nodemask_hier(struct cpuset *root_cs,
1155 continue; 1114 continue;
1156 rcu_read_unlock(); 1115 rcu_read_unlock();
1157 1116
1158 update_tasks_nodemask(cp, heap); 1117 update_tasks_nodemask(cp);
1159 1118
1160 rcu_read_lock(); 1119 rcu_read_lock();
1161 css_put(&cp->css); 1120 css_put(&cp->css);
@@ -1180,7 +1139,6 @@ static int update_nodemask(struct cpuset *cs, struct cpuset *trialcs,
1180 const char *buf) 1139 const char *buf)
1181{ 1140{
1182 int retval; 1141 int retval;
1183 struct ptr_heap heap;
1184 1142
1185 /* 1143 /*
1186 * top_cpuset.mems_allowed tracks node_stats[N_MEMORY]; 1144 * top_cpuset.mems_allowed tracks node_stats[N_MEMORY];
@@ -1219,17 +1177,11 @@ static int update_nodemask(struct cpuset *cs, struct cpuset *trialcs,
1219 if (retval < 0) 1177 if (retval < 0)
1220 goto done; 1178 goto done;
1221 1179
1222 retval = heap_init(&heap, PAGE_SIZE, GFP_KERNEL, NULL);
1223 if (retval < 0)
1224 goto done;
1225
1226 mutex_lock(&callback_mutex); 1180 mutex_lock(&callback_mutex);
1227 cs->mems_allowed = trialcs->mems_allowed; 1181 cs->mems_allowed = trialcs->mems_allowed;
1228 mutex_unlock(&callback_mutex); 1182 mutex_unlock(&callback_mutex);
1229 1183
1230 update_tasks_nodemask_hier(cs, true, &heap); 1184 update_tasks_nodemask_hier(cs, true);
1231
1232 heap_free(&heap);
1233done: 1185done:
1234 return retval; 1186 return retval;
1235} 1187}
@@ -1257,38 +1209,22 @@ static int update_relax_domain_level(struct cpuset *cs, s64 val)
1257} 1209}
1258 1210
1259/** 1211/**
1260 * cpuset_change_flag - make a task's spread flags the same as its cpuset's
1261 * @tsk: task to be updated
1262 * @data: cpuset to @tsk belongs to
1263 *
1264 * Called by css_scan_tasks() for each task in a cgroup.
1265 *
1266 * We don't need to re-check for the cgroup/cpuset membership, since we're
1267 * holding cpuset_mutex at this point.
1268 */
1269static void cpuset_change_flag(struct task_struct *tsk, void *data)
1270{
1271 struct cpuset *cs = data;
1272
1273 cpuset_update_task_spread_flag(cs, tsk);
1274}
1275
1276/**
1277 * update_tasks_flags - update the spread flags of tasks in the cpuset. 1212 * update_tasks_flags - update the spread flags of tasks in the cpuset.
1278 * @cs: the cpuset in which each task's spread flags needs to be changed 1213 * @cs: the cpuset in which each task's spread flags needs to be changed
1279 * @heap: if NULL, defer allocating heap memory to css_scan_tasks()
1280 *
1281 * Called with cpuset_mutex held
1282 * 1214 *
1283 * The css_scan_tasks() function will scan all the tasks in a cgroup, 1215 * Iterate through each task of @cs updating its spread flags. As this
1284 * calling callback functions for each. 1216 * function is called with cpuset_mutex held, cpuset membership stays
1285 * 1217 * stable.
1286 * No return value. It's guaranteed that css_scan_tasks() always returns 0
1287 * if @heap != NULL.
1288 */ 1218 */
1289static void update_tasks_flags(struct cpuset *cs, struct ptr_heap *heap) 1219static void update_tasks_flags(struct cpuset *cs)
1290{ 1220{
1291 css_scan_tasks(&cs->css, NULL, cpuset_change_flag, cs, heap); 1221 struct css_task_iter it;
1222 struct task_struct *task;
1223
1224 css_task_iter_start(&cs->css, &it);
1225 while ((task = css_task_iter_next(&it)))
1226 cpuset_update_task_spread_flag(cs, task);
1227 css_task_iter_end(&it);
1292} 1228}
1293 1229
1294/* 1230/*
@@ -1306,7 +1242,6 @@ static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs,
1306 struct cpuset *trialcs; 1242 struct cpuset *trialcs;
1307 int balance_flag_changed; 1243 int balance_flag_changed;
1308 int spread_flag_changed; 1244 int spread_flag_changed;
1309 struct ptr_heap heap;
1310 int err; 1245 int err;
1311 1246
1312 trialcs = alloc_trial_cpuset(cs); 1247 trialcs = alloc_trial_cpuset(cs);
@@ -1322,10 +1257,6 @@ static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs,
1322 if (err < 0) 1257 if (err < 0)
1323 goto out; 1258 goto out;
1324 1259
1325 err = heap_init(&heap, PAGE_SIZE, GFP_KERNEL, NULL);
1326 if (err < 0)
1327 goto out;
1328
1329 balance_flag_changed = (is_sched_load_balance(cs) != 1260 balance_flag_changed = (is_sched_load_balance(cs) !=
1330 is_sched_load_balance(trialcs)); 1261 is_sched_load_balance(trialcs));
1331 1262
@@ -1340,8 +1271,7 @@ static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs,
1340 rebuild_sched_domains_locked(); 1271 rebuild_sched_domains_locked();
1341 1272
1342 if (spread_flag_changed) 1273 if (spread_flag_changed)
1343 update_tasks_flags(cs, &heap); 1274 update_tasks_flags(cs);
1344 heap_free(&heap);
1345out: 1275out:
1346 free_trial_cpuset(trialcs); 1276 free_trial_cpuset(trialcs);
1347 return err; 1277 return err;
@@ -1445,6 +1375,8 @@ static int fmeter_getrate(struct fmeter *fmp)
1445 return val; 1375 return val;
1446} 1376}
1447 1377
1378static struct cpuset *cpuset_attach_old_cs;
1379
1448/* Called by cgroups to determine if a cpuset is usable; cpuset_mutex held */ 1380/* Called by cgroups to determine if a cpuset is usable; cpuset_mutex held */
1449static int cpuset_can_attach(struct cgroup_subsys_state *css, 1381static int cpuset_can_attach(struct cgroup_subsys_state *css,
1450 struct cgroup_taskset *tset) 1382 struct cgroup_taskset *tset)
@@ -1453,6 +1385,9 @@ static int cpuset_can_attach(struct cgroup_subsys_state *css,
1453 struct task_struct *task; 1385 struct task_struct *task;
1454 int ret; 1386 int ret;
1455 1387
1388 /* used later by cpuset_attach() */
1389 cpuset_attach_old_cs = task_cs(cgroup_taskset_first(tset));
1390
1456 mutex_lock(&cpuset_mutex); 1391 mutex_lock(&cpuset_mutex);
1457 1392
1458 /* 1393 /*
@@ -1464,7 +1399,7 @@ static int cpuset_can_attach(struct cgroup_subsys_state *css,
1464 (cpumask_empty(cs->cpus_allowed) || nodes_empty(cs->mems_allowed))) 1399 (cpumask_empty(cs->cpus_allowed) || nodes_empty(cs->mems_allowed)))
1465 goto out_unlock; 1400 goto out_unlock;
1466 1401
1467 cgroup_taskset_for_each(task, css, tset) { 1402 cgroup_taskset_for_each(task, tset) {
1468 /* 1403 /*
1469 * Kthreads which disallow setaffinity shouldn't be moved 1404 * Kthreads which disallow setaffinity shouldn't be moved
1470 * to a new cpuset; we don't want to change their cpu 1405 * to a new cpuset; we don't want to change their cpu
@@ -1516,10 +1451,8 @@ static void cpuset_attach(struct cgroup_subsys_state *css,
1516 struct mm_struct *mm; 1451 struct mm_struct *mm;
1517 struct task_struct *task; 1452 struct task_struct *task;
1518 struct task_struct *leader = cgroup_taskset_first(tset); 1453 struct task_struct *leader = cgroup_taskset_first(tset);
1519 struct cgroup_subsys_state *oldcss = cgroup_taskset_cur_css(tset,
1520 cpuset_subsys_id);
1521 struct cpuset *cs = css_cs(css); 1454 struct cpuset *cs = css_cs(css);
1522 struct cpuset *oldcs = css_cs(oldcss); 1455 struct cpuset *oldcs = cpuset_attach_old_cs;
1523 struct cpuset *cpus_cs = effective_cpumask_cpuset(cs); 1456 struct cpuset *cpus_cs = effective_cpumask_cpuset(cs);
1524 struct cpuset *mems_cs = effective_nodemask_cpuset(cs); 1457 struct cpuset *mems_cs = effective_nodemask_cpuset(cs);
1525 1458
@@ -1533,7 +1466,7 @@ static void cpuset_attach(struct cgroup_subsys_state *css,
1533 1466
1534 guarantee_online_mems(mems_cs, &cpuset_attach_nodemask_to); 1467 guarantee_online_mems(mems_cs, &cpuset_attach_nodemask_to);
1535 1468
1536 cgroup_taskset_for_each(task, css, tset) { 1469 cgroup_taskset_for_each(task, tset) {
1537 /* 1470 /*
1538 * can_attach beforehand should guarantee that this doesn't 1471 * can_attach beforehand should guarantee that this doesn't
1539 * fail. TODO: have a better way to handle failure here 1472 * fail. TODO: have a better way to handle failure here
@@ -1673,7 +1606,7 @@ out_unlock:
1673 * Common handling for a write to a "cpus" or "mems" file. 1606 * Common handling for a write to a "cpus" or "mems" file.
1674 */ 1607 */
1675static int cpuset_write_resmask(struct cgroup_subsys_state *css, 1608static int cpuset_write_resmask(struct cgroup_subsys_state *css,
1676 struct cftype *cft, const char *buf) 1609 struct cftype *cft, char *buf)
1677{ 1610{
1678 struct cpuset *cs = css_cs(css); 1611 struct cpuset *cs = css_cs(css);
1679 struct cpuset *trialcs; 1612 struct cpuset *trialcs;
@@ -2020,8 +1953,7 @@ static void cpuset_css_free(struct cgroup_subsys_state *css)
2020 kfree(cs); 1953 kfree(cs);
2021} 1954}
2022 1955
2023struct cgroup_subsys cpuset_subsys = { 1956struct cgroup_subsys cpuset_cgrp_subsys = {
2024 .name = "cpuset",
2025 .css_alloc = cpuset_css_alloc, 1957 .css_alloc = cpuset_css_alloc,
2026 .css_online = cpuset_css_online, 1958 .css_online = cpuset_css_online,
2027 .css_offline = cpuset_css_offline, 1959 .css_offline = cpuset_css_offline,
@@ -2029,7 +1961,6 @@ struct cgroup_subsys cpuset_subsys = {
2029 .can_attach = cpuset_can_attach, 1961 .can_attach = cpuset_can_attach,
2030 .cancel_attach = cpuset_cancel_attach, 1962 .cancel_attach = cpuset_cancel_attach,
2031 .attach = cpuset_attach, 1963 .attach = cpuset_attach,
2032 .subsys_id = cpuset_subsys_id,
2033 .base_cftypes = files, 1964 .base_cftypes = files,
2034 .early_init = 1, 1965 .early_init = 1,
2035}; 1966};
@@ -2086,10 +2017,9 @@ static void remove_tasks_in_empty_cpuset(struct cpuset *cs)
2086 parent = parent_cs(parent); 2017 parent = parent_cs(parent);
2087 2018
2088 if (cgroup_transfer_tasks(parent->css.cgroup, cs->css.cgroup)) { 2019 if (cgroup_transfer_tasks(parent->css.cgroup, cs->css.cgroup)) {
2089 rcu_read_lock(); 2020 printk(KERN_ERR "cpuset: failed to transfer tasks out of empty cpuset ");
2090 printk(KERN_ERR "cpuset: failed to transfer tasks out of empty cpuset %s\n", 2021 pr_cont_cgroup_name(cs->css.cgroup);
2091 cgroup_name(cs->css.cgroup)); 2022 pr_cont("\n");
2092 rcu_read_unlock();
2093 } 2023 }
2094} 2024}
2095 2025
@@ -2137,7 +2067,7 @@ retry:
2137 */ 2067 */
2138 if ((sane && cpumask_empty(cs->cpus_allowed)) || 2068 if ((sane && cpumask_empty(cs->cpus_allowed)) ||
2139 (!cpumask_empty(&off_cpus) && !cpumask_empty(cs->cpus_allowed))) 2069 (!cpumask_empty(&off_cpus) && !cpumask_empty(cs->cpus_allowed)))
2140 update_tasks_cpumask(cs, NULL); 2070 update_tasks_cpumask(cs);
2141 2071
2142 mutex_lock(&callback_mutex); 2072 mutex_lock(&callback_mutex);
2143 nodes_andnot(cs->mems_allowed, cs->mems_allowed, off_mems); 2073 nodes_andnot(cs->mems_allowed, cs->mems_allowed, off_mems);
@@ -2151,7 +2081,7 @@ retry:
2151 */ 2081 */
2152 if ((sane && nodes_empty(cs->mems_allowed)) || 2082 if ((sane && nodes_empty(cs->mems_allowed)) ||
2153 (!nodes_empty(off_mems) && !nodes_empty(cs->mems_allowed))) 2083 (!nodes_empty(off_mems) && !nodes_empty(cs->mems_allowed)))
2154 update_tasks_nodemask(cs, NULL); 2084 update_tasks_nodemask(cs);
2155 2085
2156 is_empty = cpumask_empty(cs->cpus_allowed) || 2086 is_empty = cpumask_empty(cs->cpus_allowed) ||
2157 nodes_empty(cs->mems_allowed); 2087 nodes_empty(cs->mems_allowed);
@@ -2213,7 +2143,7 @@ static void cpuset_hotplug_workfn(struct work_struct *work)
2213 mutex_lock(&callback_mutex); 2143 mutex_lock(&callback_mutex);
2214 top_cpuset.mems_allowed = new_mems; 2144 top_cpuset.mems_allowed = new_mems;
2215 mutex_unlock(&callback_mutex); 2145 mutex_unlock(&callback_mutex);
2216 update_tasks_nodemask(&top_cpuset, NULL); 2146 update_tasks_nodemask(&top_cpuset);
2217 } 2147 }
2218 2148
2219 mutex_unlock(&cpuset_mutex); 2149 mutex_unlock(&cpuset_mutex);
@@ -2305,10 +2235,10 @@ void cpuset_cpus_allowed(struct task_struct *tsk, struct cpumask *pmask)
2305 struct cpuset *cpus_cs; 2235 struct cpuset *cpus_cs;
2306 2236
2307 mutex_lock(&callback_mutex); 2237 mutex_lock(&callback_mutex);
2308 task_lock(tsk); 2238 rcu_read_lock();
2309 cpus_cs = effective_cpumask_cpuset(task_cs(tsk)); 2239 cpus_cs = effective_cpumask_cpuset(task_cs(tsk));
2310 guarantee_online_cpus(cpus_cs, pmask); 2240 guarantee_online_cpus(cpus_cs, pmask);
2311 task_unlock(tsk); 2241 rcu_read_unlock();
2312 mutex_unlock(&callback_mutex); 2242 mutex_unlock(&callback_mutex);
2313} 2243}
2314 2244
@@ -2361,10 +2291,10 @@ nodemask_t cpuset_mems_allowed(struct task_struct *tsk)
2361 nodemask_t mask; 2291 nodemask_t mask;
2362 2292
2363 mutex_lock(&callback_mutex); 2293 mutex_lock(&callback_mutex);
2364 task_lock(tsk); 2294 rcu_read_lock();
2365 mems_cs = effective_nodemask_cpuset(task_cs(tsk)); 2295 mems_cs = effective_nodemask_cpuset(task_cs(tsk));
2366 guarantee_online_mems(mems_cs, &mask); 2296 guarantee_online_mems(mems_cs, &mask);
2367 task_unlock(tsk); 2297 rcu_read_unlock();
2368 mutex_unlock(&callback_mutex); 2298 mutex_unlock(&callback_mutex);
2369 2299
2370 return mask; 2300 return mask;
@@ -2480,10 +2410,10 @@ int __cpuset_node_allowed_softwall(int node, gfp_t gfp_mask)
2480 /* Not hardwall and node outside mems_allowed: scan up cpusets */ 2410 /* Not hardwall and node outside mems_allowed: scan up cpusets */
2481 mutex_lock(&callback_mutex); 2411 mutex_lock(&callback_mutex);
2482 2412
2483 task_lock(current); 2413 rcu_read_lock();
2484 cs = nearest_hardwall_ancestor(task_cs(current)); 2414 cs = nearest_hardwall_ancestor(task_cs(current));
2485 allowed = node_isset(node, cs->mems_allowed); 2415 allowed = node_isset(node, cs->mems_allowed);
2486 task_unlock(current); 2416 rcu_read_unlock();
2487 2417
2488 mutex_unlock(&callback_mutex); 2418 mutex_unlock(&callback_mutex);
2489 return allowed; 2419 return allowed;
@@ -2609,27 +2539,27 @@ int cpuset_mems_allowed_intersects(const struct task_struct *tsk1,
2609 * @task: pointer to task_struct of some task. 2539 * @task: pointer to task_struct of some task.
2610 * 2540 *
2611 * Description: Prints @task's name, cpuset name, and cached copy of its 2541 * Description: Prints @task's name, cpuset name, and cached copy of its
2612 * mems_allowed to the kernel log. Must hold task_lock(task) to allow 2542 * mems_allowed to the kernel log.
2613 * dereferencing task_cs(task).
2614 */ 2543 */
2615void cpuset_print_task_mems_allowed(struct task_struct *tsk) 2544void cpuset_print_task_mems_allowed(struct task_struct *tsk)
2616{ 2545{
2617 /* Statically allocated to prevent using excess stack. */ 2546 /* Statically allocated to prevent using excess stack. */
2618 static char cpuset_nodelist[CPUSET_NODELIST_LEN]; 2547 static char cpuset_nodelist[CPUSET_NODELIST_LEN];
2619 static DEFINE_SPINLOCK(cpuset_buffer_lock); 2548 static DEFINE_SPINLOCK(cpuset_buffer_lock);
2549 struct cgroup *cgrp;
2620 2550
2621 struct cgroup *cgrp = task_cs(tsk)->css.cgroup;
2622
2623 rcu_read_lock();
2624 spin_lock(&cpuset_buffer_lock); 2551 spin_lock(&cpuset_buffer_lock);
2552 rcu_read_lock();
2625 2553
2554 cgrp = task_cs(tsk)->css.cgroup;
2626 nodelist_scnprintf(cpuset_nodelist, CPUSET_NODELIST_LEN, 2555 nodelist_scnprintf(cpuset_nodelist, CPUSET_NODELIST_LEN,
2627 tsk->mems_allowed); 2556 tsk->mems_allowed);
2628 printk(KERN_INFO "%s cpuset=%s mems_allowed=%s\n", 2557 printk(KERN_INFO "%s cpuset=", tsk->comm);
2629 tsk->comm, cgroup_name(cgrp), cpuset_nodelist); 2558 pr_cont_cgroup_name(cgrp);
2559 pr_cont(" mems_allowed=%s\n", cpuset_nodelist);
2630 2560
2631 spin_unlock(&cpuset_buffer_lock);
2632 rcu_read_unlock(); 2561 rcu_read_unlock();
2562 spin_unlock(&cpuset_buffer_lock);
2633} 2563}
2634 2564
2635/* 2565/*
@@ -2660,9 +2590,9 @@ int cpuset_memory_pressure_enabled __read_mostly;
2660 2590
2661void __cpuset_memory_pressure_bump(void) 2591void __cpuset_memory_pressure_bump(void)
2662{ 2592{
2663 task_lock(current); 2593 rcu_read_lock();
2664 fmeter_markevent(&task_cs(current)->fmeter); 2594 fmeter_markevent(&task_cs(current)->fmeter);
2665 task_unlock(current); 2595 rcu_read_unlock();
2666} 2596}
2667 2597
2668#ifdef CONFIG_PROC_PID_CPUSET 2598#ifdef CONFIG_PROC_PID_CPUSET
@@ -2679,12 +2609,12 @@ int proc_cpuset_show(struct seq_file *m, void *unused_v)
2679{ 2609{
2680 struct pid *pid; 2610 struct pid *pid;
2681 struct task_struct *tsk; 2611 struct task_struct *tsk;
2682 char *buf; 2612 char *buf, *p;
2683 struct cgroup_subsys_state *css; 2613 struct cgroup_subsys_state *css;
2684 int retval; 2614 int retval;
2685 2615
2686 retval = -ENOMEM; 2616 retval = -ENOMEM;
2687 buf = kmalloc(PAGE_SIZE, GFP_KERNEL); 2617 buf = kmalloc(PATH_MAX, GFP_KERNEL);
2688 if (!buf) 2618 if (!buf)
2689 goto out; 2619 goto out;
2690 2620
@@ -2694,14 +2624,16 @@ int proc_cpuset_show(struct seq_file *m, void *unused_v)
2694 if (!tsk) 2624 if (!tsk)
2695 goto out_free; 2625 goto out_free;
2696 2626
2627 retval = -ENAMETOOLONG;
2697 rcu_read_lock(); 2628 rcu_read_lock();
2698 css = task_css(tsk, cpuset_subsys_id); 2629 css = task_css(tsk, cpuset_cgrp_id);
2699 retval = cgroup_path(css->cgroup, buf, PAGE_SIZE); 2630 p = cgroup_path(css->cgroup, buf, PATH_MAX);
2700 rcu_read_unlock(); 2631 rcu_read_unlock();
2701 if (retval < 0) 2632 if (!p)
2702 goto out_put_task; 2633 goto out_put_task;
2703 seq_puts(m, buf); 2634 seq_puts(m, p);
2704 seq_putc(m, '\n'); 2635 seq_putc(m, '\n');
2636 retval = 0;
2705out_put_task: 2637out_put_task:
2706 put_task_struct(tsk); 2638 put_task_struct(tsk);
2707out_free: 2639out_free:
diff --git a/kernel/debug/debug_core.c b/kernel/debug/debug_core.c
index 99982a70ddad..2956c8da1605 100644
--- a/kernel/debug/debug_core.c
+++ b/kernel/debug/debug_core.c
@@ -49,6 +49,7 @@
49#include <linux/pid.h> 49#include <linux/pid.h>
50#include <linux/smp.h> 50#include <linux/smp.h>
51#include <linux/mm.h> 51#include <linux/mm.h>
52#include <linux/vmacache.h>
52#include <linux/rcupdate.h> 53#include <linux/rcupdate.h>
53 54
54#include <asm/cacheflush.h> 55#include <asm/cacheflush.h>
@@ -224,10 +225,17 @@ static void kgdb_flush_swbreak_addr(unsigned long addr)
224 if (!CACHE_FLUSH_IS_SAFE) 225 if (!CACHE_FLUSH_IS_SAFE)
225 return; 226 return;
226 227
227 if (current->mm && current->mm->mmap_cache) { 228 if (current->mm) {
228 flush_cache_range(current->mm->mmap_cache, 229 int i;
229 addr, addr + BREAK_INSTR_SIZE); 230
231 for (i = 0; i < VMACACHE_SIZE; i++) {
232 if (!current->vmacache[i])
233 continue;
234 flush_cache_range(current->vmacache[i],
235 addr, addr + BREAK_INSTR_SIZE);
236 }
230 } 237 }
238
231 /* Force flush instruction cache if it was outside the mm */ 239 /* Force flush instruction cache if it was outside the mm */
232 flush_icache_range(addr, addr + BREAK_INSTR_SIZE); 240 flush_icache_range(addr, addr + BREAK_INSTR_SIZE);
233} 241}
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 661951ab8ae7..f83a71a3e46d 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -361,7 +361,7 @@ struct perf_cgroup {
361static inline struct perf_cgroup * 361static inline struct perf_cgroup *
362perf_cgroup_from_task(struct task_struct *task) 362perf_cgroup_from_task(struct task_struct *task)
363{ 363{
364 return container_of(task_css(task, perf_subsys_id), 364 return container_of(task_css(task, perf_event_cgrp_id),
365 struct perf_cgroup, css); 365 struct perf_cgroup, css);
366} 366}
367 367
@@ -389,11 +389,6 @@ perf_cgroup_match(struct perf_event *event)
389 event->cgrp->css.cgroup); 389 event->cgrp->css.cgroup);
390} 390}
391 391
392static inline bool perf_tryget_cgroup(struct perf_event *event)
393{
394 return css_tryget(&event->cgrp->css);
395}
396
397static inline void perf_put_cgroup(struct perf_event *event) 392static inline void perf_put_cgroup(struct perf_event *event)
398{ 393{
399 css_put(&event->cgrp->css); 394 css_put(&event->cgrp->css);
@@ -612,9 +607,7 @@ static inline int perf_cgroup_connect(int fd, struct perf_event *event,
612 if (!f.file) 607 if (!f.file)
613 return -EBADF; 608 return -EBADF;
614 609
615 rcu_read_lock(); 610 css = css_tryget_from_dir(f.file->f_dentry, &perf_event_cgrp_subsys);
616
617 css = css_from_dir(f.file->f_dentry, &perf_subsys);
618 if (IS_ERR(css)) { 611 if (IS_ERR(css)) {
619 ret = PTR_ERR(css); 612 ret = PTR_ERR(css);
620 goto out; 613 goto out;
@@ -623,13 +616,6 @@ static inline int perf_cgroup_connect(int fd, struct perf_event *event,
623 cgrp = container_of(css, struct perf_cgroup, css); 616 cgrp = container_of(css, struct perf_cgroup, css);
624 event->cgrp = cgrp; 617 event->cgrp = cgrp;
625 618
626 /* must be done before we fput() the file */
627 if (!perf_tryget_cgroup(event)) {
628 event->cgrp = NULL;
629 ret = -ENOENT;
630 goto out;
631 }
632
633 /* 619 /*
634 * all events in a group must monitor 620 * all events in a group must monitor
635 * the same cgroup because a task belongs 621 * the same cgroup because a task belongs
@@ -640,7 +626,6 @@ static inline int perf_cgroup_connect(int fd, struct perf_event *event,
640 ret = -EINVAL; 626 ret = -EINVAL;
641 } 627 }
642out: 628out:
643 rcu_read_unlock();
644 fdput(f); 629 fdput(f);
645 return ret; 630 return ret;
646} 631}
@@ -8053,7 +8038,7 @@ static void perf_cgroup_attach(struct cgroup_subsys_state *css,
8053{ 8038{
8054 struct task_struct *task; 8039 struct task_struct *task;
8055 8040
8056 cgroup_taskset_for_each(task, css, tset) 8041 cgroup_taskset_for_each(task, tset)
8057 task_function_call(task, __perf_cgroup_move, task); 8042 task_function_call(task, __perf_cgroup_move, task);
8058} 8043}
8059 8044
@@ -8072,9 +8057,7 @@ static void perf_cgroup_exit(struct cgroup_subsys_state *css,
8072 task_function_call(task, __perf_cgroup_move, task); 8057 task_function_call(task, __perf_cgroup_move, task);
8073} 8058}
8074 8059
8075struct cgroup_subsys perf_subsys = { 8060struct cgroup_subsys perf_event_cgrp_subsys = {
8076 .name = "perf_event",
8077 .subsys_id = perf_subsys_id,
8078 .css_alloc = perf_cgroup_css_alloc, 8061 .css_alloc = perf_cgroup_css_alloc,
8079 .css_free = perf_cgroup_css_free, 8062 .css_free = perf_cgroup_css_free,
8080 .exit = perf_cgroup_exit, 8063 .exit = perf_cgroup_exit,
diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c
index 307d87c0991a..04709b66369d 100644
--- a/kernel/events/uprobes.c
+++ b/kernel/events/uprobes.c
@@ -1804,6 +1804,11 @@ static bool handle_trampoline(struct pt_regs *regs)
1804 return true; 1804 return true;
1805} 1805}
1806 1806
1807bool __weak arch_uprobe_ignore(struct arch_uprobe *aup, struct pt_regs *regs)
1808{
1809 return false;
1810}
1811
1807/* 1812/*
1808 * Run handler and ask thread to singlestep. 1813 * Run handler and ask thread to singlestep.
1809 * Ensure all non-fatal signals cannot interrupt thread while it singlesteps. 1814 * Ensure all non-fatal signals cannot interrupt thread while it singlesteps.
@@ -1858,7 +1863,11 @@ static void handle_swbp(struct pt_regs *regs)
1858 if (!get_utask()) 1863 if (!get_utask())
1859 goto out; 1864 goto out;
1860 1865
1866 if (arch_uprobe_ignore(&uprobe->arch, regs))
1867 goto out;
1868
1861 handler_chain(uprobe, regs); 1869 handler_chain(uprobe, regs);
1870
1862 if (can_skip_sstep(uprobe, regs)) 1871 if (can_skip_sstep(uprobe, regs))
1863 goto out; 1872 goto out;
1864 1873
diff --git a/kernel/exit.c b/kernel/exit.c
index 1e77fc645317..6ed6a1d552b5 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -570,7 +570,7 @@ static void reparent_leader(struct task_struct *father, struct task_struct *p,
570 if (same_thread_group(p->real_parent, father)) 570 if (same_thread_group(p->real_parent, father))
571 return; 571 return;
572 572
573 /* We don't want people slaying init. */ 573 /* We don't want people slaying init. */
574 p->exit_signal = SIGCHLD; 574 p->exit_signal = SIGCHLD;
575 575
576 /* If it has exited notify the new parent about this child's death. */ 576 /* If it has exited notify the new parent about this child's death. */
@@ -784,9 +784,10 @@ void do_exit(long code)
784 exit_shm(tsk); 784 exit_shm(tsk);
785 exit_files(tsk); 785 exit_files(tsk);
786 exit_fs(tsk); 786 exit_fs(tsk);
787 if (group_dead)
788 disassociate_ctty(1);
787 exit_task_namespaces(tsk); 789 exit_task_namespaces(tsk);
788 exit_task_work(tsk); 790 exit_task_work(tsk);
789 check_stack_usage();
790 exit_thread(); 791 exit_thread();
791 792
792 /* 793 /*
@@ -797,21 +798,17 @@ void do_exit(long code)
797 */ 798 */
798 perf_event_exit_task(tsk); 799 perf_event_exit_task(tsk);
799 800
800 cgroup_exit(tsk, 1); 801 cgroup_exit(tsk);
801
802 if (group_dead)
803 disassociate_ctty(1);
804 802
805 module_put(task_thread_info(tsk)->exec_domain->module); 803 module_put(task_thread_info(tsk)->exec_domain->module);
806 804
807 proc_exit_connector(tsk);
808
809 /* 805 /*
810 * FIXME: do that only when needed, using sched_exit tracepoint 806 * FIXME: do that only when needed, using sched_exit tracepoint
811 */ 807 */
812 flush_ptrace_hw_breakpoint(tsk); 808 flush_ptrace_hw_breakpoint(tsk);
813 809
814 exit_notify(tsk, group_dead); 810 exit_notify(tsk, group_dead);
811 proc_exit_connector(tsk);
815#ifdef CONFIG_NUMA 812#ifdef CONFIG_NUMA
816 task_lock(tsk); 813 task_lock(tsk);
817 mpol_put(tsk->mempolicy); 814 mpol_put(tsk->mempolicy);
@@ -844,6 +841,7 @@ void do_exit(long code)
844 841
845 validate_creds_for_do_exit(tsk); 842 validate_creds_for_do_exit(tsk);
846 843
844 check_stack_usage();
847 preempt_disable(); 845 preempt_disable();
848 if (tsk->nr_dirtied) 846 if (tsk->nr_dirtied)
849 __this_cpu_add(dirty_throttle_leaks, tsk->nr_dirtied); 847 __this_cpu_add(dirty_throttle_leaks, tsk->nr_dirtied);
@@ -1038,17 +1036,13 @@ static int wait_task_zombie(struct wait_opts *wo, struct task_struct *p)
1038 return wait_noreap_copyout(wo, p, pid, uid, why, status); 1036 return wait_noreap_copyout(wo, p, pid, uid, why, status);
1039 } 1037 }
1040 1038
1039 traced = ptrace_reparented(p);
1041 /* 1040 /*
1042 * Try to move the task's state to DEAD 1041 * Move the task's state to DEAD/TRACE, only one thread can do this.
1043 * only one thread is allowed to do this:
1044 */ 1042 */
1045 state = xchg(&p->exit_state, EXIT_DEAD); 1043 state = traced && thread_group_leader(p) ? EXIT_TRACE : EXIT_DEAD;
1046 if (state != EXIT_ZOMBIE) { 1044 if (cmpxchg(&p->exit_state, EXIT_ZOMBIE, state) != EXIT_ZOMBIE)
1047 BUG_ON(state != EXIT_DEAD);
1048 return 0; 1045 return 0;
1049 }
1050
1051 traced = ptrace_reparented(p);
1052 /* 1046 /*
1053 * It can be ptraced but not reparented, check 1047 * It can be ptraced but not reparented, check
1054 * thread_group_leader() to filter out sub-threads. 1048 * thread_group_leader() to filter out sub-threads.
@@ -1109,7 +1103,7 @@ static int wait_task_zombie(struct wait_opts *wo, struct task_struct *p)
1109 1103
1110 /* 1104 /*
1111 * Now we are sure this task is interesting, and no other 1105 * Now we are sure this task is interesting, and no other
1112 * thread can reap it because we set its state to EXIT_DEAD. 1106 * thread can reap it because its state == DEAD/TRACE.
1113 */ 1107 */
1114 read_unlock(&tasklist_lock); 1108 read_unlock(&tasklist_lock);
1115 1109
@@ -1146,22 +1140,19 @@ static int wait_task_zombie(struct wait_opts *wo, struct task_struct *p)
1146 if (!retval) 1140 if (!retval)
1147 retval = pid; 1141 retval = pid;
1148 1142
1149 if (traced) { 1143 if (state == EXIT_TRACE) {
1150 write_lock_irq(&tasklist_lock); 1144 write_lock_irq(&tasklist_lock);
1151 /* We dropped tasklist, ptracer could die and untrace */ 1145 /* We dropped tasklist, ptracer could die and untrace */
1152 ptrace_unlink(p); 1146 ptrace_unlink(p);
1153 /* 1147
1154 * If this is not a sub-thread, notify the parent. 1148 /* If parent wants a zombie, don't release it now */
1155 * If parent wants a zombie, don't release it now. 1149 state = EXIT_ZOMBIE;
1156 */ 1150 if (do_notify_parent(p, p->exit_signal))
1157 if (thread_group_leader(p) && 1151 state = EXIT_DEAD;
1158 !do_notify_parent(p, p->exit_signal)) { 1152 p->exit_state = state;
1159 p->exit_state = EXIT_ZOMBIE;
1160 p = NULL;
1161 }
1162 write_unlock_irq(&tasklist_lock); 1153 write_unlock_irq(&tasklist_lock);
1163 } 1154 }
1164 if (p != NULL) 1155 if (state == EXIT_DEAD)
1165 release_task(p); 1156 release_task(p);
1166 1157
1167 return retval; 1158 return retval;
@@ -1338,7 +1329,12 @@ static int wait_task_continued(struct wait_opts *wo, struct task_struct *p)
1338static int wait_consider_task(struct wait_opts *wo, int ptrace, 1329static int wait_consider_task(struct wait_opts *wo, int ptrace,
1339 struct task_struct *p) 1330 struct task_struct *p)
1340{ 1331{
1341 int ret = eligible_child(wo, p); 1332 int ret;
1333
1334 if (unlikely(p->exit_state == EXIT_DEAD))
1335 return 0;
1336
1337 ret = eligible_child(wo, p);
1342 if (!ret) 1338 if (!ret)
1343 return ret; 1339 return ret;
1344 1340
@@ -1356,33 +1352,44 @@ static int wait_consider_task(struct wait_opts *wo, int ptrace,
1356 return 0; 1352 return 0;
1357 } 1353 }
1358 1354
1359 /* dead body doesn't have much to contribute */ 1355 if (unlikely(p->exit_state == EXIT_TRACE)) {
1360 if (unlikely(p->exit_state == EXIT_DEAD)) {
1361 /* 1356 /*
1362 * But do not ignore this task until the tracer does 1357 * ptrace == 0 means we are the natural parent. In this case
1363 * wait_task_zombie()->do_notify_parent(). 1358 * we should clear notask_error, debugger will notify us.
1364 */ 1359 */
1365 if (likely(!ptrace) && unlikely(ptrace_reparented(p))) 1360 if (likely(!ptrace))
1366 wo->notask_error = 0; 1361 wo->notask_error = 0;
1367 return 0; 1362 return 0;
1368 } 1363 }
1369 1364
1370 /* slay zombie? */ 1365 if (likely(!ptrace) && unlikely(p->ptrace)) {
1371 if (p->exit_state == EXIT_ZOMBIE) {
1372 /* 1366 /*
1373 * A zombie ptracee is only visible to its ptracer. 1367 * If it is traced by its real parent's group, just pretend
1374 * Notification and reaping will be cascaded to the real 1368 * the caller is ptrace_do_wait() and reap this child if it
1375 * parent when the ptracer detaches. 1369 * is zombie.
1370 *
1371 * This also hides group stop state from real parent; otherwise
1372 * a single stop can be reported twice as group and ptrace stop.
1373 * If a ptracer wants to distinguish these two events for its
1374 * own children it should create a separate process which takes
1375 * the role of real parent.
1376 */ 1376 */
1377 if (likely(!ptrace) && unlikely(p->ptrace)) { 1377 if (!ptrace_reparented(p))
1378 /* it will become visible, clear notask_error */ 1378 ptrace = 1;
1379 wo->notask_error = 0; 1379 }
1380 return 0;
1381 }
1382 1380
1381 /* slay zombie? */
1382 if (p->exit_state == EXIT_ZOMBIE) {
1383 /* we don't reap group leaders with subthreads */ 1383 /* we don't reap group leaders with subthreads */
1384 if (!delay_group_leader(p)) 1384 if (!delay_group_leader(p)) {
1385 return wait_task_zombie(wo, p); 1385 /*
1386 * A zombie ptracee is only visible to its ptracer.
1387 * Notification and reaping will be cascaded to the
1388 * real parent when the ptracer detaches.
1389 */
1390 if (unlikely(ptrace) || likely(!p->ptrace))
1391 return wait_task_zombie(wo, p);
1392 }
1386 1393
1387 /* 1394 /*
1388 * Allow access to stopped/continued state via zombie by 1395 * Allow access to stopped/continued state via zombie by
@@ -1408,19 +1415,6 @@ static int wait_consider_task(struct wait_opts *wo, int ptrace,
1408 wo->notask_error = 0; 1415 wo->notask_error = 0;
1409 } else { 1416 } else {
1410 /* 1417 /*
1411 * If @p is ptraced by a task in its real parent's group,
1412 * hide group stop/continued state when looking at @p as
1413 * the real parent; otherwise, a single stop can be
1414 * reported twice as group and ptrace stops.
1415 *
1416 * If a ptracer wants to distinguish the two events for its
1417 * own children, it should create a separate process which
1418 * takes the role of real parent.
1419 */
1420 if (likely(!ptrace) && p->ptrace && !ptrace_reparented(p))
1421 return 0;
1422
1423 /*
1424 * @p is alive and it's gonna stop, continue or exit, so 1418 * @p is alive and it's gonna stop, continue or exit, so
1425 * there always is something to wait for. 1419 * there always is something to wait for.
1426 */ 1420 */
diff --git a/kernel/fork.c b/kernel/fork.c
index 332688e5e7b4..54a8d26f612f 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -28,6 +28,8 @@
28#include <linux/mman.h> 28#include <linux/mman.h>
29#include <linux/mmu_notifier.h> 29#include <linux/mmu_notifier.h>
30#include <linux/fs.h> 30#include <linux/fs.h>
31#include <linux/mm.h>
32#include <linux/vmacache.h>
31#include <linux/nsproxy.h> 33#include <linux/nsproxy.h>
32#include <linux/capability.h> 34#include <linux/capability.h>
33#include <linux/cpu.h> 35#include <linux/cpu.h>
@@ -71,6 +73,7 @@
71#include <linux/signalfd.h> 73#include <linux/signalfd.h>
72#include <linux/uprobes.h> 74#include <linux/uprobes.h>
73#include <linux/aio.h> 75#include <linux/aio.h>
76#include <linux/compiler.h>
74 77
75#include <asm/pgtable.h> 78#include <asm/pgtable.h>
76#include <asm/pgalloc.h> 79#include <asm/pgalloc.h>
@@ -284,7 +287,7 @@ void __init fork_init(unsigned long mempages)
284 init_task.signal->rlim[RLIMIT_NPROC]; 287 init_task.signal->rlim[RLIMIT_NPROC];
285} 288}
286 289
287int __attribute__((weak)) arch_dup_task_struct(struct task_struct *dst, 290int __weak arch_dup_task_struct(struct task_struct *dst,
288 struct task_struct *src) 291 struct task_struct *src)
289{ 292{
290 *dst = *src; 293 *dst = *src;
@@ -364,7 +367,7 @@ static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm)
364 367
365 mm->locked_vm = 0; 368 mm->locked_vm = 0;
366 mm->mmap = NULL; 369 mm->mmap = NULL;
367 mm->mmap_cache = NULL; 370 mm->vmacache_seqnum = 0;
368 mm->map_count = 0; 371 mm->map_count = 0;
369 cpumask_clear(mm_cpumask(mm)); 372 cpumask_clear(mm_cpumask(mm));
370 mm->mm_rb = RB_ROOT; 373 mm->mm_rb = RB_ROOT;
@@ -530,8 +533,6 @@ static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p)
530 atomic_set(&mm->mm_count, 1); 533 atomic_set(&mm->mm_count, 1);
531 init_rwsem(&mm->mmap_sem); 534 init_rwsem(&mm->mmap_sem);
532 INIT_LIST_HEAD(&mm->mmlist); 535 INIT_LIST_HEAD(&mm->mmlist);
533 mm->flags = (current->mm) ?
534 (current->mm->flags & MMF_INIT_MASK) : default_dump_filter;
535 mm->core_state = NULL; 536 mm->core_state = NULL;
536 atomic_long_set(&mm->nr_ptes, 0); 537 atomic_long_set(&mm->nr_ptes, 0);
537 memset(&mm->rss_stat, 0, sizeof(mm->rss_stat)); 538 memset(&mm->rss_stat, 0, sizeof(mm->rss_stat));
@@ -540,8 +541,15 @@ static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p)
540 mm_init_owner(mm, p); 541 mm_init_owner(mm, p);
541 clear_tlb_flush_pending(mm); 542 clear_tlb_flush_pending(mm);
542 543
543 if (likely(!mm_alloc_pgd(mm))) { 544 if (current->mm) {
545 mm->flags = current->mm->flags & MMF_INIT_MASK;
546 mm->def_flags = current->mm->def_flags & VM_INIT_DEF_MASK;
547 } else {
548 mm->flags = default_dump_filter;
544 mm->def_flags = 0; 549 mm->def_flags = 0;
550 }
551
552 if (likely(!mm_alloc_pgd(mm))) {
545 mmu_notifier_mm_init(mm); 553 mmu_notifier_mm_init(mm);
546 return mm; 554 return mm;
547 } 555 }
@@ -877,6 +885,9 @@ static int copy_mm(unsigned long clone_flags, struct task_struct *tsk)
877 if (!oldmm) 885 if (!oldmm)
878 return 0; 886 return 0;
879 887
888 /* initialize the new vmacache entries */
889 vmacache_flush(tsk);
890
880 if (clone_flags & CLONE_VM) { 891 if (clone_flags & CLONE_VM) {
881 atomic_inc(&oldmm->mm_users); 892 atomic_inc(&oldmm->mm_users);
882 mm = oldmm; 893 mm = oldmm;
@@ -1070,15 +1081,6 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk)
1070 return 0; 1081 return 0;
1071} 1082}
1072 1083
1073static void copy_flags(unsigned long clone_flags, struct task_struct *p)
1074{
1075 unsigned long new_flags = p->flags;
1076
1077 new_flags &= ~(PF_SUPERPRIV | PF_WQ_WORKER);
1078 new_flags |= PF_FORKNOEXEC;
1079 p->flags = new_flags;
1080}
1081
1082SYSCALL_DEFINE1(set_tid_address, int __user *, tidptr) 1084SYSCALL_DEFINE1(set_tid_address, int __user *, tidptr)
1083{ 1085{
1084 current->clear_child_tid = tidptr; 1086 current->clear_child_tid = tidptr;
@@ -1228,7 +1230,8 @@ static struct task_struct *copy_process(unsigned long clone_flags,
1228 goto bad_fork_cleanup_count; 1230 goto bad_fork_cleanup_count;
1229 1231
1230 delayacct_tsk_init(p); /* Must remain after dup_task_struct() */ 1232 delayacct_tsk_init(p); /* Must remain after dup_task_struct() */
1231 copy_flags(clone_flags, p); 1233 p->flags &= ~(PF_SUPERPRIV | PF_WQ_WORKER);
1234 p->flags |= PF_FORKNOEXEC;
1232 INIT_LIST_HEAD(&p->children); 1235 INIT_LIST_HEAD(&p->children);
1233 INIT_LIST_HEAD(&p->sibling); 1236 INIT_LIST_HEAD(&p->sibling);
1234 rcu_copy_process(p); 1237 rcu_copy_process(p);
@@ -1272,9 +1275,8 @@ static struct task_struct *copy_process(unsigned long clone_flags,
1272 if (IS_ERR(p->mempolicy)) { 1275 if (IS_ERR(p->mempolicy)) {
1273 retval = PTR_ERR(p->mempolicy); 1276 retval = PTR_ERR(p->mempolicy);
1274 p->mempolicy = NULL; 1277 p->mempolicy = NULL;
1275 goto bad_fork_cleanup_cgroup; 1278 goto bad_fork_cleanup_threadgroup_lock;
1276 } 1279 }
1277 mpol_fix_fork_child_flag(p);
1278#endif 1280#endif
1279#ifdef CONFIG_CPUSETS 1281#ifdef CONFIG_CPUSETS
1280 p->cpuset_mem_spread_rotor = NUMA_NO_NODE; 1282 p->cpuset_mem_spread_rotor = NUMA_NO_NODE;
@@ -1525,11 +1527,10 @@ bad_fork_cleanup_policy:
1525 perf_event_free_task(p); 1527 perf_event_free_task(p);
1526#ifdef CONFIG_NUMA 1528#ifdef CONFIG_NUMA
1527 mpol_put(p->mempolicy); 1529 mpol_put(p->mempolicy);
1528bad_fork_cleanup_cgroup: 1530bad_fork_cleanup_threadgroup_lock:
1529#endif 1531#endif
1530 if (clone_flags & CLONE_THREAD) 1532 if (clone_flags & CLONE_THREAD)
1531 threadgroup_change_end(current); 1533 threadgroup_change_end(current);
1532 cgroup_exit(p, 0);
1533 delayacct_tsk_free(p); 1534 delayacct_tsk_free(p);
1534 module_put(task_thread_info(p)->exec_domain->module); 1535 module_put(task_thread_info(p)->exec_domain->module);
1535bad_fork_cleanup_count: 1536bad_fork_cleanup_count:
diff --git a/kernel/futex.c b/kernel/futex.c
index 67dacaf93e56..5f589279e462 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -70,7 +70,10 @@
70#include "locking/rtmutex_common.h" 70#include "locking/rtmutex_common.h"
71 71
72/* 72/*
73 * Basic futex operation and ordering guarantees: 73 * READ this before attempting to hack on futexes!
74 *
75 * Basic futex operation and ordering guarantees
76 * =============================================
74 * 77 *
75 * The waiter reads the futex value in user space and calls 78 * The waiter reads the futex value in user space and calls
76 * futex_wait(). This function computes the hash bucket and acquires 79 * futex_wait(). This function computes the hash bucket and acquires
@@ -119,7 +122,7 @@
119 * sys_futex(WAIT, futex, val); 122 * sys_futex(WAIT, futex, val);
120 * futex_wait(futex, val); 123 * futex_wait(futex, val);
121 * 124 *
122 * waiters++; 125 * waiters++; (a)
123 * mb(); (A) <-- paired with -. 126 * mb(); (A) <-- paired with -.
124 * | 127 * |
125 * lock(hash_bucket(futex)); | 128 * lock(hash_bucket(futex)); |
@@ -135,14 +138,14 @@
135 * unlock(hash_bucket(futex)); 138 * unlock(hash_bucket(futex));
136 * schedule(); if (waiters) 139 * schedule(); if (waiters)
137 * lock(hash_bucket(futex)); 140 * lock(hash_bucket(futex));
138 * wake_waiters(futex); 141 * else wake_waiters(futex);
139 * unlock(hash_bucket(futex)); 142 * waiters--; (b) unlock(hash_bucket(futex));
140 * 143 *
141 * Where (A) orders the waiters increment and the futex value read -- this 144 * Where (A) orders the waiters increment and the futex value read through
142 * is guaranteed by the head counter in the hb spinlock; and where (B) 145 * atomic operations (see hb_waiters_inc) and where (B) orders the write
143 * orders the write to futex and the waiters read -- this is done by the 146 * to futex and the waiters read -- this is done by the barriers in
144 * barriers in get_futex_key_refs(), through either ihold or atomic_inc, 147 * get_futex_key_refs(), through either ihold or atomic_inc, depending on the
145 * depending on the futex type. 148 * futex type.
146 * 149 *
147 * This yields the following case (where X:=waiters, Y:=futex): 150 * This yields the following case (where X:=waiters, Y:=futex):
148 * 151 *
@@ -155,6 +158,17 @@
155 * Which guarantees that x==0 && y==0 is impossible; which translates back into 158 * Which guarantees that x==0 && y==0 is impossible; which translates back into
156 * the guarantee that we cannot both miss the futex variable change and the 159 * the guarantee that we cannot both miss the futex variable change and the
157 * enqueue. 160 * enqueue.
161 *
162 * Note that a new waiter is accounted for in (a) even when it is possible that
163 * the wait call can return an error, in which case we backtrack from it in (b).
164 * Refer to the comment in queue_lock().
165 *
166 * Similarly, in order to account for waiters being requeued on another
167 * address we always increment the waiters for the destination bucket before
168 * acquiring the lock. We then decrement them again after releasing it -
169 * the code that actually moves the futex(es) between hash buckets (requeue_futex)
170 * will do the additional required waiter count housekeeping. This is done for
171 * double_lock_hb() and double_unlock_hb(), respectively.
158 */ 172 */
159 173
160#ifndef CONFIG_HAVE_FUTEX_CMPXCHG 174#ifndef CONFIG_HAVE_FUTEX_CMPXCHG
@@ -1452,6 +1466,7 @@ retry:
1452 hb2 = hash_futex(&key2); 1466 hb2 = hash_futex(&key2);
1453 1467
1454retry_private: 1468retry_private:
1469 hb_waiters_inc(hb2);
1455 double_lock_hb(hb1, hb2); 1470 double_lock_hb(hb1, hb2);
1456 1471
1457 if (likely(cmpval != NULL)) { 1472 if (likely(cmpval != NULL)) {
@@ -1461,6 +1476,7 @@ retry_private:
1461 1476
1462 if (unlikely(ret)) { 1477 if (unlikely(ret)) {
1463 double_unlock_hb(hb1, hb2); 1478 double_unlock_hb(hb1, hb2);
1479 hb_waiters_dec(hb2);
1464 1480
1465 ret = get_user(curval, uaddr1); 1481 ret = get_user(curval, uaddr1);
1466 if (ret) 1482 if (ret)
@@ -1510,6 +1526,7 @@ retry_private:
1510 break; 1526 break;
1511 case -EFAULT: 1527 case -EFAULT:
1512 double_unlock_hb(hb1, hb2); 1528 double_unlock_hb(hb1, hb2);
1529 hb_waiters_dec(hb2);
1513 put_futex_key(&key2); 1530 put_futex_key(&key2);
1514 put_futex_key(&key1); 1531 put_futex_key(&key1);
1515 ret = fault_in_user_writeable(uaddr2); 1532 ret = fault_in_user_writeable(uaddr2);
@@ -1519,6 +1536,7 @@ retry_private:
1519 case -EAGAIN: 1536 case -EAGAIN:
1520 /* The owner was exiting, try again. */ 1537 /* The owner was exiting, try again. */
1521 double_unlock_hb(hb1, hb2); 1538 double_unlock_hb(hb1, hb2);
1539 hb_waiters_dec(hb2);
1522 put_futex_key(&key2); 1540 put_futex_key(&key2);
1523 put_futex_key(&key1); 1541 put_futex_key(&key1);
1524 cond_resched(); 1542 cond_resched();
@@ -1594,6 +1612,7 @@ retry_private:
1594 1612
1595out_unlock: 1613out_unlock:
1596 double_unlock_hb(hb1, hb2); 1614 double_unlock_hb(hb1, hb2);
1615 hb_waiters_dec(hb2);
1597 1616
1598 /* 1617 /*
1599 * drop_futex_key_refs() must be called outside the spinlocks. During 1618 * drop_futex_key_refs() must be called outside the spinlocks. During
diff --git a/kernel/groups.c b/kernel/groups.c
index 90cf1c38c8ea..451698f86cfa 100644
--- a/kernel/groups.c
+++ b/kernel/groups.c
@@ -157,17 +157,13 @@ int groups_search(const struct group_info *group_info, kgid_t grp)
157 * set_groups - Change a group subscription in a set of credentials 157 * set_groups - Change a group subscription in a set of credentials
158 * @new: The newly prepared set of credentials to alter 158 * @new: The newly prepared set of credentials to alter
159 * @group_info: The group list to install 159 * @group_info: The group list to install
160 *
161 * Validate a group subscription and, if valid, insert it into a set
162 * of credentials.
163 */ 160 */
164int set_groups(struct cred *new, struct group_info *group_info) 161void set_groups(struct cred *new, struct group_info *group_info)
165{ 162{
166 put_group_info(new->group_info); 163 put_group_info(new->group_info);
167 groups_sort(group_info); 164 groups_sort(group_info);
168 get_group_info(group_info); 165 get_group_info(group_info);
169 new->group_info = group_info; 166 new->group_info = group_info;
170 return 0;
171} 167}
172 168
173EXPORT_SYMBOL(set_groups); 169EXPORT_SYMBOL(set_groups);
@@ -182,18 +178,12 @@ EXPORT_SYMBOL(set_groups);
182int set_current_groups(struct group_info *group_info) 178int set_current_groups(struct group_info *group_info)
183{ 179{
184 struct cred *new; 180 struct cred *new;
185 int ret;
186 181
187 new = prepare_creds(); 182 new = prepare_creds();
188 if (!new) 183 if (!new)
189 return -ENOMEM; 184 return -ENOMEM;
190 185
191 ret = set_groups(new, group_info); 186 set_groups(new, group_info);
192 if (ret < 0) {
193 abort_creds(new);
194 return ret;
195 }
196
197 return commit_creds(new); 187 return commit_creds(new);
198} 188}
199 189
diff --git a/kernel/hung_task.c b/kernel/hung_task.c
index 0b9c169d577f..06bb1417b063 100644
--- a/kernel/hung_task.c
+++ b/kernel/hung_task.c
@@ -246,5 +246,4 @@ static int __init hung_task_init(void)
246 246
247 return 0; 247 return 0;
248} 248}
249 249subsys_initcall(hung_task_init);
250module_init(hung_task_init);
diff --git a/kernel/kallsyms.c b/kernel/kallsyms.c
index 3127ad52cdb2..cb0cf37dac3a 100644
--- a/kernel/kallsyms.c
+++ b/kernel/kallsyms.c
@@ -23,6 +23,7 @@
23#include <linux/mm.h> 23#include <linux/mm.h>
24#include <linux/ctype.h> 24#include <linux/ctype.h>
25#include <linux/slab.h> 25#include <linux/slab.h>
26#include <linux/compiler.h>
26 27
27#include <asm/sections.h> 28#include <asm/sections.h>
28 29
@@ -36,8 +37,8 @@
36 * These will be re-linked against their real values 37 * These will be re-linked against their real values
37 * during the second link stage. 38 * during the second link stage.
38 */ 39 */
39extern const unsigned long kallsyms_addresses[] __attribute__((weak)); 40extern const unsigned long kallsyms_addresses[] __weak;
40extern const u8 kallsyms_names[] __attribute__((weak)); 41extern const u8 kallsyms_names[] __weak;
41 42
42/* 43/*
43 * Tell the compiler that the count isn't in the small data section if the arch 44 * Tell the compiler that the count isn't in the small data section if the arch
@@ -46,10 +47,10 @@ extern const u8 kallsyms_names[] __attribute__((weak));
46extern const unsigned long kallsyms_num_syms 47extern const unsigned long kallsyms_num_syms
47__attribute__((weak, section(".rodata"))); 48__attribute__((weak, section(".rodata")));
48 49
49extern const u8 kallsyms_token_table[] __attribute__((weak)); 50extern const u8 kallsyms_token_table[] __weak;
50extern const u16 kallsyms_token_index[] __attribute__((weak)); 51extern const u16 kallsyms_token_index[] __weak;
51 52
52extern const unsigned long kallsyms_markers[] __attribute__((weak)); 53extern const unsigned long kallsyms_markers[] __weak;
53 54
54static inline int is_kernel_inittext(unsigned long addr) 55static inline int is_kernel_inittext(unsigned long addr)
55{ 56{
diff --git a/kernel/kexec.c b/kernel/kexec.c
index 45601cf41bee..c8380ad203bc 100644
--- a/kernel/kexec.c
+++ b/kernel/kexec.c
@@ -32,6 +32,7 @@
32#include <linux/vmalloc.h> 32#include <linux/vmalloc.h>
33#include <linux/swap.h> 33#include <linux/swap.h>
34#include <linux/syscore_ops.h> 34#include <linux/syscore_ops.h>
35#include <linux/compiler.h>
35 36
36#include <asm/page.h> 37#include <asm/page.h>
37#include <asm/uaccess.h> 38#include <asm/uaccess.h>
@@ -1235,7 +1236,7 @@ static int __init crash_notes_memory_init(void)
1235 } 1236 }
1236 return 0; 1237 return 0;
1237} 1238}
1238module_init(crash_notes_memory_init) 1239subsys_initcall(crash_notes_memory_init);
1239 1240
1240 1241
1241/* 1242/*
@@ -1551,10 +1552,10 @@ void vmcoreinfo_append_str(const char *fmt, ...)
1551 * provide an empty default implementation here -- architecture 1552 * provide an empty default implementation here -- architecture
1552 * code may override this 1553 * code may override this
1553 */ 1554 */
1554void __attribute__ ((weak)) arch_crash_save_vmcoreinfo(void) 1555void __weak arch_crash_save_vmcoreinfo(void)
1555{} 1556{}
1556 1557
1557unsigned long __attribute__ ((weak)) paddr_vmcoreinfo_note(void) 1558unsigned long __weak paddr_vmcoreinfo_note(void)
1558{ 1559{
1559 return __pa((unsigned long)(char *)&vmcoreinfo_note); 1560 return __pa((unsigned long)(char *)&vmcoreinfo_note);
1560} 1561}
@@ -1629,7 +1630,7 @@ static int __init crash_save_vmcoreinfo_init(void)
1629 return 0; 1630 return 0;
1630} 1631}
1631 1632
1632module_init(crash_save_vmcoreinfo_init) 1633subsys_initcall(crash_save_vmcoreinfo_init);
1633 1634
1634/* 1635/*
1635 * Move into place and start executing a preloaded standalone 1636 * Move into place and start executing a preloaded standalone
diff --git a/kernel/ksysfs.c b/kernel/ksysfs.c
index e660964086e2..2495a9b14ac8 100644
--- a/kernel/ksysfs.c
+++ b/kernel/ksysfs.c
@@ -18,6 +18,7 @@
18#include <linux/stat.h> 18#include <linux/stat.h>
19#include <linux/sched.h> 19#include <linux/sched.h>
20#include <linux/capability.h> 20#include <linux/capability.h>
21#include <linux/compiler.h>
21 22
22#include <linux/rcupdate.h> /* rcu_expedited */ 23#include <linux/rcupdate.h> /* rcu_expedited */
23 24
@@ -162,8 +163,8 @@ KERNEL_ATTR_RW(rcu_expedited);
162/* 163/*
163 * Make /sys/kernel/notes give the raw contents of our kernel .notes section. 164 * Make /sys/kernel/notes give the raw contents of our kernel .notes section.
164 */ 165 */
165extern const void __start_notes __attribute__((weak)); 166extern const void __start_notes __weak;
166extern const void __stop_notes __attribute__((weak)); 167extern const void __stop_notes __weak;
167#define notes_size (&__stop_notes - &__start_notes) 168#define notes_size (&__stop_notes - &__start_notes)
168 169
169static ssize_t notes_read(struct file *filp, struct kobject *kobj, 170static ssize_t notes_read(struct file *filp, struct kobject *kobj,
diff --git a/kernel/kthread.c b/kernel/kthread.c
index b5ae3ee860a9..9a130ec06f7a 100644
--- a/kernel/kthread.c
+++ b/kernel/kthread.c
@@ -217,7 +217,7 @@ int tsk_fork_get_node(struct task_struct *tsk)
217 if (tsk == kthreadd_task) 217 if (tsk == kthreadd_task)
218 return tsk->pref_node_fork; 218 return tsk->pref_node_fork;
219#endif 219#endif
220 return numa_node_id(); 220 return NUMA_NO_NODE;
221} 221}
222 222
223static void create_kthread(struct kthread_create_info *create) 223static void create_kthread(struct kthread_create_info *create)
@@ -369,7 +369,7 @@ struct task_struct *kthread_create_on_cpu(int (*threadfn)(void *data),
369{ 369{
370 struct task_struct *p; 370 struct task_struct *p;
371 371
372 p = kthread_create_on_node(threadfn, data, cpu_to_node(cpu), namefmt, 372 p = kthread_create_on_node(threadfn, data, cpu_to_mem(cpu), namefmt,
373 cpu); 373 cpu);
374 if (IS_ERR(p)) 374 if (IS_ERR(p))
375 return p; 375 return p;
diff --git a/kernel/locking/Makefile b/kernel/locking/Makefile
index 306a76b51e0f..b8bdcd4785b7 100644
--- a/kernel/locking/Makefile
+++ b/kernel/locking/Makefile
@@ -1,5 +1,5 @@
1 1
2obj-y += mutex.o semaphore.o rwsem.o lglock.o mcs_spinlock.o 2obj-y += mutex.o semaphore.o rwsem.o mcs_spinlock.o
3 3
4ifdef CONFIG_FUNCTION_TRACER 4ifdef CONFIG_FUNCTION_TRACER
5CFLAGS_REMOVE_lockdep.o = -pg 5CFLAGS_REMOVE_lockdep.o = -pg
@@ -14,6 +14,7 @@ ifeq ($(CONFIG_PROC_FS),y)
14obj-$(CONFIG_LOCKDEP) += lockdep_proc.o 14obj-$(CONFIG_LOCKDEP) += lockdep_proc.o
15endif 15endif
16obj-$(CONFIG_SMP) += spinlock.o 16obj-$(CONFIG_SMP) += spinlock.o
17obj-$(CONFIG_SMP) += lglock.o
17obj-$(CONFIG_PROVE_LOCKING) += spinlock.o 18obj-$(CONFIG_PROVE_LOCKING) += spinlock.o
18obj-$(CONFIG_RT_MUTEXES) += rtmutex.o 19obj-$(CONFIG_RT_MUTEXES) += rtmutex.o
19obj-$(CONFIG_DEBUG_RT_MUTEXES) += rtmutex-debug.o 20obj-$(CONFIG_DEBUG_RT_MUTEXES) += rtmutex-debug.o
diff --git a/kernel/locking/mutex-debug.c b/kernel/locking/mutex-debug.c
index e1191c996c59..5cf6731b98e9 100644
--- a/kernel/locking/mutex-debug.c
+++ b/kernel/locking/mutex-debug.c
@@ -71,18 +71,17 @@ void mutex_remove_waiter(struct mutex *lock, struct mutex_waiter *waiter,
71 71
72void debug_mutex_unlock(struct mutex *lock) 72void debug_mutex_unlock(struct mutex *lock)
73{ 73{
74 if (unlikely(!debug_locks)) 74 if (likely(debug_locks)) {
75 return; 75 DEBUG_LOCKS_WARN_ON(lock->magic != lock);
76 76
77 DEBUG_LOCKS_WARN_ON(lock->magic != lock); 77 if (!lock->owner)
78 DEBUG_LOCKS_WARN_ON(!lock->owner);
79 else
80 DEBUG_LOCKS_WARN_ON(lock->owner != current);
78 81
79 if (!lock->owner) 82 DEBUG_LOCKS_WARN_ON(!lock->wait_list.prev && !lock->wait_list.next);
80 DEBUG_LOCKS_WARN_ON(!lock->owner); 83 mutex_clear_owner(lock);
81 else 84 }
82 DEBUG_LOCKS_WARN_ON(lock->owner != current);
83
84 DEBUG_LOCKS_WARN_ON(!lock->wait_list.prev && !lock->wait_list.next);
85 mutex_clear_owner(lock);
86 85
87 /* 86 /*
88 * __mutex_slowpath_needs_to_unlock() is explicitly 0 for debug 87 * __mutex_slowpath_needs_to_unlock() is explicitly 0 for debug
diff --git a/kernel/module.c b/kernel/module.c
index 8dc7f5e80dd8..11869408f79b 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -640,7 +640,7 @@ static int module_unload_init(struct module *mod)
640 INIT_LIST_HEAD(&mod->target_list); 640 INIT_LIST_HEAD(&mod->target_list);
641 641
642 /* Hold reference count during initialization. */ 642 /* Hold reference count during initialization. */
643 __this_cpu_write(mod->refptr->incs, 1); 643 raw_cpu_write(mod->refptr->incs, 1);
644 644
645 return 0; 645 return 0;
646} 646}
@@ -1013,6 +1013,8 @@ static size_t module_flags_taint(struct module *mod, char *buf)
1013 buf[l++] = 'F'; 1013 buf[l++] = 'F';
1014 if (mod->taints & (1 << TAINT_CRAP)) 1014 if (mod->taints & (1 << TAINT_CRAP))
1015 buf[l++] = 'C'; 1015 buf[l++] = 'C';
1016 if (mod->taints & (1 << TAINT_UNSIGNED_MODULE))
1017 buf[l++] = 'E';
1016 /* 1018 /*
1017 * TAINT_FORCED_RMMOD: could be added. 1019 * TAINT_FORCED_RMMOD: could be added.
1018 * TAINT_CPU_OUT_OF_SPEC, TAINT_MACHINE_CHECK, TAINT_BAD_PAGE don't 1020 * TAINT_CPU_OUT_OF_SPEC, TAINT_MACHINE_CHECK, TAINT_BAD_PAGE don't
@@ -3218,7 +3220,7 @@ static int load_module(struct load_info *info, const char __user *uargs,
3218 pr_notice_once("%s: module verification failed: signature " 3220 pr_notice_once("%s: module verification failed: signature "
3219 "and/or required key missing - tainting " 3221 "and/or required key missing - tainting "
3220 "kernel\n", mod->name); 3222 "kernel\n", mod->name);
3221 add_taint_module(mod, TAINT_FORCED_MODULE, LOCKDEP_STILL_OK); 3223 add_taint_module(mod, TAINT_UNSIGNED_MODULE, LOCKDEP_STILL_OK);
3222 } 3224 }
3223#endif 3225#endif
3224 3226
@@ -3813,12 +3815,12 @@ void print_modules(void)
3813 list_for_each_entry_rcu(mod, &modules, list) { 3815 list_for_each_entry_rcu(mod, &modules, list) {
3814 if (mod->state == MODULE_STATE_UNFORMED) 3816 if (mod->state == MODULE_STATE_UNFORMED)
3815 continue; 3817 continue;
3816 printk(" %s%s", mod->name, module_flags(mod, buf)); 3818 pr_cont(" %s%s", mod->name, module_flags(mod, buf));
3817 } 3819 }
3818 preempt_enable(); 3820 preempt_enable();
3819 if (last_unloaded_module[0]) 3821 if (last_unloaded_module[0])
3820 printk(" [last unloaded: %s]", last_unloaded_module); 3822 pr_cont(" [last unloaded: %s]", last_unloaded_module);
3821 printk("\n"); 3823 pr_cont("\n");
3822} 3824}
3823 3825
3824#ifdef CONFIG_MODVERSIONS 3826#ifdef CONFIG_MODVERSIONS
diff --git a/kernel/panic.c b/kernel/panic.c
index cca8a913ae7c..d02fa9fef46a 100644
--- a/kernel/panic.c
+++ b/kernel/panic.c
@@ -100,7 +100,7 @@ void panic(const char *fmt, ...)
100 va_start(args, fmt); 100 va_start(args, fmt);
101 vsnprintf(buf, sizeof(buf), fmt, args); 101 vsnprintf(buf, sizeof(buf), fmt, args);
102 va_end(args); 102 va_end(args);
103 printk(KERN_EMERG "Kernel panic - not syncing: %s\n",buf); 103 pr_emerg("Kernel panic - not syncing: %s\n", buf);
104#ifdef CONFIG_DEBUG_BUGVERBOSE 104#ifdef CONFIG_DEBUG_BUGVERBOSE
105 /* 105 /*
106 * Avoid nested stack-dumping if a panic occurs during oops processing 106 * Avoid nested stack-dumping if a panic occurs during oops processing
@@ -141,7 +141,7 @@ void panic(const char *fmt, ...)
141 * Delay timeout seconds before rebooting the machine. 141 * Delay timeout seconds before rebooting the machine.
142 * We can't use the "normal" timers since we just panicked. 142 * We can't use the "normal" timers since we just panicked.
143 */ 143 */
144 printk(KERN_EMERG "Rebooting in %d seconds..", panic_timeout); 144 pr_emerg("Rebooting in %d seconds..", panic_timeout);
145 145
146 for (i = 0; i < panic_timeout * 1000; i += PANIC_TIMER_STEP) { 146 for (i = 0; i < panic_timeout * 1000; i += PANIC_TIMER_STEP) {
147 touch_nmi_watchdog(); 147 touch_nmi_watchdog();
@@ -165,7 +165,7 @@ void panic(const char *fmt, ...)
165 extern int stop_a_enabled; 165 extern int stop_a_enabled;
166 /* Make sure the user can actually press Stop-A (L1-A) */ 166 /* Make sure the user can actually press Stop-A (L1-A) */
167 stop_a_enabled = 1; 167 stop_a_enabled = 1;
168 printk(KERN_EMERG "Press Stop-A (L1-A) to return to the boot prom\n"); 168 pr_emerg("Press Stop-A (L1-A) to return to the boot prom\n");
169 } 169 }
170#endif 170#endif
171#if defined(CONFIG_S390) 171#if defined(CONFIG_S390)
@@ -176,6 +176,7 @@ void panic(const char *fmt, ...)
176 disabled_wait(caller); 176 disabled_wait(caller);
177 } 177 }
178#endif 178#endif
179 pr_emerg("---[ end Kernel panic - not syncing: %s\n", buf);
179 local_irq_enable(); 180 local_irq_enable();
180 for (i = 0; ; i += PANIC_TIMER_STEP) { 181 for (i = 0; ; i += PANIC_TIMER_STEP) {
181 touch_softlockup_watchdog(); 182 touch_softlockup_watchdog();
@@ -210,6 +211,7 @@ static const struct tnt tnts[] = {
210 { TAINT_CRAP, 'C', ' ' }, 211 { TAINT_CRAP, 'C', ' ' },
211 { TAINT_FIRMWARE_WORKAROUND, 'I', ' ' }, 212 { TAINT_FIRMWARE_WORKAROUND, 'I', ' ' },
212 { TAINT_OOT_MODULE, 'O', ' ' }, 213 { TAINT_OOT_MODULE, 'O', ' ' },
214 { TAINT_UNSIGNED_MODULE, 'E', ' ' },
213}; 215};
214 216
215/** 217/**
@@ -228,6 +230,7 @@ static const struct tnt tnts[] = {
228 * 'C' - modules from drivers/staging are loaded. 230 * 'C' - modules from drivers/staging are loaded.
229 * 'I' - Working around severe firmware bug. 231 * 'I' - Working around severe firmware bug.
230 * 'O' - Out-of-tree module has been loaded. 232 * 'O' - Out-of-tree module has been loaded.
233 * 'E' - Unsigned module has been loaded.
231 * 234 *
232 * The string is overwritten by the next call to print_tainted(). 235 * The string is overwritten by the next call to print_tainted().
233 */ 236 */
@@ -274,8 +277,7 @@ unsigned long get_taint(void)
274void add_taint(unsigned flag, enum lockdep_ok lockdep_ok) 277void add_taint(unsigned flag, enum lockdep_ok lockdep_ok)
275{ 278{
276 if (lockdep_ok == LOCKDEP_NOW_UNRELIABLE && __debug_locks_off()) 279 if (lockdep_ok == LOCKDEP_NOW_UNRELIABLE && __debug_locks_off())
277 printk(KERN_WARNING 280 pr_warn("Disabling lock debugging due to kernel taint\n");
278 "Disabling lock debugging due to kernel taint\n");
279 281
280 set_bit(flag, &tainted_mask); 282 set_bit(flag, &tainted_mask);
281} 283}
@@ -380,8 +382,7 @@ late_initcall(init_oops_id);
380void print_oops_end_marker(void) 382void print_oops_end_marker(void)
381{ 383{
382 init_oops_id(); 384 init_oops_id();
383 printk(KERN_WARNING "---[ end trace %016llx ]---\n", 385 pr_warn("---[ end trace %016llx ]---\n", (unsigned long long)oops_id);
384 (unsigned long long)oops_id);
385} 386}
386 387
387/* 388/*
diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c
index 06c62de9c711..db95d8eb761b 100644
--- a/kernel/pid_namespace.c
+++ b/kernel/pid_namespace.c
@@ -318,7 +318,9 @@ static void *pidns_get(struct task_struct *task)
318 struct pid_namespace *ns; 318 struct pid_namespace *ns;
319 319
320 rcu_read_lock(); 320 rcu_read_lock();
321 ns = get_pid_ns(task_active_pid_ns(task)); 321 ns = task_active_pid_ns(task);
322 if (ns)
323 get_pid_ns(ns);
322 rcu_read_unlock(); 324 rcu_read_unlock();
323 325
324 return ns; 326 return ns;
diff --git a/kernel/power/power.h b/kernel/power/power.h
index 1ca753106557..15f37ea08719 100644
--- a/kernel/power/power.h
+++ b/kernel/power/power.h
@@ -2,6 +2,7 @@
2#include <linux/suspend_ioctls.h> 2#include <linux/suspend_ioctls.h>
3#include <linux/utsname.h> 3#include <linux/utsname.h>
4#include <linux/freezer.h> 4#include <linux/freezer.h>
5#include <linux/compiler.h>
5 6
6struct swsusp_info { 7struct swsusp_info {
7 struct new_utsname uts; 8 struct new_utsname uts;
@@ -11,7 +12,7 @@ struct swsusp_info {
11 unsigned long image_pages; 12 unsigned long image_pages;
12 unsigned long pages; 13 unsigned long pages;
13 unsigned long size; 14 unsigned long size;
14} __attribute__((aligned(PAGE_SIZE))); 15} __aligned(PAGE_SIZE);
15 16
16#ifdef CONFIG_HIBERNATION 17#ifdef CONFIG_HIBERNATION
17/* kernel/power/snapshot.c */ 18/* kernel/power/snapshot.c */
diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c
index 149e745eaa52..18fb7a2fb14b 100644
--- a/kernel/power/snapshot.c
+++ b/kernel/power/snapshot.c
@@ -27,6 +27,7 @@
27#include <linux/highmem.h> 27#include <linux/highmem.h>
28#include <linux/list.h> 28#include <linux/list.h>
29#include <linux/slab.h> 29#include <linux/slab.h>
30#include <linux/compiler.h>
30 31
31#include <asm/uaccess.h> 32#include <asm/uaccess.h>
32#include <asm/mmu_context.h> 33#include <asm/mmu_context.h>
@@ -155,7 +156,7 @@ static inline void free_image_page(void *addr, int clear_nosave_free)
155struct linked_page { 156struct linked_page {
156 struct linked_page *next; 157 struct linked_page *next;
157 char data[LINKED_PAGE_DATA_SIZE]; 158 char data[LINKED_PAGE_DATA_SIZE];
158} __attribute__((packed)); 159} __packed;
159 160
160static inline void 161static inline void
161free_list_of_pages(struct linked_page *list, int clear_page_nosave) 162free_list_of_pages(struct linked_page *list, int clear_page_nosave)
diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c
index 90b3d9366d1a..c3ad9cafe930 100644
--- a/kernel/power/suspend.c
+++ b/kernel/power/suspend.c
@@ -26,6 +26,7 @@
26#include <linux/syscore_ops.h> 26#include <linux/syscore_ops.h>
27#include <linux/ftrace.h> 27#include <linux/ftrace.h>
28#include <trace/events/power.h> 28#include <trace/events/power.h>
29#include <linux/compiler.h>
29 30
30#include "power.h" 31#include "power.h"
31 32
@@ -156,13 +157,13 @@ static int suspend_prepare(suspend_state_t state)
156} 157}
157 158
158/* default implementation */ 159/* default implementation */
159void __attribute__ ((weak)) arch_suspend_disable_irqs(void) 160void __weak arch_suspend_disable_irqs(void)
160{ 161{
161 local_irq_disable(); 162 local_irq_disable();
162} 163}
163 164
164/* default implementation */ 165/* default implementation */
165void __attribute__ ((weak)) arch_suspend_enable_irqs(void) 166void __weak arch_suspend_enable_irqs(void)
166{ 167{
167 local_irq_enable(); 168 local_irq_enable();
168} 169}
diff --git a/kernel/power/swap.c b/kernel/power/swap.c
index 7c33ed200410..8c9a4819f798 100644
--- a/kernel/power/swap.c
+++ b/kernel/power/swap.c
@@ -101,7 +101,7 @@ struct swsusp_header {
101 unsigned int flags; /* Flags to pass to the "boot" kernel */ 101 unsigned int flags; /* Flags to pass to the "boot" kernel */
102 char orig_sig[10]; 102 char orig_sig[10];
103 char sig[10]; 103 char sig[10];
104} __attribute__((packed)); 104} __packed;
105 105
106static struct swsusp_header *swsusp_header; 106static struct swsusp_header *swsusp_header;
107 107
diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c
index 4dae9cbe9259..a45b50962295 100644
--- a/kernel/printk/printk.c
+++ b/kernel/printk/printk.c
@@ -319,7 +319,7 @@ static void log_store(int facility, int level,
319 else 319 else
320 free = log_first_idx - log_next_idx; 320 free = log_first_idx - log_next_idx;
321 321
322 if (free > size + sizeof(struct printk_log)) 322 if (free >= size + sizeof(struct printk_log))
323 break; 323 break;
324 324
325 /* drop old messages until we have enough contiguous space */ 325 /* drop old messages until we have enough contiguous space */
@@ -327,7 +327,7 @@ static void log_store(int facility, int level,
327 log_first_seq++; 327 log_first_seq++;
328 } 328 }
329 329
330 if (log_next_idx + size + sizeof(struct printk_log) >= log_buf_len) { 330 if (log_next_idx + size + sizeof(struct printk_log) > log_buf_len) {
331 /* 331 /*
332 * This message + an additional empty header does not fit 332 * This message + an additional empty header does not fit
333 * at the end of the buffer. Add an empty header with len == 0 333 * at the end of the buffer. Add an empty header with len == 0
@@ -351,7 +351,7 @@ static void log_store(int facility, int level,
351 else 351 else
352 msg->ts_nsec = local_clock(); 352 msg->ts_nsec = local_clock();
353 memset(log_dict(msg) + dict_len, 0, pad_len); 353 memset(log_dict(msg) + dict_len, 0, pad_len);
354 msg->len = sizeof(struct printk_log) + text_len + dict_len + pad_len; 354 msg->len = size;
355 355
356 /* insert message */ 356 /* insert message */
357 log_next_idx += msg->len; 357 log_next_idx += msg->len;
@@ -1560,9 +1560,12 @@ asmlinkage int vprintk_emit(int facility, int level,
1560 level = kern_level - '0'; 1560 level = kern_level - '0';
1561 case 'd': /* KERN_DEFAULT */ 1561 case 'd': /* KERN_DEFAULT */
1562 lflags |= LOG_PREFIX; 1562 lflags |= LOG_PREFIX;
1563 case 'c': /* KERN_CONT */
1564 break;
1565 } 1563 }
1564 /*
1565 * No need to check length here because vscnprintf
1566 * puts '\0' at the end of the string. Only valid and
1567 * newly printed level is detected.
1568 */
1566 text_len -= end_of_header - text; 1569 text_len -= end_of_header - text;
1567 text = (char *)end_of_header; 1570 text = (char *)end_of_header;
1568 } 1571 }
@@ -1880,6 +1883,7 @@ void suspend_console(void)
1880 console_lock(); 1883 console_lock();
1881 console_suspended = 1; 1884 console_suspended = 1;
1882 up(&console_sem); 1885 up(&console_sem);
1886 mutex_release(&console_lock_dep_map, 1, _RET_IP_);
1883} 1887}
1884 1888
1885void resume_console(void) 1889void resume_console(void)
@@ -1887,6 +1891,7 @@ void resume_console(void)
1887 if (!console_suspend_enabled) 1891 if (!console_suspend_enabled)
1888 return; 1892 return;
1889 down(&console_sem); 1893 down(&console_sem);
1894 mutex_acquire(&console_lock_dep_map, 0, 0, _RET_IP_);
1890 console_suspended = 0; 1895 console_suspended = 0;
1891 console_unlock(); 1896 console_unlock();
1892} 1897}
diff --git a/kernel/profile.c b/kernel/profile.c
index ebdd9c1a86b4..cb980f0c731b 100644
--- a/kernel/profile.c
+++ b/kernel/profile.c
@@ -591,18 +591,28 @@ out_cleanup:
591int __ref create_proc_profile(void) /* false positive from hotcpu_notifier */ 591int __ref create_proc_profile(void) /* false positive from hotcpu_notifier */
592{ 592{
593 struct proc_dir_entry *entry; 593 struct proc_dir_entry *entry;
594 int err = 0;
594 595
595 if (!prof_on) 596 if (!prof_on)
596 return 0; 597 return 0;
597 if (create_hash_tables()) 598
598 return -ENOMEM; 599 cpu_notifier_register_begin();
600
601 if (create_hash_tables()) {
602 err = -ENOMEM;
603 goto out;
604 }
605
599 entry = proc_create("profile", S_IWUSR | S_IRUGO, 606 entry = proc_create("profile", S_IWUSR | S_IRUGO,
600 NULL, &proc_profile_operations); 607 NULL, &proc_profile_operations);
601 if (!entry) 608 if (!entry)
602 return 0; 609 goto out;
603 proc_set_size(entry, (1 + prof_len) * sizeof(atomic_t)); 610 proc_set_size(entry, (1 + prof_len) * sizeof(atomic_t));
604 hotcpu_notifier(profile_cpu_callback, 0); 611 __hotcpu_notifier(profile_cpu_callback, 0);
605 return 0; 612
613out:
614 cpu_notifier_register_done();
615 return err;
606} 616}
607module_init(create_proc_profile); 617subsys_initcall(create_proc_profile);
608#endif /* CONFIG_PROC_FS */ 618#endif /* CONFIG_PROC_FS */
diff --git a/kernel/relay.c b/kernel/relay.c
index 5001c9887db1..5a56d3c8dc03 100644
--- a/kernel/relay.c
+++ b/kernel/relay.c
@@ -227,7 +227,7 @@ static void relay_destroy_buf(struct rchan_buf *buf)
227 * relay_remove_buf - remove a channel buffer 227 * relay_remove_buf - remove a channel buffer
228 * @kref: target kernel reference that contains the relay buffer 228 * @kref: target kernel reference that contains the relay buffer
229 * 229 *
230 * Removes the file from the fileystem, which also frees the 230 * Removes the file from the filesystem, which also frees the
231 * rchan_buf_struct and the channel buffer. Should only be called from 231 * rchan_buf_struct and the channel buffer. Should only be called from
232 * kref_put(). 232 * kref_put().
233 */ 233 */
@@ -1195,8 +1195,6 @@ static void relay_pipe_buf_release(struct pipe_inode_info *pipe,
1195 1195
1196static const struct pipe_buf_operations relay_pipe_buf_ops = { 1196static const struct pipe_buf_operations relay_pipe_buf_ops = {
1197 .can_merge = 0, 1197 .can_merge = 0,
1198 .map = generic_pipe_buf_map,
1199 .unmap = generic_pipe_buf_unmap,
1200 .confirm = generic_pipe_buf_confirm, 1198 .confirm = generic_pipe_buf_confirm,
1201 .release = relay_pipe_buf_release, 1199 .release = relay_pipe_buf_release,
1202 .steal = generic_pipe_buf_steal, 1200 .steal = generic_pipe_buf_steal,
@@ -1253,7 +1251,7 @@ static ssize_t subbuf_splice_actor(struct file *in,
1253 subbuf_pages = rbuf->chan->alloc_size >> PAGE_SHIFT; 1251 subbuf_pages = rbuf->chan->alloc_size >> PAGE_SHIFT;
1254 pidx = (read_start / PAGE_SIZE) % subbuf_pages; 1252 pidx = (read_start / PAGE_SIZE) % subbuf_pages;
1255 poff = read_start & ~PAGE_MASK; 1253 poff = read_start & ~PAGE_MASK;
1256 nr_pages = min_t(unsigned int, subbuf_pages, pipe->buffers); 1254 nr_pages = min_t(unsigned int, subbuf_pages, spd.nr_pages_max);
1257 1255
1258 for (total_len = 0; spd.nr_pages < nr_pages; spd.nr_pages++) { 1256 for (total_len = 0; spd.nr_pages < nr_pages; spd.nr_pages++) {
1259 unsigned int this_len, this_end, private; 1257 unsigned int this_len, this_end, private;
diff --git a/kernel/res_counter.c b/kernel/res_counter.c
index 4aa8a305aede..51dbac6a3633 100644
--- a/kernel/res_counter.c
+++ b/kernel/res_counter.c
@@ -22,8 +22,18 @@ void res_counter_init(struct res_counter *counter, struct res_counter *parent)
22 counter->parent = parent; 22 counter->parent = parent;
23} 23}
24 24
25int res_counter_charge_locked(struct res_counter *counter, unsigned long val, 25static u64 res_counter_uncharge_locked(struct res_counter *counter,
26 bool force) 26 unsigned long val)
27{
28 if (WARN_ON(counter->usage < val))
29 val = counter->usage;
30
31 counter->usage -= val;
32 return counter->usage;
33}
34
35static int res_counter_charge_locked(struct res_counter *counter,
36 unsigned long val, bool force)
27{ 37{
28 int ret = 0; 38 int ret = 0;
29 39
@@ -86,15 +96,6 @@ int res_counter_charge_nofail(struct res_counter *counter, unsigned long val,
86 return __res_counter_charge(counter, val, limit_fail_at, true); 96 return __res_counter_charge(counter, val, limit_fail_at, true);
87} 97}
88 98
89u64 res_counter_uncharge_locked(struct res_counter *counter, unsigned long val)
90{
91 if (WARN_ON(counter->usage < val))
92 val = counter->usage;
93
94 counter->usage -= val;
95 return counter->usage;
96}
97
98u64 res_counter_uncharge_until(struct res_counter *counter, 99u64 res_counter_uncharge_until(struct res_counter *counter,
99 struct res_counter *top, 100 struct res_counter *top,
100 unsigned long val) 101 unsigned long val)
diff --git a/kernel/resource.c b/kernel/resource.c
index 673061c06da1..8957d686e29b 100644
--- a/kernel/resource.c
+++ b/kernel/resource.c
@@ -511,7 +511,7 @@ static int find_resource(struct resource *root, struct resource *new,
511 * @newsize: new size of the resource descriptor 511 * @newsize: new size of the resource descriptor
512 * @constraint: the size and alignment constraints to be met. 512 * @constraint: the size and alignment constraints to be met.
513 */ 513 */
514int reallocate_resource(struct resource *root, struct resource *old, 514static int reallocate_resource(struct resource *root, struct resource *old,
515 resource_size_t newsize, 515 resource_size_t newsize,
516 struct resource_constraint *constraint) 516 struct resource_constraint *constraint)
517{ 517{
diff --git a/kernel/sched/clock.c b/kernel/sched/clock.c
index b30a2924ef14..3ef6451e972e 100644
--- a/kernel/sched/clock.c
+++ b/kernel/sched/clock.c
@@ -60,13 +60,14 @@
60#include <linux/sched.h> 60#include <linux/sched.h>
61#include <linux/static_key.h> 61#include <linux/static_key.h>
62#include <linux/workqueue.h> 62#include <linux/workqueue.h>
63#include <linux/compiler.h>
63 64
64/* 65/*
65 * Scheduler clock - returns current time in nanosec units. 66 * Scheduler clock - returns current time in nanosec units.
66 * This is default implementation. 67 * This is default implementation.
67 * Architectures and sub-architectures can override this. 68 * Architectures and sub-architectures can override this.
68 */ 69 */
69unsigned long long __attribute__((weak)) sched_clock(void) 70unsigned long long __weak sched_clock(void)
70{ 71{
71 return (unsigned long long)(jiffies - INITIAL_JIFFIES) 72 return (unsigned long long)(jiffies - INITIAL_JIFFIES)
72 * (NSEC_PER_SEC / HZ); 73 * (NSEC_PER_SEC / HZ);
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 9cae286824bb..13584f1cccfc 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -73,6 +73,7 @@
73#include <linux/init_task.h> 73#include <linux/init_task.h>
74#include <linux/binfmts.h> 74#include <linux/binfmts.h>
75#include <linux/context_tracking.h> 75#include <linux/context_tracking.h>
76#include <linux/compiler.h>
76 77
77#include <asm/switch_to.h> 78#include <asm/switch_to.h>
78#include <asm/tlb.h> 79#include <asm/tlb.h>
@@ -2591,8 +2592,14 @@ pick_next_task(struct rq *rq, struct task_struct *prev)
2591 if (likely(prev->sched_class == class && 2592 if (likely(prev->sched_class == class &&
2592 rq->nr_running == rq->cfs.h_nr_running)) { 2593 rq->nr_running == rq->cfs.h_nr_running)) {
2593 p = fair_sched_class.pick_next_task(rq, prev); 2594 p = fair_sched_class.pick_next_task(rq, prev);
2594 if (likely(p && p != RETRY_TASK)) 2595 if (unlikely(p == RETRY_TASK))
2595 return p; 2596 goto again;
2597
2598 /* assumes fair_sched_class->next == idle_sched_class */
2599 if (unlikely(!p))
2600 p = idle_sched_class.pick_next_task(rq, prev);
2601
2602 return p;
2596 } 2603 }
2597 2604
2598again: 2605again:
@@ -2845,52 +2852,6 @@ int default_wake_function(wait_queue_t *curr, unsigned mode, int wake_flags,
2845} 2852}
2846EXPORT_SYMBOL(default_wake_function); 2853EXPORT_SYMBOL(default_wake_function);
2847 2854
2848static long __sched
2849sleep_on_common(wait_queue_head_t *q, int state, long timeout)
2850{
2851 unsigned long flags;
2852 wait_queue_t wait;
2853
2854 init_waitqueue_entry(&wait, current);
2855
2856 __set_current_state(state);
2857
2858 spin_lock_irqsave(&q->lock, flags);
2859 __add_wait_queue(q, &wait);
2860 spin_unlock(&q->lock);
2861 timeout = schedule_timeout(timeout);
2862 spin_lock_irq(&q->lock);
2863 __remove_wait_queue(q, &wait);
2864 spin_unlock_irqrestore(&q->lock, flags);
2865
2866 return timeout;
2867}
2868
2869void __sched interruptible_sleep_on(wait_queue_head_t *q)
2870{
2871 sleep_on_common(q, TASK_INTERRUPTIBLE, MAX_SCHEDULE_TIMEOUT);
2872}
2873EXPORT_SYMBOL(interruptible_sleep_on);
2874
2875long __sched
2876interruptible_sleep_on_timeout(wait_queue_head_t *q, long timeout)
2877{
2878 return sleep_on_common(q, TASK_INTERRUPTIBLE, timeout);
2879}
2880EXPORT_SYMBOL(interruptible_sleep_on_timeout);
2881
2882void __sched sleep_on(wait_queue_head_t *q)
2883{
2884 sleep_on_common(q, TASK_UNINTERRUPTIBLE, MAX_SCHEDULE_TIMEOUT);
2885}
2886EXPORT_SYMBOL(sleep_on);
2887
2888long __sched sleep_on_timeout(wait_queue_head_t *q, long timeout)
2889{
2890 return sleep_on_common(q, TASK_UNINTERRUPTIBLE, timeout);
2891}
2892EXPORT_SYMBOL(sleep_on_timeout);
2893
2894#ifdef CONFIG_RT_MUTEXES 2855#ifdef CONFIG_RT_MUTEXES
2895 2856
2896/* 2857/*
@@ -3169,6 +3130,7 @@ __setparam_dl(struct task_struct *p, const struct sched_attr *attr)
3169 dl_se->dl_bw = to_ratio(dl_se->dl_period, dl_se->dl_runtime); 3130 dl_se->dl_bw = to_ratio(dl_se->dl_period, dl_se->dl_runtime);
3170 dl_se->dl_throttled = 0; 3131 dl_se->dl_throttled = 0;
3171 dl_se->dl_new = 1; 3132 dl_se->dl_new = 1;
3133 dl_se->dl_yielded = 0;
3172} 3134}
3173 3135
3174static void __setscheduler_params(struct task_struct *p, 3136static void __setscheduler_params(struct task_struct *p,
@@ -3684,6 +3646,7 @@ SYSCALL_DEFINE2(sched_setparam, pid_t, pid, struct sched_param __user *, param)
3684 * sys_sched_setattr - same as above, but with extended sched_attr 3646 * sys_sched_setattr - same as above, but with extended sched_attr
3685 * @pid: the pid in question. 3647 * @pid: the pid in question.
3686 * @uattr: structure containing the extended parameters. 3648 * @uattr: structure containing the extended parameters.
3649 * @flags: for future extension.
3687 */ 3650 */
3688SYSCALL_DEFINE3(sched_setattr, pid_t, pid, struct sched_attr __user *, uattr, 3651SYSCALL_DEFINE3(sched_setattr, pid_t, pid, struct sched_attr __user *, uattr,
3689 unsigned int, flags) 3652 unsigned int, flags)
@@ -3828,6 +3791,7 @@ err_size:
3828 * @pid: the pid in question. 3791 * @pid: the pid in question.
3829 * @uattr: structure containing the extended parameters. 3792 * @uattr: structure containing the extended parameters.
3830 * @size: sizeof(attr) for fwd/bwd comp. 3793 * @size: sizeof(attr) for fwd/bwd comp.
3794 * @flags: for future extension.
3831 */ 3795 */
3832SYSCALL_DEFINE4(sched_getattr, pid_t, pid, struct sched_attr __user *, uattr, 3796SYSCALL_DEFINE4(sched_getattr, pid_t, pid, struct sched_attr __user *, uattr,
3833 unsigned int, size, unsigned int, flags) 3797 unsigned int, size, unsigned int, flags)
@@ -6062,6 +6026,8 @@ sd_numa_init(struct sched_domain_topology_level *tl, int cpu)
6062 , 6026 ,
6063 .last_balance = jiffies, 6027 .last_balance = jiffies,
6064 .balance_interval = sd_weight, 6028 .balance_interval = sd_weight,
6029 .max_newidle_lb_cost = 0,
6030 .next_decay_max_lb_cost = jiffies,
6065 }; 6031 };
6066 SD_INIT_NAME(sd, NUMA); 6032 SD_INIT_NAME(sd, NUMA);
6067 sd->private = &tl->data; 6033 sd->private = &tl->data;
@@ -6498,7 +6464,7 @@ static cpumask_var_t fallback_doms;
6498 * cpu core maps. It is supposed to return 1 if the topology changed 6464 * cpu core maps. It is supposed to return 1 if the topology changed
6499 * or 0 if it stayed the same. 6465 * or 0 if it stayed the same.
6500 */ 6466 */
6501int __attribute__((weak)) arch_update_cpu_topology(void) 6467int __weak arch_update_cpu_topology(void)
6502{ 6468{
6503 return 0; 6469 return 0;
6504} 6470}
@@ -7230,7 +7196,7 @@ void sched_move_task(struct task_struct *tsk)
7230 if (unlikely(running)) 7196 if (unlikely(running))
7231 tsk->sched_class->put_prev_task(rq, tsk); 7197 tsk->sched_class->put_prev_task(rq, tsk);
7232 7198
7233 tg = container_of(task_css_check(tsk, cpu_cgroup_subsys_id, 7199 tg = container_of(task_css_check(tsk, cpu_cgrp_id,
7234 lockdep_is_held(&tsk->sighand->siglock)), 7200 lockdep_is_held(&tsk->sighand->siglock)),
7235 struct task_group, css); 7201 struct task_group, css);
7236 tg = autogroup_task_group(tsk, tg); 7202 tg = autogroup_task_group(tsk, tg);
@@ -7657,7 +7623,7 @@ static int cpu_cgroup_can_attach(struct cgroup_subsys_state *css,
7657{ 7623{
7658 struct task_struct *task; 7624 struct task_struct *task;
7659 7625
7660 cgroup_taskset_for_each(task, css, tset) { 7626 cgroup_taskset_for_each(task, tset) {
7661#ifdef CONFIG_RT_GROUP_SCHED 7627#ifdef CONFIG_RT_GROUP_SCHED
7662 if (!sched_rt_can_attach(css_tg(css), task)) 7628 if (!sched_rt_can_attach(css_tg(css), task))
7663 return -EINVAL; 7629 return -EINVAL;
@@ -7675,7 +7641,7 @@ static void cpu_cgroup_attach(struct cgroup_subsys_state *css,
7675{ 7641{
7676 struct task_struct *task; 7642 struct task_struct *task;
7677 7643
7678 cgroup_taskset_for_each(task, css, tset) 7644 cgroup_taskset_for_each(task, tset)
7679 sched_move_task(task); 7645 sched_move_task(task);
7680} 7646}
7681 7647
@@ -8014,8 +7980,7 @@ static struct cftype cpu_files[] = {
8014 { } /* terminate */ 7980 { } /* terminate */
8015}; 7981};
8016 7982
8017struct cgroup_subsys cpu_cgroup_subsys = { 7983struct cgroup_subsys cpu_cgrp_subsys = {
8018 .name = "cpu",
8019 .css_alloc = cpu_cgroup_css_alloc, 7984 .css_alloc = cpu_cgroup_css_alloc,
8020 .css_free = cpu_cgroup_css_free, 7985 .css_free = cpu_cgroup_css_free,
8021 .css_online = cpu_cgroup_css_online, 7986 .css_online = cpu_cgroup_css_online,
@@ -8023,7 +7988,6 @@ struct cgroup_subsys cpu_cgroup_subsys = {
8023 .can_attach = cpu_cgroup_can_attach, 7988 .can_attach = cpu_cgroup_can_attach,
8024 .attach = cpu_cgroup_attach, 7989 .attach = cpu_cgroup_attach,
8025 .exit = cpu_cgroup_exit, 7990 .exit = cpu_cgroup_exit,
8026 .subsys_id = cpu_cgroup_subsys_id,
8027 .base_cftypes = cpu_files, 7991 .base_cftypes = cpu_files,
8028 .early_init = 1, 7992 .early_init = 1,
8029}; 7993};
diff --git a/kernel/sched/cpuacct.c b/kernel/sched/cpuacct.c
index 622e0818f905..c143ee380e3a 100644
--- a/kernel/sched/cpuacct.c
+++ b/kernel/sched/cpuacct.c
@@ -41,7 +41,7 @@ static inline struct cpuacct *css_ca(struct cgroup_subsys_state *css)
41/* return cpu accounting group to which this task belongs */ 41/* return cpu accounting group to which this task belongs */
42static inline struct cpuacct *task_ca(struct task_struct *tsk) 42static inline struct cpuacct *task_ca(struct task_struct *tsk)
43{ 43{
44 return css_ca(task_css(tsk, cpuacct_subsys_id)); 44 return css_ca(task_css(tsk, cpuacct_cgrp_id));
45} 45}
46 46
47static inline struct cpuacct *parent_ca(struct cpuacct *ca) 47static inline struct cpuacct *parent_ca(struct cpuacct *ca)
@@ -275,11 +275,9 @@ void cpuacct_account_field(struct task_struct *p, int index, u64 val)
275 rcu_read_unlock(); 275 rcu_read_unlock();
276} 276}
277 277
278struct cgroup_subsys cpuacct_subsys = { 278struct cgroup_subsys cpuacct_cgrp_subsys = {
279 .name = "cpuacct",
280 .css_alloc = cpuacct_css_alloc, 279 .css_alloc = cpuacct_css_alloc,
281 .css_free = cpuacct_css_free, 280 .css_free = cpuacct_css_free,
282 .subsys_id = cpuacct_subsys_id,
283 .base_cftypes = files, 281 .base_cftypes = files,
284 .early_init = 1, 282 .early_init = 1,
285}; 283};
diff --git a/kernel/sched/cpudeadline.c b/kernel/sched/cpudeadline.c
index 5b9bb42b2d47..ab001b5d5048 100644
--- a/kernel/sched/cpudeadline.c
+++ b/kernel/sched/cpudeadline.c
@@ -210,7 +210,5 @@ int cpudl_init(struct cpudl *cp)
210 */ 210 */
211void cpudl_cleanup(struct cpudl *cp) 211void cpudl_cleanup(struct cpudl *cp)
212{ 212{
213 /* 213 free_cpumask_var(cp->free_cpus);
214 * nothing to do for the moment
215 */
216} 214}
diff --git a/kernel/sched/cpupri.c b/kernel/sched/cpupri.c
index 8b836b376d91..3031bac8aa3e 100644
--- a/kernel/sched/cpupri.c
+++ b/kernel/sched/cpupri.c
@@ -70,8 +70,7 @@ int cpupri_find(struct cpupri *cp, struct task_struct *p,
70 int idx = 0; 70 int idx = 0;
71 int task_pri = convert_prio(p->prio); 71 int task_pri = convert_prio(p->prio);
72 72
73 if (task_pri >= MAX_RT_PRIO) 73 BUG_ON(task_pri >= CPUPRI_NR_PRIORITIES);
74 return 0;
75 74
76 for (idx = 0; idx < task_pri; idx++) { 75 for (idx = 0; idx < task_pri; idx++) {
77 struct cpupri_vec *vec = &cp->pri_to_cpu[idx]; 76 struct cpupri_vec *vec = &cp->pri_to_cpu[idx];
diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c
index a95097cb4591..72fdf06ef865 100644
--- a/kernel/sched/cputime.c
+++ b/kernel/sched/cputime.c
@@ -332,50 +332,50 @@ out:
332 * softirq as those do not count in task exec_runtime any more. 332 * softirq as those do not count in task exec_runtime any more.
333 */ 333 */
334static void irqtime_account_process_tick(struct task_struct *p, int user_tick, 334static void irqtime_account_process_tick(struct task_struct *p, int user_tick,
335 struct rq *rq) 335 struct rq *rq, int ticks)
336{ 336{
337 cputime_t one_jiffy_scaled = cputime_to_scaled(cputime_one_jiffy); 337 cputime_t scaled = cputime_to_scaled(cputime_one_jiffy);
338 u64 cputime = (__force u64) cputime_one_jiffy;
338 u64 *cpustat = kcpustat_this_cpu->cpustat; 339 u64 *cpustat = kcpustat_this_cpu->cpustat;
339 340
340 if (steal_account_process_tick()) 341 if (steal_account_process_tick())
341 return; 342 return;
342 343
344 cputime *= ticks;
345 scaled *= ticks;
346
343 if (irqtime_account_hi_update()) { 347 if (irqtime_account_hi_update()) {
344 cpustat[CPUTIME_IRQ] += (__force u64) cputime_one_jiffy; 348 cpustat[CPUTIME_IRQ] += cputime;
345 } else if (irqtime_account_si_update()) { 349 } else if (irqtime_account_si_update()) {
346 cpustat[CPUTIME_SOFTIRQ] += (__force u64) cputime_one_jiffy; 350 cpustat[CPUTIME_SOFTIRQ] += cputime;
347 } else if (this_cpu_ksoftirqd() == p) { 351 } else if (this_cpu_ksoftirqd() == p) {
348 /* 352 /*
349 * ksoftirqd time do not get accounted in cpu_softirq_time. 353 * ksoftirqd time do not get accounted in cpu_softirq_time.
350 * So, we have to handle it separately here. 354 * So, we have to handle it separately here.
351 * Also, p->stime needs to be updated for ksoftirqd. 355 * Also, p->stime needs to be updated for ksoftirqd.
352 */ 356 */
353 __account_system_time(p, cputime_one_jiffy, one_jiffy_scaled, 357 __account_system_time(p, cputime, scaled, CPUTIME_SOFTIRQ);
354 CPUTIME_SOFTIRQ);
355 } else if (user_tick) { 358 } else if (user_tick) {
356 account_user_time(p, cputime_one_jiffy, one_jiffy_scaled); 359 account_user_time(p, cputime, scaled);
357 } else if (p == rq->idle) { 360 } else if (p == rq->idle) {
358 account_idle_time(cputime_one_jiffy); 361 account_idle_time(cputime);
359 } else if (p->flags & PF_VCPU) { /* System time or guest time */ 362 } else if (p->flags & PF_VCPU) { /* System time or guest time */
360 account_guest_time(p, cputime_one_jiffy, one_jiffy_scaled); 363 account_guest_time(p, cputime, scaled);
361 } else { 364 } else {
362 __account_system_time(p, cputime_one_jiffy, one_jiffy_scaled, 365 __account_system_time(p, cputime, scaled, CPUTIME_SYSTEM);
363 CPUTIME_SYSTEM);
364 } 366 }
365} 367}
366 368
367static void irqtime_account_idle_ticks(int ticks) 369static void irqtime_account_idle_ticks(int ticks)
368{ 370{
369 int i;
370 struct rq *rq = this_rq(); 371 struct rq *rq = this_rq();
371 372
372 for (i = 0; i < ticks; i++) 373 irqtime_account_process_tick(current, 0, rq, ticks);
373 irqtime_account_process_tick(current, 0, rq);
374} 374}
375#else /* CONFIG_IRQ_TIME_ACCOUNTING */ 375#else /* CONFIG_IRQ_TIME_ACCOUNTING */
376static inline void irqtime_account_idle_ticks(int ticks) {} 376static inline void irqtime_account_idle_ticks(int ticks) {}
377static inline void irqtime_account_process_tick(struct task_struct *p, int user_tick, 377static inline void irqtime_account_process_tick(struct task_struct *p, int user_tick,
378 struct rq *rq) {} 378 struct rq *rq, int nr_ticks) {}
379#endif /* CONFIG_IRQ_TIME_ACCOUNTING */ 379#endif /* CONFIG_IRQ_TIME_ACCOUNTING */
380 380
381/* 381/*
@@ -464,7 +464,7 @@ void account_process_tick(struct task_struct *p, int user_tick)
464 return; 464 return;
465 465
466 if (sched_clock_irqtime) { 466 if (sched_clock_irqtime) {
467 irqtime_account_process_tick(p, user_tick, rq); 467 irqtime_account_process_tick(p, user_tick, rq, 1);
468 return; 468 return;
469 } 469 }
470 470
diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
index b08095786cb8..800e99b99075 100644
--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -528,6 +528,7 @@ static enum hrtimer_restart dl_task_timer(struct hrtimer *timer)
528 sched_clock_tick(); 528 sched_clock_tick();
529 update_rq_clock(rq); 529 update_rq_clock(rq);
530 dl_se->dl_throttled = 0; 530 dl_se->dl_throttled = 0;
531 dl_se->dl_yielded = 0;
531 if (p->on_rq) { 532 if (p->on_rq) {
532 enqueue_task_dl(rq, p, ENQUEUE_REPLENISH); 533 enqueue_task_dl(rq, p, ENQUEUE_REPLENISH);
533 if (task_has_dl_policy(rq->curr)) 534 if (task_has_dl_policy(rq->curr))
@@ -893,10 +894,10 @@ static void yield_task_dl(struct rq *rq)
893 * We make the task go to sleep until its current deadline by 894 * We make the task go to sleep until its current deadline by
894 * forcing its runtime to zero. This way, update_curr_dl() stops 895 * forcing its runtime to zero. This way, update_curr_dl() stops
895 * it and the bandwidth timer will wake it up and will give it 896 * it and the bandwidth timer will wake it up and will give it
896 * new scheduling parameters (thanks to dl_new=1). 897 * new scheduling parameters (thanks to dl_yielded=1).
897 */ 898 */
898 if (p->dl.runtime > 0) { 899 if (p->dl.runtime > 0) {
899 rq->curr->dl.dl_new = 1; 900 rq->curr->dl.dl_yielded = 1;
900 p->dl.runtime = 0; 901 p->dl.runtime = 0;
901 } 902 }
902 update_curr_dl(rq); 903 update_curr_dl(rq);
diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c
index f3344c31632a..695f9773bb60 100644
--- a/kernel/sched/debug.c
+++ b/kernel/sched/debug.c
@@ -111,8 +111,7 @@ static char *task_group_path(struct task_group *tg)
111 if (autogroup_path(tg, group_path, PATH_MAX)) 111 if (autogroup_path(tg, group_path, PATH_MAX))
112 return group_path; 112 return group_path;
113 113
114 cgroup_path(tg->css.cgroup, group_path, PATH_MAX); 114 return cgroup_path(tg->css.cgroup, group_path, PATH_MAX);
115 return group_path;
116} 115}
117#endif 116#endif
118 117
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 43232b8bacde..5d859ec975c2 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -6658,6 +6658,7 @@ static int idle_balance(struct rq *this_rq)
6658 int this_cpu = this_rq->cpu; 6658 int this_cpu = this_rq->cpu;
6659 6659
6660 idle_enter_fair(this_rq); 6660 idle_enter_fair(this_rq);
6661
6661 /* 6662 /*
6662 * We must set idle_stamp _before_ calling idle_balance(), such that we 6663 * We must set idle_stamp _before_ calling idle_balance(), such that we
6663 * measure the duration of idle_balance() as idle time. 6664 * measure the duration of idle_balance() as idle time.
@@ -6710,14 +6711,16 @@ static int idle_balance(struct rq *this_rq)
6710 6711
6711 raw_spin_lock(&this_rq->lock); 6712 raw_spin_lock(&this_rq->lock);
6712 6713
6714 if (curr_cost > this_rq->max_idle_balance_cost)
6715 this_rq->max_idle_balance_cost = curr_cost;
6716
6713 /* 6717 /*
6714 * While browsing the domains, we released the rq lock. 6718 * While browsing the domains, we released the rq lock, a task could
6715 * A task could have be enqueued in the meantime 6719 * have been enqueued in the meantime. Since we're not going idle,
6720 * pretend we pulled a task.
6716 */ 6721 */
6717 if (this_rq->cfs.h_nr_running && !pulled_task) { 6722 if (this_rq->cfs.h_nr_running && !pulled_task)
6718 pulled_task = 1; 6723 pulled_task = 1;
6719 goto out;
6720 }
6721 6724
6722 if (pulled_task || time_after(jiffies, this_rq->next_balance)) { 6725 if (pulled_task || time_after(jiffies, this_rq->next_balance)) {
6723 /* 6726 /*
@@ -6727,9 +6730,6 @@ static int idle_balance(struct rq *this_rq)
6727 this_rq->next_balance = next_balance; 6730 this_rq->next_balance = next_balance;
6728 } 6731 }
6729 6732
6730 if (curr_cost > this_rq->max_idle_balance_cost)
6731 this_rq->max_idle_balance_cost = curr_cost;
6732
6733out: 6733out:
6734 /* Is there a task of a high priority class? */ 6734 /* Is there a task of a high priority class? */
6735 if (this_rq->nr_running != this_rq->cfs.h_nr_running) 6735 if (this_rq->nr_running != this_rq->cfs.h_nr_running)
diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c
index b7976a127178..8f4390a079c7 100644
--- a/kernel/sched/idle.c
+++ b/kernel/sched/idle.c
@@ -63,6 +63,136 @@ void __weak arch_cpu_idle(void)
63 local_irq_enable(); 63 local_irq_enable();
64} 64}
65 65
66/**
67 * cpuidle_idle_call - the main idle function
68 *
69 * NOTE: no locks or semaphores should be used here
70 * Always returns 0; if cpuidle cannot be used, falls back to arch_cpu_idle()
71 */
72static int cpuidle_idle_call(void)
73{
74 struct cpuidle_device *dev = __this_cpu_read(cpuidle_devices);
75 struct cpuidle_driver *drv = cpuidle_get_cpu_driver(dev);
76 int next_state, entered_state, ret;
77 bool broadcast;
78
79 /*
80 * Check if the idle task must be rescheduled. If so, re-enable
81 * local irqs, set the polling flag again and exit the function
82 * without entering an idle state
83 */
84 if (current_clr_polling_and_test()) {
85 local_irq_enable();
86 __current_set_polling();
87 return 0;
88 }
89
90 /*
91 * During the idle period, stop measuring the disabled irqs
92 * critical sections latencies
93 */
94 stop_critical_timings();
95
96 /*
97 * Tell the RCU framework we are entering an idle section,
98 * so no more rcu read side critical sections and one more
99 * step to the grace period
100 */
101 rcu_idle_enter();
102
103 /*
104 * Check if the cpuidle framework is ready, otherwise fallback
105 * to the default arch specific idle method
106 */
107 ret = cpuidle_enabled(drv, dev);
108
109 if (!ret) {
110 /*
111 * Ask the governor to choose an idle state it thinks
112 * it is convenient to go to. There is *always* a
113 * convenient idle state
114 */
115 next_state = cpuidle_select(drv, dev);
116
117 /*
118 * A reschedule is pending, so it is pointless to
119 * go idle: record zero idle residency and skip
120 * entering the idle state
121 */
122 if (current_clr_polling_and_test()) {
123 dev->last_residency = 0;
124 entered_state = next_state;
125 local_irq_enable();
126 } else {
127 broadcast = !!(drv->states[next_state].flags &
128 CPUIDLE_FLAG_TIMER_STOP);
129
130 if (broadcast)
131 /*
132 * Tell the time framework to switch
133 * to a broadcast timer because our
134 * local timer will be shutdown. If a
135 * local timer is used from another
136 * cpu as a broadcast timer, this call
137 * may fail if it is not available
138 */
139 ret = clockevents_notify(
140 CLOCK_EVT_NOTIFY_BROADCAST_ENTER,
141 &dev->cpu);
142
143 if (!ret) {
144 trace_cpu_idle_rcuidle(next_state, dev->cpu);
145
146 /*
147 * Enter the idle state previously
148 * returned by the governor
149 * decision. This function will block
150 * until an interrupt occurs and will
151 * take care of re-enabling the local
152 * interrupts
153 */
154 entered_state = cpuidle_enter(drv, dev,
155 next_state);
156
157 trace_cpu_idle_rcuidle(PWR_EVENT_EXIT,
158 dev->cpu);
159
160 if (broadcast)
161 clockevents_notify(
162 CLOCK_EVT_NOTIFY_BROADCAST_EXIT,
163 &dev->cpu);
164
165 /*
166 * Give the governor an opportunity to reflect on the
167 * outcome
168 */
169 cpuidle_reflect(dev, entered_state);
170 }
171 }
172 }
173
174 /*
175 * We can't use the cpuidle framework, let's use the default
176 * idle routine
177 */
178 if (ret)
179 arch_cpu_idle();
180
181 __current_set_polling();
182
183 /*
184 * It is up to the idle functions to re-enable local
185 * interrupts
186 */
187 if (WARN_ON_ONCE(irqs_disabled()))
188 local_irq_enable();
189
190 rcu_idle_exit();
191 start_critical_timings();
192
193 return 0;
194}
195
66/* 196/*
67 * Generic idle loop implementation 197 * Generic idle loop implementation
68 */ 198 */
@@ -90,23 +220,11 @@ static void cpu_idle_loop(void)
90 * know that the IPI is going to arrive right 220 * know that the IPI is going to arrive right
91 * away 221 * away
92 */ 222 */
93 if (cpu_idle_force_poll || tick_check_broadcast_expired()) { 223 if (cpu_idle_force_poll || tick_check_broadcast_expired())
94 cpu_idle_poll(); 224 cpu_idle_poll();
95 } else { 225 else
96 if (!current_clr_polling_and_test()) { 226 cpuidle_idle_call();
97 stop_critical_timings(); 227
98 rcu_idle_enter();
99 if (cpuidle_idle_call())
100 arch_cpu_idle();
101 if (WARN_ON_ONCE(irqs_disabled()))
102 local_irq_enable();
103 rcu_idle_exit();
104 start_critical_timings();
105 } else {
106 local_irq_enable();
107 }
108 __current_set_polling();
109 }
110 arch_cpu_idle_exit(); 228 arch_cpu_idle_exit();
111 } 229 }
112 230
diff --git a/kernel/sched/stats.c b/kernel/sched/stats.c
index da98af347e8b..a476bea17fbc 100644
--- a/kernel/sched/stats.c
+++ b/kernel/sched/stats.c
@@ -142,4 +142,4 @@ static int __init proc_schedstat_init(void)
142 proc_create("schedstat", 0, NULL, &proc_schedstat_operations); 142 proc_create("schedstat", 0, NULL, &proc_schedstat_operations);
143 return 0; 143 return 0;
144} 144}
145module_init(proc_schedstat_init); 145subsys_initcall(proc_schedstat_init);
diff --git a/kernel/seccomp.c b/kernel/seccomp.c
index b7a10048a32c..b35c21503a36 100644
--- a/kernel/seccomp.c
+++ b/kernel/seccomp.c
@@ -55,60 +55,32 @@ struct seccomp_filter {
55 atomic_t usage; 55 atomic_t usage;
56 struct seccomp_filter *prev; 56 struct seccomp_filter *prev;
57 unsigned short len; /* Instruction count */ 57 unsigned short len; /* Instruction count */
58 struct sock_filter insns[]; 58 struct sock_filter_int insnsi[];
59}; 59};
60 60
61/* Limit any path through the tree to 256KB worth of instructions. */ 61/* Limit any path through the tree to 256KB worth of instructions. */
62#define MAX_INSNS_PER_PATH ((1 << 18) / sizeof(struct sock_filter)) 62#define MAX_INSNS_PER_PATH ((1 << 18) / sizeof(struct sock_filter))
63 63
64/** 64/*
65 * get_u32 - returns a u32 offset into data
66 * @data: a unsigned 64 bit value
67 * @index: 0 or 1 to return the first or second 32-bits
68 *
69 * This inline exists to hide the length of unsigned long. If a 32-bit
70 * unsigned long is passed in, it will be extended and the top 32-bits will be
71 * 0. If it is a 64-bit unsigned long, then whatever data is resident will be
72 * properly returned.
73 *
74 * Endianness is explicitly ignored and left for BPF program authors to manage 65 * Endianness is explicitly ignored and left for BPF program authors to manage
75 * as per the specific architecture. 66 * as per the specific architecture.
76 */ 67 */
77static inline u32 get_u32(u64 data, int index) 68static void populate_seccomp_data(struct seccomp_data *sd)
78{
79 return ((u32 *)&data)[index];
80}
81
82/* Helper for bpf_load below. */
83#define BPF_DATA(_name) offsetof(struct seccomp_data, _name)
84/**
85 * bpf_load: checks and returns a pointer to the requested offset
86 * @off: offset into struct seccomp_data to load from
87 *
88 * Returns the requested 32-bits of data.
89 * seccomp_check_filter() should assure that @off is 32-bit aligned
90 * and not out of bounds. Failure to do so is a BUG.
91 */
92u32 seccomp_bpf_load(int off)
93{ 69{
94 struct pt_regs *regs = task_pt_regs(current); 70 struct task_struct *task = current;
95 if (off == BPF_DATA(nr)) 71 struct pt_regs *regs = task_pt_regs(task);
96 return syscall_get_nr(current, regs); 72 unsigned long args[6];
97 if (off == BPF_DATA(arch)) 73
98 return syscall_get_arch(current, regs); 74 sd->nr = syscall_get_nr(task, regs);
99 if (off >= BPF_DATA(args[0]) && off < BPF_DATA(args[6])) { 75 sd->arch = syscall_get_arch();
100 unsigned long value; 76 syscall_get_arguments(task, regs, 0, 6, args);
101 int arg = (off - BPF_DATA(args[0])) / sizeof(u64); 77 sd->args[0] = args[0];
102 int index = !!(off % sizeof(u64)); 78 sd->args[1] = args[1];
103 syscall_get_arguments(current, regs, arg, 1, &value); 79 sd->args[2] = args[2];
104 return get_u32(value, index); 80 sd->args[3] = args[3];
105 } 81 sd->args[4] = args[4];
106 if (off == BPF_DATA(instruction_pointer)) 82 sd->args[5] = args[5];
107 return get_u32(KSTK_EIP(current), 0); 83 sd->instruction_pointer = KSTK_EIP(task);
108 if (off == BPF_DATA(instruction_pointer) + sizeof(u32))
109 return get_u32(KSTK_EIP(current), 1);
110 /* seccomp_check_filter should make this impossible. */
111 BUG();
112} 84}
113 85
114/** 86/**
@@ -133,17 +105,17 @@ static int seccomp_check_filter(struct sock_filter *filter, unsigned int flen)
133 105
134 switch (code) { 106 switch (code) {
135 case BPF_S_LD_W_ABS: 107 case BPF_S_LD_W_ABS:
136 ftest->code = BPF_S_ANC_SECCOMP_LD_W; 108 ftest->code = BPF_LDX | BPF_W | BPF_ABS;
137 /* 32-bit aligned and not out of bounds. */ 109 /* 32-bit aligned and not out of bounds. */
138 if (k >= sizeof(struct seccomp_data) || k & 3) 110 if (k >= sizeof(struct seccomp_data) || k & 3)
139 return -EINVAL; 111 return -EINVAL;
140 continue; 112 continue;
141 case BPF_S_LD_W_LEN: 113 case BPF_S_LD_W_LEN:
142 ftest->code = BPF_S_LD_IMM; 114 ftest->code = BPF_LD | BPF_IMM;
143 ftest->k = sizeof(struct seccomp_data); 115 ftest->k = sizeof(struct seccomp_data);
144 continue; 116 continue;
145 case BPF_S_LDX_W_LEN: 117 case BPF_S_LDX_W_LEN:
146 ftest->code = BPF_S_LDX_IMM; 118 ftest->code = BPF_LDX | BPF_IMM;
147 ftest->k = sizeof(struct seccomp_data); 119 ftest->k = sizeof(struct seccomp_data);
148 continue; 120 continue;
149 /* Explicitly include allowed calls. */ 121 /* Explicitly include allowed calls. */
@@ -185,6 +157,7 @@ static int seccomp_check_filter(struct sock_filter *filter, unsigned int flen)
185 case BPF_S_JMP_JGT_X: 157 case BPF_S_JMP_JGT_X:
186 case BPF_S_JMP_JSET_K: 158 case BPF_S_JMP_JSET_K:
187 case BPF_S_JMP_JSET_X: 159 case BPF_S_JMP_JSET_X:
160 sk_decode_filter(ftest, ftest);
188 continue; 161 continue;
189 default: 162 default:
190 return -EINVAL; 163 return -EINVAL;
@@ -202,18 +175,21 @@ static int seccomp_check_filter(struct sock_filter *filter, unsigned int flen)
202static u32 seccomp_run_filters(int syscall) 175static u32 seccomp_run_filters(int syscall)
203{ 176{
204 struct seccomp_filter *f; 177 struct seccomp_filter *f;
178 struct seccomp_data sd;
205 u32 ret = SECCOMP_RET_ALLOW; 179 u32 ret = SECCOMP_RET_ALLOW;
206 180
207 /* Ensure unexpected behavior doesn't result in failing open. */ 181 /* Ensure unexpected behavior doesn't result in failing open. */
208 if (WARN_ON(current->seccomp.filter == NULL)) 182 if (WARN_ON(current->seccomp.filter == NULL))
209 return SECCOMP_RET_KILL; 183 return SECCOMP_RET_KILL;
210 184
185 populate_seccomp_data(&sd);
186
211 /* 187 /*
212 * All filters in the list are evaluated and the lowest BPF return 188 * All filters in the list are evaluated and the lowest BPF return
213 * value always takes priority (ignoring the DATA). 189 * value always takes priority (ignoring the DATA).
214 */ 190 */
215 for (f = current->seccomp.filter; f; f = f->prev) { 191 for (f = current->seccomp.filter; f; f = f->prev) {
216 u32 cur_ret = sk_run_filter(NULL, f->insns); 192 u32 cur_ret = sk_run_filter_int_seccomp(&sd, f->insnsi);
217 if ((cur_ret & SECCOMP_RET_ACTION) < (ret & SECCOMP_RET_ACTION)) 193 if ((cur_ret & SECCOMP_RET_ACTION) < (ret & SECCOMP_RET_ACTION))
218 ret = cur_ret; 194 ret = cur_ret;
219 } 195 }
@@ -231,6 +207,8 @@ static long seccomp_attach_filter(struct sock_fprog *fprog)
231 struct seccomp_filter *filter; 207 struct seccomp_filter *filter;
232 unsigned long fp_size = fprog->len * sizeof(struct sock_filter); 208 unsigned long fp_size = fprog->len * sizeof(struct sock_filter);
233 unsigned long total_insns = fprog->len; 209 unsigned long total_insns = fprog->len;
210 struct sock_filter *fp;
211 int new_len;
234 long ret; 212 long ret;
235 213
236 if (fprog->len == 0 || fprog->len > BPF_MAXINSNS) 214 if (fprog->len == 0 || fprog->len > BPF_MAXINSNS)
@@ -252,28 +230,45 @@ static long seccomp_attach_filter(struct sock_fprog *fprog)
252 CAP_SYS_ADMIN) != 0) 230 CAP_SYS_ADMIN) != 0)
253 return -EACCES; 231 return -EACCES;
254 232
255 /* Allocate a new seccomp_filter */ 233 fp = kzalloc(fp_size, GFP_KERNEL|__GFP_NOWARN);
256 filter = kzalloc(sizeof(struct seccomp_filter) + fp_size, 234 if (!fp)
257 GFP_KERNEL|__GFP_NOWARN);
258 if (!filter)
259 return -ENOMEM; 235 return -ENOMEM;
260 atomic_set(&filter->usage, 1);
261 filter->len = fprog->len;
262 236
263 /* Copy the instructions from fprog. */ 237 /* Copy the instructions from fprog. */
264 ret = -EFAULT; 238 ret = -EFAULT;
265 if (copy_from_user(filter->insns, fprog->filter, fp_size)) 239 if (copy_from_user(fp, fprog->filter, fp_size))
266 goto fail; 240 goto free_prog;
267 241
268 /* Check and rewrite the fprog via the skb checker */ 242 /* Check and rewrite the fprog via the skb checker */
269 ret = sk_chk_filter(filter->insns, filter->len); 243 ret = sk_chk_filter(fp, fprog->len);
270 if (ret) 244 if (ret)
271 goto fail; 245 goto free_prog;
272 246
273 /* Check and rewrite the fprog for seccomp use */ 247 /* Check and rewrite the fprog for seccomp use */
274 ret = seccomp_check_filter(filter->insns, filter->len); 248 ret = seccomp_check_filter(fp, fprog->len);
249 if (ret)
250 goto free_prog;
251
252 /* Convert 'sock_filter' insns to 'sock_filter_int' insns */
253 ret = sk_convert_filter(fp, fprog->len, NULL, &new_len);
275 if (ret) 254 if (ret)
276 goto fail; 255 goto free_prog;
256
257 /* Allocate a new seccomp_filter */
258 ret = -ENOMEM;
259 filter = kzalloc(sizeof(struct seccomp_filter) +
260 sizeof(struct sock_filter_int) * new_len,
261 GFP_KERNEL|__GFP_NOWARN);
262 if (!filter)
263 goto free_prog;
264
265 ret = sk_convert_filter(fp, fprog->len, filter->insnsi, &new_len);
266 if (ret)
267 goto free_filter;
268 kfree(fp);
269
270 atomic_set(&filter->usage, 1);
271 filter->len = new_len;
277 272
278 /* 273 /*
279 * If there is an existing filter, make it the prev and don't drop its 274 * If there is an existing filter, make it the prev and don't drop its
@@ -282,8 +277,11 @@ static long seccomp_attach_filter(struct sock_fprog *fprog)
282 filter->prev = current->seccomp.filter; 277 filter->prev = current->seccomp.filter;
283 current->seccomp.filter = filter; 278 current->seccomp.filter = filter;
284 return 0; 279 return 0;
285fail: 280
281free_filter:
286 kfree(filter); 282 kfree(filter);
283free_prog:
284 kfree(fp);
287 return ret; 285 return ret;
288} 286}
289 287
@@ -293,7 +291,7 @@ fail:
293 * 291 *
294 * Returns 0 on success and non-zero otherwise. 292 * Returns 0 on success and non-zero otherwise.
295 */ 293 */
296long seccomp_attach_user_filter(char __user *user_filter) 294static long seccomp_attach_user_filter(char __user *user_filter)
297{ 295{
298 struct sock_fprog fprog; 296 struct sock_fprog fprog;
299 long ret = -EFAULT; 297 long ret = -EFAULT;
@@ -351,7 +349,7 @@ static void seccomp_send_sigsys(int syscall, int reason)
351 info.si_code = SYS_SECCOMP; 349 info.si_code = SYS_SECCOMP;
352 info.si_call_addr = (void __user *)KSTK_EIP(current); 350 info.si_call_addr = (void __user *)KSTK_EIP(current);
353 info.si_errno = reason; 351 info.si_errno = reason;
354 info.si_arch = syscall_get_arch(current, task_pt_regs(current)); 352 info.si_arch = syscall_get_arch();
355 info.si_syscall = syscall; 353 info.si_syscall = syscall;
356 force_sig_info(SIGSYS, &info, current); 354 force_sig_info(SIGSYS, &info, current);
357} 355}
diff --git a/kernel/signal.c b/kernel/signal.c
index 52f881db1ca0..6ea13c09ae56 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -33,6 +33,8 @@
33#include <linux/uprobes.h> 33#include <linux/uprobes.h>
34#include <linux/compat.h> 34#include <linux/compat.h>
35#include <linux/cn_proc.h> 35#include <linux/cn_proc.h>
36#include <linux/compiler.h>
37
36#define CREATE_TRACE_POINTS 38#define CREATE_TRACE_POINTS
37#include <trace/events/signal.h> 39#include <trace/events/signal.h>
38 40
@@ -2382,7 +2384,7 @@ relock:
2382 * @regs: user register state 2384 * @regs: user register state
2383 * @stepping: nonzero if debugger single-step or block-step in use 2385 * @stepping: nonzero if debugger single-step or block-step in use
2384 * 2386 *
2385 * This function should be called when a signal has succesfully been 2387 * This function should be called when a signal has successfully been
2386 * delivered. It updates the blocked signals accordingly (@ka->sa.sa_mask 2388 * delivered. It updates the blocked signals accordingly (@ka->sa.sa_mask
2387 * is always blocked, and the signal itself is blocked unless %SA_NODEFER 2389 * is always blocked, and the signal itself is blocked unless %SA_NODEFER
2388 * is set in @ka->sa.sa_flags. Tracing is notified. 2390 * is set in @ka->sa.sa_flags. Tracing is notified.
@@ -3618,7 +3620,7 @@ SYSCALL_DEFINE3(sigsuspend, int, unused1, int, unused2, old_sigset_t, mask)
3618} 3620}
3619#endif 3621#endif
3620 3622
3621__attribute__((weak)) const char *arch_vma_name(struct vm_area_struct *vma) 3623__weak const char *arch_vma_name(struct vm_area_struct *vma)
3622{ 3624{
3623 return NULL; 3625 return NULL;
3624} 3626}
diff --git a/kernel/sys.c b/kernel/sys.c
index adaeab6f7a87..fba0f29401ea 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -1996,6 +1996,21 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3,
1996 if (arg2 || arg3 || arg4 || arg5) 1996 if (arg2 || arg3 || arg4 || arg5)
1997 return -EINVAL; 1997 return -EINVAL;
1998 return current->no_new_privs ? 1 : 0; 1998 return current->no_new_privs ? 1 : 0;
1999 case PR_GET_THP_DISABLE:
2000 if (arg2 || arg3 || arg4 || arg5)
2001 return -EINVAL;
2002 error = !!(me->mm->def_flags & VM_NOHUGEPAGE);
2003 break;
2004 case PR_SET_THP_DISABLE:
2005 if (arg3 || arg4 || arg5)
2006 return -EINVAL;
2007 down_write(&me->mm->mmap_sem);
2008 if (arg2)
2009 me->mm->def_flags |= VM_NOHUGEPAGE;
2010 else
2011 me->mm->def_flags &= ~VM_NOHUGEPAGE;
2012 up_write(&me->mm->mmap_sem);
2013 break;
1999 default: 2014 default:
2000 error = -EINVAL; 2015 error = -EINVAL;
2001 break; 2016 break;
diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c
index 7078052284fd..bc8d1b74a6b9 100644
--- a/kernel/sys_ni.c
+++ b/kernel/sys_ni.c
@@ -146,11 +146,13 @@ cond_syscall(sys_io_destroy);
146cond_syscall(sys_io_submit); 146cond_syscall(sys_io_submit);
147cond_syscall(sys_io_cancel); 147cond_syscall(sys_io_cancel);
148cond_syscall(sys_io_getevents); 148cond_syscall(sys_io_getevents);
149cond_syscall(sys_sysfs);
149cond_syscall(sys_syslog); 150cond_syscall(sys_syslog);
150cond_syscall(sys_process_vm_readv); 151cond_syscall(sys_process_vm_readv);
151cond_syscall(sys_process_vm_writev); 152cond_syscall(sys_process_vm_writev);
152cond_syscall(compat_sys_process_vm_readv); 153cond_syscall(compat_sys_process_vm_readv);
153cond_syscall(compat_sys_process_vm_writev); 154cond_syscall(compat_sys_process_vm_writev);
155cond_syscall(sys_uselib);
154 156
155/* arch-specific weak syscall entries */ 157/* arch-specific weak syscall entries */
156cond_syscall(sys_pciconfig_read); 158cond_syscall(sys_pciconfig_read);
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 09d2e2413605..74f5b580fe34 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -123,7 +123,7 @@ static int __maybe_unused neg_one = -1;
123static int zero; 123static int zero;
124static int __maybe_unused one = 1; 124static int __maybe_unused one = 1;
125static int __maybe_unused two = 2; 125static int __maybe_unused two = 2;
126static int __maybe_unused three = 3; 126static int __maybe_unused four = 4;
127static unsigned long one_ul = 1; 127static unsigned long one_ul = 1;
128static int one_hundred = 100; 128static int one_hundred = 100;
129#ifdef CONFIG_PRINTK 129#ifdef CONFIG_PRINTK
@@ -141,6 +141,11 @@ static int min_percpu_pagelist_fract = 8;
141static int ngroups_max = NGROUPS_MAX; 141static int ngroups_max = NGROUPS_MAX;
142static const int cap_last_cap = CAP_LAST_CAP; 142static const int cap_last_cap = CAP_LAST_CAP;
143 143
144/* This is needed for proc_doulongvec_minmax of sysctl_hung_task_timeout_secs */
145#ifdef CONFIG_DETECT_HUNG_TASK
146static unsigned long hung_task_timeout_max = (LONG_MAX/HZ);
147#endif
148
144#ifdef CONFIG_INOTIFY_USER 149#ifdef CONFIG_INOTIFY_USER
145#include <linux/inotify.h> 150#include <linux/inotify.h>
146#endif 151#endif
@@ -985,6 +990,7 @@ static struct ctl_table kern_table[] = {
985 .maxlen = sizeof(unsigned long), 990 .maxlen = sizeof(unsigned long),
986 .mode = 0644, 991 .mode = 0644,
987 .proc_handler = proc_dohung_task_timeout_secs, 992 .proc_handler = proc_dohung_task_timeout_secs,
993 .extra2 = &hung_task_timeout_max,
988 }, 994 },
989 { 995 {
990 .procname = "hung_task_warnings", 996 .procname = "hung_task_warnings",
@@ -1264,7 +1270,7 @@ static struct ctl_table vm_table[] = {
1264 .mode = 0644, 1270 .mode = 0644,
1265 .proc_handler = drop_caches_sysctl_handler, 1271 .proc_handler = drop_caches_sysctl_handler,
1266 .extra1 = &one, 1272 .extra1 = &one,
1267 .extra2 = &three, 1273 .extra2 = &four,
1268 }, 1274 },
1269#ifdef CONFIG_COMPACTION 1275#ifdef CONFIG_COMPACTION
1270 { 1276 {
diff --git a/kernel/time/tick-common.c b/kernel/time/tick-common.c
index 015661279b68..0a0608edeb26 100644
--- a/kernel/time/tick-common.c
+++ b/kernel/time/tick-common.c
@@ -276,7 +276,7 @@ static bool tick_check_preferred(struct clock_event_device *curdev,
276bool tick_check_replacement(struct clock_event_device *curdev, 276bool tick_check_replacement(struct clock_event_device *curdev,
277 struct clock_event_device *newdev) 277 struct clock_event_device *newdev)
278{ 278{
279 if (tick_check_percpu(curdev, newdev, smp_processor_id())) 279 if (!tick_check_percpu(curdev, newdev, smp_processor_id()))
280 return false; 280 return false;
281 281
282 return tick_check_preferred(curdev, newdev); 282 return tick_check_preferred(curdev, newdev);
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index 9f8af69c67ec..6558b7ac112d 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -84,6 +84,9 @@ static void tick_do_update_jiffies64(ktime_t now)
84 84
85 /* Keep the tick_next_period variable up to date */ 85 /* Keep the tick_next_period variable up to date */
86 tick_next_period = ktime_add(last_jiffies_update, tick_period); 86 tick_next_period = ktime_add(last_jiffies_update, tick_period);
87 } else {
88 write_sequnlock(&jiffies_lock);
89 return;
87 } 90 }
88 write_sequnlock(&jiffies_lock); 91 write_sequnlock(&jiffies_lock);
89 update_wall_time(); 92 update_wall_time();
@@ -967,7 +970,7 @@ static void tick_nohz_switch_to_nohz(void)
967 struct tick_sched *ts = &__get_cpu_var(tick_cpu_sched); 970 struct tick_sched *ts = &__get_cpu_var(tick_cpu_sched);
968 ktime_t next; 971 ktime_t next;
969 972
970 if (!tick_nohz_active) 973 if (!tick_nohz_enabled)
971 return; 974 return;
972 975
973 local_irq_disable(); 976 local_irq_disable();
diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c
index 5b40279ecd71..f7df8ea21707 100644
--- a/kernel/time/timekeeping.c
+++ b/kernel/time/timekeeping.c
@@ -22,6 +22,7 @@
22#include <linux/tick.h> 22#include <linux/tick.h>
23#include <linux/stop_machine.h> 23#include <linux/stop_machine.h>
24#include <linux/pvclock_gtod.h> 24#include <linux/pvclock_gtod.h>
25#include <linux/compiler.h>
25 26
26#include "tick-internal.h" 27#include "tick-internal.h"
27#include "ntp_internal.h" 28#include "ntp_internal.h"
@@ -760,7 +761,7 @@ u64 timekeeping_max_deferment(void)
760 * 761 *
761 * XXX - Do be sure to remove it once all arches implement it. 762 * XXX - Do be sure to remove it once all arches implement it.
762 */ 763 */
763void __attribute__((weak)) read_persistent_clock(struct timespec *ts) 764void __weak read_persistent_clock(struct timespec *ts)
764{ 765{
765 ts->tv_sec = 0; 766 ts->tv_sec = 0;
766 ts->tv_nsec = 0; 767 ts->tv_nsec = 0;
@@ -775,7 +776,7 @@ void __attribute__((weak)) read_persistent_clock(struct timespec *ts)
775 * 776 *
776 * XXX - Do be sure to remove it once all arches implement it. 777 * XXX - Do be sure to remove it once all arches implement it.
777 */ 778 */
778void __attribute__((weak)) read_boot_clock(struct timespec *ts) 779void __weak read_boot_clock(struct timespec *ts)
779{ 780{
780 ts->tv_sec = 0; 781 ts->tv_sec = 0;
781 ts->tv_nsec = 0; 782 ts->tv_nsec = 0;
diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig
index 015f85aaca08..8639819f6cef 100644
--- a/kernel/trace/Kconfig
+++ b/kernel/trace/Kconfig
@@ -424,6 +424,7 @@ config UPROBE_EVENT
424 bool "Enable uprobes-based dynamic events" 424 bool "Enable uprobes-based dynamic events"
425 depends on ARCH_SUPPORTS_UPROBES 425 depends on ARCH_SUPPORTS_UPROBES
426 depends on MMU 426 depends on MMU
427 depends on PERF_EVENTS
427 select UPROBES 428 select UPROBES
428 select PROBE_EVENTS 429 select PROBE_EVENTS
429 select TRACING 430 select TRACING
diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c
index 4f3a3c03eadb..c1bd4ada2a04 100644
--- a/kernel/trace/blktrace.c
+++ b/kernel/trace/blktrace.c
@@ -1429,7 +1429,8 @@ static enum print_line_t blk_tracer_print_line(struct trace_iterator *iter)
1429 return print_one_line(iter, true); 1429 return print_one_line(iter, true);
1430} 1430}
1431 1431
1432static int blk_tracer_set_flag(u32 old_flags, u32 bit, int set) 1432static int
1433blk_tracer_set_flag(struct trace_array *tr, u32 old_flags, u32 bit, int set)
1433{ 1434{
1434 /* don't output context-info for blk_classic output */ 1435 /* don't output context-info for blk_classic output */
1435 if (bit == TRACE_BLK_OPT_CLASSIC) { 1436 if (bit == TRACE_BLK_OPT_CLASSIC) {
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index cd7f76d1eb86..1fd4b9479210 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -237,14 +237,13 @@ static int control_ops_alloc(struct ftrace_ops *ops)
237 return 0; 237 return 0;
238} 238}
239 239
240static void control_ops_free(struct ftrace_ops *ops)
241{
242 free_percpu(ops->disabled);
243}
244
245static void update_global_ops(void) 240static void update_global_ops(void)
246{ 241{
247 ftrace_func_t func; 242 ftrace_func_t func = ftrace_global_list_func;
243 void *private = NULL;
244
245 /* The list has its own recursion protection. */
246 global_ops.flags |= FTRACE_OPS_FL_RECURSION_SAFE;
248 247
249 /* 248 /*
250 * If there's only one function registered, then call that 249 * If there's only one function registered, then call that
@@ -254,23 +253,17 @@ static void update_global_ops(void)
254 if (ftrace_global_list == &ftrace_list_end || 253 if (ftrace_global_list == &ftrace_list_end ||
255 ftrace_global_list->next == &ftrace_list_end) { 254 ftrace_global_list->next == &ftrace_list_end) {
256 func = ftrace_global_list->func; 255 func = ftrace_global_list->func;
256 private = ftrace_global_list->private;
257 /* 257 /*
258 * As we are calling the function directly. 258 * As we are calling the function directly.
259 * If it does not have recursion protection, 259 * If it does not have recursion protection,
260 * the function_trace_op needs to be updated 260 * the function_trace_op needs to be updated
261 * accordingly. 261 * accordingly.
262 */ 262 */
263 if (ftrace_global_list->flags & FTRACE_OPS_FL_RECURSION_SAFE) 263 if (!(ftrace_global_list->flags & FTRACE_OPS_FL_RECURSION_SAFE))
264 global_ops.flags |= FTRACE_OPS_FL_RECURSION_SAFE;
265 else
266 global_ops.flags &= ~FTRACE_OPS_FL_RECURSION_SAFE; 264 global_ops.flags &= ~FTRACE_OPS_FL_RECURSION_SAFE;
267 } else {
268 func = ftrace_global_list_func;
269 /* The list has its own recursion protection. */
270 global_ops.flags |= FTRACE_OPS_FL_RECURSION_SAFE;
271 } 265 }
272 266
273
274 /* If we filter on pids, update to use the pid function */ 267 /* If we filter on pids, update to use the pid function */
275 if (!list_empty(&ftrace_pids)) { 268 if (!list_empty(&ftrace_pids)) {
276 set_ftrace_pid_function(func); 269 set_ftrace_pid_function(func);
@@ -278,6 +271,7 @@ static void update_global_ops(void)
278 } 271 }
279 272
280 global_ops.func = func; 273 global_ops.func = func;
274 global_ops.private = private;
281} 275}
282 276
283static void ftrace_sync(struct work_struct *work) 277static void ftrace_sync(struct work_struct *work)
@@ -437,6 +431,9 @@ static int remove_ftrace_list_ops(struct ftrace_ops **list,
437 431
438static int __register_ftrace_function(struct ftrace_ops *ops) 432static int __register_ftrace_function(struct ftrace_ops *ops)
439{ 433{
434 if (ops->flags & FTRACE_OPS_FL_DELETED)
435 return -EINVAL;
436
440 if (FTRACE_WARN_ON(ops == &global_ops)) 437 if (FTRACE_WARN_ON(ops == &global_ops))
441 return -EINVAL; 438 return -EINVAL;
442 439
@@ -1172,8 +1169,6 @@ struct ftrace_page {
1172 int size; 1169 int size;
1173}; 1170};
1174 1171
1175static struct ftrace_page *ftrace_new_pgs;
1176
1177#define ENTRY_SIZE sizeof(struct dyn_ftrace) 1172#define ENTRY_SIZE sizeof(struct dyn_ftrace)
1178#define ENTRIES_PER_PAGE (PAGE_SIZE / ENTRY_SIZE) 1173#define ENTRIES_PER_PAGE (PAGE_SIZE / ENTRY_SIZE)
1179 1174
@@ -1560,7 +1555,7 @@ unsigned long ftrace_location(unsigned long ip)
1560 * the function tracer. It checks the ftrace internal tables to 1555 * the function tracer. It checks the ftrace internal tables to
1561 * determine if the address belongs or not. 1556 * determine if the address belongs or not.
1562 */ 1557 */
1563int ftrace_text_reserved(void *start, void *end) 1558int ftrace_text_reserved(const void *start, const void *end)
1564{ 1559{
1565 unsigned long ret; 1560 unsigned long ret;
1566 1561
@@ -1994,6 +1989,7 @@ int __weak ftrace_arch_code_modify_post_process(void)
1994void ftrace_modify_all_code(int command) 1989void ftrace_modify_all_code(int command)
1995{ 1990{
1996 int update = command & FTRACE_UPDATE_TRACE_FUNC; 1991 int update = command & FTRACE_UPDATE_TRACE_FUNC;
1992 int err = 0;
1997 1993
1998 /* 1994 /*
1999 * If the ftrace_caller calls a ftrace_ops func directly, 1995 * If the ftrace_caller calls a ftrace_ops func directly,
@@ -2005,8 +2001,11 @@ void ftrace_modify_all_code(int command)
2005 * to make sure the ops are having the right functions 2001 * to make sure the ops are having the right functions
2006 * traced. 2002 * traced.
2007 */ 2003 */
2008 if (update) 2004 if (update) {
2009 ftrace_update_ftrace_func(ftrace_ops_list_func); 2005 err = ftrace_update_ftrace_func(ftrace_ops_list_func);
2006 if (FTRACE_WARN_ON(err))
2007 return;
2008 }
2010 2009
2011 if (command & FTRACE_UPDATE_CALLS) 2010 if (command & FTRACE_UPDATE_CALLS)
2012 ftrace_replace_code(1); 2011 ftrace_replace_code(1);
@@ -2019,13 +2018,16 @@ void ftrace_modify_all_code(int command)
2019 /* If irqs are disabled, we are in stop machine */ 2018 /* If irqs are disabled, we are in stop machine */
2020 if (!irqs_disabled()) 2019 if (!irqs_disabled())
2021 smp_call_function(ftrace_sync_ipi, NULL, 1); 2020 smp_call_function(ftrace_sync_ipi, NULL, 1);
2022 ftrace_update_ftrace_func(ftrace_trace_function); 2021 err = ftrace_update_ftrace_func(ftrace_trace_function);
2022 if (FTRACE_WARN_ON(err))
2023 return;
2023 } 2024 }
2024 2025
2025 if (command & FTRACE_START_FUNC_RET) 2026 if (command & FTRACE_START_FUNC_RET)
2026 ftrace_enable_ftrace_graph_caller(); 2027 err = ftrace_enable_ftrace_graph_caller();
2027 else if (command & FTRACE_STOP_FUNC_RET) 2028 else if (command & FTRACE_STOP_FUNC_RET)
2028 ftrace_disable_ftrace_graph_caller(); 2029 err = ftrace_disable_ftrace_graph_caller();
2030 FTRACE_WARN_ON(err);
2029} 2031}
2030 2032
2031static int __ftrace_modify_code(void *data) 2033static int __ftrace_modify_code(void *data)
@@ -2093,6 +2095,11 @@ static ftrace_func_t saved_ftrace_func;
2093static int ftrace_start_up; 2095static int ftrace_start_up;
2094static int global_start_up; 2096static int global_start_up;
2095 2097
2098static void control_ops_free(struct ftrace_ops *ops)
2099{
2100 free_percpu(ops->disabled);
2101}
2102
2096static void ftrace_startup_enable(int command) 2103static void ftrace_startup_enable(int command)
2097{ 2104{
2098 if (saved_ftrace_func != ftrace_trace_function) { 2105 if (saved_ftrace_func != ftrace_trace_function) {
@@ -2244,7 +2251,6 @@ static void ftrace_shutdown_sysctl(void)
2244} 2251}
2245 2252
2246static cycle_t ftrace_update_time; 2253static cycle_t ftrace_update_time;
2247static unsigned long ftrace_update_cnt;
2248unsigned long ftrace_update_tot_cnt; 2254unsigned long ftrace_update_tot_cnt;
2249 2255
2250static inline int ops_traces_mod(struct ftrace_ops *ops) 2256static inline int ops_traces_mod(struct ftrace_ops *ops)
@@ -2300,11 +2306,12 @@ static int referenced_filters(struct dyn_ftrace *rec)
2300 return cnt; 2306 return cnt;
2301} 2307}
2302 2308
2303static int ftrace_update_code(struct module *mod) 2309static int ftrace_update_code(struct module *mod, struct ftrace_page *new_pgs)
2304{ 2310{
2305 struct ftrace_page *pg; 2311 struct ftrace_page *pg;
2306 struct dyn_ftrace *p; 2312 struct dyn_ftrace *p;
2307 cycle_t start, stop; 2313 cycle_t start, stop;
2314 unsigned long update_cnt = 0;
2308 unsigned long ref = 0; 2315 unsigned long ref = 0;
2309 bool test = false; 2316 bool test = false;
2310 int i; 2317 int i;
@@ -2330,9 +2337,8 @@ static int ftrace_update_code(struct module *mod)
2330 } 2337 }
2331 2338
2332 start = ftrace_now(raw_smp_processor_id()); 2339 start = ftrace_now(raw_smp_processor_id());
2333 ftrace_update_cnt = 0;
2334 2340
2335 for (pg = ftrace_new_pgs; pg; pg = pg->next) { 2341 for (pg = new_pgs; pg; pg = pg->next) {
2336 2342
2337 for (i = 0; i < pg->index; i++) { 2343 for (i = 0; i < pg->index; i++) {
2338 int cnt = ref; 2344 int cnt = ref;
@@ -2353,7 +2359,7 @@ static int ftrace_update_code(struct module *mod)
2353 if (!ftrace_code_disable(mod, p)) 2359 if (!ftrace_code_disable(mod, p))
2354 break; 2360 break;
2355 2361
2356 ftrace_update_cnt++; 2362 update_cnt++;
2357 2363
2358 /* 2364 /*
2359 * If the tracing is enabled, go ahead and enable the record. 2365 * If the tracing is enabled, go ahead and enable the record.
@@ -2372,11 +2378,9 @@ static int ftrace_update_code(struct module *mod)
2372 } 2378 }
2373 } 2379 }
2374 2380
2375 ftrace_new_pgs = NULL;
2376
2377 stop = ftrace_now(raw_smp_processor_id()); 2381 stop = ftrace_now(raw_smp_processor_id());
2378 ftrace_update_time = stop - start; 2382 ftrace_update_time = stop - start;
2379 ftrace_update_tot_cnt += ftrace_update_cnt; 2383 ftrace_update_tot_cnt += update_cnt;
2380 2384
2381 return 0; 2385 return 0;
2382} 2386}
@@ -2468,22 +2472,6 @@ ftrace_allocate_pages(unsigned long num_to_init)
2468 return NULL; 2472 return NULL;
2469} 2473}
2470 2474
2471static int __init ftrace_dyn_table_alloc(unsigned long num_to_init)
2472{
2473 int cnt;
2474
2475 if (!num_to_init) {
2476 pr_info("ftrace: No functions to be traced?\n");
2477 return -1;
2478 }
2479
2480 cnt = num_to_init / ENTRIES_PER_PAGE;
2481 pr_info("ftrace: allocating %ld entries in %d pages\n",
2482 num_to_init, cnt + 1);
2483
2484 return 0;
2485}
2486
2487#define FTRACE_BUFF_MAX (KSYM_SYMBOL_LEN+4) /* room for wildcards */ 2475#define FTRACE_BUFF_MAX (KSYM_SYMBOL_LEN+4) /* room for wildcards */
2488 2476
2489struct ftrace_iterator { 2477struct ftrace_iterator {
@@ -2871,7 +2859,9 @@ ftrace_regex_open(struct ftrace_ops *ops, int flag,
2871static int 2859static int
2872ftrace_filter_open(struct inode *inode, struct file *file) 2860ftrace_filter_open(struct inode *inode, struct file *file)
2873{ 2861{
2874 return ftrace_regex_open(&global_ops, 2862 struct ftrace_ops *ops = inode->i_private;
2863
2864 return ftrace_regex_open(ops,
2875 FTRACE_ITER_FILTER | FTRACE_ITER_DO_HASH, 2865 FTRACE_ITER_FILTER | FTRACE_ITER_DO_HASH,
2876 inode, file); 2866 inode, file);
2877} 2867}
@@ -2879,7 +2869,9 @@ ftrace_filter_open(struct inode *inode, struct file *file)
2879static int 2869static int
2880ftrace_notrace_open(struct inode *inode, struct file *file) 2870ftrace_notrace_open(struct inode *inode, struct file *file)
2881{ 2871{
2882 return ftrace_regex_open(&global_ops, FTRACE_ITER_NOTRACE, 2872 struct ftrace_ops *ops = inode->i_private;
2873
2874 return ftrace_regex_open(ops, FTRACE_ITER_NOTRACE,
2883 inode, file); 2875 inode, file);
2884} 2876}
2885 2877
@@ -4109,6 +4101,36 @@ static const struct file_operations ftrace_graph_notrace_fops = {
4109}; 4101};
4110#endif /* CONFIG_FUNCTION_GRAPH_TRACER */ 4102#endif /* CONFIG_FUNCTION_GRAPH_TRACER */
4111 4103
4104void ftrace_create_filter_files(struct ftrace_ops *ops,
4105 struct dentry *parent)
4106{
4107
4108 trace_create_file("set_ftrace_filter", 0644, parent,
4109 ops, &ftrace_filter_fops);
4110
4111 trace_create_file("set_ftrace_notrace", 0644, parent,
4112 ops, &ftrace_notrace_fops);
4113}
4114
4115/*
4116 * The name "destroy_filter_files" is really a misnomer. Although
4117 * in the future, it may actually delete the files, but this is
4118 * really intended to make sure the ops passed in are disabled
4119 * and that when this function returns, the caller is free to
4120 * free the ops.
4121 *
4122 * The "destroy" name is only to match the "create" name that this
4123 * should be paired with.
4124 */
4125void ftrace_destroy_filter_files(struct ftrace_ops *ops)
4126{
4127 mutex_lock(&ftrace_lock);
4128 if (ops->flags & FTRACE_OPS_FL_ENABLED)
4129 ftrace_shutdown(ops, 0);
4130 ops->flags |= FTRACE_OPS_FL_DELETED;
4131 mutex_unlock(&ftrace_lock);
4132}
4133
4112static __init int ftrace_init_dyn_debugfs(struct dentry *d_tracer) 4134static __init int ftrace_init_dyn_debugfs(struct dentry *d_tracer)
4113{ 4135{
4114 4136
@@ -4118,11 +4140,7 @@ static __init int ftrace_init_dyn_debugfs(struct dentry *d_tracer)
4118 trace_create_file("enabled_functions", 0444, 4140 trace_create_file("enabled_functions", 0444,
4119 d_tracer, NULL, &ftrace_enabled_fops); 4141 d_tracer, NULL, &ftrace_enabled_fops);
4120 4142
4121 trace_create_file("set_ftrace_filter", 0644, d_tracer, 4143 ftrace_create_filter_files(&global_ops, d_tracer);
4122 NULL, &ftrace_filter_fops);
4123
4124 trace_create_file("set_ftrace_notrace", 0644, d_tracer,
4125 NULL, &ftrace_notrace_fops);
4126 4144
4127#ifdef CONFIG_FUNCTION_GRAPH_TRACER 4145#ifdef CONFIG_FUNCTION_GRAPH_TRACER
4128 trace_create_file("set_graph_function", 0444, d_tracer, 4146 trace_create_file("set_graph_function", 0444, d_tracer,
@@ -4238,9 +4256,6 @@ static int ftrace_process_locs(struct module *mod,
4238 /* Assign the last page to ftrace_pages */ 4256 /* Assign the last page to ftrace_pages */
4239 ftrace_pages = pg; 4257 ftrace_pages = pg;
4240 4258
4241 /* These new locations need to be initialized */
4242 ftrace_new_pgs = start_pg;
4243
4244 /* 4259 /*
4245 * We only need to disable interrupts on start up 4260 * We only need to disable interrupts on start up
4246 * because we are modifying code that an interrupt 4261 * because we are modifying code that an interrupt
@@ -4251,7 +4266,7 @@ static int ftrace_process_locs(struct module *mod,
4251 */ 4266 */
4252 if (!mod) 4267 if (!mod)
4253 local_irq_save(flags); 4268 local_irq_save(flags);
4254 ftrace_update_code(mod); 4269 ftrace_update_code(mod, start_pg);
4255 if (!mod) 4270 if (!mod)
4256 local_irq_restore(flags); 4271 local_irq_restore(flags);
4257 ret = 0; 4272 ret = 0;
@@ -4360,30 +4375,27 @@ struct notifier_block ftrace_module_exit_nb = {
4360 .priority = INT_MIN, /* Run after anything that can remove kprobes */ 4375 .priority = INT_MIN, /* Run after anything that can remove kprobes */
4361}; 4376};
4362 4377
4363extern unsigned long __start_mcount_loc[];
4364extern unsigned long __stop_mcount_loc[];
4365
4366void __init ftrace_init(void) 4378void __init ftrace_init(void)
4367{ 4379{
4368 unsigned long count, addr, flags; 4380 extern unsigned long __start_mcount_loc[];
4381 extern unsigned long __stop_mcount_loc[];
4382 unsigned long count, flags;
4369 int ret; 4383 int ret;
4370 4384
4371 /* Keep the ftrace pointer to the stub */
4372 addr = (unsigned long)ftrace_stub;
4373
4374 local_irq_save(flags); 4385 local_irq_save(flags);
4375 ftrace_dyn_arch_init(&addr); 4386 ret = ftrace_dyn_arch_init();
4376 local_irq_restore(flags); 4387 local_irq_restore(flags);
4377 4388 if (ret)
4378 /* ftrace_dyn_arch_init places the return code in addr */
4379 if (addr)
4380 goto failed; 4389 goto failed;
4381 4390
4382 count = __stop_mcount_loc - __start_mcount_loc; 4391 count = __stop_mcount_loc - __start_mcount_loc;
4383 4392 if (!count) {
4384 ret = ftrace_dyn_table_alloc(count); 4393 pr_info("ftrace: No functions to be traced?\n");
4385 if (ret)
4386 goto failed; 4394 goto failed;
4395 }
4396
4397 pr_info("ftrace: allocating %ld entries in %ld pages\n",
4398 count, count / ENTRIES_PER_PAGE + 1);
4387 4399
4388 last_ftrace_enabled = ftrace_enabled = 1; 4400 last_ftrace_enabled = ftrace_enabled = 1;
4389 4401
@@ -4431,7 +4443,13 @@ static inline void ftrace_startup_enable(int command) { }
4431 (ops)->flags |= FTRACE_OPS_FL_ENABLED; \ 4443 (ops)->flags |= FTRACE_OPS_FL_ENABLED; \
4432 ___ret; \ 4444 ___ret; \
4433 }) 4445 })
4434# define ftrace_shutdown(ops, command) __unregister_ftrace_function(ops) 4446# define ftrace_shutdown(ops, command) \
4447 ({ \
4448 int ___ret = __unregister_ftrace_function(ops); \
4449 if (!___ret) \
4450 (ops)->flags &= ~FTRACE_OPS_FL_ENABLED; \
4451 ___ret; \
4452 })
4435 4453
4436# define ftrace_startup_sysctl() do { } while (0) 4454# define ftrace_startup_sysctl() do { } while (0)
4437# define ftrace_shutdown_sysctl() do { } while (0) 4455# define ftrace_shutdown_sysctl() do { } while (0)
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index fc4da2d97f9b..c634868c2921 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -1301,7 +1301,7 @@ struct ring_buffer *__ring_buffer_alloc(unsigned long size, unsigned flags,
1301 * In that off case, we need to allocate for all possible cpus. 1301 * In that off case, we need to allocate for all possible cpus.
1302 */ 1302 */
1303#ifdef CONFIG_HOTPLUG_CPU 1303#ifdef CONFIG_HOTPLUG_CPU
1304 get_online_cpus(); 1304 cpu_notifier_register_begin();
1305 cpumask_copy(buffer->cpumask, cpu_online_mask); 1305 cpumask_copy(buffer->cpumask, cpu_online_mask);
1306#else 1306#else
1307 cpumask_copy(buffer->cpumask, cpu_possible_mask); 1307 cpumask_copy(buffer->cpumask, cpu_possible_mask);
@@ -1324,10 +1324,10 @@ struct ring_buffer *__ring_buffer_alloc(unsigned long size, unsigned flags,
1324#ifdef CONFIG_HOTPLUG_CPU 1324#ifdef CONFIG_HOTPLUG_CPU
1325 buffer->cpu_notify.notifier_call = rb_cpu_notify; 1325 buffer->cpu_notify.notifier_call = rb_cpu_notify;
1326 buffer->cpu_notify.priority = 0; 1326 buffer->cpu_notify.priority = 0;
1327 register_cpu_notifier(&buffer->cpu_notify); 1327 __register_cpu_notifier(&buffer->cpu_notify);
1328 cpu_notifier_register_done();
1328#endif 1329#endif
1329 1330
1330 put_online_cpus();
1331 mutex_init(&buffer->mutex); 1331 mutex_init(&buffer->mutex);
1332 1332
1333 return buffer; 1333 return buffer;
@@ -1341,7 +1341,9 @@ struct ring_buffer *__ring_buffer_alloc(unsigned long size, unsigned flags,
1341 1341
1342 fail_free_cpumask: 1342 fail_free_cpumask:
1343 free_cpumask_var(buffer->cpumask); 1343 free_cpumask_var(buffer->cpumask);
1344 put_online_cpus(); 1344#ifdef CONFIG_HOTPLUG_CPU
1345 cpu_notifier_register_done();
1346#endif
1345 1347
1346 fail_free_buffer: 1348 fail_free_buffer:
1347 kfree(buffer); 1349 kfree(buffer);
@@ -1358,16 +1360,17 @@ ring_buffer_free(struct ring_buffer *buffer)
1358{ 1360{
1359 int cpu; 1361 int cpu;
1360 1362
1361 get_online_cpus();
1362
1363#ifdef CONFIG_HOTPLUG_CPU 1363#ifdef CONFIG_HOTPLUG_CPU
1364 unregister_cpu_notifier(&buffer->cpu_notify); 1364 cpu_notifier_register_begin();
1365 __unregister_cpu_notifier(&buffer->cpu_notify);
1365#endif 1366#endif
1366 1367
1367 for_each_buffer_cpu(buffer, cpu) 1368 for_each_buffer_cpu(buffer, cpu)
1368 rb_free_cpu_buffer(buffer->buffers[cpu]); 1369 rb_free_cpu_buffer(buffer->buffers[cpu]);
1369 1370
1370 put_online_cpus(); 1371#ifdef CONFIG_HOTPLUG_CPU
1372 cpu_notifier_register_done();
1373#endif
1371 1374
1372 kfree(buffer->buffers); 1375 kfree(buffer->buffers);
1373 free_cpumask_var(buffer->cpumask); 1376 free_cpumask_var(buffer->cpumask);
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 24c1f2382557..737b0efa1a62 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -73,7 +73,8 @@ static struct tracer_flags dummy_tracer_flags = {
73 .opts = dummy_tracer_opt 73 .opts = dummy_tracer_opt
74}; 74};
75 75
76static int dummy_set_flag(u32 old_flags, u32 bit, int set) 76static int
77dummy_set_flag(struct trace_array *tr, u32 old_flags, u32 bit, int set)
77{ 78{
78 return 0; 79 return 0;
79} 80}
@@ -118,7 +119,7 @@ enum ftrace_dump_mode ftrace_dump_on_oops;
118/* When set, tracing will stop when a WARN*() is hit */ 119/* When set, tracing will stop when a WARN*() is hit */
119int __disable_trace_on_warning; 120int __disable_trace_on_warning;
120 121
121static int tracing_set_tracer(const char *buf); 122static int tracing_set_tracer(struct trace_array *tr, const char *buf);
122 123
123#define MAX_TRACER_SIZE 100 124#define MAX_TRACER_SIZE 100
124static char bootup_tracer_buf[MAX_TRACER_SIZE] __initdata; 125static char bootup_tracer_buf[MAX_TRACER_SIZE] __initdata;
@@ -180,6 +181,17 @@ static int __init set_trace_boot_options(char *str)
180} 181}
181__setup("trace_options=", set_trace_boot_options); 182__setup("trace_options=", set_trace_boot_options);
182 183
184static char trace_boot_clock_buf[MAX_TRACER_SIZE] __initdata;
185static char *trace_boot_clock __initdata;
186
187static int __init set_trace_boot_clock(char *str)
188{
189 strlcpy(trace_boot_clock_buf, str, MAX_TRACER_SIZE);
190 trace_boot_clock = trace_boot_clock_buf;
191 return 0;
192}
193__setup("trace_clock=", set_trace_boot_clock);
194
183 195
184unsigned long long ns2usecs(cycle_t nsec) 196unsigned long long ns2usecs(cycle_t nsec)
185{ 197{
@@ -1230,7 +1242,7 @@ int register_tracer(struct tracer *type)
1230 1242
1231 printk(KERN_INFO "Starting tracer '%s'\n", type->name); 1243 printk(KERN_INFO "Starting tracer '%s'\n", type->name);
1232 /* Do we want this tracer to start on bootup? */ 1244 /* Do we want this tracer to start on bootup? */
1233 tracing_set_tracer(type->name); 1245 tracing_set_tracer(&global_trace, type->name);
1234 default_bootup_tracer = NULL; 1246 default_bootup_tracer = NULL;
1235 /* disable other selftests, since this will break it. */ 1247 /* disable other selftests, since this will break it. */
1236 tracing_selftest_disabled = true; 1248 tracing_selftest_disabled = true;
@@ -3137,27 +3149,52 @@ static int tracing_open(struct inode *inode, struct file *file)
3137 return ret; 3149 return ret;
3138} 3150}
3139 3151
3152/*
3153 * Some tracers are not suitable for instance buffers.
3154 * A tracer is always available for the global array (toplevel)
3155 * or if it explicitly states that it is.
3156 */
3157static bool
3158trace_ok_for_array(struct tracer *t, struct trace_array *tr)
3159{
3160 return (tr->flags & TRACE_ARRAY_FL_GLOBAL) || t->allow_instances;
3161}
3162
3163/* Find the next tracer that this trace array may use */
3164static struct tracer *
3165get_tracer_for_array(struct trace_array *tr, struct tracer *t)
3166{
3167 while (t && !trace_ok_for_array(t, tr))
3168 t = t->next;
3169
3170 return t;
3171}
3172
3140static void * 3173static void *
3141t_next(struct seq_file *m, void *v, loff_t *pos) 3174t_next(struct seq_file *m, void *v, loff_t *pos)
3142{ 3175{
3176 struct trace_array *tr = m->private;
3143 struct tracer *t = v; 3177 struct tracer *t = v;
3144 3178
3145 (*pos)++; 3179 (*pos)++;
3146 3180
3147 if (t) 3181 if (t)
3148 t = t->next; 3182 t = get_tracer_for_array(tr, t->next);
3149 3183
3150 return t; 3184 return t;
3151} 3185}
3152 3186
3153static void *t_start(struct seq_file *m, loff_t *pos) 3187static void *t_start(struct seq_file *m, loff_t *pos)
3154{ 3188{
3189 struct trace_array *tr = m->private;
3155 struct tracer *t; 3190 struct tracer *t;
3156 loff_t l = 0; 3191 loff_t l = 0;
3157 3192
3158 mutex_lock(&trace_types_lock); 3193 mutex_lock(&trace_types_lock);
3159 for (t = trace_types; t && l < *pos; t = t_next(m, t, &l)) 3194
3160 ; 3195 t = get_tracer_for_array(tr, trace_types);
3196 for (; t && l < *pos; t = t_next(m, t, &l))
3197 ;
3161 3198
3162 return t; 3199 return t;
3163} 3200}
@@ -3192,10 +3229,21 @@ static const struct seq_operations show_traces_seq_ops = {
3192 3229
3193static int show_traces_open(struct inode *inode, struct file *file) 3230static int show_traces_open(struct inode *inode, struct file *file)
3194{ 3231{
3232 struct trace_array *tr = inode->i_private;
3233 struct seq_file *m;
3234 int ret;
3235
3195 if (tracing_disabled) 3236 if (tracing_disabled)
3196 return -ENODEV; 3237 return -ENODEV;
3197 3238
3198 return seq_open(file, &show_traces_seq_ops); 3239 ret = seq_open(file, &show_traces_seq_ops);
3240 if (ret)
3241 return ret;
3242
3243 m = file->private_data;
3244 m->private = tr;
3245
3246 return 0;
3199} 3247}
3200 3248
3201static ssize_t 3249static ssize_t
@@ -3355,13 +3403,14 @@ static int tracing_trace_options_show(struct seq_file *m, void *v)
3355 return 0; 3403 return 0;
3356} 3404}
3357 3405
3358static int __set_tracer_option(struct tracer *trace, 3406static int __set_tracer_option(struct trace_array *tr,
3359 struct tracer_flags *tracer_flags, 3407 struct tracer_flags *tracer_flags,
3360 struct tracer_opt *opts, int neg) 3408 struct tracer_opt *opts, int neg)
3361{ 3409{
3410 struct tracer *trace = tr->current_trace;
3362 int ret; 3411 int ret;
3363 3412
3364 ret = trace->set_flag(tracer_flags->val, opts->bit, !neg); 3413 ret = trace->set_flag(tr, tracer_flags->val, opts->bit, !neg);
3365 if (ret) 3414 if (ret)
3366 return ret; 3415 return ret;
3367 3416
@@ -3373,8 +3422,9 @@ static int __set_tracer_option(struct tracer *trace,
3373} 3422}
3374 3423
3375/* Try to assign a tracer specific option */ 3424/* Try to assign a tracer specific option */
3376static int set_tracer_option(struct tracer *trace, char *cmp, int neg) 3425static int set_tracer_option(struct trace_array *tr, char *cmp, int neg)
3377{ 3426{
3427 struct tracer *trace = tr->current_trace;
3378 struct tracer_flags *tracer_flags = trace->flags; 3428 struct tracer_flags *tracer_flags = trace->flags;
3379 struct tracer_opt *opts = NULL; 3429 struct tracer_opt *opts = NULL;
3380 int i; 3430 int i;
@@ -3383,8 +3433,7 @@ static int set_tracer_option(struct tracer *trace, char *cmp, int neg)
3383 opts = &tracer_flags->opts[i]; 3433 opts = &tracer_flags->opts[i];
3384 3434
3385 if (strcmp(cmp, opts->name) == 0) 3435 if (strcmp(cmp, opts->name) == 0)
3386 return __set_tracer_option(trace, trace->flags, 3436 return __set_tracer_option(tr, trace->flags, opts, neg);
3387 opts, neg);
3388 } 3437 }
3389 3438
3390 return -EINVAL; 3439 return -EINVAL;
@@ -3407,7 +3456,7 @@ int set_tracer_flag(struct trace_array *tr, unsigned int mask, int enabled)
3407 3456
3408 /* Give the tracer a chance to approve the change */ 3457 /* Give the tracer a chance to approve the change */
3409 if (tr->current_trace->flag_changed) 3458 if (tr->current_trace->flag_changed)
3410 if (tr->current_trace->flag_changed(tr->current_trace, mask, !!enabled)) 3459 if (tr->current_trace->flag_changed(tr, mask, !!enabled))
3411 return -EINVAL; 3460 return -EINVAL;
3412 3461
3413 if (enabled) 3462 if (enabled)
@@ -3456,7 +3505,7 @@ static int trace_set_options(struct trace_array *tr, char *option)
3456 3505
3457 /* If no option could be set, test the specific tracer options */ 3506 /* If no option could be set, test the specific tracer options */
3458 if (!trace_options[i]) 3507 if (!trace_options[i])
3459 ret = set_tracer_option(tr->current_trace, cmp, neg); 3508 ret = set_tracer_option(tr, cmp, neg);
3460 3509
3461 mutex_unlock(&trace_types_lock); 3510 mutex_unlock(&trace_types_lock);
3462 3511
@@ -3562,6 +3611,8 @@ static const char readme_msg[] =
3562#ifdef CONFIG_TRACER_SNAPSHOT 3611#ifdef CONFIG_TRACER_SNAPSHOT
3563 "\t\t snapshot\n" 3612 "\t\t snapshot\n"
3564#endif 3613#endif
3614 "\t\t dump\n"
3615 "\t\t cpudump\n"
3565 "\t example: echo do_fault:traceoff > set_ftrace_filter\n" 3616 "\t example: echo do_fault:traceoff > set_ftrace_filter\n"
3566 "\t echo do_trap:traceoff:3 > set_ftrace_filter\n" 3617 "\t echo do_trap:traceoff:3 > set_ftrace_filter\n"
3567 "\t The first one will disable tracing every time do_fault is hit\n" 3618 "\t The first one will disable tracing every time do_fault is hit\n"
@@ -3885,10 +3936,26 @@ create_trace_option_files(struct trace_array *tr, struct tracer *tracer);
3885static void 3936static void
3886destroy_trace_option_files(struct trace_option_dentry *topts); 3937destroy_trace_option_files(struct trace_option_dentry *topts);
3887 3938
3888static int tracing_set_tracer(const char *buf) 3939/*
3940 * Used to clear out the tracer before deletion of an instance.
3941 * Must have trace_types_lock held.
3942 */
3943static void tracing_set_nop(struct trace_array *tr)
3944{
3945 if (tr->current_trace == &nop_trace)
3946 return;
3947
3948 tr->current_trace->enabled--;
3949
3950 if (tr->current_trace->reset)
3951 tr->current_trace->reset(tr);
3952
3953 tr->current_trace = &nop_trace;
3954}
3955
3956static int tracing_set_tracer(struct trace_array *tr, const char *buf)
3889{ 3957{
3890 static struct trace_option_dentry *topts; 3958 static struct trace_option_dentry *topts;
3891 struct trace_array *tr = &global_trace;
3892 struct tracer *t; 3959 struct tracer *t;
3893#ifdef CONFIG_TRACER_MAX_TRACE 3960#ifdef CONFIG_TRACER_MAX_TRACE
3894 bool had_max_tr; 3961 bool had_max_tr;
@@ -3916,9 +3983,15 @@ static int tracing_set_tracer(const char *buf)
3916 if (t == tr->current_trace) 3983 if (t == tr->current_trace)
3917 goto out; 3984 goto out;
3918 3985
3986 /* Some tracers are only allowed for the top level buffer */
3987 if (!trace_ok_for_array(t, tr)) {
3988 ret = -EINVAL;
3989 goto out;
3990 }
3991
3919 trace_branch_disable(); 3992 trace_branch_disable();
3920 3993
3921 tr->current_trace->enabled = false; 3994 tr->current_trace->enabled--;
3922 3995
3923 if (tr->current_trace->reset) 3996 if (tr->current_trace->reset)
3924 tr->current_trace->reset(tr); 3997 tr->current_trace->reset(tr);
@@ -3941,9 +4014,11 @@ static int tracing_set_tracer(const char *buf)
3941 free_snapshot(tr); 4014 free_snapshot(tr);
3942 } 4015 }
3943#endif 4016#endif
3944 destroy_trace_option_files(topts); 4017 /* Currently, only the top instance has options */
3945 4018 if (tr->flags & TRACE_ARRAY_FL_GLOBAL) {
3946 topts = create_trace_option_files(tr, t); 4019 destroy_trace_option_files(topts);
4020 topts = create_trace_option_files(tr, t);
4021 }
3947 4022
3948#ifdef CONFIG_TRACER_MAX_TRACE 4023#ifdef CONFIG_TRACER_MAX_TRACE
3949 if (t->use_max_tr && !had_max_tr) { 4024 if (t->use_max_tr && !had_max_tr) {
@@ -3960,7 +4035,7 @@ static int tracing_set_tracer(const char *buf)
3960 } 4035 }
3961 4036
3962 tr->current_trace = t; 4037 tr->current_trace = t;
3963 tr->current_trace->enabled = true; 4038 tr->current_trace->enabled++;
3964 trace_branch_enable(tr); 4039 trace_branch_enable(tr);
3965 out: 4040 out:
3966 mutex_unlock(&trace_types_lock); 4041 mutex_unlock(&trace_types_lock);
@@ -3972,6 +4047,7 @@ static ssize_t
3972tracing_set_trace_write(struct file *filp, const char __user *ubuf, 4047tracing_set_trace_write(struct file *filp, const char __user *ubuf,
3973 size_t cnt, loff_t *ppos) 4048 size_t cnt, loff_t *ppos)
3974{ 4049{
4050 struct trace_array *tr = filp->private_data;
3975 char buf[MAX_TRACER_SIZE+1]; 4051 char buf[MAX_TRACER_SIZE+1];
3976 int i; 4052 int i;
3977 size_t ret; 4053 size_t ret;
@@ -3991,7 +4067,7 @@ tracing_set_trace_write(struct file *filp, const char __user *ubuf,
3991 for (i = cnt - 1; i > 0 && isspace(buf[i]); i--) 4067 for (i = cnt - 1; i > 0 && isspace(buf[i]); i--)
3992 buf[i] = 0; 4068 buf[i] = 0;
3993 4069
3994 err = tracing_set_tracer(buf); 4070 err = tracing_set_tracer(tr, buf);
3995 if (err) 4071 if (err)
3996 return err; 4072 return err;
3997 4073
@@ -4316,8 +4392,6 @@ static void tracing_spd_release_pipe(struct splice_pipe_desc *spd,
4316 4392
4317static const struct pipe_buf_operations tracing_pipe_buf_ops = { 4393static const struct pipe_buf_operations tracing_pipe_buf_ops = {
4318 .can_merge = 0, 4394 .can_merge = 0,
4319 .map = generic_pipe_buf_map,
4320 .unmap = generic_pipe_buf_unmap,
4321 .confirm = generic_pipe_buf_confirm, 4395 .confirm = generic_pipe_buf_confirm,
4322 .release = generic_pipe_buf_release, 4396 .release = generic_pipe_buf_release,
4323 .steal = generic_pipe_buf_steal, 4397 .steal = generic_pipe_buf_steal,
@@ -4412,7 +4486,7 @@ static ssize_t tracing_splice_read_pipe(struct file *filp,
4412 trace_access_lock(iter->cpu_file); 4486 trace_access_lock(iter->cpu_file);
4413 4487
4414 /* Fill as many pages as possible. */ 4488 /* Fill as many pages as possible. */
4415 for (i = 0, rem = len; i < pipe->buffers && rem; i++) { 4489 for (i = 0, rem = len; i < spd.nr_pages_max && rem; i++) {
4416 spd.pages[i] = alloc_page(GFP_KERNEL); 4490 spd.pages[i] = alloc_page(GFP_KERNEL);
4417 if (!spd.pages[i]) 4491 if (!spd.pages[i])
4418 break; 4492 break;
@@ -4699,25 +4773,10 @@ static int tracing_clock_show(struct seq_file *m, void *v)
4699 return 0; 4773 return 0;
4700} 4774}
4701 4775
4702static ssize_t tracing_clock_write(struct file *filp, const char __user *ubuf, 4776static int tracing_set_clock(struct trace_array *tr, const char *clockstr)
4703 size_t cnt, loff_t *fpos)
4704{ 4777{
4705 struct seq_file *m = filp->private_data;
4706 struct trace_array *tr = m->private;
4707 char buf[64];
4708 const char *clockstr;
4709 int i; 4778 int i;
4710 4779
4711 if (cnt >= sizeof(buf))
4712 return -EINVAL;
4713
4714 if (copy_from_user(&buf, ubuf, cnt))
4715 return -EFAULT;
4716
4717 buf[cnt] = 0;
4718
4719 clockstr = strstrip(buf);
4720
4721 for (i = 0; i < ARRAY_SIZE(trace_clocks); i++) { 4780 for (i = 0; i < ARRAY_SIZE(trace_clocks); i++) {
4722 if (strcmp(trace_clocks[i].name, clockstr) == 0) 4781 if (strcmp(trace_clocks[i].name, clockstr) == 0)
4723 break; 4782 break;
@@ -4745,6 +4804,32 @@ static ssize_t tracing_clock_write(struct file *filp, const char __user *ubuf,
4745 4804
4746 mutex_unlock(&trace_types_lock); 4805 mutex_unlock(&trace_types_lock);
4747 4806
4807 return 0;
4808}
4809
4810static ssize_t tracing_clock_write(struct file *filp, const char __user *ubuf,
4811 size_t cnt, loff_t *fpos)
4812{
4813 struct seq_file *m = filp->private_data;
4814 struct trace_array *tr = m->private;
4815 char buf[64];
4816 const char *clockstr;
4817 int ret;
4818
4819 if (cnt >= sizeof(buf))
4820 return -EINVAL;
4821
4822 if (copy_from_user(&buf, ubuf, cnt))
4823 return -EFAULT;
4824
4825 buf[cnt] = 0;
4826
4827 clockstr = strstrip(buf);
4828
4829 ret = tracing_set_clock(tr, clockstr);
4830 if (ret)
4831 return ret;
4832
4748 *fpos += cnt; 4833 *fpos += cnt;
4749 4834
4750 return cnt; 4835 return cnt;
@@ -5194,8 +5279,6 @@ static void buffer_pipe_buf_get(struct pipe_inode_info *pipe,
5194/* Pipe buffer operations for a buffer. */ 5279/* Pipe buffer operations for a buffer. */
5195static const struct pipe_buf_operations buffer_pipe_buf_ops = { 5280static const struct pipe_buf_operations buffer_pipe_buf_ops = {
5196 .can_merge = 0, 5281 .can_merge = 0,
5197 .map = generic_pipe_buf_map,
5198 .unmap = generic_pipe_buf_unmap,
5199 .confirm = generic_pipe_buf_confirm, 5282 .confirm = generic_pipe_buf_confirm,
5200 .release = buffer_pipe_buf_release, 5283 .release = buffer_pipe_buf_release,
5201 .steal = generic_pipe_buf_steal, 5284 .steal = generic_pipe_buf_steal,
@@ -5271,7 +5354,7 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos,
5271 trace_access_lock(iter->cpu_file); 5354 trace_access_lock(iter->cpu_file);
5272 entries = ring_buffer_entries_cpu(iter->trace_buffer->buffer, iter->cpu_file); 5355 entries = ring_buffer_entries_cpu(iter->trace_buffer->buffer, iter->cpu_file);
5273 5356
5274 for (i = 0; i < pipe->buffers && len && entries; i++, len -= PAGE_SIZE) { 5357 for (i = 0; i < spd.nr_pages_max && len && entries; i++, len -= PAGE_SIZE) {
5275 struct page *page; 5358 struct page *page;
5276 int r; 5359 int r;
5277 5360
@@ -5705,7 +5788,7 @@ trace_options_write(struct file *filp, const char __user *ubuf, size_t cnt,
5705 5788
5706 if (!!(topt->flags->val & topt->opt->bit) != val) { 5789 if (!!(topt->flags->val & topt->opt->bit) != val) {
5707 mutex_lock(&trace_types_lock); 5790 mutex_lock(&trace_types_lock);
5708 ret = __set_tracer_option(topt->tr->current_trace, topt->flags, 5791 ret = __set_tracer_option(topt->tr, topt->flags,
5709 topt->opt, !val); 5792 topt->opt, !val);
5710 mutex_unlock(&trace_types_lock); 5793 mutex_unlock(&trace_types_lock);
5711 if (ret) 5794 if (ret)
@@ -6112,7 +6195,9 @@ static int instance_delete(const char *name)
6112 6195
6113 list_del(&tr->list); 6196 list_del(&tr->list);
6114 6197
6198 tracing_set_nop(tr);
6115 event_trace_del_tracer(tr); 6199 event_trace_del_tracer(tr);
6200 ftrace_destroy_function_files(tr);
6116 debugfs_remove_recursive(tr->dir); 6201 debugfs_remove_recursive(tr->dir);
6117 free_percpu(tr->trace_buffer.data); 6202 free_percpu(tr->trace_buffer.data);
6118 ring_buffer_free(tr->trace_buffer.buffer); 6203 ring_buffer_free(tr->trace_buffer.buffer);
@@ -6207,6 +6292,12 @@ init_tracer_debugfs(struct trace_array *tr, struct dentry *d_tracer)
6207{ 6292{
6208 int cpu; 6293 int cpu;
6209 6294
6295 trace_create_file("available_tracers", 0444, d_tracer,
6296 tr, &show_traces_fops);
6297
6298 trace_create_file("current_tracer", 0644, d_tracer,
6299 tr, &set_tracer_fops);
6300
6210 trace_create_file("tracing_cpumask", 0644, d_tracer, 6301 trace_create_file("tracing_cpumask", 0644, d_tracer,
6211 tr, &tracing_cpumask_fops); 6302 tr, &tracing_cpumask_fops);
6212 6303
@@ -6237,6 +6328,9 @@ init_tracer_debugfs(struct trace_array *tr, struct dentry *d_tracer)
6237 trace_create_file("tracing_on", 0644, d_tracer, 6328 trace_create_file("tracing_on", 0644, d_tracer,
6238 tr, &rb_simple_fops); 6329 tr, &rb_simple_fops);
6239 6330
6331 if (ftrace_create_function_files(tr, d_tracer))
6332 WARN(1, "Could not allocate function filter files");
6333
6240#ifdef CONFIG_TRACER_SNAPSHOT 6334#ifdef CONFIG_TRACER_SNAPSHOT
6241 trace_create_file("snapshot", 0644, d_tracer, 6335 trace_create_file("snapshot", 0644, d_tracer,
6242 tr, &snapshot_fops); 6336 tr, &snapshot_fops);
@@ -6259,12 +6353,6 @@ static __init int tracer_init_debugfs(void)
6259 6353
6260 init_tracer_debugfs(&global_trace, d_tracer); 6354 init_tracer_debugfs(&global_trace, d_tracer);
6261 6355
6262 trace_create_file("available_tracers", 0444, d_tracer,
6263 &global_trace, &show_traces_fops);
6264
6265 trace_create_file("current_tracer", 0644, d_tracer,
6266 &global_trace, &set_tracer_fops);
6267
6268#ifdef CONFIG_TRACER_MAX_TRACE 6356#ifdef CONFIG_TRACER_MAX_TRACE
6269 trace_create_file("tracing_max_latency", 0644, d_tracer, 6357 trace_create_file("tracing_max_latency", 0644, d_tracer,
6270 &tracing_max_latency, &tracing_max_lat_fops); 6358 &tracing_max_latency, &tracing_max_lat_fops);
@@ -6527,6 +6615,13 @@ __init static int tracer_alloc_buffers(void)
6527 6615
6528 trace_init_cmdlines(); 6616 trace_init_cmdlines();
6529 6617
6618 if (trace_boot_clock) {
6619 ret = tracing_set_clock(&global_trace, trace_boot_clock);
6620 if (ret < 0)
6621 pr_warning("Trace clock %s not defined, going back to default\n",
6622 trace_boot_clock);
6623 }
6624
6530 /* 6625 /*
6531 * register_tracer() might reference current_trace, so it 6626 * register_tracer() might reference current_trace, so it
6532 * needs to be set before we register anything. This is 6627 * needs to be set before we register anything. This is
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index 02b592f2d4b7..2e29d7ba5a52 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -13,6 +13,7 @@
13#include <linux/hw_breakpoint.h> 13#include <linux/hw_breakpoint.h>
14#include <linux/trace_seq.h> 14#include <linux/trace_seq.h>
15#include <linux/ftrace_event.h> 15#include <linux/ftrace_event.h>
16#include <linux/compiler.h>
16 17
17#ifdef CONFIG_FTRACE_SYSCALLS 18#ifdef CONFIG_FTRACE_SYSCALLS
18#include <asm/unistd.h> /* For NR_SYSCALLS */ 19#include <asm/unistd.h> /* For NR_SYSCALLS */
@@ -210,6 +211,11 @@ struct trace_array {
210 struct list_head events; 211 struct list_head events;
211 cpumask_var_t tracing_cpumask; /* only trace on set CPUs */ 212 cpumask_var_t tracing_cpumask; /* only trace on set CPUs */
212 int ref; 213 int ref;
214#ifdef CONFIG_FUNCTION_TRACER
215 struct ftrace_ops *ops;
216 /* function tracing enabled */
217 int function_enabled;
218#endif
213}; 219};
214 220
215enum { 221enum {
@@ -355,14 +361,16 @@ struct tracer {
355 void (*print_header)(struct seq_file *m); 361 void (*print_header)(struct seq_file *m);
356 enum print_line_t (*print_line)(struct trace_iterator *iter); 362 enum print_line_t (*print_line)(struct trace_iterator *iter);
357 /* If you handled the flag setting, return 0 */ 363 /* If you handled the flag setting, return 0 */
358 int (*set_flag)(u32 old_flags, u32 bit, int set); 364 int (*set_flag)(struct trace_array *tr,
365 u32 old_flags, u32 bit, int set);
359 /* Return 0 if OK with change, else return non-zero */ 366 /* Return 0 if OK with change, else return non-zero */
360 int (*flag_changed)(struct tracer *tracer, 367 int (*flag_changed)(struct trace_array *tr,
361 u32 mask, int set); 368 u32 mask, int set);
362 struct tracer *next; 369 struct tracer *next;
363 struct tracer_flags *flags; 370 struct tracer_flags *flags;
371 int enabled;
364 bool print_max; 372 bool print_max;
365 bool enabled; 373 bool allow_instances;
366#ifdef CONFIG_TRACER_MAX_TRACE 374#ifdef CONFIG_TRACER_MAX_TRACE
367 bool use_max_tr; 375 bool use_max_tr;
368#endif 376#endif
@@ -812,13 +820,36 @@ static inline int ftrace_trace_task(struct task_struct *task)
812 return test_tsk_trace_trace(task); 820 return test_tsk_trace_trace(task);
813} 821}
814extern int ftrace_is_dead(void); 822extern int ftrace_is_dead(void);
823int ftrace_create_function_files(struct trace_array *tr,
824 struct dentry *parent);
825void ftrace_destroy_function_files(struct trace_array *tr);
815#else 826#else
816static inline int ftrace_trace_task(struct task_struct *task) 827static inline int ftrace_trace_task(struct task_struct *task)
817{ 828{
818 return 1; 829 return 1;
819} 830}
820static inline int ftrace_is_dead(void) { return 0; } 831static inline int ftrace_is_dead(void) { return 0; }
821#endif 832static inline int
833ftrace_create_function_files(struct trace_array *tr,
834 struct dentry *parent)
835{
836 return 0;
837}
838static inline void ftrace_destroy_function_files(struct trace_array *tr) { }
839#endif /* CONFIG_FUNCTION_TRACER */
840
841#if defined(CONFIG_FUNCTION_TRACER) && defined(CONFIG_DYNAMIC_FTRACE)
842void ftrace_create_filter_files(struct ftrace_ops *ops,
843 struct dentry *parent);
844void ftrace_destroy_filter_files(struct ftrace_ops *ops);
845#else
846/*
847 * The ops parameter passed in is usually undefined.
848 * This must be a macro.
849 */
850#define ftrace_create_filter_files(ops, parent) do { } while (0)
851#define ftrace_destroy_filter_files(ops) do { } while (0)
852#endif /* CONFIG_FUNCTION_TRACER && CONFIG_DYNAMIC_FTRACE */
822 853
823int ftrace_event_is_function(struct ftrace_event_call *call); 854int ftrace_event_is_function(struct ftrace_event_call *call);
824 855
@@ -1249,7 +1280,7 @@ int set_tracer_flag(struct trace_array *tr, unsigned int mask, int enabled);
1249#undef FTRACE_ENTRY 1280#undef FTRACE_ENTRY
1250#define FTRACE_ENTRY(call, struct_name, id, tstruct, print, filter) \ 1281#define FTRACE_ENTRY(call, struct_name, id, tstruct, print, filter) \
1251 extern struct ftrace_event_call \ 1282 extern struct ftrace_event_call \
1252 __attribute__((__aligned__(4))) event_##call; 1283 __aligned(4) event_##call;
1253#undef FTRACE_ENTRY_DUP 1284#undef FTRACE_ENTRY_DUP
1254#define FTRACE_ENTRY_DUP(call, struct_name, id, tstruct, print, filter) \ 1285#define FTRACE_ENTRY_DUP(call, struct_name, id, tstruct, print, filter) \
1255 FTRACE_ENTRY(call, struct_name, id, PARAMS(tstruct), PARAMS(print), \ 1286 FTRACE_ENTRY(call, struct_name, id, PARAMS(tstruct), PARAMS(print), \
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index 7b16d40bd64d..3ddfd8f62c05 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -188,29 +188,60 @@ int trace_event_raw_init(struct ftrace_event_call *call)
188} 188}
189EXPORT_SYMBOL_GPL(trace_event_raw_init); 189EXPORT_SYMBOL_GPL(trace_event_raw_init);
190 190
191void *ftrace_event_buffer_reserve(struct ftrace_event_buffer *fbuffer,
192 struct ftrace_event_file *ftrace_file,
193 unsigned long len)
194{
195 struct ftrace_event_call *event_call = ftrace_file->event_call;
196
197 local_save_flags(fbuffer->flags);
198 fbuffer->pc = preempt_count();
199 fbuffer->ftrace_file = ftrace_file;
200
201 fbuffer->event =
202 trace_event_buffer_lock_reserve(&fbuffer->buffer, ftrace_file,
203 event_call->event.type, len,
204 fbuffer->flags, fbuffer->pc);
205 if (!fbuffer->event)
206 return NULL;
207
208 fbuffer->entry = ring_buffer_event_data(fbuffer->event);
209 return fbuffer->entry;
210}
211EXPORT_SYMBOL_GPL(ftrace_event_buffer_reserve);
212
213void ftrace_event_buffer_commit(struct ftrace_event_buffer *fbuffer)
214{
215 event_trigger_unlock_commit(fbuffer->ftrace_file, fbuffer->buffer,
216 fbuffer->event, fbuffer->entry,
217 fbuffer->flags, fbuffer->pc);
218}
219EXPORT_SYMBOL_GPL(ftrace_event_buffer_commit);
220
191int ftrace_event_reg(struct ftrace_event_call *call, 221int ftrace_event_reg(struct ftrace_event_call *call,
192 enum trace_reg type, void *data) 222 enum trace_reg type, void *data)
193{ 223{
194 struct ftrace_event_file *file = data; 224 struct ftrace_event_file *file = data;
195 225
226 WARN_ON(!(call->flags & TRACE_EVENT_FL_TRACEPOINT));
196 switch (type) { 227 switch (type) {
197 case TRACE_REG_REGISTER: 228 case TRACE_REG_REGISTER:
198 return tracepoint_probe_register(call->name, 229 return tracepoint_probe_register(call->tp,
199 call->class->probe, 230 call->class->probe,
200 file); 231 file);
201 case TRACE_REG_UNREGISTER: 232 case TRACE_REG_UNREGISTER:
202 tracepoint_probe_unregister(call->name, 233 tracepoint_probe_unregister(call->tp,
203 call->class->probe, 234 call->class->probe,
204 file); 235 file);
205 return 0; 236 return 0;
206 237
207#ifdef CONFIG_PERF_EVENTS 238#ifdef CONFIG_PERF_EVENTS
208 case TRACE_REG_PERF_REGISTER: 239 case TRACE_REG_PERF_REGISTER:
209 return tracepoint_probe_register(call->name, 240 return tracepoint_probe_register(call->tp,
210 call->class->perf_probe, 241 call->class->perf_probe,
211 call); 242 call);
212 case TRACE_REG_PERF_UNREGISTER: 243 case TRACE_REG_PERF_UNREGISTER:
213 tracepoint_probe_unregister(call->name, 244 tracepoint_probe_unregister(call->tp,
214 call->class->perf_probe, 245 call->class->perf_probe,
215 call); 246 call);
216 return 0; 247 return 0;
@@ -322,7 +353,7 @@ static int __ftrace_event_enable_disable(struct ftrace_event_file *file,
322 if (ret) { 353 if (ret) {
323 tracing_stop_cmdline_record(); 354 tracing_stop_cmdline_record();
324 pr_info("event trace: Could not enable event " 355 pr_info("event trace: Could not enable event "
325 "%s\n", call->name); 356 "%s\n", ftrace_event_name(call));
326 break; 357 break;
327 } 358 }
328 set_bit(FTRACE_EVENT_FL_ENABLED_BIT, &file->flags); 359 set_bit(FTRACE_EVENT_FL_ENABLED_BIT, &file->flags);
@@ -451,27 +482,29 @@ __ftrace_set_clr_event_nolock(struct trace_array *tr, const char *match,
451{ 482{
452 struct ftrace_event_file *file; 483 struct ftrace_event_file *file;
453 struct ftrace_event_call *call; 484 struct ftrace_event_call *call;
485 const char *name;
454 int ret = -EINVAL; 486 int ret = -EINVAL;
455 487
456 list_for_each_entry(file, &tr->events, list) { 488 list_for_each_entry(file, &tr->events, list) {
457 489
458 call = file->event_call; 490 call = file->event_call;
491 name = ftrace_event_name(call);
459 492
460 if (!call->name || !call->class || !call->class->reg) 493 if (!name || !call->class || !call->class->reg)
461 continue; 494 continue;
462 495
463 if (call->flags & TRACE_EVENT_FL_IGNORE_ENABLE) 496 if (call->flags & TRACE_EVENT_FL_IGNORE_ENABLE)
464 continue; 497 continue;
465 498
466 if (match && 499 if (match &&
467 strcmp(match, call->name) != 0 && 500 strcmp(match, name) != 0 &&
468 strcmp(match, call->class->system) != 0) 501 strcmp(match, call->class->system) != 0)
469 continue; 502 continue;
470 503
471 if (sub && strcmp(sub, call->class->system) != 0) 504 if (sub && strcmp(sub, call->class->system) != 0)
472 continue; 505 continue;
473 506
474 if (event && strcmp(event, call->name) != 0) 507 if (event && strcmp(event, name) != 0)
475 continue; 508 continue;
476 509
477 ftrace_event_enable_disable(file, set); 510 ftrace_event_enable_disable(file, set);
@@ -669,7 +702,7 @@ static int t_show(struct seq_file *m, void *v)
669 702
670 if (strcmp(call->class->system, TRACE_SYSTEM) != 0) 703 if (strcmp(call->class->system, TRACE_SYSTEM) != 0)
671 seq_printf(m, "%s:", call->class->system); 704 seq_printf(m, "%s:", call->class->system);
672 seq_printf(m, "%s\n", call->name); 705 seq_printf(m, "%s\n", ftrace_event_name(call));
673 706
674 return 0; 707 return 0;
675} 708}
@@ -762,7 +795,7 @@ system_enable_read(struct file *filp, char __user *ubuf, size_t cnt,
762 mutex_lock(&event_mutex); 795 mutex_lock(&event_mutex);
763 list_for_each_entry(file, &tr->events, list) { 796 list_for_each_entry(file, &tr->events, list) {
764 call = file->event_call; 797 call = file->event_call;
765 if (!call->name || !call->class || !call->class->reg) 798 if (!ftrace_event_name(call) || !call->class || !call->class->reg)
766 continue; 799 continue;
767 800
768 if (system && strcmp(call->class->system, system->name) != 0) 801 if (system && strcmp(call->class->system, system->name) != 0)
@@ -877,7 +910,7 @@ static int f_show(struct seq_file *m, void *v)
877 910
878 switch ((unsigned long)v) { 911 switch ((unsigned long)v) {
879 case FORMAT_HEADER: 912 case FORMAT_HEADER:
880 seq_printf(m, "name: %s\n", call->name); 913 seq_printf(m, "name: %s\n", ftrace_event_name(call));
881 seq_printf(m, "ID: %d\n", call->event.type); 914 seq_printf(m, "ID: %d\n", call->event.type);
882 seq_printf(m, "format:\n"); 915 seq_printf(m, "format:\n");
883 return 0; 916 return 0;
@@ -1497,6 +1530,7 @@ event_create_dir(struct dentry *parent, struct ftrace_event_file *file)
1497 struct trace_array *tr = file->tr; 1530 struct trace_array *tr = file->tr;
1498 struct list_head *head; 1531 struct list_head *head;
1499 struct dentry *d_events; 1532 struct dentry *d_events;
1533 const char *name;
1500 int ret; 1534 int ret;
1501 1535
1502 /* 1536 /*
@@ -1510,10 +1544,11 @@ event_create_dir(struct dentry *parent, struct ftrace_event_file *file)
1510 } else 1544 } else
1511 d_events = parent; 1545 d_events = parent;
1512 1546
1513 file->dir = debugfs_create_dir(call->name, d_events); 1547 name = ftrace_event_name(call);
1548 file->dir = debugfs_create_dir(name, d_events);
1514 if (!file->dir) { 1549 if (!file->dir) {
1515 pr_warning("Could not create debugfs '%s' directory\n", 1550 pr_warning("Could not create debugfs '%s' directory\n",
1516 call->name); 1551 name);
1517 return -1; 1552 return -1;
1518 } 1553 }
1519 1554
@@ -1537,7 +1572,7 @@ event_create_dir(struct dentry *parent, struct ftrace_event_file *file)
1537 ret = call->class->define_fields(call); 1572 ret = call->class->define_fields(call);
1538 if (ret < 0) { 1573 if (ret < 0) {
1539 pr_warning("Could not initialize trace point" 1574 pr_warning("Could not initialize trace point"
1540 " events/%s\n", call->name); 1575 " events/%s\n", name);
1541 return -1; 1576 return -1;
1542 } 1577 }
1543 } 1578 }
@@ -1601,15 +1636,17 @@ static void event_remove(struct ftrace_event_call *call)
1601static int event_init(struct ftrace_event_call *call) 1636static int event_init(struct ftrace_event_call *call)
1602{ 1637{
1603 int ret = 0; 1638 int ret = 0;
1639 const char *name;
1604 1640
1605 if (WARN_ON(!call->name)) 1641 name = ftrace_event_name(call);
1642 if (WARN_ON(!name))
1606 return -EINVAL; 1643 return -EINVAL;
1607 1644
1608 if (call->class->raw_init) { 1645 if (call->class->raw_init) {
1609 ret = call->class->raw_init(call); 1646 ret = call->class->raw_init(call);
1610 if (ret < 0 && ret != -ENOSYS) 1647 if (ret < 0 && ret != -ENOSYS)
1611 pr_warn("Could not initialize trace events/%s\n", 1648 pr_warn("Could not initialize trace events/%s\n",
1612 call->name); 1649 name);
1613 } 1650 }
1614 1651
1615 return ret; 1652 return ret;
@@ -1855,7 +1892,7 @@ __trace_add_event_dirs(struct trace_array *tr)
1855 ret = __trace_add_new_event(call, tr); 1892 ret = __trace_add_new_event(call, tr);
1856 if (ret < 0) 1893 if (ret < 0)
1857 pr_warning("Could not create directory for event %s\n", 1894 pr_warning("Could not create directory for event %s\n",
1858 call->name); 1895 ftrace_event_name(call));
1859 } 1896 }
1860} 1897}
1861 1898
@@ -1864,18 +1901,20 @@ find_event_file(struct trace_array *tr, const char *system, const char *event)
1864{ 1901{
1865 struct ftrace_event_file *file; 1902 struct ftrace_event_file *file;
1866 struct ftrace_event_call *call; 1903 struct ftrace_event_call *call;
1904 const char *name;
1867 1905
1868 list_for_each_entry(file, &tr->events, list) { 1906 list_for_each_entry(file, &tr->events, list) {
1869 1907
1870 call = file->event_call; 1908 call = file->event_call;
1909 name = ftrace_event_name(call);
1871 1910
1872 if (!call->name || !call->class || !call->class->reg) 1911 if (!name || !call->class || !call->class->reg)
1873 continue; 1912 continue;
1874 1913
1875 if (call->flags & TRACE_EVENT_FL_IGNORE_ENABLE) 1914 if (call->flags & TRACE_EVENT_FL_IGNORE_ENABLE)
1876 continue; 1915 continue;
1877 1916
1878 if (strcmp(event, call->name) == 0 && 1917 if (strcmp(event, name) == 0 &&
1879 strcmp(system, call->class->system) == 0) 1918 strcmp(system, call->class->system) == 0)
1880 return file; 1919 return file;
1881 } 1920 }
@@ -1943,7 +1982,7 @@ event_enable_print(struct seq_file *m, unsigned long ip,
1943 seq_printf(m, "%s:%s:%s", 1982 seq_printf(m, "%s:%s:%s",
1944 data->enable ? ENABLE_EVENT_STR : DISABLE_EVENT_STR, 1983 data->enable ? ENABLE_EVENT_STR : DISABLE_EVENT_STR,
1945 data->file->event_call->class->system, 1984 data->file->event_call->class->system,
1946 data->file->event_call->name); 1985 ftrace_event_name(data->file->event_call));
1947 1986
1948 if (data->count == -1) 1987 if (data->count == -1)
1949 seq_printf(m, ":unlimited\n"); 1988 seq_printf(m, ":unlimited\n");
@@ -2163,7 +2202,7 @@ __trace_early_add_event_dirs(struct trace_array *tr)
2163 ret = event_create_dir(tr->event_dir, file); 2202 ret = event_create_dir(tr->event_dir, file);
2164 if (ret < 0) 2203 if (ret < 0)
2165 pr_warning("Could not create directory for event %s\n", 2204 pr_warning("Could not create directory for event %s\n",
2166 file->event_call->name); 2205 ftrace_event_name(file->event_call));
2167 } 2206 }
2168} 2207}
2169 2208
@@ -2187,7 +2226,7 @@ __trace_early_add_events(struct trace_array *tr)
2187 ret = __trace_early_add_new_event(call, tr); 2226 ret = __trace_early_add_new_event(call, tr);
2188 if (ret < 0) 2227 if (ret < 0)
2189 pr_warning("Could not create early event %s\n", 2228 pr_warning("Could not create early event %s\n",
2190 call->name); 2229 ftrace_event_name(call));
2191 } 2230 }
2192} 2231}
2193 2232
@@ -2519,7 +2558,7 @@ static __init void event_trace_self_tests(void)
2519 continue; 2558 continue;
2520#endif 2559#endif
2521 2560
2522 pr_info("Testing event %s: ", call->name); 2561 pr_info("Testing event %s: ", ftrace_event_name(call));
2523 2562
2524 /* 2563 /*
2525 * If an event is already enabled, someone is using 2564 * If an event is already enabled, someone is using
diff --git a/kernel/trace/trace_events_trigger.c b/kernel/trace/trace_events_trigger.c
index 8efbb69b04f0..925f537f07d1 100644
--- a/kernel/trace/trace_events_trigger.c
+++ b/kernel/trace/trace_events_trigger.c
@@ -1095,7 +1095,7 @@ event_enable_trigger_print(struct seq_file *m, struct event_trigger_ops *ops,
1095 seq_printf(m, "%s:%s:%s", 1095 seq_printf(m, "%s:%s:%s",
1096 enable_data->enable ? ENABLE_EVENT_STR : DISABLE_EVENT_STR, 1096 enable_data->enable ? ENABLE_EVENT_STR : DISABLE_EVENT_STR,
1097 enable_data->file->event_call->class->system, 1097 enable_data->file->event_call->class->system,
1098 enable_data->file->event_call->name); 1098 ftrace_event_name(enable_data->file->event_call));
1099 1099
1100 if (data->count == -1) 1100 if (data->count == -1)
1101 seq_puts(m, ":unlimited"); 1101 seq_puts(m, ":unlimited");
diff --git a/kernel/trace/trace_export.c b/kernel/trace/trace_export.c
index ee0a5098ac43..d4ddde28a81a 100644
--- a/kernel/trace/trace_export.c
+++ b/kernel/trace/trace_export.c
@@ -173,9 +173,11 @@ struct ftrace_event_class __refdata event_class_ftrace_##call = { \
173}; \ 173}; \
174 \ 174 \
175struct ftrace_event_call __used event_##call = { \ 175struct ftrace_event_call __used event_##call = { \
176 .name = #call, \
177 .event.type = etype, \
178 .class = &event_class_ftrace_##call, \ 176 .class = &event_class_ftrace_##call, \
177 { \
178 .name = #call, \
179 }, \
180 .event.type = etype, \
179 .print_fmt = print, \ 181 .print_fmt = print, \
180 .flags = TRACE_EVENT_FL_IGNORE_ENABLE | TRACE_EVENT_FL_USE_CALL_FILTER, \ 182 .flags = TRACE_EVENT_FL_IGNORE_ENABLE | TRACE_EVENT_FL_USE_CALL_FILTER, \
181}; \ 183}; \
diff --git a/kernel/trace/trace_functions.c b/kernel/trace/trace_functions.c
index 38fe1483c508..ffd56351b521 100644
--- a/kernel/trace/trace_functions.c
+++ b/kernel/trace/trace_functions.c
@@ -13,32 +13,110 @@
13#include <linux/debugfs.h> 13#include <linux/debugfs.h>
14#include <linux/uaccess.h> 14#include <linux/uaccess.h>
15#include <linux/ftrace.h> 15#include <linux/ftrace.h>
16#include <linux/slab.h>
16#include <linux/fs.h> 17#include <linux/fs.h>
17 18
18#include "trace.h" 19#include "trace.h"
19 20
20/* function tracing enabled */ 21static void tracing_start_function_trace(struct trace_array *tr);
21static int ftrace_function_enabled; 22static void tracing_stop_function_trace(struct trace_array *tr);
23static void
24function_trace_call(unsigned long ip, unsigned long parent_ip,
25 struct ftrace_ops *op, struct pt_regs *pt_regs);
26static void
27function_stack_trace_call(unsigned long ip, unsigned long parent_ip,
28 struct ftrace_ops *op, struct pt_regs *pt_regs);
29static struct ftrace_ops trace_ops;
30static struct ftrace_ops trace_stack_ops;
31static struct tracer_flags func_flags;
32
33/* Our option */
34enum {
35 TRACE_FUNC_OPT_STACK = 0x1,
36};
37
38static int allocate_ftrace_ops(struct trace_array *tr)
39{
40 struct ftrace_ops *ops;
41
42 ops = kzalloc(sizeof(*ops), GFP_KERNEL);
43 if (!ops)
44 return -ENOMEM;
45
46 /* Currently only the non stack verision is supported */
47 ops->func = function_trace_call;
48 ops->flags = FTRACE_OPS_FL_RECURSION_SAFE;
22 49
23static struct trace_array *func_trace; 50 tr->ops = ops;
51 ops->private = tr;
52 return 0;
53}
54
55
56int ftrace_create_function_files(struct trace_array *tr,
57 struct dentry *parent)
58{
59 int ret;
60
61 /*
62 * The top level array uses the "global_ops", and the files are
63 * created on boot up.
64 */
65 if (tr->flags & TRACE_ARRAY_FL_GLOBAL)
66 return 0;
24 67
25static void tracing_start_function_trace(void); 68 ret = allocate_ftrace_ops(tr);
26static void tracing_stop_function_trace(void); 69 if (ret)
70 return ret;
71
72 ftrace_create_filter_files(tr->ops, parent);
73
74 return 0;
75}
76
77void ftrace_destroy_function_files(struct trace_array *tr)
78{
79 ftrace_destroy_filter_files(tr->ops);
80 kfree(tr->ops);
81 tr->ops = NULL;
82}
27 83
28static int function_trace_init(struct trace_array *tr) 84static int function_trace_init(struct trace_array *tr)
29{ 85{
30 func_trace = tr; 86 struct ftrace_ops *ops;
87
88 if (tr->flags & TRACE_ARRAY_FL_GLOBAL) {
89 /* There's only one global tr */
90 if (!trace_ops.private) {
91 trace_ops.private = tr;
92 trace_stack_ops.private = tr;
93 }
94
95 if (func_flags.val & TRACE_FUNC_OPT_STACK)
96 ops = &trace_stack_ops;
97 else
98 ops = &trace_ops;
99 tr->ops = ops;
100 } else if (!tr->ops) {
101 /*
102 * Instance trace_arrays get their ops allocated
103 * at instance creation. Unless it failed
104 * the allocation.
105 */
106 return -ENOMEM;
107 }
108
31 tr->trace_buffer.cpu = get_cpu(); 109 tr->trace_buffer.cpu = get_cpu();
32 put_cpu(); 110 put_cpu();
33 111
34 tracing_start_cmdline_record(); 112 tracing_start_cmdline_record();
35 tracing_start_function_trace(); 113 tracing_start_function_trace(tr);
36 return 0; 114 return 0;
37} 115}
38 116
39static void function_trace_reset(struct trace_array *tr) 117static void function_trace_reset(struct trace_array *tr)
40{ 118{
41 tracing_stop_function_trace(); 119 tracing_stop_function_trace(tr);
42 tracing_stop_cmdline_record(); 120 tracing_stop_cmdline_record();
43} 121}
44 122
@@ -47,25 +125,18 @@ static void function_trace_start(struct trace_array *tr)
47 tracing_reset_online_cpus(&tr->trace_buffer); 125 tracing_reset_online_cpus(&tr->trace_buffer);
48} 126}
49 127
50/* Our option */
51enum {
52 TRACE_FUNC_OPT_STACK = 0x1,
53};
54
55static struct tracer_flags func_flags;
56
57static void 128static void
58function_trace_call(unsigned long ip, unsigned long parent_ip, 129function_trace_call(unsigned long ip, unsigned long parent_ip,
59 struct ftrace_ops *op, struct pt_regs *pt_regs) 130 struct ftrace_ops *op, struct pt_regs *pt_regs)
60{ 131{
61 struct trace_array *tr = func_trace; 132 struct trace_array *tr = op->private;
62 struct trace_array_cpu *data; 133 struct trace_array_cpu *data;
63 unsigned long flags; 134 unsigned long flags;
64 int bit; 135 int bit;
65 int cpu; 136 int cpu;
66 int pc; 137 int pc;
67 138
68 if (unlikely(!ftrace_function_enabled)) 139 if (unlikely(!tr->function_enabled))
69 return; 140 return;
70 141
71 pc = preempt_count(); 142 pc = preempt_count();
@@ -91,14 +162,14 @@ static void
91function_stack_trace_call(unsigned long ip, unsigned long parent_ip, 162function_stack_trace_call(unsigned long ip, unsigned long parent_ip,
92 struct ftrace_ops *op, struct pt_regs *pt_regs) 163 struct ftrace_ops *op, struct pt_regs *pt_regs)
93{ 164{
94 struct trace_array *tr = func_trace; 165 struct trace_array *tr = op->private;
95 struct trace_array_cpu *data; 166 struct trace_array_cpu *data;
96 unsigned long flags; 167 unsigned long flags;
97 long disabled; 168 long disabled;
98 int cpu; 169 int cpu;
99 int pc; 170 int pc;
100 171
101 if (unlikely(!ftrace_function_enabled)) 172 if (unlikely(!tr->function_enabled))
102 return; 173 return;
103 174
104 /* 175 /*
@@ -128,7 +199,6 @@ function_stack_trace_call(unsigned long ip, unsigned long parent_ip,
128 local_irq_restore(flags); 199 local_irq_restore(flags);
129} 200}
130 201
131
132static struct ftrace_ops trace_ops __read_mostly = 202static struct ftrace_ops trace_ops __read_mostly =
133{ 203{
134 .func = function_trace_call, 204 .func = function_trace_call,
@@ -153,29 +223,21 @@ static struct tracer_flags func_flags = {
153 .opts = func_opts 223 .opts = func_opts
154}; 224};
155 225
156static void tracing_start_function_trace(void) 226static void tracing_start_function_trace(struct trace_array *tr)
157{ 227{
158 ftrace_function_enabled = 0; 228 tr->function_enabled = 0;
159 229 register_ftrace_function(tr->ops);
160 if (func_flags.val & TRACE_FUNC_OPT_STACK) 230 tr->function_enabled = 1;
161 register_ftrace_function(&trace_stack_ops);
162 else
163 register_ftrace_function(&trace_ops);
164
165 ftrace_function_enabled = 1;
166} 231}
167 232
168static void tracing_stop_function_trace(void) 233static void tracing_stop_function_trace(struct trace_array *tr)
169{ 234{
170 ftrace_function_enabled = 0; 235 tr->function_enabled = 0;
171 236 unregister_ftrace_function(tr->ops);
172 if (func_flags.val & TRACE_FUNC_OPT_STACK)
173 unregister_ftrace_function(&trace_stack_ops);
174 else
175 unregister_ftrace_function(&trace_ops);
176} 237}
177 238
178static int func_set_flag(u32 old_flags, u32 bit, int set) 239static int
240func_set_flag(struct trace_array *tr, u32 old_flags, u32 bit, int set)
179{ 241{
180 switch (bit) { 242 switch (bit) {
181 case TRACE_FUNC_OPT_STACK: 243 case TRACE_FUNC_OPT_STACK:
@@ -183,12 +245,14 @@ static int func_set_flag(u32 old_flags, u32 bit, int set)
183 if (!!set == !!(func_flags.val & TRACE_FUNC_OPT_STACK)) 245 if (!!set == !!(func_flags.val & TRACE_FUNC_OPT_STACK))
184 break; 246 break;
185 247
248 unregister_ftrace_function(tr->ops);
249
186 if (set) { 250 if (set) {
187 unregister_ftrace_function(&trace_ops); 251 tr->ops = &trace_stack_ops;
188 register_ftrace_function(&trace_stack_ops); 252 register_ftrace_function(tr->ops);
189 } else { 253 } else {
190 unregister_ftrace_function(&trace_stack_ops); 254 tr->ops = &trace_ops;
191 register_ftrace_function(&trace_ops); 255 register_ftrace_function(tr->ops);
192 } 256 }
193 257
194 break; 258 break;
@@ -208,6 +272,7 @@ static struct tracer function_trace __tracer_data =
208 .wait_pipe = poll_wait_pipe, 272 .wait_pipe = poll_wait_pipe,
209 .flags = &func_flags, 273 .flags = &func_flags,
210 .set_flag = func_set_flag, 274 .set_flag = func_set_flag,
275 .allow_instances = true,
211#ifdef CONFIG_FTRACE_SELFTEST 276#ifdef CONFIG_FTRACE_SELFTEST
212 .selftest = trace_selftest_startup_function, 277 .selftest = trace_selftest_startup_function,
213#endif 278#endif
diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c
index 0b99120d395c..deff11200261 100644
--- a/kernel/trace/trace_functions_graph.c
+++ b/kernel/trace/trace_functions_graph.c
@@ -1476,7 +1476,8 @@ void graph_trace_close(struct trace_iterator *iter)
1476 } 1476 }
1477} 1477}
1478 1478
1479static int func_graph_set_flag(u32 old_flags, u32 bit, int set) 1479static int
1480func_graph_set_flag(struct trace_array *tr, u32 old_flags, u32 bit, int set)
1480{ 1481{
1481 if (bit == TRACE_GRAPH_PRINT_IRQS) 1482 if (bit == TRACE_GRAPH_PRINT_IRQS)
1482 ftrace_graph_skip_irqs = !set; 1483 ftrace_graph_skip_irqs = !set;
diff --git a/kernel/trace/trace_irqsoff.c b/kernel/trace/trace_irqsoff.c
index 887ef88b0bc7..8ff02cbb892f 100644
--- a/kernel/trace/trace_irqsoff.c
+++ b/kernel/trace/trace_irqsoff.c
@@ -160,7 +160,8 @@ static struct ftrace_ops trace_ops __read_mostly =
160#endif /* CONFIG_FUNCTION_TRACER */ 160#endif /* CONFIG_FUNCTION_TRACER */
161 161
162#ifdef CONFIG_FUNCTION_GRAPH_TRACER 162#ifdef CONFIG_FUNCTION_GRAPH_TRACER
163static int irqsoff_set_flag(u32 old_flags, u32 bit, int set) 163static int
164irqsoff_set_flag(struct trace_array *tr, u32 old_flags, u32 bit, int set)
164{ 165{
165 int cpu; 166 int cpu;
166 167
@@ -266,7 +267,8 @@ __trace_function(struct trace_array *tr,
266#else 267#else
267#define __trace_function trace_function 268#define __trace_function trace_function
268 269
269static int irqsoff_set_flag(u32 old_flags, u32 bit, int set) 270static int
271irqsoff_set_flag(struct trace_array *tr, u32 old_flags, u32 bit, int set)
270{ 272{
271 return -EINVAL; 273 return -EINVAL;
272} 274}
@@ -570,8 +572,10 @@ static void irqsoff_function_set(int set)
570 unregister_irqsoff_function(is_graph()); 572 unregister_irqsoff_function(is_graph());
571} 573}
572 574
573static int irqsoff_flag_changed(struct tracer *tracer, u32 mask, int set) 575static int irqsoff_flag_changed(struct trace_array *tr, u32 mask, int set)
574{ 576{
577 struct tracer *tracer = tr->current_trace;
578
575 if (mask & TRACE_ITER_FUNCTION) 579 if (mask & TRACE_ITER_FUNCTION)
576 irqsoff_function_set(set); 580 irqsoff_function_set(set);
577 581
diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c
index bdbae450c13e..903ae28962be 100644
--- a/kernel/trace/trace_kprobe.c
+++ b/kernel/trace/trace_kprobe.c
@@ -35,11 +35,6 @@ struct trace_kprobe {
35 struct trace_probe tp; 35 struct trace_probe tp;
36}; 36};
37 37
38struct event_file_link {
39 struct ftrace_event_file *file;
40 struct list_head list;
41};
42
43#define SIZEOF_TRACE_KPROBE(n) \ 38#define SIZEOF_TRACE_KPROBE(n) \
44 (offsetof(struct trace_kprobe, tp.args) + \ 39 (offsetof(struct trace_kprobe, tp.args) + \
45 (sizeof(struct probe_arg) * (n))) 40 (sizeof(struct probe_arg) * (n)))
@@ -346,7 +341,7 @@ static struct trace_kprobe *find_trace_kprobe(const char *event,
346 struct trace_kprobe *tk; 341 struct trace_kprobe *tk;
347 342
348 list_for_each_entry(tk, &probe_list, list) 343 list_for_each_entry(tk, &probe_list, list)
349 if (strcmp(tk->tp.call.name, event) == 0 && 344 if (strcmp(ftrace_event_name(&tk->tp.call), event) == 0 &&
350 strcmp(tk->tp.call.class->system, group) == 0) 345 strcmp(tk->tp.call.class->system, group) == 0)
351 return tk; 346 return tk;
352 return NULL; 347 return NULL;
@@ -387,18 +382,6 @@ enable_trace_kprobe(struct trace_kprobe *tk, struct ftrace_event_file *file)
387 return ret; 382 return ret;
388} 383}
389 384
390static struct event_file_link *
391find_event_file_link(struct trace_probe *tp, struct ftrace_event_file *file)
392{
393 struct event_file_link *link;
394
395 list_for_each_entry(link, &tp->files, list)
396 if (link->file == file)
397 return link;
398
399 return NULL;
400}
401
402/* 385/*
403 * Disable trace_probe 386 * Disable trace_probe
404 * if the file is NULL, disable "perf" handler, or disable "trace" handler. 387 * if the file is NULL, disable "perf" handler, or disable "trace" handler.
@@ -533,7 +516,8 @@ static int register_trace_kprobe(struct trace_kprobe *tk)
533 mutex_lock(&probe_lock); 516 mutex_lock(&probe_lock);
534 517
535 /* Delete old (same name) event if exist */ 518 /* Delete old (same name) event if exist */
536 old_tk = find_trace_kprobe(tk->tp.call.name, tk->tp.call.class->system); 519 old_tk = find_trace_kprobe(ftrace_event_name(&tk->tp.call),
520 tk->tp.call.class->system);
537 if (old_tk) { 521 if (old_tk) {
538 ret = unregister_trace_kprobe(old_tk); 522 ret = unregister_trace_kprobe(old_tk);
539 if (ret < 0) 523 if (ret < 0)
@@ -581,7 +565,8 @@ static int trace_kprobe_module_callback(struct notifier_block *nb,
581 if (ret) 565 if (ret)
582 pr_warning("Failed to re-register probe %s on" 566 pr_warning("Failed to re-register probe %s on"
583 "%s: %d\n", 567 "%s: %d\n",
584 tk->tp.call.name, mod->name, ret); 568 ftrace_event_name(&tk->tp.call),
569 mod->name, ret);
585 } 570 }
586 } 571 }
587 mutex_unlock(&probe_lock); 572 mutex_unlock(&probe_lock);
@@ -835,7 +820,8 @@ static int probes_seq_show(struct seq_file *m, void *v)
835 int i; 820 int i;
836 821
837 seq_printf(m, "%c", trace_kprobe_is_return(tk) ? 'r' : 'p'); 822 seq_printf(m, "%c", trace_kprobe_is_return(tk) ? 'r' : 'p');
838 seq_printf(m, ":%s/%s", tk->tp.call.class->system, tk->tp.call.name); 823 seq_printf(m, ":%s/%s", tk->tp.call.class->system,
824 ftrace_event_name(&tk->tp.call));
839 825
840 if (!tk->symbol) 826 if (!tk->symbol)
841 seq_printf(m, " 0x%p", tk->rp.kp.addr); 827 seq_printf(m, " 0x%p", tk->rp.kp.addr);
@@ -893,7 +879,8 @@ static int probes_profile_seq_show(struct seq_file *m, void *v)
893{ 879{
894 struct trace_kprobe *tk = v; 880 struct trace_kprobe *tk = v;
895 881
896 seq_printf(m, " %-44s %15lu %15lu\n", tk->tp.call.name, tk->nhit, 882 seq_printf(m, " %-44s %15lu %15lu\n",
883 ftrace_event_name(&tk->tp.call), tk->nhit,
897 tk->rp.kp.nmissed); 884 tk->rp.kp.nmissed);
898 885
899 return 0; 886 return 0;
@@ -1028,7 +1015,7 @@ print_kprobe_event(struct trace_iterator *iter, int flags,
1028 field = (struct kprobe_trace_entry_head *)iter->ent; 1015 field = (struct kprobe_trace_entry_head *)iter->ent;
1029 tp = container_of(event, struct trace_probe, call.event); 1016 tp = container_of(event, struct trace_probe, call.event);
1030 1017
1031 if (!trace_seq_printf(s, "%s: (", tp->call.name)) 1018 if (!trace_seq_printf(s, "%s: (", ftrace_event_name(&tp->call)))
1032 goto partial; 1019 goto partial;
1033 1020
1034 if (!seq_print_ip_sym(s, field->ip, flags | TRACE_ITER_SYM_OFFSET)) 1021 if (!seq_print_ip_sym(s, field->ip, flags | TRACE_ITER_SYM_OFFSET))
@@ -1064,7 +1051,7 @@ print_kretprobe_event(struct trace_iterator *iter, int flags,
1064 field = (struct kretprobe_trace_entry_head *)iter->ent; 1051 field = (struct kretprobe_trace_entry_head *)iter->ent;
1065 tp = container_of(event, struct trace_probe, call.event); 1052 tp = container_of(event, struct trace_probe, call.event);
1066 1053
1067 if (!trace_seq_printf(s, "%s: (", tp->call.name)) 1054 if (!trace_seq_printf(s, "%s: (", ftrace_event_name(&tp->call)))
1068 goto partial; 1055 goto partial;
1069 1056
1070 if (!seq_print_ip_sym(s, field->ret_ip, flags | TRACE_ITER_SYM_OFFSET)) 1057 if (!seq_print_ip_sym(s, field->ret_ip, flags | TRACE_ITER_SYM_OFFSET))
@@ -1303,7 +1290,8 @@ static int register_kprobe_event(struct trace_kprobe *tk)
1303 call->data = tk; 1290 call->data = tk;
1304 ret = trace_add_event_call(call); 1291 ret = trace_add_event_call(call);
1305 if (ret) { 1292 if (ret) {
1306 pr_info("Failed to register kprobe event: %s\n", call->name); 1293 pr_info("Failed to register kprobe event: %s\n",
1294 ftrace_event_name(call));
1307 kfree(call->print_fmt); 1295 kfree(call->print_fmt);
1308 unregister_ftrace_event(&call->event); 1296 unregister_ftrace_event(&call->event);
1309 } 1297 }
diff --git a/kernel/trace/trace_nop.c b/kernel/trace/trace_nop.c
index 394f94417e2f..69a5cc94c01a 100644
--- a/kernel/trace/trace_nop.c
+++ b/kernel/trace/trace_nop.c
@@ -62,7 +62,7 @@ static void nop_trace_reset(struct trace_array *tr)
62 * If you don't implement it, then the flag setting will be 62 * If you don't implement it, then the flag setting will be
63 * automatically accepted. 63 * automatically accepted.
64 */ 64 */
65static int nop_set_flag(u32 old_flags, u32 bit, int set) 65static int nop_set_flag(struct trace_array *tr, u32 old_flags, u32 bit, int set)
66{ 66{
67 /* 67 /*
68 * Note that you don't need to update nop_flags.val yourself. 68 * Note that you don't need to update nop_flags.val yourself.
@@ -96,6 +96,7 @@ struct tracer nop_trace __read_mostly =
96 .selftest = trace_selftest_startup_nop, 96 .selftest = trace_selftest_startup_nop,
97#endif 97#endif
98 .flags = &nop_flags, 98 .flags = &nop_flags,
99 .set_flag = nop_set_flag 99 .set_flag = nop_set_flag,
100 .allow_instances = true,
100}; 101};
101 102
diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c
index ed32284fbe32..a436de18aa99 100644
--- a/kernel/trace/trace_output.c
+++ b/kernel/trace/trace_output.c
@@ -431,7 +431,7 @@ int ftrace_raw_output_prep(struct trace_iterator *iter,
431 } 431 }
432 432
433 trace_seq_init(p); 433 trace_seq_init(p);
434 ret = trace_seq_printf(s, "%s: ", event->name); 434 ret = trace_seq_printf(s, "%s: ", ftrace_event_name(event));
435 if (!ret) 435 if (!ret)
436 return TRACE_TYPE_PARTIAL_LINE; 436 return TRACE_TYPE_PARTIAL_LINE;
437 437
@@ -439,6 +439,37 @@ int ftrace_raw_output_prep(struct trace_iterator *iter,
439} 439}
440EXPORT_SYMBOL(ftrace_raw_output_prep); 440EXPORT_SYMBOL(ftrace_raw_output_prep);
441 441
442static int ftrace_output_raw(struct trace_iterator *iter, char *name,
443 char *fmt, va_list ap)
444{
445 struct trace_seq *s = &iter->seq;
446 int ret;
447
448 ret = trace_seq_printf(s, "%s: ", name);
449 if (!ret)
450 return TRACE_TYPE_PARTIAL_LINE;
451
452 ret = trace_seq_vprintf(s, fmt, ap);
453
454 if (!ret)
455 return TRACE_TYPE_PARTIAL_LINE;
456
457 return TRACE_TYPE_HANDLED;
458}
459
460int ftrace_output_call(struct trace_iterator *iter, char *name, char *fmt, ...)
461{
462 va_list ap;
463 int ret;
464
465 va_start(ap, fmt);
466 ret = ftrace_output_raw(iter, name, fmt, ap);
467 va_end(ap);
468
469 return ret;
470}
471EXPORT_SYMBOL_GPL(ftrace_output_call);
472
442#ifdef CONFIG_KRETPROBES 473#ifdef CONFIG_KRETPROBES
443static inline const char *kretprobed(const char *name) 474static inline const char *kretprobed(const char *name)
444{ 475{
diff --git a/kernel/trace/trace_probe.h b/kernel/trace/trace_probe.h
index b73574a5f429..fb1ab5dfbd42 100644
--- a/kernel/trace/trace_probe.h
+++ b/kernel/trace/trace_probe.h
@@ -288,6 +288,11 @@ struct trace_probe {
288 struct probe_arg args[]; 288 struct probe_arg args[];
289}; 289};
290 290
291struct event_file_link {
292 struct ftrace_event_file *file;
293 struct list_head list;
294};
295
291static inline bool trace_probe_is_enabled(struct trace_probe *tp) 296static inline bool trace_probe_is_enabled(struct trace_probe *tp)
292{ 297{
293 return !!(tp->flags & (TP_FLAG_TRACE | TP_FLAG_PROFILE)); 298 return !!(tp->flags & (TP_FLAG_TRACE | TP_FLAG_PROFILE));
@@ -316,6 +321,18 @@ static inline int is_good_name(const char *name)
316 return 1; 321 return 1;
317} 322}
318 323
324static inline struct event_file_link *
325find_event_file_link(struct trace_probe *tp, struct ftrace_event_file *file)
326{
327 struct event_file_link *link;
328
329 list_for_each_entry(link, &tp->files, list)
330 if (link->file == file)
331 return link;
332
333 return NULL;
334}
335
319extern int traceprobe_parse_probe_arg(char *arg, ssize_t *size, 336extern int traceprobe_parse_probe_arg(char *arg, ssize_t *size,
320 struct probe_arg *parg, bool is_return, bool is_kprobe); 337 struct probe_arg *parg, bool is_return, bool is_kprobe);
321 338
diff --git a/kernel/trace/trace_sched_wakeup.c b/kernel/trace/trace_sched_wakeup.c
index 6e32635e5e57..e14da5e97a69 100644
--- a/kernel/trace/trace_sched_wakeup.c
+++ b/kernel/trace/trace_sched_wakeup.c
@@ -179,8 +179,10 @@ static void wakeup_function_set(int set)
179 unregister_wakeup_function(is_graph()); 179 unregister_wakeup_function(is_graph());
180} 180}
181 181
182static int wakeup_flag_changed(struct tracer *tracer, u32 mask, int set) 182static int wakeup_flag_changed(struct trace_array *tr, u32 mask, int set)
183{ 183{
184 struct tracer *tracer = tr->current_trace;
185
184 if (mask & TRACE_ITER_FUNCTION) 186 if (mask & TRACE_ITER_FUNCTION)
185 wakeup_function_set(set); 187 wakeup_function_set(set);
186 188
@@ -209,7 +211,8 @@ static void stop_func_tracer(int graph)
209} 211}
210 212
211#ifdef CONFIG_FUNCTION_GRAPH_TRACER 213#ifdef CONFIG_FUNCTION_GRAPH_TRACER
212static int wakeup_set_flag(u32 old_flags, u32 bit, int set) 214static int
215wakeup_set_flag(struct trace_array *tr, u32 old_flags, u32 bit, int set)
213{ 216{
214 217
215 if (!(bit & TRACE_DISPLAY_GRAPH)) 218 if (!(bit & TRACE_DISPLAY_GRAPH))
@@ -311,7 +314,8 @@ __trace_function(struct trace_array *tr,
311#else 314#else
312#define __trace_function trace_function 315#define __trace_function trace_function
313 316
314static int wakeup_set_flag(u32 old_flags, u32 bit, int set) 317static int
318wakeup_set_flag(struct trace_array *tr, u32 old_flags, u32 bit, int set)
315{ 319{
316 return -EINVAL; 320 return -EINVAL;
317} 321}
diff --git a/kernel/trace/trace_stack.c b/kernel/trace/trace_stack.c
index e6be585cf06a..21b320e5d163 100644
--- a/kernel/trace/trace_stack.c
+++ b/kernel/trace/trace_stack.c
@@ -13,6 +13,7 @@
13#include <linux/sysctl.h> 13#include <linux/sysctl.h>
14#include <linux/init.h> 14#include <linux/init.h>
15#include <linux/fs.h> 15#include <linux/fs.h>
16#include <linux/magic.h>
16 17
17#include <asm/setup.h> 18#include <asm/setup.h>
18 19
@@ -144,6 +145,8 @@ check_stack(unsigned long ip, unsigned long *stack)
144 i++; 145 i++;
145 } 146 }
146 147
148 BUG_ON(current != &init_task &&
149 *(end_of_stack(current)) != STACK_END_MAGIC);
147 out: 150 out:
148 arch_spin_unlock(&max_stack_lock); 151 arch_spin_unlock(&max_stack_lock);
149 local_irq_restore(flags); 152 local_irq_restore(flags);
diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c
index 79e52d93860b..c082a7441345 100644
--- a/kernel/trace/trace_uprobe.c
+++ b/kernel/trace/trace_uprobe.c
@@ -260,6 +260,7 @@ alloc_trace_uprobe(const char *group, const char *event, int nargs, bool is_ret)
260 goto error; 260 goto error;
261 261
262 INIT_LIST_HEAD(&tu->list); 262 INIT_LIST_HEAD(&tu->list);
263 INIT_LIST_HEAD(&tu->tp.files);
263 tu->consumer.handler = uprobe_dispatcher; 264 tu->consumer.handler = uprobe_dispatcher;
264 if (is_ret) 265 if (is_ret)
265 tu->consumer.ret_handler = uretprobe_dispatcher; 266 tu->consumer.ret_handler = uretprobe_dispatcher;
@@ -293,7 +294,7 @@ static struct trace_uprobe *find_probe_event(const char *event, const char *grou
293 struct trace_uprobe *tu; 294 struct trace_uprobe *tu;
294 295
295 list_for_each_entry(tu, &uprobe_list, list) 296 list_for_each_entry(tu, &uprobe_list, list)
296 if (strcmp(tu->tp.call.name, event) == 0 && 297 if (strcmp(ftrace_event_name(&tu->tp.call), event) == 0 &&
297 strcmp(tu->tp.call.class->system, group) == 0) 298 strcmp(tu->tp.call.class->system, group) == 0)
298 return tu; 299 return tu;
299 300
@@ -323,7 +324,8 @@ static int register_trace_uprobe(struct trace_uprobe *tu)
323 mutex_lock(&uprobe_lock); 324 mutex_lock(&uprobe_lock);
324 325
325 /* register as an event */ 326 /* register as an event */
326 old_tu = find_probe_event(tu->tp.call.name, tu->tp.call.class->system); 327 old_tu = find_probe_event(ftrace_event_name(&tu->tp.call),
328 tu->tp.call.class->system);
327 if (old_tu) { 329 if (old_tu) {
328 /* delete old event */ 330 /* delete old event */
329 ret = unregister_trace_uprobe(old_tu); 331 ret = unregister_trace_uprobe(old_tu);
@@ -598,7 +600,8 @@ static int probes_seq_show(struct seq_file *m, void *v)
598 char c = is_ret_probe(tu) ? 'r' : 'p'; 600 char c = is_ret_probe(tu) ? 'r' : 'p';
599 int i; 601 int i;
600 602
601 seq_printf(m, "%c:%s/%s", c, tu->tp.call.class->system, tu->tp.call.name); 603 seq_printf(m, "%c:%s/%s", c, tu->tp.call.class->system,
604 ftrace_event_name(&tu->tp.call));
602 seq_printf(m, " %s:0x%p", tu->filename, (void *)tu->offset); 605 seq_printf(m, " %s:0x%p", tu->filename, (void *)tu->offset);
603 606
604 for (i = 0; i < tu->tp.nr_args; i++) 607 for (i = 0; i < tu->tp.nr_args; i++)
@@ -648,7 +651,8 @@ static int probes_profile_seq_show(struct seq_file *m, void *v)
648{ 651{
649 struct trace_uprobe *tu = v; 652 struct trace_uprobe *tu = v;
650 653
651 seq_printf(m, " %s %-44s %15lu\n", tu->filename, tu->tp.call.name, tu->nhit); 654 seq_printf(m, " %s %-44s %15lu\n", tu->filename,
655 ftrace_event_name(&tu->tp.call), tu->nhit);
652 return 0; 656 return 0;
653} 657}
654 658
@@ -728,9 +732,15 @@ static int uprobe_buffer_enable(void)
728 732
729static void uprobe_buffer_disable(void) 733static void uprobe_buffer_disable(void)
730{ 734{
735 int cpu;
736
731 BUG_ON(!mutex_is_locked(&event_mutex)); 737 BUG_ON(!mutex_is_locked(&event_mutex));
732 738
733 if (--uprobe_buffer_refcnt == 0) { 739 if (--uprobe_buffer_refcnt == 0) {
740 for_each_possible_cpu(cpu)
741 free_page((unsigned long)per_cpu_ptr(uprobe_cpu_buffer,
742 cpu)->buf);
743
734 free_percpu(uprobe_cpu_buffer); 744 free_percpu(uprobe_cpu_buffer);
735 uprobe_cpu_buffer = NULL; 745 uprobe_cpu_buffer = NULL;
736 } 746 }
@@ -758,31 +768,32 @@ static void uprobe_buffer_put(struct uprobe_cpu_buffer *ucb)
758 mutex_unlock(&ucb->mutex); 768 mutex_unlock(&ucb->mutex);
759} 769}
760 770
761static void uprobe_trace_print(struct trace_uprobe *tu, 771static void __uprobe_trace_func(struct trace_uprobe *tu,
762 unsigned long func, struct pt_regs *regs) 772 unsigned long func, struct pt_regs *regs,
773 struct uprobe_cpu_buffer *ucb, int dsize,
774 struct ftrace_event_file *ftrace_file)
763{ 775{
764 struct uprobe_trace_entry_head *entry; 776 struct uprobe_trace_entry_head *entry;
765 struct ring_buffer_event *event; 777 struct ring_buffer_event *event;
766 struct ring_buffer *buffer; 778 struct ring_buffer *buffer;
767 struct uprobe_cpu_buffer *ucb;
768 void *data; 779 void *data;
769 int size, dsize, esize; 780 int size, esize;
770 struct ftrace_event_call *call = &tu->tp.call; 781 struct ftrace_event_call *call = &tu->tp.call;
771 782
772 dsize = __get_data_size(&tu->tp, regs); 783 WARN_ON(call != ftrace_file->event_call);
773 esize = SIZEOF_TRACE_ENTRY(is_ret_probe(tu));
774 784
775 if (WARN_ON_ONCE(!uprobe_cpu_buffer || tu->tp.size + dsize > PAGE_SIZE)) 785 if (WARN_ON_ONCE(tu->tp.size + dsize > PAGE_SIZE))
776 return; 786 return;
777 787
778 ucb = uprobe_buffer_get(); 788 if (ftrace_trigger_soft_disabled(ftrace_file))
779 store_trace_args(esize, &tu->tp, regs, ucb->buf, dsize); 789 return;
780 790
791 esize = SIZEOF_TRACE_ENTRY(is_ret_probe(tu));
781 size = esize + tu->tp.size + dsize; 792 size = esize + tu->tp.size + dsize;
782 event = trace_current_buffer_lock_reserve(&buffer, call->event.type, 793 event = trace_event_buffer_lock_reserve(&buffer, ftrace_file,
783 size, 0, 0); 794 call->event.type, size, 0, 0);
784 if (!event) 795 if (!event)
785 goto out; 796 return;
786 797
787 entry = ring_buffer_event_data(event); 798 entry = ring_buffer_event_data(event);
788 if (is_ret_probe(tu)) { 799 if (is_ret_probe(tu)) {
@@ -796,25 +807,36 @@ static void uprobe_trace_print(struct trace_uprobe *tu,
796 807
797 memcpy(data, ucb->buf, tu->tp.size + dsize); 808 memcpy(data, ucb->buf, tu->tp.size + dsize);
798 809
799 if (!call_filter_check_discard(call, entry, buffer, event)) 810 event_trigger_unlock_commit(ftrace_file, buffer, event, entry, 0, 0);
800 trace_buffer_unlock_commit(buffer, event, 0, 0);
801
802out:
803 uprobe_buffer_put(ucb);
804} 811}
805 812
806/* uprobe handler */ 813/* uprobe handler */
807static int uprobe_trace_func(struct trace_uprobe *tu, struct pt_regs *regs) 814static int uprobe_trace_func(struct trace_uprobe *tu, struct pt_regs *regs,
815 struct uprobe_cpu_buffer *ucb, int dsize)
808{ 816{
809 if (!is_ret_probe(tu)) 817 struct event_file_link *link;
810 uprobe_trace_print(tu, 0, regs); 818
819 if (is_ret_probe(tu))
820 return 0;
821
822 rcu_read_lock();
823 list_for_each_entry_rcu(link, &tu->tp.files, list)
824 __uprobe_trace_func(tu, 0, regs, ucb, dsize, link->file);
825 rcu_read_unlock();
826
811 return 0; 827 return 0;
812} 828}
813 829
814static void uretprobe_trace_func(struct trace_uprobe *tu, unsigned long func, 830static void uretprobe_trace_func(struct trace_uprobe *tu, unsigned long func,
815 struct pt_regs *regs) 831 struct pt_regs *regs,
832 struct uprobe_cpu_buffer *ucb, int dsize)
816{ 833{
817 uprobe_trace_print(tu, func, regs); 834 struct event_file_link *link;
835
836 rcu_read_lock();
837 list_for_each_entry_rcu(link, &tu->tp.files, list)
838 __uprobe_trace_func(tu, func, regs, ucb, dsize, link->file);
839 rcu_read_unlock();
818} 840}
819 841
820/* Event entry printers */ 842/* Event entry printers */
@@ -831,12 +853,14 @@ print_uprobe_event(struct trace_iterator *iter, int flags, struct trace_event *e
831 tu = container_of(event, struct trace_uprobe, tp.call.event); 853 tu = container_of(event, struct trace_uprobe, tp.call.event);
832 854
833 if (is_ret_probe(tu)) { 855 if (is_ret_probe(tu)) {
834 if (!trace_seq_printf(s, "%s: (0x%lx <- 0x%lx)", tu->tp.call.name, 856 if (!trace_seq_printf(s, "%s: (0x%lx <- 0x%lx)",
857 ftrace_event_name(&tu->tp.call),
835 entry->vaddr[1], entry->vaddr[0])) 858 entry->vaddr[1], entry->vaddr[0]))
836 goto partial; 859 goto partial;
837 data = DATAOF_TRACE_ENTRY(entry, true); 860 data = DATAOF_TRACE_ENTRY(entry, true);
838 } else { 861 } else {
839 if (!trace_seq_printf(s, "%s: (0x%lx)", tu->tp.call.name, 862 if (!trace_seq_printf(s, "%s: (0x%lx)",
863 ftrace_event_name(&tu->tp.call),
840 entry->vaddr[0])) 864 entry->vaddr[0]))
841 goto partial; 865 goto partial;
842 data = DATAOF_TRACE_ENTRY(entry, false); 866 data = DATAOF_TRACE_ENTRY(entry, false);
@@ -861,12 +885,24 @@ typedef bool (*filter_func_t)(struct uprobe_consumer *self,
861 struct mm_struct *mm); 885 struct mm_struct *mm);
862 886
863static int 887static int
864probe_event_enable(struct trace_uprobe *tu, int flag, filter_func_t filter) 888probe_event_enable(struct trace_uprobe *tu, struct ftrace_event_file *file,
889 filter_func_t filter)
865{ 890{
866 int ret = 0; 891 bool enabled = trace_probe_is_enabled(&tu->tp);
892 struct event_file_link *link = NULL;
893 int ret;
894
895 if (file) {
896 link = kmalloc(sizeof(*link), GFP_KERNEL);
897 if (!link)
898 return -ENOMEM;
867 899
868 if (trace_probe_is_enabled(&tu->tp)) 900 link->file = file;
869 return -EINTR; 901 list_add_tail_rcu(&link->list, &tu->tp.files);
902
903 tu->tp.flags |= TP_FLAG_TRACE;
904 } else
905 tu->tp.flags |= TP_FLAG_PROFILE;
870 906
871 ret = uprobe_buffer_enable(); 907 ret = uprobe_buffer_enable();
872 if (ret < 0) 908 if (ret < 0)
@@ -874,24 +910,49 @@ probe_event_enable(struct trace_uprobe *tu, int flag, filter_func_t filter)
874 910
875 WARN_ON(!uprobe_filter_is_empty(&tu->filter)); 911 WARN_ON(!uprobe_filter_is_empty(&tu->filter));
876 912
877 tu->tp.flags |= flag; 913 if (enabled)
914 return 0;
915
878 tu->consumer.filter = filter; 916 tu->consumer.filter = filter;
879 ret = uprobe_register(tu->inode, tu->offset, &tu->consumer); 917 ret = uprobe_register(tu->inode, tu->offset, &tu->consumer);
880 if (ret) 918 if (ret) {
881 tu->tp.flags &= ~flag; 919 if (file) {
920 list_del(&link->list);
921 kfree(link);
922 tu->tp.flags &= ~TP_FLAG_TRACE;
923 } else
924 tu->tp.flags &= ~TP_FLAG_PROFILE;
925 }
882 926
883 return ret; 927 return ret;
884} 928}
885 929
886static void probe_event_disable(struct trace_uprobe *tu, int flag) 930static void
931probe_event_disable(struct trace_uprobe *tu, struct ftrace_event_file *file)
887{ 932{
888 if (!trace_probe_is_enabled(&tu->tp)) 933 if (!trace_probe_is_enabled(&tu->tp))
889 return; 934 return;
890 935
936 if (file) {
937 struct event_file_link *link;
938
939 link = find_event_file_link(&tu->tp, file);
940 if (!link)
941 return;
942
943 list_del_rcu(&link->list);
944 /* synchronize with u{,ret}probe_trace_func */
945 synchronize_sched();
946 kfree(link);
947
948 if (!list_empty(&tu->tp.files))
949 return;
950 }
951
891 WARN_ON(!uprobe_filter_is_empty(&tu->filter)); 952 WARN_ON(!uprobe_filter_is_empty(&tu->filter));
892 953
893 uprobe_unregister(tu->inode, tu->offset, &tu->consumer); 954 uprobe_unregister(tu->inode, tu->offset, &tu->consumer);
894 tu->tp.flags &= ~flag; 955 tu->tp.flags &= file ? ~TP_FLAG_TRACE : ~TP_FLAG_PROFILE;
895 956
896 uprobe_buffer_disable(); 957 uprobe_buffer_disable();
897} 958}
@@ -1014,31 +1075,24 @@ static bool uprobe_perf_filter(struct uprobe_consumer *uc,
1014 return ret; 1075 return ret;
1015} 1076}
1016 1077
1017static void uprobe_perf_print(struct trace_uprobe *tu, 1078static void __uprobe_perf_func(struct trace_uprobe *tu,
1018 unsigned long func, struct pt_regs *regs) 1079 unsigned long func, struct pt_regs *regs,
1080 struct uprobe_cpu_buffer *ucb, int dsize)
1019{ 1081{
1020 struct ftrace_event_call *call = &tu->tp.call; 1082 struct ftrace_event_call *call = &tu->tp.call;
1021 struct uprobe_trace_entry_head *entry; 1083 struct uprobe_trace_entry_head *entry;
1022 struct hlist_head *head; 1084 struct hlist_head *head;
1023 struct uprobe_cpu_buffer *ucb;
1024 void *data; 1085 void *data;
1025 int size, dsize, esize; 1086 int size, esize;
1026 int rctx; 1087 int rctx;
1027 1088
1028 dsize = __get_data_size(&tu->tp, regs);
1029 esize = SIZEOF_TRACE_ENTRY(is_ret_probe(tu)); 1089 esize = SIZEOF_TRACE_ENTRY(is_ret_probe(tu));
1030 1090
1031 if (WARN_ON_ONCE(!uprobe_cpu_buffer))
1032 return;
1033
1034 size = esize + tu->tp.size + dsize; 1091 size = esize + tu->tp.size + dsize;
1035 size = ALIGN(size + sizeof(u32), sizeof(u64)) - sizeof(u32); 1092 size = ALIGN(size + sizeof(u32), sizeof(u64)) - sizeof(u32);
1036 if (WARN_ONCE(size > PERF_MAX_TRACE_SIZE, "profile buffer not large enough")) 1093 if (WARN_ONCE(size > PERF_MAX_TRACE_SIZE, "profile buffer not large enough"))
1037 return; 1094 return;
1038 1095
1039 ucb = uprobe_buffer_get();
1040 store_trace_args(esize, &tu->tp, regs, ucb->buf, dsize);
1041
1042 preempt_disable(); 1096 preempt_disable();
1043 head = this_cpu_ptr(call->perf_events); 1097 head = this_cpu_ptr(call->perf_events);
1044 if (hlist_empty(head)) 1098 if (hlist_empty(head))
@@ -1068,46 +1122,49 @@ static void uprobe_perf_print(struct trace_uprobe *tu,
1068 perf_trace_buf_submit(entry, size, rctx, 0, 1, regs, head, NULL); 1122 perf_trace_buf_submit(entry, size, rctx, 0, 1, regs, head, NULL);
1069 out: 1123 out:
1070 preempt_enable(); 1124 preempt_enable();
1071 uprobe_buffer_put(ucb);
1072} 1125}
1073 1126
1074/* uprobe profile handler */ 1127/* uprobe profile handler */
1075static int uprobe_perf_func(struct trace_uprobe *tu, struct pt_regs *regs) 1128static int uprobe_perf_func(struct trace_uprobe *tu, struct pt_regs *regs,
1129 struct uprobe_cpu_buffer *ucb, int dsize)
1076{ 1130{
1077 if (!uprobe_perf_filter(&tu->consumer, 0, current->mm)) 1131 if (!uprobe_perf_filter(&tu->consumer, 0, current->mm))
1078 return UPROBE_HANDLER_REMOVE; 1132 return UPROBE_HANDLER_REMOVE;
1079 1133
1080 if (!is_ret_probe(tu)) 1134 if (!is_ret_probe(tu))
1081 uprobe_perf_print(tu, 0, regs); 1135 __uprobe_perf_func(tu, 0, regs, ucb, dsize);
1082 return 0; 1136 return 0;
1083} 1137}
1084 1138
1085static void uretprobe_perf_func(struct trace_uprobe *tu, unsigned long func, 1139static void uretprobe_perf_func(struct trace_uprobe *tu, unsigned long func,
1086 struct pt_regs *regs) 1140 struct pt_regs *regs,
1141 struct uprobe_cpu_buffer *ucb, int dsize)
1087{ 1142{
1088 uprobe_perf_print(tu, func, regs); 1143 __uprobe_perf_func(tu, func, regs, ucb, dsize);
1089} 1144}
1090#endif /* CONFIG_PERF_EVENTS */ 1145#endif /* CONFIG_PERF_EVENTS */
1091 1146
1092static 1147static int
1093int trace_uprobe_register(struct ftrace_event_call *event, enum trace_reg type, void *data) 1148trace_uprobe_register(struct ftrace_event_call *event, enum trace_reg type,
1149 void *data)
1094{ 1150{
1095 struct trace_uprobe *tu = event->data; 1151 struct trace_uprobe *tu = event->data;
1152 struct ftrace_event_file *file = data;
1096 1153
1097 switch (type) { 1154 switch (type) {
1098 case TRACE_REG_REGISTER: 1155 case TRACE_REG_REGISTER:
1099 return probe_event_enable(tu, TP_FLAG_TRACE, NULL); 1156 return probe_event_enable(tu, file, NULL);
1100 1157
1101 case TRACE_REG_UNREGISTER: 1158 case TRACE_REG_UNREGISTER:
1102 probe_event_disable(tu, TP_FLAG_TRACE); 1159 probe_event_disable(tu, file);
1103 return 0; 1160 return 0;
1104 1161
1105#ifdef CONFIG_PERF_EVENTS 1162#ifdef CONFIG_PERF_EVENTS
1106 case TRACE_REG_PERF_REGISTER: 1163 case TRACE_REG_PERF_REGISTER:
1107 return probe_event_enable(tu, TP_FLAG_PROFILE, uprobe_perf_filter); 1164 return probe_event_enable(tu, NULL, uprobe_perf_filter);
1108 1165
1109 case TRACE_REG_PERF_UNREGISTER: 1166 case TRACE_REG_PERF_UNREGISTER:
1110 probe_event_disable(tu, TP_FLAG_PROFILE); 1167 probe_event_disable(tu, NULL);
1111 return 0; 1168 return 0;
1112 1169
1113 case TRACE_REG_PERF_OPEN: 1170 case TRACE_REG_PERF_OPEN:
@@ -1127,8 +1184,11 @@ static int uprobe_dispatcher(struct uprobe_consumer *con, struct pt_regs *regs)
1127{ 1184{
1128 struct trace_uprobe *tu; 1185 struct trace_uprobe *tu;
1129 struct uprobe_dispatch_data udd; 1186 struct uprobe_dispatch_data udd;
1187 struct uprobe_cpu_buffer *ucb;
1188 int dsize, esize;
1130 int ret = 0; 1189 int ret = 0;
1131 1190
1191
1132 tu = container_of(con, struct trace_uprobe, consumer); 1192 tu = container_of(con, struct trace_uprobe, consumer);
1133 tu->nhit++; 1193 tu->nhit++;
1134 1194
@@ -1137,13 +1197,29 @@ static int uprobe_dispatcher(struct uprobe_consumer *con, struct pt_regs *regs)
1137 1197
1138 current->utask->vaddr = (unsigned long) &udd; 1198 current->utask->vaddr = (unsigned long) &udd;
1139 1199
1200#ifdef CONFIG_PERF_EVENTS
1201 if ((tu->tp.flags & TP_FLAG_TRACE) == 0 &&
1202 !uprobe_perf_filter(&tu->consumer, 0, current->mm))
1203 return UPROBE_HANDLER_REMOVE;
1204#endif
1205
1206 if (WARN_ON_ONCE(!uprobe_cpu_buffer))
1207 return 0;
1208
1209 dsize = __get_data_size(&tu->tp, regs);
1210 esize = SIZEOF_TRACE_ENTRY(is_ret_probe(tu));
1211
1212 ucb = uprobe_buffer_get();
1213 store_trace_args(esize, &tu->tp, regs, ucb->buf, dsize);
1214
1140 if (tu->tp.flags & TP_FLAG_TRACE) 1215 if (tu->tp.flags & TP_FLAG_TRACE)
1141 ret |= uprobe_trace_func(tu, regs); 1216 ret |= uprobe_trace_func(tu, regs, ucb, dsize);
1142 1217
1143#ifdef CONFIG_PERF_EVENTS 1218#ifdef CONFIG_PERF_EVENTS
1144 if (tu->tp.flags & TP_FLAG_PROFILE) 1219 if (tu->tp.flags & TP_FLAG_PROFILE)
1145 ret |= uprobe_perf_func(tu, regs); 1220 ret |= uprobe_perf_func(tu, regs, ucb, dsize);
1146#endif 1221#endif
1222 uprobe_buffer_put(ucb);
1147 return ret; 1223 return ret;
1148} 1224}
1149 1225
@@ -1152,6 +1228,8 @@ static int uretprobe_dispatcher(struct uprobe_consumer *con,
1152{ 1228{
1153 struct trace_uprobe *tu; 1229 struct trace_uprobe *tu;
1154 struct uprobe_dispatch_data udd; 1230 struct uprobe_dispatch_data udd;
1231 struct uprobe_cpu_buffer *ucb;
1232 int dsize, esize;
1155 1233
1156 tu = container_of(con, struct trace_uprobe, consumer); 1234 tu = container_of(con, struct trace_uprobe, consumer);
1157 1235
@@ -1160,13 +1238,23 @@ static int uretprobe_dispatcher(struct uprobe_consumer *con,
1160 1238
1161 current->utask->vaddr = (unsigned long) &udd; 1239 current->utask->vaddr = (unsigned long) &udd;
1162 1240
1241 if (WARN_ON_ONCE(!uprobe_cpu_buffer))
1242 return 0;
1243
1244 dsize = __get_data_size(&tu->tp, regs);
1245 esize = SIZEOF_TRACE_ENTRY(is_ret_probe(tu));
1246
1247 ucb = uprobe_buffer_get();
1248 store_trace_args(esize, &tu->tp, regs, ucb->buf, dsize);
1249
1163 if (tu->tp.flags & TP_FLAG_TRACE) 1250 if (tu->tp.flags & TP_FLAG_TRACE)
1164 uretprobe_trace_func(tu, func, regs); 1251 uretprobe_trace_func(tu, func, regs, ucb, dsize);
1165 1252
1166#ifdef CONFIG_PERF_EVENTS 1253#ifdef CONFIG_PERF_EVENTS
1167 if (tu->tp.flags & TP_FLAG_PROFILE) 1254 if (tu->tp.flags & TP_FLAG_PROFILE)
1168 uretprobe_perf_func(tu, func, regs); 1255 uretprobe_perf_func(tu, func, regs, ucb, dsize);
1169#endif 1256#endif
1257 uprobe_buffer_put(ucb);
1170 return 0; 1258 return 0;
1171} 1259}
1172 1260
@@ -1198,7 +1286,8 @@ static int register_uprobe_event(struct trace_uprobe *tu)
1198 ret = trace_add_event_call(call); 1286 ret = trace_add_event_call(call);
1199 1287
1200 if (ret) { 1288 if (ret) {
1201 pr_info("Failed to register uprobe event: %s\n", call->name); 1289 pr_info("Failed to register uprobe event: %s\n",
1290 ftrace_event_name(call));
1202 kfree(call->print_fmt); 1291 kfree(call->print_fmt);
1203 unregister_ftrace_event(&call->event); 1292 unregister_ftrace_event(&call->event);
1204 } 1293 }
diff --git a/kernel/tracepoint.c b/kernel/tracepoint.c
index 031cc5655a51..ac5b23cf7212 100644
--- a/kernel/tracepoint.c
+++ b/kernel/tracepoint.c
@@ -1,5 +1,5 @@
1/* 1/*
2 * Copyright (C) 2008 Mathieu Desnoyers 2 * Copyright (C) 2008-2014 Mathieu Desnoyers
3 * 3 *
4 * This program is free software; you can redistribute it and/or modify 4 * This program is free software; you can redistribute it and/or modify
5 * it under the terms of the GNU General Public License as published by 5 * it under the terms of the GNU General Public License as published by
@@ -33,43 +33,29 @@ extern struct tracepoint * const __stop___tracepoints_ptrs[];
33/* Set to 1 to enable tracepoint debug output */ 33/* Set to 1 to enable tracepoint debug output */
34static const int tracepoint_debug; 34static const int tracepoint_debug;
35 35
36#ifdef CONFIG_MODULES
36/* 37/*
37 * Tracepoints mutex protects the builtin and module tracepoints and the hash 38 * Tracepoint module list mutex protects the local module list.
38 * table, as well as the local module list.
39 */ 39 */
40static DEFINE_MUTEX(tracepoints_mutex); 40static DEFINE_MUTEX(tracepoint_module_list_mutex);
41 41
42#ifdef CONFIG_MODULES 42/* Local list of struct tp_module */
43/* Local list of struct module */
44static LIST_HEAD(tracepoint_module_list); 43static LIST_HEAD(tracepoint_module_list);
45#endif /* CONFIG_MODULES */ 44#endif /* CONFIG_MODULES */
46 45
47/* 46/*
48 * Tracepoint hash table, containing the active tracepoints. 47 * tracepoints_mutex protects the builtin and module tracepoints.
49 * Protected by tracepoints_mutex. 48 * tracepoints_mutex nests inside tracepoint_module_list_mutex.
50 */ 49 */
51#define TRACEPOINT_HASH_BITS 6 50static DEFINE_MUTEX(tracepoints_mutex);
52#define TRACEPOINT_TABLE_SIZE (1 << TRACEPOINT_HASH_BITS)
53static struct hlist_head tracepoint_table[TRACEPOINT_TABLE_SIZE];
54 51
55/* 52/*
56 * Note about RCU : 53 * Note about RCU :
57 * It is used to delay the free of multiple probes array until a quiescent 54 * It is used to delay the free of multiple probes array until a quiescent
58 * state is reached. 55 * state is reached.
59 * Tracepoint entries modifications are protected by the tracepoints_mutex.
60 */ 56 */
61struct tracepoint_entry {
62 struct hlist_node hlist;
63 struct tracepoint_func *funcs;
64 int refcount; /* Number of times armed. 0 if disarmed. */
65 char name[0];
66};
67
68struct tp_probes { 57struct tp_probes {
69 union { 58 struct rcu_head rcu;
70 struct rcu_head rcu;
71 struct list_head list;
72 } u;
73 struct tracepoint_func probes[0]; 59 struct tracepoint_func probes[0];
74}; 60};
75 61
@@ -82,7 +68,7 @@ static inline void *allocate_probes(int count)
82 68
83static void rcu_free_old_probes(struct rcu_head *head) 69static void rcu_free_old_probes(struct rcu_head *head)
84{ 70{
85 kfree(container_of(head, struct tp_probes, u.rcu)); 71 kfree(container_of(head, struct tp_probes, rcu));
86} 72}
87 73
88static inline void release_probes(struct tracepoint_func *old) 74static inline void release_probes(struct tracepoint_func *old)
@@ -90,38 +76,37 @@ static inline void release_probes(struct tracepoint_func *old)
90 if (old) { 76 if (old) {
91 struct tp_probes *tp_probes = container_of(old, 77 struct tp_probes *tp_probes = container_of(old,
92 struct tp_probes, probes[0]); 78 struct tp_probes, probes[0]);
93 call_rcu_sched(&tp_probes->u.rcu, rcu_free_old_probes); 79 call_rcu_sched(&tp_probes->rcu, rcu_free_old_probes);
94 } 80 }
95} 81}
96 82
97static void debug_print_probes(struct tracepoint_entry *entry) 83static void debug_print_probes(struct tracepoint_func *funcs)
98{ 84{
99 int i; 85 int i;
100 86
101 if (!tracepoint_debug || !entry->funcs) 87 if (!tracepoint_debug || !funcs)
102 return; 88 return;
103 89
104 for (i = 0; entry->funcs[i].func; i++) 90 for (i = 0; funcs[i].func; i++)
105 printk(KERN_DEBUG "Probe %d : %p\n", i, entry->funcs[i].func); 91 printk(KERN_DEBUG "Probe %d : %p\n", i, funcs[i].func);
106} 92}
107 93
108static struct tracepoint_func * 94static struct tracepoint_func *func_add(struct tracepoint_func **funcs,
109tracepoint_entry_add_probe(struct tracepoint_entry *entry, 95 struct tracepoint_func *tp_func)
110 void *probe, void *data)
111{ 96{
112 int nr_probes = 0; 97 int nr_probes = 0;
113 struct tracepoint_func *old, *new; 98 struct tracepoint_func *old, *new;
114 99
115 if (WARN_ON(!probe)) 100 if (WARN_ON(!tp_func->func))
116 return ERR_PTR(-EINVAL); 101 return ERR_PTR(-EINVAL);
117 102
118 debug_print_probes(entry); 103 debug_print_probes(*funcs);
119 old = entry->funcs; 104 old = *funcs;
120 if (old) { 105 if (old) {
121 /* (N -> N+1), (N != 0, 1) probes */ 106 /* (N -> N+1), (N != 0, 1) probes */
122 for (nr_probes = 0; old[nr_probes].func; nr_probes++) 107 for (nr_probes = 0; old[nr_probes].func; nr_probes++)
123 if (old[nr_probes].func == probe && 108 if (old[nr_probes].func == tp_func->func &&
124 old[nr_probes].data == data) 109 old[nr_probes].data == tp_func->data)
125 return ERR_PTR(-EEXIST); 110 return ERR_PTR(-EEXIST);
126 } 111 }
127 /* + 2 : one for new probe, one for NULL func */ 112 /* + 2 : one for new probe, one for NULL func */
@@ -130,33 +115,30 @@ tracepoint_entry_add_probe(struct tracepoint_entry *entry,
130 return ERR_PTR(-ENOMEM); 115 return ERR_PTR(-ENOMEM);
131 if (old) 116 if (old)
132 memcpy(new, old, nr_probes * sizeof(struct tracepoint_func)); 117 memcpy(new, old, nr_probes * sizeof(struct tracepoint_func));
133 new[nr_probes].func = probe; 118 new[nr_probes] = *tp_func;
134 new[nr_probes].data = data;
135 new[nr_probes + 1].func = NULL; 119 new[nr_probes + 1].func = NULL;
136 entry->refcount = nr_probes + 1; 120 *funcs = new;
137 entry->funcs = new; 121 debug_print_probes(*funcs);
138 debug_print_probes(entry);
139 return old; 122 return old;
140} 123}
141 124
142static void * 125static void *func_remove(struct tracepoint_func **funcs,
143tracepoint_entry_remove_probe(struct tracepoint_entry *entry, 126 struct tracepoint_func *tp_func)
144 void *probe, void *data)
145{ 127{
146 int nr_probes = 0, nr_del = 0, i; 128 int nr_probes = 0, nr_del = 0, i;
147 struct tracepoint_func *old, *new; 129 struct tracepoint_func *old, *new;
148 130
149 old = entry->funcs; 131 old = *funcs;
150 132
151 if (!old) 133 if (!old)
152 return ERR_PTR(-ENOENT); 134 return ERR_PTR(-ENOENT);
153 135
154 debug_print_probes(entry); 136 debug_print_probes(*funcs);
155 /* (N -> M), (N > 1, M >= 0) probes */ 137 /* (N -> M), (N > 1, M >= 0) probes */
156 if (probe) { 138 if (tp_func->func) {
157 for (nr_probes = 0; old[nr_probes].func; nr_probes++) { 139 for (nr_probes = 0; old[nr_probes].func; nr_probes++) {
158 if (old[nr_probes].func == probe && 140 if (old[nr_probes].func == tp_func->func &&
159 old[nr_probes].data == data) 141 old[nr_probes].data == tp_func->data)
160 nr_del++; 142 nr_del++;
161 } 143 }
162 } 144 }
@@ -167,9 +149,8 @@ tracepoint_entry_remove_probe(struct tracepoint_entry *entry,
167 */ 149 */
168 if (nr_probes - nr_del == 0) { 150 if (nr_probes - nr_del == 0) {
169 /* N -> 0, (N > 1) */ 151 /* N -> 0, (N > 1) */
170 entry->funcs = NULL; 152 *funcs = NULL;
171 entry->refcount = 0; 153 debug_print_probes(*funcs);
172 debug_print_probes(entry);
173 return old; 154 return old;
174 } else { 155 } else {
175 int j = 0; 156 int j = 0;
@@ -179,90 +160,35 @@ tracepoint_entry_remove_probe(struct tracepoint_entry *entry,
179 if (new == NULL) 160 if (new == NULL)
180 return ERR_PTR(-ENOMEM); 161 return ERR_PTR(-ENOMEM);
181 for (i = 0; old[i].func; i++) 162 for (i = 0; old[i].func; i++)
182 if (old[i].func != probe || old[i].data != data) 163 if (old[i].func != tp_func->func
164 || old[i].data != tp_func->data)
183 new[j++] = old[i]; 165 new[j++] = old[i];
184 new[nr_probes - nr_del].func = NULL; 166 new[nr_probes - nr_del].func = NULL;
185 entry->refcount = nr_probes - nr_del; 167 *funcs = new;
186 entry->funcs = new;
187 } 168 }
188 debug_print_probes(entry); 169 debug_print_probes(*funcs);
189 return old; 170 return old;
190} 171}
191 172
192/* 173/*
193 * Get tracepoint if the tracepoint is present in the tracepoint hash table. 174 * Add the probe function to a tracepoint.
194 * Must be called with tracepoints_mutex held.
195 * Returns NULL if not present.
196 */ 175 */
197static struct tracepoint_entry *get_tracepoint(const char *name) 176static int tracepoint_add_func(struct tracepoint *tp,
177 struct tracepoint_func *func)
198{ 178{
199 struct hlist_head *head; 179 struct tracepoint_func *old, *tp_funcs;
200 struct tracepoint_entry *e;
201 u32 hash = jhash(name, strlen(name), 0);
202
203 head = &tracepoint_table[hash & (TRACEPOINT_TABLE_SIZE - 1)];
204 hlist_for_each_entry(e, head, hlist) {
205 if (!strcmp(name, e->name))
206 return e;
207 }
208 return NULL;
209}
210 180
211/* 181 if (tp->regfunc && !static_key_enabled(&tp->key))
212 * Add the tracepoint to the tracepoint hash table. Must be called with 182 tp->regfunc();
213 * tracepoints_mutex held.
214 */
215static struct tracepoint_entry *add_tracepoint(const char *name)
216{
217 struct hlist_head *head;
218 struct tracepoint_entry *e;
219 size_t name_len = strlen(name) + 1;
220 u32 hash = jhash(name, name_len-1, 0);
221
222 head = &tracepoint_table[hash & (TRACEPOINT_TABLE_SIZE - 1)];
223 hlist_for_each_entry(e, head, hlist) {
224 if (!strcmp(name, e->name)) {
225 printk(KERN_NOTICE
226 "tracepoint %s busy\n", name);
227 return ERR_PTR(-EEXIST); /* Already there */
228 }
229 }
230 /*
231 * Using kmalloc here to allocate a variable length element. Could
232 * cause some memory fragmentation if overused.
233 */
234 e = kmalloc(sizeof(struct tracepoint_entry) + name_len, GFP_KERNEL);
235 if (!e)
236 return ERR_PTR(-ENOMEM);
237 memcpy(&e->name[0], name, name_len);
238 e->funcs = NULL;
239 e->refcount = 0;
240 hlist_add_head(&e->hlist, head);
241 return e;
242}
243
244/*
245 * Remove the tracepoint from the tracepoint hash table. Must be called with
246 * mutex_lock held.
247 */
248static inline void remove_tracepoint(struct tracepoint_entry *e)
249{
250 hlist_del(&e->hlist);
251 kfree(e);
252}
253
254/*
255 * Sets the probe callback corresponding to one tracepoint.
256 */
257static void set_tracepoint(struct tracepoint_entry **entry,
258 struct tracepoint *elem, int active)
259{
260 WARN_ON(strcmp((*entry)->name, elem->name) != 0);
261 183
262 if (elem->regfunc && !static_key_enabled(&elem->key) && active) 184 tp_funcs = rcu_dereference_protected(tp->funcs,
263 elem->regfunc(); 185 lockdep_is_held(&tracepoints_mutex));
264 else if (elem->unregfunc && static_key_enabled(&elem->key) && !active) 186 old = func_add(&tp_funcs, func);
265 elem->unregfunc(); 187 if (IS_ERR(old)) {
188 WARN_ON_ONCE(1);
189 return PTR_ERR(old);
190 }
191 release_probes(old);
266 192
267 /* 193 /*
268 * rcu_assign_pointer has a smp_wmb() which makes sure that the new 194 * rcu_assign_pointer has a smp_wmb() which makes sure that the new
@@ -271,426 +197,215 @@ static void set_tracepoint(struct tracepoint_entry **entry,
271 * include/linux/tracepoints.h. A matching smp_read_barrier_depends() 197 * include/linux/tracepoints.h. A matching smp_read_barrier_depends()
272 * is used. 198 * is used.
273 */ 199 */
274 rcu_assign_pointer(elem->funcs, (*entry)->funcs); 200 rcu_assign_pointer(tp->funcs, tp_funcs);
275 if (active && !static_key_enabled(&elem->key)) 201 if (!static_key_enabled(&tp->key))
276 static_key_slow_inc(&elem->key); 202 static_key_slow_inc(&tp->key);
277 else if (!active && static_key_enabled(&elem->key)) 203 return 0;
278 static_key_slow_dec(&elem->key);
279} 204}
280 205
281/* 206/*
282 * Disable a tracepoint and its probe callback. 207 * Remove a probe function from a tracepoint.
283 * Note: only waiting an RCU period after setting elem->call to the empty 208 * Note: only waiting an RCU period after setting elem->call to the empty
284 * function insures that the original callback is not used anymore. This insured 209 * function insures that the original callback is not used anymore. This insured
285 * by preempt_disable around the call site. 210 * by preempt_disable around the call site.
286 */ 211 */
287static void disable_tracepoint(struct tracepoint *elem) 212static int tracepoint_remove_func(struct tracepoint *tp,
213 struct tracepoint_func *func)
288{ 214{
289 if (elem->unregfunc && static_key_enabled(&elem->key)) 215 struct tracepoint_func *old, *tp_funcs;
290 elem->unregfunc();
291 216
292 if (static_key_enabled(&elem->key)) 217 tp_funcs = rcu_dereference_protected(tp->funcs,
293 static_key_slow_dec(&elem->key); 218 lockdep_is_held(&tracepoints_mutex));
294 rcu_assign_pointer(elem->funcs, NULL); 219 old = func_remove(&tp_funcs, func);
295} 220 if (IS_ERR(old)) {
296 221 WARN_ON_ONCE(1);
297/** 222 return PTR_ERR(old);
298 * tracepoint_update_probe_range - Update a probe range
299 * @begin: beginning of the range
300 * @end: end of the range
301 *
302 * Updates the probe callback corresponding to a range of tracepoints.
303 * Called with tracepoints_mutex held.
304 */
305static void tracepoint_update_probe_range(struct tracepoint * const *begin,
306 struct tracepoint * const *end)
307{
308 struct tracepoint * const *iter;
309 struct tracepoint_entry *mark_entry;
310
311 if (!begin)
312 return;
313
314 for (iter = begin; iter < end; iter++) {
315 mark_entry = get_tracepoint((*iter)->name);
316 if (mark_entry) {
317 set_tracepoint(&mark_entry, *iter,
318 !!mark_entry->refcount);
319 } else {
320 disable_tracepoint(*iter);
321 }
322 } 223 }
323} 224 release_probes(old);
324
325#ifdef CONFIG_MODULES
326void module_update_tracepoints(void)
327{
328 struct tp_module *tp_mod;
329
330 list_for_each_entry(tp_mod, &tracepoint_module_list, list)
331 tracepoint_update_probe_range(tp_mod->tracepoints_ptrs,
332 tp_mod->tracepoints_ptrs + tp_mod->num_tracepoints);
333}
334#else /* CONFIG_MODULES */
335void module_update_tracepoints(void)
336{
337}
338#endif /* CONFIG_MODULES */
339
340 225
341/* 226 if (!tp_funcs) {
342 * Update probes, removing the faulty probes. 227 /* Removed last function */
343 * Called with tracepoints_mutex held. 228 if (tp->unregfunc && static_key_enabled(&tp->key))
344 */ 229 tp->unregfunc();
345static void tracepoint_update_probes(void)
346{
347 /* Core kernel tracepoints */
348 tracepoint_update_probe_range(__start___tracepoints_ptrs,
349 __stop___tracepoints_ptrs);
350 /* tracepoints in modules. */
351 module_update_tracepoints();
352}
353 230
354static struct tracepoint_func * 231 if (static_key_enabled(&tp->key))
355tracepoint_add_probe(const char *name, void *probe, void *data) 232 static_key_slow_dec(&tp->key);
356{
357 struct tracepoint_entry *entry;
358 struct tracepoint_func *old;
359
360 entry = get_tracepoint(name);
361 if (!entry) {
362 entry = add_tracepoint(name);
363 if (IS_ERR(entry))
364 return (struct tracepoint_func *)entry;
365 } 233 }
366 old = tracepoint_entry_add_probe(entry, probe, data); 234 rcu_assign_pointer(tp->funcs, tp_funcs);
367 if (IS_ERR(old) && !entry->refcount) 235 return 0;
368 remove_tracepoint(entry);
369 return old;
370} 236}
371 237
372/** 238/**
373 * tracepoint_probe_register - Connect a probe to a tracepoint 239 * tracepoint_probe_register - Connect a probe to a tracepoint
374 * @name: tracepoint name 240 * @tp: tracepoint
375 * @probe: probe handler 241 * @probe: probe handler
376 * 242 *
377 * Returns 0 if ok, error value on error. 243 * Returns 0 if ok, error value on error.
378 * The probe address must at least be aligned on the architecture pointer size. 244 * Note: if @tp is within a module, the caller is responsible for
245 * unregistering the probe before the module is gone. This can be
246 * performed either with a tracepoint module going notifier, or from
247 * within module exit functions.
379 */ 248 */
380int tracepoint_probe_register(const char *name, void *probe, void *data) 249int tracepoint_probe_register(struct tracepoint *tp, void *probe, void *data)
381{ 250{
382 struct tracepoint_func *old; 251 struct tracepoint_func tp_func;
252 int ret;
383 253
384 mutex_lock(&tracepoints_mutex); 254 mutex_lock(&tracepoints_mutex);
385 old = tracepoint_add_probe(name, probe, data); 255 tp_func.func = probe;
386 if (IS_ERR(old)) { 256 tp_func.data = data;
387 mutex_unlock(&tracepoints_mutex); 257 ret = tracepoint_add_func(tp, &tp_func);
388 return PTR_ERR(old);
389 }
390 tracepoint_update_probes(); /* may update entry */
391 mutex_unlock(&tracepoints_mutex); 258 mutex_unlock(&tracepoints_mutex);
392 release_probes(old); 259 return ret;
393 return 0;
394} 260}
395EXPORT_SYMBOL_GPL(tracepoint_probe_register); 261EXPORT_SYMBOL_GPL(tracepoint_probe_register);
396 262
397static struct tracepoint_func *
398tracepoint_remove_probe(const char *name, void *probe, void *data)
399{
400 struct tracepoint_entry *entry;
401 struct tracepoint_func *old;
402
403 entry = get_tracepoint(name);
404 if (!entry)
405 return ERR_PTR(-ENOENT);
406 old = tracepoint_entry_remove_probe(entry, probe, data);
407 if (IS_ERR(old))
408 return old;
409 if (!entry->refcount)
410 remove_tracepoint(entry);
411 return old;
412}
413
414/** 263/**
415 * tracepoint_probe_unregister - Disconnect a probe from a tracepoint 264 * tracepoint_probe_unregister - Disconnect a probe from a tracepoint
416 * @name: tracepoint name 265 * @tp: tracepoint
417 * @probe: probe function pointer 266 * @probe: probe function pointer
418 * 267 *
419 * We do not need to call a synchronize_sched to make sure the probes have 268 * Returns 0 if ok, error value on error.
420 * finished running before doing a module unload, because the module unload
421 * itself uses stop_machine(), which insures that every preempt disabled section
422 * have finished.
423 */ 269 */
424int tracepoint_probe_unregister(const char *name, void *probe, void *data) 270int tracepoint_probe_unregister(struct tracepoint *tp, void *probe, void *data)
425{ 271{
426 struct tracepoint_func *old; 272 struct tracepoint_func tp_func;
273 int ret;
427 274
428 mutex_lock(&tracepoints_mutex); 275 mutex_lock(&tracepoints_mutex);
429 old = tracepoint_remove_probe(name, probe, data); 276 tp_func.func = probe;
430 if (IS_ERR(old)) { 277 tp_func.data = data;
431 mutex_unlock(&tracepoints_mutex); 278 ret = tracepoint_remove_func(tp, &tp_func);
432 return PTR_ERR(old);
433 }
434 tracepoint_update_probes(); /* may update entry */
435 mutex_unlock(&tracepoints_mutex); 279 mutex_unlock(&tracepoints_mutex);
436 release_probes(old); 280 return ret;
437 return 0;
438} 281}
439EXPORT_SYMBOL_GPL(tracepoint_probe_unregister); 282EXPORT_SYMBOL_GPL(tracepoint_probe_unregister);
440 283
441static LIST_HEAD(old_probes); 284#ifdef CONFIG_MODULES
442static int need_update; 285bool trace_module_has_bad_taint(struct module *mod)
443
444static void tracepoint_add_old_probes(void *old)
445{ 286{
446 need_update = 1; 287 return mod->taints & ~((1 << TAINT_OOT_MODULE) | (1 << TAINT_CRAP) |
447 if (old) { 288 (1 << TAINT_UNSIGNED_MODULE));
448 struct tp_probes *tp_probes = container_of(old,
449 struct tp_probes, probes[0]);
450 list_add(&tp_probes->u.list, &old_probes);
451 }
452} 289}
453 290
454/** 291static BLOCKING_NOTIFIER_HEAD(tracepoint_notify_list);
455 * tracepoint_probe_register_noupdate - register a probe but not connect
456 * @name: tracepoint name
457 * @probe: probe handler
458 *
459 * caller must call tracepoint_probe_update_all()
460 */
461int tracepoint_probe_register_noupdate(const char *name, void *probe,
462 void *data)
463{
464 struct tracepoint_func *old;
465
466 mutex_lock(&tracepoints_mutex);
467 old = tracepoint_add_probe(name, probe, data);
468 if (IS_ERR(old)) {
469 mutex_unlock(&tracepoints_mutex);
470 return PTR_ERR(old);
471 }
472 tracepoint_add_old_probes(old);
473 mutex_unlock(&tracepoints_mutex);
474 return 0;
475}
476EXPORT_SYMBOL_GPL(tracepoint_probe_register_noupdate);
477 292
478/** 293/**
479 * tracepoint_probe_unregister_noupdate - remove a probe but not disconnect 294 * register_tracepoint_notifier - register tracepoint coming/going notifier
480 * @name: tracepoint name 295 * @nb: notifier block
481 * @probe: probe function pointer
482 * 296 *
483 * caller must call tracepoint_probe_update_all() 297 * Notifiers registered with this function are called on module
298 * coming/going with the tracepoint_module_list_mutex held.
299 * The notifier block callback should expect a "struct tp_module" data
300 * pointer.
484 */ 301 */
485int tracepoint_probe_unregister_noupdate(const char *name, void *probe, 302int register_tracepoint_module_notifier(struct notifier_block *nb)
486 void *data)
487{ 303{
488 struct tracepoint_func *old; 304 struct tp_module *tp_mod;
489 305 int ret;
490 mutex_lock(&tracepoints_mutex);
491 old = tracepoint_remove_probe(name, probe, data);
492 if (IS_ERR(old)) {
493 mutex_unlock(&tracepoints_mutex);
494 return PTR_ERR(old);
495 }
496 tracepoint_add_old_probes(old);
497 mutex_unlock(&tracepoints_mutex);
498 return 0;
499}
500EXPORT_SYMBOL_GPL(tracepoint_probe_unregister_noupdate);
501
502/**
503 * tracepoint_probe_update_all - update tracepoints
504 */
505void tracepoint_probe_update_all(void)
506{
507 LIST_HEAD(release_probes);
508 struct tp_probes *pos, *next;
509 306
510 mutex_lock(&tracepoints_mutex); 307 mutex_lock(&tracepoint_module_list_mutex);
511 if (!need_update) { 308 ret = blocking_notifier_chain_register(&tracepoint_notify_list, nb);
512 mutex_unlock(&tracepoints_mutex); 309 if (ret)
513 return; 310 goto end;
514 } 311 list_for_each_entry(tp_mod, &tracepoint_module_list, list)
515 if (!list_empty(&old_probes)) 312 (void) nb->notifier_call(nb, MODULE_STATE_COMING, tp_mod);
516 list_replace_init(&old_probes, &release_probes); 313end:
517 need_update = 0; 314 mutex_unlock(&tracepoint_module_list_mutex);
518 tracepoint_update_probes(); 315 return ret;
519 mutex_unlock(&tracepoints_mutex);
520 list_for_each_entry_safe(pos, next, &release_probes, u.list) {
521 list_del(&pos->u.list);
522 call_rcu_sched(&pos->u.rcu, rcu_free_old_probes);
523 }
524} 316}
525EXPORT_SYMBOL_GPL(tracepoint_probe_update_all); 317EXPORT_SYMBOL_GPL(register_tracepoint_module_notifier);
526 318
527/** 319/**
528 * tracepoint_get_iter_range - Get a next tracepoint iterator given a range. 320 * unregister_tracepoint_notifier - unregister tracepoint coming/going notifier
529 * @tracepoint: current tracepoints (in), next tracepoint (out) 321 * @nb: notifier block
530 * @begin: beginning of the range
531 * @end: end of the range
532 * 322 *
533 * Returns whether a next tracepoint has been found (1) or not (0). 323 * The notifier block callback should expect a "struct tp_module" data
534 * Will return the first tracepoint in the range if the input tracepoint is 324 * pointer.
535 * NULL.
536 */ 325 */
537static int tracepoint_get_iter_range(struct tracepoint * const **tracepoint, 326int unregister_tracepoint_module_notifier(struct notifier_block *nb)
538 struct tracepoint * const *begin, struct tracepoint * const *end)
539{ 327{
540 if (!*tracepoint && begin != end) { 328 struct tp_module *tp_mod;
541 *tracepoint = begin; 329 int ret;
542 return 1;
543 }
544 if (*tracepoint >= begin && *tracepoint < end)
545 return 1;
546 return 0;
547}
548 330
549#ifdef CONFIG_MODULES 331 mutex_lock(&tracepoint_module_list_mutex);
550static void tracepoint_get_iter(struct tracepoint_iter *iter) 332 ret = blocking_notifier_chain_unregister(&tracepoint_notify_list, nb);
551{ 333 if (ret)
552 int found = 0; 334 goto end;
553 struct tp_module *iter_mod; 335 list_for_each_entry(tp_mod, &tracepoint_module_list, list)
554 336 (void) nb->notifier_call(nb, MODULE_STATE_GOING, tp_mod);
555 /* Core kernel tracepoints */
556 if (!iter->module) {
557 found = tracepoint_get_iter_range(&iter->tracepoint,
558 __start___tracepoints_ptrs,
559 __stop___tracepoints_ptrs);
560 if (found)
561 goto end;
562 }
563 /* Tracepoints in modules */
564 mutex_lock(&tracepoints_mutex);
565 list_for_each_entry(iter_mod, &tracepoint_module_list, list) {
566 /*
567 * Sorted module list
568 */
569 if (iter_mod < iter->module)
570 continue;
571 else if (iter_mod > iter->module)
572 iter->tracepoint = NULL;
573 found = tracepoint_get_iter_range(&iter->tracepoint,
574 iter_mod->tracepoints_ptrs,
575 iter_mod->tracepoints_ptrs
576 + iter_mod->num_tracepoints);
577 if (found) {
578 iter->module = iter_mod;
579 break;
580 }
581 }
582 mutex_unlock(&tracepoints_mutex);
583end: 337end:
584 if (!found) 338 mutex_unlock(&tracepoint_module_list_mutex);
585 tracepoint_iter_reset(iter); 339 return ret;
586}
587#else /* CONFIG_MODULES */
588static void tracepoint_get_iter(struct tracepoint_iter *iter)
589{
590 int found = 0;
591
592 /* Core kernel tracepoints */
593 found = tracepoint_get_iter_range(&iter->tracepoint,
594 __start___tracepoints_ptrs,
595 __stop___tracepoints_ptrs);
596 if (!found)
597 tracepoint_iter_reset(iter);
598}
599#endif /* CONFIG_MODULES */
600
601void tracepoint_iter_start(struct tracepoint_iter *iter)
602{
603 tracepoint_get_iter(iter);
604}
605EXPORT_SYMBOL_GPL(tracepoint_iter_start);
606
607void tracepoint_iter_next(struct tracepoint_iter *iter)
608{
609 iter->tracepoint++;
610 /*
611 * iter->tracepoint may be invalid because we blindly incremented it.
612 * Make sure it is valid by marshalling on the tracepoints, getting the
613 * tracepoints from following modules if necessary.
614 */
615 tracepoint_get_iter(iter);
616}
617EXPORT_SYMBOL_GPL(tracepoint_iter_next);
618 340
619void tracepoint_iter_stop(struct tracepoint_iter *iter)
620{
621} 341}
622EXPORT_SYMBOL_GPL(tracepoint_iter_stop); 342EXPORT_SYMBOL_GPL(unregister_tracepoint_module_notifier);
623 343
624void tracepoint_iter_reset(struct tracepoint_iter *iter) 344/*
345 * Ensure the tracer unregistered the module's probes before the module
346 * teardown is performed. Prevents leaks of probe and data pointers.
347 */
348static void tp_module_going_check_quiescent(struct tracepoint * const *begin,
349 struct tracepoint * const *end)
625{ 350{
626#ifdef CONFIG_MODULES 351 struct tracepoint * const *iter;
627 iter->module = NULL;
628#endif /* CONFIG_MODULES */
629 iter->tracepoint = NULL;
630}
631EXPORT_SYMBOL_GPL(tracepoint_iter_reset);
632 352
633#ifdef CONFIG_MODULES 353 if (!begin)
634bool trace_module_has_bad_taint(struct module *mod) 354 return;
635{ 355 for (iter = begin; iter < end; iter++)
636 return mod->taints & ~((1 << TAINT_OOT_MODULE) | (1 << TAINT_CRAP)); 356 WARN_ON_ONCE((*iter)->funcs);
637} 357}
638 358
639static int tracepoint_module_coming(struct module *mod) 359static int tracepoint_module_coming(struct module *mod)
640{ 360{
641 struct tp_module *tp_mod, *iter; 361 struct tp_module *tp_mod;
642 int ret = 0; 362 int ret = 0;
643 363
364 if (!mod->num_tracepoints)
365 return 0;
366
644 /* 367 /*
645 * We skip modules that taint the kernel, especially those with different 368 * We skip modules that taint the kernel, especially those with different
646 * module headers (for forced load), to make sure we don't cause a crash. 369 * module headers (for forced load), to make sure we don't cause a crash.
647 * Staging and out-of-tree GPL modules are fine. 370 * Staging, out-of-tree, and unsigned GPL modules are fine.
648 */ 371 */
649 if (trace_module_has_bad_taint(mod)) 372 if (trace_module_has_bad_taint(mod))
650 return 0; 373 return 0;
651 mutex_lock(&tracepoints_mutex); 374 mutex_lock(&tracepoint_module_list_mutex);
652 tp_mod = kmalloc(sizeof(struct tp_module), GFP_KERNEL); 375 tp_mod = kmalloc(sizeof(struct tp_module), GFP_KERNEL);
653 if (!tp_mod) { 376 if (!tp_mod) {
654 ret = -ENOMEM; 377 ret = -ENOMEM;
655 goto end; 378 goto end;
656 } 379 }
657 tp_mod->num_tracepoints = mod->num_tracepoints; 380 tp_mod->mod = mod;
658 tp_mod->tracepoints_ptrs = mod->tracepoints_ptrs; 381 list_add_tail(&tp_mod->list, &tracepoint_module_list);
659 382 blocking_notifier_call_chain(&tracepoint_notify_list,
660 /* 383 MODULE_STATE_COMING, tp_mod);
661 * tracepoint_module_list is kept sorted by struct module pointer
662 * address for iteration on tracepoints from a seq_file that can release
663 * the mutex between calls.
664 */
665 list_for_each_entry_reverse(iter, &tracepoint_module_list, list) {
666 BUG_ON(iter == tp_mod); /* Should never be in the list twice */
667 if (iter < tp_mod) {
668 /* We belong to the location right after iter. */
669 list_add(&tp_mod->list, &iter->list);
670 goto module_added;
671 }
672 }
673 /* We belong to the beginning of the list */
674 list_add(&tp_mod->list, &tracepoint_module_list);
675module_added:
676 tracepoint_update_probe_range(mod->tracepoints_ptrs,
677 mod->tracepoints_ptrs + mod->num_tracepoints);
678end: 384end:
679 mutex_unlock(&tracepoints_mutex); 385 mutex_unlock(&tracepoint_module_list_mutex);
680 return ret; 386 return ret;
681} 387}
682 388
683static int tracepoint_module_going(struct module *mod) 389static void tracepoint_module_going(struct module *mod)
684{ 390{
685 struct tp_module *pos; 391 struct tp_module *tp_mod;
686 392
687 mutex_lock(&tracepoints_mutex); 393 if (!mod->num_tracepoints)
688 tracepoint_update_probe_range(mod->tracepoints_ptrs, 394 return;
689 mod->tracepoints_ptrs + mod->num_tracepoints); 395
690 list_for_each_entry(pos, &tracepoint_module_list, list) { 396 mutex_lock(&tracepoint_module_list_mutex);
691 if (pos->tracepoints_ptrs == mod->tracepoints_ptrs) { 397 list_for_each_entry(tp_mod, &tracepoint_module_list, list) {
692 list_del(&pos->list); 398 if (tp_mod->mod == mod) {
693 kfree(pos); 399 blocking_notifier_call_chain(&tracepoint_notify_list,
400 MODULE_STATE_GOING, tp_mod);
401 list_del(&tp_mod->list);
402 kfree(tp_mod);
403 /*
404 * Called the going notifier before checking for
405 * quiescence.
406 */
407 tp_module_going_check_quiescent(mod->tracepoints_ptrs,
408 mod->tracepoints_ptrs + mod->num_tracepoints);
694 break; 409 break;
695 } 410 }
696 } 411 }
@@ -700,12 +415,11 @@ static int tracepoint_module_going(struct module *mod)
700 * flag on "going", in case a module taints the kernel only after being 415 * flag on "going", in case a module taints the kernel only after being
701 * loaded. 416 * loaded.
702 */ 417 */
703 mutex_unlock(&tracepoints_mutex); 418 mutex_unlock(&tracepoint_module_list_mutex);
704 return 0;
705} 419}
706 420
707int tracepoint_module_notify(struct notifier_block *self, 421static int tracepoint_module_notify(struct notifier_block *self,
708 unsigned long val, void *data) 422 unsigned long val, void *data)
709{ 423{
710 struct module *mod = data; 424 struct module *mod = data;
711 int ret = 0; 425 int ret = 0;
@@ -717,24 +431,58 @@ int tracepoint_module_notify(struct notifier_block *self,
717 case MODULE_STATE_LIVE: 431 case MODULE_STATE_LIVE:
718 break; 432 break;
719 case MODULE_STATE_GOING: 433 case MODULE_STATE_GOING:
720 ret = tracepoint_module_going(mod); 434 tracepoint_module_going(mod);
435 break;
436 case MODULE_STATE_UNFORMED:
721 break; 437 break;
722 } 438 }
723 return ret; 439 return ret;
724} 440}
725 441
726struct notifier_block tracepoint_module_nb = { 442static struct notifier_block tracepoint_module_nb = {
727 .notifier_call = tracepoint_module_notify, 443 .notifier_call = tracepoint_module_notify,
728 .priority = 0, 444 .priority = 0,
729}; 445};
730 446
731static int init_tracepoints(void) 447static __init int init_tracepoints(void)
732{ 448{
733 return register_module_notifier(&tracepoint_module_nb); 449 int ret;
450
451 ret = register_module_notifier(&tracepoint_module_nb);
452 if (ret)
453 pr_warning("Failed to register tracepoint module enter notifier\n");
454
455 return ret;
734} 456}
735__initcall(init_tracepoints); 457__initcall(init_tracepoints);
736#endif /* CONFIG_MODULES */ 458#endif /* CONFIG_MODULES */
737 459
460static void for_each_tracepoint_range(struct tracepoint * const *begin,
461 struct tracepoint * const *end,
462 void (*fct)(struct tracepoint *tp, void *priv),
463 void *priv)
464{
465 struct tracepoint * const *iter;
466
467 if (!begin)
468 return;
469 for (iter = begin; iter < end; iter++)
470 fct(*iter, priv);
471}
472
473/**
474 * for_each_kernel_tracepoint - iteration on all kernel tracepoints
475 * @fct: callback
476 * @priv: private data
477 */
478void for_each_kernel_tracepoint(void (*fct)(struct tracepoint *tp, void *priv),
479 void *priv)
480{
481 for_each_tracepoint_range(__start___tracepoints_ptrs,
482 __stop___tracepoints_ptrs, fct, priv);
483}
484EXPORT_SYMBOL_GPL(for_each_kernel_tracepoint);
485
738#ifdef CONFIG_HAVE_SYSCALL_TRACEPOINTS 486#ifdef CONFIG_HAVE_SYSCALL_TRACEPOINTS
739 487
740/* NB: reg/unreg are called while guarded with the tracepoints_mutex */ 488/* NB: reg/unreg are called while guarded with the tracepoints_mutex */
diff --git a/kernel/user.c b/kernel/user.c
index c006131beb77..294fc6a94168 100644
--- a/kernel/user.c
+++ b/kernel/user.c
@@ -222,5 +222,4 @@ static int __init uid_cache_init(void)
222 222
223 return 0; 223 return 0;
224} 224}
225 225subsys_initcall(uid_cache_init);
226module_init(uid_cache_init);
diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c
index dd06439b9c84..bf71b4b2d632 100644
--- a/kernel/user_namespace.c
+++ b/kernel/user_namespace.c
@@ -152,7 +152,7 @@ static u32 map_id_range_down(struct uid_gid_map *map, u32 id, u32 count)
152 152
153 /* Find the matching extent */ 153 /* Find the matching extent */
154 extents = map->nr_extents; 154 extents = map->nr_extents;
155 smp_read_barrier_depends(); 155 smp_rmb();
156 for (idx = 0; idx < extents; idx++) { 156 for (idx = 0; idx < extents; idx++) {
157 first = map->extent[idx].first; 157 first = map->extent[idx].first;
158 last = first + map->extent[idx].count - 1; 158 last = first + map->extent[idx].count - 1;
@@ -176,7 +176,7 @@ static u32 map_id_down(struct uid_gid_map *map, u32 id)
176 176
177 /* Find the matching extent */ 177 /* Find the matching extent */
178 extents = map->nr_extents; 178 extents = map->nr_extents;
179 smp_read_barrier_depends(); 179 smp_rmb();
180 for (idx = 0; idx < extents; idx++) { 180 for (idx = 0; idx < extents; idx++) {
181 first = map->extent[idx].first; 181 first = map->extent[idx].first;
182 last = first + map->extent[idx].count - 1; 182 last = first + map->extent[idx].count - 1;
@@ -199,7 +199,7 @@ static u32 map_id_up(struct uid_gid_map *map, u32 id)
199 199
200 /* Find the matching extent */ 200 /* Find the matching extent */
201 extents = map->nr_extents; 201 extents = map->nr_extents;
202 smp_read_barrier_depends(); 202 smp_rmb();
203 for (idx = 0; idx < extents; idx++) { 203 for (idx = 0; idx < extents; idx++) {
204 first = map->extent[idx].lower_first; 204 first = map->extent[idx].lower_first;
205 last = first + map->extent[idx].count - 1; 205 last = first + map->extent[idx].count - 1;
@@ -615,9 +615,8 @@ static ssize_t map_write(struct file *file, const char __user *buf,
615 * were written before the count of the extents. 615 * were written before the count of the extents.
616 * 616 *
617 * To achieve this smp_wmb() is used on guarantee the write 617 * To achieve this smp_wmb() is used on guarantee the write
618 * order and smp_read_barrier_depends() is guaranteed that we 618 * order and smp_rmb() is guaranteed that we don't have crazy
619 * don't have crazy architectures returning stale data. 619 * architectures returning stale data.
620 *
621 */ 620 */
622 mutex_lock(&id_map_mutex); 621 mutex_lock(&id_map_mutex);
623 622
@@ -902,4 +901,4 @@ static __init int user_namespaces_init(void)
902 user_ns_cachep = KMEM_CACHE(user_namespace, SLAB_PANIC); 901 user_ns_cachep = KMEM_CACHE(user_namespace, SLAB_PANIC);
903 return 0; 902 return 0;
904} 903}
905module_init(user_namespaces_init); 904subsys_initcall(user_namespaces_init);
diff --git a/kernel/watchdog.c b/kernel/watchdog.c
index 01c6f979486f..516203e665fc 100644
--- a/kernel/watchdog.c
+++ b/kernel/watchdog.c
@@ -138,7 +138,11 @@ static void __touch_watchdog(void)
138 138
139void touch_softlockup_watchdog(void) 139void touch_softlockup_watchdog(void)
140{ 140{
141 __this_cpu_write(watchdog_touch_ts, 0); 141 /*
142 * Preemption can be enabled. It doesn't matter which CPU's timestamp
143 * gets zeroed here, so use the raw_ operation.
144 */
145 raw_cpu_write(watchdog_touch_ts, 0);
142} 146}
143EXPORT_SYMBOL(touch_softlockup_watchdog); 147EXPORT_SYMBOL(touch_softlockup_watchdog);
144 148
@@ -158,14 +162,14 @@ void touch_all_softlockup_watchdogs(void)
158#ifdef CONFIG_HARDLOCKUP_DETECTOR 162#ifdef CONFIG_HARDLOCKUP_DETECTOR
159void touch_nmi_watchdog(void) 163void touch_nmi_watchdog(void)
160{ 164{
161 if (watchdog_user_enabled) { 165 /*
162 unsigned cpu; 166 * Using __raw here because some code paths have
163 167 * preemption enabled. If preemption is enabled
164 for_each_present_cpu(cpu) { 168 * then interrupts should be enabled too, in which
165 if (per_cpu(watchdog_nmi_touch, cpu) != true) 169 * case we shouldn't have to worry about the watchdog
166 per_cpu(watchdog_nmi_touch, cpu) = true; 170 * going off.
167 } 171 */
168 } 172 __raw_get_cpu_var(watchdog_nmi_touch) = true;
169 touch_softlockup_watchdog(); 173 touch_softlockup_watchdog();
170} 174}
171EXPORT_SYMBOL(touch_nmi_watchdog); 175EXPORT_SYMBOL(touch_nmi_watchdog);