aboutsummaryrefslogtreecommitdiffstats
path: root/kernel
diff options
context:
space:
mode:
Diffstat (limited to 'kernel')
-rw-r--r--kernel/acct.c10
-rw-r--r--kernel/audit.c2
-rw-r--r--kernel/cgroup.c693
-rw-r--r--kernel/exit.c2
-rw-r--r--kernel/fork.c47
-rw-r--r--kernel/irq/chip.c2
-rw-r--r--kernel/irq/devres.c4
-rw-r--r--kernel/ksysfs.c2
-rw-r--r--kernel/lockdep.c1
-rw-r--r--kernel/nsproxy.c13
-rw-r--r--kernel/params.c6
-rw-r--r--kernel/pid.c4
-rw-r--r--kernel/pid_namespace.c7
-rw-r--r--kernel/rcutree.h21
-rw-r--r--kernel/rcutree_plugin.h8
-rw-r--r--kernel/sched.c4
-rw-r--r--kernel/sched_cpupri.c2
-rw-r--r--kernel/sched_fair.c2
-rw-r--r--kernel/sched_rt.c7
-rw-r--r--kernel/sys.c67
-rw-r--r--kernel/sys_ni.c1
-rw-r--r--kernel/sysctl.c37
-rw-r--r--kernel/time/clocksource.c4
-rw-r--r--kernel/trace/ftrace.c30
-rw-r--r--kernel/trace/ring_buffer.c16
-rw-r--r--kernel/trace/trace.c49
-rw-r--r--kernel/trace/trace.h5
-rw-r--r--kernel/trace/trace_clock.c1
-rw-r--r--kernel/trace/trace_event_perf.c4
-rw-r--r--kernel/trace/trace_functions_graph.c27
30 files changed, 889 insertions, 189 deletions
diff --git a/kernel/acct.c b/kernel/acct.c
index a6605ca921b6..24f8c81fc48d 100644
--- a/kernel/acct.c
+++ b/kernel/acct.c
@@ -588,16 +588,6 @@ out:
588} 588}
589 589
590/** 590/**
591 * acct_init_pacct - initialize a new pacct_struct
592 * @pacct: per-process accounting info struct to initialize
593 */
594void acct_init_pacct(struct pacct_struct *pacct)
595{
596 memset(pacct, 0, sizeof(struct pacct_struct));
597 pacct->ac_utime = pacct->ac_stime = cputime_zero;
598}
599
600/**
601 * acct_collect - collect accounting information into pacct_struct 591 * acct_collect - collect accounting information into pacct_struct
602 * @exitcode: task exit code 592 * @exitcode: task exit code
603 * @group_dead: not 0, if this thread is the last one in the process. 593 * @group_dead: not 0, if this thread is the last one in the process.
diff --git a/kernel/audit.c b/kernel/audit.c
index 5feed232be9d..78f7f86aa238 100644
--- a/kernel/audit.c
+++ b/kernel/audit.c
@@ -398,7 +398,7 @@ static void kauditd_send_skb(struct sk_buff *skb)
398 skb_get(skb); 398 skb_get(skb);
399 err = netlink_unicast(audit_sock, skb, audit_nlk_pid, 0); 399 err = netlink_unicast(audit_sock, skb, audit_nlk_pid, 0);
400 if (err < 0) { 400 if (err < 0) {
401 BUG_ON(err != -ECONNREFUSED); /* Shoudn't happen */ 401 BUG_ON(err != -ECONNREFUSED); /* Shouldn't happen */
402 printk(KERN_ERR "audit: *NO* daemon at audit_pid=%d\n", audit_pid); 402 printk(KERN_ERR "audit: *NO* daemon at audit_pid=%d\n", audit_pid);
403 audit_log_lost("auditd dissapeared\n"); 403 audit_log_lost("auditd dissapeared\n");
404 audit_pid = 0; 404 audit_pid = 0;
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 4fd90e129772..ef909a329750 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -4,6 +4,10 @@
4 * Based originally on the cpuset system, extracted by Paul Menage 4 * Based originally on the cpuset system, extracted by Paul Menage
5 * Copyright (C) 2006 Google, Inc 5 * Copyright (C) 2006 Google, Inc
6 * 6 *
7 * Notifications support
8 * Copyright (C) 2009 Nokia Corporation
9 * Author: Kirill A. Shutemov
10 *
7 * Copyright notices from the original cpuset code: 11 * Copyright notices from the original cpuset code:
8 * -------------------------------------------------- 12 * --------------------------------------------------
9 * Copyright (C) 2003 BULL SA. 13 * Copyright (C) 2003 BULL SA.
@@ -44,6 +48,7 @@
44#include <linux/string.h> 48#include <linux/string.h>
45#include <linux/sort.h> 49#include <linux/sort.h>
46#include <linux/kmod.h> 50#include <linux/kmod.h>
51#include <linux/module.h>
47#include <linux/delayacct.h> 52#include <linux/delayacct.h>
48#include <linux/cgroupstats.h> 53#include <linux/cgroupstats.h>
49#include <linux/hash.h> 54#include <linux/hash.h>
@@ -52,15 +57,21 @@
52#include <linux/pid_namespace.h> 57#include <linux/pid_namespace.h>
53#include <linux/idr.h> 58#include <linux/idr.h>
54#include <linux/vmalloc.h> /* TODO: replace with more sophisticated array */ 59#include <linux/vmalloc.h> /* TODO: replace with more sophisticated array */
60#include <linux/eventfd.h>
61#include <linux/poll.h>
55 62
56#include <asm/atomic.h> 63#include <asm/atomic.h>
57 64
58static DEFINE_MUTEX(cgroup_mutex); 65static DEFINE_MUTEX(cgroup_mutex);
59 66
60/* Generate an array of cgroup subsystem pointers */ 67/*
68 * Generate an array of cgroup subsystem pointers. At boot time, this is
69 * populated up to CGROUP_BUILTIN_SUBSYS_COUNT, and modular subsystems are
70 * registered after that. The mutable section of this array is protected by
71 * cgroup_mutex.
72 */
61#define SUBSYS(_x) &_x ## _subsys, 73#define SUBSYS(_x) &_x ## _subsys,
62 74static struct cgroup_subsys *subsys[CGROUP_SUBSYS_COUNT] = {
63static struct cgroup_subsys *subsys[] = {
64#include <linux/cgroup_subsys.h> 75#include <linux/cgroup_subsys.h>
65}; 76};
66 77
@@ -147,6 +158,35 @@ struct css_id {
147 unsigned short stack[0]; /* Array of Length (depth+1) */ 158 unsigned short stack[0]; /* Array of Length (depth+1) */
148}; 159};
149 160
161/*
162 * cgroup_event represents events which userspace want to recieve.
163 */
164struct cgroup_event {
165 /*
166 * Cgroup which the event belongs to.
167 */
168 struct cgroup *cgrp;
169 /*
170 * Control file which the event associated.
171 */
172 struct cftype *cft;
173 /*
174 * eventfd to signal userspace about the event.
175 */
176 struct eventfd_ctx *eventfd;
177 /*
178 * Each of these stored in a list by the cgroup.
179 */
180 struct list_head list;
181 /*
182 * All fields below needed to unregister event when
183 * userspace closes eventfd.
184 */
185 poll_table pt;
186 wait_queue_head_t *wqh;
187 wait_queue_t wait;
188 struct work_struct remove;
189};
150 190
151/* The list of hierarchy roots */ 191/* The list of hierarchy roots */
152 192
@@ -250,7 +290,8 @@ struct cg_cgroup_link {
250static struct css_set init_css_set; 290static struct css_set init_css_set;
251static struct cg_cgroup_link init_css_set_link; 291static struct cg_cgroup_link init_css_set_link;
252 292
253static int cgroup_subsys_init_idr(struct cgroup_subsys *ss); 293static int cgroup_init_idr(struct cgroup_subsys *ss,
294 struct cgroup_subsys_state *css);
254 295
255/* css_set_lock protects the list of css_set objects, and the 296/* css_set_lock protects the list of css_set objects, and the
256 * chain of tasks off each css_set. Nests outside task->alloc_lock 297 * chain of tasks off each css_set. Nests outside task->alloc_lock
@@ -448,8 +489,11 @@ static struct css_set *find_existing_css_set(
448 struct hlist_node *node; 489 struct hlist_node *node;
449 struct css_set *cg; 490 struct css_set *cg;
450 491
451 /* Built the set of subsystem state objects that we want to 492 /*
452 * see in the new css_set */ 493 * Build the set of subsystem state objects that we want to see in the
494 * new css_set. while subsystems can change globally, the entries here
495 * won't change, so no need for locking.
496 */
453 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { 497 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
454 if (root->subsys_bits & (1UL << i)) { 498 if (root->subsys_bits & (1UL << i)) {
455 /* Subsystem is in this hierarchy. So we want 499 /* Subsystem is in this hierarchy. So we want
@@ -696,6 +740,7 @@ void cgroup_lock(void)
696{ 740{
697 mutex_lock(&cgroup_mutex); 741 mutex_lock(&cgroup_mutex);
698} 742}
743EXPORT_SYMBOL_GPL(cgroup_lock);
699 744
700/** 745/**
701 * cgroup_unlock - release lock on cgroup changes 746 * cgroup_unlock - release lock on cgroup changes
@@ -706,6 +751,7 @@ void cgroup_unlock(void)
706{ 751{
707 mutex_unlock(&cgroup_mutex); 752 mutex_unlock(&cgroup_mutex);
708} 753}
754EXPORT_SYMBOL_GPL(cgroup_unlock);
709 755
710/* 756/*
711 * A couple of forward declarations required, due to cyclic reference loop: 757 * A couple of forward declarations required, due to cyclic reference loop:
@@ -757,6 +803,7 @@ static int cgroup_call_pre_destroy(struct cgroup *cgrp)
757 if (ret) 803 if (ret)
758 break; 804 break;
759 } 805 }
806
760 return ret; 807 return ret;
761} 808}
762 809
@@ -884,7 +931,11 @@ void cgroup_release_and_wakeup_rmdir(struct cgroup_subsys_state *css)
884 css_put(css); 931 css_put(css);
885} 932}
886 933
887 934/*
935 * Call with cgroup_mutex held. Drops reference counts on modules, including
936 * any duplicate ones that parse_cgroupfs_options took. If this function
937 * returns an error, no reference counts are touched.
938 */
888static int rebind_subsystems(struct cgroupfs_root *root, 939static int rebind_subsystems(struct cgroupfs_root *root,
889 unsigned long final_bits) 940 unsigned long final_bits)
890{ 941{
@@ -892,6 +943,8 @@ static int rebind_subsystems(struct cgroupfs_root *root,
892 struct cgroup *cgrp = &root->top_cgroup; 943 struct cgroup *cgrp = &root->top_cgroup;
893 int i; 944 int i;
894 945
946 BUG_ON(!mutex_is_locked(&cgroup_mutex));
947
895 removed_bits = root->actual_subsys_bits & ~final_bits; 948 removed_bits = root->actual_subsys_bits & ~final_bits;
896 added_bits = final_bits & ~root->actual_subsys_bits; 949 added_bits = final_bits & ~root->actual_subsys_bits;
897 /* Check that any added subsystems are currently free */ 950 /* Check that any added subsystems are currently free */
@@ -900,6 +953,12 @@ static int rebind_subsystems(struct cgroupfs_root *root,
900 struct cgroup_subsys *ss = subsys[i]; 953 struct cgroup_subsys *ss = subsys[i];
901 if (!(bit & added_bits)) 954 if (!(bit & added_bits))
902 continue; 955 continue;
956 /*
957 * Nobody should tell us to do a subsys that doesn't exist:
958 * parse_cgroupfs_options should catch that case and refcounts
959 * ensure that subsystems won't disappear once selected.
960 */
961 BUG_ON(ss == NULL);
903 if (ss->root != &rootnode) { 962 if (ss->root != &rootnode) {
904 /* Subsystem isn't free */ 963 /* Subsystem isn't free */
905 return -EBUSY; 964 return -EBUSY;
@@ -919,6 +978,7 @@ static int rebind_subsystems(struct cgroupfs_root *root,
919 unsigned long bit = 1UL << i; 978 unsigned long bit = 1UL << i;
920 if (bit & added_bits) { 979 if (bit & added_bits) {
921 /* We're binding this subsystem to this hierarchy */ 980 /* We're binding this subsystem to this hierarchy */
981 BUG_ON(ss == NULL);
922 BUG_ON(cgrp->subsys[i]); 982 BUG_ON(cgrp->subsys[i]);
923 BUG_ON(!dummytop->subsys[i]); 983 BUG_ON(!dummytop->subsys[i]);
924 BUG_ON(dummytop->subsys[i]->cgroup != dummytop); 984 BUG_ON(dummytop->subsys[i]->cgroup != dummytop);
@@ -930,8 +990,10 @@ static int rebind_subsystems(struct cgroupfs_root *root,
930 if (ss->bind) 990 if (ss->bind)
931 ss->bind(ss, cgrp); 991 ss->bind(ss, cgrp);
932 mutex_unlock(&ss->hierarchy_mutex); 992 mutex_unlock(&ss->hierarchy_mutex);
993 /* refcount was already taken, and we're keeping it */
933 } else if (bit & removed_bits) { 994 } else if (bit & removed_bits) {
934 /* We're removing this subsystem */ 995 /* We're removing this subsystem */
996 BUG_ON(ss == NULL);
935 BUG_ON(cgrp->subsys[i] != dummytop->subsys[i]); 997 BUG_ON(cgrp->subsys[i] != dummytop->subsys[i]);
936 BUG_ON(cgrp->subsys[i]->cgroup != cgrp); 998 BUG_ON(cgrp->subsys[i]->cgroup != cgrp);
937 mutex_lock(&ss->hierarchy_mutex); 999 mutex_lock(&ss->hierarchy_mutex);
@@ -942,9 +1004,20 @@ static int rebind_subsystems(struct cgroupfs_root *root,
942 subsys[i]->root = &rootnode; 1004 subsys[i]->root = &rootnode;
943 list_move(&ss->sibling, &rootnode.subsys_list); 1005 list_move(&ss->sibling, &rootnode.subsys_list);
944 mutex_unlock(&ss->hierarchy_mutex); 1006 mutex_unlock(&ss->hierarchy_mutex);
1007 /* subsystem is now free - drop reference on module */
1008 module_put(ss->module);
945 } else if (bit & final_bits) { 1009 } else if (bit & final_bits) {
946 /* Subsystem state should already exist */ 1010 /* Subsystem state should already exist */
1011 BUG_ON(ss == NULL);
947 BUG_ON(!cgrp->subsys[i]); 1012 BUG_ON(!cgrp->subsys[i]);
1013 /*
1014 * a refcount was taken, but we already had one, so
1015 * drop the extra reference.
1016 */
1017 module_put(ss->module);
1018#ifdef CONFIG_MODULE_UNLOAD
1019 BUG_ON(ss->module && !module_refcount(ss->module));
1020#endif
948 } else { 1021 } else {
949 /* Subsystem state shouldn't exist */ 1022 /* Subsystem state shouldn't exist */
950 BUG_ON(cgrp->subsys[i]); 1023 BUG_ON(cgrp->subsys[i]);
@@ -986,13 +1059,20 @@ struct cgroup_sb_opts {
986 1059
987}; 1060};
988 1061
989/* Convert a hierarchy specifier into a bitmask of subsystems and 1062/*
990 * flags. */ 1063 * Convert a hierarchy specifier into a bitmask of subsystems and flags. Call
991static int parse_cgroupfs_options(char *data, 1064 * with cgroup_mutex held to protect the subsys[] array. This function takes
992 struct cgroup_sb_opts *opts) 1065 * refcounts on subsystems to be used, unless it returns error, in which case
1066 * no refcounts are taken.
1067 */
1068static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts)
993{ 1069{
994 char *token, *o = data ?: "all"; 1070 char *token, *o = data ?: "all";
995 unsigned long mask = (unsigned long)-1; 1071 unsigned long mask = (unsigned long)-1;
1072 int i;
1073 bool module_pin_failed = false;
1074
1075 BUG_ON(!mutex_is_locked(&cgroup_mutex));
996 1076
997#ifdef CONFIG_CPUSETS 1077#ifdef CONFIG_CPUSETS
998 mask = ~(1UL << cpuset_subsys_id); 1078 mask = ~(1UL << cpuset_subsys_id);
@@ -1005,10 +1085,11 @@ static int parse_cgroupfs_options(char *data,
1005 return -EINVAL; 1085 return -EINVAL;
1006 if (!strcmp(token, "all")) { 1086 if (!strcmp(token, "all")) {
1007 /* Add all non-disabled subsystems */ 1087 /* Add all non-disabled subsystems */
1008 int i;
1009 opts->subsys_bits = 0; 1088 opts->subsys_bits = 0;
1010 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { 1089 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
1011 struct cgroup_subsys *ss = subsys[i]; 1090 struct cgroup_subsys *ss = subsys[i];
1091 if (ss == NULL)
1092 continue;
1012 if (!ss->disabled) 1093 if (!ss->disabled)
1013 opts->subsys_bits |= 1ul << i; 1094 opts->subsys_bits |= 1ul << i;
1014 } 1095 }
@@ -1026,7 +1107,6 @@ static int parse_cgroupfs_options(char *data,
1026 if (!opts->release_agent) 1107 if (!opts->release_agent)
1027 return -ENOMEM; 1108 return -ENOMEM;
1028 } else if (!strncmp(token, "name=", 5)) { 1109 } else if (!strncmp(token, "name=", 5)) {
1029 int i;
1030 const char *name = token + 5; 1110 const char *name = token + 5;
1031 /* Can't specify an empty name */ 1111 /* Can't specify an empty name */
1032 if (!strlen(name)) 1112 if (!strlen(name))
@@ -1050,9 +1130,10 @@ static int parse_cgroupfs_options(char *data,
1050 return -ENOMEM; 1130 return -ENOMEM;
1051 } else { 1131 } else {
1052 struct cgroup_subsys *ss; 1132 struct cgroup_subsys *ss;
1053 int i;
1054 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { 1133 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
1055 ss = subsys[i]; 1134 ss = subsys[i];
1135 if (ss == NULL)
1136 continue;
1056 if (!strcmp(token, ss->name)) { 1137 if (!strcmp(token, ss->name)) {
1057 if (!ss->disabled) 1138 if (!ss->disabled)
1058 set_bit(i, &opts->subsys_bits); 1139 set_bit(i, &opts->subsys_bits);
@@ -1087,9 +1168,54 @@ static int parse_cgroupfs_options(char *data,
1087 if (!opts->subsys_bits && !opts->name) 1168 if (!opts->subsys_bits && !opts->name)
1088 return -EINVAL; 1169 return -EINVAL;
1089 1170
1171 /*
1172 * Grab references on all the modules we'll need, so the subsystems
1173 * don't dance around before rebind_subsystems attaches them. This may
1174 * take duplicate reference counts on a subsystem that's already used,
1175 * but rebind_subsystems handles this case.
1176 */
1177 for (i = CGROUP_BUILTIN_SUBSYS_COUNT; i < CGROUP_SUBSYS_COUNT; i++) {
1178 unsigned long bit = 1UL << i;
1179
1180 if (!(bit & opts->subsys_bits))
1181 continue;
1182 if (!try_module_get(subsys[i]->module)) {
1183 module_pin_failed = true;
1184 break;
1185 }
1186 }
1187 if (module_pin_failed) {
1188 /*
1189 * oops, one of the modules was going away. this means that we
1190 * raced with a module_delete call, and to the user this is
1191 * essentially a "subsystem doesn't exist" case.
1192 */
1193 for (i--; i >= CGROUP_BUILTIN_SUBSYS_COUNT; i--) {
1194 /* drop refcounts only on the ones we took */
1195 unsigned long bit = 1UL << i;
1196
1197 if (!(bit & opts->subsys_bits))
1198 continue;
1199 module_put(subsys[i]->module);
1200 }
1201 return -ENOENT;
1202 }
1203
1090 return 0; 1204 return 0;
1091} 1205}
1092 1206
1207static void drop_parsed_module_refcounts(unsigned long subsys_bits)
1208{
1209 int i;
1210 for (i = CGROUP_BUILTIN_SUBSYS_COUNT; i < CGROUP_SUBSYS_COUNT; i++) {
1211 unsigned long bit = 1UL << i;
1212
1213 if (!(bit & subsys_bits))
1214 continue;
1215 module_put(subsys[i]->module);
1216 }
1217}
1218
1093static int cgroup_remount(struct super_block *sb, int *flags, char *data) 1219static int cgroup_remount(struct super_block *sb, int *flags, char *data)
1094{ 1220{
1095 int ret = 0; 1221 int ret = 0;
@@ -1106,21 +1232,19 @@ static int cgroup_remount(struct super_block *sb, int *flags, char *data)
1106 if (ret) 1232 if (ret)
1107 goto out_unlock; 1233 goto out_unlock;
1108 1234
1109 /* Don't allow flags to change at remount */ 1235 /* Don't allow flags or name to change at remount */
1110 if (opts.flags != root->flags) { 1236 if (opts.flags != root->flags ||
1111 ret = -EINVAL; 1237 (opts.name && strcmp(opts.name, root->name))) {
1112 goto out_unlock;
1113 }
1114
1115 /* Don't allow name to change at remount */
1116 if (opts.name && strcmp(opts.name, root->name)) {
1117 ret = -EINVAL; 1238 ret = -EINVAL;
1239 drop_parsed_module_refcounts(opts.subsys_bits);
1118 goto out_unlock; 1240 goto out_unlock;
1119 } 1241 }
1120 1242
1121 ret = rebind_subsystems(root, opts.subsys_bits); 1243 ret = rebind_subsystems(root, opts.subsys_bits);
1122 if (ret) 1244 if (ret) {
1245 drop_parsed_module_refcounts(opts.subsys_bits);
1123 goto out_unlock; 1246 goto out_unlock;
1247 }
1124 1248
1125 /* (re)populate subsystem files */ 1249 /* (re)populate subsystem files */
1126 cgroup_populate_dir(cgrp); 1250 cgroup_populate_dir(cgrp);
@@ -1151,6 +1275,8 @@ static void init_cgroup_housekeeping(struct cgroup *cgrp)
1151 INIT_LIST_HEAD(&cgrp->release_list); 1275 INIT_LIST_HEAD(&cgrp->release_list);
1152 INIT_LIST_HEAD(&cgrp->pidlists); 1276 INIT_LIST_HEAD(&cgrp->pidlists);
1153 mutex_init(&cgrp->pidlist_mutex); 1277 mutex_init(&cgrp->pidlist_mutex);
1278 INIT_LIST_HEAD(&cgrp->event_list);
1279 spin_lock_init(&cgrp->event_list_lock);
1154} 1280}
1155 1281
1156static void init_cgroup_root(struct cgroupfs_root *root) 1282static void init_cgroup_root(struct cgroupfs_root *root)
@@ -1306,7 +1432,9 @@ static int cgroup_get_sb(struct file_system_type *fs_type,
1306 struct cgroupfs_root *new_root; 1432 struct cgroupfs_root *new_root;
1307 1433
1308 /* First find the desired set of subsystems */ 1434 /* First find the desired set of subsystems */
1435 mutex_lock(&cgroup_mutex);
1309 ret = parse_cgroupfs_options(data, &opts); 1436 ret = parse_cgroupfs_options(data, &opts);
1437 mutex_unlock(&cgroup_mutex);
1310 if (ret) 1438 if (ret)
1311 goto out_err; 1439 goto out_err;
1312 1440
@@ -1317,7 +1445,7 @@ static int cgroup_get_sb(struct file_system_type *fs_type,
1317 new_root = cgroup_root_from_opts(&opts); 1445 new_root = cgroup_root_from_opts(&opts);
1318 if (IS_ERR(new_root)) { 1446 if (IS_ERR(new_root)) {
1319 ret = PTR_ERR(new_root); 1447 ret = PTR_ERR(new_root);
1320 goto out_err; 1448 goto drop_modules;
1321 } 1449 }
1322 opts.new_root = new_root; 1450 opts.new_root = new_root;
1323 1451
@@ -1326,7 +1454,7 @@ static int cgroup_get_sb(struct file_system_type *fs_type,
1326 if (IS_ERR(sb)) { 1454 if (IS_ERR(sb)) {
1327 ret = PTR_ERR(sb); 1455 ret = PTR_ERR(sb);
1328 cgroup_drop_root(opts.new_root); 1456 cgroup_drop_root(opts.new_root);
1329 goto out_err; 1457 goto drop_modules;
1330 } 1458 }
1331 1459
1332 root = sb->s_fs_info; 1460 root = sb->s_fs_info;
@@ -1382,6 +1510,11 @@ static int cgroup_get_sb(struct file_system_type *fs_type,
1382 free_cg_links(&tmp_cg_links); 1510 free_cg_links(&tmp_cg_links);
1383 goto drop_new_super; 1511 goto drop_new_super;
1384 } 1512 }
1513 /*
1514 * There must be no failure case after here, since rebinding
1515 * takes care of subsystems' refcounts, which are explicitly
1516 * dropped in the failure exit path.
1517 */
1385 1518
1386 /* EBUSY should be the only error here */ 1519 /* EBUSY should be the only error here */
1387 BUG_ON(ret); 1520 BUG_ON(ret);
@@ -1420,6 +1553,8 @@ static int cgroup_get_sb(struct file_system_type *fs_type,
1420 * any) is not needed 1553 * any) is not needed
1421 */ 1554 */
1422 cgroup_drop_root(opts.new_root); 1555 cgroup_drop_root(opts.new_root);
1556 /* no subsys rebinding, so refcounts don't change */
1557 drop_parsed_module_refcounts(opts.subsys_bits);
1423 } 1558 }
1424 1559
1425 simple_set_mnt(mnt, sb); 1560 simple_set_mnt(mnt, sb);
@@ -1429,6 +1564,8 @@ static int cgroup_get_sb(struct file_system_type *fs_type,
1429 1564
1430 drop_new_super: 1565 drop_new_super:
1431 deactivate_locked_super(sb); 1566 deactivate_locked_super(sb);
1567 drop_modules:
1568 drop_parsed_module_refcounts(opts.subsys_bits);
1432 out_err: 1569 out_err:
1433 kfree(opts.release_agent); 1570 kfree(opts.release_agent);
1434 kfree(opts.name); 1571 kfree(opts.name);
@@ -1542,6 +1679,7 @@ int cgroup_path(const struct cgroup *cgrp, char *buf, int buflen)
1542 memmove(buf, start, buf + buflen - start); 1679 memmove(buf, start, buf + buflen - start);
1543 return 0; 1680 return 0;
1544} 1681}
1682EXPORT_SYMBOL_GPL(cgroup_path);
1545 1683
1546/** 1684/**
1547 * cgroup_attach_task - attach task 'tsk' to cgroup 'cgrp' 1685 * cgroup_attach_task - attach task 'tsk' to cgroup 'cgrp'
@@ -1554,7 +1692,7 @@ int cgroup_path(const struct cgroup *cgrp, char *buf, int buflen)
1554int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk) 1692int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk)
1555{ 1693{
1556 int retval = 0; 1694 int retval = 0;
1557 struct cgroup_subsys *ss; 1695 struct cgroup_subsys *ss, *failed_ss = NULL;
1558 struct cgroup *oldcgrp; 1696 struct cgroup *oldcgrp;
1559 struct css_set *cg; 1697 struct css_set *cg;
1560 struct css_set *newcg; 1698 struct css_set *newcg;
@@ -1568,8 +1706,16 @@ int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk)
1568 for_each_subsys(root, ss) { 1706 for_each_subsys(root, ss) {
1569 if (ss->can_attach) { 1707 if (ss->can_attach) {
1570 retval = ss->can_attach(ss, cgrp, tsk, false); 1708 retval = ss->can_attach(ss, cgrp, tsk, false);
1571 if (retval) 1709 if (retval) {
1572 return retval; 1710 /*
1711 * Remember on which subsystem the can_attach()
1712 * failed, so that we only call cancel_attach()
1713 * against the subsystems whose can_attach()
1714 * succeeded. (See below)
1715 */
1716 failed_ss = ss;
1717 goto out;
1718 }
1573 } 1719 }
1574 } 1720 }
1575 1721
@@ -1583,14 +1729,17 @@ int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk)
1583 */ 1729 */
1584 newcg = find_css_set(cg, cgrp); 1730 newcg = find_css_set(cg, cgrp);
1585 put_css_set(cg); 1731 put_css_set(cg);
1586 if (!newcg) 1732 if (!newcg) {
1587 return -ENOMEM; 1733 retval = -ENOMEM;
1734 goto out;
1735 }
1588 1736
1589 task_lock(tsk); 1737 task_lock(tsk);
1590 if (tsk->flags & PF_EXITING) { 1738 if (tsk->flags & PF_EXITING) {
1591 task_unlock(tsk); 1739 task_unlock(tsk);
1592 put_css_set(newcg); 1740 put_css_set(newcg);
1593 return -ESRCH; 1741 retval = -ESRCH;
1742 goto out;
1594 } 1743 }
1595 rcu_assign_pointer(tsk->cgroups, newcg); 1744 rcu_assign_pointer(tsk->cgroups, newcg);
1596 task_unlock(tsk); 1745 task_unlock(tsk);
@@ -1616,7 +1765,22 @@ int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk)
1616 * is no longer empty. 1765 * is no longer empty.
1617 */ 1766 */
1618 cgroup_wakeup_rmdir_waiter(cgrp); 1767 cgroup_wakeup_rmdir_waiter(cgrp);
1619 return 0; 1768out:
1769 if (retval) {
1770 for_each_subsys(root, ss) {
1771 if (ss == failed_ss)
1772 /*
1773 * This subsystem was the one that failed the
1774 * can_attach() check earlier, so we don't need
1775 * to call cancel_attach() against it or any
1776 * remaining subsystems.
1777 */
1778 break;
1779 if (ss->cancel_attach)
1780 ss->cancel_attach(ss, cgrp, tsk, false);
1781 }
1782 }
1783 return retval;
1620} 1784}
1621 1785
1622/* 1786/*
@@ -1682,6 +1846,7 @@ bool cgroup_lock_live_group(struct cgroup *cgrp)
1682 } 1846 }
1683 return true; 1847 return true;
1684} 1848}
1849EXPORT_SYMBOL_GPL(cgroup_lock_live_group);
1685 1850
1686static int cgroup_release_agent_write(struct cgroup *cgrp, struct cftype *cft, 1851static int cgroup_release_agent_write(struct cgroup *cgrp, struct cftype *cft,
1687 const char *buffer) 1852 const char *buffer)
@@ -1950,6 +2115,16 @@ static const struct inode_operations cgroup_dir_inode_operations = {
1950 .rename = cgroup_rename, 2115 .rename = cgroup_rename,
1951}; 2116};
1952 2117
2118/*
2119 * Check if a file is a control file
2120 */
2121static inline struct cftype *__file_cft(struct file *file)
2122{
2123 if (file->f_dentry->d_inode->i_fop != &cgroup_file_operations)
2124 return ERR_PTR(-EINVAL);
2125 return __d_cft(file->f_dentry);
2126}
2127
1953static int cgroup_create_file(struct dentry *dentry, mode_t mode, 2128static int cgroup_create_file(struct dentry *dentry, mode_t mode,
1954 struct super_block *sb) 2129 struct super_block *sb)
1955{ 2130{
@@ -2069,6 +2244,7 @@ int cgroup_add_file(struct cgroup *cgrp,
2069 error = PTR_ERR(dentry); 2244 error = PTR_ERR(dentry);
2070 return error; 2245 return error;
2071} 2246}
2247EXPORT_SYMBOL_GPL(cgroup_add_file);
2072 2248
2073int cgroup_add_files(struct cgroup *cgrp, 2249int cgroup_add_files(struct cgroup *cgrp,
2074 struct cgroup_subsys *subsys, 2250 struct cgroup_subsys *subsys,
@@ -2083,6 +2259,7 @@ int cgroup_add_files(struct cgroup *cgrp,
2083 } 2259 }
2084 return 0; 2260 return 0;
2085} 2261}
2262EXPORT_SYMBOL_GPL(cgroup_add_files);
2086 2263
2087/** 2264/**
2088 * cgroup_task_count - count the number of tasks in a cgroup. 2265 * cgroup_task_count - count the number of tasks in a cgroup.
@@ -2468,7 +2645,8 @@ static struct cgroup_pidlist *cgroup_pidlist_find(struct cgroup *cgrp,
2468{ 2645{
2469 struct cgroup_pidlist *l; 2646 struct cgroup_pidlist *l;
2470 /* don't need task_nsproxy() if we're looking at ourself */ 2647 /* don't need task_nsproxy() if we're looking at ourself */
2471 struct pid_namespace *ns = get_pid_ns(current->nsproxy->pid_ns); 2648 struct pid_namespace *ns = current->nsproxy->pid_ns;
2649
2472 /* 2650 /*
2473 * We can't drop the pidlist_mutex before taking the l->mutex in case 2651 * We can't drop the pidlist_mutex before taking the l->mutex in case
2474 * the last ref-holder is trying to remove l from the list at the same 2652 * the last ref-holder is trying to remove l from the list at the same
@@ -2478,8 +2656,6 @@ static struct cgroup_pidlist *cgroup_pidlist_find(struct cgroup *cgrp,
2478 mutex_lock(&cgrp->pidlist_mutex); 2656 mutex_lock(&cgrp->pidlist_mutex);
2479 list_for_each_entry(l, &cgrp->pidlists, links) { 2657 list_for_each_entry(l, &cgrp->pidlists, links) {
2480 if (l->key.type == type && l->key.ns == ns) { 2658 if (l->key.type == type && l->key.ns == ns) {
2481 /* found a matching list - drop the extra refcount */
2482 put_pid_ns(ns);
2483 /* make sure l doesn't vanish out from under us */ 2659 /* make sure l doesn't vanish out from under us */
2484 down_write(&l->mutex); 2660 down_write(&l->mutex);
2485 mutex_unlock(&cgrp->pidlist_mutex); 2661 mutex_unlock(&cgrp->pidlist_mutex);
@@ -2490,13 +2666,12 @@ static struct cgroup_pidlist *cgroup_pidlist_find(struct cgroup *cgrp,
2490 l = kmalloc(sizeof(struct cgroup_pidlist), GFP_KERNEL); 2666 l = kmalloc(sizeof(struct cgroup_pidlist), GFP_KERNEL);
2491 if (!l) { 2667 if (!l) {
2492 mutex_unlock(&cgrp->pidlist_mutex); 2668 mutex_unlock(&cgrp->pidlist_mutex);
2493 put_pid_ns(ns);
2494 return l; 2669 return l;
2495 } 2670 }
2496 init_rwsem(&l->mutex); 2671 init_rwsem(&l->mutex);
2497 down_write(&l->mutex); 2672 down_write(&l->mutex);
2498 l->key.type = type; 2673 l->key.type = type;
2499 l->key.ns = ns; 2674 l->key.ns = get_pid_ns(ns);
2500 l->use_count = 0; /* don't increment here */ 2675 l->use_count = 0; /* don't increment here */
2501 l->list = NULL; 2676 l->list = NULL;
2502 l->owner = cgrp; 2677 l->owner = cgrp;
@@ -2804,6 +2979,174 @@ static int cgroup_write_notify_on_release(struct cgroup *cgrp,
2804} 2979}
2805 2980
2806/* 2981/*
2982 * Unregister event and free resources.
2983 *
2984 * Gets called from workqueue.
2985 */
2986static void cgroup_event_remove(struct work_struct *work)
2987{
2988 struct cgroup_event *event = container_of(work, struct cgroup_event,
2989 remove);
2990 struct cgroup *cgrp = event->cgrp;
2991
2992 /* TODO: check return code */
2993 event->cft->unregister_event(cgrp, event->cft, event->eventfd);
2994
2995 eventfd_ctx_put(event->eventfd);
2996 kfree(event);
2997 dput(cgrp->dentry);
2998}
2999
3000/*
3001 * Gets called on POLLHUP on eventfd when user closes it.
3002 *
3003 * Called with wqh->lock held and interrupts disabled.
3004 */
3005static int cgroup_event_wake(wait_queue_t *wait, unsigned mode,
3006 int sync, void *key)
3007{
3008 struct cgroup_event *event = container_of(wait,
3009 struct cgroup_event, wait);
3010 struct cgroup *cgrp = event->cgrp;
3011 unsigned long flags = (unsigned long)key;
3012
3013 if (flags & POLLHUP) {
3014 remove_wait_queue_locked(event->wqh, &event->wait);
3015 spin_lock(&cgrp->event_list_lock);
3016 list_del(&event->list);
3017 spin_unlock(&cgrp->event_list_lock);
3018 /*
3019 * We are in atomic context, but cgroup_event_remove() may
3020 * sleep, so we have to call it in workqueue.
3021 */
3022 schedule_work(&event->remove);
3023 }
3024
3025 return 0;
3026}
3027
3028static void cgroup_event_ptable_queue_proc(struct file *file,
3029 wait_queue_head_t *wqh, poll_table *pt)
3030{
3031 struct cgroup_event *event = container_of(pt,
3032 struct cgroup_event, pt);
3033
3034 event->wqh = wqh;
3035 add_wait_queue(wqh, &event->wait);
3036}
3037
3038/*
3039 * Parse input and register new cgroup event handler.
3040 *
3041 * Input must be in format '<event_fd> <control_fd> <args>'.
3042 * Interpretation of args is defined by control file implementation.
3043 */
3044static int cgroup_write_event_control(struct cgroup *cgrp, struct cftype *cft,
3045 const char *buffer)
3046{
3047 struct cgroup_event *event = NULL;
3048 unsigned int efd, cfd;
3049 struct file *efile = NULL;
3050 struct file *cfile = NULL;
3051 char *endp;
3052 int ret;
3053
3054 efd = simple_strtoul(buffer, &endp, 10);
3055 if (*endp != ' ')
3056 return -EINVAL;
3057 buffer = endp + 1;
3058
3059 cfd = simple_strtoul(buffer, &endp, 10);
3060 if ((*endp != ' ') && (*endp != '\0'))
3061 return -EINVAL;
3062 buffer = endp + 1;
3063
3064 event = kzalloc(sizeof(*event), GFP_KERNEL);
3065 if (!event)
3066 return -ENOMEM;
3067 event->cgrp = cgrp;
3068 INIT_LIST_HEAD(&event->list);
3069 init_poll_funcptr(&event->pt, cgroup_event_ptable_queue_proc);
3070 init_waitqueue_func_entry(&event->wait, cgroup_event_wake);
3071 INIT_WORK(&event->remove, cgroup_event_remove);
3072
3073 efile = eventfd_fget(efd);
3074 if (IS_ERR(efile)) {
3075 ret = PTR_ERR(efile);
3076 goto fail;
3077 }
3078
3079 event->eventfd = eventfd_ctx_fileget(efile);
3080 if (IS_ERR(event->eventfd)) {
3081 ret = PTR_ERR(event->eventfd);
3082 goto fail;
3083 }
3084
3085 cfile = fget(cfd);
3086 if (!cfile) {
3087 ret = -EBADF;
3088 goto fail;
3089 }
3090
3091 /* the process need read permission on control file */
3092 ret = file_permission(cfile, MAY_READ);
3093 if (ret < 0)
3094 goto fail;
3095
3096 event->cft = __file_cft(cfile);
3097 if (IS_ERR(event->cft)) {
3098 ret = PTR_ERR(event->cft);
3099 goto fail;
3100 }
3101
3102 if (!event->cft->register_event || !event->cft->unregister_event) {
3103 ret = -EINVAL;
3104 goto fail;
3105 }
3106
3107 ret = event->cft->register_event(cgrp, event->cft,
3108 event->eventfd, buffer);
3109 if (ret)
3110 goto fail;
3111
3112 if (efile->f_op->poll(efile, &event->pt) & POLLHUP) {
3113 event->cft->unregister_event(cgrp, event->cft, event->eventfd);
3114 ret = 0;
3115 goto fail;
3116 }
3117
3118 /*
3119 * Events should be removed after rmdir of cgroup directory, but before
3120 * destroying subsystem state objects. Let's take reference to cgroup
3121 * directory dentry to do that.
3122 */
3123 dget(cgrp->dentry);
3124
3125 spin_lock(&cgrp->event_list_lock);
3126 list_add(&event->list, &cgrp->event_list);
3127 spin_unlock(&cgrp->event_list_lock);
3128
3129 fput(cfile);
3130 fput(efile);
3131
3132 return 0;
3133
3134fail:
3135 if (cfile)
3136 fput(cfile);
3137
3138 if (event && event->eventfd && !IS_ERR(event->eventfd))
3139 eventfd_ctx_put(event->eventfd);
3140
3141 if (!IS_ERR_OR_NULL(efile))
3142 fput(efile);
3143
3144 kfree(event);
3145
3146 return ret;
3147}
3148
3149/*
2807 * for the common functions, 'private' gives the type of file 3150 * for the common functions, 'private' gives the type of file
2808 */ 3151 */
2809/* for hysterical raisins, we can't put this on the older files */ 3152/* for hysterical raisins, we can't put this on the older files */
@@ -2828,6 +3171,11 @@ static struct cftype files[] = {
2828 .read_u64 = cgroup_read_notify_on_release, 3171 .read_u64 = cgroup_read_notify_on_release,
2829 .write_u64 = cgroup_write_notify_on_release, 3172 .write_u64 = cgroup_write_notify_on_release,
2830 }, 3173 },
3174 {
3175 .name = CGROUP_FILE_GENERIC_PREFIX "event_control",
3176 .write_string = cgroup_write_event_control,
3177 .mode = S_IWUGO,
3178 },
2831}; 3179};
2832 3180
2833static struct cftype cft_release_agent = { 3181static struct cftype cft_release_agent = {
@@ -2892,8 +3240,14 @@ static void cgroup_lock_hierarchy(struct cgroupfs_root *root)
2892 /* We need to take each hierarchy_mutex in a consistent order */ 3240 /* We need to take each hierarchy_mutex in a consistent order */
2893 int i; 3241 int i;
2894 3242
3243 /*
3244 * No worry about a race with rebind_subsystems that might mess up the
3245 * locking order, since both parties are under cgroup_mutex.
3246 */
2895 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { 3247 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
2896 struct cgroup_subsys *ss = subsys[i]; 3248 struct cgroup_subsys *ss = subsys[i];
3249 if (ss == NULL)
3250 continue;
2897 if (ss->root == root) 3251 if (ss->root == root)
2898 mutex_lock(&ss->hierarchy_mutex); 3252 mutex_lock(&ss->hierarchy_mutex);
2899 } 3253 }
@@ -2905,6 +3259,8 @@ static void cgroup_unlock_hierarchy(struct cgroupfs_root *root)
2905 3259
2906 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { 3260 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
2907 struct cgroup_subsys *ss = subsys[i]; 3261 struct cgroup_subsys *ss = subsys[i];
3262 if (ss == NULL)
3263 continue;
2908 if (ss->root == root) 3264 if (ss->root == root)
2909 mutex_unlock(&ss->hierarchy_mutex); 3265 mutex_unlock(&ss->hierarchy_mutex);
2910 } 3266 }
@@ -3028,11 +3384,16 @@ static int cgroup_has_css_refs(struct cgroup *cgrp)
3028 * synchronization other than RCU, and the subsystem linked 3384 * synchronization other than RCU, and the subsystem linked
3029 * list isn't RCU-safe */ 3385 * list isn't RCU-safe */
3030 int i; 3386 int i;
3387 /*
3388 * We won't need to lock the subsys array, because the subsystems
3389 * we're concerned about aren't going anywhere since our cgroup root
3390 * has a reference on them.
3391 */
3031 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { 3392 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
3032 struct cgroup_subsys *ss = subsys[i]; 3393 struct cgroup_subsys *ss = subsys[i];
3033 struct cgroup_subsys_state *css; 3394 struct cgroup_subsys_state *css;
3034 /* Skip subsystems not in this hierarchy */ 3395 /* Skip subsystems not present or not in this hierarchy */
3035 if (ss->root != cgrp->root) 3396 if (ss == NULL || ss->root != cgrp->root)
3036 continue; 3397 continue;
3037 css = cgrp->subsys[ss->subsys_id]; 3398 css = cgrp->subsys[ss->subsys_id];
3038 /* When called from check_for_release() it's possible 3399 /* When called from check_for_release() it's possible
@@ -3106,6 +3467,7 @@ static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry)
3106 struct dentry *d; 3467 struct dentry *d;
3107 struct cgroup *parent; 3468 struct cgroup *parent;
3108 DEFINE_WAIT(wait); 3469 DEFINE_WAIT(wait);
3470 struct cgroup_event *event, *tmp;
3109 int ret; 3471 int ret;
3110 3472
3111 /* the vfs holds both inode->i_mutex already */ 3473 /* the vfs holds both inode->i_mutex already */
@@ -3189,6 +3551,20 @@ again:
3189 set_bit(CGRP_RELEASABLE, &parent->flags); 3551 set_bit(CGRP_RELEASABLE, &parent->flags);
3190 check_for_release(parent); 3552 check_for_release(parent);
3191 3553
3554 /*
3555 * Unregister events and notify userspace.
3556 * Notify userspace about cgroup removing only after rmdir of cgroup
3557 * directory to avoid race between userspace and kernelspace
3558 */
3559 spin_lock(&cgrp->event_list_lock);
3560 list_for_each_entry_safe(event, tmp, &cgrp->event_list, list) {
3561 list_del(&event->list);
3562 remove_wait_queue(event->wqh, &event->wait);
3563 eventfd_signal(event->eventfd, 1);
3564 schedule_work(&event->remove);
3565 }
3566 spin_unlock(&cgrp->event_list_lock);
3567
3192 mutex_unlock(&cgroup_mutex); 3568 mutex_unlock(&cgroup_mutex);
3193 return 0; 3569 return 0;
3194} 3570}
@@ -3223,7 +3599,196 @@ static void __init cgroup_init_subsys(struct cgroup_subsys *ss)
3223 mutex_init(&ss->hierarchy_mutex); 3599 mutex_init(&ss->hierarchy_mutex);
3224 lockdep_set_class(&ss->hierarchy_mutex, &ss->subsys_key); 3600 lockdep_set_class(&ss->hierarchy_mutex, &ss->subsys_key);
3225 ss->active = 1; 3601 ss->active = 1;
3602
3603 /* this function shouldn't be used with modular subsystems, since they
3604 * need to register a subsys_id, among other things */
3605 BUG_ON(ss->module);
3606}
3607
3608/**
3609 * cgroup_load_subsys: load and register a modular subsystem at runtime
3610 * @ss: the subsystem to load
3611 *
3612 * This function should be called in a modular subsystem's initcall. If the
3613 * subsytem is built as a module, it will be assigned a new subsys_id and set
3614 * up for use. If the subsystem is built-in anyway, work is delegated to the
3615 * simpler cgroup_init_subsys.
3616 */
3617int __init_or_module cgroup_load_subsys(struct cgroup_subsys *ss)
3618{
3619 int i;
3620 struct cgroup_subsys_state *css;
3621
3622 /* check name and function validity */
3623 if (ss->name == NULL || strlen(ss->name) > MAX_CGROUP_TYPE_NAMELEN ||
3624 ss->create == NULL || ss->destroy == NULL)
3625 return -EINVAL;
3626
3627 /*
3628 * we don't support callbacks in modular subsystems. this check is
3629 * before the ss->module check for consistency; a subsystem that could
3630 * be a module should still have no callbacks even if the user isn't
3631 * compiling it as one.
3632 */
3633 if (ss->fork || ss->exit)
3634 return -EINVAL;
3635
3636 /*
3637 * an optionally modular subsystem is built-in: we want to do nothing,
3638 * since cgroup_init_subsys will have already taken care of it.
3639 */
3640 if (ss->module == NULL) {
3641 /* a few sanity checks */
3642 BUG_ON(ss->subsys_id >= CGROUP_BUILTIN_SUBSYS_COUNT);
3643 BUG_ON(subsys[ss->subsys_id] != ss);
3644 return 0;
3645 }
3646
3647 /*
3648 * need to register a subsys id before anything else - for example,
3649 * init_cgroup_css needs it.
3650 */
3651 mutex_lock(&cgroup_mutex);
3652 /* find the first empty slot in the array */
3653 for (i = CGROUP_BUILTIN_SUBSYS_COUNT; i < CGROUP_SUBSYS_COUNT; i++) {
3654 if (subsys[i] == NULL)
3655 break;
3656 }
3657 if (i == CGROUP_SUBSYS_COUNT) {
3658 /* maximum number of subsystems already registered! */
3659 mutex_unlock(&cgroup_mutex);
3660 return -EBUSY;
3661 }
3662 /* assign ourselves the subsys_id */
3663 ss->subsys_id = i;
3664 subsys[i] = ss;
3665
3666 /*
3667 * no ss->create seems to need anything important in the ss struct, so
3668 * this can happen first (i.e. before the rootnode attachment).
3669 */
3670 css = ss->create(ss, dummytop);
3671 if (IS_ERR(css)) {
3672 /* failure case - need to deassign the subsys[] slot. */
3673 subsys[i] = NULL;
3674 mutex_unlock(&cgroup_mutex);
3675 return PTR_ERR(css);
3676 }
3677
3678 list_add(&ss->sibling, &rootnode.subsys_list);
3679 ss->root = &rootnode;
3680
3681 /* our new subsystem will be attached to the dummy hierarchy. */
3682 init_cgroup_css(css, ss, dummytop);
3683 /* init_idr must be after init_cgroup_css because it sets css->id. */
3684 if (ss->use_id) {
3685 int ret = cgroup_init_idr(ss, css);
3686 if (ret) {
3687 dummytop->subsys[ss->subsys_id] = NULL;
3688 ss->destroy(ss, dummytop);
3689 subsys[i] = NULL;
3690 mutex_unlock(&cgroup_mutex);
3691 return ret;
3692 }
3693 }
3694
3695 /*
3696 * Now we need to entangle the css into the existing css_sets. unlike
3697 * in cgroup_init_subsys, there are now multiple css_sets, so each one
3698 * will need a new pointer to it; done by iterating the css_set_table.
3699 * furthermore, modifying the existing css_sets will corrupt the hash
3700 * table state, so each changed css_set will need its hash recomputed.
3701 * this is all done under the css_set_lock.
3702 */
3703 write_lock(&css_set_lock);
3704 for (i = 0; i < CSS_SET_TABLE_SIZE; i++) {
3705 struct css_set *cg;
3706 struct hlist_node *node, *tmp;
3707 struct hlist_head *bucket = &css_set_table[i], *new_bucket;
3708
3709 hlist_for_each_entry_safe(cg, node, tmp, bucket, hlist) {
3710 /* skip entries that we already rehashed */
3711 if (cg->subsys[ss->subsys_id])
3712 continue;
3713 /* remove existing entry */
3714 hlist_del(&cg->hlist);
3715 /* set new value */
3716 cg->subsys[ss->subsys_id] = css;
3717 /* recompute hash and restore entry */
3718 new_bucket = css_set_hash(cg->subsys);
3719 hlist_add_head(&cg->hlist, new_bucket);
3720 }
3721 }
3722 write_unlock(&css_set_lock);
3723
3724 mutex_init(&ss->hierarchy_mutex);
3725 lockdep_set_class(&ss->hierarchy_mutex, &ss->subsys_key);
3726 ss->active = 1;
3727
3728 /* success! */
3729 mutex_unlock(&cgroup_mutex);
3730 return 0;
3226} 3731}
3732EXPORT_SYMBOL_GPL(cgroup_load_subsys);
3733
3734/**
3735 * cgroup_unload_subsys: unload a modular subsystem
3736 * @ss: the subsystem to unload
3737 *
3738 * This function should be called in a modular subsystem's exitcall. When this
3739 * function is invoked, the refcount on the subsystem's module will be 0, so
3740 * the subsystem will not be attached to any hierarchy.
3741 */
3742void cgroup_unload_subsys(struct cgroup_subsys *ss)
3743{
3744 struct cg_cgroup_link *link;
3745 struct hlist_head *hhead;
3746
3747 BUG_ON(ss->module == NULL);
3748
3749 /*
3750 * we shouldn't be called if the subsystem is in use, and the use of
3751 * try_module_get in parse_cgroupfs_options should ensure that it
3752 * doesn't start being used while we're killing it off.
3753 */
3754 BUG_ON(ss->root != &rootnode);
3755
3756 mutex_lock(&cgroup_mutex);
3757 /* deassign the subsys_id */
3758 BUG_ON(ss->subsys_id < CGROUP_BUILTIN_SUBSYS_COUNT);
3759 subsys[ss->subsys_id] = NULL;
3760
3761 /* remove subsystem from rootnode's list of subsystems */
3762 list_del(&ss->sibling);
3763
3764 /*
3765 * disentangle the css from all css_sets attached to the dummytop. as
3766 * in loading, we need to pay our respects to the hashtable gods.
3767 */
3768 write_lock(&css_set_lock);
3769 list_for_each_entry(link, &dummytop->css_sets, cgrp_link_list) {
3770 struct css_set *cg = link->cg;
3771
3772 hlist_del(&cg->hlist);
3773 BUG_ON(!cg->subsys[ss->subsys_id]);
3774 cg->subsys[ss->subsys_id] = NULL;
3775 hhead = css_set_hash(cg->subsys);
3776 hlist_add_head(&cg->hlist, hhead);
3777 }
3778 write_unlock(&css_set_lock);
3779
3780 /*
3781 * remove subsystem's css from the dummytop and free it - need to free
3782 * before marking as null because ss->destroy needs the cgrp->subsys
3783 * pointer to find their state. note that this also takes care of
3784 * freeing the css_id.
3785 */
3786 ss->destroy(ss, dummytop);
3787 dummytop->subsys[ss->subsys_id] = NULL;
3788
3789 mutex_unlock(&cgroup_mutex);
3790}
3791EXPORT_SYMBOL_GPL(cgroup_unload_subsys);
3227 3792
3228/** 3793/**
3229 * cgroup_init_early - cgroup initialization at system boot 3794 * cgroup_init_early - cgroup initialization at system boot
@@ -3253,7 +3818,8 @@ int __init cgroup_init_early(void)
3253 for (i = 0; i < CSS_SET_TABLE_SIZE; i++) 3818 for (i = 0; i < CSS_SET_TABLE_SIZE; i++)
3254 INIT_HLIST_HEAD(&css_set_table[i]); 3819 INIT_HLIST_HEAD(&css_set_table[i]);
3255 3820
3256 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { 3821 /* at bootup time, we don't worry about modular subsystems */
3822 for (i = 0; i < CGROUP_BUILTIN_SUBSYS_COUNT; i++) {
3257 struct cgroup_subsys *ss = subsys[i]; 3823 struct cgroup_subsys *ss = subsys[i];
3258 3824
3259 BUG_ON(!ss->name); 3825 BUG_ON(!ss->name);
@@ -3288,12 +3854,13 @@ int __init cgroup_init(void)
3288 if (err) 3854 if (err)
3289 return err; 3855 return err;
3290 3856
3291 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { 3857 /* at bootup time, we don't worry about modular subsystems */
3858 for (i = 0; i < CGROUP_BUILTIN_SUBSYS_COUNT; i++) {
3292 struct cgroup_subsys *ss = subsys[i]; 3859 struct cgroup_subsys *ss = subsys[i];
3293 if (!ss->early_init) 3860 if (!ss->early_init)
3294 cgroup_init_subsys(ss); 3861 cgroup_init_subsys(ss);
3295 if (ss->use_id) 3862 if (ss->use_id)
3296 cgroup_subsys_init_idr(ss); 3863 cgroup_init_idr(ss, init_css_set.subsys[ss->subsys_id]);
3297 } 3864 }
3298 3865
3299 /* Add init_css_set to the hash table */ 3866 /* Add init_css_set to the hash table */
@@ -3397,9 +3964,16 @@ static int proc_cgroupstats_show(struct seq_file *m, void *v)
3397 int i; 3964 int i;
3398 3965
3399 seq_puts(m, "#subsys_name\thierarchy\tnum_cgroups\tenabled\n"); 3966 seq_puts(m, "#subsys_name\thierarchy\tnum_cgroups\tenabled\n");
3967 /*
3968 * ideally we don't want subsystems moving around while we do this.
3969 * cgroup_mutex is also necessary to guarantee an atomic snapshot of
3970 * subsys/hierarchy state.
3971 */
3400 mutex_lock(&cgroup_mutex); 3972 mutex_lock(&cgroup_mutex);
3401 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { 3973 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
3402 struct cgroup_subsys *ss = subsys[i]; 3974 struct cgroup_subsys *ss = subsys[i];
3975 if (ss == NULL)
3976 continue;
3403 seq_printf(m, "%s\t%d\t%d\t%d\n", 3977 seq_printf(m, "%s\t%d\t%d\t%d\n",
3404 ss->name, ss->root->hierarchy_id, 3978 ss->name, ss->root->hierarchy_id,
3405 ss->root->number_of_cgroups, !ss->disabled); 3979 ss->root->number_of_cgroups, !ss->disabled);
@@ -3457,7 +4031,12 @@ void cgroup_fork_callbacks(struct task_struct *child)
3457{ 4031{
3458 if (need_forkexit_callback) { 4032 if (need_forkexit_callback) {
3459 int i; 4033 int i;
3460 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { 4034 /*
4035 * forkexit callbacks are only supported for builtin
4036 * subsystems, and the builtin section of the subsys array is
4037 * immutable, so we don't need to lock the subsys array here.
4038 */
4039 for (i = 0; i < CGROUP_BUILTIN_SUBSYS_COUNT; i++) {
3461 struct cgroup_subsys *ss = subsys[i]; 4040 struct cgroup_subsys *ss = subsys[i];
3462 if (ss->fork) 4041 if (ss->fork)
3463 ss->fork(ss, child); 4042 ss->fork(ss, child);
@@ -3526,7 +4105,11 @@ void cgroup_exit(struct task_struct *tsk, int run_callbacks)
3526 struct css_set *cg; 4105 struct css_set *cg;
3527 4106
3528 if (run_callbacks && need_forkexit_callback) { 4107 if (run_callbacks && need_forkexit_callback) {
3529 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { 4108 /*
4109 * modular subsystems can't use callbacks, so no need to lock
4110 * the subsys array
4111 */
4112 for (i = 0; i < CGROUP_BUILTIN_SUBSYS_COUNT; i++) {
3530 struct cgroup_subsys *ss = subsys[i]; 4113 struct cgroup_subsys *ss = subsys[i];
3531 if (ss->exit) 4114 if (ss->exit)
3532 ss->exit(ss, tsk); 4115 ss->exit(ss, tsk);
@@ -3720,12 +4303,13 @@ static void check_for_release(struct cgroup *cgrp)
3720 } 4303 }
3721} 4304}
3722 4305
3723void __css_put(struct cgroup_subsys_state *css) 4306/* Caller must verify that the css is not for root cgroup */
4307void __css_put(struct cgroup_subsys_state *css, int count)
3724{ 4308{
3725 struct cgroup *cgrp = css->cgroup; 4309 struct cgroup *cgrp = css->cgroup;
3726 int val; 4310 int val;
3727 rcu_read_lock(); 4311 rcu_read_lock();
3728 val = atomic_dec_return(&css->refcnt); 4312 val = atomic_sub_return(count, &css->refcnt);
3729 if (val == 1) { 4313 if (val == 1) {
3730 if (notify_on_release(cgrp)) { 4314 if (notify_on_release(cgrp)) {
3731 set_bit(CGRP_RELEASABLE, &cgrp->flags); 4315 set_bit(CGRP_RELEASABLE, &cgrp->flags);
@@ -3736,6 +4320,7 @@ void __css_put(struct cgroup_subsys_state *css)
3736 rcu_read_unlock(); 4320 rcu_read_unlock();
3737 WARN_ON_ONCE(val < 1); 4321 WARN_ON_ONCE(val < 1);
3738} 4322}
4323EXPORT_SYMBOL_GPL(__css_put);
3739 4324
3740/* 4325/*
3741 * Notify userspace when a cgroup is released, by running the 4326 * Notify userspace when a cgroup is released, by running the
@@ -3817,8 +4402,11 @@ static int __init cgroup_disable(char *str)
3817 while ((token = strsep(&str, ",")) != NULL) { 4402 while ((token = strsep(&str, ",")) != NULL) {
3818 if (!*token) 4403 if (!*token)
3819 continue; 4404 continue;
3820 4405 /*
3821 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { 4406 * cgroup_disable, being at boot time, can't know about module
4407 * subsystems, so we don't worry about them.
4408 */
4409 for (i = 0; i < CGROUP_BUILTIN_SUBSYS_COUNT; i++) {
3822 struct cgroup_subsys *ss = subsys[i]; 4410 struct cgroup_subsys *ss = subsys[i];
3823 4411
3824 if (!strcmp(token, ss->name)) { 4412 if (!strcmp(token, ss->name)) {
@@ -3848,6 +4436,7 @@ unsigned short css_id(struct cgroup_subsys_state *css)
3848 return cssid->id; 4436 return cssid->id;
3849 return 0; 4437 return 0;
3850} 4438}
4439EXPORT_SYMBOL_GPL(css_id);
3851 4440
3852unsigned short css_depth(struct cgroup_subsys_state *css) 4441unsigned short css_depth(struct cgroup_subsys_state *css)
3853{ 4442{
@@ -3857,6 +4446,7 @@ unsigned short css_depth(struct cgroup_subsys_state *css)
3857 return cssid->depth; 4446 return cssid->depth;
3858 return 0; 4447 return 0;
3859} 4448}
4449EXPORT_SYMBOL_GPL(css_depth);
3860 4450
3861bool css_is_ancestor(struct cgroup_subsys_state *child, 4451bool css_is_ancestor(struct cgroup_subsys_state *child,
3862 const struct cgroup_subsys_state *root) 4452 const struct cgroup_subsys_state *root)
@@ -3893,6 +4483,7 @@ void free_css_id(struct cgroup_subsys *ss, struct cgroup_subsys_state *css)
3893 spin_unlock(&ss->id_lock); 4483 spin_unlock(&ss->id_lock);
3894 call_rcu(&id->rcu_head, __free_css_id_cb); 4484 call_rcu(&id->rcu_head, __free_css_id_cb);
3895} 4485}
4486EXPORT_SYMBOL_GPL(free_css_id);
3896 4487
3897/* 4488/*
3898 * This is called by init or create(). Then, calls to this function are 4489 * This is called by init or create(). Then, calls to this function are
@@ -3942,15 +4533,14 @@ err_out:
3942 4533
3943} 4534}
3944 4535
3945static int __init cgroup_subsys_init_idr(struct cgroup_subsys *ss) 4536static int __init_or_module cgroup_init_idr(struct cgroup_subsys *ss,
4537 struct cgroup_subsys_state *rootcss)
3946{ 4538{
3947 struct css_id *newid; 4539 struct css_id *newid;
3948 struct cgroup_subsys_state *rootcss;
3949 4540
3950 spin_lock_init(&ss->id_lock); 4541 spin_lock_init(&ss->id_lock);
3951 idr_init(&ss->idr); 4542 idr_init(&ss->idr);
3952 4543
3953 rootcss = init_css_set.subsys[ss->subsys_id];
3954 newid = get_new_cssid(ss, 0); 4544 newid = get_new_cssid(ss, 0);
3955 if (IS_ERR(newid)) 4545 if (IS_ERR(newid))
3956 return PTR_ERR(newid); 4546 return PTR_ERR(newid);
@@ -4010,6 +4600,7 @@ struct cgroup_subsys_state *css_lookup(struct cgroup_subsys *ss, int id)
4010 4600
4011 return rcu_dereference(cssid->css); 4601 return rcu_dereference(cssid->css);
4012} 4602}
4603EXPORT_SYMBOL_GPL(css_lookup);
4013 4604
4014/** 4605/**
4015 * css_get_next - lookup next cgroup under specified hierarchy. 4606 * css_get_next - lookup next cgroup under specified hierarchy.
diff --git a/kernel/exit.c b/kernel/exit.c
index ce1e48c2d93d..cce59cb5ee6a 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -87,7 +87,7 @@ static void __exit_signal(struct task_struct *tsk)
87 87
88 sighand = rcu_dereference_check(tsk->sighand, 88 sighand = rcu_dereference_check(tsk->sighand,
89 rcu_read_lock_held() || 89 rcu_read_lock_held() ||
90 lockdep_is_held(&tasklist_lock)); 90 lockdep_tasklist_lock_is_held());
91 spin_lock(&sighand->siglock); 91 spin_lock(&sighand->siglock);
92 92
93 posix_cpu_timers_exit(tsk); 93 posix_cpu_timers_exit(tsk);
diff --git a/kernel/fork.c b/kernel/fork.c
index b0ec34abc0bb..4799c5f0e6d0 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -86,7 +86,14 @@ int max_threads; /* tunable limit on nr_threads */
86DEFINE_PER_CPU(unsigned long, process_counts) = 0; 86DEFINE_PER_CPU(unsigned long, process_counts) = 0;
87 87
88__cacheline_aligned DEFINE_RWLOCK(tasklist_lock); /* outer */ 88__cacheline_aligned DEFINE_RWLOCK(tasklist_lock); /* outer */
89EXPORT_SYMBOL_GPL(tasklist_lock); 89
90#ifdef CONFIG_PROVE_RCU
91int lockdep_tasklist_lock_is_held(void)
92{
93 return lockdep_is_held(&tasklist_lock);
94}
95EXPORT_SYMBOL_GPL(lockdep_tasklist_lock_is_held);
96#endif /* #ifdef CONFIG_PROVE_RCU */
90 97
91int nr_processes(void) 98int nr_processes(void)
92{ 99{
@@ -833,17 +840,6 @@ static void posix_cpu_timers_init_group(struct signal_struct *sig)
833 /* Thread group counters. */ 840 /* Thread group counters. */
834 thread_group_cputime_init(sig); 841 thread_group_cputime_init(sig);
835 842
836 /* Expiration times and increments. */
837 sig->it[CPUCLOCK_PROF].expires = cputime_zero;
838 sig->it[CPUCLOCK_PROF].incr = cputime_zero;
839 sig->it[CPUCLOCK_VIRT].expires = cputime_zero;
840 sig->it[CPUCLOCK_VIRT].incr = cputime_zero;
841
842 /* Cached expiration times. */
843 sig->cputime_expires.prof_exp = cputime_zero;
844 sig->cputime_expires.virt_exp = cputime_zero;
845 sig->cputime_expires.sched_exp = 0;
846
847 cpu_limit = ACCESS_ONCE(sig->rlim[RLIMIT_CPU].rlim_cur); 843 cpu_limit = ACCESS_ONCE(sig->rlim[RLIMIT_CPU].rlim_cur);
848 if (cpu_limit != RLIM_INFINITY) { 844 if (cpu_limit != RLIM_INFINITY) {
849 sig->cputime_expires.prof_exp = secs_to_cputime(cpu_limit); 845 sig->cputime_expires.prof_exp = secs_to_cputime(cpu_limit);
@@ -863,7 +859,7 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk)
863 if (clone_flags & CLONE_THREAD) 859 if (clone_flags & CLONE_THREAD)
864 return 0; 860 return 0;
865 861
866 sig = kmem_cache_alloc(signal_cachep, GFP_KERNEL); 862 sig = kmem_cache_zalloc(signal_cachep, GFP_KERNEL);
867 tsk->signal = sig; 863 tsk->signal = sig;
868 if (!sig) 864 if (!sig)
869 return -ENOMEM; 865 return -ENOMEM;
@@ -871,46 +867,21 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk)
871 atomic_set(&sig->count, 1); 867 atomic_set(&sig->count, 1);
872 atomic_set(&sig->live, 1); 868 atomic_set(&sig->live, 1);
873 init_waitqueue_head(&sig->wait_chldexit); 869 init_waitqueue_head(&sig->wait_chldexit);
874 sig->flags = 0;
875 if (clone_flags & CLONE_NEWPID) 870 if (clone_flags & CLONE_NEWPID)
876 sig->flags |= SIGNAL_UNKILLABLE; 871 sig->flags |= SIGNAL_UNKILLABLE;
877 sig->group_exit_code = 0;
878 sig->group_exit_task = NULL;
879 sig->group_stop_count = 0;
880 sig->curr_target = tsk; 872 sig->curr_target = tsk;
881 init_sigpending(&sig->shared_pending); 873 init_sigpending(&sig->shared_pending);
882 INIT_LIST_HEAD(&sig->posix_timers); 874 INIT_LIST_HEAD(&sig->posix_timers);
883 875
884 hrtimer_init(&sig->real_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); 876 hrtimer_init(&sig->real_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
885 sig->it_real_incr.tv64 = 0;
886 sig->real_timer.function = it_real_fn; 877 sig->real_timer.function = it_real_fn;
887 878
888 sig->leader = 0; /* session leadership doesn't inherit */
889 sig->tty_old_pgrp = NULL;
890 sig->tty = NULL;
891
892 sig->utime = sig->stime = sig->cutime = sig->cstime = cputime_zero;
893 sig->gtime = cputime_zero;
894 sig->cgtime = cputime_zero;
895#ifndef CONFIG_VIRT_CPU_ACCOUNTING
896 sig->prev_utime = sig->prev_stime = cputime_zero;
897#endif
898 sig->nvcsw = sig->nivcsw = sig->cnvcsw = sig->cnivcsw = 0;
899 sig->min_flt = sig->maj_flt = sig->cmin_flt = sig->cmaj_flt = 0;
900 sig->inblock = sig->oublock = sig->cinblock = sig->coublock = 0;
901 sig->maxrss = sig->cmaxrss = 0;
902 task_io_accounting_init(&sig->ioac);
903 sig->sum_sched_runtime = 0;
904 taskstats_tgid_init(sig);
905
906 task_lock(current->group_leader); 879 task_lock(current->group_leader);
907 memcpy(sig->rlim, current->signal->rlim, sizeof sig->rlim); 880 memcpy(sig->rlim, current->signal->rlim, sizeof sig->rlim);
908 task_unlock(current->group_leader); 881 task_unlock(current->group_leader);
909 882
910 posix_cpu_timers_init_group(sig); 883 posix_cpu_timers_init_group(sig);
911 884
912 acct_init_pacct(&sig->pacct);
913
914 tty_audit_fork(sig); 885 tty_audit_fork(sig);
915 886
916 sig->oom_adj = current->signal->oom_adj; 887 sig->oom_adj = current->signal->oom_adj;
diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c
index d70394f12ee9..42ec11b2af8a 100644
--- a/kernel/irq/chip.c
+++ b/kernel/irq/chip.c
@@ -554,7 +554,7 @@ out:
554 * signal. The occurence is latched into the irq controller hardware 554 * signal. The occurence is latched into the irq controller hardware
555 * and must be acked in order to be reenabled. After the ack another 555 * and must be acked in order to be reenabled. After the ack another
556 * interrupt can happen on the same source even before the first one 556 * interrupt can happen on the same source even before the first one
557 * is handled by the assosiacted event handler. If this happens it 557 * is handled by the associated event handler. If this happens it
558 * might be necessary to disable (mask) the interrupt depending on the 558 * might be necessary to disable (mask) the interrupt depending on the
559 * controller hardware. This requires to reenable the interrupt inside 559 * controller hardware. This requires to reenable the interrupt inside
560 * of the loop which handles the interrupts which have arrived while 560 * of the loop which handles the interrupts which have arrived while
diff --git a/kernel/irq/devres.c b/kernel/irq/devres.c
index d06df9c41cba..1ef4ffcdfa55 100644
--- a/kernel/irq/devres.c
+++ b/kernel/irq/devres.c
@@ -42,7 +42,7 @@ static int devm_irq_match(struct device *dev, void *res, void *data)
42 * automatically freed on driver detach. 42 * automatically freed on driver detach.
43 * 43 *
44 * If an IRQ allocated with this function needs to be freed 44 * If an IRQ allocated with this function needs to be freed
45 * separately, dev_free_irq() must be used. 45 * separately, devm_free_irq() must be used.
46 */ 46 */
47int devm_request_threaded_irq(struct device *dev, unsigned int irq, 47int devm_request_threaded_irq(struct device *dev, unsigned int irq,
48 irq_handler_t handler, irq_handler_t thread_fn, 48 irq_handler_t handler, irq_handler_t thread_fn,
@@ -81,7 +81,7 @@ EXPORT_SYMBOL(devm_request_threaded_irq);
81 * Except for the extra @dev argument, this function takes the 81 * Except for the extra @dev argument, this function takes the
82 * same arguments and performs the same function as free_irq(). 82 * same arguments and performs the same function as free_irq().
83 * This function instead of free_irq() should be used to manually 83 * This function instead of free_irq() should be used to manually
84 * free IRQs allocated with dev_request_irq(). 84 * free IRQs allocated with devm_request_irq().
85 */ 85 */
86void devm_free_irq(struct device *dev, unsigned int irq, void *dev_id) 86void devm_free_irq(struct device *dev, unsigned int irq, void *dev_id)
87{ 87{
diff --git a/kernel/ksysfs.c b/kernel/ksysfs.c
index 6b1ccc3f0205..21fe3c426948 100644
--- a/kernel/ksysfs.c
+++ b/kernel/ksysfs.c
@@ -33,7 +33,7 @@ static ssize_t uevent_seqnum_show(struct kobject *kobj,
33} 33}
34KERNEL_ATTR_RO(uevent_seqnum); 34KERNEL_ATTR_RO(uevent_seqnum);
35 35
36/* uevent helper program, used during early boo */ 36/* uevent helper program, used during early boot */
37static ssize_t uevent_helper_show(struct kobject *kobj, 37static ssize_t uevent_helper_show(struct kobject *kobj,
38 struct kobj_attribute *attr, char *buf) 38 struct kobj_attribute *attr, char *buf)
39{ 39{
diff --git a/kernel/lockdep.c b/kernel/lockdep.c
index 65b5f5b7c298..c927a549db2c 100644
--- a/kernel/lockdep.c
+++ b/kernel/lockdep.c
@@ -3819,6 +3819,7 @@ void lockdep_rcu_dereference(const char *file, const int line)
3819 printk("%s:%d invoked rcu_dereference_check() without protection!\n", 3819 printk("%s:%d invoked rcu_dereference_check() without protection!\n",
3820 file, line); 3820 file, line);
3821 printk("\nother info that might help us debug this:\n\n"); 3821 printk("\nother info that might help us debug this:\n\n");
3822 printk("\nrcu_scheduler_active = %d, debug_locks = %d\n", rcu_scheduler_active, debug_locks);
3822 lockdep_print_held_locks(curr); 3823 lockdep_print_held_locks(curr);
3823 printk("\nstack backtrace:\n"); 3824 printk("\nstack backtrace:\n");
3824 dump_stack(); 3825 dump_stack();
diff --git a/kernel/nsproxy.c b/kernel/nsproxy.c
index 09b4ff9711b2..2ab67233ee8f 100644
--- a/kernel/nsproxy.c
+++ b/kernel/nsproxy.c
@@ -24,7 +24,18 @@
24 24
25static struct kmem_cache *nsproxy_cachep; 25static struct kmem_cache *nsproxy_cachep;
26 26
27struct nsproxy init_nsproxy = INIT_NSPROXY(init_nsproxy); 27struct nsproxy init_nsproxy = {
28 .count = ATOMIC_INIT(1),
29 .uts_ns = &init_uts_ns,
30#if defined(CONFIG_POSIX_MQUEUE) || defined(CONFIG_SYSVIPC)
31 .ipc_ns = &init_ipc_ns,
32#endif
33 .mnt_ns = NULL,
34 .pid_ns = &init_pid_ns,
35#ifdef CONFIG_NET
36 .net_ns = &init_net,
37#endif
38};
28 39
29static inline struct nsproxy *create_nsproxy(void) 40static inline struct nsproxy *create_nsproxy(void)
30{ 41{
diff --git a/kernel/params.c b/kernel/params.c
index d55a53ec9234..0b30ecd53a52 100644
--- a/kernel/params.c
+++ b/kernel/params.c
@@ -401,8 +401,8 @@ int param_get_string(char *buffer, struct kernel_param *kp)
401} 401}
402 402
403/* sysfs output in /sys/modules/XYZ/parameters/ */ 403/* sysfs output in /sys/modules/XYZ/parameters/ */
404#define to_module_attr(n) container_of(n, struct module_attribute, attr); 404#define to_module_attr(n) container_of(n, struct module_attribute, attr)
405#define to_module_kobject(n) container_of(n, struct module_kobject, kobj); 405#define to_module_kobject(n) container_of(n, struct module_kobject, kobj)
406 406
407extern struct kernel_param __start___param[], __stop___param[]; 407extern struct kernel_param __start___param[], __stop___param[];
408 408
@@ -420,7 +420,7 @@ struct module_param_attrs
420}; 420};
421 421
422#ifdef CONFIG_SYSFS 422#ifdef CONFIG_SYSFS
423#define to_param_attr(n) container_of(n, struct param_attribute, mattr); 423#define to_param_attr(n) container_of(n, struct param_attribute, mattr)
424 424
425static ssize_t param_attr_show(struct module_attribute *mattr, 425static ssize_t param_attr_show(struct module_attribute *mattr,
426 struct module *mod, char *buf) 426 struct module *mod, char *buf)
diff --git a/kernel/pid.c b/kernel/pid.c
index 86b296943e5f..aebb30d9c233 100644
--- a/kernel/pid.c
+++ b/kernel/pid.c
@@ -367,7 +367,9 @@ struct task_struct *pid_task(struct pid *pid, enum pid_type type)
367 struct task_struct *result = NULL; 367 struct task_struct *result = NULL;
368 if (pid) { 368 if (pid) {
369 struct hlist_node *first; 369 struct hlist_node *first;
370 first = rcu_dereference_check(pid->tasks[type].first, rcu_read_lock_held() || lockdep_is_held(&tasklist_lock)); 370 first = rcu_dereference_check(pid->tasks[type].first,
371 rcu_read_lock_held() ||
372 lockdep_tasklist_lock_is_held());
371 if (first) 373 if (first)
372 result = hlist_entry(first, struct task_struct, pids[(type)].node); 374 result = hlist_entry(first, struct task_struct, pids[(type)].node);
373 } 375 }
diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c
index 86b3796b0436..79aac93acf99 100644
--- a/kernel/pid_namespace.c
+++ b/kernel/pid_namespace.c
@@ -161,13 +161,12 @@ void zap_pid_ns_processes(struct pid_namespace *pid_ns)
161 rcu_read_lock(); 161 rcu_read_lock();
162 162
163 /* 163 /*
164 * Use force_sig() since it clears SIGNAL_UNKILLABLE ensuring 164 * Any nested-container's init processes won't ignore the
165 * any nested-container's init processes don't ignore the 165 * SEND_SIG_NOINFO signal, see send_signal()->si_fromuser().
166 * signal
167 */ 166 */
168 task = pid_task(find_vpid(nr), PIDTYPE_PID); 167 task = pid_task(find_vpid(nr), PIDTYPE_PID);
169 if (task) 168 if (task)
170 force_sig(SIGKILL, task); 169 send_sig_info(SIGKILL, SEND_SIG_NOINFO, task);
171 170
172 rcu_read_unlock(); 171 rcu_read_unlock();
173 172
diff --git a/kernel/rcutree.h b/kernel/rcutree.h
index 1439eb504c22..4a525a30e08e 100644
--- a/kernel/rcutree.h
+++ b/kernel/rcutree.h
@@ -246,12 +246,21 @@ struct rcu_data {
246 246
247#define RCU_JIFFIES_TILL_FORCE_QS 3 /* for rsp->jiffies_force_qs */ 247#define RCU_JIFFIES_TILL_FORCE_QS 3 /* for rsp->jiffies_force_qs */
248#ifdef CONFIG_RCU_CPU_STALL_DETECTOR 248#ifdef CONFIG_RCU_CPU_STALL_DETECTOR
249#define RCU_SECONDS_TILL_STALL_CHECK (10 * HZ) /* for rsp->jiffies_stall */ 249
250#define RCU_SECONDS_TILL_STALL_RECHECK (30 * HZ) /* for rsp->jiffies_stall */ 250#ifdef CONFIG_PROVE_RCU
251#define RCU_STALL_RAT_DELAY 2 /* Allow other CPUs time */ 251#define RCU_STALL_DELAY_DELTA (5 * HZ)
252 /* to take at least one */ 252#else
253 /* scheduling clock irq */ 253#define RCU_STALL_DELAY_DELTA 0
254 /* before ratting on them. */ 254#endif
255
256#define RCU_SECONDS_TILL_STALL_CHECK (10 * HZ + RCU_STALL_DELAY_DELTA)
257 /* for rsp->jiffies_stall */
258#define RCU_SECONDS_TILL_STALL_RECHECK (30 * HZ + RCU_STALL_DELAY_DELTA)
259 /* for rsp->jiffies_stall */
260#define RCU_STALL_RAT_DELAY 2 /* Allow other CPUs time */
261 /* to take at least one */
262 /* scheduling clock irq */
263 /* before ratting on them. */
255 264
256#endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */ 265#endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */
257 266
diff --git a/kernel/rcutree_plugin.h b/kernel/rcutree_plugin.h
index 464ad2cdee00..79b53bda8943 100644
--- a/kernel/rcutree_plugin.h
+++ b/kernel/rcutree_plugin.h
@@ -1010,6 +1010,10 @@ int rcu_needs_cpu(int cpu)
1010 int c = 0; 1010 int c = 0;
1011 int thatcpu; 1011 int thatcpu;
1012 1012
1013 /* Check for being in the holdoff period. */
1014 if (per_cpu(rcu_dyntick_holdoff, cpu) == jiffies)
1015 return rcu_needs_cpu_quick_check(cpu);
1016
1013 /* Don't bother unless we are the last non-dyntick-idle CPU. */ 1017 /* Don't bother unless we are the last non-dyntick-idle CPU. */
1014 for_each_cpu_not(thatcpu, nohz_cpu_mask) 1018 for_each_cpu_not(thatcpu, nohz_cpu_mask)
1015 if (thatcpu != cpu) { 1019 if (thatcpu != cpu) {
@@ -1041,10 +1045,8 @@ int rcu_needs_cpu(int cpu)
1041 } 1045 }
1042 1046
1043 /* If RCU callbacks are still pending, RCU still needs this CPU. */ 1047 /* If RCU callbacks are still pending, RCU still needs this CPU. */
1044 if (c) { 1048 if (c)
1045 raise_softirq(RCU_SOFTIRQ); 1049 raise_softirq(RCU_SOFTIRQ);
1046 per_cpu(rcu_dyntick_holdoff, cpu) = jiffies;
1047 }
1048 return c; 1050 return c;
1049} 1051}
1050 1052
diff --git a/kernel/sched.c b/kernel/sched.c
index 150b6988de49..9ab3cd7858d3 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -2359,7 +2359,7 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state,
2359{ 2359{
2360 int cpu, orig_cpu, this_cpu, success = 0; 2360 int cpu, orig_cpu, this_cpu, success = 0;
2361 unsigned long flags; 2361 unsigned long flags;
2362 struct rq *rq, *orig_rq; 2362 struct rq *rq;
2363 2363
2364 if (!sched_feat(SYNC_WAKEUPS)) 2364 if (!sched_feat(SYNC_WAKEUPS))
2365 wake_flags &= ~WF_SYNC; 2365 wake_flags &= ~WF_SYNC;
@@ -2367,7 +2367,7 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state,
2367 this_cpu = get_cpu(); 2367 this_cpu = get_cpu();
2368 2368
2369 smp_wmb(); 2369 smp_wmb();
2370 rq = orig_rq = task_rq_lock(p, &flags); 2370 rq = task_rq_lock(p, &flags);
2371 update_rq_clock(rq); 2371 update_rq_clock(rq);
2372 if (!(p->state & state)) 2372 if (!(p->state & state))
2373 goto out; 2373 goto out;
diff --git a/kernel/sched_cpupri.c b/kernel/sched_cpupri.c
index 82095bf2099f..fccf9fbb0d7b 100644
--- a/kernel/sched_cpupri.c
+++ b/kernel/sched_cpupri.c
@@ -56,7 +56,7 @@ static int convert_prio(int prio)
56 * @lowest_mask: A mask to fill in with selected CPUs (or NULL) 56 * @lowest_mask: A mask to fill in with selected CPUs (or NULL)
57 * 57 *
58 * Note: This function returns the recommended CPUs as calculated during the 58 * Note: This function returns the recommended CPUs as calculated during the
59 * current invokation. By the time the call returns, the CPUs may have in 59 * current invocation. By the time the call returns, the CPUs may have in
60 * fact changed priorities any number of times. While not ideal, it is not 60 * fact changed priorities any number of times. While not ideal, it is not
61 * an issue of correctness since the normal rebalancer logic will correct 61 * an issue of correctness since the normal rebalancer logic will correct
62 * any discrepancies created by racing against the uncertainty of the current 62 * any discrepancies created by racing against the uncertainty of the current
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index 3e1fd96c6cf9..5a5ea2cd924f 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -3476,7 +3476,7 @@ static void run_rebalance_domains(struct softirq_action *h)
3476 3476
3477static inline int on_null_domain(int cpu) 3477static inline int on_null_domain(int cpu)
3478{ 3478{
3479 return !rcu_dereference(cpu_rq(cpu)->sd); 3479 return !rcu_dereference_sched(cpu_rq(cpu)->sd);
3480} 3480}
3481 3481
3482/* 3482/*
diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c
index 5a6ed1f0990a..b5b920ae2ea7 100644
--- a/kernel/sched_rt.c
+++ b/kernel/sched_rt.c
@@ -1146,7 +1146,12 @@ static struct task_struct *pick_next_highest_task_rt(struct rq *rq, int cpu)
1146 if (next && next->prio < idx) 1146 if (next && next->prio < idx)
1147 continue; 1147 continue;
1148 list_for_each_entry(rt_se, array->queue + idx, run_list) { 1148 list_for_each_entry(rt_se, array->queue + idx, run_list) {
1149 struct task_struct *p = rt_task_of(rt_se); 1149 struct task_struct *p;
1150
1151 if (!rt_entity_is_task(rt_se))
1152 continue;
1153
1154 p = rt_task_of(rt_se);
1150 if (pick_rt_task(rq, p, cpu)) { 1155 if (pick_rt_task(rq, p, cpu)) {
1151 next = p; 1156 next = p;
1152 break; 1157 break;
diff --git a/kernel/sys.c b/kernel/sys.c
index 9814e43fb23b..8298878f4f71 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -33,6 +33,7 @@
33#include <linux/task_io_accounting_ops.h> 33#include <linux/task_io_accounting_ops.h>
34#include <linux/seccomp.h> 34#include <linux/seccomp.h>
35#include <linux/cpu.h> 35#include <linux/cpu.h>
36#include <linux/personality.h>
36#include <linux/ptrace.h> 37#include <linux/ptrace.h>
37#include <linux/fs_struct.h> 38#include <linux/fs_struct.h>
38 39
@@ -1114,6 +1115,15 @@ out:
1114 1115
1115DECLARE_RWSEM(uts_sem); 1116DECLARE_RWSEM(uts_sem);
1116 1117
1118#ifdef COMPAT_UTS_MACHINE
1119#define override_architecture(name) \
1120 (current->personality == PER_LINUX32 && \
1121 copy_to_user(name->machine, COMPAT_UTS_MACHINE, \
1122 sizeof(COMPAT_UTS_MACHINE)))
1123#else
1124#define override_architecture(name) 0
1125#endif
1126
1117SYSCALL_DEFINE1(newuname, struct new_utsname __user *, name) 1127SYSCALL_DEFINE1(newuname, struct new_utsname __user *, name)
1118{ 1128{
1119 int errno = 0; 1129 int errno = 0;
@@ -1122,9 +1132,66 @@ SYSCALL_DEFINE1(newuname, struct new_utsname __user *, name)
1122 if (copy_to_user(name, utsname(), sizeof *name)) 1132 if (copy_to_user(name, utsname(), sizeof *name))
1123 errno = -EFAULT; 1133 errno = -EFAULT;
1124 up_read(&uts_sem); 1134 up_read(&uts_sem);
1135
1136 if (!errno && override_architecture(name))
1137 errno = -EFAULT;
1125 return errno; 1138 return errno;
1126} 1139}
1127 1140
1141#ifdef __ARCH_WANT_SYS_OLD_UNAME
1142/*
1143 * Old cruft
1144 */
1145SYSCALL_DEFINE1(uname, struct old_utsname __user *, name)
1146{
1147 int error = 0;
1148
1149 if (!name)
1150 return -EFAULT;
1151
1152 down_read(&uts_sem);
1153 if (copy_to_user(name, utsname(), sizeof(*name)))
1154 error = -EFAULT;
1155 up_read(&uts_sem);
1156
1157 if (!error && override_architecture(name))
1158 error = -EFAULT;
1159 return error;
1160}
1161
1162SYSCALL_DEFINE1(olduname, struct oldold_utsname __user *, name)
1163{
1164 int error;
1165
1166 if (!name)
1167 return -EFAULT;
1168 if (!access_ok(VERIFY_WRITE, name, sizeof(struct oldold_utsname)))
1169 return -EFAULT;
1170
1171 down_read(&uts_sem);
1172 error = __copy_to_user(&name->sysname, &utsname()->sysname,
1173 __OLD_UTS_LEN);
1174 error |= __put_user(0, name->sysname + __OLD_UTS_LEN);
1175 error |= __copy_to_user(&name->nodename, &utsname()->nodename,
1176 __OLD_UTS_LEN);
1177 error |= __put_user(0, name->nodename + __OLD_UTS_LEN);
1178 error |= __copy_to_user(&name->release, &utsname()->release,
1179 __OLD_UTS_LEN);
1180 error |= __put_user(0, name->release + __OLD_UTS_LEN);
1181 error |= __copy_to_user(&name->version, &utsname()->version,
1182 __OLD_UTS_LEN);
1183 error |= __put_user(0, name->version + __OLD_UTS_LEN);
1184 error |= __copy_to_user(&name->machine, &utsname()->machine,
1185 __OLD_UTS_LEN);
1186 error |= __put_user(0, name->machine + __OLD_UTS_LEN);
1187 up_read(&uts_sem);
1188
1189 if (!error && override_architecture(name))
1190 error = -EFAULT;
1191 return error ? -EFAULT : 0;
1192}
1193#endif
1194
1128SYSCALL_DEFINE2(sethostname, char __user *, name, int, len) 1195SYSCALL_DEFINE2(sethostname, char __user *, name, int, len)
1129{ 1196{
1130 int errno; 1197 int errno;
diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c
index 695384f12a7d..70f2ea758ffe 100644
--- a/kernel/sys_ni.c
+++ b/kernel/sys_ni.c
@@ -126,6 +126,7 @@ cond_syscall(sys_setreuid16);
126cond_syscall(sys_setuid16); 126cond_syscall(sys_setuid16);
127cond_syscall(sys_vm86old); 127cond_syscall(sys_vm86old);
128cond_syscall(sys_vm86); 128cond_syscall(sys_vm86);
129cond_syscall(sys_ipc);
129cond_syscall(compat_sys_ipc); 130cond_syscall(compat_sys_ipc);
130cond_syscall(compat_sys_sysctl); 131cond_syscall(compat_sys_sysctl);
131cond_syscall(sys_flock); 132cond_syscall(sys_flock);
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 0ef19c614f6d..8686b0f5fc12 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -23,6 +23,7 @@
23#include <linux/swap.h> 23#include <linux/swap.h>
24#include <linux/slab.h> 24#include <linux/slab.h>
25#include <linux/sysctl.h> 25#include <linux/sysctl.h>
26#include <linux/signal.h>
26#include <linux/proc_fs.h> 27#include <linux/proc_fs.h>
27#include <linux/security.h> 28#include <linux/security.h>
28#include <linux/ctype.h> 29#include <linux/ctype.h>
@@ -60,13 +61,23 @@
60#include <asm/stacktrace.h> 61#include <asm/stacktrace.h>
61#include <asm/io.h> 62#include <asm/io.h>
62#endif 63#endif
64#ifdef CONFIG_BSD_PROCESS_ACCT
65#include <linux/acct.h>
66#endif
67#ifdef CONFIG_RT_MUTEXES
68#include <linux/rtmutex.h>
69#endif
70#if defined(CONFIG_PROVE_LOCKING) || defined(CONFIG_LOCK_STAT)
71#include <linux/lockdep.h>
72#endif
73#ifdef CONFIG_CHR_DEV_SG
74#include <scsi/sg.h>
75#endif
63 76
64 77
65#if defined(CONFIG_SYSCTL) 78#if defined(CONFIG_SYSCTL)
66 79
67/* External variables not in a header file. */ 80/* External variables not in a header file. */
68extern int C_A_D;
69extern int print_fatal_signals;
70extern int sysctl_overcommit_memory; 81extern int sysctl_overcommit_memory;
71extern int sysctl_overcommit_ratio; 82extern int sysctl_overcommit_ratio;
72extern int sysctl_panic_on_oom; 83extern int sysctl_panic_on_oom;
@@ -88,9 +99,6 @@ extern int sysctl_nr_open_min, sysctl_nr_open_max;
88#ifndef CONFIG_MMU 99#ifndef CONFIG_MMU
89extern int sysctl_nr_trim_pages; 100extern int sysctl_nr_trim_pages;
90#endif 101#endif
91#ifdef CONFIG_RCU_TORTURE_TEST
92extern int rcutorture_runnable;
93#endif /* #ifdef CONFIG_RCU_TORTURE_TEST */
94#ifdef CONFIG_BLOCK 102#ifdef CONFIG_BLOCK
95extern int blk_iopoll_enabled; 103extern int blk_iopoll_enabled;
96#endif 104#endif
@@ -120,14 +128,6 @@ static int min_percpu_pagelist_fract = 8;
120 128
121static int ngroups_max = NGROUPS_MAX; 129static int ngroups_max = NGROUPS_MAX;
122 130
123#ifdef CONFIG_MODULES
124extern char modprobe_path[];
125extern int modules_disabled;
126#endif
127#ifdef CONFIG_CHR_DEV_SG
128extern int sg_big_buff;
129#endif
130
131#ifdef CONFIG_SPARC 131#ifdef CONFIG_SPARC
132#include <asm/system.h> 132#include <asm/system.h>
133#endif 133#endif
@@ -149,10 +149,6 @@ extern int sysctl_userprocess_debug;
149extern int spin_retry; 149extern int spin_retry;
150#endif 150#endif
151 151
152#ifdef CONFIG_BSD_PROCESS_ACCT
153extern int acct_parm[];
154#endif
155
156#ifdef CONFIG_IA64 152#ifdef CONFIG_IA64
157extern int no_unaligned_warning; 153extern int no_unaligned_warning;
158extern int unaligned_dump_stack; 154extern int unaligned_dump_stack;
@@ -160,10 +156,6 @@ extern int unaligned_dump_stack;
160 156
161extern struct ratelimit_state printk_ratelimit_state; 157extern struct ratelimit_state printk_ratelimit_state;
162 158
163#ifdef CONFIG_RT_MUTEXES
164extern int max_lock_depth;
165#endif
166
167#ifdef CONFIG_PROC_SYSCTL 159#ifdef CONFIG_PROC_SYSCTL
168static int proc_do_cad_pid(struct ctl_table *table, int write, 160static int proc_do_cad_pid(struct ctl_table *table, int write,
169 void __user *buffer, size_t *lenp, loff_t *ppos); 161 void __user *buffer, size_t *lenp, loff_t *ppos);
@@ -202,9 +194,6 @@ extern struct ctl_table epoll_table[];
202int sysctl_legacy_va_layout; 194int sysctl_legacy_va_layout;
203#endif 195#endif
204 196
205extern int prove_locking;
206extern int lock_stat;
207
208/* The default sysctl tables: */ 197/* The default sysctl tables: */
209 198
210static struct ctl_table root_table[] = { 199static struct ctl_table root_table[] = {
diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c
index 1f663d23e85e..1f5dde637457 100644
--- a/kernel/time/clocksource.c
+++ b/kernel/time/clocksource.c
@@ -592,6 +592,10 @@ static inline void clocksource_select(void) { }
592 */ 592 */
593static int __init clocksource_done_booting(void) 593static int __init clocksource_done_booting(void)
594{ 594{
595 mutex_lock(&clocksource_mutex);
596 curr_clocksource = clocksource_default_clock();
597 mutex_unlock(&clocksource_mutex);
598
595 finished_booting = 1; 599 finished_booting = 1;
596 600
597 /* 601 /*
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index 83783579378f..d9062f5cc0c0 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -27,6 +27,7 @@
27#include <linux/ctype.h> 27#include <linux/ctype.h>
28#include <linux/list.h> 28#include <linux/list.h>
29#include <linux/hash.h> 29#include <linux/hash.h>
30#include <linux/rcupdate.h>
30 31
31#include <trace/events/sched.h> 32#include <trace/events/sched.h>
32 33
@@ -84,22 +85,22 @@ ftrace_func_t ftrace_trace_function __read_mostly = ftrace_stub;
84ftrace_func_t __ftrace_trace_function __read_mostly = ftrace_stub; 85ftrace_func_t __ftrace_trace_function __read_mostly = ftrace_stub;
85ftrace_func_t ftrace_pid_function __read_mostly = ftrace_stub; 86ftrace_func_t ftrace_pid_function __read_mostly = ftrace_stub;
86 87
87#ifdef CONFIG_FUNCTION_GRAPH_TRACER 88/*
88static int ftrace_set_func(unsigned long *array, int *idx, char *buffer); 89 * Traverse the ftrace_list, invoking all entries. The reason that we
89#endif 90 * can use rcu_dereference_raw() is that elements removed from this list
90 91 * are simply leaked, so there is no need to interact with a grace-period
92 * mechanism. The rcu_dereference_raw() calls are needed to handle
93 * concurrent insertions into the ftrace_list.
94 *
95 * Silly Alpha and silly pointer-speculation compiler optimizations!
96 */
91static void ftrace_list_func(unsigned long ip, unsigned long parent_ip) 97static void ftrace_list_func(unsigned long ip, unsigned long parent_ip)
92{ 98{
93 struct ftrace_ops *op = ftrace_list; 99 struct ftrace_ops *op = rcu_dereference_raw(ftrace_list); /*see above*/
94
95 /* in case someone actually ports this to alpha! */
96 read_barrier_depends();
97 100
98 while (op != &ftrace_list_end) { 101 while (op != &ftrace_list_end) {
99 /* silly alpha */
100 read_barrier_depends();
101 op->func(ip, parent_ip); 102 op->func(ip, parent_ip);
102 op = op->next; 103 op = rcu_dereference_raw(op->next); /*see above*/
103 }; 104 };
104} 105}
105 106
@@ -154,8 +155,7 @@ static int __register_ftrace_function(struct ftrace_ops *ops)
154 * the ops->next pointer is valid before another CPU sees 155 * the ops->next pointer is valid before another CPU sees
155 * the ops pointer included into the ftrace_list. 156 * the ops pointer included into the ftrace_list.
156 */ 157 */
157 smp_wmb(); 158 rcu_assign_pointer(ftrace_list, ops);
158 ftrace_list = ops;
159 159
160 if (ftrace_enabled) { 160 if (ftrace_enabled) {
161 ftrace_func_t func; 161 ftrace_func_t func;
@@ -2276,6 +2276,8 @@ __setup("ftrace_filter=", set_ftrace_filter);
2276 2276
2277#ifdef CONFIG_FUNCTION_GRAPH_TRACER 2277#ifdef CONFIG_FUNCTION_GRAPH_TRACER
2278static char ftrace_graph_buf[FTRACE_FILTER_SIZE] __initdata; 2278static char ftrace_graph_buf[FTRACE_FILTER_SIZE] __initdata;
2279static int ftrace_set_func(unsigned long *array, int *idx, char *buffer);
2280
2279static int __init set_graph_function(char *str) 2281static int __init set_graph_function(char *str)
2280{ 2282{
2281 strlcpy(ftrace_graph_buf, str, FTRACE_FILTER_SIZE); 2283 strlcpy(ftrace_graph_buf, str, FTRACE_FILTER_SIZE);
@@ -3351,6 +3353,7 @@ void ftrace_graph_init_task(struct task_struct *t)
3351{ 3353{
3352 /* Make sure we do not use the parent ret_stack */ 3354 /* Make sure we do not use the parent ret_stack */
3353 t->ret_stack = NULL; 3355 t->ret_stack = NULL;
3356 t->curr_ret_stack = -1;
3354 3357
3355 if (ftrace_graph_active) { 3358 if (ftrace_graph_active) {
3356 struct ftrace_ret_stack *ret_stack; 3359 struct ftrace_ret_stack *ret_stack;
@@ -3360,7 +3363,6 @@ void ftrace_graph_init_task(struct task_struct *t)
3360 GFP_KERNEL); 3363 GFP_KERNEL);
3361 if (!ret_stack) 3364 if (!ret_stack)
3362 return; 3365 return;
3363 t->curr_ret_stack = -1;
3364 atomic_set(&t->tracing_graph_pause, 0); 3366 atomic_set(&t->tracing_graph_pause, 0);
3365 atomic_set(&t->trace_overrun, 0); 3367 atomic_set(&t->trace_overrun, 0);
3366 t->ftrace_timestamp = 0; 3368 t->ftrace_timestamp = 0;
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index 0287f9f52f5a..05a9f83b8819 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -2233,12 +2233,12 @@ ring_buffer_lock_reserve(struct ring_buffer *buffer, unsigned long length)
2233 if (ring_buffer_flags != RB_BUFFERS_ON) 2233 if (ring_buffer_flags != RB_BUFFERS_ON)
2234 return NULL; 2234 return NULL;
2235 2235
2236 if (atomic_read(&buffer->record_disabled))
2237 return NULL;
2238
2239 /* If we are tracing schedule, we don't want to recurse */ 2236 /* If we are tracing schedule, we don't want to recurse */
2240 resched = ftrace_preempt_disable(); 2237 resched = ftrace_preempt_disable();
2241 2238
2239 if (atomic_read(&buffer->record_disabled))
2240 goto out_nocheck;
2241
2242 if (trace_recursive_lock()) 2242 if (trace_recursive_lock())
2243 goto out_nocheck; 2243 goto out_nocheck;
2244 2244
@@ -2470,11 +2470,11 @@ int ring_buffer_write(struct ring_buffer *buffer,
2470 if (ring_buffer_flags != RB_BUFFERS_ON) 2470 if (ring_buffer_flags != RB_BUFFERS_ON)
2471 return -EBUSY; 2471 return -EBUSY;
2472 2472
2473 if (atomic_read(&buffer->record_disabled))
2474 return -EBUSY;
2475
2476 resched = ftrace_preempt_disable(); 2473 resched = ftrace_preempt_disable();
2477 2474
2475 if (atomic_read(&buffer->record_disabled))
2476 goto out;
2477
2478 cpu = raw_smp_processor_id(); 2478 cpu = raw_smp_processor_id();
2479 2479
2480 if (!cpumask_test_cpu(cpu, buffer->cpumask)) 2480 if (!cpumask_test_cpu(cpu, buffer->cpumask))
@@ -2542,7 +2542,7 @@ EXPORT_SYMBOL_GPL(ring_buffer_record_disable);
2542 * @buffer: The ring buffer to enable writes 2542 * @buffer: The ring buffer to enable writes
2543 * 2543 *
2544 * Note, multiple disables will need the same number of enables 2544 * Note, multiple disables will need the same number of enables
2545 * to truely enable the writing (much like preempt_disable). 2545 * to truly enable the writing (much like preempt_disable).
2546 */ 2546 */
2547void ring_buffer_record_enable(struct ring_buffer *buffer) 2547void ring_buffer_record_enable(struct ring_buffer *buffer)
2548{ 2548{
@@ -2578,7 +2578,7 @@ EXPORT_SYMBOL_GPL(ring_buffer_record_disable_cpu);
2578 * @cpu: The CPU to enable. 2578 * @cpu: The CPU to enable.
2579 * 2579 *
2580 * Note, multiple disables will need the same number of enables 2580 * Note, multiple disables will need the same number of enables
2581 * to truely enable the writing (much like preempt_disable). 2581 * to truly enable the writing (much like preempt_disable).
2582 */ 2582 */
2583void ring_buffer_record_enable_cpu(struct ring_buffer *buffer, int cpu) 2583void ring_buffer_record_enable_cpu(struct ring_buffer *buffer, int cpu)
2584{ 2584{
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index ed01fdba4a55..3ec2ee6f6560 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -374,6 +374,21 @@ static int __init set_buf_size(char *str)
374} 374}
375__setup("trace_buf_size=", set_buf_size); 375__setup("trace_buf_size=", set_buf_size);
376 376
377static int __init set_tracing_thresh(char *str)
378{
379 unsigned long threshhold;
380 int ret;
381
382 if (!str)
383 return 0;
384 ret = strict_strtoul(str, 0, &threshhold);
385 if (ret < 0)
386 return 0;
387 tracing_thresh = threshhold * 1000;
388 return 1;
389}
390__setup("tracing_thresh=", set_tracing_thresh);
391
377unsigned long nsecs_to_usecs(unsigned long nsecs) 392unsigned long nsecs_to_usecs(unsigned long nsecs)
378{ 393{
379 return nsecs / 1000; 394 return nsecs / 1000;
@@ -579,9 +594,10 @@ static ssize_t trace_seq_to_buffer(struct trace_seq *s, void *buf, size_t cnt)
579static arch_spinlock_t ftrace_max_lock = 594static arch_spinlock_t ftrace_max_lock =
580 (arch_spinlock_t)__ARCH_SPIN_LOCK_UNLOCKED; 595 (arch_spinlock_t)__ARCH_SPIN_LOCK_UNLOCKED;
581 596
597unsigned long __read_mostly tracing_thresh;
598
582#ifdef CONFIG_TRACER_MAX_TRACE 599#ifdef CONFIG_TRACER_MAX_TRACE
583unsigned long __read_mostly tracing_max_latency; 600unsigned long __read_mostly tracing_max_latency;
584unsigned long __read_mostly tracing_thresh;
585 601
586/* 602/*
587 * Copy the new maximum trace into the separate maximum-trace 603 * Copy the new maximum trace into the separate maximum-trace
@@ -592,7 +608,7 @@ static void
592__update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu) 608__update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu)
593{ 609{
594 struct trace_array_cpu *data = tr->data[cpu]; 610 struct trace_array_cpu *data = tr->data[cpu];
595 struct trace_array_cpu *max_data = tr->data[cpu]; 611 struct trace_array_cpu *max_data;
596 612
597 max_tr.cpu = cpu; 613 max_tr.cpu = cpu;
598 max_tr.time_start = data->preempt_timestamp; 614 max_tr.time_start = data->preempt_timestamp;
@@ -602,7 +618,7 @@ __update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu)
602 max_data->critical_start = data->critical_start; 618 max_data->critical_start = data->critical_start;
603 max_data->critical_end = data->critical_end; 619 max_data->critical_end = data->critical_end;
604 620
605 memcpy(data->comm, tsk->comm, TASK_COMM_LEN); 621 memcpy(max_data->comm, tsk->comm, TASK_COMM_LEN);
606 max_data->pid = tsk->pid; 622 max_data->pid = tsk->pid;
607 max_data->uid = task_uid(tsk); 623 max_data->uid = task_uid(tsk);
608 max_data->nice = tsk->static_prio - 20 - MAX_RT_PRIO; 624 max_data->nice = tsk->static_prio - 20 - MAX_RT_PRIO;
@@ -824,10 +840,10 @@ out:
824 mutex_unlock(&trace_types_lock); 840 mutex_unlock(&trace_types_lock);
825} 841}
826 842
827static void __tracing_reset(struct trace_array *tr, int cpu) 843static void __tracing_reset(struct ring_buffer *buffer, int cpu)
828{ 844{
829 ftrace_disable_cpu(); 845 ftrace_disable_cpu();
830 ring_buffer_reset_cpu(tr->buffer, cpu); 846 ring_buffer_reset_cpu(buffer, cpu);
831 ftrace_enable_cpu(); 847 ftrace_enable_cpu();
832} 848}
833 849
@@ -839,7 +855,7 @@ void tracing_reset(struct trace_array *tr, int cpu)
839 855
840 /* Make sure all commits have finished */ 856 /* Make sure all commits have finished */
841 synchronize_sched(); 857 synchronize_sched();
842 __tracing_reset(tr, cpu); 858 __tracing_reset(buffer, cpu);
843 859
844 ring_buffer_record_enable(buffer); 860 ring_buffer_record_enable(buffer);
845} 861}
@@ -857,7 +873,7 @@ void tracing_reset_online_cpus(struct trace_array *tr)
857 tr->time_start = ftrace_now(tr->cpu); 873 tr->time_start = ftrace_now(tr->cpu);
858 874
859 for_each_online_cpu(cpu) 875 for_each_online_cpu(cpu)
860 __tracing_reset(tr, cpu); 876 __tracing_reset(buffer, cpu);
861 877
862 ring_buffer_record_enable(buffer); 878 ring_buffer_record_enable(buffer);
863} 879}
@@ -934,6 +950,8 @@ void tracing_start(void)
934 goto out; 950 goto out;
935 } 951 }
936 952
953 /* Prevent the buffers from switching */
954 arch_spin_lock(&ftrace_max_lock);
937 955
938 buffer = global_trace.buffer; 956 buffer = global_trace.buffer;
939 if (buffer) 957 if (buffer)
@@ -943,6 +961,8 @@ void tracing_start(void)
943 if (buffer) 961 if (buffer)
944 ring_buffer_record_enable(buffer); 962 ring_buffer_record_enable(buffer);
945 963
964 arch_spin_unlock(&ftrace_max_lock);
965
946 ftrace_start(); 966 ftrace_start();
947 out: 967 out:
948 spin_unlock_irqrestore(&tracing_start_lock, flags); 968 spin_unlock_irqrestore(&tracing_start_lock, flags);
@@ -964,6 +984,9 @@ void tracing_stop(void)
964 if (trace_stop_count++) 984 if (trace_stop_count++)
965 goto out; 985 goto out;
966 986
987 /* Prevent the buffers from switching */
988 arch_spin_lock(&ftrace_max_lock);
989
967 buffer = global_trace.buffer; 990 buffer = global_trace.buffer;
968 if (buffer) 991 if (buffer)
969 ring_buffer_record_disable(buffer); 992 ring_buffer_record_disable(buffer);
@@ -972,6 +995,8 @@ void tracing_stop(void)
972 if (buffer) 995 if (buffer)
973 ring_buffer_record_disable(buffer); 996 ring_buffer_record_disable(buffer);
974 997
998 arch_spin_unlock(&ftrace_max_lock);
999
975 out: 1000 out:
976 spin_unlock_irqrestore(&tracing_start_lock, flags); 1001 spin_unlock_irqrestore(&tracing_start_lock, flags);
977} 1002}
@@ -1259,6 +1284,13 @@ ftrace_trace_userstack(struct ring_buffer *buffer, unsigned long flags, int pc)
1259 if (!(trace_flags & TRACE_ITER_USERSTACKTRACE)) 1284 if (!(trace_flags & TRACE_ITER_USERSTACKTRACE))
1260 return; 1285 return;
1261 1286
1287 /*
1288 * NMIs can not handle page faults, even with fix ups.
1289 * The save user stack can (and often does) fault.
1290 */
1291 if (unlikely(in_nmi()))
1292 return;
1293
1262 event = trace_buffer_lock_reserve(buffer, TRACE_USER_STACK, 1294 event = trace_buffer_lock_reserve(buffer, TRACE_USER_STACK,
1263 sizeof(*entry), flags, pc); 1295 sizeof(*entry), flags, pc);
1264 if (!event) 1296 if (!event)
@@ -1703,6 +1735,7 @@ static void *s_start(struct seq_file *m, loff_t *pos)
1703 1735
1704 ftrace_enable_cpu(); 1736 ftrace_enable_cpu();
1705 1737
1738 iter->leftover = 0;
1706 for (p = iter; p && l < *pos; p = s_next(m, p, &l)) 1739 for (p = iter; p && l < *pos; p = s_next(m, p, &l))
1707 ; 1740 ;
1708 1741
@@ -4248,10 +4281,10 @@ static __init int tracer_init_debugfs(void)
4248#ifdef CONFIG_TRACER_MAX_TRACE 4281#ifdef CONFIG_TRACER_MAX_TRACE
4249 trace_create_file("tracing_max_latency", 0644, d_tracer, 4282 trace_create_file("tracing_max_latency", 0644, d_tracer,
4250 &tracing_max_latency, &tracing_max_lat_fops); 4283 &tracing_max_latency, &tracing_max_lat_fops);
4284#endif
4251 4285
4252 trace_create_file("tracing_thresh", 0644, d_tracer, 4286 trace_create_file("tracing_thresh", 0644, d_tracer,
4253 &tracing_thresh, &tracing_max_lat_fops); 4287 &tracing_thresh, &tracing_max_lat_fops);
4254#endif
4255 4288
4256 trace_create_file("README", 0444, d_tracer, 4289 trace_create_file("README", 0444, d_tracer,
4257 NULL, &tracing_readme_fops); 4290 NULL, &tracing_readme_fops);
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index fd05bcaf91b0..2825ef2c0b15 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -396,9 +396,10 @@ extern int process_new_ksym_entry(char *ksymname, int op, unsigned long addr);
396 396
397extern unsigned long nsecs_to_usecs(unsigned long nsecs); 397extern unsigned long nsecs_to_usecs(unsigned long nsecs);
398 398
399extern unsigned long tracing_thresh;
400
399#ifdef CONFIG_TRACER_MAX_TRACE 401#ifdef CONFIG_TRACER_MAX_TRACE
400extern unsigned long tracing_max_latency; 402extern unsigned long tracing_max_latency;
401extern unsigned long tracing_thresh;
402 403
403void update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu); 404void update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu);
404void update_max_tr_single(struct trace_array *tr, 405void update_max_tr_single(struct trace_array *tr,
@@ -550,7 +551,7 @@ static inline int ftrace_trace_task(struct task_struct *task)
550 * struct trace_parser - servers for reading the user input separated by spaces 551 * struct trace_parser - servers for reading the user input separated by spaces
551 * @cont: set if the input is not complete - no final space char was found 552 * @cont: set if the input is not complete - no final space char was found
552 * @buffer: holds the parsed user input 553 * @buffer: holds the parsed user input
553 * @idx: user input lenght 554 * @idx: user input length
554 * @size: buffer size 555 * @size: buffer size
555 */ 556 */
556struct trace_parser { 557struct trace_parser {
diff --git a/kernel/trace/trace_clock.c b/kernel/trace/trace_clock.c
index 84a3a7ba072a..6fbfb8f417b9 100644
--- a/kernel/trace/trace_clock.c
+++ b/kernel/trace/trace_clock.c
@@ -13,6 +13,7 @@
13 * Tracer plugins will chose a default from these clocks. 13 * Tracer plugins will chose a default from these clocks.
14 */ 14 */
15#include <linux/spinlock.h> 15#include <linux/spinlock.h>
16#include <linux/irqflags.h>
16#include <linux/hardirq.h> 17#include <linux/hardirq.h>
17#include <linux/module.h> 18#include <linux/module.h>
18#include <linux/percpu.h> 19#include <linux/percpu.h>
diff --git a/kernel/trace/trace_event_perf.c b/kernel/trace/trace_event_perf.c
index 7d79a10c3cde..81f691eb3a30 100644
--- a/kernel/trace/trace_event_perf.c
+++ b/kernel/trace/trace_event_perf.c
@@ -142,9 +142,9 @@ __kprobes void *perf_trace_buf_prepare(int size, unsigned short type,
142 cpu = smp_processor_id(); 142 cpu = smp_processor_id();
143 143
144 if (in_nmi()) 144 if (in_nmi())
145 trace_buf = rcu_dereference(perf_trace_buf_nmi); 145 trace_buf = rcu_dereference_sched(perf_trace_buf_nmi);
146 else 146 else
147 trace_buf = rcu_dereference(perf_trace_buf); 147 trace_buf = rcu_dereference_sched(perf_trace_buf);
148 148
149 if (!trace_buf) 149 if (!trace_buf)
150 goto err; 150 goto err;
diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c
index 3fc2a575664f..e6989d9b44da 100644
--- a/kernel/trace/trace_functions_graph.c
+++ b/kernel/trace/trace_functions_graph.c
@@ -237,6 +237,14 @@ int trace_graph_entry(struct ftrace_graph_ent *trace)
237 return ret; 237 return ret;
238} 238}
239 239
240int trace_graph_thresh_entry(struct ftrace_graph_ent *trace)
241{
242 if (tracing_thresh)
243 return 1;
244 else
245 return trace_graph_entry(trace);
246}
247
240static void __trace_graph_return(struct trace_array *tr, 248static void __trace_graph_return(struct trace_array *tr,
241 struct ftrace_graph_ret *trace, 249 struct ftrace_graph_ret *trace,
242 unsigned long flags, 250 unsigned long flags,
@@ -290,13 +298,26 @@ void set_graph_array(struct trace_array *tr)
290 smp_mb(); 298 smp_mb();
291} 299}
292 300
301void trace_graph_thresh_return(struct ftrace_graph_ret *trace)
302{
303 if (tracing_thresh &&
304 (trace->rettime - trace->calltime < tracing_thresh))
305 return;
306 else
307 trace_graph_return(trace);
308}
309
293static int graph_trace_init(struct trace_array *tr) 310static int graph_trace_init(struct trace_array *tr)
294{ 311{
295 int ret; 312 int ret;
296 313
297 set_graph_array(tr); 314 set_graph_array(tr);
298 ret = register_ftrace_graph(&trace_graph_return, 315 if (tracing_thresh)
299 &trace_graph_entry); 316 ret = register_ftrace_graph(&trace_graph_thresh_return,
317 &trace_graph_thresh_entry);
318 else
319 ret = register_ftrace_graph(&trace_graph_return,
320 &trace_graph_entry);
300 if (ret) 321 if (ret)
301 return ret; 322 return ret;
302 tracing_start_cmdline_record(); 323 tracing_start_cmdline_record();
@@ -920,7 +941,7 @@ print_graph_return(struct ftrace_graph_ret *trace, struct trace_seq *s,
920 if (!ret) 941 if (!ret)
921 return TRACE_TYPE_PARTIAL_LINE; 942 return TRACE_TYPE_PARTIAL_LINE;
922 } else { 943 } else {
923 ret = trace_seq_printf(s, "} (%ps)\n", (void *)trace->func); 944 ret = trace_seq_printf(s, "} /* %ps */\n", (void *)trace->func);
924 if (!ret) 945 if (!ret)
925 return TRACE_TYPE_PARTIAL_LINE; 946 return TRACE_TYPE_PARTIAL_LINE;
926 } 947 }