Diffstat (limited to 'kernel')
-rw-r--r--  kernel/audit.c | 67
-rw-r--r--  kernel/audit.h | 5
-rw-r--r--  kernel/audit_tree.c | 9
-rw-r--r--  kernel/audit_watch.c | 4
-rw-r--r--  kernel/auditfilter.c | 12
-rw-r--r--  kernel/auditsc.c | 16
-rw-r--r--  kernel/cgroup.c | 141
-rw-r--r--  kernel/cgroup_freezer.c | 72
-rw-r--r--  kernel/cpuset.c | 13
-rw-r--r--  kernel/cred.c | 4
-rw-r--r--  kernel/debug/debug_core.c | 16
-rw-r--r--  kernel/debug/kdb/kdb_main.c | 69
-rw-r--r--  kernel/exit.c | 22
-rw-r--r--  kernel/fork.c | 17
-rw-r--r--  kernel/futex.c | 5
-rw-r--r--  kernel/futex_compat.c | 3
-rw-r--r--  kernel/hw_breakpoint.c | 3
-rw-r--r--  kernel/irq/irqdesc.c | 15
-rw-r--r--  kernel/irq/manage.c | 4
-rw-r--r--  kernel/irq/proc.c | 2
-rw-r--r--  kernel/irq_work.c | 4
-rw-r--r--  kernel/jump_label.c | 77
-rw-r--r--  kernel/kexec.c | 2
-rw-r--r--  kernel/kprobes.c | 33
-rw-r--r--  kernel/latencytop.c | 17
-rw-r--r--  kernel/module.c | 14
-rw-r--r--  kernel/ns_cgroup.c | 8
-rw-r--r--  kernel/perf_event.c | 229
-rw-r--r--  kernel/pm_qos_params.c | 4
-rw-r--r--  kernel/posix-cpu-timers.c | 12
-rw-r--r--  kernel/power/Kconfig | 4
-rw-r--r--  kernel/power/hibernate.c | 22
-rw-r--r--  kernel/power/snapshot.c | 18
-rw-r--r--  kernel/power/suspend.c | 5
-rw-r--r--  kernel/power/swap.c | 59
-rw-r--r--  kernel/power/user.c | 2
-rw-r--r--  kernel/printk.c | 30
-rw-r--r--  kernel/ptrace.c | 36
-rw-r--r--  kernel/range.c | 2
-rw-r--r--  kernel/relay.c | 15
-rw-r--r--  kernel/resource.c | 153
-rw-r--r--  kernel/sched.c | 47
-rw-r--r--  kernel/sched_fair.c | 73
-rw-r--r--  kernel/sched_stats.h | 20
-rw-r--r--  kernel/sched_stoptask.c | 4
-rw-r--r--  kernel/signal.c | 5
-rw-r--r--  kernel/smp.c | 8
-rw-r--r--  kernel/softirq.c | 16
-rw-r--r--  kernel/stop_machine.c | 6
-rw-r--r--  kernel/sysctl.c | 23
-rw-r--r--  kernel/taskstats.c | 172
-rw-r--r--  kernel/trace/Kconfig | 2
-rw-r--r--  kernel/trace/blktrace.c | 4
-rw-r--r--  kernel/trace/ring_buffer.c | 335
-rw-r--r--  kernel/trace/trace.c | 28
-rw-r--r--  kernel/trace/trace_kprobe.c | 1
-rw-r--r--  kernel/tsacct.c | 10
-rw-r--r--  kernel/user.c | 1
-rw-r--r--  kernel/wait.c | 6
-rw-r--r--  kernel/watchdog.c | 2
-rw-r--r--  kernel/workqueue.c | 6
61 files changed, 1265 insertions(+), 749 deletions(-)
diff --git a/kernel/audit.c b/kernel/audit.c
index d96045789b54..77770a034d59 100644
--- a/kernel/audit.c
+++ b/kernel/audit.c
@@ -467,23 +467,16 @@ static int audit_prepare_user_tty(pid_t pid, uid_t loginuid, u32 sessionid)
467 struct task_struct *tsk; 467 struct task_struct *tsk;
468 int err; 468 int err;
469 469
470 read_lock(&tasklist_lock); 470 rcu_read_lock();
471 tsk = find_task_by_vpid(pid); 471 tsk = find_task_by_vpid(pid);
472 err = -ESRCH; 472 if (!tsk) {
473 if (!tsk) 473 rcu_read_unlock();
474 goto out; 474 return -ESRCH;
475 err = 0; 475 }
476 476 get_task_struct(tsk);
477 spin_lock_irq(&tsk->sighand->siglock); 477 rcu_read_unlock();
478 if (!tsk->signal->audit_tty) 478 err = tty_audit_push_task(tsk, loginuid, sessionid);
479 err = -EPERM; 479 put_task_struct(tsk);
480 spin_unlock_irq(&tsk->sighand->siglock);
481 if (err)
482 goto out;
483
484 tty_audit_push_task(tsk, loginuid, sessionid);
485out:
486 read_unlock(&tasklist_lock);
487 return err; 480 return err;
488} 481}
489 482
@@ -506,7 +499,7 @@ int audit_send_list(void *_dest)
506} 499}
507 500
508struct sk_buff *audit_make_reply(int pid, int seq, int type, int done, 501struct sk_buff *audit_make_reply(int pid, int seq, int type, int done,
509 int multi, void *payload, int size) 502 int multi, const void *payload, int size)
510{ 503{
511 struct sk_buff *skb; 504 struct sk_buff *skb;
512 struct nlmsghdr *nlh; 505 struct nlmsghdr *nlh;
@@ -555,8 +548,8 @@ static int audit_send_reply_thread(void *arg)
555 * Allocates an skb, builds the netlink message, and sends it to the pid. 548 * Allocates an skb, builds the netlink message, and sends it to the pid.
556 * No failure notifications. 549 * No failure notifications.
557 */ 550 */
558void audit_send_reply(int pid, int seq, int type, int done, int multi, 551static void audit_send_reply(int pid, int seq, int type, int done, int multi,
559 void *payload, int size) 552 const void *payload, int size)
560{ 553{
561 struct sk_buff *skb; 554 struct sk_buff *skb;
562 struct task_struct *tsk; 555 struct task_struct *tsk;
@@ -880,40 +873,40 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
880 case AUDIT_TTY_GET: { 873 case AUDIT_TTY_GET: {
881 struct audit_tty_status s; 874 struct audit_tty_status s;
882 struct task_struct *tsk; 875 struct task_struct *tsk;
876 unsigned long flags;
883 877
884 read_lock(&tasklist_lock); 878 rcu_read_lock();
885 tsk = find_task_by_vpid(pid); 879 tsk = find_task_by_vpid(pid);
886 if (!tsk) 880 if (tsk && lock_task_sighand(tsk, &flags)) {
887 err = -ESRCH;
888 else {
889 spin_lock_irq(&tsk->sighand->siglock);
890 s.enabled = tsk->signal->audit_tty != 0; 881 s.enabled = tsk->signal->audit_tty != 0;
891 spin_unlock_irq(&tsk->sighand->siglock); 882 unlock_task_sighand(tsk, &flags);
892 } 883 } else
893 read_unlock(&tasklist_lock); 884 err = -ESRCH;
894 audit_send_reply(NETLINK_CB(skb).pid, seq, AUDIT_TTY_GET, 0, 0, 885 rcu_read_unlock();
895 &s, sizeof(s)); 886
887 if (!err)
888 audit_send_reply(NETLINK_CB(skb).pid, seq,
889 AUDIT_TTY_GET, 0, 0, &s, sizeof(s));
896 break; 890 break;
897 } 891 }
898 case AUDIT_TTY_SET: { 892 case AUDIT_TTY_SET: {
899 struct audit_tty_status *s; 893 struct audit_tty_status *s;
900 struct task_struct *tsk; 894 struct task_struct *tsk;
895 unsigned long flags;
901 896
902 if (nlh->nlmsg_len < sizeof(struct audit_tty_status)) 897 if (nlh->nlmsg_len < sizeof(struct audit_tty_status))
903 return -EINVAL; 898 return -EINVAL;
904 s = data; 899 s = data;
905 if (s->enabled != 0 && s->enabled != 1) 900 if (s->enabled != 0 && s->enabled != 1)
906 return -EINVAL; 901 return -EINVAL;
907 read_lock(&tasklist_lock); 902 rcu_read_lock();
908 tsk = find_task_by_vpid(pid); 903 tsk = find_task_by_vpid(pid);
909 if (!tsk) 904 if (tsk && lock_task_sighand(tsk, &flags)) {
910 err = -ESRCH;
911 else {
912 spin_lock_irq(&tsk->sighand->siglock);
913 tsk->signal->audit_tty = s->enabled != 0; 905 tsk->signal->audit_tty = s->enabled != 0;
914 spin_unlock_irq(&tsk->sighand->siglock); 906 unlock_task_sighand(tsk, &flags);
915 } 907 } else
916 read_unlock(&tasklist_lock); 908 err = -ESRCH;
909 rcu_read_unlock();
917 break; 910 break;
918 } 911 }
919 default: 912 default:
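
The kernel/audit.c hunks above drop the tasklist_lock read lock in favour of an RCU lookup, and move the ->signal->audit_tty access under lock_task_sighand(). What follows is a minimal sketch of that combined pattern, not code from the patch: the helper name example_audit_tty_enabled() and its out-parameter are made up for illustration.

/*
 * Illustrative sketch only: RCU-protected pid-to-task lookup plus a
 * lock_task_sighand() access, as in the AUDIT_TTY_GET hunk above.
 */
#include <linux/sched.h>
#include <linux/rcupdate.h>

static int example_audit_tty_enabled(pid_t pid, int *enabled)
{
	struct task_struct *tsk;
	unsigned long flags;
	int err = -ESRCH;

	rcu_read_lock();
	tsk = find_task_by_vpid(pid);	/* only valid while under rcu_read_lock() */
	if (tsk && lock_task_sighand(tsk, &flags)) {
		*enabled = tsk->signal->audit_tty != 0;
		unlock_task_sighand(tsk, &flags);
		err = 0;
	}
	rcu_read_unlock();
	return err;
}

When the task has to be used after rcu_read_unlock(), the audit_prepare_user_tty() hunk instead pins it with get_task_struct() before unlocking and drops the reference with put_task_struct() once done.
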
diff --git a/kernel/audit.h b/kernel/audit.h
index f7206db4e13d..91e7071c4d2c 100644
--- a/kernel/audit.h
+++ b/kernel/audit.h
@@ -84,10 +84,7 @@ extern int audit_compare_dname_path(const char *dname, const char *path,
84 int *dirlen); 84 int *dirlen);
85extern struct sk_buff * audit_make_reply(int pid, int seq, int type, 85extern struct sk_buff * audit_make_reply(int pid, int seq, int type,
86 int done, int multi, 86 int done, int multi,
87 void *payload, int size); 87 const void *payload, int size);
88extern void audit_send_reply(int pid, int seq, int type,
89 int done, int multi,
90 void *payload, int size);
91extern void audit_panic(const char *message); 88extern void audit_panic(const char *message);
92 89
93struct audit_netlink_list { 90struct audit_netlink_list {
diff --git a/kernel/audit_tree.c b/kernel/audit_tree.c
index 7f18d3a4527e..37b2bea170c8 100644
--- a/kernel/audit_tree.c
+++ b/kernel/audit_tree.c
@@ -223,7 +223,7 @@ static void untag_chunk(struct node *p)
223{ 223{
224 struct audit_chunk *chunk = find_chunk(p); 224 struct audit_chunk *chunk = find_chunk(p);
225 struct fsnotify_mark *entry = &chunk->mark; 225 struct fsnotify_mark *entry = &chunk->mark;
226 struct audit_chunk *new; 226 struct audit_chunk *new = NULL;
227 struct audit_tree *owner; 227 struct audit_tree *owner;
228 int size = chunk->count - 1; 228 int size = chunk->count - 1;
229 int i, j; 229 int i, j;
@@ -232,9 +232,14 @@ static void untag_chunk(struct node *p)
232 232
233 spin_unlock(&hash_lock); 233 spin_unlock(&hash_lock);
234 234
235 if (size)
236 new = alloc_chunk(size);
237
235 spin_lock(&entry->lock); 238 spin_lock(&entry->lock);
236 if (chunk->dead || !entry->i.inode) { 239 if (chunk->dead || !entry->i.inode) {
237 spin_unlock(&entry->lock); 240 spin_unlock(&entry->lock);
241 if (new)
242 free_chunk(new);
238 goto out; 243 goto out;
239 } 244 }
240 245
@@ -255,9 +260,9 @@ static void untag_chunk(struct node *p)
255 goto out; 260 goto out;
256 } 261 }
257 262
258 new = alloc_chunk(size);
259 if (!new) 263 if (!new)
260 goto Fallback; 264 goto Fallback;
265
261 fsnotify_duplicate_mark(&new->mark, entry); 266 fsnotify_duplicate_mark(&new->mark, entry);
262 if (fsnotify_add_mark(&new->mark, new->mark.group, new->mark.i.inode, NULL, 1)) { 267 if (fsnotify_add_mark(&new->mark, new->mark.group, new->mark.i.inode, NULL, 1)) {
263 free_chunk(new); 268 free_chunk(new);
diff --git a/kernel/audit_watch.c b/kernel/audit_watch.c
index f0c9b2e7542d..d2e3c7866460 100644
--- a/kernel/audit_watch.c
+++ b/kernel/audit_watch.c
@@ -60,7 +60,7 @@ struct audit_parent {
60}; 60};
61 61
62/* fsnotify handle. */ 62/* fsnotify handle. */
63struct fsnotify_group *audit_watch_group; 63static struct fsnotify_group *audit_watch_group;
64 64
65/* fsnotify events we care about. */ 65/* fsnotify events we care about. */
66#define AUDIT_FS_WATCH (FS_MOVE | FS_CREATE | FS_DELETE | FS_DELETE_SELF |\ 66#define AUDIT_FS_WATCH (FS_MOVE | FS_CREATE | FS_DELETE | FS_DELETE_SELF |\
@@ -123,7 +123,7 @@ void audit_put_watch(struct audit_watch *watch)
123 } 123 }
124} 124}
125 125
126void audit_remove_watch(struct audit_watch *watch) 126static void audit_remove_watch(struct audit_watch *watch)
127{ 127{
128 list_del(&watch->wlist); 128 list_del(&watch->wlist);
129 audit_put_parent(watch->parent); 129 audit_put_parent(watch->parent);
diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c
index eb7675499fb5..add2819af71b 100644
--- a/kernel/auditfilter.c
+++ b/kernel/auditfilter.c
@@ -1252,6 +1252,18 @@ static int audit_filter_user_rules(struct netlink_skb_parms *cb,
1252 case AUDIT_LOGINUID: 1252 case AUDIT_LOGINUID:
1253 result = audit_comparator(cb->loginuid, f->op, f->val); 1253 result = audit_comparator(cb->loginuid, f->op, f->val);
1254 break; 1254 break;
1255 case AUDIT_SUBJ_USER:
1256 case AUDIT_SUBJ_ROLE:
1257 case AUDIT_SUBJ_TYPE:
1258 case AUDIT_SUBJ_SEN:
1259 case AUDIT_SUBJ_CLR:
1260 if (f->lsm_rule)
1261 result = security_audit_rule_match(cb->sid,
1262 f->type,
1263 f->op,
1264 f->lsm_rule,
1265 NULL);
1266 break;
1255 } 1267 }
1256 1268
1257 if (!result) 1269 if (!result)
diff --git a/kernel/auditsc.c b/kernel/auditsc.c
index 1b31c130d034..f49a0318c2ed 100644
--- a/kernel/auditsc.c
+++ b/kernel/auditsc.c
@@ -241,6 +241,10 @@ struct audit_context {
241 pid_t pid; 241 pid_t pid;
242 struct audit_cap_data cap; 242 struct audit_cap_data cap;
243 } capset; 243 } capset;
244 struct {
245 int fd;
246 int flags;
247 } mmap;
244 }; 248 };
245 int fds[2]; 249 int fds[2];
246 250
@@ -1305,6 +1309,10 @@ static void show_special(struct audit_context *context, int *call_panic)
1305 audit_log_cap(ab, "cap_pp", &context->capset.cap.permitted); 1309 audit_log_cap(ab, "cap_pp", &context->capset.cap.permitted);
1306 audit_log_cap(ab, "cap_pe", &context->capset.cap.effective); 1310 audit_log_cap(ab, "cap_pe", &context->capset.cap.effective);
1307 break; } 1311 break; }
1312 case AUDIT_MMAP: {
1313 audit_log_format(ab, "fd=%d flags=0x%x", context->mmap.fd,
1314 context->mmap.flags);
1315 break; }
1308 } 1316 }
1309 audit_log_end(ab); 1317 audit_log_end(ab);
1310} 1318}
@@ -2476,6 +2484,14 @@ void __audit_log_capset(pid_t pid,
2476 context->type = AUDIT_CAPSET; 2484 context->type = AUDIT_CAPSET;
2477} 2485}
2478 2486
2487void __audit_mmap_fd(int fd, int flags)
2488{
2489 struct audit_context *context = current->audit_context;
2490 context->mmap.fd = fd;
2491 context->mmap.flags = flags;
2492 context->type = AUDIT_MMAP;
2493}
2494
2479/** 2495/**
2480 * audit_core_dumps - record information about processes that end abnormally 2496 * audit_core_dumps - record information about processes that end abnormally
2481 * @signr: signal value 2497 * @signr: signal value
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 7b69b8d0313d..66a416b42c18 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -243,6 +243,11 @@ static int notify_on_release(const struct cgroup *cgrp)
243 return test_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags); 243 return test_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags);
244} 244}
245 245
246static int clone_children(const struct cgroup *cgrp)
247{
248 return test_bit(CGRP_CLONE_CHILDREN, &cgrp->flags);
249}
250
246/* 251/*
247 * for_each_subsys() allows you to iterate on each subsystem attached to 252 * for_each_subsys() allows you to iterate on each subsystem attached to
248 * an active hierarchy 253 * an active hierarchy
@@ -777,6 +782,7 @@ static struct inode *cgroup_new_inode(mode_t mode, struct super_block *sb)
777 struct inode *inode = new_inode(sb); 782 struct inode *inode = new_inode(sb);
778 783
779 if (inode) { 784 if (inode) {
785 inode->i_ino = get_next_ino();
780 inode->i_mode = mode; 786 inode->i_mode = mode;
781 inode->i_uid = current_fsuid(); 787 inode->i_uid = current_fsuid();
782 inode->i_gid = current_fsgid(); 788 inode->i_gid = current_fsgid();
@@ -1039,6 +1045,8 @@ static int cgroup_show_options(struct seq_file *seq, struct vfsmount *vfs)
1039 seq_puts(seq, ",noprefix"); 1045 seq_puts(seq, ",noprefix");
1040 if (strlen(root->release_agent_path)) 1046 if (strlen(root->release_agent_path))
1041 seq_printf(seq, ",release_agent=%s", root->release_agent_path); 1047 seq_printf(seq, ",release_agent=%s", root->release_agent_path);
1048 if (clone_children(&root->top_cgroup))
1049 seq_puts(seq, ",clone_children");
1042 if (strlen(root->name)) 1050 if (strlen(root->name))
1043 seq_printf(seq, ",name=%s", root->name); 1051 seq_printf(seq, ",name=%s", root->name);
1044 mutex_unlock(&cgroup_mutex); 1052 mutex_unlock(&cgroup_mutex);
@@ -1049,6 +1057,7 @@ struct cgroup_sb_opts {
1049 unsigned long subsys_bits; 1057 unsigned long subsys_bits;
1050 unsigned long flags; 1058 unsigned long flags;
1051 char *release_agent; 1059 char *release_agent;
1060 bool clone_children;
1052 char *name; 1061 char *name;
1053 /* User explicitly requested empty subsystem */ 1062 /* User explicitly requested empty subsystem */
1054 bool none; 1063 bool none;
@@ -1065,7 +1074,8 @@ struct cgroup_sb_opts {
1065 */ 1074 */
1066static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts) 1075static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts)
1067{ 1076{
1068 char *token, *o = data ?: "all"; 1077 char *token, *o = data;
1078 bool all_ss = false, one_ss = false;
1069 unsigned long mask = (unsigned long)-1; 1079 unsigned long mask = (unsigned long)-1;
1070 int i; 1080 int i;
1071 bool module_pin_failed = false; 1081 bool module_pin_failed = false;
@@ -1081,22 +1091,27 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts)
1081 while ((token = strsep(&o, ",")) != NULL) { 1091 while ((token = strsep(&o, ",")) != NULL) {
1082 if (!*token) 1092 if (!*token)
1083 return -EINVAL; 1093 return -EINVAL;
1084 if (!strcmp(token, "all")) { 1094 if (!strcmp(token, "none")) {
1085 /* Add all non-disabled subsystems */
1086 opts->subsys_bits = 0;
1087 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
1088 struct cgroup_subsys *ss = subsys[i];
1089 if (ss == NULL)
1090 continue;
1091 if (!ss->disabled)
1092 opts->subsys_bits |= 1ul << i;
1093 }
1094 } else if (!strcmp(token, "none")) {
1095 /* Explicitly have no subsystems */ 1095 /* Explicitly have no subsystems */
1096 opts->none = true; 1096 opts->none = true;
1097 } else if (!strcmp(token, "noprefix")) { 1097 continue;
1098 }
1099 if (!strcmp(token, "all")) {
1100 /* Mutually exclusive option 'all' + subsystem name */
1101 if (one_ss)
1102 return -EINVAL;
1103 all_ss = true;
1104 continue;
1105 }
1106 if (!strcmp(token, "noprefix")) {
1098 set_bit(ROOT_NOPREFIX, &opts->flags); 1107 set_bit(ROOT_NOPREFIX, &opts->flags);
1099 } else if (!strncmp(token, "release_agent=", 14)) { 1108 continue;
1109 }
1110 if (!strcmp(token, "clone_children")) {
1111 opts->clone_children = true;
1112 continue;
1113 }
1114 if (!strncmp(token, "release_agent=", 14)) {
1100 /* Specifying two release agents is forbidden */ 1115 /* Specifying two release agents is forbidden */
1101 if (opts->release_agent) 1116 if (opts->release_agent)
1102 return -EINVAL; 1117 return -EINVAL;
@@ -1104,7 +1119,9 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts)
1104 kstrndup(token + 14, PATH_MAX - 1, GFP_KERNEL); 1119 kstrndup(token + 14, PATH_MAX - 1, GFP_KERNEL);
1105 if (!opts->release_agent) 1120 if (!opts->release_agent)
1106 return -ENOMEM; 1121 return -ENOMEM;
1107 } else if (!strncmp(token, "name=", 5)) { 1122 continue;
1123 }
1124 if (!strncmp(token, "name=", 5)) {
1108 const char *name = token + 5; 1125 const char *name = token + 5;
1109 /* Can't specify an empty name */ 1126 /* Can't specify an empty name */
1110 if (!strlen(name)) 1127 if (!strlen(name))
@@ -1126,20 +1143,44 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts)
1126 GFP_KERNEL); 1143 GFP_KERNEL);
1127 if (!opts->name) 1144 if (!opts->name)
1128 return -ENOMEM; 1145 return -ENOMEM;
1129 } else { 1146
1130 struct cgroup_subsys *ss; 1147 continue;
1131 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { 1148 }
1132 ss = subsys[i]; 1149
1133 if (ss == NULL) 1150 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
1134 continue; 1151 struct cgroup_subsys *ss = subsys[i];
1135 if (!strcmp(token, ss->name)) { 1152 if (ss == NULL)
1136 if (!ss->disabled) 1153 continue;
1137 set_bit(i, &opts->subsys_bits); 1154 if (strcmp(token, ss->name))
1138 break; 1155 continue;
1139 } 1156 if (ss->disabled)
1140 } 1157 continue;
1141 if (i == CGROUP_SUBSYS_COUNT) 1158
1142 return -ENOENT; 1159 /* Mutually exclusive option 'all' + subsystem name */
1160 if (all_ss)
1161 return -EINVAL;
1162 set_bit(i, &opts->subsys_bits);
1163 one_ss = true;
1164
1165 break;
1166 }
1167 if (i == CGROUP_SUBSYS_COUNT)
1168 return -ENOENT;
1169 }
1170
1171 /*
1172 * If the 'all' option was specified select all the subsystems,
1173 * otherwise 'all, 'none' and a subsystem name options were not
1174 * specified, let's default to 'all'
1175 */
1176 if (all_ss || (!all_ss && !one_ss && !opts->none)) {
1177 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
1178 struct cgroup_subsys *ss = subsys[i];
1179 if (ss == NULL)
1180 continue;
1181 if (ss->disabled)
1182 continue;
1183 set_bit(i, &opts->subsys_bits);
1143 } 1184 }
1144 } 1185 }
1145 1186
@@ -1354,6 +1395,8 @@ static struct cgroupfs_root *cgroup_root_from_opts(struct cgroup_sb_opts *opts)
1354 strcpy(root->release_agent_path, opts->release_agent); 1395 strcpy(root->release_agent_path, opts->release_agent);
1355 if (opts->name) 1396 if (opts->name)
1356 strcpy(root->name, opts->name); 1397 strcpy(root->name, opts->name);
1398 if (opts->clone_children)
1399 set_bit(CGRP_CLONE_CHILDREN, &root->top_cgroup.flags);
1357 return root; 1400 return root;
1358} 1401}
1359 1402
@@ -1417,9 +1460,9 @@ static int cgroup_get_rootdir(struct super_block *sb)
1417 return 0; 1460 return 0;
1418} 1461}
1419 1462
1420static int cgroup_get_sb(struct file_system_type *fs_type, 1463static struct dentry *cgroup_mount(struct file_system_type *fs_type,
1421 int flags, const char *unused_dev_name, 1464 int flags, const char *unused_dev_name,
1422 void *data, struct vfsmount *mnt) 1465 void *data)
1423{ 1466{
1424 struct cgroup_sb_opts opts; 1467 struct cgroup_sb_opts opts;
1425 struct cgroupfs_root *root; 1468 struct cgroupfs_root *root;
@@ -1553,10 +1596,9 @@ static int cgroup_get_sb(struct file_system_type *fs_type,
1553 drop_parsed_module_refcounts(opts.subsys_bits); 1596 drop_parsed_module_refcounts(opts.subsys_bits);
1554 } 1597 }
1555 1598
1556 simple_set_mnt(mnt, sb);
1557 kfree(opts.release_agent); 1599 kfree(opts.release_agent);
1558 kfree(opts.name); 1600 kfree(opts.name);
1559 return 0; 1601 return dget(sb->s_root);
1560 1602
1561 drop_new_super: 1603 drop_new_super:
1562 deactivate_locked_super(sb); 1604 deactivate_locked_super(sb);
@@ -1565,7 +1607,7 @@ static int cgroup_get_sb(struct file_system_type *fs_type,
1565 out_err: 1607 out_err:
1566 kfree(opts.release_agent); 1608 kfree(opts.release_agent);
1567 kfree(opts.name); 1609 kfree(opts.name);
1568 return ret; 1610 return ERR_PTR(ret);
1569} 1611}
1570 1612
1571static void cgroup_kill_sb(struct super_block *sb) { 1613static void cgroup_kill_sb(struct super_block *sb) {
@@ -1615,7 +1657,7 @@ static void cgroup_kill_sb(struct super_block *sb) {
1615 1657
1616static struct file_system_type cgroup_fs_type = { 1658static struct file_system_type cgroup_fs_type = {
1617 .name = "cgroup", 1659 .name = "cgroup",
1618 .get_sb = cgroup_get_sb, 1660 .mount = cgroup_mount,
1619 .kill_sb = cgroup_kill_sb, 1661 .kill_sb = cgroup_kill_sb,
1620}; 1662};
1621 1663
@@ -1879,6 +1921,8 @@ static int cgroup_release_agent_write(struct cgroup *cgrp, struct cftype *cft,
1879 const char *buffer) 1921 const char *buffer)
1880{ 1922{
1881 BUILD_BUG_ON(sizeof(cgrp->root->release_agent_path) < PATH_MAX); 1923 BUILD_BUG_ON(sizeof(cgrp->root->release_agent_path) < PATH_MAX);
1924 if (strlen(buffer) >= PATH_MAX)
1925 return -EINVAL;
1882 if (!cgroup_lock_live_group(cgrp)) 1926 if (!cgroup_lock_live_group(cgrp))
1883 return -ENODEV; 1927 return -ENODEV;
1884 strcpy(cgrp->root->release_agent_path, buffer); 1928 strcpy(cgrp->root->release_agent_path, buffer);
@@ -3172,6 +3216,23 @@ fail:
3172 return ret; 3216 return ret;
3173} 3217}
3174 3218
3219static u64 cgroup_clone_children_read(struct cgroup *cgrp,
3220 struct cftype *cft)
3221{
3222 return clone_children(cgrp);
3223}
3224
3225static int cgroup_clone_children_write(struct cgroup *cgrp,
3226 struct cftype *cft,
3227 u64 val)
3228{
3229 if (val)
3230 set_bit(CGRP_CLONE_CHILDREN, &cgrp->flags);
3231 else
3232 clear_bit(CGRP_CLONE_CHILDREN, &cgrp->flags);
3233 return 0;
3234}
3235
3175/* 3236/*
3176 * for the common functions, 'private' gives the type of file 3237 * for the common functions, 'private' gives the type of file
3177 */ 3238 */
@@ -3202,6 +3263,11 @@ static struct cftype files[] = {
3202 .write_string = cgroup_write_event_control, 3263 .write_string = cgroup_write_event_control,
3203 .mode = S_IWUGO, 3264 .mode = S_IWUGO,
3204 }, 3265 },
3266 {
3267 .name = "cgroup.clone_children",
3268 .read_u64 = cgroup_clone_children_read,
3269 .write_u64 = cgroup_clone_children_write,
3270 },
3205}; 3271};
3206 3272
3207static struct cftype cft_release_agent = { 3273static struct cftype cft_release_agent = {
@@ -3331,6 +3397,9 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry,
3331 if (notify_on_release(parent)) 3397 if (notify_on_release(parent))
3332 set_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags); 3398 set_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags);
3333 3399
3400 if (clone_children(parent))
3401 set_bit(CGRP_CLONE_CHILDREN, &cgrp->flags);
3402
3334 for_each_subsys(root, ss) { 3403 for_each_subsys(root, ss) {
3335 struct cgroup_subsys_state *css = ss->create(ss, cgrp); 3404 struct cgroup_subsys_state *css = ss->create(ss, cgrp);
3336 3405
@@ -3345,6 +3414,8 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry,
3345 goto err_destroy; 3414 goto err_destroy;
3346 } 3415 }
3347 /* At error, ->destroy() callback has to free assigned ID. */ 3416 /* At error, ->destroy() callback has to free assigned ID. */
3417 if (clone_children(parent) && ss->post_clone)
3418 ss->post_clone(ss, cgrp);
3348 } 3419 }
3349 3420
3350 cgroup_lock_hierarchy(root); 3421 cgroup_lock_hierarchy(root);
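
The cgroup.c (and, below, cpuset.c) hunks convert the filesystem from the old .get_sb() callback to the 2.6.37-era .mount() convention, which returns the root dentry or an ERR_PTR() instead of filling in a struct vfsmount. The following is a bare-bones sketch of that shape for a hypothetical filesystem, assuming the generic mount_nodev()/simple_fill_super()/kill_anon_super() helpers fit; none of the names come from the patch.

/*
 * Sketch only: minimal file_system_type using the new ->mount() hook.
 */
#include <linux/fs.h>
#include <linux/err.h>

#define EXAMPLEFS_MAGIC	0x2f5fc0de	/* made-up magic number */

static int example_fill_super(struct super_block *sb, void *data, int silent)
{
	static struct tree_descr files[] = { {""} };	/* no files, just a root */

	return simple_fill_super(sb, EXAMPLEFS_MAGIC, files);
}

static struct dentry *example_mount(struct file_system_type *fs_type,
				    int flags, const char *dev_name, void *data)
{
	/* returns the root dentry on success, ERR_PTR() on failure */
	return mount_nodev(fs_type, flags, data, example_fill_super);
}

static struct file_system_type example_fs_type = {
	.name		= "examplefs",
	.mount		= example_mount,
	.kill_sb	= kill_anon_super,
};

cgroup_mount() in the hunk above does the same thing by hand: on success it returns dget(sb->s_root), on error ERR_PTR(ret), and cpuset_mount() simply forwards to cgroup_fs->mount().
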
diff --git a/kernel/cgroup_freezer.c b/kernel/cgroup_freezer.c
index ce71ed53e88f..e7bebb7c6c38 100644
--- a/kernel/cgroup_freezer.c
+++ b/kernel/cgroup_freezer.c
@@ -48,20 +48,19 @@ static inline struct freezer *task_freezer(struct task_struct *task)
48 struct freezer, css); 48 struct freezer, css);
49} 49}
50 50
51int cgroup_freezing_or_frozen(struct task_struct *task) 51static inline int __cgroup_freezing_or_frozen(struct task_struct *task)
52{ 52{
53 struct freezer *freezer; 53 enum freezer_state state = task_freezer(task)->state;
54 enum freezer_state state; 54 return (state == CGROUP_FREEZING) || (state == CGROUP_FROZEN);
55}
55 56
57int cgroup_freezing_or_frozen(struct task_struct *task)
58{
59 int result;
56 task_lock(task); 60 task_lock(task);
57 freezer = task_freezer(task); 61 result = __cgroup_freezing_or_frozen(task);
58 if (!freezer->css.cgroup->parent)
59 state = CGROUP_THAWED; /* root cgroup can't be frozen */
60 else
61 state = freezer->state;
62 task_unlock(task); 62 task_unlock(task);
63 63 return result;
64 return (state == CGROUP_FREEZING) || (state == CGROUP_FROZEN);
65} 64}
66 65
67/* 66/*
@@ -154,13 +153,6 @@ static void freezer_destroy(struct cgroup_subsys *ss,
154 kfree(cgroup_freezer(cgroup)); 153 kfree(cgroup_freezer(cgroup));
155} 154}
156 155
157/* Task is frozen or will freeze immediately when next it gets woken */
158static bool is_task_frozen_enough(struct task_struct *task)
159{
160 return frozen(task) ||
161 (task_is_stopped_or_traced(task) && freezing(task));
162}
163
164/* 156/*
165 * The call to cgroup_lock() in the freezer.state write method prevents 157 * The call to cgroup_lock() in the freezer.state write method prevents
166 * a write to that file racing against an attach, and hence the 158 * a write to that file racing against an attach, and hence the
@@ -174,24 +166,25 @@ static int freezer_can_attach(struct cgroup_subsys *ss,
174 166
175 /* 167 /*
176 * Anything frozen can't move or be moved to/from. 168 * Anything frozen can't move or be moved to/from.
177 *
178 * Since orig_freezer->state == FROZEN means that @task has been
179 * frozen, so it's sufficient to check the latter condition.
180 */ 169 */
181 170
182 if (is_task_frozen_enough(task)) 171 freezer = cgroup_freezer(new_cgroup);
172 if (freezer->state != CGROUP_THAWED)
183 return -EBUSY; 173 return -EBUSY;
184 174
185 freezer = cgroup_freezer(new_cgroup); 175 rcu_read_lock();
186 if (freezer->state == CGROUP_FROZEN) 176 if (__cgroup_freezing_or_frozen(task)) {
177 rcu_read_unlock();
187 return -EBUSY; 178 return -EBUSY;
179 }
180 rcu_read_unlock();
188 181
189 if (threadgroup) { 182 if (threadgroup) {
190 struct task_struct *c; 183 struct task_struct *c;
191 184
192 rcu_read_lock(); 185 rcu_read_lock();
193 list_for_each_entry_rcu(c, &task->thread_group, thread_group) { 186 list_for_each_entry_rcu(c, &task->thread_group, thread_group) {
194 if (is_task_frozen_enough(c)) { 187 if (__cgroup_freezing_or_frozen(c)) {
195 rcu_read_unlock(); 188 rcu_read_unlock();
196 return -EBUSY; 189 return -EBUSY;
197 } 190 }
@@ -236,31 +229,30 @@ static void freezer_fork(struct cgroup_subsys *ss, struct task_struct *task)
236/* 229/*
237 * caller must hold freezer->lock 230 * caller must hold freezer->lock
238 */ 231 */
239static void update_freezer_state(struct cgroup *cgroup, 232static void update_if_frozen(struct cgroup *cgroup,
240 struct freezer *freezer) 233 struct freezer *freezer)
241{ 234{
242 struct cgroup_iter it; 235 struct cgroup_iter it;
243 struct task_struct *task; 236 struct task_struct *task;
244 unsigned int nfrozen = 0, ntotal = 0; 237 unsigned int nfrozen = 0, ntotal = 0;
238 enum freezer_state old_state = freezer->state;
245 239
246 cgroup_iter_start(cgroup, &it); 240 cgroup_iter_start(cgroup, &it);
247 while ((task = cgroup_iter_next(cgroup, &it))) { 241 while ((task = cgroup_iter_next(cgroup, &it))) {
248 ntotal++; 242 ntotal++;
249 if (is_task_frozen_enough(task)) 243 if (frozen(task))
250 nfrozen++; 244 nfrozen++;
251 } 245 }
252 246
253 /* 247 if (old_state == CGROUP_THAWED) {
254 * Transition to FROZEN when no new tasks can be added ensures 248 BUG_ON(nfrozen > 0);
255 * that we never exist in the FROZEN state while there are unfrozen 249 } else if (old_state == CGROUP_FREEZING) {
256 * tasks. 250 if (nfrozen == ntotal)
257 */ 251 freezer->state = CGROUP_FROZEN;
258 if (nfrozen == ntotal) 252 } else { /* old_state == CGROUP_FROZEN */
259 freezer->state = CGROUP_FROZEN; 253 BUG_ON(nfrozen != ntotal);
260 else if (nfrozen > 0) 254 }
261 freezer->state = CGROUP_FREEZING; 255
262 else
263 freezer->state = CGROUP_THAWED;
264 cgroup_iter_end(cgroup, &it); 256 cgroup_iter_end(cgroup, &it);
265} 257}
266 258
@@ -279,7 +271,7 @@ static int freezer_read(struct cgroup *cgroup, struct cftype *cft,
279 if (state == CGROUP_FREEZING) { 271 if (state == CGROUP_FREEZING) {
280 /* We change from FREEZING to FROZEN lazily if the cgroup was 272 /* We change from FREEZING to FROZEN lazily if the cgroup was
281 * only partially frozen when we exitted write. */ 273 * only partially frozen when we exitted write. */
282 update_freezer_state(cgroup, freezer); 274 update_if_frozen(cgroup, freezer);
283 state = freezer->state; 275 state = freezer->state;
284 } 276 }
285 spin_unlock_irq(&freezer->lock); 277 spin_unlock_irq(&freezer->lock);
@@ -301,7 +293,7 @@ static int try_to_freeze_cgroup(struct cgroup *cgroup, struct freezer *freezer)
301 while ((task = cgroup_iter_next(cgroup, &it))) { 293 while ((task = cgroup_iter_next(cgroup, &it))) {
302 if (!freeze_task(task, true)) 294 if (!freeze_task(task, true))
303 continue; 295 continue;
304 if (is_task_frozen_enough(task)) 296 if (frozen(task))
305 continue; 297 continue;
306 if (!freezing(task) && !freezer_should_skip(task)) 298 if (!freezing(task) && !freezer_should_skip(task))
307 num_cant_freeze_now++; 299 num_cant_freeze_now++;
@@ -335,7 +327,7 @@ static int freezer_change_state(struct cgroup *cgroup,
335 327
336 spin_lock_irq(&freezer->lock); 328 spin_lock_irq(&freezer->lock);
337 329
338 update_freezer_state(cgroup, freezer); 330 update_if_frozen(cgroup, freezer);
339 if (goal_state == freezer->state) 331 if (goal_state == freezer->state)
340 goto out; 332 goto out;
341 333
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index 51b143e2a07a..4349935c2ad8 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -231,18 +231,17 @@ static DEFINE_SPINLOCK(cpuset_buffer_lock);
231 * users. If someone tries to mount the "cpuset" filesystem, we 231 * users. If someone tries to mount the "cpuset" filesystem, we
232 * silently switch it to mount "cgroup" instead 232 * silently switch it to mount "cgroup" instead
233 */ 233 */
234static int cpuset_get_sb(struct file_system_type *fs_type, 234static struct dentry *cpuset_mount(struct file_system_type *fs_type,
235 int flags, const char *unused_dev_name, 235 int flags, const char *unused_dev_name, void *data)
236 void *data, struct vfsmount *mnt)
237{ 236{
238 struct file_system_type *cgroup_fs = get_fs_type("cgroup"); 237 struct file_system_type *cgroup_fs = get_fs_type("cgroup");
239 int ret = -ENODEV; 238 struct dentry *ret = ERR_PTR(-ENODEV);
240 if (cgroup_fs) { 239 if (cgroup_fs) {
241 char mountopts[] = 240 char mountopts[] =
242 "cpuset,noprefix," 241 "cpuset,noprefix,"
243 "release_agent=/sbin/cpuset_release_agent"; 242 "release_agent=/sbin/cpuset_release_agent";
244 ret = cgroup_fs->get_sb(cgroup_fs, flags, 243 ret = cgroup_fs->mount(cgroup_fs, flags,
245 unused_dev_name, mountopts, mnt); 244 unused_dev_name, mountopts);
246 put_filesystem(cgroup_fs); 245 put_filesystem(cgroup_fs);
247 } 246 }
248 return ret; 247 return ret;
@@ -250,7 +249,7 @@ static int cpuset_get_sb(struct file_system_type *fs_type,
250 249
251static struct file_system_type cpuset_fs_type = { 250static struct file_system_type cpuset_fs_type = {
252 .name = "cpuset", 251 .name = "cpuset",
253 .get_sb = cpuset_get_sb, 252 .mount = cpuset_mount,
254}; 253};
255 254
256/* 255/*
diff --git a/kernel/cred.c b/kernel/cred.c
index 9a3e22641fe7..6a1aa004e376 100644
--- a/kernel/cred.c
+++ b/kernel/cred.c
@@ -325,7 +325,7 @@ EXPORT_SYMBOL(prepare_creds);
325 325
326/* 326/*
327 * Prepare credentials for current to perform an execve() 327 * Prepare credentials for current to perform an execve()
328 * - The caller must hold current->cred_guard_mutex 328 * - The caller must hold ->cred_guard_mutex
329 */ 329 */
330struct cred *prepare_exec_creds(void) 330struct cred *prepare_exec_creds(void)
331{ 331{
@@ -384,8 +384,6 @@ int copy_creds(struct task_struct *p, unsigned long clone_flags)
384 struct cred *new; 384 struct cred *new;
385 int ret; 385 int ret;
386 386
387 mutex_init(&p->cred_guard_mutex);
388
389 if ( 387 if (
390#ifdef CONFIG_KEYS 388#ifdef CONFIG_KEYS
391 !p->cred->thread_keyring && 389 !p->cred->thread_keyring &&
diff --git a/kernel/debug/debug_core.c b/kernel/debug/debug_core.c
index fec596da9bd0..cefd4a11f6d9 100644
--- a/kernel/debug/debug_core.c
+++ b/kernel/debug/debug_core.c
@@ -209,18 +209,6 @@ int __weak kgdb_skipexception(int exception, struct pt_regs *regs)
209 return 0; 209 return 0;
210} 210}
211 211
212/**
213 * kgdb_disable_hw_debug - Disable hardware debugging while we in kgdb.
214 * @regs: Current &struct pt_regs.
215 *
216 * This function will be called if the particular architecture must
217 * disable hardware debugging while it is processing gdb packets or
218 * handling exception.
219 */
220void __weak kgdb_disable_hw_debug(struct pt_regs *regs)
221{
222}
223
224/* 212/*
225 * Some architectures need cache flushes when we set/clear a 213 * Some architectures need cache flushes when we set/clear a
226 * breakpoint: 214 * breakpoint:
@@ -484,7 +472,9 @@ static int kgdb_cpu_enter(struct kgdb_state *ks, struct pt_regs *regs,
484 atomic_inc(&masters_in_kgdb); 472 atomic_inc(&masters_in_kgdb);
485 else 473 else
486 atomic_inc(&slaves_in_kgdb); 474 atomic_inc(&slaves_in_kgdb);
487 kgdb_disable_hw_debug(ks->linux_regs); 475
476 if (arch_kgdb_ops.disable_hw_break)
477 arch_kgdb_ops.disable_hw_break(regs);
488 478
489acquirelock: 479acquirelock:
490 /* 480 /*
diff --git a/kernel/debug/kdb/kdb_main.c b/kernel/debug/kdb/kdb_main.c
index d7bda21a106b..a6e729766821 100644
--- a/kernel/debug/kdb/kdb_main.c
+++ b/kernel/debug/kdb/kdb_main.c
@@ -82,7 +82,7 @@ static kdbtab_t kdb_base_commands[50];
82#define for_each_kdbcmd(cmd, num) \ 82#define for_each_kdbcmd(cmd, num) \
83 for ((cmd) = kdb_base_commands, (num) = 0; \ 83 for ((cmd) = kdb_base_commands, (num) = 0; \
84 num < kdb_max_commands; \ 84 num < kdb_max_commands; \
85 num == KDB_BASE_CMD_MAX ? cmd = kdb_commands : cmd++, num++) 85 num++, num == KDB_BASE_CMD_MAX ? cmd = kdb_commands : cmd++)
86 86
87typedef struct _kdbmsg { 87typedef struct _kdbmsg {
88 int km_diag; /* kdb diagnostic */ 88 int km_diag; /* kdb diagnostic */
@@ -646,7 +646,7 @@ static int kdb_defcmd2(const char *cmdstr, const char *argv0)
646 } 646 }
647 if (!s->usable) 647 if (!s->usable)
648 return KDB_NOTIMP; 648 return KDB_NOTIMP;
649 s->command = kmalloc((s->count + 1) * sizeof(*(s->command)), GFP_KDB); 649 s->command = kzalloc((s->count + 1) * sizeof(*(s->command)), GFP_KDB);
650 if (!s->command) { 650 if (!s->command) {
651 kdb_printf("Could not allocate new kdb_defcmd table for %s\n", 651 kdb_printf("Could not allocate new kdb_defcmd table for %s\n",
652 cmdstr); 652 cmdstr);
@@ -1127,7 +1127,7 @@ static int kdb_local(kdb_reason_t reason, int error, struct pt_regs *regs,
1127 /* special case below */ 1127 /* special case below */
1128 } else { 1128 } else {
1129 kdb_printf("\nEntering kdb (current=0x%p, pid %d) ", 1129 kdb_printf("\nEntering kdb (current=0x%p, pid %d) ",
1130 kdb_current, kdb_current->pid); 1130 kdb_current, kdb_current ? kdb_current->pid : 0);
1131#if defined(CONFIG_SMP) 1131#if defined(CONFIG_SMP)
1132 kdb_printf("on processor %d ", raw_smp_processor_id()); 1132 kdb_printf("on processor %d ", raw_smp_processor_id());
1133#endif 1133#endif
@@ -2361,7 +2361,7 @@ static int kdb_pid(int argc, const char **argv)
2361 */ 2361 */
2362static int kdb_ll(int argc, const char **argv) 2362static int kdb_ll(int argc, const char **argv)
2363{ 2363{
2364 int diag; 2364 int diag = 0;
2365 unsigned long addr; 2365 unsigned long addr;
2366 long offset = 0; 2366 long offset = 0;
2367 unsigned long va; 2367 unsigned long va;
@@ -2400,20 +2400,21 @@ static int kdb_ll(int argc, const char **argv)
2400 char buf[80]; 2400 char buf[80];
2401 2401
2402 if (KDB_FLAG(CMD_INTERRUPT)) 2402 if (KDB_FLAG(CMD_INTERRUPT))
2403 return 0; 2403 goto out;
2404 2404
2405 sprintf(buf, "%s " kdb_machreg_fmt "\n", command, va); 2405 sprintf(buf, "%s " kdb_machreg_fmt "\n", command, va);
2406 diag = kdb_parse(buf); 2406 diag = kdb_parse(buf);
2407 if (diag) 2407 if (diag)
2408 return diag; 2408 goto out;
2409 2409
2410 addr = va + linkoffset; 2410 addr = va + linkoffset;
2411 if (kdb_getword(&va, addr, sizeof(va))) 2411 if (kdb_getword(&va, addr, sizeof(va)))
2412 return 0; 2412 goto out;
2413 } 2413 }
2414 kfree(command);
2415 2414
2416 return 0; 2415out:
2416 kfree(command);
2417 return diag;
2417} 2418}
2418 2419
2419static int kdb_kgdb(int argc, const char **argv) 2420static int kdb_kgdb(int argc, const char **argv)
@@ -2603,20 +2604,17 @@ static int kdb_summary(int argc, const char **argv)
2603 */ 2604 */
2604static int kdb_per_cpu(int argc, const char **argv) 2605static int kdb_per_cpu(int argc, const char **argv)
2605{ 2606{
2606 char buf[256], fmtstr[64]; 2607 char fmtstr[64];
2607 kdb_symtab_t symtab; 2608 int cpu, diag, nextarg = 1;
2608 cpumask_t suppress = CPU_MASK_NONE; 2609 unsigned long addr, symaddr, val, bytesperword = 0, whichcpu = ~0UL;
2609 int cpu, diag;
2610 unsigned long addr, val, bytesperword = 0, whichcpu = ~0UL;
2611 2610
2612 if (argc < 1 || argc > 3) 2611 if (argc < 1 || argc > 3)
2613 return KDB_ARGCOUNT; 2612 return KDB_ARGCOUNT;
2614 2613
2615 snprintf(buf, sizeof(buf), "per_cpu__%s", argv[1]); 2614 diag = kdbgetaddrarg(argc, argv, &nextarg, &symaddr, NULL, NULL);
2616 if (!kdbgetsymval(buf, &symtab)) { 2615 if (diag)
2617 kdb_printf("%s is not a per_cpu variable\n", argv[1]); 2616 return diag;
2618 return KDB_BADADDR; 2617
2619 }
2620 if (argc >= 2) { 2618 if (argc >= 2) {
2621 diag = kdbgetularg(argv[2], &bytesperword); 2619 diag = kdbgetularg(argv[2], &bytesperword);
2622 if (diag) 2620 if (diag)
@@ -2649,46 +2647,25 @@ static int kdb_per_cpu(int argc, const char **argv)
2649#define KDB_PCU(cpu) 0 2647#define KDB_PCU(cpu) 0
2650#endif 2648#endif
2651#endif 2649#endif
2652
2653 for_each_online_cpu(cpu) { 2650 for_each_online_cpu(cpu) {
2651 if (KDB_FLAG(CMD_INTERRUPT))
2652 return 0;
2653
2654 if (whichcpu != ~0UL && whichcpu != cpu) 2654 if (whichcpu != ~0UL && whichcpu != cpu)
2655 continue; 2655 continue;
2656 addr = symtab.sym_start + KDB_PCU(cpu); 2656 addr = symaddr + KDB_PCU(cpu);
2657 diag = kdb_getword(&val, addr, bytesperword); 2657 diag = kdb_getword(&val, addr, bytesperword);
2658 if (diag) { 2658 if (diag) {
2659 kdb_printf("%5d " kdb_bfd_vma_fmt0 " - unable to " 2659 kdb_printf("%5d " kdb_bfd_vma_fmt0 " - unable to "
2660 "read, diag=%d\n", cpu, addr, diag); 2660 "read, diag=%d\n", cpu, addr, diag);
2661 continue; 2661 continue;
2662 } 2662 }
2663#ifdef CONFIG_SMP
2664 if (!val) {
2665 cpu_set(cpu, suppress);
2666 continue;
2667 }
2668#endif /* CONFIG_SMP */
2669 kdb_printf("%5d ", cpu); 2663 kdb_printf("%5d ", cpu);
2670 kdb_md_line(fmtstr, addr, 2664 kdb_md_line(fmtstr, addr,
2671 bytesperword == KDB_WORD_SIZE, 2665 bytesperword == KDB_WORD_SIZE,
2672 1, bytesperword, 1, 1, 0); 2666 1, bytesperword, 1, 1, 0);
2673 } 2667 }
2674 if (cpus_weight(suppress) == 0)
2675 return 0;
2676 kdb_printf("Zero suppressed cpu(s):");
2677 for (cpu = first_cpu(suppress); cpu < num_possible_cpus();
2678 cpu = next_cpu(cpu, suppress)) {
2679 kdb_printf(" %d", cpu);
2680 if (cpu == num_possible_cpus() - 1 ||
2681 next_cpu(cpu, suppress) != cpu + 1)
2682 continue;
2683 while (cpu < num_possible_cpus() &&
2684 next_cpu(cpu, suppress) == cpu + 1)
2685 ++cpu;
2686 kdb_printf("-%d", cpu);
2687 }
2688 kdb_printf("\n");
2689
2690#undef KDB_PCU 2668#undef KDB_PCU
2691
2692 return 0; 2669 return 0;
2693} 2670}
2694 2671
@@ -2763,13 +2740,13 @@ int kdb_register_repeat(char *cmd,
2763 } 2740 }
2764 if (kdb_commands) { 2741 if (kdb_commands) {
2765 memcpy(new, kdb_commands, 2742 memcpy(new, kdb_commands,
2766 kdb_max_commands * sizeof(*new)); 2743 (kdb_max_commands - KDB_BASE_CMD_MAX) * sizeof(*new));
2767 kfree(kdb_commands); 2744 kfree(kdb_commands);
2768 } 2745 }
2769 memset(new + kdb_max_commands, 0, 2746 memset(new + kdb_max_commands, 0,
2770 kdb_command_extend * sizeof(*new)); 2747 kdb_command_extend * sizeof(*new));
2771 kdb_commands = new; 2748 kdb_commands = new;
2772 kp = kdb_commands + kdb_max_commands; 2749 kp = kdb_commands + kdb_max_commands - KDB_BASE_CMD_MAX;
2773 kdb_max_commands += kdb_command_extend; 2750 kdb_max_commands += kdb_command_extend;
2774 } 2751 }
2775 2752
diff --git a/kernel/exit.c b/kernel/exit.c
index e2bdf37f9fde..676149a4ac5f 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -50,6 +50,7 @@
50#include <linux/perf_event.h> 50#include <linux/perf_event.h>
51#include <trace/events/sched.h> 51#include <trace/events/sched.h>
52#include <linux/hw_breakpoint.h> 52#include <linux/hw_breakpoint.h>
53#include <linux/oom.h>
53 54
54#include <asm/uaccess.h> 55#include <asm/uaccess.h>
55#include <asm/unistd.h> 56#include <asm/unistd.h>
@@ -95,6 +96,14 @@ static void __exit_signal(struct task_struct *tsk)
95 sig->tty = NULL; 96 sig->tty = NULL;
96 } else { 97 } else {
97 /* 98 /*
99 * This can only happen if the caller is de_thread().
100 * FIXME: this is the temporary hack, we should teach
101 * posix-cpu-timers to handle this case correctly.
102 */
103 if (unlikely(has_group_leader_pid(tsk)))
104 posix_cpu_timers_exit_group(tsk);
105
106 /*
98 * If there is any task waiting for the group exit 107 * If there is any task waiting for the group exit
99 * then notify it: 108 * then notify it:
100 */ 109 */
@@ -687,6 +696,8 @@ static void exit_mm(struct task_struct * tsk)
687 enter_lazy_tlb(mm, current); 696 enter_lazy_tlb(mm, current);
688 /* We don't want this task to be frozen prematurely */ 697 /* We don't want this task to be frozen prematurely */
689 clear_freeze_flag(tsk); 698 clear_freeze_flag(tsk);
699 if (tsk->signal->oom_score_adj == OOM_SCORE_ADJ_MIN)
700 atomic_dec(&mm->oom_disable_count);
690 task_unlock(tsk); 701 task_unlock(tsk);
691 mm_update_next_owner(mm); 702 mm_update_next_owner(mm);
692 mmput(mm); 703 mmput(mm);
@@ -700,6 +711,8 @@ static void exit_mm(struct task_struct * tsk)
700 * space. 711 * space.
701 */ 712 */
702static struct task_struct *find_new_reaper(struct task_struct *father) 713static struct task_struct *find_new_reaper(struct task_struct *father)
714 __releases(&tasklist_lock)
715 __acquires(&tasklist_lock)
703{ 716{
704 struct pid_namespace *pid_ns = task_active_pid_ns(father); 717 struct pid_namespace *pid_ns = task_active_pid_ns(father);
705 struct task_struct *thread; 718 struct task_struct *thread;
@@ -901,6 +914,15 @@ NORET_TYPE void do_exit(long code)
901 if (unlikely(!tsk->pid)) 914 if (unlikely(!tsk->pid))
902 panic("Attempted to kill the idle task!"); 915 panic("Attempted to kill the idle task!");
903 916
917 /*
918 * If do_exit is called because this processes oopsed, it's possible
919 * that get_fs() was left as KERNEL_DS, so reset it to USER_DS before
920 * continuing. Amongst other possible reasons, this is to prevent
921 * mm_release()->clear_child_tid() from writing to a user-controlled
922 * kernel address.
923 */
924 set_fs(USER_DS);
925
904 tracehook_report_exit(&code); 926 tracehook_report_exit(&code);
905 927
906 validate_creds_for_do_exit(tsk); 928 validate_creds_for_do_exit(tsk);
diff --git a/kernel/fork.c b/kernel/fork.c
index c445f8cc408d..3b159c5991b7 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -65,6 +65,7 @@
65#include <linux/perf_event.h> 65#include <linux/perf_event.h>
66#include <linux/posix-timers.h> 66#include <linux/posix-timers.h>
67#include <linux/user-return-notifier.h> 67#include <linux/user-return-notifier.h>
68#include <linux/oom.h>
68 69
69#include <asm/pgtable.h> 70#include <asm/pgtable.h>
70#include <asm/pgalloc.h> 71#include <asm/pgalloc.h>
@@ -488,6 +489,7 @@ static struct mm_struct * mm_init(struct mm_struct * mm, struct task_struct *p)
488 mm->cached_hole_size = ~0UL; 489 mm->cached_hole_size = ~0UL;
489 mm_init_aio(mm); 490 mm_init_aio(mm);
490 mm_init_owner(mm, p); 491 mm_init_owner(mm, p);
492 atomic_set(&mm->oom_disable_count, 0);
491 493
492 if (likely(!mm_alloc_pgd(mm))) { 494 if (likely(!mm_alloc_pgd(mm))) {
493 mm->def_flags = 0; 495 mm->def_flags = 0;
@@ -741,6 +743,8 @@ good_mm:
741 /* Initializing for Swap token stuff */ 743 /* Initializing for Swap token stuff */
742 mm->token_priority = 0; 744 mm->token_priority = 0;
743 mm->last_interval = 0; 745 mm->last_interval = 0;
746 if (tsk->signal->oom_score_adj == OOM_SCORE_ADJ_MIN)
747 atomic_inc(&mm->oom_disable_count);
744 748
745 tsk->mm = mm; 749 tsk->mm = mm;
746 tsk->active_mm = mm; 750 tsk->active_mm = mm;
@@ -904,6 +908,8 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk)
904 sig->oom_adj = current->signal->oom_adj; 908 sig->oom_adj = current->signal->oom_adj;
905 sig->oom_score_adj = current->signal->oom_score_adj; 909 sig->oom_score_adj = current->signal->oom_score_adj;
906 910
911 mutex_init(&sig->cred_guard_mutex);
912
907 return 0; 913 return 0;
908} 914}
909 915
@@ -1299,8 +1305,13 @@ bad_fork_cleanup_io:
1299bad_fork_cleanup_namespaces: 1305bad_fork_cleanup_namespaces:
1300 exit_task_namespaces(p); 1306 exit_task_namespaces(p);
1301bad_fork_cleanup_mm: 1307bad_fork_cleanup_mm:
1302 if (p->mm) 1308 if (p->mm) {
1309 task_lock(p);
1310 if (p->signal->oom_score_adj == OOM_SCORE_ADJ_MIN)
1311 atomic_dec(&p->mm->oom_disable_count);
1312 task_unlock(p);
1303 mmput(p->mm); 1313 mmput(p->mm);
1314 }
1304bad_fork_cleanup_signal: 1315bad_fork_cleanup_signal:
1305 if (!(clone_flags & CLONE_THREAD)) 1316 if (!(clone_flags & CLONE_THREAD))
1306 free_signal_struct(p->signal); 1317 free_signal_struct(p->signal);
@@ -1693,6 +1704,10 @@ SYSCALL_DEFINE1(unshare, unsigned long, unshare_flags)
1693 active_mm = current->active_mm; 1704 active_mm = current->active_mm;
1694 current->mm = new_mm; 1705 current->mm = new_mm;
1695 current->active_mm = new_mm; 1706 current->active_mm = new_mm;
1707 if (current->signal->oom_score_adj == OOM_SCORE_ADJ_MIN) {
1708 atomic_dec(&mm->oom_disable_count);
1709 atomic_inc(&new_mm->oom_disable_count);
1710 }
1696 activate_mm(active_mm, new_mm); 1711 activate_mm(active_mm, new_mm);
1697 new_mm = mm; 1712 new_mm = mm;
1698 } 1713 }
diff --git a/kernel/futex.c b/kernel/futex.c
index a118bf160e0b..40a8777a27d0 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -169,7 +169,7 @@ static void get_futex_key_refs(union futex_key *key)
169 169
170 switch (key->both.offset & (FUT_OFF_INODE|FUT_OFF_MMSHARED)) { 170 switch (key->both.offset & (FUT_OFF_INODE|FUT_OFF_MMSHARED)) {
171 case FUT_OFF_INODE: 171 case FUT_OFF_INODE:
172 atomic_inc(&key->shared.inode->i_count); 172 ihold(key->shared.inode);
173 break; 173 break;
174 case FUT_OFF_MMSHARED: 174 case FUT_OFF_MMSHARED:
175 atomic_inc(&key->private.mm->mm_count); 175 atomic_inc(&key->private.mm->mm_count);
@@ -2489,7 +2489,8 @@ void exit_robust_list(struct task_struct *curr)
2489{ 2489{
2490 struct robust_list_head __user *head = curr->robust_list; 2490 struct robust_list_head __user *head = curr->robust_list;
2491 struct robust_list __user *entry, *next_entry, *pending; 2491 struct robust_list __user *entry, *next_entry, *pending;
2492 unsigned int limit = ROBUST_LIST_LIMIT, pi, next_pi, pip; 2492 unsigned int limit = ROBUST_LIST_LIMIT, pi, pip;
2493 unsigned int uninitialized_var(next_pi);
2493 unsigned long futex_offset; 2494 unsigned long futex_offset;
2494 int rc; 2495 int rc;
2495 2496
diff --git a/kernel/futex_compat.c b/kernel/futex_compat.c
index 06da4dfc339b..a7934ac75e5b 100644
--- a/kernel/futex_compat.c
+++ b/kernel/futex_compat.c
@@ -49,7 +49,8 @@ void compat_exit_robust_list(struct task_struct *curr)
49{ 49{
50 struct compat_robust_list_head __user *head = curr->compat_robust_list; 50 struct compat_robust_list_head __user *head = curr->compat_robust_list;
51 struct robust_list __user *entry, *next_entry, *pending; 51 struct robust_list __user *entry, *next_entry, *pending;
52 unsigned int limit = ROBUST_LIST_LIMIT, pi, next_pi, pip; 52 unsigned int limit = ROBUST_LIST_LIMIT, pi, pip;
53 unsigned int uninitialized_var(next_pi);
53 compat_uptr_t uentry, next_uentry, upending; 54 compat_uptr_t uentry, next_uentry, upending;
54 compat_long_t futex_offset; 55 compat_long_t futex_offset;
55 int rc; 56 int rc;
diff --git a/kernel/hw_breakpoint.c b/kernel/hw_breakpoint.c
index 2c9120f0afca..e5325825aeb6 100644
--- a/kernel/hw_breakpoint.c
+++ b/kernel/hw_breakpoint.c
@@ -620,7 +620,7 @@ static struct pmu perf_breakpoint = {
620 .read = hw_breakpoint_pmu_read, 620 .read = hw_breakpoint_pmu_read,
621}; 621};
622 622
623static int __init init_hw_breakpoint(void) 623int __init init_hw_breakpoint(void)
624{ 624{
625 unsigned int **task_bp_pinned; 625 unsigned int **task_bp_pinned;
626 int cpu, err_cpu; 626 int cpu, err_cpu;
@@ -655,6 +655,5 @@ static int __init init_hw_breakpoint(void)
655 655
656 return -ENOMEM; 656 return -ENOMEM;
657} 657}
658core_initcall(init_hw_breakpoint);
659 658
660 659
diff --git a/kernel/irq/irqdesc.c b/kernel/irq/irqdesc.c
index 9d917ff72675..9988d03797f5 100644
--- a/kernel/irq/irqdesc.c
+++ b/kernel/irq/irqdesc.c
@@ -393,3 +393,18 @@ unsigned int kstat_irqs_cpu(unsigned int irq, int cpu)
393 struct irq_desc *desc = irq_to_desc(irq); 393 struct irq_desc *desc = irq_to_desc(irq);
394 return desc ? desc->kstat_irqs[cpu] : 0; 394 return desc ? desc->kstat_irqs[cpu] : 0;
395} 395}
396
397#ifdef CONFIG_GENERIC_HARDIRQS
398unsigned int kstat_irqs(unsigned int irq)
399{
400 struct irq_desc *desc = irq_to_desc(irq);
401 int cpu;
402 int sum = 0;
403
404 if (!desc)
405 return 0;
406 for_each_possible_cpu(cpu)
407 sum += desc->kstat_irqs[cpu];
408 return sum;
409}
410#endif /* CONFIG_GENERIC_HARDIRQS */
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index 644e8d5fa367..5f92acc5f952 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -324,6 +324,10 @@ void enable_irq(unsigned int irq)
324 if (!desc) 324 if (!desc)
325 return; 325 return;
326 326
327 if (WARN(!desc->irq_data.chip || !desc->irq_data.chip->irq_enable,
328 KERN_ERR "enable_irq before setup/request_irq: irq %u\n", irq))
329 return;
330
327 chip_bus_lock(desc); 331 chip_bus_lock(desc);
328 raw_spin_lock_irqsave(&desc->lock, flags); 332 raw_spin_lock_irqsave(&desc->lock, flags);
329 __enable_irq(desc, irq, false); 333 __enable_irq(desc, irq, false);
diff --git a/kernel/irq/proc.c b/kernel/irq/proc.c
index 01b1d3a88983..6c8a2a9f8a7b 100644
--- a/kernel/irq/proc.c
+++ b/kernel/irq/proc.c
@@ -214,7 +214,7 @@ static int irq_spurious_proc_show(struct seq_file *m, void *v)
214 214
215static int irq_spurious_proc_open(struct inode *inode, struct file *file) 215static int irq_spurious_proc_open(struct inode *inode, struct file *file)
216{ 216{
217 return single_open(file, irq_spurious_proc_show, NULL); 217 return single_open(file, irq_spurious_proc_show, PDE(inode)->data);
218} 218}
219 219
220static const struct file_operations irq_spurious_proc_fops = { 220static const struct file_operations irq_spurious_proc_fops = {
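
The one-line irq/proc.c change hands PDE(inode)->data to single_open() so that irq_spurious_proc_show() receives the per-interrupt data the entry was registered with instead of NULL. A generic sketch of that seq_file pattern follows, with made-up names (struct example_stats, example_proc_*); it is not the kernel's code.

#include <linux/proc_fs.h>
#include <linux/seq_file.h>

struct example_stats {			/* hypothetical per-entry payload */
	unsigned long unhandled;
};

static int example_proc_show(struct seq_file *m, void *v)
{
	struct example_stats *st = m->private;	/* the pointer passed to single_open() */

	seq_printf(m, "%lu\n", st->unhandled);
	return 0;
}

static int example_proc_open(struct inode *inode, struct file *file)
{
	/* recover the data stored in the proc entry, as the fix above does */
	return single_open(file, example_proc_show, PDE(inode)->data);
}

The matching registration side would be proc_create_data(name, mode, parent, &fops, st), which is what populates the pointer that PDE(inode)->data later returns.
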
diff --git a/kernel/irq_work.c b/kernel/irq_work.c
index f16763ff8481..90f881904bb1 100644
--- a/kernel/irq_work.c
+++ b/kernel/irq_work.c
@@ -145,7 +145,9 @@ void irq_work_run(void)
145 * Clear the BUSY bit and return to the free state if 145 * Clear the BUSY bit and return to the free state if
146 * no-one else claimed it meanwhile. 146 * no-one else claimed it meanwhile.
147 */ 147 */
148 cmpxchg(&entry->next, next_flags(NULL, IRQ_WORK_BUSY), NULL); 148 (void)cmpxchg(&entry->next,
149 next_flags(NULL, IRQ_WORK_BUSY),
150 NULL);
149 } 151 }
150} 152}
151EXPORT_SYMBOL_GPL(irq_work_run); 153EXPORT_SYMBOL_GPL(irq_work_run);
diff --git a/kernel/jump_label.c b/kernel/jump_label.c
index 7be868bf25c6..3b79bd938330 100644
--- a/kernel/jump_label.c
+++ b/kernel/jump_label.c
@@ -39,6 +39,16 @@ struct jump_label_module_entry {
39 struct module *mod; 39 struct module *mod;
40}; 40};
41 41
42void jump_label_lock(void)
43{
44 mutex_lock(&jump_label_mutex);
45}
46
47void jump_label_unlock(void)
48{
49 mutex_unlock(&jump_label_mutex);
50}
51
42static int jump_label_cmp(const void *a, const void *b) 52static int jump_label_cmp(const void *a, const void *b)
43{ 53{
44 const struct jump_entry *jea = a; 54 const struct jump_entry *jea = a;
@@ -152,7 +162,7 @@ void jump_label_update(unsigned long key, enum jump_label_type type)
152 struct jump_label_module_entry *e_module; 162 struct jump_label_module_entry *e_module;
153 int count; 163 int count;
154 164
155 mutex_lock(&jump_label_mutex); 165 jump_label_lock();
156 entry = get_jump_label_entry((jump_label_t)key); 166 entry = get_jump_label_entry((jump_label_t)key);
157 if (entry) { 167 if (entry) {
158 count = entry->nr_entries; 168 count = entry->nr_entries;
@@ -168,13 +178,14 @@ void jump_label_update(unsigned long key, enum jump_label_type type)
168 count = e_module->nr_entries; 178 count = e_module->nr_entries;
169 iter = e_module->table; 179 iter = e_module->table;
170 while (count--) { 180 while (count--) {
171 if (kernel_text_address(iter->code)) 181 if (iter->key &&
182 kernel_text_address(iter->code))
172 arch_jump_label_transform(iter, type); 183 arch_jump_label_transform(iter, type);
173 iter++; 184 iter++;
174 } 185 }
175 } 186 }
176 } 187 }
177 mutex_unlock(&jump_label_mutex); 188 jump_label_unlock();
178} 189}
179 190
180static int addr_conflict(struct jump_entry *entry, void *start, void *end) 191static int addr_conflict(struct jump_entry *entry, void *start, void *end)
@@ -231,6 +242,7 @@ out:
231 * overlaps with any of the jump label patch addresses. Code 242 * overlaps with any of the jump label patch addresses. Code
232 * that wants to modify kernel text should first verify that 243 * that wants to modify kernel text should first verify that
233 * it does not overlap with any of the jump label addresses. 244 * it does not overlap with any of the jump label addresses.
245 * Caller must hold jump_label_mutex.
234 * 246 *
235 * returns 1 if there is an overlap, 0 otherwise 247 * returns 1 if there is an overlap, 0 otherwise
236 */ 248 */
@@ -241,7 +253,6 @@ int jump_label_text_reserved(void *start, void *end)
241 struct jump_entry *iter_stop = __start___jump_table; 253 struct jump_entry *iter_stop = __start___jump_table;
242 int conflict = 0; 254 int conflict = 0;
243 255
244 mutex_lock(&jump_label_mutex);
245 iter = iter_start; 256 iter = iter_start;
246 while (iter < iter_stop) { 257 while (iter < iter_stop) {
247 if (addr_conflict(iter, start, end)) { 258 if (addr_conflict(iter, start, end)) {
@@ -256,10 +267,16 @@ int jump_label_text_reserved(void *start, void *end)
256 conflict = module_conflict(start, end); 267 conflict = module_conflict(start, end);
257#endif 268#endif
258out: 269out:
259 mutex_unlock(&jump_label_mutex);
260 return conflict; 270 return conflict;
261} 271}
262 272
273/*
274 * Not all archs need this.
275 */
276void __weak arch_jump_label_text_poke_early(jump_label_t addr)
277{
278}
279
263static __init int init_jump_label(void) 280static __init int init_jump_label(void)
264{ 281{
265 int ret; 282 int ret;
@@ -267,7 +284,7 @@ static __init int init_jump_label(void)
267 struct jump_entry *iter_stop = __stop___jump_table; 284 struct jump_entry *iter_stop = __stop___jump_table;
268 struct jump_entry *iter; 285 struct jump_entry *iter;
269 286
270 mutex_lock(&jump_label_mutex); 287 jump_label_lock();
271 ret = build_jump_label_hashtable(__start___jump_table, 288 ret = build_jump_label_hashtable(__start___jump_table,
272 __stop___jump_table); 289 __stop___jump_table);
273 iter = iter_start; 290 iter = iter_start;
@@ -275,7 +292,7 @@ static __init int init_jump_label(void)
275 arch_jump_label_text_poke_early(iter->code); 292 arch_jump_label_text_poke_early(iter->code);
276 iter++; 293 iter++;
277 } 294 }
278 mutex_unlock(&jump_label_mutex); 295 jump_label_unlock();
279 return ret; 296 return ret;
280} 297}
281early_initcall(init_jump_label); 298early_initcall(init_jump_label);
@@ -366,6 +383,39 @@ static void remove_jump_label_module(struct module *mod)
366 } 383 }
367} 384}
368 385
386static void remove_jump_label_module_init(struct module *mod)
387{
388 struct hlist_head *head;
389 struct hlist_node *node, *node_next, *module_node, *module_node_next;
390 struct jump_label_entry *e;
391 struct jump_label_module_entry *e_module;
392 struct jump_entry *iter;
393 int i, count;
394
395 /* if the module doesn't have jump label entries, just return */
396 if (!mod->num_jump_entries)
397 return;
398
399 for (i = 0; i < JUMP_LABEL_TABLE_SIZE; i++) {
400 head = &jump_label_table[i];
401 hlist_for_each_entry_safe(e, node, node_next, head, hlist) {
402 hlist_for_each_entry_safe(e_module, module_node,
403 module_node_next,
404 &(e->modules), hlist) {
405 if (e_module->mod != mod)
406 continue;
407 count = e_module->nr_entries;
408 iter = e_module->table;
409 while (count--) {
410 if (within_module_init(iter->code, mod))
411 iter->key = 0;
412 iter++;
413 }
414 }
415 }
416 }
417}
418
369static int 419static int
370jump_label_module_notify(struct notifier_block *self, unsigned long val, 420jump_label_module_notify(struct notifier_block *self, unsigned long val,
371 void *data) 421 void *data)
@@ -375,16 +425,21 @@ jump_label_module_notify(struct notifier_block *self, unsigned long val,
375 425
376 switch (val) { 426 switch (val) {
377 case MODULE_STATE_COMING: 427 case MODULE_STATE_COMING:
378 mutex_lock(&jump_label_mutex); 428 jump_label_lock();
379 ret = add_jump_label_module(mod); 429 ret = add_jump_label_module(mod);
380 if (ret) 430 if (ret)
381 remove_jump_label_module(mod); 431 remove_jump_label_module(mod);
382 mutex_unlock(&jump_label_mutex); 432 jump_label_unlock();
383 break; 433 break;
384 case MODULE_STATE_GOING: 434 case MODULE_STATE_GOING:
385 mutex_lock(&jump_label_mutex); 435 jump_label_lock();
386 remove_jump_label_module(mod); 436 remove_jump_label_module(mod);
387 mutex_unlock(&jump_label_mutex); 437 jump_label_unlock();
438 break;
439 case MODULE_STATE_LIVE:
440 jump_label_lock();
441 remove_jump_label_module_init(mod);
442 jump_label_unlock();
388 break; 443 break;
389 } 444 }
390 return ret; 445 return ret;
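The MODULE_STATE_LIVE case added above runs once a module's init sections have been freed; remove_jump_label_module_init() then zeroes the keys of jump entries that pointed into .init.text so they can never be patched later. A minimal sketch of the same module-notifier idiom, with hypothetical names (my_jump_notify, my_drop_init_entries) and only the standard notifier API assumed:

#include <linux/module.h>
#include <linux/notifier.h>

/* Hypothetical helper: forget jump entries that lived in mod's .init text. */
static void my_drop_init_entries(struct module *mod)
{
        /* walk a private table and clear entries matching within_module_init() */
}

static int my_jump_notify(struct notifier_block *self, unsigned long val,
                          void *data)
{
        struct module *mod = data;

        if (val == MODULE_STATE_LIVE)
                my_drop_init_entries(mod);      /* .init.text is gone by now */
        return 0;
}

static struct notifier_block my_jump_nb = {
        .notifier_call = my_jump_notify,
};

static int __init my_jump_init(void)
{
        return register_module_notifier(&my_jump_nb);
}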
diff --git a/kernel/kexec.c b/kernel/kexec.c
index c0613f7d6730..b55045bc7563 100644
--- a/kernel/kexec.c
+++ b/kernel/kexec.c
@@ -816,7 +816,7 @@ static int kimage_load_normal_segment(struct kimage *image,
816 816
817 ptr = kmap(page); 817 ptr = kmap(page);
818 /* Start with a clear page */ 818 /* Start with a clear page */
819 memset(ptr, 0, PAGE_SIZE); 819 clear_page(ptr);
820 ptr += maddr & ~PAGE_MASK; 820 ptr += maddr & ~PAGE_MASK;
821 mchunk = PAGE_SIZE - (maddr & ~PAGE_MASK); 821 mchunk = PAGE_SIZE - (maddr & ~PAGE_MASK);
822 if (mchunk > mbytes) 822 if (mchunk > mbytes)
diff --git a/kernel/kprobes.c b/kernel/kprobes.c
index 56a891914273..9737a76e106f 100644
--- a/kernel/kprobes.c
+++ b/kernel/kprobes.c
@@ -74,7 +74,8 @@ static struct hlist_head kretprobe_inst_table[KPROBE_TABLE_SIZE];
74/* NOTE: change this value only with kprobe_mutex held */ 74/* NOTE: change this value only with kprobe_mutex held */
75static bool kprobes_all_disarmed; 75static bool kprobes_all_disarmed;
76 76
77static DEFINE_MUTEX(kprobe_mutex); /* Protects kprobe_table */ 77/* This protects kprobe_table and optimizing_list */
78static DEFINE_MUTEX(kprobe_mutex);
78static DEFINE_PER_CPU(struct kprobe *, kprobe_instance) = NULL; 79static DEFINE_PER_CPU(struct kprobe *, kprobe_instance) = NULL;
79static struct { 80static struct {
80 spinlock_t lock ____cacheline_aligned_in_smp; 81 spinlock_t lock ____cacheline_aligned_in_smp;
@@ -595,6 +596,7 @@ static __kprobes void try_to_optimize_kprobe(struct kprobe *p)
595} 596}
596 597
597#ifdef CONFIG_SYSCTL 598#ifdef CONFIG_SYSCTL
599/* This should be called with kprobe_mutex locked */
598static void __kprobes optimize_all_kprobes(void) 600static void __kprobes optimize_all_kprobes(void)
599{ 601{
600 struct hlist_head *head; 602 struct hlist_head *head;
@@ -607,17 +609,16 @@ static void __kprobes optimize_all_kprobes(void)
607 return; 609 return;
608 610
609 kprobes_allow_optimization = true; 611 kprobes_allow_optimization = true;
610 mutex_lock(&text_mutex);
611 for (i = 0; i < KPROBE_TABLE_SIZE; i++) { 612 for (i = 0; i < KPROBE_TABLE_SIZE; i++) {
612 head = &kprobe_table[i]; 613 head = &kprobe_table[i];
613 hlist_for_each_entry_rcu(p, node, head, hlist) 614 hlist_for_each_entry_rcu(p, node, head, hlist)
614 if (!kprobe_disabled(p)) 615 if (!kprobe_disabled(p))
615 optimize_kprobe(p); 616 optimize_kprobe(p);
616 } 617 }
617 mutex_unlock(&text_mutex);
618 printk(KERN_INFO "Kprobes globally optimized\n"); 618 printk(KERN_INFO "Kprobes globally optimized\n");
619} 619}
620 620
621/* This should be called with kprobe_mutex locked */
621static void __kprobes unoptimize_all_kprobes(void) 622static void __kprobes unoptimize_all_kprobes(void)
622{ 623{
623 struct hlist_head *head; 624 struct hlist_head *head;
@@ -1144,14 +1145,13 @@ int __kprobes register_kprobe(struct kprobe *p)
1144 if (ret) 1145 if (ret)
1145 return ret; 1146 return ret;
1146 1147
1148 jump_label_lock();
1147 preempt_disable(); 1149 preempt_disable();
1148 if (!kernel_text_address((unsigned long) p->addr) || 1150 if (!kernel_text_address((unsigned long) p->addr) ||
1149 in_kprobes_functions((unsigned long) p->addr) || 1151 in_kprobes_functions((unsigned long) p->addr) ||
1150 ftrace_text_reserved(p->addr, p->addr) || 1152 ftrace_text_reserved(p->addr, p->addr) ||
1151 jump_label_text_reserved(p->addr, p->addr)) { 1153 jump_label_text_reserved(p->addr, p->addr))
1152 preempt_enable(); 1154 goto fail_with_jump_label;
1153 return -EINVAL;
1154 }
1155 1155
1156 /* User can pass only KPROBE_FLAG_DISABLED to register_kprobe */ 1156 /* User can pass only KPROBE_FLAG_DISABLED to register_kprobe */
1157 p->flags &= KPROBE_FLAG_DISABLED; 1157 p->flags &= KPROBE_FLAG_DISABLED;
@@ -1165,10 +1165,9 @@ int __kprobes register_kprobe(struct kprobe *p)
1165 * We must hold a refcount of the probed module while updating 1165 * We must hold a refcount of the probed module while updating
1166 * its code to prohibit unexpected unloading. 1166 * its code to prohibit unexpected unloading.
1167 */ 1167 */
1168 if (unlikely(!try_module_get(probed_mod))) { 1168 if (unlikely(!try_module_get(probed_mod)))
1169 preempt_enable(); 1169 goto fail_with_jump_label;
1170 return -EINVAL; 1170
1171 }
1172 /* 1171 /*
1173 * If the module freed .init.text, we couldn't insert 1172 * If the module freed .init.text, we couldn't insert
1174 * kprobes in there. 1173 * kprobes in there.
@@ -1176,16 +1175,18 @@ int __kprobes register_kprobe(struct kprobe *p)
1176 if (within_module_init((unsigned long)p->addr, probed_mod) && 1175 if (within_module_init((unsigned long)p->addr, probed_mod) &&
1177 probed_mod->state != MODULE_STATE_COMING) { 1176 probed_mod->state != MODULE_STATE_COMING) {
1178 module_put(probed_mod); 1177 module_put(probed_mod);
1179 preempt_enable(); 1178 goto fail_with_jump_label;
1180 return -EINVAL;
1181 } 1179 }
1182 } 1180 }
1183 preempt_enable(); 1181 preempt_enable();
1182 jump_label_unlock();
1184 1183
1185 p->nmissed = 0; 1184 p->nmissed = 0;
1186 INIT_LIST_HEAD(&p->list); 1185 INIT_LIST_HEAD(&p->list);
1187 mutex_lock(&kprobe_mutex); 1186 mutex_lock(&kprobe_mutex);
1188 1187
1188 jump_label_lock(); /* needed to call jump_label_text_reserved() */
1189
1189 get_online_cpus(); /* For avoiding text_mutex deadlock. */ 1190 get_online_cpus(); /* For avoiding text_mutex deadlock. */
1190 mutex_lock(&text_mutex); 1191 mutex_lock(&text_mutex);
1191 1192
@@ -1213,12 +1214,18 @@ int __kprobes register_kprobe(struct kprobe *p)
1213out: 1214out:
1214 mutex_unlock(&text_mutex); 1215 mutex_unlock(&text_mutex);
1215 put_online_cpus(); 1216 put_online_cpus();
1217 jump_label_unlock();
1216 mutex_unlock(&kprobe_mutex); 1218 mutex_unlock(&kprobe_mutex);
1217 1219
1218 if (probed_mod) 1220 if (probed_mod)
1219 module_put(probed_mod); 1221 module_put(probed_mod);
1220 1222
1221 return ret; 1223 return ret;
1224
1225fail_with_jump_label:
1226 preempt_enable();
1227 jump_label_unlock();
1228 return -EINVAL;
1222} 1229}
1223EXPORT_SYMBOL_GPL(register_kprobe); 1230EXPORT_SYMBOL_GPL(register_kprobe);
1224 1231
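The register_kprobe() hunks above take jump_label_lock() ahead of the text-reservation checks and route every early failure through the single fail_with_jump_label label, so preemption and the lock are always released in reverse order of acquisition instead of in three duplicated exit paths. A generic hedged sketch of that single-unwind-label idiom (placeholder names, not the kprobes code itself):

static int my_check(void)  { return 1; }        /* placeholder validation   */
static int my_commit(void) { return 0; }        /* placeholder registration */

static int my_register(void)
{
        jump_label_lock();
        preempt_disable();

        if (!my_check())
                goto fail;                      /* one cleanup path for all errors */

        preempt_enable();
        jump_label_unlock();
        return my_commit();

fail:
        preempt_enable();                       /* release in reverse order */
        jump_label_unlock();
        return -EINVAL;
}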
diff --git a/kernel/latencytop.c b/kernel/latencytop.c
index 877fb306d415..17110a4a4fc2 100644
--- a/kernel/latencytop.c
+++ b/kernel/latencytop.c
@@ -194,14 +194,7 @@ __account_scheduler_latency(struct task_struct *tsk, int usecs, int inter)
194 194
195 account_global_scheduler_latency(tsk, &lat); 195 account_global_scheduler_latency(tsk, &lat);
196 196
197 /* 197 for (i = 0; i < tsk->latency_record_count; i++) {
198 * short term hack; if we're > 32 we stop; future we recycle:
199 */
200 tsk->latency_record_count++;
201 if (tsk->latency_record_count >= LT_SAVECOUNT)
202 goto out_unlock;
203
204 for (i = 0; i < LT_SAVECOUNT; i++) {
205 struct latency_record *mylat; 198 struct latency_record *mylat;
206 int same = 1; 199 int same = 1;
207 200
@@ -227,8 +220,14 @@ __account_scheduler_latency(struct task_struct *tsk, int usecs, int inter)
227 } 220 }
228 } 221 }
229 222
223 /*
224 * short term hack; if we're > 32 we stop; future we recycle:
225 */
226 if (tsk->latency_record_count >= LT_SAVECOUNT)
227 goto out_unlock;
228
230 /* Allocated a new one: */ 229 /* Allocated a new one: */
231 i = tsk->latency_record_count; 230 i = tsk->latency_record_count++;
232 memcpy(&tsk->latency_record[i], &lat, sizeof(struct latency_record)); 231 memcpy(&tsk->latency_record[i], &lat, sizeof(struct latency_record));
233 232
234out_unlock: 233out_unlock:
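After this change the scan walks only the latency_record_count entries that actually exist, and the counter is bumped only when a genuinely new record is appended while there is still room. A simplified sketch of the corrected flow, using a hypothetical record type rather than the real backtrace comparison:

#define LT_SAVECOUNT 32

struct my_latency_record {
        unsigned long backtrace_hash;   /* stands in for the real backtrace match */
        unsigned long count;
        unsigned long time;
};

/* Hedged sketch of the fixed accounting logic, not the kernel function. */
static void my_account(struct my_latency_record *tbl, int *nr,
                       unsigned long hash, unsigned long usecs)
{
        int i;

        for (i = 0; i < *nr; i++) {             /* only the used slots */
                if (tbl[i].backtrace_hash == hash) {
                        tbl[i].count++;
                        tbl[i].time += usecs;
                        return;
                }
        }

        if (*nr >= LT_SAVECOUNT)                /* table full: drop for now */
                return;

        i = (*nr)++;                            /* append and count it */
        tbl[i].backtrace_hash = hash;
        tbl[i].count = 1;
        tbl[i].time = usecs;
}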
diff --git a/kernel/module.c b/kernel/module.c
index 2df46301a7a4..d190664f25ff 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -2037,7 +2037,7 @@ static inline void layout_symtab(struct module *mod, struct load_info *info)
2037{ 2037{
2038} 2038}
2039 2039
2040static void add_kallsyms(struct module *mod, struct load_info *info) 2040static void add_kallsyms(struct module *mod, const struct load_info *info)
2041{ 2041{
2042} 2042}
2043#endif /* CONFIG_KALLSYMS */ 2043#endif /* CONFIG_KALLSYMS */
@@ -2326,6 +2326,18 @@ static void find_module_sections(struct module *mod, struct load_info *info)
2326 kmemleak_scan_area(mod->trace_events, sizeof(*mod->trace_events) * 2326 kmemleak_scan_area(mod->trace_events, sizeof(*mod->trace_events) *
2327 mod->num_trace_events, GFP_KERNEL); 2327 mod->num_trace_events, GFP_KERNEL);
2328#endif 2328#endif
2329#ifdef CONFIG_TRACING
2330 mod->trace_bprintk_fmt_start = section_objs(info, "__trace_printk_fmt",
2331 sizeof(*mod->trace_bprintk_fmt_start),
2332 &mod->num_trace_bprintk_fmt);
2333 /*
2334 * This section contains pointers to allocated objects in the trace
2335 * code and not scanning it leads to false positives.
2336 */
2337 kmemleak_scan_area(mod->trace_bprintk_fmt_start,
2338 sizeof(*mod->trace_bprintk_fmt_start) *
2339 mod->num_trace_bprintk_fmt, GFP_KERNEL);
2340#endif
2329#ifdef CONFIG_FTRACE_MCOUNT_RECORD 2341#ifdef CONFIG_FTRACE_MCOUNT_RECORD
2330 /* sechdrs[0].sh_size is always zero */ 2342 /* sechdrs[0].sh_size is always zero */
2331 mod->ftrace_callsites = section_objs(info, "__mcount_loc", 2343 mod->ftrace_callsites = section_objs(info, "__mcount_loc",
diff --git a/kernel/ns_cgroup.c b/kernel/ns_cgroup.c
index 2a5dfec8efe0..2c98ad94ba0e 100644
--- a/kernel/ns_cgroup.c
+++ b/kernel/ns_cgroup.c
@@ -85,6 +85,14 @@ static struct cgroup_subsys_state *ns_create(struct cgroup_subsys *ss,
85 return ERR_PTR(-EPERM); 85 return ERR_PTR(-EPERM);
86 if (!cgroup_is_descendant(cgroup, current)) 86 if (!cgroup_is_descendant(cgroup, current))
87 return ERR_PTR(-EPERM); 87 return ERR_PTR(-EPERM);
88 if (test_bit(CGRP_CLONE_CHILDREN, &cgroup->flags)) {
89 printk("ns_cgroup can't be created with parent "
90 "'clone_children' set.\n");
91 return ERR_PTR(-EINVAL);
92 }
93
94 printk_once("ns_cgroup deprecated: consider using the "
95 "'clone_children' flag without the ns_cgroup.\n");
88 96
89 ns_cgroup = kzalloc(sizeof(*ns_cgroup), GFP_KERNEL); 97 ns_cgroup = kzalloc(sizeof(*ns_cgroup), GFP_KERNEL);
90 if (!ns_cgroup) 98 if (!ns_cgroup)
diff --git a/kernel/perf_event.c b/kernel/perf_event.c
index f309e8014c78..eac7e3364335 100644
--- a/kernel/perf_event.c
+++ b/kernel/perf_event.c
@@ -31,6 +31,7 @@
31#include <linux/kernel_stat.h> 31#include <linux/kernel_stat.h>
32#include <linux/perf_event.h> 32#include <linux/perf_event.h>
33#include <linux/ftrace_event.h> 33#include <linux/ftrace_event.h>
34#include <linux/hw_breakpoint.h>
34 35
35#include <asm/irq_regs.h> 36#include <asm/irq_regs.h>
36 37
@@ -417,8 +418,8 @@ event_filter_match(struct perf_event *event)
417 return event->cpu == -1 || event->cpu == smp_processor_id(); 418 return event->cpu == -1 || event->cpu == smp_processor_id();
418} 419}
419 420
420static int 421static void
421__event_sched_out(struct perf_event *event, 422event_sched_out(struct perf_event *event,
422 struct perf_cpu_context *cpuctx, 423 struct perf_cpu_context *cpuctx,
423 struct perf_event_context *ctx) 424 struct perf_event_context *ctx)
424{ 425{
@@ -437,13 +438,14 @@ __event_sched_out(struct perf_event *event,
437 } 438 }
438 439
439 if (event->state != PERF_EVENT_STATE_ACTIVE) 440 if (event->state != PERF_EVENT_STATE_ACTIVE)
440 return 0; 441 return;
441 442
442 event->state = PERF_EVENT_STATE_INACTIVE; 443 event->state = PERF_EVENT_STATE_INACTIVE;
443 if (event->pending_disable) { 444 if (event->pending_disable) {
444 event->pending_disable = 0; 445 event->pending_disable = 0;
445 event->state = PERF_EVENT_STATE_OFF; 446 event->state = PERF_EVENT_STATE_OFF;
446 } 447 }
448 event->tstamp_stopped = ctx->time;
447 event->pmu->del(event, 0); 449 event->pmu->del(event, 0);
448 event->oncpu = -1; 450 event->oncpu = -1;
449 451
@@ -452,19 +454,6 @@ __event_sched_out(struct perf_event *event,
452 ctx->nr_active--; 454 ctx->nr_active--;
453 if (event->attr.exclusive || !cpuctx->active_oncpu) 455 if (event->attr.exclusive || !cpuctx->active_oncpu)
454 cpuctx->exclusive = 0; 456 cpuctx->exclusive = 0;
455 return 1;
456}
457
458static void
459event_sched_out(struct perf_event *event,
460 struct perf_cpu_context *cpuctx,
461 struct perf_event_context *ctx)
462{
463 int ret;
464
465 ret = __event_sched_out(event, cpuctx, ctx);
466 if (ret)
467 event->tstamp_stopped = ctx->time;
468} 457}
469 458
470static void 459static void
@@ -664,7 +653,7 @@ retry:
664} 653}
665 654
666static int 655static int
667__event_sched_in(struct perf_event *event, 656event_sched_in(struct perf_event *event,
668 struct perf_cpu_context *cpuctx, 657 struct perf_cpu_context *cpuctx,
669 struct perf_event_context *ctx) 658 struct perf_event_context *ctx)
670{ 659{
@@ -684,6 +673,10 @@ __event_sched_in(struct perf_event *event,
684 return -EAGAIN; 673 return -EAGAIN;
685 } 674 }
686 675
676 event->tstamp_running += ctx->time - event->tstamp_stopped;
677
678 event->shadow_ctx_time = ctx->time - ctx->timestamp;
679
687 if (!is_software_event(event)) 680 if (!is_software_event(event))
688 cpuctx->active_oncpu++; 681 cpuctx->active_oncpu++;
689 ctx->nr_active++; 682 ctx->nr_active++;
@@ -694,35 +687,6 @@ __event_sched_in(struct perf_event *event,
694 return 0; 687 return 0;
695} 688}
696 689
697static inline int
698event_sched_in(struct perf_event *event,
699 struct perf_cpu_context *cpuctx,
700 struct perf_event_context *ctx)
701{
702 int ret = __event_sched_in(event, cpuctx, ctx);
703 if (ret)
704 return ret;
705 event->tstamp_running += ctx->time - event->tstamp_stopped;
706 return 0;
707}
708
709static void
710group_commit_event_sched_in(struct perf_event *group_event,
711 struct perf_cpu_context *cpuctx,
712 struct perf_event_context *ctx)
713{
714 struct perf_event *event;
715 u64 now = ctx->time;
716
717 group_event->tstamp_running += now - group_event->tstamp_stopped;
718 /*
719 * Schedule in siblings as one group (if any):
720 */
721 list_for_each_entry(event, &group_event->sibling_list, group_entry) {
722 event->tstamp_running += now - event->tstamp_stopped;
723 }
724}
725
726static int 690static int
727group_sched_in(struct perf_event *group_event, 691group_sched_in(struct perf_event *group_event,
728 struct perf_cpu_context *cpuctx, 692 struct perf_cpu_context *cpuctx,
@@ -730,19 +694,15 @@ group_sched_in(struct perf_event *group_event,
730{ 694{
731 struct perf_event *event, *partial_group = NULL; 695 struct perf_event *event, *partial_group = NULL;
732 struct pmu *pmu = group_event->pmu; 696 struct pmu *pmu = group_event->pmu;
697 u64 now = ctx->time;
698 bool simulate = false;
733 699
734 if (group_event->state == PERF_EVENT_STATE_OFF) 700 if (group_event->state == PERF_EVENT_STATE_OFF)
735 return 0; 701 return 0;
736 702
737 pmu->start_txn(pmu); 703 pmu->start_txn(pmu);
738 704
739 /* 705 if (event_sched_in(group_event, cpuctx, ctx)) {
740 * use __event_sched_in() to delay updating tstamp_running
741 * until the transaction is committed. In case of failure
742 * we will keep an unmodified tstamp_running which is a
743 * requirement to get correct timing information
744 */
745 if (__event_sched_in(group_event, cpuctx, ctx)) {
746 pmu->cancel_txn(pmu); 706 pmu->cancel_txn(pmu);
747 return -EAGAIN; 707 return -EAGAIN;
748 } 708 }
@@ -751,31 +711,42 @@ group_sched_in(struct perf_event *group_event,
751 * Schedule in siblings as one group (if any): 711 * Schedule in siblings as one group (if any):
752 */ 712 */
753 list_for_each_entry(event, &group_event->sibling_list, group_entry) { 713 list_for_each_entry(event, &group_event->sibling_list, group_entry) {
754 if (__event_sched_in(event, cpuctx, ctx)) { 714 if (event_sched_in(event, cpuctx, ctx)) {
755 partial_group = event; 715 partial_group = event;
756 goto group_error; 716 goto group_error;
757 } 717 }
758 } 718 }
759 719
760 if (!pmu->commit_txn(pmu)) { 720 if (!pmu->commit_txn(pmu))
761 /* commit tstamp_running */
762 group_commit_event_sched_in(group_event, cpuctx, ctx);
763 return 0; 721 return 0;
764 } 722
765group_error: 723group_error:
766 /* 724 /*
767 * Groups can be scheduled in as one unit only, so undo any 725 * Groups can be scheduled in as one unit only, so undo any
768 * partial group before returning: 726 * partial group before returning:
 727 * The events up to the failed event are scheduled out normally;
 728 * their tstamp_stopped will be updated.
769 * 729 *
770 * use __event_sched_out() to avoid updating tstamp_stopped 730 * The failed events and the remaining siblings need to have
771 * because the event never actually ran 731 * their timings updated as if they had gone thru event_sched_in()
732 * and event_sched_out(). This is required to get consistent timings
733 * across the group. This also takes care of the case where the group
734 * could never be scheduled by ensuring tstamp_stopped is set to mark
735 * the time the event was actually stopped, such that time delta
736 * calculation in update_event_times() is correct.
772 */ 737 */
773 list_for_each_entry(event, &group_event->sibling_list, group_entry) { 738 list_for_each_entry(event, &group_event->sibling_list, group_entry) {
774 if (event == partial_group) 739 if (event == partial_group)
775 break; 740 simulate = true;
776 __event_sched_out(event, cpuctx, ctx); 741
742 if (simulate) {
743 event->tstamp_running += now - event->tstamp_stopped;
744 event->tstamp_stopped = now;
745 } else {
746 event_sched_out(event, cpuctx, ctx);
747 }
777 } 748 }
778 __event_sched_out(group_event, cpuctx, ctx); 749 event_sched_out(group_event, cpuctx, ctx);
779 750
780 pmu->cancel_txn(pmu); 751 pmu->cancel_txn(pmu);
781 752
@@ -1316,8 +1287,6 @@ void __perf_event_task_sched_out(struct task_struct *task,
1316{ 1287{
1317 int ctxn; 1288 int ctxn;
1318 1289
1319 perf_sw_event(PERF_COUNT_SW_CONTEXT_SWITCHES, 1, 1, NULL, 0);
1320
1321 for_each_task_context_nr(ctxn) 1290 for_each_task_context_nr(ctxn)
1322 perf_event_context_sched_out(task, ctxn, next); 1291 perf_event_context_sched_out(task, ctxn, next);
1323} 1292}
@@ -1651,8 +1620,12 @@ static void rotate_ctx(struct perf_event_context *ctx)
1651{ 1620{
1652 raw_spin_lock(&ctx->lock); 1621 raw_spin_lock(&ctx->lock);
1653 1622
1654 /* Rotate the first entry last of non-pinned groups */ 1623 /*
1655 list_rotate_left(&ctx->flexible_groups); 1624 * Rotate the first entry last of non-pinned groups. Rotation might be
1625 * disabled by the inheritance code.
1626 */
1627 if (!ctx->rotate_disable)
1628 list_rotate_left(&ctx->flexible_groups);
1656 1629
1657 raw_spin_unlock(&ctx->lock); 1630 raw_spin_unlock(&ctx->lock);
1658} 1631}
@@ -2264,11 +2237,6 @@ int perf_event_release_kernel(struct perf_event *event)
2264 raw_spin_unlock_irq(&ctx->lock); 2237 raw_spin_unlock_irq(&ctx->lock);
2265 mutex_unlock(&ctx->mutex); 2238 mutex_unlock(&ctx->mutex);
2266 2239
2267 mutex_lock(&event->owner->perf_event_mutex);
2268 list_del_init(&event->owner_entry);
2269 mutex_unlock(&event->owner->perf_event_mutex);
2270 put_task_struct(event->owner);
2271
2272 free_event(event); 2240 free_event(event);
2273 2241
2274 return 0; 2242 return 0;
@@ -2281,9 +2249,43 @@ EXPORT_SYMBOL_GPL(perf_event_release_kernel);
2281static int perf_release(struct inode *inode, struct file *file) 2249static int perf_release(struct inode *inode, struct file *file)
2282{ 2250{
2283 struct perf_event *event = file->private_data; 2251 struct perf_event *event = file->private_data;
2252 struct task_struct *owner;
2284 2253
2285 file->private_data = NULL; 2254 file->private_data = NULL;
2286 2255
2256 rcu_read_lock();
2257 owner = ACCESS_ONCE(event->owner);
2258 /*
2259 * Matches the smp_wmb() in perf_event_exit_task(). If we observe
2260 * !owner it means the list deletion is complete and we can indeed
2261 * free this event, otherwise we need to serialize on
2262 * owner->perf_event_mutex.
2263 */
2264 smp_read_barrier_depends();
2265 if (owner) {
2266 /*
2267 * Since delayed_put_task_struct() also drops the last
2268 * task reference we can safely take a new reference
2269 * while holding the rcu_read_lock().
2270 */
2271 get_task_struct(owner);
2272 }
2273 rcu_read_unlock();
2274
2275 if (owner) {
2276 mutex_lock(&owner->perf_event_mutex);
2277 /*
2278 * We have to re-check the event->owner field, if it is cleared
2279 * we raced with perf_event_exit_task(), acquiring the mutex
2280 * ensured they're done, and we can proceed with freeing the
2281 * event.
2282 */
2283 if (event->owner)
2284 list_del_init(&event->owner_entry);
2285 mutex_unlock(&owner->perf_event_mutex);
2286 put_task_struct(owner);
2287 }
2288
2287 return perf_event_release_kernel(event); 2289 return perf_event_release_kernel(event);
2288} 2290}
2289 2291
@@ -3428,7 +3430,8 @@ static u32 perf_event_tid(struct perf_event *event, struct task_struct *p)
3428} 3430}
3429 3431
3430static void perf_output_read_one(struct perf_output_handle *handle, 3432static void perf_output_read_one(struct perf_output_handle *handle,
3431 struct perf_event *event) 3433 struct perf_event *event,
3434 u64 enabled, u64 running)
3432{ 3435{
3433 u64 read_format = event->attr.read_format; 3436 u64 read_format = event->attr.read_format;
3434 u64 values[4]; 3437 u64 values[4];
@@ -3436,11 +3439,11 @@ static void perf_output_read_one(struct perf_output_handle *handle,
3436 3439
3437 values[n++] = perf_event_count(event); 3440 values[n++] = perf_event_count(event);
3438 if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) { 3441 if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) {
3439 values[n++] = event->total_time_enabled + 3442 values[n++] = enabled +
3440 atomic64_read(&event->child_total_time_enabled); 3443 atomic64_read(&event->child_total_time_enabled);
3441 } 3444 }
3442 if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) { 3445 if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) {
3443 values[n++] = event->total_time_running + 3446 values[n++] = running +
3444 atomic64_read(&event->child_total_time_running); 3447 atomic64_read(&event->child_total_time_running);
3445 } 3448 }
3446 if (read_format & PERF_FORMAT_ID) 3449 if (read_format & PERF_FORMAT_ID)
@@ -3453,7 +3456,8 @@ static void perf_output_read_one(struct perf_output_handle *handle,
3453 * XXX PERF_FORMAT_GROUP vs inherited events seems difficult. 3456 * XXX PERF_FORMAT_GROUP vs inherited events seems difficult.
3454 */ 3457 */
3455static void perf_output_read_group(struct perf_output_handle *handle, 3458static void perf_output_read_group(struct perf_output_handle *handle,
3456 struct perf_event *event) 3459 struct perf_event *event,
3460 u64 enabled, u64 running)
3457{ 3461{
3458 struct perf_event *leader = event->group_leader, *sub; 3462 struct perf_event *leader = event->group_leader, *sub;
3459 u64 read_format = event->attr.read_format; 3463 u64 read_format = event->attr.read_format;
@@ -3463,10 +3467,10 @@ static void perf_output_read_group(struct perf_output_handle *handle,
3463 values[n++] = 1 + leader->nr_siblings; 3467 values[n++] = 1 + leader->nr_siblings;
3464 3468
3465 if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) 3469 if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED)
3466 values[n++] = leader->total_time_enabled; 3470 values[n++] = enabled;
3467 3471
3468 if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) 3472 if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING)
3469 values[n++] = leader->total_time_running; 3473 values[n++] = running;
3470 3474
3471 if (leader != event) 3475 if (leader != event)
3472 leader->pmu->read(leader); 3476 leader->pmu->read(leader);
@@ -3491,13 +3495,35 @@ static void perf_output_read_group(struct perf_output_handle *handle,
3491 } 3495 }
3492} 3496}
3493 3497
3498#define PERF_FORMAT_TOTAL_TIMES (PERF_FORMAT_TOTAL_TIME_ENABLED|\
3499 PERF_FORMAT_TOTAL_TIME_RUNNING)
3500
3494static void perf_output_read(struct perf_output_handle *handle, 3501static void perf_output_read(struct perf_output_handle *handle,
3495 struct perf_event *event) 3502 struct perf_event *event)
3496{ 3503{
3504 u64 enabled = 0, running = 0, now, ctx_time;
3505 u64 read_format = event->attr.read_format;
3506
3507 /*
3508 * compute total_time_enabled, total_time_running
3509 * based on snapshot values taken when the event
3510 * was last scheduled in.
3511 *
3512 * we cannot simply call update_context_time()
3513 * because of locking issues, as we are called in
3514 * NMI context
3515 */
3516 if (read_format & PERF_FORMAT_TOTAL_TIMES) {
3517 now = perf_clock();
3518 ctx_time = event->shadow_ctx_time + now;
3519 enabled = ctx_time - event->tstamp_enabled;
3520 running = ctx_time - event->tstamp_running;
3521 }
3522
3497 if (event->attr.read_format & PERF_FORMAT_GROUP) 3523 if (event->attr.read_format & PERF_FORMAT_GROUP)
3498 perf_output_read_group(handle, event); 3524 perf_output_read_group(handle, event, enabled, running);
3499 else 3525 else
3500 perf_output_read_one(handle, event); 3526 perf_output_read_one(handle, event, enabled, running);
3501} 3527}
3502 3528
3503void perf_output_sample(struct perf_output_handle *handle, 3529void perf_output_sample(struct perf_output_handle *handle,
@@ -5683,7 +5709,7 @@ SYSCALL_DEFINE5(perf_event_open,
5683 mutex_unlock(&ctx->mutex); 5709 mutex_unlock(&ctx->mutex);
5684 5710
5685 event->owner = current; 5711 event->owner = current;
5686 get_task_struct(current); 5712
5687 mutex_lock(&current->perf_event_mutex); 5713 mutex_lock(&current->perf_event_mutex);
5688 list_add_tail(&event->owner_entry, &current->perf_event_list); 5714 list_add_tail(&event->owner_entry, &current->perf_event_list);
5689 mutex_unlock(&current->perf_event_mutex); 5715 mutex_unlock(&current->perf_event_mutex);
@@ -5751,12 +5777,6 @@ perf_event_create_kernel_counter(struct perf_event_attr *attr, int cpu,
5751 ++ctx->generation; 5777 ++ctx->generation;
5752 mutex_unlock(&ctx->mutex); 5778 mutex_unlock(&ctx->mutex);
5753 5779
5754 event->owner = current;
5755 get_task_struct(current);
5756 mutex_lock(&current->perf_event_mutex);
5757 list_add_tail(&event->owner_entry, &current->perf_event_list);
5758 mutex_unlock(&current->perf_event_mutex);
5759
5760 return event; 5780 return event;
5761 5781
5762err_free: 5782err_free:
@@ -5907,8 +5927,24 @@ again:
5907 */ 5927 */
5908void perf_event_exit_task(struct task_struct *child) 5928void perf_event_exit_task(struct task_struct *child)
5909{ 5929{
5930 struct perf_event *event, *tmp;
5910 int ctxn; 5931 int ctxn;
5911 5932
5933 mutex_lock(&child->perf_event_mutex);
5934 list_for_each_entry_safe(event, tmp, &child->perf_event_list,
5935 owner_entry) {
5936 list_del_init(&event->owner_entry);
5937
5938 /*
5939 * Ensure the list deletion is visible before we clear
5940 * the owner; this closes a race against perf_release() where
5941 * we need to serialize on the owner->perf_event_mutex.
5942 */
5943 smp_wmb();
5944 event->owner = NULL;
5945 }
5946 mutex_unlock(&child->perf_event_mutex);
5947
5912 for_each_task_context_nr(ctxn) 5948 for_each_task_context_nr(ctxn)
5913 perf_event_exit_task_context(child, ctxn); 5949 perf_event_exit_task_context(child, ctxn);
5914} 5950}
@@ -6128,6 +6164,7 @@ int perf_event_init_context(struct task_struct *child, int ctxn)
6128 struct perf_event *event; 6164 struct perf_event *event;
6129 struct task_struct *parent = current; 6165 struct task_struct *parent = current;
6130 int inherited_all = 1; 6166 int inherited_all = 1;
6167 unsigned long flags;
6131 int ret = 0; 6168 int ret = 0;
6132 6169
6133 child->perf_event_ctxp[ctxn] = NULL; 6170 child->perf_event_ctxp[ctxn] = NULL;
@@ -6168,6 +6205,15 @@ int perf_event_init_context(struct task_struct *child, int ctxn)
6168 break; 6205 break;
6169 } 6206 }
6170 6207
6208 /*
6209 * We can't hold ctx->lock when iterating the ->flexible_group list due
6210 * to allocations, but we need to prevent rotation because
6211 * rotate_ctx() will change the list from interrupt context.
6212 */
6213 raw_spin_lock_irqsave(&parent_ctx->lock, flags);
6214 parent_ctx->rotate_disable = 1;
6215 raw_spin_unlock_irqrestore(&parent_ctx->lock, flags);
6216
6171 list_for_each_entry(event, &parent_ctx->flexible_groups, group_entry) { 6217 list_for_each_entry(event, &parent_ctx->flexible_groups, group_entry) {
6172 ret = inherit_task_group(event, parent, parent_ctx, 6218 ret = inherit_task_group(event, parent, parent_ctx,
6173 child, ctxn, &inherited_all); 6219 child, ctxn, &inherited_all);
@@ -6175,6 +6221,10 @@ int perf_event_init_context(struct task_struct *child, int ctxn)
6175 break; 6221 break;
6176 } 6222 }
6177 6223
6224 raw_spin_lock_irqsave(&parent_ctx->lock, flags);
6225 parent_ctx->rotate_disable = 0;
6226 raw_spin_unlock_irqrestore(&parent_ctx->lock, flags);
6227
6178 child_ctx = child->perf_event_ctxp[ctxn]; 6228 child_ctx = child->perf_event_ctxp[ctxn];
6179 6229
6180 if (child_ctx && inherited_all) { 6230 if (child_ctx && inherited_all) {
@@ -6327,6 +6377,8 @@ perf_cpu_notify(struct notifier_block *self, unsigned long action, void *hcpu)
6327 6377
6328void __init perf_event_init(void) 6378void __init perf_event_init(void)
6329{ 6379{
6380 int ret;
6381
6330 perf_event_init_all_cpus(); 6382 perf_event_init_all_cpus();
6331 init_srcu_struct(&pmus_srcu); 6383 init_srcu_struct(&pmus_srcu);
6332 perf_pmu_register(&perf_swevent); 6384 perf_pmu_register(&perf_swevent);
@@ -6334,4 +6386,7 @@ void __init perf_event_init(void)
6334 perf_pmu_register(&perf_task_clock); 6386 perf_pmu_register(&perf_task_clock);
6335 perf_tp_register(); 6387 perf_tp_register();
6336 perf_cpu_notifier(perf_cpu_notify); 6388 perf_cpu_notifier(perf_cpu_notify);
6389
6390 ret = init_hw_breakpoint();
6391 WARN(ret, "hw_breakpoint initialization failed with: %d", ret);
6337} 6392}
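The perf_release()/perf_event_exit_task() hunks above replace the unconditional owner task reference with a publish/observe handshake: the exiting task unlinks the event and only then clears event->owner behind an smp_wmb(), while the release path samples the owner under RCU, pins it with get_task_struct(), and re-checks event->owner under owner->perf_event_mutex before touching the list. A stripped-down sketch of the two sides of that pairing, using a hypothetical struct my_event (schematic only, not the perf code):

#include <linux/sched.h>
#include <linux/list.h>
#include <linux/rcupdate.h>
#include <linux/compiler.h>

struct my_event {
        struct task_struct *owner;
        struct list_head   owner_entry;
};

/* Exit side: unlink first, then publish "no owner". */
static void my_exit_side(struct task_struct *tsk, struct my_event *ev)
{
        mutex_lock(&tsk->perf_event_mutex);
        list_del_init(&ev->owner_entry);
        smp_wmb();                      /* deletion visible before owner = NULL */
        ev->owner = NULL;
        mutex_unlock(&tsk->perf_event_mutex);
}

/* Release side: observe the owner under RCU, pin it, then re-check. */
static void my_release_side(struct my_event *ev)
{
        struct task_struct *owner;

        rcu_read_lock();
        owner = ACCESS_ONCE(ev->owner);
        smp_read_barrier_depends();
        if (owner)
                get_task_struct(owner); /* safe while rcu_read_lock() is held */
        rcu_read_unlock();

        if (owner) {
                mutex_lock(&owner->perf_event_mutex);
                if (ev->owner)          /* still owned: unlink it ourselves;
                                           if NULL, the exit side already did */
                        list_del_init(&ev->owner_entry);
                mutex_unlock(&owner->perf_event_mutex);
                put_task_struct(owner);
        }
}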
diff --git a/kernel/pm_qos_params.c b/kernel/pm_qos_params.c
index c7a8f453919e..aeaa7f846821 100644
--- a/kernel/pm_qos_params.c
+++ b/kernel/pm_qos_params.c
@@ -121,10 +121,10 @@ static inline int pm_qos_get_value(struct pm_qos_object *o)
121 121
122 switch (o->type) { 122 switch (o->type) {
123 case PM_QOS_MIN: 123 case PM_QOS_MIN:
124 return plist_last(&o->requests)->prio; 124 return plist_first(&o->requests)->prio;
125 125
126 case PM_QOS_MAX: 126 case PM_QOS_MAX:
127 return plist_first(&o->requests)->prio; 127 return plist_last(&o->requests)->prio;
128 128
129 default: 129 default:
130 /* runtime check for not using enum */ 130 /* runtime check for not using enum */
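The pm_qos fix hinges on plist ordering: a plist keeps its nodes sorted by ascending prio, so plist_first() yields the smallest value and plist_last() the largest. PM_QOS_MIN, where the effective constraint is the lowest request, therefore wants plist_first(), and PM_QOS_MAX wants plist_last(), which is what the swap above restores. A tiny hedged sketch of reading both extremes from a (non-empty) request plist:

#include <linux/plist.h>

/* Hedged sketch: requests is assumed to be a populated plist_head. */
static int my_extreme_value(struct plist_head *requests, int want_min)
{
        if (want_min)
                return plist_first(requests)->prio;  /* lowest prio value  */
        return plist_last(requests)->prio;           /* highest prio value */
}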
diff --git a/kernel/posix-cpu-timers.c b/kernel/posix-cpu-timers.c
index 6842eeba5879..05bb7173850e 100644
--- a/kernel/posix-cpu-timers.c
+++ b/kernel/posix-cpu-timers.c
@@ -37,13 +37,13 @@ static int check_clock(const clockid_t which_clock)
37 if (pid == 0) 37 if (pid == 0)
38 return 0; 38 return 0;
39 39
40 read_lock(&tasklist_lock); 40 rcu_read_lock();
41 p = find_task_by_vpid(pid); 41 p = find_task_by_vpid(pid);
42 if (!p || !(CPUCLOCK_PERTHREAD(which_clock) ? 42 if (!p || !(CPUCLOCK_PERTHREAD(which_clock) ?
43 same_thread_group(p, current) : thread_group_leader(p))) { 43 same_thread_group(p, current) : has_group_leader_pid(p))) {
44 error = -EINVAL; 44 error = -EINVAL;
45 } 45 }
46 read_unlock(&tasklist_lock); 46 rcu_read_unlock();
47 47
48 return error; 48 return error;
49} 49}
@@ -390,7 +390,7 @@ int posix_cpu_timer_create(struct k_itimer *new_timer)
390 390
391 INIT_LIST_HEAD(&new_timer->it.cpu.entry); 391 INIT_LIST_HEAD(&new_timer->it.cpu.entry);
392 392
393 read_lock(&tasklist_lock); 393 rcu_read_lock();
394 if (CPUCLOCK_PERTHREAD(new_timer->it_clock)) { 394 if (CPUCLOCK_PERTHREAD(new_timer->it_clock)) {
395 if (pid == 0) { 395 if (pid == 0) {
396 p = current; 396 p = current;
@@ -404,7 +404,7 @@ int posix_cpu_timer_create(struct k_itimer *new_timer)
404 p = current->group_leader; 404 p = current->group_leader;
405 } else { 405 } else {
406 p = find_task_by_vpid(pid); 406 p = find_task_by_vpid(pid);
407 if (p && !thread_group_leader(p)) 407 if (p && !has_group_leader_pid(p))
408 p = NULL; 408 p = NULL;
409 } 409 }
410 } 410 }
@@ -414,7 +414,7 @@ int posix_cpu_timer_create(struct k_itimer *new_timer)
414 } else { 414 } else {
415 ret = -EINVAL; 415 ret = -EINVAL;
416 } 416 }
417 read_unlock(&tasklist_lock); 417 rcu_read_unlock();
418 418
419 return ret; 419 return ret;
420} 420}
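Both hunks above drop the tasklist_lock read lock in favour of RCU: find_task_by_vpid() only requires rcu_read_lock(), and the result is either used entirely inside the read-side section (as in these hunks) or pinned with get_task_struct() before the section ends. A hedged sketch of the pinned variant of that lookup idiom (hypothetical caller, standard APIs):

#include <linux/sched.h>
#include <linux/rcupdate.h>

/* Look up a task by pid and pin it; caller must put_task_struct() later. */
static struct task_struct *my_get_task(pid_t pid)
{
        struct task_struct *p;

        rcu_read_lock();
        p = find_task_by_vpid(pid);
        if (p)
                get_task_struct(p);     /* keep it alive past rcu_read_unlock() */
        rcu_read_unlock();

        return p;
}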
diff --git a/kernel/power/Kconfig b/kernel/power/Kconfig
index 29bff6117abc..a5aff3ebad38 100644
--- a/kernel/power/Kconfig
+++ b/kernel/power/Kconfig
@@ -246,9 +246,13 @@ config PM_OPS
246 depends on PM_SLEEP || PM_RUNTIME 246 depends on PM_SLEEP || PM_RUNTIME
247 default y 247 default y
248 248
249config ARCH_HAS_OPP
250 bool
251
249config PM_OPP 252config PM_OPP
250 bool "Operating Performance Point (OPP) Layer library" 253 bool "Operating Performance Point (OPP) Layer library"
251 depends on PM 254 depends on PM
255 depends on ARCH_HAS_OPP
252 ---help--- 256 ---help---
253 SOCs have a standard set of tuples consisting of frequency and 257 SOCs have a standard set of tuples consisting of frequency and
254 voltage pairs that the device will support per voltage domain. This 258 voltage pairs that the device will support per voltage domain. This
diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c
index 657272e91d0a..048d0b514831 100644
--- a/kernel/power/hibernate.c
+++ b/kernel/power/hibernate.c
@@ -327,7 +327,6 @@ static int create_image(int platform_mode)
327int hibernation_snapshot(int platform_mode) 327int hibernation_snapshot(int platform_mode)
328{ 328{
329 int error; 329 int error;
330 gfp_t saved_mask;
331 330
332 error = platform_begin(platform_mode); 331 error = platform_begin(platform_mode);
333 if (error) 332 if (error)
@@ -339,7 +338,7 @@ int hibernation_snapshot(int platform_mode)
339 goto Close; 338 goto Close;
340 339
341 suspend_console(); 340 suspend_console();
342 saved_mask = clear_gfp_allowed_mask(GFP_IOFS); 341 pm_restrict_gfp_mask();
343 error = dpm_suspend_start(PMSG_FREEZE); 342 error = dpm_suspend_start(PMSG_FREEZE);
344 if (error) 343 if (error)
345 goto Recover_platform; 344 goto Recover_platform;
@@ -348,7 +347,10 @@ int hibernation_snapshot(int platform_mode)
348 goto Recover_platform; 347 goto Recover_platform;
349 348
350 error = create_image(platform_mode); 349 error = create_image(platform_mode);
351 /* Control returns here after successful restore */ 350 /*
351 * Control returns here (1) after the image has been created or the
352 * image creation has failed and (2) after a successful restore.
353 */
352 354
353 Resume_devices: 355 Resume_devices:
354 /* We may need to release the preallocated image pages here. */ 356 /* We may need to release the preallocated image pages here. */
@@ -357,7 +359,10 @@ int hibernation_snapshot(int platform_mode)
357 359
358 dpm_resume_end(in_suspend ? 360 dpm_resume_end(in_suspend ?
359 (error ? PMSG_RECOVER : PMSG_THAW) : PMSG_RESTORE); 361 (error ? PMSG_RECOVER : PMSG_THAW) : PMSG_RESTORE);
360 set_gfp_allowed_mask(saved_mask); 362
363 if (error || !in_suspend)
364 pm_restore_gfp_mask();
365
361 resume_console(); 366 resume_console();
362 Close: 367 Close:
363 platform_end(platform_mode); 368 platform_end(platform_mode);
@@ -452,17 +457,16 @@ static int resume_target_kernel(bool platform_mode)
452int hibernation_restore(int platform_mode) 457int hibernation_restore(int platform_mode)
453{ 458{
454 int error; 459 int error;
455 gfp_t saved_mask;
456 460
457 pm_prepare_console(); 461 pm_prepare_console();
458 suspend_console(); 462 suspend_console();
459 saved_mask = clear_gfp_allowed_mask(GFP_IOFS); 463 pm_restrict_gfp_mask();
460 error = dpm_suspend_start(PMSG_QUIESCE); 464 error = dpm_suspend_start(PMSG_QUIESCE);
461 if (!error) { 465 if (!error) {
462 error = resume_target_kernel(platform_mode); 466 error = resume_target_kernel(platform_mode);
463 dpm_resume_end(PMSG_RECOVER); 467 dpm_resume_end(PMSG_RECOVER);
464 } 468 }
465 set_gfp_allowed_mask(saved_mask); 469 pm_restore_gfp_mask();
466 resume_console(); 470 resume_console();
467 pm_restore_console(); 471 pm_restore_console();
468 return error; 472 return error;
@@ -476,7 +480,6 @@ int hibernation_restore(int platform_mode)
476int hibernation_platform_enter(void) 480int hibernation_platform_enter(void)
477{ 481{
478 int error; 482 int error;
479 gfp_t saved_mask;
480 483
481 if (!hibernation_ops) 484 if (!hibernation_ops)
482 return -ENOSYS; 485 return -ENOSYS;
@@ -492,7 +495,6 @@ int hibernation_platform_enter(void)
492 495
493 entering_platform_hibernation = true; 496 entering_platform_hibernation = true;
494 suspend_console(); 497 suspend_console();
495 saved_mask = clear_gfp_allowed_mask(GFP_IOFS);
496 error = dpm_suspend_start(PMSG_HIBERNATE); 498 error = dpm_suspend_start(PMSG_HIBERNATE);
497 if (error) { 499 if (error) {
498 if (hibernation_ops->recover) 500 if (hibernation_ops->recover)
@@ -536,7 +538,6 @@ int hibernation_platform_enter(void)
536 Resume_devices: 538 Resume_devices:
537 entering_platform_hibernation = false; 539 entering_platform_hibernation = false;
538 dpm_resume_end(PMSG_RESTORE); 540 dpm_resume_end(PMSG_RESTORE);
539 set_gfp_allowed_mask(saved_mask);
540 resume_console(); 541 resume_console();
541 542
542 Close: 543 Close:
@@ -646,6 +647,7 @@ int hibernate(void)
646 swsusp_free(); 647 swsusp_free();
647 if (!error) 648 if (!error)
648 power_down(); 649 power_down();
650 pm_restore_gfp_mask();
649 } else { 651 } else {
650 pr_debug("PM: Image restored successfully.\n"); 652 pr_debug("PM: Image restored successfully.\n");
651 } 653 }
diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c
index ac7eb109f196..0dac75ea4456 100644
--- a/kernel/power/snapshot.c
+++ b/kernel/power/snapshot.c
@@ -984,8 +984,8 @@ static void copy_data_page(unsigned long dst_pfn, unsigned long src_pfn)
984 src = kmap_atomic(s_page, KM_USER0); 984 src = kmap_atomic(s_page, KM_USER0);
985 dst = kmap_atomic(d_page, KM_USER1); 985 dst = kmap_atomic(d_page, KM_USER1);
986 do_copy_page(dst, src); 986 do_copy_page(dst, src);
987 kunmap_atomic(src, KM_USER0);
988 kunmap_atomic(dst, KM_USER1); 987 kunmap_atomic(dst, KM_USER1);
988 kunmap_atomic(src, KM_USER0);
989 } else { 989 } else {
990 if (PageHighMem(d_page)) { 990 if (PageHighMem(d_page)) {
991 /* Page pointed to by src may contain some kernel 991 /* Page pointed to by src may contain some kernel
@@ -993,7 +993,7 @@ static void copy_data_page(unsigned long dst_pfn, unsigned long src_pfn)
993 */ 993 */
994 safe_copy_page(buffer, s_page); 994 safe_copy_page(buffer, s_page);
995 dst = kmap_atomic(d_page, KM_USER0); 995 dst = kmap_atomic(d_page, KM_USER0);
996 memcpy(dst, buffer, PAGE_SIZE); 996 copy_page(dst, buffer);
997 kunmap_atomic(dst, KM_USER0); 997 kunmap_atomic(dst, KM_USER0);
998 } else { 998 } else {
999 safe_copy_page(page_address(d_page), s_page); 999 safe_copy_page(page_address(d_page), s_page);
@@ -1687,7 +1687,7 @@ int snapshot_read_next(struct snapshot_handle *handle)
1687 memory_bm_position_reset(&orig_bm); 1687 memory_bm_position_reset(&orig_bm);
1688 memory_bm_position_reset(&copy_bm); 1688 memory_bm_position_reset(&copy_bm);
1689 } else if (handle->cur <= nr_meta_pages) { 1689 } else if (handle->cur <= nr_meta_pages) {
1690 memset(buffer, 0, PAGE_SIZE); 1690 clear_page(buffer);
1691 pack_pfns(buffer, &orig_bm); 1691 pack_pfns(buffer, &orig_bm);
1692 } else { 1692 } else {
1693 struct page *page; 1693 struct page *page;
@@ -1701,7 +1701,7 @@ int snapshot_read_next(struct snapshot_handle *handle)
1701 void *kaddr; 1701 void *kaddr;
1702 1702
1703 kaddr = kmap_atomic(page, KM_USER0); 1703 kaddr = kmap_atomic(page, KM_USER0);
1704 memcpy(buffer, kaddr, PAGE_SIZE); 1704 copy_page(buffer, kaddr);
1705 kunmap_atomic(kaddr, KM_USER0); 1705 kunmap_atomic(kaddr, KM_USER0);
1706 handle->buffer = buffer; 1706 handle->buffer = buffer;
1707 } else { 1707 } else {
@@ -1984,7 +1984,7 @@ static void copy_last_highmem_page(void)
1984 void *dst; 1984 void *dst;
1985 1985
1986 dst = kmap_atomic(last_highmem_page, KM_USER0); 1986 dst = kmap_atomic(last_highmem_page, KM_USER0);
1987 memcpy(dst, buffer, PAGE_SIZE); 1987 copy_page(dst, buffer);
1988 kunmap_atomic(dst, KM_USER0); 1988 kunmap_atomic(dst, KM_USER0);
1989 last_highmem_page = NULL; 1989 last_highmem_page = NULL;
1990 } 1990 }
@@ -2270,11 +2270,11 @@ swap_two_pages_data(struct page *p1, struct page *p2, void *buf)
2270 2270
2271 kaddr1 = kmap_atomic(p1, KM_USER0); 2271 kaddr1 = kmap_atomic(p1, KM_USER0);
2272 kaddr2 = kmap_atomic(p2, KM_USER1); 2272 kaddr2 = kmap_atomic(p2, KM_USER1);
2273 memcpy(buf, kaddr1, PAGE_SIZE); 2273 copy_page(buf, kaddr1);
2274 memcpy(kaddr1, kaddr2, PAGE_SIZE); 2274 copy_page(kaddr1, kaddr2);
2275 memcpy(kaddr2, buf, PAGE_SIZE); 2275 copy_page(kaddr2, buf);
2276 kunmap_atomic(kaddr1, KM_USER0);
2277 kunmap_atomic(kaddr2, KM_USER1); 2276 kunmap_atomic(kaddr2, KM_USER1);
2277 kunmap_atomic(kaddr1, KM_USER0);
2278} 2278}
2279 2279
2280/** 2280/**
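Two patterns recur in these snapshot.c hunks: page-sized memset()/memcpy() calls become clear_page()/copy_page(), the arch-optimized whole-page helpers, and paired kunmap_atomic() calls are reordered so the most recently mapped slot is released first, presumably to keep the atomic kmaps strictly last-in, first-out. A hedged sketch of the combined idiom, assuming the old KM_USERn slot API used throughout this file:

#include <linux/highmem.h>
#include <linux/mm.h>

/* Copy one highmem page to another: whole-page helper, LIFO unmapping. */
static void my_copy_highmem_page(struct page *dst_page, struct page *src_page)
{
        void *src, *dst;

        src = kmap_atomic(src_page, KM_USER0);
        dst = kmap_atomic(dst_page, KM_USER1);
        copy_page(dst, src);
        kunmap_atomic(dst, KM_USER1);   /* mapped last, unmapped first */
        kunmap_atomic(src, KM_USER0);
}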
diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c
index 7335952ee473..ecf770509d0d 100644
--- a/kernel/power/suspend.c
+++ b/kernel/power/suspend.c
@@ -197,7 +197,6 @@ static int suspend_enter(suspend_state_t state)
197int suspend_devices_and_enter(suspend_state_t state) 197int suspend_devices_and_enter(suspend_state_t state)
198{ 198{
199 int error; 199 int error;
200 gfp_t saved_mask;
201 200
202 if (!suspend_ops) 201 if (!suspend_ops)
203 return -ENOSYS; 202 return -ENOSYS;
@@ -208,7 +207,7 @@ int suspend_devices_and_enter(suspend_state_t state)
208 goto Close; 207 goto Close;
209 } 208 }
210 suspend_console(); 209 suspend_console();
211 saved_mask = clear_gfp_allowed_mask(GFP_IOFS); 210 pm_restrict_gfp_mask();
212 suspend_test_start(); 211 suspend_test_start();
213 error = dpm_suspend_start(PMSG_SUSPEND); 212 error = dpm_suspend_start(PMSG_SUSPEND);
214 if (error) { 213 if (error) {
@@ -225,7 +224,7 @@ int suspend_devices_and_enter(suspend_state_t state)
225 suspend_test_start(); 224 suspend_test_start();
226 dpm_resume_end(PMSG_RESUME); 225 dpm_resume_end(PMSG_RESUME);
227 suspend_test_finish("resume devices"); 226 suspend_test_finish("resume devices");
228 set_gfp_allowed_mask(saved_mask); 227 pm_restore_gfp_mask();
229 resume_console(); 228 resume_console();
230 Close: 229 Close:
231 if (suspend_ops->end) 230 if (suspend_ops->end)
diff --git a/kernel/power/swap.c b/kernel/power/swap.c
index 916eaa790399..baf667bb2794 100644
--- a/kernel/power/swap.c
+++ b/kernel/power/swap.c
@@ -6,6 +6,7 @@
6 * 6 *
7 * Copyright (C) 1998,2001-2005 Pavel Machek <pavel@ucw.cz> 7 * Copyright (C) 1998,2001-2005 Pavel Machek <pavel@ucw.cz>
8 * Copyright (C) 2006 Rafael J. Wysocki <rjw@sisk.pl> 8 * Copyright (C) 2006 Rafael J. Wysocki <rjw@sisk.pl>
9 * Copyright (C) 2010 Bojan Smojver <bojan@rexursive.com>
9 * 10 *
10 * This file is released under the GPLv2. 11 * This file is released under the GPLv2.
11 * 12 *
@@ -251,7 +252,7 @@ static int write_page(void *buf, sector_t offset, struct bio **bio_chain)
251 if (bio_chain) { 252 if (bio_chain) {
252 src = (void *)__get_free_page(__GFP_WAIT | __GFP_HIGH); 253 src = (void *)__get_free_page(__GFP_WAIT | __GFP_HIGH);
253 if (src) { 254 if (src) {
254 memcpy(src, buf, PAGE_SIZE); 255 copy_page(src, buf);
255 } else { 256 } else {
256 WARN_ON_ONCE(1); 257 WARN_ON_ONCE(1);
257 bio_chain = NULL; /* Go synchronous */ 258 bio_chain = NULL; /* Go synchronous */
@@ -325,7 +326,7 @@ static int swap_write_page(struct swap_map_handle *handle, void *buf,
325 error = write_page(handle->cur, handle->cur_swap, NULL); 326 error = write_page(handle->cur, handle->cur_swap, NULL);
326 if (error) 327 if (error)
327 goto out; 328 goto out;
328 memset(handle->cur, 0, PAGE_SIZE); 329 clear_page(handle->cur);
329 handle->cur_swap = offset; 330 handle->cur_swap = offset;
330 handle->k = 0; 331 handle->k = 0;
331 } 332 }
@@ -753,30 +754,43 @@ static int load_image_lzo(struct swap_map_handle *handle,
753{ 754{
754 unsigned int m; 755 unsigned int m;
755 int error = 0; 756 int error = 0;
757 struct bio *bio;
756 struct timeval start; 758 struct timeval start;
757 struct timeval stop; 759 struct timeval stop;
758 unsigned nr_pages; 760 unsigned nr_pages;
759 size_t off, unc_len, cmp_len; 761 size_t i, off, unc_len, cmp_len;
760 unsigned char *unc, *cmp, *page; 762 unsigned char *unc, *cmp, *page[LZO_CMP_PAGES];
761 763
762 page = (void *)__get_free_page(__GFP_WAIT | __GFP_HIGH); 764 for (i = 0; i < LZO_CMP_PAGES; i++) {
763 if (!page) { 765 page[i] = (void *)__get_free_page(__GFP_WAIT | __GFP_HIGH);
764 printk(KERN_ERR "PM: Failed to allocate LZO page\n"); 766 if (!page[i]) {
765 return -ENOMEM; 767 printk(KERN_ERR "PM: Failed to allocate LZO page\n");
768
769 while (i)
770 free_page((unsigned long)page[--i]);
771
772 return -ENOMEM;
773 }
766 } 774 }
767 775
768 unc = vmalloc(LZO_UNC_SIZE); 776 unc = vmalloc(LZO_UNC_SIZE);
769 if (!unc) { 777 if (!unc) {
770 printk(KERN_ERR "PM: Failed to allocate LZO uncompressed\n"); 778 printk(KERN_ERR "PM: Failed to allocate LZO uncompressed\n");
771 free_page((unsigned long)page); 779
780 for (i = 0; i < LZO_CMP_PAGES; i++)
781 free_page((unsigned long)page[i]);
782
772 return -ENOMEM; 783 return -ENOMEM;
773 } 784 }
774 785
775 cmp = vmalloc(LZO_CMP_SIZE); 786 cmp = vmalloc(LZO_CMP_SIZE);
776 if (!cmp) { 787 if (!cmp) {
777 printk(KERN_ERR "PM: Failed to allocate LZO compressed\n"); 788 printk(KERN_ERR "PM: Failed to allocate LZO compressed\n");
789
778 vfree(unc); 790 vfree(unc);
779 free_page((unsigned long)page); 791 for (i = 0; i < LZO_CMP_PAGES; i++)
792 free_page((unsigned long)page[i]);
793
780 return -ENOMEM; 794 return -ENOMEM;
781 } 795 }
782 796
@@ -787,6 +801,7 @@ static int load_image_lzo(struct swap_map_handle *handle,
787 if (!m) 801 if (!m)
788 m = 1; 802 m = 1;
789 nr_pages = 0; 803 nr_pages = 0;
804 bio = NULL;
790 do_gettimeofday(&start); 805 do_gettimeofday(&start);
791 806
792 error = snapshot_write_next(snapshot); 807 error = snapshot_write_next(snapshot);
@@ -794,11 +809,11 @@ static int load_image_lzo(struct swap_map_handle *handle,
794 goto out_finish; 809 goto out_finish;
795 810
796 for (;;) { 811 for (;;) {
797 error = swap_read_page(handle, page, NULL); /* sync */ 812 error = swap_read_page(handle, page[0], NULL); /* sync */
798 if (error) 813 if (error)
799 break; 814 break;
800 815
801 cmp_len = *(size_t *)page; 816 cmp_len = *(size_t *)page[0];
802 if (unlikely(!cmp_len || 817 if (unlikely(!cmp_len ||
803 cmp_len > lzo1x_worst_compress(LZO_UNC_SIZE))) { 818 cmp_len > lzo1x_worst_compress(LZO_UNC_SIZE))) {
804 printk(KERN_ERR "PM: Invalid LZO compressed length\n"); 819 printk(KERN_ERR "PM: Invalid LZO compressed length\n");
@@ -806,13 +821,20 @@ static int load_image_lzo(struct swap_map_handle *handle,
806 break; 821 break;
807 } 822 }
808 823
809 memcpy(cmp, page, PAGE_SIZE); 824 for (off = PAGE_SIZE, i = 1;
810 for (off = PAGE_SIZE; off < LZO_HEADER + cmp_len; off += PAGE_SIZE) { 825 off < LZO_HEADER + cmp_len; off += PAGE_SIZE, i++) {
811 error = swap_read_page(handle, page, NULL); /* sync */ 826 error = swap_read_page(handle, page[i], &bio);
812 if (error) 827 if (error)
813 goto out_finish; 828 goto out_finish;
829 }
814 830
815 memcpy(cmp + off, page, PAGE_SIZE); 831 error = hib_wait_on_bio_chain(&bio); /* need all data now */
832 if (error)
833 goto out_finish;
834
835 for (off = 0, i = 0;
836 off < LZO_HEADER + cmp_len; off += PAGE_SIZE, i++) {
837 memcpy(cmp + off, page[i], PAGE_SIZE);
816 } 838 }
817 839
818 unc_len = LZO_UNC_SIZE; 840 unc_len = LZO_UNC_SIZE;
@@ -857,7 +879,8 @@ out_finish:
857 879
858 vfree(cmp); 880 vfree(cmp);
859 vfree(unc); 881 vfree(unc);
860 free_page((unsigned long)page); 882 for (i = 0; i < LZO_CMP_PAGES; i++)
883 free_page((unsigned long)page[i]);
861 884
862 return error; 885 return error;
863} 886}
@@ -910,7 +933,7 @@ int swsusp_check(void)
910 hib_resume_bdev = open_by_devnum(swsusp_resume_device, FMODE_READ); 933 hib_resume_bdev = open_by_devnum(swsusp_resume_device, FMODE_READ);
911 if (!IS_ERR(hib_resume_bdev)) { 934 if (!IS_ERR(hib_resume_bdev)) {
912 set_blocksize(hib_resume_bdev, PAGE_SIZE); 935 set_blocksize(hib_resume_bdev, PAGE_SIZE);
913 memset(swsusp_header, 0, PAGE_SIZE); 936 clear_page(swsusp_header);
914 error = hib_bio_read_page(swsusp_resume_block, 937 error = hib_bio_read_page(swsusp_resume_block,
915 swsusp_header, NULL); 938 swsusp_header, NULL);
916 if (error) 939 if (error)
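The load_image_lzo() rework above spreads one compressed block over LZO_CMP_PAGES pages: the header page is read synchronously, the rest are queued on a bio chain, and a single hib_wait_on_bio_chain() gathers them before decompression. A schematic of that read-then-gather loop, reusing only names visible in the hunk (sketch with error handling trimmed, not the full function):

/* Schematic only: gather one LZO block spread over page[] into cmp[]. */
static int my_read_lzo_block(struct swap_map_handle *handle,
                             unsigned char **page, unsigned char *cmp,
                             size_t cmp_len)
{
        struct bio *bio = NULL;
        size_t off;
        int i, error;

        for (off = PAGE_SIZE, i = 1;
             off < LZO_HEADER + cmp_len; off += PAGE_SIZE, i++) {
                error = swap_read_page(handle, page[i], &bio);  /* async */
                if (error)
                        return error;
        }

        error = hib_wait_on_bio_chain(&bio);    /* need every page before LZO */
        if (error)
                return error;

        for (off = 0, i = 0;
             off < LZO_HEADER + cmp_len; off += PAGE_SIZE, i++)
                memcpy(cmp + off, page[i], PAGE_SIZE);

        return 0;
}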
diff --git a/kernel/power/user.c b/kernel/power/user.c
index e819e17877ca..1b2ea31e6bd8 100644
--- a/kernel/power/user.c
+++ b/kernel/power/user.c
@@ -263,6 +263,7 @@ static long snapshot_ioctl(struct file *filp, unsigned int cmd,
263 case SNAPSHOT_UNFREEZE: 263 case SNAPSHOT_UNFREEZE:
264 if (!data->frozen || data->ready) 264 if (!data->frozen || data->ready)
265 break; 265 break;
266 pm_restore_gfp_mask();
266 thaw_processes(); 267 thaw_processes();
267 usermodehelper_enable(); 268 usermodehelper_enable();
268 data->frozen = 0; 269 data->frozen = 0;
@@ -275,6 +276,7 @@ static long snapshot_ioctl(struct file *filp, unsigned int cmd,
275 error = -EPERM; 276 error = -EPERM;
276 break; 277 break;
277 } 278 }
279 pm_restore_gfp_mask();
278 error = hibernation_snapshot(data->platform_support); 280 error = hibernation_snapshot(data->platform_support);
279 if (!error) 281 if (!error)
280 error = put_user(in_suspend, (int __user *)arg); 282 error = put_user(in_suspend, (int __user *)arg);
diff --git a/kernel/printk.c b/kernel/printk.c
index 2531017795f6..a23315dc4498 100644
--- a/kernel/printk.c
+++ b/kernel/printk.c
@@ -210,7 +210,7 @@ __setup("log_buf_len=", log_buf_len_setup);
210 210
211#ifdef CONFIG_BOOT_PRINTK_DELAY 211#ifdef CONFIG_BOOT_PRINTK_DELAY
212 212
213static unsigned int boot_delay; /* msecs delay after each printk during bootup */ 213static int boot_delay; /* msecs delay after each printk during bootup */
214static unsigned long long loops_per_msec; /* based on boot_delay */ 214static unsigned long long loops_per_msec; /* based on boot_delay */
215 215
216static int __init boot_delay_setup(char *str) 216static int __init boot_delay_setup(char *str)
@@ -261,6 +261,12 @@ static inline void boot_delay_msec(void)
261} 261}
262#endif 262#endif
263 263
264#ifdef CONFIG_SECURITY_DMESG_RESTRICT
265int dmesg_restrict = 1;
266#else
267int dmesg_restrict;
268#endif
269
264int do_syslog(int type, char __user *buf, int len, bool from_file) 270int do_syslog(int type, char __user *buf, int len, bool from_file)
265{ 271{
266 unsigned i, j, limit, count; 272 unsigned i, j, limit, count;
@@ -268,7 +274,20 @@ int do_syslog(int type, char __user *buf, int len, bool from_file)
268 char c; 274 char c;
269 int error = 0; 275 int error = 0;
270 276
271 error = security_syslog(type, from_file); 277 /*
278 * If this is from /proc/kmsg we only do the capabilities checks
279 * at open time.
280 */
281 if (type == SYSLOG_ACTION_OPEN || !from_file) {
282 if (dmesg_restrict && !capable(CAP_SYS_ADMIN))
283 return -EPERM;
284 if ((type != SYSLOG_ACTION_READ_ALL &&
285 type != SYSLOG_ACTION_SIZE_BUFFER) &&
286 !capable(CAP_SYS_ADMIN))
287 return -EPERM;
288 }
289
290 error = security_syslog(type);
272 if (error) 291 if (error)
273 return error; 292 return error;
274 293
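The do_syslog() hunk front-loads the permission checks: calls arriving through an already-opened /proc/kmsg skip them (they were done at open time), dmesg_restrict gates everything else behind CAP_SYS_ADMIN, and any action beyond reading the buffer or querying its size needs CAP_SYS_ADMIN regardless. A hedged distillation of that gate with a hypothetical helper name:

/* Hedged sketch of the access gate added above, not the kernel function. */
static int my_syslog_permission(int type, bool from_file)
{
        if (type != SYSLOG_ACTION_OPEN && from_file)
                return 0;       /* /proc/kmsg readers were checked at open time */

        if (dmesg_restrict && !capable(CAP_SYS_ADMIN))
                return -EPERM;

        if (type != SYSLOG_ACTION_READ_ALL &&
            type != SYSLOG_ACTION_SIZE_BUFFER &&
            !capable(CAP_SYS_ADMIN))
                return -EPERM;

        return 0;
}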
@@ -647,6 +666,7 @@ static inline int can_use_console(unsigned int cpu)
647 * released but interrupts still disabled. 666 * released but interrupts still disabled.
648 */ 667 */
649static int acquire_console_semaphore_for_printk(unsigned int cpu) 668static int acquire_console_semaphore_for_printk(unsigned int cpu)
669 __releases(&logbuf_lock)
650{ 670{
651 int retval = 0; 671 int retval = 0;
652 672
@@ -1062,13 +1082,15 @@ void printk_tick(void)
1062 1082
1063int printk_needs_cpu(int cpu) 1083int printk_needs_cpu(int cpu)
1064{ 1084{
1085 if (unlikely(cpu_is_offline(cpu)))
1086 printk_tick();
1065 return per_cpu(printk_pending, cpu); 1087 return per_cpu(printk_pending, cpu);
1066} 1088}
1067 1089
1068void wake_up_klogd(void) 1090void wake_up_klogd(void)
1069{ 1091{
1070 if (waitqueue_active(&log_wait)) 1092 if (waitqueue_active(&log_wait))
1071 __raw_get_cpu_var(printk_pending) = 1; 1093 this_cpu_write(printk_pending, 1);
1072} 1094}
1073 1095
1074/** 1096/**
@@ -1511,7 +1533,7 @@ int kmsg_dump_unregister(struct kmsg_dumper *dumper)
1511} 1533}
1512EXPORT_SYMBOL_GPL(kmsg_dump_unregister); 1534EXPORT_SYMBOL_GPL(kmsg_dump_unregister);
1513 1535
1514static const char const *kmsg_reasons[] = { 1536static const char * const kmsg_reasons[] = {
1515 [KMSG_DUMP_OOPS] = "oops", 1537 [KMSG_DUMP_OOPS] = "oops",
1516 [KMSG_DUMP_PANIC] = "panic", 1538 [KMSG_DUMP_PANIC] = "panic",
1517 [KMSG_DUMP_KEXEC] = "kexec", 1539 [KMSG_DUMP_KEXEC] = "kexec",
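The kmsg_reasons change is a type fix: "const char const *" merely repeats the same qualifier on the pointee and leaves the pointer slots themselves writable, while the intended "const char * const" makes both the strings and the array of pointers read-only. A two-line illustration with hypothetical names:

static const char *my_writable_slots[]     = { "oops", "panic" }; /* pointers may be reassigned */
static const char * const my_fixed_table[] = { "oops", "panic" }; /* pointers are const too     */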
diff --git a/kernel/ptrace.c b/kernel/ptrace.c
index f34d798ef4a2..99bbaa3e5b0d 100644
--- a/kernel/ptrace.c
+++ b/kernel/ptrace.c
@@ -181,7 +181,7 @@ int ptrace_attach(struct task_struct *task)
181 * under ptrace. 181 * under ptrace.
182 */ 182 */
183 retval = -ERESTARTNOINTR; 183 retval = -ERESTARTNOINTR;
184 if (mutex_lock_interruptible(&task->cred_guard_mutex)) 184 if (mutex_lock_interruptible(&task->signal->cred_guard_mutex))
185 goto out; 185 goto out;
186 186
187 task_lock(task); 187 task_lock(task);
@@ -208,7 +208,7 @@ int ptrace_attach(struct task_struct *task)
208unlock_tasklist: 208unlock_tasklist:
209 write_unlock_irq(&tasklist_lock); 209 write_unlock_irq(&tasklist_lock);
210unlock_creds: 210unlock_creds:
211 mutex_unlock(&task->cred_guard_mutex); 211 mutex_unlock(&task->signal->cred_guard_mutex);
212out: 212out:
213 return retval; 213 return retval;
214} 214}
@@ -329,6 +329,8 @@ int ptrace_detach(struct task_struct *child, unsigned int data)
329 * and reacquire the lock. 329 * and reacquire the lock.
330 */ 330 */
331void exit_ptrace(struct task_struct *tracer) 331void exit_ptrace(struct task_struct *tracer)
332 __releases(&tasklist_lock)
333 __acquires(&tasklist_lock)
332{ 334{
333 struct task_struct *p, *n; 335 struct task_struct *p, *n;
334 LIST_HEAD(ptrace_dead); 336 LIST_HEAD(ptrace_dead);
@@ -402,7 +404,7 @@ int ptrace_writedata(struct task_struct *tsk, char __user *src, unsigned long ds
402 return copied; 404 return copied;
403} 405}
404 406
405static int ptrace_setoptions(struct task_struct *child, long data) 407static int ptrace_setoptions(struct task_struct *child, unsigned long data)
406{ 408{
407 child->ptrace &= ~PT_TRACE_MASK; 409 child->ptrace &= ~PT_TRACE_MASK;
408 410
@@ -481,7 +483,8 @@ static int ptrace_setsiginfo(struct task_struct *child, const siginfo_t *info)
481#define is_sysemu_singlestep(request) 0 483#define is_sysemu_singlestep(request) 0
482#endif 484#endif
483 485
484static int ptrace_resume(struct task_struct *child, long request, long data) 486static int ptrace_resume(struct task_struct *child, long request,
487 unsigned long data)
485{ 488{
486 if (!valid_signal(data)) 489 if (!valid_signal(data))
487 return -EIO; 490 return -EIO;
@@ -558,10 +561,12 @@ static int ptrace_regset(struct task_struct *task, int req, unsigned int type,
558#endif 561#endif
559 562
560int ptrace_request(struct task_struct *child, long request, 563int ptrace_request(struct task_struct *child, long request,
561 long addr, long data) 564 unsigned long addr, unsigned long data)
562{ 565{
563 int ret = -EIO; 566 int ret = -EIO;
564 siginfo_t siginfo; 567 siginfo_t siginfo;
568 void __user *datavp = (void __user *) data;
569 unsigned long __user *datalp = datavp;
565 570
566 switch (request) { 571 switch (request) {
567 case PTRACE_PEEKTEXT: 572 case PTRACE_PEEKTEXT:
@@ -578,19 +583,17 @@ int ptrace_request(struct task_struct *child, long request,
578 ret = ptrace_setoptions(child, data); 583 ret = ptrace_setoptions(child, data);
579 break; 584 break;
580 case PTRACE_GETEVENTMSG: 585 case PTRACE_GETEVENTMSG:
581 ret = put_user(child->ptrace_message, (unsigned long __user *) data); 586 ret = put_user(child->ptrace_message, datalp);
582 break; 587 break;
583 588
584 case PTRACE_GETSIGINFO: 589 case PTRACE_GETSIGINFO:
585 ret = ptrace_getsiginfo(child, &siginfo); 590 ret = ptrace_getsiginfo(child, &siginfo);
586 if (!ret) 591 if (!ret)
587 ret = copy_siginfo_to_user((siginfo_t __user *) data, 592 ret = copy_siginfo_to_user(datavp, &siginfo);
588 &siginfo);
589 break; 593 break;
590 594
591 case PTRACE_SETSIGINFO: 595 case PTRACE_SETSIGINFO:
592 if (copy_from_user(&siginfo, (siginfo_t __user *) data, 596 if (copy_from_user(&siginfo, datavp, sizeof siginfo))
593 sizeof siginfo))
594 ret = -EFAULT; 597 ret = -EFAULT;
595 else 598 else
596 ret = ptrace_setsiginfo(child, &siginfo); 599 ret = ptrace_setsiginfo(child, &siginfo);
@@ -621,7 +624,7 @@ int ptrace_request(struct task_struct *child, long request,
621 } 624 }
622 mmput(mm); 625 mmput(mm);
623 626
624 ret = put_user(tmp, (unsigned long __user *) data); 627 ret = put_user(tmp, datalp);
625 break; 628 break;
626 } 629 }
627#endif 630#endif
@@ -650,7 +653,7 @@ int ptrace_request(struct task_struct *child, long request,
650 case PTRACE_SETREGSET: 653 case PTRACE_SETREGSET:
651 { 654 {
652 struct iovec kiov; 655 struct iovec kiov;
653 struct iovec __user *uiov = (struct iovec __user *) data; 656 struct iovec __user *uiov = datavp;
654 657
655 if (!access_ok(VERIFY_WRITE, uiov, sizeof(*uiov))) 658 if (!access_ok(VERIFY_WRITE, uiov, sizeof(*uiov)))
656 return -EFAULT; 659 return -EFAULT;
@@ -691,7 +694,8 @@ static struct task_struct *ptrace_get_task_struct(pid_t pid)
691#define arch_ptrace_attach(child) do { } while (0) 694#define arch_ptrace_attach(child) do { } while (0)
692#endif 695#endif
693 696
694SYSCALL_DEFINE4(ptrace, long, request, long, pid, long, addr, long, data) 697SYSCALL_DEFINE4(ptrace, long, request, long, pid, unsigned long, addr,
698 unsigned long, data)
695{ 699{
696 struct task_struct *child; 700 struct task_struct *child;
697 long ret; 701 long ret;
@@ -732,7 +736,8 @@ SYSCALL_DEFINE4(ptrace, long, request, long, pid, long, addr, long, data)
732 return ret; 736 return ret;
733} 737}
734 738
735int generic_ptrace_peekdata(struct task_struct *tsk, long addr, long data) 739int generic_ptrace_peekdata(struct task_struct *tsk, unsigned long addr,
740 unsigned long data)
736{ 741{
737 unsigned long tmp; 742 unsigned long tmp;
738 int copied; 743 int copied;
@@ -743,7 +748,8 @@ int generic_ptrace_peekdata(struct task_struct *tsk, long addr, long data)
743 return put_user(tmp, (unsigned long __user *)data); 748 return put_user(tmp, (unsigned long __user *)data);
744} 749}
745 750
746int generic_ptrace_pokedata(struct task_struct *tsk, long addr, long data) 751int generic_ptrace_pokedata(struct task_struct *tsk, unsigned long addr,
752 unsigned long data)
747{ 753{
748 int copied; 754 int copied;
749 755
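
The ptrace.c hunks above switch addr/data from long to unsigned long and funnel user pointers through datavp/datalp. As a rough illustration of what the kernel now receives, here is a minimal user-space tracer sketch (not part of the patch; the target address 0x1000 is made up) that passes addr and data as pointer-sized values:

#include <sys/ptrace.h>
#include <sys/types.h>
#include <sys/wait.h>
#include <unistd.h>
#include <stdio.h>
#include <errno.h>

int main(void)
{
	long word;
	pid_t child = fork();

	if (child == 0) {
		/* Child: ask to be traced, then stop at execve(). */
		ptrace(PTRACE_TRACEME, 0, NULL, NULL);
		execlp("true", "true", (char *)NULL);
		return 1;
	}

	waitpid(child, NULL, 0);	/* wait for the exec stop */

	/* addr and data reach ptrace_request() as unsigned long now; an
	 * unmapped address simply fails with errno set. */
	errno = 0;
	word = ptrace(PTRACE_PEEKDATA, child, (void *)0x1000UL, NULL);
	if (errno)
		perror("PTRACE_PEEKDATA");
	else
		printf("word at 0x1000: %#lx\n", word);

	ptrace(PTRACE_DETACH, child, NULL, NULL);
	return 0;
}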
diff --git a/kernel/range.c b/kernel/range.c
index 471b66acabb5..37fa9b99ad58 100644
--- a/kernel/range.c
+++ b/kernel/range.c
@@ -119,7 +119,7 @@ static int cmp_range(const void *x1, const void *x2)
119 119
120int clean_sort_range(struct range *range, int az) 120int clean_sort_range(struct range *range, int az)
121{ 121{
122 int i, j, k = az - 1, nr_range = 0; 122 int i, j, k = az - 1, nr_range = az;
123 123
124 for (i = 0; i < k; i++) { 124 for (i = 0; i < k; i++) {
125 if (range[i].end) 125 if (range[i].end)
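
The one-line range.c change matters when the array has no empty slots at all: the counting step at the end of clean_sort_range() only overwrites nr_range when it finds a hole. A small stand-alone model of that step (hypothetical count_ranges() helper and sample data, not the kernel code):

#include <stdio.h>

struct range { unsigned long long start, end; };

/* Count the leading non-empty entries the way the tail of
 * clean_sort_range() does. */
static int count_ranges(const struct range *range, int az)
{
	int i, nr_range = az;		/* was 0 before the fix */

	for (i = 0; i < az; i++) {
		if (!range[i].end) {
			nr_range = i;
			break;
		}
	}
	return nr_range;
}

int main(void)
{
	struct range full[2] = { { 0x0, 0xfff }, { 0x1000, 0x1fff } };

	/* A fully populated array now reports 2; with the old
	 * initializer of 0 it would have reported 0 entries. */
	printf("nr_range = %d\n", count_ranges(full, 2));
	return 0;
}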
diff --git a/kernel/relay.c b/kernel/relay.c
index c7cf397fb929..859ea5a9605f 100644
--- a/kernel/relay.c
+++ b/kernel/relay.c
@@ -70,17 +70,10 @@ static const struct vm_operations_struct relay_file_mmap_ops = {
70 */ 70 */
71static struct page **relay_alloc_page_array(unsigned int n_pages) 71static struct page **relay_alloc_page_array(unsigned int n_pages)
72{ 72{
73 struct page **array; 73 const size_t pa_size = n_pages * sizeof(struct page *);
74 size_t pa_size = n_pages * sizeof(struct page *); 74 if (pa_size > PAGE_SIZE)
75 75 return vzalloc(pa_size);
76 if (pa_size > PAGE_SIZE) { 76 return kzalloc(pa_size, GFP_KERNEL);
77 array = vmalloc(pa_size);
78 if (array)
79 memset(array, 0, pa_size);
80 } else {
81 array = kzalloc(pa_size, GFP_KERNEL);
82 }
83 return array;
84} 77}
85 78
86/* 79/*
diff --git a/kernel/resource.c b/kernel/resource.c
index 7b36976e5dea..9fad33efd0db 100644
--- a/kernel/resource.c
+++ b/kernel/resource.c
@@ -40,6 +40,23 @@ EXPORT_SYMBOL(iomem_resource);
40 40
41static DEFINE_RWLOCK(resource_lock); 41static DEFINE_RWLOCK(resource_lock);
42 42
43/*
44 * By default, we allocate free space bottom-up. The architecture can request
45 * top-down by clearing this flag. The user can override the architecture's
46 * choice with the "resource_alloc_from_bottom" kernel boot option, but that
47 * should only be a debugging tool.
48 */
49int resource_alloc_from_bottom = 1;
50
51static __init int setup_alloc_from_bottom(char *s)
52{
53 printk(KERN_INFO
54 "resource: allocating from bottom-up; please report a bug\n");
55 resource_alloc_from_bottom = 1;
56 return 0;
57}
58early_param("resource_alloc_from_bottom", setup_alloc_from_bottom);
59
43static void *r_next(struct seq_file *m, void *v, loff_t *pos) 60static void *r_next(struct seq_file *m, void *v, loff_t *pos)
44{ 61{
45 struct resource *p = v; 62 struct resource *p = v;
@@ -357,8 +374,97 @@ int __weak page_is_ram(unsigned long pfn)
357 return walk_system_ram_range(pfn, 1, NULL, __is_ram) == 1; 374 return walk_system_ram_range(pfn, 1, NULL, __is_ram) == 1;
358} 375}
359 376
377static resource_size_t simple_align_resource(void *data,
378 const struct resource *avail,
379 resource_size_t size,
380 resource_size_t align)
381{
382 return avail->start;
383}
384
385static void resource_clip(struct resource *res, resource_size_t min,
386 resource_size_t max)
387{
388 if (res->start < min)
389 res->start = min;
390 if (res->end > max)
391 res->end = max;
392}
393
394static bool resource_contains(struct resource *res1, struct resource *res2)
395{
396 return res1->start <= res2->start && res1->end >= res2->end;
397}
398
399/*
400 * Find the resource before "child" in the sibling list of "root" children.
401 */
402static struct resource *find_sibling_prev(struct resource *root, struct resource *child)
403{
404 struct resource *this;
405
406 for (this = root->child; this; this = this->sibling)
407 if (this->sibling == child)
408 return this;
409
410 return NULL;
411}
412
413/*
414 * Find empty slot in the resource tree given range and alignment.
415 * This version allocates from the end of the root resource first.
416 */
417static int find_resource_from_top(struct resource *root, struct resource *new,
418 resource_size_t size, resource_size_t min,
419 resource_size_t max, resource_size_t align,
420 resource_size_t (*alignf)(void *,
421 const struct resource *,
422 resource_size_t,
423 resource_size_t),
424 void *alignf_data)
425{
426 struct resource *this;
427 struct resource tmp, avail, alloc;
428
429 tmp.start = root->end;
430 tmp.end = root->end;
431
432 this = find_sibling_prev(root, NULL);
433 for (;;) {
434 if (this) {
435 if (this->end < root->end)
436 tmp.start = this->end + 1;
437 } else
438 tmp.start = root->start;
439
440 resource_clip(&tmp, min, max);
441
442 /* Check for overflow after ALIGN() */
443 avail = *new;
444 avail.start = ALIGN(tmp.start, align);
445 avail.end = tmp.end;
446 if (avail.start >= tmp.start) {
447 alloc.start = alignf(alignf_data, &avail, size, align);
448 alloc.end = alloc.start + size - 1;
449 if (resource_contains(&avail, &alloc)) {
450 new->start = alloc.start;
451 new->end = alloc.end;
452 return 0;
453 }
454 }
455
456 if (!this || this->start == root->start)
457 break;
458
459 tmp.end = this->start - 1;
460 this = find_sibling_prev(root, this);
461 }
462 return -EBUSY;
463}
464
360/* 465/*
361 * Find empty slot in the resource tree given range and alignment. 466 * Find empty slot in the resource tree given range and alignment.
467 * This version allocates from the beginning of the root resource first.
362 */ 468 */
363static int find_resource(struct resource *root, struct resource *new, 469static int find_resource(struct resource *root, struct resource *new,
364 resource_size_t size, resource_size_t min, 470 resource_size_t size, resource_size_t min,
@@ -370,36 +476,43 @@ static int find_resource(struct resource *root, struct resource *new,
370 void *alignf_data) 476 void *alignf_data)
371{ 477{
372 struct resource *this = root->child; 478 struct resource *this = root->child;
373 struct resource tmp = *new; 479 struct resource tmp = *new, avail, alloc;
374 480
375 tmp.start = root->start; 481 tmp.start = root->start;
376 /* 482 /*
377 * Skip past an allocated resource that starts at 0, since the assignment 483 * Skip past an allocated resource that starts at 0, since the
378 * of this->start - 1 to tmp->end below would cause an underflow. 484 * assignment of this->start - 1 to tmp->end below would cause an
485 * underflow.
379 */ 486 */
380 if (this && this->start == 0) { 487 if (this && this->start == 0) {
381 tmp.start = this->end + 1; 488 tmp.start = this->end + 1;
382 this = this->sibling; 489 this = this->sibling;
383 } 490 }
384 for(;;) { 491 for (;;) {
385 if (this) 492 if (this)
386 tmp.end = this->start - 1; 493 tmp.end = this->start - 1;
387 else 494 else
388 tmp.end = root->end; 495 tmp.end = root->end;
389 if (tmp.start < min) 496
390 tmp.start = min; 497 resource_clip(&tmp, min, max);
391 if (tmp.end > max) 498
392 tmp.end = max; 499 /* Check for overflow after ALIGN() */
393 tmp.start = ALIGN(tmp.start, align); 500 avail = *new;
394 if (alignf) 501 avail.start = ALIGN(tmp.start, align);
395 tmp.start = alignf(alignf_data, &tmp, size, align); 502 avail.end = tmp.end;
396 if (tmp.start < tmp.end && tmp.end - tmp.start >= size - 1) { 503 if (avail.start >= tmp.start) {
397 new->start = tmp.start; 504 alloc.start = alignf(alignf_data, &avail, size, align);
398 new->end = tmp.start + size - 1; 505 alloc.end = alloc.start + size - 1;
399 return 0; 506 if (resource_contains(&avail, &alloc)) {
507 new->start = alloc.start;
508 new->end = alloc.end;
509 return 0;
510 }
400 } 511 }
512
401 if (!this) 513 if (!this)
402 break; 514 break;
515
403 tmp.start = this->end + 1; 516 tmp.start = this->end + 1;
404 this = this->sibling; 517 this = this->sibling;
405 } 518 }
@@ -428,8 +541,14 @@ int allocate_resource(struct resource *root, struct resource *new,
428{ 541{
429 int err; 542 int err;
430 543
544 if (!alignf)
545 alignf = simple_align_resource;
546
431 write_lock(&resource_lock); 547 write_lock(&resource_lock);
432 err = find_resource(root, new, size, min, max, align, alignf, alignf_data); 548 if (resource_alloc_from_bottom)
549 err = find_resource(root, new, size, min, max, align, alignf, alignf_data);
550 else
551 err = find_resource_from_top(root, new, size, min, max, align, alignf, alignf_data);
433 if (err >= 0 && __request_resource(root, new)) 552 if (err >= 0 && __request_resource(root, new))
434 err = -EBUSY; 553 err = -EBUSY;
435 write_unlock(&resource_lock); 554 write_unlock(&resource_lock);
@@ -453,6 +572,8 @@ static struct resource * __insert_resource(struct resource *parent, struct resou
453 572
454 if (first == parent) 573 if (first == parent)
455 return first; 574 return first;
575 if (WARN_ON(first == new)) /* duplicated insertion */
576 return first;
456 577
457 if ((first->start > new->start) || (first->end < new->end)) 578 if ((first->start > new->start) || (first->end < new->end))
458 break; 579 break;
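
find_resource_from_top() above walks the gaps between a root resource's children from the highest address downwards, clipping to [min, max] and honouring the alignf callback. A simplified, self-contained model of just the top-down gap walk (hypothetical find_from_top() helper, an array instead of a sibling list, no clipping or alignment):

#include <stdio.h>

struct res { unsigned long start, end; };	/* inclusive bounds */

/*
 * Return the start of the highest free gap of at least "size" bytes
 * inside [root_start, root_end], given "n" allocated children sorted
 * by address, or ~0UL when nothing fits.
 */
static unsigned long find_from_top(unsigned long root_start,
				   unsigned long root_end,
				   const struct res *child, int n,
				   unsigned long size)
{
	unsigned long gap_end = root_end;
	int i;

	for (i = n - 1; i >= -1; i--) {
		unsigned long gap_start = (i >= 0) ? child[i].end + 1
						   : root_start;

		if (gap_start <= gap_end && gap_end - gap_start + 1 >= size)
			return gap_end - size + 1;	/* highest fit */

		if (i < 0 || child[i].start == root_start)
			break;			/* no space left below */
		gap_end = child[i].start - 1;
	}
	return ~0UL;
}

int main(void)
{
	/* Two children inside a 0x0000-0xffff root resource. */
	struct res kids[] = { { 0x1000, 0x1fff }, { 0x8000, 0x8fff } };

	/* A 0x100-byte request lands at the top of the root: 0xff00. */
	printf("%#lx\n", find_from_top(0x0, 0xffff, kids, 2, 0x100));
	return 0;
}

In the patch itself, allocate_resource() chooses between this walk and the original bottom-up find_resource() based on resource_alloc_from_bottom.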
diff --git a/kernel/sched.c b/kernel/sched.c
index d42992bccdfa..dc91a4d09ac3 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -560,18 +560,8 @@ struct rq {
560 560
561static DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues); 561static DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues);
562 562
563static inline
564void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags)
565{
566 rq->curr->sched_class->check_preempt_curr(rq, p, flags);
567 563
568 /* 564static void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags);
569 * A queue event has occurred, and we're going to schedule. In
570 * this case, we can save a useless back to back clock update.
571 */
572 if (test_tsk_need_resched(p))
573 rq->skip_clock_update = 1;
574}
575 565
576static inline int cpu_of(struct rq *rq) 566static inline int cpu_of(struct rq *rq)
577{ 567{
@@ -2118,6 +2108,31 @@ static inline void check_class_changed(struct rq *rq, struct task_struct *p,
2118 p->sched_class->prio_changed(rq, p, oldprio, running); 2108 p->sched_class->prio_changed(rq, p, oldprio, running);
2119} 2109}
2120 2110
2111static void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags)
2112{
2113 const struct sched_class *class;
2114
2115 if (p->sched_class == rq->curr->sched_class) {
2116 rq->curr->sched_class->check_preempt_curr(rq, p, flags);
2117 } else {
2118 for_each_class(class) {
2119 if (class == rq->curr->sched_class)
2120 break;
2121 if (class == p->sched_class) {
2122 resched_task(rq->curr);
2123 break;
2124 }
2125 }
2126 }
2127
2128 /*
2129 * A queue event has occurred, and we're going to schedule. In
2130 * this case, we can save a useless back to back clock update.
2131 */
2132 if (test_tsk_need_resched(rq->curr))
2133 rq->skip_clock_update = 1;
2134}
2135
2121#ifdef CONFIG_SMP 2136#ifdef CONFIG_SMP
2122/* 2137/*
2123 * Is this task likely cache-hot: 2138 * Is this task likely cache-hot:
@@ -6960,6 +6975,8 @@ static void init_sched_groups_power(int cpu, struct sched_domain *sd)
6960 if (cpu != group_first_cpu(sd->groups)) 6975 if (cpu != group_first_cpu(sd->groups))
6961 return; 6976 return;
6962 6977
6978 sd->groups->group_weight = cpumask_weight(sched_group_cpus(sd->groups));
6979
6963 child = sd->child; 6980 child = sd->child;
6964 6981
6965 sd->groups->cpu_power = 0; 6982 sd->groups->cpu_power = 0;
@@ -8510,12 +8527,12 @@ void sched_move_task(struct task_struct *tsk)
8510 if (unlikely(running)) 8527 if (unlikely(running))
8511 tsk->sched_class->put_prev_task(rq, tsk); 8528 tsk->sched_class->put_prev_task(rq, tsk);
8512 8529
8513 set_task_rq(tsk, task_cpu(tsk));
8514
8515#ifdef CONFIG_FAIR_GROUP_SCHED 8530#ifdef CONFIG_FAIR_GROUP_SCHED
8516 if (tsk->sched_class->moved_group) 8531 if (tsk->sched_class->task_move_group)
8517 tsk->sched_class->moved_group(tsk, on_rq); 8532 tsk->sched_class->task_move_group(tsk, on_rq);
8533 else
8518#endif 8534#endif
8535 set_task_rq(tsk, task_cpu(tsk));
8519 8536
8520 if (unlikely(running)) 8537 if (unlikely(running))
8521 tsk->sched_class->set_curr_task(rq); 8538 tsk->sched_class->set_curr_task(rq);
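
The reworked check_preempt_curr() only delegates to the class's own callback when the woken task and the current task share a scheduling class; otherwise it rescheds purely on class priority, walking the classes highest-first. A toy model of that ordering decision (class names as strings are an illustration only; the real code compares struct sched_class pointers via for_each_class()):

#include <stdio.h>
#include <string.h>

/* Scheduling classes in (roughly) descending priority order. */
static const char *const classes[] = { "stop", "rt", "fair", "idle" };

/* Return 1 if a woken task of class "wakee" preempts a current task of
 * class "curr" on class priority alone; same-class cases return 0 and
 * are left to the class's own check_preempt_curr callback. */
static int class_preempts(const char *wakee, const char *curr)
{
	size_t i;

	for (i = 0; i < sizeof(classes) / sizeof(classes[0]); i++) {
		if (!strcmp(classes[i], curr))
			return 0;	/* hit current's class first */
		if (!strcmp(classes[i], wakee))
			return 1;	/* wakee's class ranks higher */
	}
	return 0;
}

int main(void)
{
	printf("rt wakes on a fair cpu:  preempt=%d\n",
	       class_preempts("rt", "fair"));
	printf("fair wakes on an rt cpu: preempt=%d\n",
	       class_preempts("fair", "rt"));
	return 0;
}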
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index 933f3d1b62ea..00ebd7686676 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -1654,12 +1654,6 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_
1654 struct cfs_rq *cfs_rq = task_cfs_rq(curr); 1654 struct cfs_rq *cfs_rq = task_cfs_rq(curr);
1655 int scale = cfs_rq->nr_running >= sched_nr_latency; 1655 int scale = cfs_rq->nr_running >= sched_nr_latency;
1656 1656
1657 if (unlikely(rt_prio(p->prio)))
1658 goto preempt;
1659
1660 if (unlikely(p->sched_class != &fair_sched_class))
1661 return;
1662
1663 if (unlikely(se == pse)) 1657 if (unlikely(se == pse))
1664 return; 1658 return;
1665 1659
@@ -1764,10 +1758,6 @@ static void pull_task(struct rq *src_rq, struct task_struct *p,
1764 set_task_cpu(p, this_cpu); 1758 set_task_cpu(p, this_cpu);
1765 activate_task(this_rq, p, 0); 1759 activate_task(this_rq, p, 0);
1766 check_preempt_curr(this_rq, p, 0); 1760 check_preempt_curr(this_rq, p, 0);
1767
1768 /* re-arm NEWIDLE balancing when moving tasks */
1769 src_rq->avg_idle = this_rq->avg_idle = 2*sysctl_sched_migration_cost;
1770 this_rq->idle_stamp = 0;
1771} 1761}
1772 1762
1773/* 1763/*
@@ -2035,13 +2025,16 @@ struct sd_lb_stats {
2035 unsigned long this_load_per_task; 2025 unsigned long this_load_per_task;
2036 unsigned long this_nr_running; 2026 unsigned long this_nr_running;
2037 unsigned long this_has_capacity; 2027 unsigned long this_has_capacity;
2028 unsigned int this_idle_cpus;
2038 2029
2039 /* Statistics of the busiest group */ 2030 /* Statistics of the busiest group */
2031 unsigned int busiest_idle_cpus;
2040 unsigned long max_load; 2032 unsigned long max_load;
2041 unsigned long busiest_load_per_task; 2033 unsigned long busiest_load_per_task;
2042 unsigned long busiest_nr_running; 2034 unsigned long busiest_nr_running;
2043 unsigned long busiest_group_capacity; 2035 unsigned long busiest_group_capacity;
2044 unsigned long busiest_has_capacity; 2036 unsigned long busiest_has_capacity;
2037 unsigned int busiest_group_weight;
2045 2038
2046 int group_imb; /* Is there imbalance in this sd */ 2039 int group_imb; /* Is there imbalance in this sd */
2047#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT) 2040#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
@@ -2063,6 +2056,8 @@ struct sg_lb_stats {
2063 unsigned long sum_nr_running; /* Nr tasks running in the group */ 2056 unsigned long sum_nr_running; /* Nr tasks running in the group */
2064 unsigned long sum_weighted_load; /* Weighted load of group's tasks */ 2057 unsigned long sum_weighted_load; /* Weighted load of group's tasks */
2065 unsigned long group_capacity; 2058 unsigned long group_capacity;
2059 unsigned long idle_cpus;
2060 unsigned long group_weight;
2066 int group_imb; /* Is there an imbalance in the group ? */ 2061 int group_imb; /* Is there an imbalance in the group ? */
2067 int group_has_capacity; /* Is there extra capacity in the group? */ 2062 int group_has_capacity; /* Is there extra capacity in the group? */
2068}; 2063};
@@ -2431,7 +2426,8 @@ static inline void update_sg_lb_stats(struct sched_domain *sd,
2431 sgs->group_load += load; 2426 sgs->group_load += load;
2432 sgs->sum_nr_running += rq->nr_running; 2427 sgs->sum_nr_running += rq->nr_running;
2433 sgs->sum_weighted_load += weighted_cpuload(i); 2428 sgs->sum_weighted_load += weighted_cpuload(i);
2434 2429 if (idle_cpu(i))
2430 sgs->idle_cpus++;
2435 } 2431 }
2436 2432
2437 /* 2433 /*
@@ -2469,6 +2465,7 @@ static inline void update_sg_lb_stats(struct sched_domain *sd,
2469 sgs->group_capacity = DIV_ROUND_CLOSEST(group->cpu_power, SCHED_LOAD_SCALE); 2465 sgs->group_capacity = DIV_ROUND_CLOSEST(group->cpu_power, SCHED_LOAD_SCALE);
2470 if (!sgs->group_capacity) 2466 if (!sgs->group_capacity)
2471 sgs->group_capacity = fix_small_capacity(sd, group); 2467 sgs->group_capacity = fix_small_capacity(sd, group);
2468 sgs->group_weight = group->group_weight;
2472 2469
2473 if (sgs->group_capacity > sgs->sum_nr_running) 2470 if (sgs->group_capacity > sgs->sum_nr_running)
2474 sgs->group_has_capacity = 1; 2471 sgs->group_has_capacity = 1;
@@ -2576,13 +2573,16 @@ static inline void update_sd_lb_stats(struct sched_domain *sd, int this_cpu,
2576 sds->this_nr_running = sgs.sum_nr_running; 2573 sds->this_nr_running = sgs.sum_nr_running;
2577 sds->this_load_per_task = sgs.sum_weighted_load; 2574 sds->this_load_per_task = sgs.sum_weighted_load;
2578 sds->this_has_capacity = sgs.group_has_capacity; 2575 sds->this_has_capacity = sgs.group_has_capacity;
2576 sds->this_idle_cpus = sgs.idle_cpus;
2579 } else if (update_sd_pick_busiest(sd, sds, sg, &sgs, this_cpu)) { 2577 } else if (update_sd_pick_busiest(sd, sds, sg, &sgs, this_cpu)) {
2580 sds->max_load = sgs.avg_load; 2578 sds->max_load = sgs.avg_load;
2581 sds->busiest = sg; 2579 sds->busiest = sg;
2582 sds->busiest_nr_running = sgs.sum_nr_running; 2580 sds->busiest_nr_running = sgs.sum_nr_running;
2581 sds->busiest_idle_cpus = sgs.idle_cpus;
2583 sds->busiest_group_capacity = sgs.group_capacity; 2582 sds->busiest_group_capacity = sgs.group_capacity;
2584 sds->busiest_load_per_task = sgs.sum_weighted_load; 2583 sds->busiest_load_per_task = sgs.sum_weighted_load;
2585 sds->busiest_has_capacity = sgs.group_has_capacity; 2584 sds->busiest_has_capacity = sgs.group_has_capacity;
2585 sds->busiest_group_weight = sgs.group_weight;
2586 sds->group_imb = sgs.group_imb; 2586 sds->group_imb = sgs.group_imb;
2587 } 2587 }
2588 2588
@@ -2860,8 +2860,26 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
2860 if (sds.this_load >= sds.avg_load) 2860 if (sds.this_load >= sds.avg_load)
2861 goto out_balanced; 2861 goto out_balanced;
2862 2862
2863 if (100 * sds.max_load <= sd->imbalance_pct * sds.this_load) 2863 /*
2864 goto out_balanced; 2864 * In the CPU_NEWLY_IDLE, use imbalance_pct to be conservative.
2865 * And to check for busy balance use !idle_cpu instead of
2866 * CPU_NOT_IDLE. This is because HT siblings will use CPU_NOT_IDLE
2867 * even when they are idle.
2868 */
2869 if (idle == CPU_NEWLY_IDLE || !idle_cpu(this_cpu)) {
2870 if (100 * sds.max_load <= sd->imbalance_pct * sds.this_load)
2871 goto out_balanced;
2872 } else {
2873 /*
2874 * This cpu is idle. If the busiest group load doesn't
2875 * have more tasks than the number of available cpu's and
2876 * there is no imbalance between this and busiest group
2877 * wrt to idle cpu's, it is balanced.
2878 */
2879 if ((sds.this_idle_cpus <= sds.busiest_idle_cpus + 1) &&
2880 sds.busiest_nr_running <= sds.busiest_group_weight)
2881 goto out_balanced;
2882 }
2865 2883
2866force_balance: 2884force_balance:
2867 /* Looks like there is an imbalance. Compute it */ 2885 /* Looks like there is an imbalance. Compute it */
@@ -3197,8 +3215,10 @@ static void idle_balance(int this_cpu, struct rq *this_rq)
3197 interval = msecs_to_jiffies(sd->balance_interval); 3215 interval = msecs_to_jiffies(sd->balance_interval);
3198 if (time_after(next_balance, sd->last_balance + interval)) 3216 if (time_after(next_balance, sd->last_balance + interval))
3199 next_balance = sd->last_balance + interval; 3217 next_balance = sd->last_balance + interval;
3200 if (pulled_task) 3218 if (pulled_task) {
3219 this_rq->idle_stamp = 0;
3201 break; 3220 break;
3221 }
3202 } 3222 }
3203 3223
3204 raw_spin_lock(&this_rq->lock); 3224 raw_spin_lock(&this_rq->lock);
@@ -3869,13 +3889,26 @@ static void set_curr_task_fair(struct rq *rq)
3869} 3889}
3870 3890
3871#ifdef CONFIG_FAIR_GROUP_SCHED 3891#ifdef CONFIG_FAIR_GROUP_SCHED
3872static void moved_group_fair(struct task_struct *p, int on_rq) 3892static void task_move_group_fair(struct task_struct *p, int on_rq)
3873{ 3893{
3874 struct cfs_rq *cfs_rq = task_cfs_rq(p); 3894 /*
3875 3895 * If the task was not on the rq at the time of this cgroup movement
3876 update_curr(cfs_rq); 3896 * it must have been asleep, sleeping tasks keep their ->vruntime
3897 * absolute on their old rq until wakeup (needed for the fair sleeper
3898 * bonus in place_entity()).
3899 *
3900 * If it was on the rq, we've just 'preempted' it, which does convert
3901 * ->vruntime to a relative base.
3902 *
3903 * Make sure both cases convert their relative position when migrating
3904 * to another cgroup's rq. This does somewhat interfere with the
3905 * fair sleeper stuff for the first placement, but who cares.
3906 */
3907 if (!on_rq)
3908 p->se.vruntime -= cfs_rq_of(&p->se)->min_vruntime;
3909 set_task_rq(p, task_cpu(p));
3877 if (!on_rq) 3910 if (!on_rq)
3878 place_entity(cfs_rq, &p->se, 1); 3911 p->se.vruntime += cfs_rq_of(&p->se)->min_vruntime;
3879} 3912}
3880#endif 3913#endif
3881 3914
@@ -3927,7 +3960,7 @@ static const struct sched_class fair_sched_class = {
3927 .get_rr_interval = get_rr_interval_fair, 3960 .get_rr_interval = get_rr_interval_fair,
3928 3961
3929#ifdef CONFIG_FAIR_GROUP_SCHED 3962#ifdef CONFIG_FAIR_GROUP_SCHED
3930 .moved_group = moved_group_fair, 3963 .task_move_group = task_move_group_fair,
3931#endif 3964#endif
3932}; 3965};
3933 3966
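
task_move_group_fair() keeps a task's CFS lag intact across a cgroup move by converting vruntime to a value relative to the old cfs_rq, switching runqueues, and re-basing it on the new one. The arithmetic is just this (made-up nanosecond values for illustration):

#include <stdio.h>

int main(void)
{
	/* A sleeping task keeps an absolute vruntime from its old cfs_rq. */
	unsigned long long vruntime = 1000000;	/* task vruntime (ns) */
	unsigned long long old_min  =  900000;	/* old cfs_rq min_vruntime */
	unsigned long long new_min  = 5000000;	/* new cfs_rq min_vruntime */

	vruntime -= old_min;	/* make it relative to the old queue */
	/* ... set_task_rq() switches the task to the new cfs_rq ... */
	vruntime += new_min;	/* re-base on the new queue */

	/* The 100000 ns of accumulated lag survives the move: 5100000. */
	printf("vruntime on new cfs_rq: %llu\n", vruntime);
	return 0;
}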
diff --git a/kernel/sched_stats.h b/kernel/sched_stats.h
index 25c2f962f6fc..48ddf431db0e 100644
--- a/kernel/sched_stats.h
+++ b/kernel/sched_stats.h
@@ -157,15 +157,7 @@ static inline void sched_info_reset_dequeued(struct task_struct *t)
157} 157}
158 158
159/* 159/*
160 * Called when a process is dequeued from the active array and given 160 * We are interested in knowing how long it was from the *first* time a
161 * the cpu. We should note that with the exception of interactive
162 * tasks, the expired queue will become the active queue after the active
163 * queue is empty, without explicitly dequeuing and requeuing tasks in the
164 * expired queue. (Interactive tasks may be requeued directly to the
165 * active queue, thus delaying tasks in the expired queue from running;
166 * see scheduler_tick()).
167 *
168 * Though we are interested in knowing how long it was from the *first* time a
169 * task was queued to the time that it finally hit a cpu, we call this routine 161 * task was queued to the time that it finally hit a cpu, we call this routine
170 * from dequeue_task() to account for possible rq->clock skew across cpus. The 162 * from dequeue_task() to account for possible rq->clock skew across cpus. The
171 * delta taken on each cpu would annul the skew. 163 * delta taken on each cpu would annul the skew.
@@ -203,16 +195,6 @@ static void sched_info_arrive(struct task_struct *t)
203} 195}
204 196
205/* 197/*
206 * Called when a process is queued into either the active or expired
207 * array. The time is noted and later used to determine how long we
208 * had to wait for us to reach the cpu. Since the expired queue will
209 * become the active queue after active queue is empty, without dequeuing
210 * and requeuing any tasks, we are interested in queuing to either. It
211 * is unusual but not impossible for tasks to be dequeued and immediately
212 * requeued in the same or another array: this can happen in sched_yield(),
213 * set_user_nice(), and even load_balance() as it moves tasks from runqueue
214 * to runqueue.
215 *
216 * This function is only called from enqueue_task(), but also only updates 198 * This function is only called from enqueue_task(), but also only updates
217 * the timestamp if it is already not set. It's assumed that 199 * the timestamp if it is already not set. It's assumed that
218 * sched_info_dequeued() will clear that stamp when appropriate. 200 * sched_info_dequeued() will clear that stamp when appropriate.
diff --git a/kernel/sched_stoptask.c b/kernel/sched_stoptask.c
index 45bddc0c1048..2bf6b47058c1 100644
--- a/kernel/sched_stoptask.c
+++ b/kernel/sched_stoptask.c
@@ -19,14 +19,14 @@ select_task_rq_stop(struct rq *rq, struct task_struct *p,
19static void 19static void
20check_preempt_curr_stop(struct rq *rq, struct task_struct *p, int flags) 20check_preempt_curr_stop(struct rq *rq, struct task_struct *p, int flags)
21{ 21{
22 resched_task(rq->curr); /* we preempt everything */ 22 /* we're never preempted */
23} 23}
24 24
25static struct task_struct *pick_next_task_stop(struct rq *rq) 25static struct task_struct *pick_next_task_stop(struct rq *rq)
26{ 26{
27 struct task_struct *stop = rq->stop; 27 struct task_struct *stop = rq->stop;
28 28
29 if (stop && stop->state == TASK_RUNNING) 29 if (stop && stop->se.on_rq)
30 return stop; 30 return stop;
31 31
32 return NULL; 32 return NULL;
diff --git a/kernel/signal.c b/kernel/signal.c
index 919562c3d6b7..4e3cff10fdce 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -1105,7 +1105,8 @@ int zap_other_threads(struct task_struct *p)
1105 return count; 1105 return count;
1106} 1106}
1107 1107
1108struct sighand_struct *lock_task_sighand(struct task_struct *tsk, unsigned long *flags) 1108struct sighand_struct *__lock_task_sighand(struct task_struct *tsk,
1109 unsigned long *flags)
1109{ 1110{
1110 struct sighand_struct *sighand; 1111 struct sighand_struct *sighand;
1111 1112
@@ -1617,6 +1618,8 @@ static int sigkill_pending(struct task_struct *tsk)
1617 * is gone, we keep current->exit_code unless clear_code. 1618 * is gone, we keep current->exit_code unless clear_code.
1618 */ 1619 */
1619static void ptrace_stop(int exit_code, int clear_code, siginfo_t *info) 1620static void ptrace_stop(int exit_code, int clear_code, siginfo_t *info)
1621 __releases(&current->sighand->siglock)
1622 __acquires(&current->sighand->siglock)
1620{ 1623{
1621 if (arch_ptrace_stop_needed(exit_code, info)) { 1624 if (arch_ptrace_stop_needed(exit_code, info)) {
1622 /* 1625 /*
diff --git a/kernel/smp.c b/kernel/smp.c
index ed6aacfcb7ef..12ed8b013e2d 100644
--- a/kernel/smp.c
+++ b/kernel/smp.c
@@ -267,7 +267,7 @@ static DEFINE_PER_CPU_SHARED_ALIGNED(struct call_single_data, csd_data);
267 * 267 *
268 * Returns 0 on success, else a negative status code. 268 * Returns 0 on success, else a negative status code.
269 */ 269 */
270int smp_call_function_single(int cpu, void (*func) (void *info), void *info, 270int smp_call_function_single(int cpu, smp_call_func_t func, void *info,
271 int wait) 271 int wait)
272{ 272{
273 struct call_single_data d = { 273 struct call_single_data d = {
@@ -336,7 +336,7 @@ EXPORT_SYMBOL(smp_call_function_single);
336 * 3) any other online cpu in @mask 336 * 3) any other online cpu in @mask
337 */ 337 */
338int smp_call_function_any(const struct cpumask *mask, 338int smp_call_function_any(const struct cpumask *mask,
339 void (*func)(void *info), void *info, int wait) 339 smp_call_func_t func, void *info, int wait)
340{ 340{
341 unsigned int cpu; 341 unsigned int cpu;
342 const struct cpumask *nodemask; 342 const struct cpumask *nodemask;
@@ -416,7 +416,7 @@ void __smp_call_function_single(int cpu, struct call_single_data *data,
416 * must be disabled when calling this function. 416 * must be disabled when calling this function.
417 */ 417 */
418void smp_call_function_many(const struct cpumask *mask, 418void smp_call_function_many(const struct cpumask *mask,
419 void (*func)(void *), void *info, bool wait) 419 smp_call_func_t func, void *info, bool wait)
420{ 420{
421 struct call_function_data *data; 421 struct call_function_data *data;
422 unsigned long flags; 422 unsigned long flags;
@@ -500,7 +500,7 @@ EXPORT_SYMBOL(smp_call_function_many);
500 * You must not call this function with disabled interrupts or from a 500 * You must not call this function with disabled interrupts or from a
501 * hardware interrupt handler or from a bottom half handler. 501 * hardware interrupt handler or from a bottom half handler.
502 */ 502 */
503int smp_call_function(void (*func)(void *), void *info, int wait) 503int smp_call_function(smp_call_func_t func, void *info, int wait)
504{ 504{
505 preempt_disable(); 505 preempt_disable();
506 smp_call_function_many(cpu_online_mask, func, info, wait); 506 smp_call_function_many(cpu_online_mask, func, info, wait);
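
The smp.c hunks are a pure type cleanup: the repeated void (*func)(void *info) parameter becomes the smp_call_func_t typedef. A minimal sketch of the pattern (call_on_cpu() is a made-up stand-in, not a kernel API):

#include <stdio.h>

/* The typedef simply names the existing cross-call signature. */
typedef void (*smp_call_func_t)(void *info);

static void say_hello(void *info)
{
	printf("hello from %s\n", (const char *)info);
}

/* Stand-in for smp_call_function_single(): run the callback locally. */
static int call_on_cpu(int cpu, smp_call_func_t func, void *info, int wait)
{
	(void)cpu;
	(void)wait;
	func(info);
	return 0;
}

int main(void)
{
	return call_on_cpu(0, say_hello, (void *)"cpu 0", 1);
}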
diff --git a/kernel/softirq.c b/kernel/softirq.c
index f02a9dfa19bc..18f4be0d5fe0 100644
--- a/kernel/softirq.c
+++ b/kernel/softirq.c
@@ -229,18 +229,20 @@ restart:
229 229
230 do { 230 do {
231 if (pending & 1) { 231 if (pending & 1) {
232 unsigned int vec_nr = h - softirq_vec;
232 int prev_count = preempt_count(); 233 int prev_count = preempt_count();
233 kstat_incr_softirqs_this_cpu(h - softirq_vec);
234 234
235 trace_softirq_entry(h, softirq_vec); 235 kstat_incr_softirqs_this_cpu(vec_nr);
236
237 trace_softirq_entry(vec_nr);
236 h->action(h); 238 h->action(h);
237 trace_softirq_exit(h, softirq_vec); 239 trace_softirq_exit(vec_nr);
238 if (unlikely(prev_count != preempt_count())) { 240 if (unlikely(prev_count != preempt_count())) {
239 printk(KERN_ERR "huh, entered softirq %td %s %p" 241 printk(KERN_ERR "huh, entered softirq %u %s %p"
240 "with preempt_count %08x," 242 "with preempt_count %08x,"
241 " exited with %08x?\n", h - softirq_vec, 243 " exited with %08x?\n", vec_nr,
242 softirq_to_name[h - softirq_vec], 244 softirq_to_name[vec_nr], h->action,
243 h->action, prev_count, preempt_count()); 245 prev_count, preempt_count());
244 preempt_count() = prev_count; 246 preempt_count() = prev_count;
245 } 247 }
246 248
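
The softirq.c change hoists the repeated h - softirq_vec subtraction into a single vec_nr and switches the printk format from %td to %u accordingly. The index computation itself is plain pointer arithmetic, shown here in isolation (array size and index are made up):

#include <stdio.h>

struct softirq_action {
	void (*action)(struct softirq_action *);
};

int main(void)
{
	struct softirq_action softirq_vec[10];
	struct softirq_action *h = &softirq_vec[3];

	/* The same element index the patch computes once as vec_nr and
	 * then reuses for the stats, the tracepoints and the printk. */
	unsigned int vec_nr = h - softirq_vec;

	printf("vec_nr = %u\n", vec_nr);
	return 0;
}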
diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c
index 090c28812ce1..2df820b03beb 100644
--- a/kernel/stop_machine.c
+++ b/kernel/stop_machine.c
@@ -262,7 +262,7 @@ repeat:
262 cpu_stop_fn_t fn = work->fn; 262 cpu_stop_fn_t fn = work->fn;
263 void *arg = work->arg; 263 void *arg = work->arg;
264 struct cpu_stop_done *done = work->done; 264 struct cpu_stop_done *done = work->done;
265 char ksym_buf[KSYM_NAME_LEN]; 265 char ksym_buf[KSYM_NAME_LEN] __maybe_unused;
266 266
267 __set_current_state(TASK_RUNNING); 267 __set_current_state(TASK_RUNNING);
268 268
@@ -304,7 +304,7 @@ static int __cpuinit cpu_stop_cpu_callback(struct notifier_block *nfb,
304 p = kthread_create(cpu_stopper_thread, stopper, "migration/%d", 304 p = kthread_create(cpu_stopper_thread, stopper, "migration/%d",
305 cpu); 305 cpu);
306 if (IS_ERR(p)) 306 if (IS_ERR(p))
307 return NOTIFY_BAD; 307 return notifier_from_errno(PTR_ERR(p));
308 get_task_struct(p); 308 get_task_struct(p);
309 kthread_bind(p, cpu); 309 kthread_bind(p, cpu);
310 sched_set_stop_task(cpu, p); 310 sched_set_stop_task(cpu, p);
@@ -372,7 +372,7 @@ static int __init cpu_stop_init(void)
372 /* start one for the boot cpu */ 372 /* start one for the boot cpu */
373 err = cpu_stop_cpu_callback(&cpu_stop_cpu_notifier, CPU_UP_PREPARE, 373 err = cpu_stop_cpu_callback(&cpu_stop_cpu_notifier, CPU_UP_PREPARE,
374 bcpu); 374 bcpu);
375 BUG_ON(err == NOTIFY_BAD); 375 BUG_ON(err != NOTIFY_OK);
376 cpu_stop_cpu_callback(&cpu_stop_cpu_notifier, CPU_ONLINE, bcpu); 376 cpu_stop_cpu_callback(&cpu_stop_cpu_notifier, CPU_ONLINE, bcpu);
377 register_cpu_notifier(&cpu_stop_cpu_notifier); 377 register_cpu_notifier(&cpu_stop_cpu_notifier);
378 378
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 3a45c224770f..5abfa1518554 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -161,8 +161,6 @@ extern int no_unaligned_warning;
161extern int unaligned_dump_stack; 161extern int unaligned_dump_stack;
162#endif 162#endif
163 163
164extern struct ratelimit_state printk_ratelimit_state;
165
166#ifdef CONFIG_PROC_SYSCTL 164#ifdef CONFIG_PROC_SYSCTL
167static int proc_do_cad_pid(struct ctl_table *table, int write, 165static int proc_do_cad_pid(struct ctl_table *table, int write,
168 void __user *buffer, size_t *lenp, loff_t *ppos); 166 void __user *buffer, size_t *lenp, loff_t *ppos);
@@ -704,6 +702,15 @@ static struct ctl_table kern_table[] = {
704 .extra1 = &zero, 702 .extra1 = &zero,
705 .extra2 = &ten_thousand, 703 .extra2 = &ten_thousand,
706 }, 704 },
705 {
706 .procname = "dmesg_restrict",
707 .data = &dmesg_restrict,
708 .maxlen = sizeof(int),
709 .mode = 0644,
710 .proc_handler = proc_dointvec_minmax,
711 .extra1 = &zero,
712 .extra2 = &one,
713 },
707#endif 714#endif
708 { 715 {
709 .procname = "ngroups_max", 716 .procname = "ngroups_max",
@@ -1340,28 +1347,28 @@ static struct ctl_table fs_table[] = {
1340 .data = &inodes_stat, 1347 .data = &inodes_stat,
1341 .maxlen = 2*sizeof(int), 1348 .maxlen = 2*sizeof(int),
1342 .mode = 0444, 1349 .mode = 0444,
1343 .proc_handler = proc_dointvec, 1350 .proc_handler = proc_nr_inodes,
1344 }, 1351 },
1345 { 1352 {
1346 .procname = "inode-state", 1353 .procname = "inode-state",
1347 .data = &inodes_stat, 1354 .data = &inodes_stat,
1348 .maxlen = 7*sizeof(int), 1355 .maxlen = 7*sizeof(int),
1349 .mode = 0444, 1356 .mode = 0444,
1350 .proc_handler = proc_dointvec, 1357 .proc_handler = proc_nr_inodes,
1351 }, 1358 },
1352 { 1359 {
1353 .procname = "file-nr", 1360 .procname = "file-nr",
1354 .data = &files_stat, 1361 .data = &files_stat,
1355 .maxlen = 3*sizeof(int), 1362 .maxlen = sizeof(files_stat),
1356 .mode = 0444, 1363 .mode = 0444,
1357 .proc_handler = proc_nr_files, 1364 .proc_handler = proc_nr_files,
1358 }, 1365 },
1359 { 1366 {
1360 .procname = "file-max", 1367 .procname = "file-max",
1361 .data = &files_stat.max_files, 1368 .data = &files_stat.max_files,
1362 .maxlen = sizeof(int), 1369 .maxlen = sizeof(files_stat.max_files),
1363 .mode = 0644, 1370 .mode = 0644,
1364 .proc_handler = proc_dointvec, 1371 .proc_handler = proc_doulongvec_minmax,
1365 }, 1372 },
1366 { 1373 {
1367 .procname = "nr_open", 1374 .procname = "nr_open",
@@ -1377,7 +1384,7 @@ static struct ctl_table fs_table[] = {
1377 .data = &dentry_stat, 1384 .data = &dentry_stat,
1378 .maxlen = 6*sizeof(int), 1385 .maxlen = 6*sizeof(int),
1379 .mode = 0444, 1386 .mode = 0444,
1380 .proc_handler = proc_dointvec, 1387 .proc_handler = proc_nr_dentry,
1381 }, 1388 },
1382 { 1389 {
1383 .procname = "overflowuid", 1390 .procname = "overflowuid",
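
The new dmesg_restrict entry in kern_table shows up as /proc/sys/kernel/dmesg_restrict, clamped to 0 or 1 by proc_dointvec_minmax. A small user-space check of the current setting (read-only here; changing it needs the appropriate privileges):

#include <stdio.h>

int main(void)
{
	FILE *f = fopen("/proc/sys/kernel/dmesg_restrict", "r");
	int val;

	if (!f) {
		perror("dmesg_restrict");	/* kernels without this sysctl */
		return 1;
	}
	if (fscanf(f, "%d", &val) == 1)
		printf("dmesg_restrict = %d\n", val);
	fclose(f);
	return 0;
}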
diff --git a/kernel/taskstats.c b/kernel/taskstats.c
index 11281d5792bd..c8231fb15708 100644
--- a/kernel/taskstats.c
+++ b/kernel/taskstats.c
@@ -175,22 +175,8 @@ static void send_cpu_listeners(struct sk_buff *skb,
175 up_write(&listeners->sem); 175 up_write(&listeners->sem);
176} 176}
177 177
178static int fill_pid(pid_t pid, struct task_struct *tsk, 178static void fill_stats(struct task_struct *tsk, struct taskstats *stats)
179 struct taskstats *stats)
180{ 179{
181 int rc = 0;
182
183 if (!tsk) {
184 rcu_read_lock();
185 tsk = find_task_by_vpid(pid);
186 if (tsk)
187 get_task_struct(tsk);
188 rcu_read_unlock();
189 if (!tsk)
190 return -ESRCH;
191 } else
192 get_task_struct(tsk);
193
194 memset(stats, 0, sizeof(*stats)); 180 memset(stats, 0, sizeof(*stats));
195 /* 181 /*
196 * Each accounting subsystem adds calls to its functions to 182 * Each accounting subsystem adds calls to its functions to
@@ -209,17 +195,27 @@ static int fill_pid(pid_t pid, struct task_struct *tsk,
209 195
210 /* fill in extended acct fields */ 196 /* fill in extended acct fields */
211 xacct_add_tsk(stats, tsk); 197 xacct_add_tsk(stats, tsk);
198}
212 199
213 /* Define err: label here if needed */ 200static int fill_stats_for_pid(pid_t pid, struct taskstats *stats)
214 put_task_struct(tsk); 201{
215 return rc; 202 struct task_struct *tsk;
216 203
204 rcu_read_lock();
205 tsk = find_task_by_vpid(pid);
206 if (tsk)
207 get_task_struct(tsk);
208 rcu_read_unlock();
209 if (!tsk)
210 return -ESRCH;
211 fill_stats(tsk, stats);
212 put_task_struct(tsk);
213 return 0;
217} 214}
218 215
219static int fill_tgid(pid_t tgid, struct task_struct *first, 216static int fill_stats_for_tgid(pid_t tgid, struct taskstats *stats)
220 struct taskstats *stats)
221{ 217{
222 struct task_struct *tsk; 218 struct task_struct *tsk, *first;
223 unsigned long flags; 219 unsigned long flags;
224 int rc = -ESRCH; 220 int rc = -ESRCH;
225 221
@@ -228,8 +224,7 @@ static int fill_tgid(pid_t tgid, struct task_struct *first,
228 * leaders who are already counted with the dead tasks 224 * leaders who are already counted with the dead tasks
229 */ 225 */
230 rcu_read_lock(); 226 rcu_read_lock();
231 if (!first) 227 first = find_task_by_vpid(tgid);
232 first = find_task_by_vpid(tgid);
233 228
234 if (!first || !lock_task_sighand(first, &flags)) 229 if (!first || !lock_task_sighand(first, &flags))
235 goto out; 230 goto out;
@@ -268,7 +263,6 @@ out:
268 return rc; 263 return rc;
269} 264}
270 265
271
272static void fill_tgid_exit(struct task_struct *tsk) 266static void fill_tgid_exit(struct task_struct *tsk)
273{ 267{
274 unsigned long flags; 268 unsigned long flags;
@@ -360,6 +354,12 @@ static struct taskstats *mk_reply(struct sk_buff *skb, int type, u32 pid)
360 struct nlattr *na, *ret; 354 struct nlattr *na, *ret;
361 int aggr; 355 int aggr;
362 356
357 /* If we don't pad, we end up with alignment on a 4 byte boundary.
358 * This causes lots of runtime warnings on systems requiring 8 byte
359 * alignment */
360 u32 pids[2] = { pid, 0 };
361 int pid_size = ALIGN(sizeof(pid), sizeof(long));
362
363 aggr = (type == TASKSTATS_TYPE_PID) 363 aggr = (type == TASKSTATS_TYPE_PID)
364 ? TASKSTATS_TYPE_AGGR_PID 364 ? TASKSTATS_TYPE_AGGR_PID
365 : TASKSTATS_TYPE_AGGR_TGID; 365 : TASKSTATS_TYPE_AGGR_TGID;
@@ -367,7 +367,7 @@ static struct taskstats *mk_reply(struct sk_buff *skb, int type, u32 pid)
367 na = nla_nest_start(skb, aggr); 367 na = nla_nest_start(skb, aggr);
368 if (!na) 368 if (!na)
369 goto err; 369 goto err;
370 if (nla_put(skb, type, sizeof(pid), &pid) < 0) 370 if (nla_put(skb, type, pid_size, pids) < 0)
371 goto err; 371 goto err;
372 ret = nla_reserve(skb, TASKSTATS_TYPE_STATS, sizeof(struct taskstats)); 372 ret = nla_reserve(skb, TASKSTATS_TYPE_STATS, sizeof(struct taskstats));
373 if (!ret) 373 if (!ret)
@@ -424,39 +424,46 @@ err:
424 return rc; 424 return rc;
425} 425}
426 426
427static int taskstats_user_cmd(struct sk_buff *skb, struct genl_info *info) 427static int cmd_attr_register_cpumask(struct genl_info *info)
428{ 428{
429 int rc;
430 struct sk_buff *rep_skb;
431 struct taskstats *stats;
432 size_t size;
433 cpumask_var_t mask; 429 cpumask_var_t mask;
430 int rc;
434 431
435 if (!alloc_cpumask_var(&mask, GFP_KERNEL)) 432 if (!alloc_cpumask_var(&mask, GFP_KERNEL))
436 return -ENOMEM; 433 return -ENOMEM;
437
438 rc = parse(info->attrs[TASKSTATS_CMD_ATTR_REGISTER_CPUMASK], mask); 434 rc = parse(info->attrs[TASKSTATS_CMD_ATTR_REGISTER_CPUMASK], mask);
439 if (rc < 0) 435 if (rc < 0)
440 goto free_return_rc; 436 goto out;
441 if (rc == 0) { 437 rc = add_del_listener(info->snd_pid, mask, REGISTER);
442 rc = add_del_listener(info->snd_pid, mask, REGISTER); 438out:
443 goto free_return_rc; 439 free_cpumask_var(mask);
444 } 440 return rc;
441}
442
443static int cmd_attr_deregister_cpumask(struct genl_info *info)
444{
445 cpumask_var_t mask;
446 int rc;
445 447
448 if (!alloc_cpumask_var(&mask, GFP_KERNEL))
449 return -ENOMEM;
446 rc = parse(info->attrs[TASKSTATS_CMD_ATTR_DEREGISTER_CPUMASK], mask); 450 rc = parse(info->attrs[TASKSTATS_CMD_ATTR_DEREGISTER_CPUMASK], mask);
447 if (rc < 0) 451 if (rc < 0)
448 goto free_return_rc; 452 goto out;
449 if (rc == 0) { 453 rc = add_del_listener(info->snd_pid, mask, DEREGISTER);
450 rc = add_del_listener(info->snd_pid, mask, DEREGISTER); 454out:
451free_return_rc:
452 free_cpumask_var(mask);
453 return rc;
454 }
455 free_cpumask_var(mask); 455 free_cpumask_var(mask);
456 return rc;
457}
458
459static int cmd_attr_pid(struct genl_info *info)
460{
461 struct taskstats *stats;
462 struct sk_buff *rep_skb;
463 size_t size;
464 u32 pid;
465 int rc;
456 466
457 /*
458 * Size includes space for nested attributes
459 */
460 size = nla_total_size(sizeof(u32)) + 467 size = nla_total_size(sizeof(u32)) +
461 nla_total_size(sizeof(struct taskstats)) + nla_total_size(0); 468 nla_total_size(sizeof(struct taskstats)) + nla_total_size(0);
462 469
@@ -465,33 +472,64 @@ free_return_rc:
465 return rc; 472 return rc;
466 473
467 rc = -EINVAL; 474 rc = -EINVAL;
468 if (info->attrs[TASKSTATS_CMD_ATTR_PID]) { 475 pid = nla_get_u32(info->attrs[TASKSTATS_CMD_ATTR_PID]);
469 u32 pid = nla_get_u32(info->attrs[TASKSTATS_CMD_ATTR_PID]); 476 stats = mk_reply(rep_skb, TASKSTATS_TYPE_PID, pid);
470 stats = mk_reply(rep_skb, TASKSTATS_TYPE_PID, pid); 477 if (!stats)
471 if (!stats) 478 goto err;
472 goto err; 479
473 480 rc = fill_stats_for_pid(pid, stats);
474 rc = fill_pid(pid, NULL, stats); 481 if (rc < 0)
475 if (rc < 0) 482 goto err;
476 goto err; 483 return send_reply(rep_skb, info);
477 } else if (info->attrs[TASKSTATS_CMD_ATTR_TGID]) { 484err:
478 u32 tgid = nla_get_u32(info->attrs[TASKSTATS_CMD_ATTR_TGID]); 485 nlmsg_free(rep_skb);
479 stats = mk_reply(rep_skb, TASKSTATS_TYPE_TGID, tgid); 486 return rc;
480 if (!stats) 487}
481 goto err; 488
482 489static int cmd_attr_tgid(struct genl_info *info)
483 rc = fill_tgid(tgid, NULL, stats); 490{
484 if (rc < 0) 491 struct taskstats *stats;
485 goto err; 492 struct sk_buff *rep_skb;
486 } else 493 size_t size;
494 u32 tgid;
495 int rc;
496
497 size = nla_total_size(sizeof(u32)) +
498 nla_total_size(sizeof(struct taskstats)) + nla_total_size(0);
499
500 rc = prepare_reply(info, TASKSTATS_CMD_NEW, &rep_skb, size);
501 if (rc < 0)
502 return rc;
503
504 rc = -EINVAL;
505 tgid = nla_get_u32(info->attrs[TASKSTATS_CMD_ATTR_TGID]);
506 stats = mk_reply(rep_skb, TASKSTATS_TYPE_TGID, tgid);
507 if (!stats)
487 goto err; 508 goto err;
488 509
510 rc = fill_stats_for_tgid(tgid, stats);
511 if (rc < 0)
512 goto err;
489 return send_reply(rep_skb, info); 513 return send_reply(rep_skb, info);
490err: 514err:
491 nlmsg_free(rep_skb); 515 nlmsg_free(rep_skb);
492 return rc; 516 return rc;
493} 517}
494 518
519static int taskstats_user_cmd(struct sk_buff *skb, struct genl_info *info)
520{
521 if (info->attrs[TASKSTATS_CMD_ATTR_REGISTER_CPUMASK])
522 return cmd_attr_register_cpumask(info);
523 else if (info->attrs[TASKSTATS_CMD_ATTR_DEREGISTER_CPUMASK])
524 return cmd_attr_deregister_cpumask(info);
525 else if (info->attrs[TASKSTATS_CMD_ATTR_PID])
526 return cmd_attr_pid(info);
527 else if (info->attrs[TASKSTATS_CMD_ATTR_TGID])
528 return cmd_attr_tgid(info);
529 else
530 return -EINVAL;
531}
532
495static struct taskstats *taskstats_tgid_alloc(struct task_struct *tsk) 533static struct taskstats *taskstats_tgid_alloc(struct task_struct *tsk)
496{ 534{
497 struct signal_struct *sig = tsk->signal; 535 struct signal_struct *sig = tsk->signal;
@@ -555,9 +593,7 @@ void taskstats_exit(struct task_struct *tsk, int group_dead)
555 if (!stats) 593 if (!stats)
556 goto err; 594 goto err;
557 595
558 rc = fill_pid(-1, tsk, stats); 596 fill_stats(tsk, stats);
559 if (rc < 0)
560 goto err;
561 597
562 /* 598 /*
563 * Doesn't matter if tsk is the leader or the last group member leaving 599 * Doesn't matter if tsk is the leader or the last group member leaving
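
The mk_reply() padding change exists because the u32 pid/tgid attribute is followed by the large taskstats struct, which wants natural alignment; padding the pid payload to sizeof(long) keeps that struct 8-byte aligned on 64-bit builds. A quick check of the ALIGN() arithmetic (macro reproduced here for illustration; it assumes a power-of-two alignment):

#include <stdio.h>

#define ALIGN(x, a)	(((x) + (a) - 1) & ~((a) - 1))

int main(void)
{
	/* On an LP64 build the 4-byte pid attribute is padded to 8 bytes;
	 * on a 32-bit build it stays 4, so nothing changes there. */
	size_t pid_size = ALIGN(sizeof(unsigned int), sizeof(long));

	printf("pid attribute payload: %zu bytes\n", pid_size);
	return 0;
}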
diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig
index e04b8bcdef88..ea37e2ff4164 100644
--- a/kernel/trace/Kconfig
+++ b/kernel/trace/Kconfig
@@ -126,7 +126,7 @@ if FTRACE
126config FUNCTION_TRACER 126config FUNCTION_TRACER
127 bool "Kernel Function Tracer" 127 bool "Kernel Function Tracer"
128 depends on HAVE_FUNCTION_TRACER 128 depends on HAVE_FUNCTION_TRACER
129 select FRAME_POINTER if (!ARM_UNWIND) 129 select FRAME_POINTER if !ARM_UNWIND && !S390
130 select KALLSYMS 130 select KALLSYMS
131 select GENERIC_TRACER 131 select GENERIC_TRACER
132 select CONTEXT_SWITCH_TRACER 132 select CONTEXT_SWITCH_TRACER
diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c
index bc251ed66724..7b8ec0281548 100644
--- a/kernel/trace/blktrace.c
+++ b/kernel/trace/blktrace.c
@@ -168,7 +168,6 @@ static int act_log_check(struct blk_trace *bt, u32 what, sector_t sector,
168static const u32 ddir_act[2] = { BLK_TC_ACT(BLK_TC_READ), 168static const u32 ddir_act[2] = { BLK_TC_ACT(BLK_TC_READ),
169 BLK_TC_ACT(BLK_TC_WRITE) }; 169 BLK_TC_ACT(BLK_TC_WRITE) };
170 170
171#define BLK_TC_HARDBARRIER BLK_TC_BARRIER
172#define BLK_TC_RAHEAD BLK_TC_AHEAD 171#define BLK_TC_RAHEAD BLK_TC_AHEAD
173 172
174/* The ilog2() calls fall out because they're constant */ 173/* The ilog2() calls fall out because they're constant */
@@ -196,7 +195,6 @@ static void __blk_add_trace(struct blk_trace *bt, sector_t sector, int bytes,
196 return; 195 return;
197 196
198 what |= ddir_act[rw & WRITE]; 197 what |= ddir_act[rw & WRITE];
199 what |= MASK_TC_BIT(rw, HARDBARRIER);
200 what |= MASK_TC_BIT(rw, SYNC); 198 what |= MASK_TC_BIT(rw, SYNC);
201 what |= MASK_TC_BIT(rw, RAHEAD); 199 what |= MASK_TC_BIT(rw, RAHEAD);
202 what |= MASK_TC_BIT(rw, META); 200 what |= MASK_TC_BIT(rw, META);
@@ -1807,8 +1805,6 @@ void blk_fill_rwbs(char *rwbs, u32 rw, int bytes)
1807 1805
1808 if (rw & REQ_RAHEAD) 1806 if (rw & REQ_RAHEAD)
1809 rwbs[i++] = 'A'; 1807 rwbs[i++] = 'A';
1810 if (rw & REQ_HARDBARRIER)
1811 rwbs[i++] = 'B';
1812 if (rw & REQ_SYNC) 1808 if (rw & REQ_SYNC)
1813 rwbs[i++] = 'S'; 1809 rwbs[i++] = 'S';
1814 if (rw & REQ_META) 1810 if (rw & REQ_META)
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index c3dab054d18e..9ed509a015d8 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -224,6 +224,9 @@ enum {
224 RB_LEN_TIME_STAMP = 16, 224 RB_LEN_TIME_STAMP = 16,
225}; 225};
226 226
227#define skip_time_extend(event) \
228 ((struct ring_buffer_event *)((char *)event + RB_LEN_TIME_EXTEND))
229
227static inline int rb_null_event(struct ring_buffer_event *event) 230static inline int rb_null_event(struct ring_buffer_event *event)
228{ 231{
229 return event->type_len == RINGBUF_TYPE_PADDING && !event->time_delta; 232 return event->type_len == RINGBUF_TYPE_PADDING && !event->time_delta;
@@ -248,8 +251,12 @@ rb_event_data_length(struct ring_buffer_event *event)
248 return length + RB_EVNT_HDR_SIZE; 251 return length + RB_EVNT_HDR_SIZE;
249} 252}
250 253
251/* inline for ring buffer fast paths */ 254/*
252static unsigned 255 * Return the length of the given event. Will return
256 * the length of the time extend if the event is a
257 * time extend.
258 */
259static inline unsigned
253rb_event_length(struct ring_buffer_event *event) 260rb_event_length(struct ring_buffer_event *event)
254{ 261{
255 switch (event->type_len) { 262 switch (event->type_len) {
@@ -274,13 +281,41 @@ rb_event_length(struct ring_buffer_event *event)
274 return 0; 281 return 0;
275} 282}
276 283
284/*
285 * Return total length of time extend and data,
286 * or just the event length for all other events.
287 */
288static inline unsigned
289rb_event_ts_length(struct ring_buffer_event *event)
290{
291 unsigned len = 0;
292
293 if (event->type_len == RINGBUF_TYPE_TIME_EXTEND) {
294 /* time extends include the data event after it */
295 len = RB_LEN_TIME_EXTEND;
296 event = skip_time_extend(event);
297 }
298 return len + rb_event_length(event);
299}
300
277/** 301/**
278 * ring_buffer_event_length - return the length of the event 302 * ring_buffer_event_length - return the length of the event
279 * @event: the event to get the length of 303 * @event: the event to get the length of
304 *
305 * Returns the size of the data load of a data event.
306 * If the event is something other than a data event, it
307 * returns the size of the event itself. With the exception
308 * of a TIME EXTEND, where it still returns the size of the
309 * data load of the data event after it.
280 */ 310 */
281unsigned ring_buffer_event_length(struct ring_buffer_event *event) 311unsigned ring_buffer_event_length(struct ring_buffer_event *event)
282{ 312{
283 unsigned length = rb_event_length(event); 313 unsigned length;
314
315 if (event->type_len == RINGBUF_TYPE_TIME_EXTEND)
316 event = skip_time_extend(event);
317
318 length = rb_event_length(event);
284 if (event->type_len > RINGBUF_TYPE_DATA_TYPE_LEN_MAX) 319 if (event->type_len > RINGBUF_TYPE_DATA_TYPE_LEN_MAX)
285 return length; 320 return length;
286 length -= RB_EVNT_HDR_SIZE; 321 length -= RB_EVNT_HDR_SIZE;
@@ -294,6 +329,8 @@ EXPORT_SYMBOL_GPL(ring_buffer_event_length);
294static void * 329static void *
295rb_event_data(struct ring_buffer_event *event) 330rb_event_data(struct ring_buffer_event *event)
296{ 331{
332 if (event->type_len == RINGBUF_TYPE_TIME_EXTEND)
333 event = skip_time_extend(event);
297 BUG_ON(event->type_len > RINGBUF_TYPE_DATA_TYPE_LEN_MAX); 334 BUG_ON(event->type_len > RINGBUF_TYPE_DATA_TYPE_LEN_MAX);
298 /* If length is in len field, then array[0] has the data */ 335 /* If length is in len field, then array[0] has the data */
299 if (event->type_len) 336 if (event->type_len)
@@ -404,9 +441,6 @@ static inline int test_time_stamp(u64 delta)
404/* Max payload is BUF_PAGE_SIZE - header (8bytes) */ 441/* Max payload is BUF_PAGE_SIZE - header (8bytes) */
405#define BUF_MAX_DATA_SIZE (BUF_PAGE_SIZE - (sizeof(u32) * 2)) 442#define BUF_MAX_DATA_SIZE (BUF_PAGE_SIZE - (sizeof(u32) * 2))
406 443
407/* Max number of timestamps that can fit on a page */
408#define RB_TIMESTAMPS_PER_PAGE (BUF_PAGE_SIZE / RB_LEN_TIME_EXTEND)
409
410int ring_buffer_print_page_header(struct trace_seq *s) 444int ring_buffer_print_page_header(struct trace_seq *s)
411{ 445{
412 struct buffer_data_page field; 446 struct buffer_data_page field;
@@ -1546,6 +1580,25 @@ static void rb_inc_iter(struct ring_buffer_iter *iter)
1546 iter->head = 0; 1580 iter->head = 0;
1547} 1581}
1548 1582
1583/* Slow path, do not inline */
1584static noinline struct ring_buffer_event *
1585rb_add_time_stamp(struct ring_buffer_event *event, u64 delta)
1586{
1587 event->type_len = RINGBUF_TYPE_TIME_EXTEND;
1588
1589 /* Not the first event on the page? */
1590 if (rb_event_index(event)) {
1591 event->time_delta = delta & TS_MASK;
1592 event->array[0] = delta >> TS_SHIFT;
1593 } else {
1594 /* nope, just zero it */
1595 event->time_delta = 0;
1596 event->array[0] = 0;
1597 }
1598
1599 return skip_time_extend(event);
1600}
1601
1549/** 1602/**
1550 * ring_buffer_update_event - update event type and data 1603 * ring_buffer_update_event - update event type and data
1551 * @event: the even to update 1604 * @event: the even to update
@@ -1558,28 +1611,31 @@ static void rb_inc_iter(struct ring_buffer_iter *iter)
1558 * data field. 1611 * data field.
1559 */ 1612 */
1560static void 1613static void
1561rb_update_event(struct ring_buffer_event *event, 1614rb_update_event(struct ring_buffer_per_cpu *cpu_buffer,
1562 unsigned type, unsigned length) 1615 struct ring_buffer_event *event, unsigned length,
1616 int add_timestamp, u64 delta)
1563{ 1617{
1564 event->type_len = type; 1618 /* Only a commit updates the timestamp */
1565 1619 if (unlikely(!rb_event_is_commit(cpu_buffer, event)))
1566 switch (type) { 1620 delta = 0;
1567
1568 case RINGBUF_TYPE_PADDING:
1569 case RINGBUF_TYPE_TIME_EXTEND:
1570 case RINGBUF_TYPE_TIME_STAMP:
1571 break;
1572 1621
1573 case 0: 1622 /*
1574 length -= RB_EVNT_HDR_SIZE; 1623 * If we need to add a timestamp, then we
1575 if (length > RB_MAX_SMALL_DATA || RB_FORCE_8BYTE_ALIGNMENT) 1624 * add it to the start of the resevered space.
1576 event->array[0] = length; 1625 */
1577 else 1626 if (unlikely(add_timestamp)) {
1578 event->type_len = DIV_ROUND_UP(length, RB_ALIGNMENT); 1627 event = rb_add_time_stamp(event, delta);
1579 break; 1628 length -= RB_LEN_TIME_EXTEND;
1580 default: 1629 delta = 0;
1581 BUG();
1582 } 1630 }
1631
1632 event->time_delta = delta;
1633 length -= RB_EVNT_HDR_SIZE;
1634 if (length > RB_MAX_SMALL_DATA || RB_FORCE_8BYTE_ALIGNMENT) {
1635 event->type_len = 0;
1636 event->array[0] = length;
1637 } else
1638 event->type_len = DIV_ROUND_UP(length, RB_ALIGNMENT);
1583} 1639}
1584 1640
1585/* 1641/*
@@ -1823,10 +1879,13 @@ rb_reset_tail(struct ring_buffer_per_cpu *cpu_buffer,
1823 local_sub(length, &tail_page->write); 1879 local_sub(length, &tail_page->write);
1824} 1880}
1825 1881
1826static struct ring_buffer_event * 1882/*
1883 * This is the slow path, force gcc not to inline it.
1884 */
1885static noinline struct ring_buffer_event *
1827rb_move_tail(struct ring_buffer_per_cpu *cpu_buffer, 1886rb_move_tail(struct ring_buffer_per_cpu *cpu_buffer,
1828 unsigned long length, unsigned long tail, 1887 unsigned long length, unsigned long tail,
1829 struct buffer_page *tail_page, u64 *ts) 1888 struct buffer_page *tail_page, u64 ts)
1830{ 1889{
1831 struct buffer_page *commit_page = cpu_buffer->commit_page; 1890 struct buffer_page *commit_page = cpu_buffer->commit_page;
1832 struct ring_buffer *buffer = cpu_buffer->buffer; 1891 struct ring_buffer *buffer = cpu_buffer->buffer;
@@ -1909,8 +1968,8 @@ rb_move_tail(struct ring_buffer_per_cpu *cpu_buffer,
1909 * Nested commits always have zero deltas, so 1968 * Nested commits always have zero deltas, so
1910 * just reread the time stamp 1969 * just reread the time stamp
1911 */ 1970 */
1912 *ts = rb_time_stamp(buffer); 1971 ts = rb_time_stamp(buffer);
1913 next_page->page->time_stamp = *ts; 1972 next_page->page->time_stamp = ts;
1914 } 1973 }
1915 1974
1916 out_again: 1975 out_again:
@@ -1929,12 +1988,21 @@ rb_move_tail(struct ring_buffer_per_cpu *cpu_buffer,
1929 1988
1930static struct ring_buffer_event * 1989static struct ring_buffer_event *
1931__rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer, 1990__rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer,
1932 unsigned type, unsigned long length, u64 *ts) 1991 unsigned long length, u64 ts,
1992 u64 delta, int add_timestamp)
1933{ 1993{
1934 struct buffer_page *tail_page; 1994 struct buffer_page *tail_page;
1935 struct ring_buffer_event *event; 1995 struct ring_buffer_event *event;
1936 unsigned long tail, write; 1996 unsigned long tail, write;
1937 1997
1998 /*
1999 * If the time delta since the last event is too big to
2000 * hold in the time field of the event, then we append a
2001 * TIME EXTEND event ahead of the data event.
2002 */
2003 if (unlikely(add_timestamp))
2004 length += RB_LEN_TIME_EXTEND;
2005
1938 tail_page = cpu_buffer->tail_page; 2006 tail_page = cpu_buffer->tail_page;
1939 write = local_add_return(length, &tail_page->write); 2007 write = local_add_return(length, &tail_page->write);
1940 2008
@@ -1943,7 +2011,7 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer,
1943 tail = write - length; 2011 tail = write - length;
1944 2012
1945 /* See if we shot pass the end of this buffer page */ 2013 /* See if we shot pass the end of this buffer page */
1946 if (write > BUF_PAGE_SIZE) 2014 if (unlikely(write > BUF_PAGE_SIZE))
1947 return rb_move_tail(cpu_buffer, length, tail, 2015 return rb_move_tail(cpu_buffer, length, tail,
1948 tail_page, ts); 2016 tail_page, ts);
1949 2017
@@ -1951,18 +2019,16 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer,
1951 2019
1952 event = __rb_page_index(tail_page, tail); 2020 event = __rb_page_index(tail_page, tail);
1953 kmemcheck_annotate_bitfield(event, bitfield); 2021 kmemcheck_annotate_bitfield(event, bitfield);
1954 rb_update_event(event, type, length); 2022 rb_update_event(cpu_buffer, event, length, add_timestamp, delta);
1955 2023
1956 /* The passed in type is zero for DATA */ 2024 local_inc(&tail_page->entries);
1957 if (likely(!type))
1958 local_inc(&tail_page->entries);
1959 2025
1960 /* 2026 /*
1961 * If this is the first commit on the page, then update 2027 * If this is the first commit on the page, then update
1962 * its timestamp. 2028 * its timestamp.
1963 */ 2029 */
1964 if (!tail) 2030 if (!tail)
1965 tail_page->page->time_stamp = *ts; 2031 tail_page->page->time_stamp = ts;
1966 2032
1967 return event; 2033 return event;
1968} 2034}
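With this rewrite __rb_reserve_next() no longer reserves a stand-alone timestamp event; when add_timestamp is set it simply grows the reservation by RB_LEN_TIME_EXTEND so the time-extend header and its data event always land next to each other. A hedged sketch of the extend encoding itself, restating what the removed rb_add_time_stamp() wrote and what the new read side further down reassembles; encode_time_extend() is an illustrative name, not a function in this patch:

static void encode_time_extend(struct ring_buffer_event *event, u64 delta)
{
	event->type_len = RINGBUF_TYPE_TIME_EXTEND;
	event->time_delta = delta & TS_MASK;	/* low TS_SHIFT (27) bits */
	event->array[0] = delta >> TS_SHIFT;	/* remaining high bits */
}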
@@ -1977,7 +2043,7 @@ rb_try_to_discard(struct ring_buffer_per_cpu *cpu_buffer,
1977 unsigned long addr; 2043 unsigned long addr;
1978 2044
1979 new_index = rb_event_index(event); 2045 new_index = rb_event_index(event);
1980 old_index = new_index + rb_event_length(event); 2046 old_index = new_index + rb_event_ts_length(event);
1981 addr = (unsigned long)event; 2047 addr = (unsigned long)event;
1982 addr &= PAGE_MASK; 2048 addr &= PAGE_MASK;
1983 2049
@@ -2003,76 +2069,13 @@ rb_try_to_discard(struct ring_buffer_per_cpu *cpu_buffer,
2003 return 0; 2069 return 0;
2004} 2070}
2005 2071
2006static int
2007rb_add_time_stamp(struct ring_buffer_per_cpu *cpu_buffer,
2008 u64 *ts, u64 *delta)
2009{
2010 struct ring_buffer_event *event;
2011 int ret;
2012
2013 WARN_ONCE(*delta > (1ULL << 59),
2014 KERN_WARNING "Delta way too big! %llu ts=%llu write stamp = %llu\n",
2015 (unsigned long long)*delta,
2016 (unsigned long long)*ts,
2017 (unsigned long long)cpu_buffer->write_stamp);
2018
2019 /*
2020 * The delta is too big, we to add a
2021 * new timestamp.
2022 */
2023 event = __rb_reserve_next(cpu_buffer,
2024 RINGBUF_TYPE_TIME_EXTEND,
2025 RB_LEN_TIME_EXTEND,
2026 ts);
2027 if (!event)
2028 return -EBUSY;
2029
2030 if (PTR_ERR(event) == -EAGAIN)
2031 return -EAGAIN;
2032
2033 /* Only a commited time event can update the write stamp */
2034 if (rb_event_is_commit(cpu_buffer, event)) {
2035 /*
2036 * If this is the first on the page, then it was
2037 * updated with the page itself. Try to discard it
2038 * and if we can't just make it zero.
2039 */
2040 if (rb_event_index(event)) {
2041 event->time_delta = *delta & TS_MASK;
2042 event->array[0] = *delta >> TS_SHIFT;
2043 } else {
2044 /* try to discard, since we do not need this */
2045 if (!rb_try_to_discard(cpu_buffer, event)) {
2046 /* nope, just zero it */
2047 event->time_delta = 0;
2048 event->array[0] = 0;
2049 }
2050 }
2051 cpu_buffer->write_stamp = *ts;
2052 /* let the caller know this was the commit */
2053 ret = 1;
2054 } else {
2055 /* Try to discard the event */
2056 if (!rb_try_to_discard(cpu_buffer, event)) {
2057 /* Darn, this is just wasted space */
2058 event->time_delta = 0;
2059 event->array[0] = 0;
2060 }
2061 ret = 0;
2062 }
2063
2064 *delta = 0;
2065
2066 return ret;
2067}
2068
2069static void rb_start_commit(struct ring_buffer_per_cpu *cpu_buffer) 2072static void rb_start_commit(struct ring_buffer_per_cpu *cpu_buffer)
2070{ 2073{
2071 local_inc(&cpu_buffer->committing); 2074 local_inc(&cpu_buffer->committing);
2072 local_inc(&cpu_buffer->commits); 2075 local_inc(&cpu_buffer->commits);
2073} 2076}
2074 2077
2075static void rb_end_commit(struct ring_buffer_per_cpu *cpu_buffer) 2078static inline void rb_end_commit(struct ring_buffer_per_cpu *cpu_buffer)
2076{ 2079{
2077 unsigned long commits; 2080 unsigned long commits;
2078 2081
@@ -2110,9 +2113,10 @@ rb_reserve_next_event(struct ring_buffer *buffer,
2110 unsigned long length) 2113 unsigned long length)
2111{ 2114{
2112 struct ring_buffer_event *event; 2115 struct ring_buffer_event *event;
2113 u64 ts, delta = 0; 2116 u64 ts, delta;
2114 int commit = 0;
2115 int nr_loops = 0; 2117 int nr_loops = 0;
2118 int add_timestamp;
2119 u64 diff;
2116 2120
2117 rb_start_commit(cpu_buffer); 2121 rb_start_commit(cpu_buffer);
2118 2122
@@ -2133,6 +2137,9 @@ rb_reserve_next_event(struct ring_buffer *buffer,
2133 2137
2134 length = rb_calculate_event_length(length); 2138 length = rb_calculate_event_length(length);
2135 again: 2139 again:
2140 add_timestamp = 0;
2141 delta = 0;
2142
2136 /* 2143 /*
2137 * We allow for interrupts to reenter here and do a trace. 2144 * We allow for interrupts to reenter here and do a trace.
2138 * If one does, it will cause this original code to loop 2145 * If one does, it will cause this original code to loop
@@ -2146,56 +2153,32 @@ rb_reserve_next_event(struct ring_buffer *buffer,
2146 goto out_fail; 2153 goto out_fail;
2147 2154
2148 ts = rb_time_stamp(cpu_buffer->buffer); 2155 ts = rb_time_stamp(cpu_buffer->buffer);
2156 diff = ts - cpu_buffer->write_stamp;
2149 2157
2150 /* 2158 /* make sure this diff is calculated here */
2151 * Only the first commit can update the timestamp. 2159 barrier();
2152 * Yes there is a race here. If an interrupt comes in
2153 * just after the conditional and it traces too, then it
2154 * will also check the deltas. More than one timestamp may
2155 * also be made. But only the entry that did the actual
2156 * commit will be something other than zero.
2157 */
2158 if (likely(cpu_buffer->tail_page == cpu_buffer->commit_page &&
2159 rb_page_write(cpu_buffer->tail_page) ==
2160 rb_commit_index(cpu_buffer))) {
2161 u64 diff;
2162
2163 diff = ts - cpu_buffer->write_stamp;
2164
2165 /* make sure this diff is calculated here */
2166 barrier();
2167
2168 /* Did the write stamp get updated already? */
2169 if (unlikely(ts < cpu_buffer->write_stamp))
2170 goto get_event;
2171 2160
2161 /* Did the write stamp get updated already? */
2162 if (likely(ts >= cpu_buffer->write_stamp)) {
2172 delta = diff; 2163 delta = diff;
2173 if (unlikely(test_time_stamp(delta))) { 2164 if (unlikely(test_time_stamp(delta))) {
2174 2165 WARN_ONCE(delta > (1ULL << 59),
2175 commit = rb_add_time_stamp(cpu_buffer, &ts, &delta); 2166 KERN_WARNING "Delta way too big! %llu ts=%llu write stamp = %llu\n",
2176 if (commit == -EBUSY) 2167 (unsigned long long)delta,
2177 goto out_fail; 2168 (unsigned long long)ts,
2178 2169 (unsigned long long)cpu_buffer->write_stamp);
2179 if (commit == -EAGAIN) 2170 add_timestamp = 1;
2180 goto again;
2181
2182 RB_WARN_ON(cpu_buffer, commit < 0);
2183 } 2171 }
2184 } 2172 }
2185 2173
2186 get_event: 2174 event = __rb_reserve_next(cpu_buffer, length, ts,
2187 event = __rb_reserve_next(cpu_buffer, 0, length, &ts); 2175 delta, add_timestamp);
2188 if (unlikely(PTR_ERR(event) == -EAGAIN)) 2176 if (unlikely(PTR_ERR(event) == -EAGAIN))
2189 goto again; 2177 goto again;
2190 2178
2191 if (!event) 2179 if (!event)
2192 goto out_fail; 2180 goto out_fail;
2193 2181
2194 if (!rb_event_is_commit(cpu_buffer, event))
2195 delta = 0;
2196
2197 event->time_delta = delta;
2198
2199 return event; 2182 return event;
2200 2183
2201 out_fail: 2184 out_fail:
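The simplified rb_reserve_next_event() now always computes diff = ts - write_stamp and, when that delta cannot fit in an event's 27-bit time_delta field, just sets add_timestamp rather than reserving a separate timestamp event. Restating the overflow check with the constants this file uses (a sketch, not new code in the patch):

#define TS_SHIFT	27
#define TS_MASK		((1ULL << TS_SHIFT) - 1)
#define TS_DELTA_TEST	(~TS_MASK)

/* A delta of 2^27 ns (~134 ms since the last event on this CPU)
 * or more no longer fits in time_delta and forces a time extend. */
static inline int test_time_stamp(u64 delta)
{
	return (delta & TS_DELTA_TEST) != 0;
}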
@@ -2207,13 +2190,9 @@ rb_reserve_next_event(struct ring_buffer *buffer,
2207 2190
2208#define TRACE_RECURSIVE_DEPTH 16 2191#define TRACE_RECURSIVE_DEPTH 16
2209 2192
2210static int trace_recursive_lock(void) 2193/* Keep this code out of the fast path cache */
2194static noinline void trace_recursive_fail(void)
2211{ 2195{
2212 current->trace_recursion++;
2213
2214 if (likely(current->trace_recursion < TRACE_RECURSIVE_DEPTH))
2215 return 0;
2216
2217 /* Disable all tracing before we do anything else */ 2196 /* Disable all tracing before we do anything else */
2218 tracing_off_permanent(); 2197 tracing_off_permanent();
2219 2198
@@ -2225,10 +2204,21 @@ static int trace_recursive_lock(void)
2225 in_nmi()); 2204 in_nmi());
2226 2205
2227 WARN_ON_ONCE(1); 2206 WARN_ON_ONCE(1);
2207}
2208
2209static inline int trace_recursive_lock(void)
2210{
2211 current->trace_recursion++;
2212
2213 if (likely(current->trace_recursion < TRACE_RECURSIVE_DEPTH))
2214 return 0;
2215
2216 trace_recursive_fail();
2217
2228 return -1; 2218 return -1;
2229} 2219}
2230 2220
2231static void trace_recursive_unlock(void) 2221static inline void trace_recursive_unlock(void)
2232{ 2222{
2233 WARN_ON_ONCE(!current->trace_recursion); 2223 WARN_ON_ONCE(!current->trace_recursion);
2234 2224
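Splitting the failure reporting into a separate noinline trace_recursive_fail() keeps the WARN text and printk arguments out of the inlined fast path, so trace_recursive_lock() stays small in the caller's instruction cache. The same shape in isolation, with hypothetical names:

/* cold path: never inlined, so its code stays off the hot path */
static noinline void report_recursion(void)
{
	WARN_ON_ONCE(1);
}

/* hot path: cheap enough to inline at every call site */
static inline int recursion_enter(int *depth, int max)
{
	if (likely(++(*depth) < max))
		return 0;
	report_recursion();
	return -1;
}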
@@ -2308,12 +2298,28 @@ static void
2308rb_update_write_stamp(struct ring_buffer_per_cpu *cpu_buffer, 2298rb_update_write_stamp(struct ring_buffer_per_cpu *cpu_buffer,
2309 struct ring_buffer_event *event) 2299 struct ring_buffer_event *event)
2310{ 2300{
2301 u64 delta;
2302
2311 /* 2303 /*
2312 * The event first in the commit queue updates the 2304 * The event first in the commit queue updates the
2313 * time stamp. 2305 * time stamp.
2314 */ 2306 */
2315 if (rb_event_is_commit(cpu_buffer, event)) 2307 if (rb_event_is_commit(cpu_buffer, event)) {
2316 cpu_buffer->write_stamp += event->time_delta; 2308 /*
2309 * A commit event that is first on a page
2310 * updates the write timestamp with the page stamp
2311 */
2312 if (!rb_event_index(event))
2313 cpu_buffer->write_stamp =
2314 cpu_buffer->commit_page->page->time_stamp;
2315 else if (event->type_len == RINGBUF_TYPE_TIME_EXTEND) {
2316 delta = event->array[0];
2317 delta <<= TS_SHIFT;
2318 delta += event->time_delta;
2319 cpu_buffer->write_stamp += delta;
2320 } else
2321 cpu_buffer->write_stamp += event->time_delta;
2322 }
2317} 2323}
2318 2324
2319static void rb_commit(struct ring_buffer_per_cpu *cpu_buffer, 2325static void rb_commit(struct ring_buffer_per_cpu *cpu_buffer,
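Because a time extend can now precede a data event, rb_update_write_stamp() has to reassemble the full delta from the two halves written at reserve time before advancing write_stamp. The decode in isolation (illustrative helper name; the field layout is the one used above):

/* rebuild the up-to-59-bit delta carried by a time-extend event */
static u64 time_extend_delta(const struct ring_buffer_event *event)
{
	u64 delta = event->array[0];	/* high bits */

	delta <<= TS_SHIFT;
	delta += event->time_delta;	/* low 27 bits */
	return delta;
}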
@@ -2353,6 +2359,9 @@ EXPORT_SYMBOL_GPL(ring_buffer_unlock_commit);
2353 2359
2354static inline void rb_event_discard(struct ring_buffer_event *event) 2360static inline void rb_event_discard(struct ring_buffer_event *event)
2355{ 2361{
2362 if (event->type_len == RINGBUF_TYPE_TIME_EXTEND)
2363 event = skip_time_extend(event);
2364
2356 /* array[0] holds the actual length for the discarded event */ 2365 /* array[0] holds the actual length for the discarded event */
2357 event->array[0] = rb_event_data_length(event) - RB_EVNT_HDR_SIZE; 2366 event->array[0] = rb_event_data_length(event) - RB_EVNT_HDR_SIZE;
2358 event->type_len = RINGBUF_TYPE_PADDING; 2367 event->type_len = RINGBUF_TYPE_PADDING;
@@ -3049,12 +3058,12 @@ rb_buffer_peek(struct ring_buffer_per_cpu *cpu_buffer, u64 *ts,
3049 3058
3050 again: 3059 again:
3051 /* 3060 /*
3052 * We repeat when a timestamp is encountered. It is possible 3061 * We repeat when a time extend is encountered.
3053 * to get multiple timestamps from an interrupt entering just 3062 * Since the time extend is always attached to a data event,
3054 * as one timestamp is about to be written, or from discarded 3063 * we should never loop more than once.
3055 * commits. The most that we can have is the number on a single page. 3064 * (We never hit the following condition more than twice).
3056 */ 3065 */
3057 if (RB_WARN_ON(cpu_buffer, ++nr_loops > RB_TIMESTAMPS_PER_PAGE)) 3066 if (RB_WARN_ON(cpu_buffer, ++nr_loops > 2))
3058 return NULL; 3067 return NULL;
3059 3068
3060 reader = rb_get_reader_page(cpu_buffer); 3069 reader = rb_get_reader_page(cpu_buffer);
@@ -3130,14 +3139,12 @@ rb_iter_peek(struct ring_buffer_iter *iter, u64 *ts)
3130 return NULL; 3139 return NULL;
3131 3140
3132 /* 3141 /*
3133 * We repeat when a timestamp is encountered. 3142 * We repeat when a time extend is encountered.
3134 * We can get multiple timestamps by nested interrupts or also 3143 * Since the time extend is always attached to a data event,
3135 * if filtering is on (discarding commits). Since discarding 3144 * we should never loop more than once.
3136 * commits can be frequent we can get a lot of timestamps. 3145 * (We never hit the following condition more than twice).
3137 * But we limit them by not adding timestamps if they begin
3138 * at the start of a page.
3139 */ 3146 */
3140 if (RB_WARN_ON(cpu_buffer, ++nr_loops > RB_TIMESTAMPS_PER_PAGE)) 3147 if (RB_WARN_ON(cpu_buffer, ++nr_loops > 2))
3141 return NULL; 3148 return NULL;
3142 3149
3143 if (rb_per_cpu_empty(cpu_buffer)) 3150 if (rb_per_cpu_empty(cpu_buffer))
@@ -3835,7 +3842,8 @@ int ring_buffer_read_page(struct ring_buffer *buffer,
3835 if (len > (commit - read)) 3842 if (len > (commit - read))
3836 len = (commit - read); 3843 len = (commit - read);
3837 3844
3838 size = rb_event_length(event); 3845 /* Always keep the time extend and data together */
3846 size = rb_event_ts_length(event);
3839 3847
3840 if (len < size) 3848 if (len < size)
3841 goto out_unlock; 3849 goto out_unlock;
@@ -3857,7 +3865,8 @@ int ring_buffer_read_page(struct ring_buffer *buffer,
3857 break; 3865 break;
3858 3866
3859 event = rb_reader_event(cpu_buffer); 3867 event = rb_reader_event(cpu_buffer);
3860 size = rb_event_length(event); 3868 /* Always keep the time extend and data together */
3869 size = rb_event_ts_length(event);
3861 } while (len > size); 3870 } while (len > size);
3862 3871
3863 /* update bpage */ 3872 /* update bpage */
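ring_buffer_read_page() now sizes its copies with rb_event_ts_length() so a time extend is never split from the data event bound to it. A sketch of what such a helper has to do, assuming skip_time_extend() steps past the RB_LEN_TIME_EXTEND header to the attached data event (as the rb_event_discard() change above implies); the real helper lives earlier in this file:

static unsigned rb_event_ts_length(struct ring_buffer_event *event)
{
	unsigned len = 0;

	if (event->type_len == RINGBUF_TYPE_TIME_EXTEND) {
		/* count the extend header and look at the data event behind it */
		len = RB_LEN_TIME_EXTEND;
		event = skip_time_extend(event);
	}
	return len + rb_event_length(event);
}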
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 001bcd2ccf4a..c380612273bf 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -17,7 +17,6 @@
17#include <linux/writeback.h> 17#include <linux/writeback.h>
18#include <linux/kallsyms.h> 18#include <linux/kallsyms.h>
19#include <linux/seq_file.h> 19#include <linux/seq_file.h>
20#include <linux/smp_lock.h>
21#include <linux/notifier.h> 20#include <linux/notifier.h>
22#include <linux/irqflags.h> 21#include <linux/irqflags.h>
23#include <linux/debugfs.h> 22#include <linux/debugfs.h>
@@ -1284,6 +1283,8 @@ void trace_dump_stack(void)
1284 __ftrace_trace_stack(global_trace.buffer, flags, 3, preempt_count()); 1283 __ftrace_trace_stack(global_trace.buffer, flags, 3, preempt_count());
1285} 1284}
1286 1285
1286static DEFINE_PER_CPU(int, user_stack_count);
1287
1287void 1288void
1288ftrace_trace_userstack(struct ring_buffer *buffer, unsigned long flags, int pc) 1289ftrace_trace_userstack(struct ring_buffer *buffer, unsigned long flags, int pc)
1289{ 1290{
@@ -1302,6 +1303,18 @@ ftrace_trace_userstack(struct ring_buffer *buffer, unsigned long flags, int pc)
1302 if (unlikely(in_nmi())) 1303 if (unlikely(in_nmi()))
1303 return; 1304 return;
1304 1305
1306 /*
1307 * prevent recursion, since the user stack tracing may
1308 * trigger other kernel events.
1309 */
1310 preempt_disable();
1311 if (__this_cpu_read(user_stack_count))
1312 goto out;
1313
1314 __this_cpu_inc(user_stack_count);
1315
1316
1317
1305 event = trace_buffer_lock_reserve(buffer, TRACE_USER_STACK, 1318 event = trace_buffer_lock_reserve(buffer, TRACE_USER_STACK,
1306 sizeof(*entry), flags, pc); 1319 sizeof(*entry), flags, pc);
1307 if (!event) 1320 if (!event)
@@ -1319,6 +1332,11 @@ ftrace_trace_userstack(struct ring_buffer *buffer, unsigned long flags, int pc)
1319 save_stack_trace_user(&trace); 1332 save_stack_trace_user(&trace);
1320 if (!filter_check_discard(call, entry, buffer, event)) 1333 if (!filter_check_discard(call, entry, buffer, event))
1321 ring_buffer_unlock_commit(buffer, event); 1334 ring_buffer_unlock_commit(buffer, event);
1335
1336 __this_cpu_dec(user_stack_count);
1337
1338 out:
1339 preempt_enable();
1322} 1340}
1323 1341
1324#ifdef UNUSED 1342#ifdef UNUSED
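The user_stack_count additions above are a per-CPU recursion fence: preemption is disabled so the counter really belongs to this CPU, and a nested entry (say, a fault taken while reading the user stack that itself gets traced) bails out instead of recursing. The bare pattern, with hypothetical names (kernel context, <linux/percpu.h>):

static DEFINE_PER_CPU(int, in_user_stack_trace);

static void trace_user_stack_guarded(void)
{
	preempt_disable();			/* pin to this CPU's counter */
	if (__this_cpu_read(in_user_stack_trace))
		goto out;			/* already tracing here: bail */
	__this_cpu_inc(in_user_stack_trace);

	/* ... reserve, fill and commit the user stack event ... */

	__this_cpu_dec(in_user_stack_trace);
out:
	preempt_enable();
}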
@@ -3996,13 +4014,9 @@ static void tracing_init_debugfs_percpu(long cpu)
3996{ 4014{
3997 struct dentry *d_percpu = tracing_dentry_percpu(); 4015 struct dentry *d_percpu = tracing_dentry_percpu();
3998 struct dentry *d_cpu; 4016 struct dentry *d_cpu;
3999 /* strlen(cpu) + MAX(log10(cpu)) + '\0' */ 4017 char cpu_dir[30]; /* 30 characters should be more than enough */
4000 char cpu_dir[7];
4001
4002 if (cpu > 999 || cpu < 0)
4003 return;
4004 4018
4005 sprintf(cpu_dir, "cpu%ld", cpu); 4019 snprintf(cpu_dir, 30, "cpu%ld", cpu);
4006 d_cpu = debugfs_create_dir(cpu_dir, d_percpu); 4020 d_cpu = debugfs_create_dir(cpu_dir, d_percpu);
4007 if (!d_cpu) { 4021 if (!d_cpu) {
4008 pr_warning("Could not create debugfs '%s' entry\n", cpu_dir); 4022 pr_warning("Could not create debugfs '%s' entry\n", cpu_dir);
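The old cpu_dir buffer held exactly "cpu" plus three digits and a terminating NUL, which is why CPUs above 999 had to be skipped. With a 30-byte buffer and snprintf() the cap goes away: even the largest 64-bit cpu value needs at most 20 digits, so 3 + 20 + 1 = 24 bytes still fit with room to spare.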
diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c
index b8d2852baa4a..2dec9bcde8b4 100644
--- a/kernel/trace/trace_kprobe.c
+++ b/kernel/trace/trace_kprobe.c
@@ -31,7 +31,6 @@
31#include <linux/perf_event.h> 31#include <linux/perf_event.h>
32#include <linux/stringify.h> 32#include <linux/stringify.h>
33#include <linux/limits.h> 33#include <linux/limits.h>
34#include <linux/uaccess.h>
35#include <asm/bitsperlong.h> 34#include <asm/bitsperlong.h>
36 35
37#include "trace.h" 36#include "trace.h"
diff --git a/kernel/tsacct.c b/kernel/tsacct.c
index 0a67e041edf8..24dc60d9fa1f 100644
--- a/kernel/tsacct.c
+++ b/kernel/tsacct.c
@@ -63,12 +63,10 @@ void bacct_add_tsk(struct taskstats *stats, struct task_struct *tsk)
63 stats->ac_ppid = pid_alive(tsk) ? 63 stats->ac_ppid = pid_alive(tsk) ?
64 rcu_dereference(tsk->real_parent)->tgid : 0; 64 rcu_dereference(tsk->real_parent)->tgid : 0;
65 rcu_read_unlock(); 65 rcu_read_unlock();
66 stats->ac_utime = cputime_to_msecs(tsk->utime) * USEC_PER_MSEC; 66 stats->ac_utime = cputime_to_usecs(tsk->utime);
67 stats->ac_stime = cputime_to_msecs(tsk->stime) * USEC_PER_MSEC; 67 stats->ac_stime = cputime_to_usecs(tsk->stime);
68 stats->ac_utimescaled = 68 stats->ac_utimescaled = cputime_to_usecs(tsk->utimescaled);
69 cputime_to_msecs(tsk->utimescaled) * USEC_PER_MSEC; 69 stats->ac_stimescaled = cputime_to_usecs(tsk->stimescaled);
70 stats->ac_stimescaled =
71 cputime_to_msecs(tsk->stimescaled) * USEC_PER_MSEC;
72 stats->ac_minflt = tsk->min_flt; 70 stats->ac_minflt = tsk->min_flt;
73 stats->ac_majflt = tsk->maj_flt; 71 stats->ac_majflt = tsk->maj_flt;
74 72
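The old conversions rounded CPU time down to milliseconds and then scaled back up, so the taskstats fields lost sub-millisecond resolution on architectures that account cputime that precisely. As a worked example (hedged, assuming microsecond-granular cputime accounting): a utime of 1234567 µs used to be reported as 1234 ms × 1000 = 1234000 µs, whereas cputime_to_usecs() reports 1234567 µs directly.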
diff --git a/kernel/user.c b/kernel/user.c
index 7e72614b736d..2c7d8d5914b1 100644
--- a/kernel/user.c
+++ b/kernel/user.c
@@ -91,6 +91,7 @@ static struct user_struct *uid_hash_find(uid_t uid, struct hlist_head *hashent)
91 * upon function exit. 91 * upon function exit.
92 */ 92 */
93static void free_user(struct user_struct *up, unsigned long flags) 93static void free_user(struct user_struct *up, unsigned long flags)
94 __releases(&uidhash_lock)
94{ 95{
95 uid_hash_remove(up); 96 uid_hash_remove(up);
96 spin_unlock_irqrestore(&uidhash_lock, flags); 97 spin_unlock_irqrestore(&uidhash_lock, flags);
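The __releases() annotation records for sparse that free_user() is entered with uidhash_lock held and returns with it dropped, silencing the context-imbalance warning. The counterpart idiom, with hypothetical names (the annotations compile away outside sparse):

static void table_lock(spinlock_t *lock)
	__acquires(lock)
{
	spin_lock(lock);	/* taken here, released by table_unlock() */
}

static void table_unlock(spinlock_t *lock)
	__releases(lock)
{
	spin_unlock(lock);
}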
diff --git a/kernel/wait.c b/kernel/wait.c
index c4bd3d825f35..b0310eb6cc1e 100644
--- a/kernel/wait.c
+++ b/kernel/wait.c
@@ -92,7 +92,7 @@ prepare_to_wait_exclusive(wait_queue_head_t *q, wait_queue_t *wait, int state)
92} 92}
93EXPORT_SYMBOL(prepare_to_wait_exclusive); 93EXPORT_SYMBOL(prepare_to_wait_exclusive);
94 94
95/* 95/**
96 * finish_wait - clean up after waiting in a queue 96 * finish_wait - clean up after waiting in a queue
97 * @q: waitqueue waited on 97 * @q: waitqueue waited on
98 * @wait: wait descriptor 98 * @wait: wait descriptor
@@ -127,11 +127,11 @@ void finish_wait(wait_queue_head_t *q, wait_queue_t *wait)
127} 127}
128EXPORT_SYMBOL(finish_wait); 128EXPORT_SYMBOL(finish_wait);
129 129
130/* 130/**
131 * abort_exclusive_wait - abort exclusive waiting in a queue 131 * abort_exclusive_wait - abort exclusive waiting in a queue
132 * @q: waitqueue waited on 132 * @q: waitqueue waited on
133 * @wait: wait descriptor 133 * @wait: wait descriptor
134 * @state: runstate of the waiter to be woken 134 * @mode: runstate of the waiter to be woken
135 * @key: key to identify a wait bit queue or %NULL 135 * @key: key to identify a wait bit queue or %NULL
136 * 136 *
137 * Sets current thread back to running state and removes 137 * Sets current thread back to running state and removes
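The /* → /** change matters because scripts/kernel-doc only extracts comments that open with two asterisks, and it warns when an @parameter name does not match the prototype, which is what the @state → @mode rename fixes. A minimal kernel-doc block for a hypothetical helper:

/**
 * example_wait_helper - one-line summary shown in the generated docs
 * @q:    waitqueue being waited on
 * @mode: runstate of the waiter to be woken
 *
 * Names after '@' must match the function's parameter names exactly.
 */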
diff --git a/kernel/watchdog.c b/kernel/watchdog.c
index bafba687a6d8..6e3c41a4024c 100644
--- a/kernel/watchdog.c
+++ b/kernel/watchdog.c
@@ -43,7 +43,7 @@ static DEFINE_PER_CPU(unsigned long, hrtimer_interrupts_saved);
43static DEFINE_PER_CPU(struct perf_event *, watchdog_ev); 43static DEFINE_PER_CPU(struct perf_event *, watchdog_ev);
44#endif 44#endif
45 45
46static int __initdata no_watchdog; 46static int no_watchdog;
47 47
48 48
49/* boot commands */ 49/* boot commands */
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 30acdb74cc23..90db1bd1a978 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -2064,7 +2064,7 @@ static void insert_wq_barrier(struct cpu_workqueue_struct *cwq,
2064 * checks and call back into the fixup functions where we 2064 * checks and call back into the fixup functions where we
2065 * might deadlock. 2065 * might deadlock.
2066 */ 2066 */
2067 INIT_WORK_ON_STACK(&barr->work, wq_barrier_func); 2067 INIT_WORK_ONSTACK(&barr->work, wq_barrier_func);
2068 __set_bit(WORK_STRUCT_PENDING_BIT, work_data_bits(&barr->work)); 2068 __set_bit(WORK_STRUCT_PENDING_BIT, work_data_bits(&barr->work));
2069 init_completion(&barr->done); 2069 init_completion(&barr->done);
2070 2070
@@ -2791,7 +2791,9 @@ static int alloc_cwqs(struct workqueue_struct *wq)
2791 } 2791 }
2792 } 2792 }
2793 2793
2794 /* just in case, make sure it's actually aligned */ 2794 /* just in case, make sure it's actually aligned
2795 * - this is affected by PERCPU() alignment in vmlinux.lds.S
2796 */
2795 BUG_ON(!IS_ALIGNED(wq->cpu_wq.v, align)); 2797 BUG_ON(!IS_ALIGNED(wq->cpu_wq.v, align));
2796 return wq->cpu_wq.v ? 0 : -ENOMEM; 2798 return wq->cpu_wq.v ? 0 : -ENOMEM;
2797} 2799}