Diffstat (limited to 'kernel')
-rw-r--r--  kernel/audit.c | 67
-rw-r--r--  kernel/audit.h | 5
-rw-r--r--  kernel/audit_tree.c | 9
-rw-r--r--  kernel/audit_watch.c | 4
-rw-r--r--  kernel/auditfilter.c | 12
-rw-r--r--  kernel/auditsc.c | 16
-rw-r--r--  kernel/cgroup.c | 145
-rw-r--r--  kernel/cgroup_freezer.c | 72
-rw-r--r--  kernel/configs.c | 1
-rw-r--r--  kernel/cpuset.c | 13
-rw-r--r--  kernel/cred.c | 4
-rw-r--r--  kernel/debug/debug_core.c | 153
-rw-r--r--  kernel/debug/debug_core.h | 1
-rw-r--r--  kernel/debug/kdb/kdb_debugger.c | 3
-rw-r--r--  kernel/debug/kdb/kdb_io.c | 2
-rw-r--r--  kernel/debug/kdb/kdb_main.c | 66
-rw-r--r--  kernel/debug/kdb/kdb_private.h | 48
-rw-r--r--  kernel/exit.c | 13
-rw-r--r--  kernel/fork.c | 17
-rw-r--r--  kernel/futex.c | 2
-rw-r--r--  kernel/gcov/fs.c | 1
-rw-r--r--  kernel/irq/irqdesc.c | 15
-rw-r--r--  kernel/irq/manage.c | 4
-rw-r--r--  kernel/jump_label.c | 77
-rw-r--r--  kernel/kexec.c | 2
-rw-r--r--  kernel/kprobes.c | 34
-rw-r--r--  kernel/latencytop.c | 17
-rw-r--r--  kernel/module.c | 2
-rw-r--r--  kernel/ns_cgroup.c | 8
-rw-r--r--  kernel/perf_event.c | 136
-rw-r--r--  kernel/pm_qos_params.c | 3
-rw-r--r--  kernel/power/snapshot.c | 18
-rw-r--r--  kernel/power/swap.c | 6
-rw-r--r--  kernel/printk.c | 26
-rw-r--r--  kernel/profile.c | 1
-rw-r--r--  kernel/ptrace.c | 36
-rw-r--r--  kernel/range.c | 2
-rw-r--r--  kernel/relay.c | 15
-rw-r--r--  kernel/resource.c | 153
-rw-r--r--  kernel/rtmutex-tester.c | 6
-rw-r--r--  kernel/sched.c | 8
-rw-r--r--  kernel/sched_fair.c | 25
-rw-r--r--  kernel/sched_stats.h | 20
-rw-r--r--  kernel/signal.c | 5
-rw-r--r--  kernel/smp.c | 8
-rw-r--r--  kernel/softirq.c | 18
-rw-r--r--  kernel/stop_machine.c | 6
-rw-r--r--  kernel/sysctl.c | 23
-rw-r--r--  kernel/taskstats.c | 172
-rw-r--r--  kernel/trace/blktrace.c | 20
-rw-r--r--  kernel/trace/ftrace.c | 2
-rw-r--r--  kernel/trace/ring_buffer.c | 336
-rw-r--r--  kernel/trace/trace.c | 8
-rw-r--r--  kernel/trace/trace_events.c | 6
-rw-r--r--  kernel/trace/trace_kdb.c | 1
-rw-r--r--  kernel/trace/trace_kprobe.c | 3
-rw-r--r--  kernel/trace/trace_stack.c | 1
-rw-r--r--  kernel/tsacct.c | 10
-rw-r--r--  kernel/user.c | 1
-rw-r--r--  kernel/wait.c | 6
-rw-r--r--  kernel/watchdog.c | 2
-rw-r--r--  kernel/workqueue.c | 316
62 files changed, 1301 insertions, 911 deletions
diff --git a/kernel/audit.c b/kernel/audit.c
index d96045789b54..77770a034d59 100644
--- a/kernel/audit.c
+++ b/kernel/audit.c
@@ -467,23 +467,16 @@ static int audit_prepare_user_tty(pid_t pid, uid_t loginuid, u32 sessionid)
467 struct task_struct *tsk; 467 struct task_struct *tsk;
468 int err; 468 int err;
469 469
470 read_lock(&tasklist_lock); 470 rcu_read_lock();
471 tsk = find_task_by_vpid(pid); 471 tsk = find_task_by_vpid(pid);
472 err = -ESRCH; 472 if (!tsk) {
473 if (!tsk) 473 rcu_read_unlock();
474 goto out; 474 return -ESRCH;
475 err = 0; 475 }
476 476 get_task_struct(tsk);
477 spin_lock_irq(&tsk->sighand->siglock); 477 rcu_read_unlock();
478 if (!tsk->signal->audit_tty) 478 err = tty_audit_push_task(tsk, loginuid, sessionid);
479 err = -EPERM; 479 put_task_struct(tsk);
480 spin_unlock_irq(&tsk->sighand->siglock);
481 if (err)
482 goto out;
483
484 tty_audit_push_task(tsk, loginuid, sessionid);
485out:
486 read_unlock(&tasklist_lock);
487 return err; 480 return err;
488} 481}
489 482
@@ -506,7 +499,7 @@ int audit_send_list(void *_dest)
506} 499}
507 500
508struct sk_buff *audit_make_reply(int pid, int seq, int type, int done, 501struct sk_buff *audit_make_reply(int pid, int seq, int type, int done,
509 int multi, void *payload, int size) 502 int multi, const void *payload, int size)
510{ 503{
511 struct sk_buff *skb; 504 struct sk_buff *skb;
512 struct nlmsghdr *nlh; 505 struct nlmsghdr *nlh;
@@ -555,8 +548,8 @@ static int audit_send_reply_thread(void *arg)
555 * Allocates an skb, builds the netlink message, and sends it to the pid. 548 * Allocates an skb, builds the netlink message, and sends it to the pid.
556 * No failure notifications. 549 * No failure notifications.
557 */ 550 */
558void audit_send_reply(int pid, int seq, int type, int done, int multi, 551static void audit_send_reply(int pid, int seq, int type, int done, int multi,
559 void *payload, int size) 552 const void *payload, int size)
560{ 553{
561 struct sk_buff *skb; 554 struct sk_buff *skb;
562 struct task_struct *tsk; 555 struct task_struct *tsk;
@@ -880,40 +873,40 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
880 case AUDIT_TTY_GET: { 873 case AUDIT_TTY_GET: {
881 struct audit_tty_status s; 874 struct audit_tty_status s;
882 struct task_struct *tsk; 875 struct task_struct *tsk;
876 unsigned long flags;
883 877
884 read_lock(&tasklist_lock); 878 rcu_read_lock();
885 tsk = find_task_by_vpid(pid); 879 tsk = find_task_by_vpid(pid);
886 if (!tsk) 880 if (tsk && lock_task_sighand(tsk, &flags)) {
887 err = -ESRCH;
888 else {
889 spin_lock_irq(&tsk->sighand->siglock);
890 s.enabled = tsk->signal->audit_tty != 0; 881 s.enabled = tsk->signal->audit_tty != 0;
891 spin_unlock_irq(&tsk->sighand->siglock); 882 unlock_task_sighand(tsk, &flags);
892 } 883 } else
893 read_unlock(&tasklist_lock); 884 err = -ESRCH;
894 audit_send_reply(NETLINK_CB(skb).pid, seq, AUDIT_TTY_GET, 0, 0, 885 rcu_read_unlock();
895 &s, sizeof(s)); 886
887 if (!err)
888 audit_send_reply(NETLINK_CB(skb).pid, seq,
889 AUDIT_TTY_GET, 0, 0, &s, sizeof(s));
896 break; 890 break;
897 } 891 }
898 case AUDIT_TTY_SET: { 892 case AUDIT_TTY_SET: {
899 struct audit_tty_status *s; 893 struct audit_tty_status *s;
900 struct task_struct *tsk; 894 struct task_struct *tsk;
895 unsigned long flags;
901 896
902 if (nlh->nlmsg_len < sizeof(struct audit_tty_status)) 897 if (nlh->nlmsg_len < sizeof(struct audit_tty_status))
903 return -EINVAL; 898 return -EINVAL;
904 s = data; 899 s = data;
905 if (s->enabled != 0 && s->enabled != 1) 900 if (s->enabled != 0 && s->enabled != 1)
906 return -EINVAL; 901 return -EINVAL;
907 read_lock(&tasklist_lock); 902 rcu_read_lock();
908 tsk = find_task_by_vpid(pid); 903 tsk = find_task_by_vpid(pid);
909 if (!tsk) 904 if (tsk && lock_task_sighand(tsk, &flags)) {
910 err = -ESRCH;
911 else {
912 spin_lock_irq(&tsk->sighand->siglock);
913 tsk->signal->audit_tty = s->enabled != 0; 905 tsk->signal->audit_tty = s->enabled != 0;
914 spin_unlock_irq(&tsk->sighand->siglock); 906 unlock_task_sighand(tsk, &flags);
915 } 907 } else
916 read_unlock(&tasklist_lock); 908 err = -ESRCH;
909 rcu_read_unlock();
917 break; 910 break;
918 } 911 }
919 default: 912 default:
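
The audit.c hunks above drop the tasklist_lock read lock in favour of an RCU lookup, pinning the task with a reference before the read side is released, and move the siglock accesses over to lock_task_sighand(). A minimal, self-contained sketch of that lookup pattern, with a hypothetical demo_with_task() name that is not part of this diff:

#include <linux/sched.h>
#include <linux/rcupdate.h>
#include <linux/errno.h>

/* Illustrative only: find a task by PID under RCU and take a reference
 * so it can still be dereferenced after rcu_read_unlock(), as
 * audit_prepare_user_tty() now does above. */
static int demo_with_task(pid_t pid)
{
        struct task_struct *tsk;

        rcu_read_lock();
        tsk = find_task_by_vpid(pid);
        if (!tsk) {
                rcu_read_unlock();
                return -ESRCH;
        }
        get_task_struct(tsk);           /* keep the task alive */
        rcu_read_unlock();

        /* ... work with tsk without tasklist_lock held ... */

        put_task_struct(tsk);
        return 0;
}
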
diff --git a/kernel/audit.h b/kernel/audit.h
index f7206db4e13d..91e7071c4d2c 100644
--- a/kernel/audit.h
+++ b/kernel/audit.h
@@ -84,10 +84,7 @@ extern int audit_compare_dname_path(const char *dname, const char *path,
84 int *dirlen); 84 int *dirlen);
85extern struct sk_buff * audit_make_reply(int pid, int seq, int type, 85extern struct sk_buff * audit_make_reply(int pid, int seq, int type,
86 int done, int multi, 86 int done, int multi,
87 void *payload, int size); 87 const void *payload, int size);
88extern void audit_send_reply(int pid, int seq, int type,
89 int done, int multi,
90 void *payload, int size);
91extern void audit_panic(const char *message); 88extern void audit_panic(const char *message);
92 89
93struct audit_netlink_list { 90struct audit_netlink_list {
diff --git a/kernel/audit_tree.c b/kernel/audit_tree.c
index 7f18d3a4527e..37b2bea170c8 100644
--- a/kernel/audit_tree.c
+++ b/kernel/audit_tree.c
@@ -223,7 +223,7 @@ static void untag_chunk(struct node *p)
223{ 223{
224 struct audit_chunk *chunk = find_chunk(p); 224 struct audit_chunk *chunk = find_chunk(p);
225 struct fsnotify_mark *entry = &chunk->mark; 225 struct fsnotify_mark *entry = &chunk->mark;
226 struct audit_chunk *new; 226 struct audit_chunk *new = NULL;
227 struct audit_tree *owner; 227 struct audit_tree *owner;
228 int size = chunk->count - 1; 228 int size = chunk->count - 1;
229 int i, j; 229 int i, j;
@@ -232,9 +232,14 @@ static void untag_chunk(struct node *p)
232 232
233 spin_unlock(&hash_lock); 233 spin_unlock(&hash_lock);
234 234
235 if (size)
236 new = alloc_chunk(size);
237
235 spin_lock(&entry->lock); 238 spin_lock(&entry->lock);
236 if (chunk->dead || !entry->i.inode) { 239 if (chunk->dead || !entry->i.inode) {
237 spin_unlock(&entry->lock); 240 spin_unlock(&entry->lock);
241 if (new)
242 free_chunk(new);
238 goto out; 243 goto out;
239 } 244 }
240 245
@@ -255,9 +260,9 @@ static void untag_chunk(struct node *p)
255 goto out; 260 goto out;
256 } 261 }
257 262
258 new = alloc_chunk(size);
259 if (!new) 263 if (!new)
260 goto Fallback; 264 goto Fallback;
265
261 fsnotify_duplicate_mark(&new->mark, entry); 266 fsnotify_duplicate_mark(&new->mark, entry);
262 if (fsnotify_add_mark(&new->mark, new->mark.group, new->mark.i.inode, NULL, 1)) { 267 if (fsnotify_add_mark(&new->mark, new->mark.group, new->mark.i.inode, NULL, 1)) {
263 free_chunk(new); 268 free_chunk(new);
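
The untag_chunk() change above moves alloc_chunk() in front of spin_lock(&entry->lock): the allocation uses GFP_KERNEL and may sleep, which is not allowed with a spinlock held, so the replacement chunk is pre-allocated and thrown away if it turns out to be unneeded. A hypothetical stand-alone illustration of that ordering; the demo_* names are made up:

#include <linux/types.h>
#include <linux/slab.h>
#include <linux/spinlock.h>
#include <linux/errno.h>

static DEFINE_SPINLOCK(demo_lock);
static bool demo_dead;                          /* hypothetical teardown flag */

static int demo_replace(size_t size)
{
        void *new = kmalloc(size, GFP_KERNEL);  /* unlocked: may sleep */

        if (!new)
                return -ENOMEM;

        spin_lock(&demo_lock);
        if (demo_dead) {                        /* raced with teardown */
                spin_unlock(&demo_lock);
                kfree(new);                     /* drop the preallocation */
                return -ENOENT;
        }
        /* ... install "new" while still holding the lock ... */
        spin_unlock(&demo_lock);
        return 0;
}
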
diff --git a/kernel/audit_watch.c b/kernel/audit_watch.c
index f0c9b2e7542d..d2e3c7866460 100644
--- a/kernel/audit_watch.c
+++ b/kernel/audit_watch.c
@@ -60,7 +60,7 @@ struct audit_parent {
60}; 60};
61 61
62/* fsnotify handle. */ 62/* fsnotify handle. */
63struct fsnotify_group *audit_watch_group; 63static struct fsnotify_group *audit_watch_group;
64 64
65/* fsnotify events we care about. */ 65/* fsnotify events we care about. */
66#define AUDIT_FS_WATCH (FS_MOVE | FS_CREATE | FS_DELETE | FS_DELETE_SELF |\ 66#define AUDIT_FS_WATCH (FS_MOVE | FS_CREATE | FS_DELETE | FS_DELETE_SELF |\
@@ -123,7 +123,7 @@ void audit_put_watch(struct audit_watch *watch)
123 } 123 }
124} 124}
125 125
126void audit_remove_watch(struct audit_watch *watch) 126static void audit_remove_watch(struct audit_watch *watch)
127{ 127{
128 list_del(&watch->wlist); 128 list_del(&watch->wlist);
129 audit_put_parent(watch->parent); 129 audit_put_parent(watch->parent);
diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c
index eb7675499fb5..add2819af71b 100644
--- a/kernel/auditfilter.c
+++ b/kernel/auditfilter.c
@@ -1252,6 +1252,18 @@ static int audit_filter_user_rules(struct netlink_skb_parms *cb,
1252 case AUDIT_LOGINUID: 1252 case AUDIT_LOGINUID:
1253 result = audit_comparator(cb->loginuid, f->op, f->val); 1253 result = audit_comparator(cb->loginuid, f->op, f->val);
1254 break; 1254 break;
1255 case AUDIT_SUBJ_USER:
1256 case AUDIT_SUBJ_ROLE:
1257 case AUDIT_SUBJ_TYPE:
1258 case AUDIT_SUBJ_SEN:
1259 case AUDIT_SUBJ_CLR:
1260 if (f->lsm_rule)
1261 result = security_audit_rule_match(cb->sid,
1262 f->type,
1263 f->op,
1264 f->lsm_rule,
1265 NULL);
1266 break;
1255 } 1267 }
1256 1268
1257 if (!result) 1269 if (!result)
diff --git a/kernel/auditsc.c b/kernel/auditsc.c
index 1b31c130d034..f49a0318c2ed 100644
--- a/kernel/auditsc.c
+++ b/kernel/auditsc.c
@@ -241,6 +241,10 @@ struct audit_context {
241 pid_t pid; 241 pid_t pid;
242 struct audit_cap_data cap; 242 struct audit_cap_data cap;
243 } capset; 243 } capset;
244 struct {
245 int fd;
246 int flags;
247 } mmap;
244 }; 248 };
245 int fds[2]; 249 int fds[2];
246 250
@@ -1305,6 +1309,10 @@ static void show_special(struct audit_context *context, int *call_panic)
1305 audit_log_cap(ab, "cap_pp", &context->capset.cap.permitted); 1309 audit_log_cap(ab, "cap_pp", &context->capset.cap.permitted);
1306 audit_log_cap(ab, "cap_pe", &context->capset.cap.effective); 1310 audit_log_cap(ab, "cap_pe", &context->capset.cap.effective);
1307 break; } 1311 break; }
1312 case AUDIT_MMAP: {
1313 audit_log_format(ab, "fd=%d flags=0x%x", context->mmap.fd,
1314 context->mmap.flags);
1315 break; }
1308 } 1316 }
1309 audit_log_end(ab); 1317 audit_log_end(ab);
1310} 1318}
@@ -2476,6 +2484,14 @@ void __audit_log_capset(pid_t pid,
2476 context->type = AUDIT_CAPSET; 2484 context->type = AUDIT_CAPSET;
2477} 2485}
2478 2486
2487void __audit_mmap_fd(int fd, int flags)
2488{
2489 struct audit_context *context = current->audit_context;
2490 context->mmap.fd = fd;
2491 context->mmap.flags = flags;
2492 context->type = AUDIT_MMAP;
2493}
2494
2479/** 2495/**
2480 * audit_core_dumps - record information about processes that end abnormally 2496 * audit_core_dumps - record information about processes that end abnormally
2481 * @signr: signal value 2497 * @signr: signal value
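
The new __audit_mmap_fd() above only records the fd and flags in the per-syscall context; the record itself is emitted later by show_special() through the AUDIT_MMAP case added in the same file. Callers would normally reach it through a header-side wrapper; the sketch below assumes it follows the existing __audit_* convention, and the audit_dummy_context() guard is that assumption, not something shown in the hunk:

/* Shape of the wrapper that would sit in include/linux/audit.h next to
 * the other __audit_* hooks. */
#ifdef CONFIG_AUDITSYSCALL
static inline void audit_mmap_fd(int fd, int flags)
{
        if (unlikely(!audit_dummy_context()))
                __audit_mmap_fd(fd, flags);
}
#else
#define audit_mmap_fd(fd, flags) do { } while (0)
#endif
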
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 291ba3d04bea..66a416b42c18 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -52,7 +52,6 @@
52#include <linux/cgroupstats.h> 52#include <linux/cgroupstats.h>
53#include <linux/hash.h> 53#include <linux/hash.h>
54#include <linux/namei.h> 54#include <linux/namei.h>
55#include <linux/smp_lock.h>
56#include <linux/pid_namespace.h> 55#include <linux/pid_namespace.h>
57#include <linux/idr.h> 56#include <linux/idr.h>
58#include <linux/vmalloc.h> /* TODO: replace with more sophisticated array */ 57#include <linux/vmalloc.h> /* TODO: replace with more sophisticated array */
@@ -244,6 +243,11 @@ static int notify_on_release(const struct cgroup *cgrp)
244 return test_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags); 243 return test_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags);
245} 244}
246 245
246static int clone_children(const struct cgroup *cgrp)
247{
248 return test_bit(CGRP_CLONE_CHILDREN, &cgrp->flags);
249}
250
247/* 251/*
248 * for_each_subsys() allows you to iterate on each subsystem attached to 252 * for_each_subsys() allows you to iterate on each subsystem attached to
249 * an active hierarchy 253 * an active hierarchy
@@ -778,6 +782,7 @@ static struct inode *cgroup_new_inode(mode_t mode, struct super_block *sb)
778 struct inode *inode = new_inode(sb); 782 struct inode *inode = new_inode(sb);
779 783
780 if (inode) { 784 if (inode) {
785 inode->i_ino = get_next_ino();
781 inode->i_mode = mode; 786 inode->i_mode = mode;
782 inode->i_uid = current_fsuid(); 787 inode->i_uid = current_fsuid();
783 inode->i_gid = current_fsgid(); 788 inode->i_gid = current_fsgid();
@@ -1040,6 +1045,8 @@ static int cgroup_show_options(struct seq_file *seq, struct vfsmount *vfs)
1040 seq_puts(seq, ",noprefix"); 1045 seq_puts(seq, ",noprefix");
1041 if (strlen(root->release_agent_path)) 1046 if (strlen(root->release_agent_path))
1042 seq_printf(seq, ",release_agent=%s", root->release_agent_path); 1047 seq_printf(seq, ",release_agent=%s", root->release_agent_path);
1048 if (clone_children(&root->top_cgroup))
1049 seq_puts(seq, ",clone_children");
1043 if (strlen(root->name)) 1050 if (strlen(root->name))
1044 seq_printf(seq, ",name=%s", root->name); 1051 seq_printf(seq, ",name=%s", root->name);
1045 mutex_unlock(&cgroup_mutex); 1052 mutex_unlock(&cgroup_mutex);
@@ -1050,6 +1057,7 @@ struct cgroup_sb_opts {
1050 unsigned long subsys_bits; 1057 unsigned long subsys_bits;
1051 unsigned long flags; 1058 unsigned long flags;
1052 char *release_agent; 1059 char *release_agent;
1060 bool clone_children;
1053 char *name; 1061 char *name;
1054 /* User explicitly requested empty subsystem */ 1062 /* User explicitly requested empty subsystem */
1055 bool none; 1063 bool none;
@@ -1066,7 +1074,8 @@ struct cgroup_sb_opts {
1066 */ 1074 */
1067static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts) 1075static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts)
1068{ 1076{
1069 char *token, *o = data ?: "all"; 1077 char *token, *o = data;
1078 bool all_ss = false, one_ss = false;
1070 unsigned long mask = (unsigned long)-1; 1079 unsigned long mask = (unsigned long)-1;
1071 int i; 1080 int i;
1072 bool module_pin_failed = false; 1081 bool module_pin_failed = false;
@@ -1082,22 +1091,27 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts)
1082 while ((token = strsep(&o, ",")) != NULL) { 1091 while ((token = strsep(&o, ",")) != NULL) {
1083 if (!*token) 1092 if (!*token)
1084 return -EINVAL; 1093 return -EINVAL;
1085 if (!strcmp(token, "all")) { 1094 if (!strcmp(token, "none")) {
1086 /* Add all non-disabled subsystems */
1087 opts->subsys_bits = 0;
1088 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
1089 struct cgroup_subsys *ss = subsys[i];
1090 if (ss == NULL)
1091 continue;
1092 if (!ss->disabled)
1093 opts->subsys_bits |= 1ul << i;
1094 }
1095 } else if (!strcmp(token, "none")) {
1096 /* Explicitly have no subsystems */ 1095 /* Explicitly have no subsystems */
1097 opts->none = true; 1096 opts->none = true;
1098 } else if (!strcmp(token, "noprefix")) { 1097 continue;
1098 }
1099 if (!strcmp(token, "all")) {
1100 /* Mutually exclusive option 'all' + subsystem name */
1101 if (one_ss)
1102 return -EINVAL;
1103 all_ss = true;
1104 continue;
1105 }
1106 if (!strcmp(token, "noprefix")) {
1099 set_bit(ROOT_NOPREFIX, &opts->flags); 1107 set_bit(ROOT_NOPREFIX, &opts->flags);
1100 } else if (!strncmp(token, "release_agent=", 14)) { 1108 continue;
1109 }
1110 if (!strcmp(token, "clone_children")) {
1111 opts->clone_children = true;
1112 continue;
1113 }
1114 if (!strncmp(token, "release_agent=", 14)) {
1101 /* Specifying two release agents is forbidden */ 1115 /* Specifying two release agents is forbidden */
1102 if (opts->release_agent) 1116 if (opts->release_agent)
1103 return -EINVAL; 1117 return -EINVAL;
@@ -1105,7 +1119,9 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts)
1105 kstrndup(token + 14, PATH_MAX - 1, GFP_KERNEL); 1119 kstrndup(token + 14, PATH_MAX - 1, GFP_KERNEL);
1106 if (!opts->release_agent) 1120 if (!opts->release_agent)
1107 return -ENOMEM; 1121 return -ENOMEM;
1108 } else if (!strncmp(token, "name=", 5)) { 1122 continue;
1123 }
1124 if (!strncmp(token, "name=", 5)) {
1109 const char *name = token + 5; 1125 const char *name = token + 5;
1110 /* Can't specify an empty name */ 1126 /* Can't specify an empty name */
1111 if (!strlen(name)) 1127 if (!strlen(name))
@@ -1127,20 +1143,44 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts)
1127 GFP_KERNEL); 1143 GFP_KERNEL);
1128 if (!opts->name) 1144 if (!opts->name)
1129 return -ENOMEM; 1145 return -ENOMEM;
1130 } else { 1146
1131 struct cgroup_subsys *ss; 1147 continue;
1132 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { 1148 }
1133 ss = subsys[i]; 1149
1134 if (ss == NULL) 1150 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
1135 continue; 1151 struct cgroup_subsys *ss = subsys[i];
1136 if (!strcmp(token, ss->name)) { 1152 if (ss == NULL)
1137 if (!ss->disabled) 1153 continue;
1138 set_bit(i, &opts->subsys_bits); 1154 if (strcmp(token, ss->name))
1139 break; 1155 continue;
1140 } 1156 if (ss->disabled)
1141 } 1157 continue;
1142 if (i == CGROUP_SUBSYS_COUNT) 1158
1143 return -ENOENT; 1159 /* Mutually exclusive option 'all' + subsystem name */
1160 if (all_ss)
1161 return -EINVAL;
1162 set_bit(i, &opts->subsys_bits);
1163 one_ss = true;
1164
1165 break;
1166 }
1167 if (i == CGROUP_SUBSYS_COUNT)
1168 return -ENOENT;
1169 }
1170
1171 /*
1172 * If the 'all' option was specified select all the subsystems,
1173 * otherwise 'all, 'none' and a subsystem name options were not
1174 * specified, let's default to 'all'
1175 */
1176 if (all_ss || (!all_ss && !one_ss && !opts->none)) {
1177 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
1178 struct cgroup_subsys *ss = subsys[i];
1179 if (ss == NULL)
1180 continue;
1181 if (ss->disabled)
1182 continue;
1183 set_bit(i, &opts->subsys_bits);
1144 } 1184 }
1145 } 1185 }
1146 1186
@@ -1222,7 +1262,6 @@ static int cgroup_remount(struct super_block *sb, int *flags, char *data)
1222 struct cgroup *cgrp = &root->top_cgroup; 1262 struct cgroup *cgrp = &root->top_cgroup;
1223 struct cgroup_sb_opts opts; 1263 struct cgroup_sb_opts opts;
1224 1264
1225 lock_kernel();
1226 mutex_lock(&cgrp->dentry->d_inode->i_mutex); 1265 mutex_lock(&cgrp->dentry->d_inode->i_mutex);
1227 mutex_lock(&cgroup_mutex); 1266 mutex_lock(&cgroup_mutex);
1228 1267
@@ -1255,7 +1294,6 @@ static int cgroup_remount(struct super_block *sb, int *flags, char *data)
1255 kfree(opts.name); 1294 kfree(opts.name);
1256 mutex_unlock(&cgroup_mutex); 1295 mutex_unlock(&cgroup_mutex);
1257 mutex_unlock(&cgrp->dentry->d_inode->i_mutex); 1296 mutex_unlock(&cgrp->dentry->d_inode->i_mutex);
1258 unlock_kernel();
1259 return ret; 1297 return ret;
1260} 1298}
1261 1299
@@ -1357,6 +1395,8 @@ static struct cgroupfs_root *cgroup_root_from_opts(struct cgroup_sb_opts *opts)
1357 strcpy(root->release_agent_path, opts->release_agent); 1395 strcpy(root->release_agent_path, opts->release_agent);
1358 if (opts->name) 1396 if (opts->name)
1359 strcpy(root->name, opts->name); 1397 strcpy(root->name, opts->name);
1398 if (opts->clone_children)
1399 set_bit(CGRP_CLONE_CHILDREN, &root->top_cgroup.flags);
1360 return root; 1400 return root;
1361} 1401}
1362 1402
@@ -1420,9 +1460,9 @@ static int cgroup_get_rootdir(struct super_block *sb)
1420 return 0; 1460 return 0;
1421} 1461}
1422 1462
1423static int cgroup_get_sb(struct file_system_type *fs_type, 1463static struct dentry *cgroup_mount(struct file_system_type *fs_type,
1424 int flags, const char *unused_dev_name, 1464 int flags, const char *unused_dev_name,
1425 void *data, struct vfsmount *mnt) 1465 void *data)
1426{ 1466{
1427 struct cgroup_sb_opts opts; 1467 struct cgroup_sb_opts opts;
1428 struct cgroupfs_root *root; 1468 struct cgroupfs_root *root;
@@ -1556,10 +1596,9 @@ static int cgroup_get_sb(struct file_system_type *fs_type,
1556 drop_parsed_module_refcounts(opts.subsys_bits); 1596 drop_parsed_module_refcounts(opts.subsys_bits);
1557 } 1597 }
1558 1598
1559 simple_set_mnt(mnt, sb);
1560 kfree(opts.release_agent); 1599 kfree(opts.release_agent);
1561 kfree(opts.name); 1600 kfree(opts.name);
1562 return 0; 1601 return dget(sb->s_root);
1563 1602
1564 drop_new_super: 1603 drop_new_super:
1565 deactivate_locked_super(sb); 1604 deactivate_locked_super(sb);
@@ -1568,8 +1607,7 @@ static int cgroup_get_sb(struct file_system_type *fs_type,
1568 out_err: 1607 out_err:
1569 kfree(opts.release_agent); 1608 kfree(opts.release_agent);
1570 kfree(opts.name); 1609 kfree(opts.name);
1571 1610 return ERR_PTR(ret);
1572 return ret;
1573} 1611}
1574 1612
1575static void cgroup_kill_sb(struct super_block *sb) { 1613static void cgroup_kill_sb(struct super_block *sb) {
@@ -1619,7 +1657,7 @@ static void cgroup_kill_sb(struct super_block *sb) {
1619 1657
1620static struct file_system_type cgroup_fs_type = { 1658static struct file_system_type cgroup_fs_type = {
1621 .name = "cgroup", 1659 .name = "cgroup",
1622 .get_sb = cgroup_get_sb, 1660 .mount = cgroup_mount,
1623 .kill_sb = cgroup_kill_sb, 1661 .kill_sb = cgroup_kill_sb,
1624}; 1662};
1625 1663
@@ -1883,6 +1921,8 @@ static int cgroup_release_agent_write(struct cgroup *cgrp, struct cftype *cft,
1883 const char *buffer) 1921 const char *buffer)
1884{ 1922{
1885 BUILD_BUG_ON(sizeof(cgrp->root->release_agent_path) < PATH_MAX); 1923 BUILD_BUG_ON(sizeof(cgrp->root->release_agent_path) < PATH_MAX);
1924 if (strlen(buffer) >= PATH_MAX)
1925 return -EINVAL;
1886 if (!cgroup_lock_live_group(cgrp)) 1926 if (!cgroup_lock_live_group(cgrp))
1887 return -ENODEV; 1927 return -ENODEV;
1888 strcpy(cgrp->root->release_agent_path, buffer); 1928 strcpy(cgrp->root->release_agent_path, buffer);
@@ -3176,6 +3216,23 @@ fail:
3176 return ret; 3216 return ret;
3177} 3217}
3178 3218
3219static u64 cgroup_clone_children_read(struct cgroup *cgrp,
3220 struct cftype *cft)
3221{
3222 return clone_children(cgrp);
3223}
3224
3225static int cgroup_clone_children_write(struct cgroup *cgrp,
3226 struct cftype *cft,
3227 u64 val)
3228{
3229 if (val)
3230 set_bit(CGRP_CLONE_CHILDREN, &cgrp->flags);
3231 else
3232 clear_bit(CGRP_CLONE_CHILDREN, &cgrp->flags);
3233 return 0;
3234}
3235
3179/* 3236/*
3180 * for the common functions, 'private' gives the type of file 3237 * for the common functions, 'private' gives the type of file
3181 */ 3238 */
@@ -3206,6 +3263,11 @@ static struct cftype files[] = {
3206 .write_string = cgroup_write_event_control, 3263 .write_string = cgroup_write_event_control,
3207 .mode = S_IWUGO, 3264 .mode = S_IWUGO,
3208 }, 3265 },
3266 {
3267 .name = "cgroup.clone_children",
3268 .read_u64 = cgroup_clone_children_read,
3269 .write_u64 = cgroup_clone_children_write,
3270 },
3209}; 3271};
3210 3272
3211static struct cftype cft_release_agent = { 3273static struct cftype cft_release_agent = {
@@ -3335,6 +3397,9 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry,
3335 if (notify_on_release(parent)) 3397 if (notify_on_release(parent))
3336 set_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags); 3398 set_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags);
3337 3399
3400 if (clone_children(parent))
3401 set_bit(CGRP_CLONE_CHILDREN, &cgrp->flags);
3402
3338 for_each_subsys(root, ss) { 3403 for_each_subsys(root, ss) {
3339 struct cgroup_subsys_state *css = ss->create(ss, cgrp); 3404 struct cgroup_subsys_state *css = ss->create(ss, cgrp);
3340 3405
@@ -3349,6 +3414,8 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry,
3349 goto err_destroy; 3414 goto err_destroy;
3350 } 3415 }
3351 /* At error, ->destroy() callback has to free assigned ID. */ 3416 /* At error, ->destroy() callback has to free assigned ID. */
3417 if (clone_children(parent) && ss->post_clone)
3418 ss->post_clone(ss, cgrp);
3352 } 3419 }
3353 3420
3354 cgroup_lock_hierarchy(root); 3421 cgroup_lock_hierarchy(root);
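
Besides the clone_children plumbing, the cgroup.c hunks convert the filesystem from .get_sb to the newer .mount convention, where the callback returns the root dentry (or an ERR_PTR) instead of filling in a vfsmount. For a simpler filesystem the same conversion usually reduces to a one-line helper; a hedged sketch with made-up examplefs_* names, assuming a fill_super routine already exists:

#include <linux/fs.h>
#include <linux/module.h>

static int examplefs_fill_super(struct super_block *sb, void *data, int silent)
{
        /* ... set up sb->s_root and the rest of the superblock ... */
        return 0;
}

static struct dentry *examplefs_mount(struct file_system_type *fs_type,
                                      int flags, const char *dev_name,
                                      void *data)
{
        /* mount_nodev() allocates the superblock, calls fill_super and
         * returns dget(sb->s_root), mirroring what cgroup_mount() now
         * does by hand above. */
        return mount_nodev(fs_type, flags, data, examplefs_fill_super);
}

static struct file_system_type examplefs_type = {
        .owner   = THIS_MODULE,
        .name    = "examplefs",
        .mount   = examplefs_mount,
        .kill_sb = kill_anon_super,
};
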
diff --git a/kernel/cgroup_freezer.c b/kernel/cgroup_freezer.c
index ce71ed53e88f..e7bebb7c6c38 100644
--- a/kernel/cgroup_freezer.c
+++ b/kernel/cgroup_freezer.c
@@ -48,20 +48,19 @@ static inline struct freezer *task_freezer(struct task_struct *task)
48 struct freezer, css); 48 struct freezer, css);
49} 49}
50 50
51int cgroup_freezing_or_frozen(struct task_struct *task) 51static inline int __cgroup_freezing_or_frozen(struct task_struct *task)
52{ 52{
53 struct freezer *freezer; 53 enum freezer_state state = task_freezer(task)->state;
54 enum freezer_state state; 54 return (state == CGROUP_FREEZING) || (state == CGROUP_FROZEN);
55}
55 56
57int cgroup_freezing_or_frozen(struct task_struct *task)
58{
59 int result;
56 task_lock(task); 60 task_lock(task);
57 freezer = task_freezer(task); 61 result = __cgroup_freezing_or_frozen(task);
58 if (!freezer->css.cgroup->parent)
59 state = CGROUP_THAWED; /* root cgroup can't be frozen */
60 else
61 state = freezer->state;
62 task_unlock(task); 62 task_unlock(task);
63 63 return result;
64 return (state == CGROUP_FREEZING) || (state == CGROUP_FROZEN);
65} 64}
66 65
67/* 66/*
@@ -154,13 +153,6 @@ static void freezer_destroy(struct cgroup_subsys *ss,
154 kfree(cgroup_freezer(cgroup)); 153 kfree(cgroup_freezer(cgroup));
155} 154}
156 155
157/* Task is frozen or will freeze immediately when next it gets woken */
158static bool is_task_frozen_enough(struct task_struct *task)
159{
160 return frozen(task) ||
161 (task_is_stopped_or_traced(task) && freezing(task));
162}
163
164/* 156/*
165 * The call to cgroup_lock() in the freezer.state write method prevents 157 * The call to cgroup_lock() in the freezer.state write method prevents
166 * a write to that file racing against an attach, and hence the 158 * a write to that file racing against an attach, and hence the
@@ -174,24 +166,25 @@ static int freezer_can_attach(struct cgroup_subsys *ss,
174 166
175 /* 167 /*
176 * Anything frozen can't move or be moved to/from. 168 * Anything frozen can't move or be moved to/from.
177 *
178 * Since orig_freezer->state == FROZEN means that @task has been
179 * frozen, so it's sufficient to check the latter condition.
180 */ 169 */
181 170
182 if (is_task_frozen_enough(task)) 171 freezer = cgroup_freezer(new_cgroup);
172 if (freezer->state != CGROUP_THAWED)
183 return -EBUSY; 173 return -EBUSY;
184 174
185 freezer = cgroup_freezer(new_cgroup); 175 rcu_read_lock();
186 if (freezer->state == CGROUP_FROZEN) 176 if (__cgroup_freezing_or_frozen(task)) {
177 rcu_read_unlock();
187 return -EBUSY; 178 return -EBUSY;
179 }
180 rcu_read_unlock();
188 181
189 if (threadgroup) { 182 if (threadgroup) {
190 struct task_struct *c; 183 struct task_struct *c;
191 184
192 rcu_read_lock(); 185 rcu_read_lock();
193 list_for_each_entry_rcu(c, &task->thread_group, thread_group) { 186 list_for_each_entry_rcu(c, &task->thread_group, thread_group) {
194 if (is_task_frozen_enough(c)) { 187 if (__cgroup_freezing_or_frozen(c)) {
195 rcu_read_unlock(); 188 rcu_read_unlock();
196 return -EBUSY; 189 return -EBUSY;
197 } 190 }
@@ -236,31 +229,30 @@ static void freezer_fork(struct cgroup_subsys *ss, struct task_struct *task)
236/* 229/*
237 * caller must hold freezer->lock 230 * caller must hold freezer->lock
238 */ 231 */
239static void update_freezer_state(struct cgroup *cgroup, 232static void update_if_frozen(struct cgroup *cgroup,
240 struct freezer *freezer) 233 struct freezer *freezer)
241{ 234{
242 struct cgroup_iter it; 235 struct cgroup_iter it;
243 struct task_struct *task; 236 struct task_struct *task;
244 unsigned int nfrozen = 0, ntotal = 0; 237 unsigned int nfrozen = 0, ntotal = 0;
238 enum freezer_state old_state = freezer->state;
245 239
246 cgroup_iter_start(cgroup, &it); 240 cgroup_iter_start(cgroup, &it);
247 while ((task = cgroup_iter_next(cgroup, &it))) { 241 while ((task = cgroup_iter_next(cgroup, &it))) {
248 ntotal++; 242 ntotal++;
249 if (is_task_frozen_enough(task)) 243 if (frozen(task))
250 nfrozen++; 244 nfrozen++;
251 } 245 }
252 246
253 /* 247 if (old_state == CGROUP_THAWED) {
254 * Transition to FROZEN when no new tasks can be added ensures 248 BUG_ON(nfrozen > 0);
255 * that we never exist in the FROZEN state while there are unfrozen 249 } else if (old_state == CGROUP_FREEZING) {
256 * tasks. 250 if (nfrozen == ntotal)
257 */ 251 freezer->state = CGROUP_FROZEN;
258 if (nfrozen == ntotal) 252 } else { /* old_state == CGROUP_FROZEN */
259 freezer->state = CGROUP_FROZEN; 253 BUG_ON(nfrozen != ntotal);
260 else if (nfrozen > 0) 254 }
261 freezer->state = CGROUP_FREEZING; 255
262 else
263 freezer->state = CGROUP_THAWED;
264 cgroup_iter_end(cgroup, &it); 256 cgroup_iter_end(cgroup, &it);
265} 257}
266 258
@@ -279,7 +271,7 @@ static int freezer_read(struct cgroup *cgroup, struct cftype *cft,
279 if (state == CGROUP_FREEZING) { 271 if (state == CGROUP_FREEZING) {
280 /* We change from FREEZING to FROZEN lazily if the cgroup was 272 /* We change from FREEZING to FROZEN lazily if the cgroup was
281 * only partially frozen when we exitted write. */ 273 * only partially frozen when we exitted write. */
282 update_freezer_state(cgroup, freezer); 274 update_if_frozen(cgroup, freezer);
283 state = freezer->state; 275 state = freezer->state;
284 } 276 }
285 spin_unlock_irq(&freezer->lock); 277 spin_unlock_irq(&freezer->lock);
@@ -301,7 +293,7 @@ static int try_to_freeze_cgroup(struct cgroup *cgroup, struct freezer *freezer)
301 while ((task = cgroup_iter_next(cgroup, &it))) { 293 while ((task = cgroup_iter_next(cgroup, &it))) {
302 if (!freeze_task(task, true)) 294 if (!freeze_task(task, true))
303 continue; 295 continue;
304 if (is_task_frozen_enough(task)) 296 if (frozen(task))
305 continue; 297 continue;
306 if (!freezing(task) && !freezer_should_skip(task)) 298 if (!freezing(task) && !freezer_should_skip(task))
307 num_cant_freeze_now++; 299 num_cant_freeze_now++;
@@ -335,7 +327,7 @@ static int freezer_change_state(struct cgroup *cgroup,
335 327
336 spin_lock_irq(&freezer->lock); 328 spin_lock_irq(&freezer->lock);
337 329
338 update_freezer_state(cgroup, freezer); 330 update_if_frozen(cgroup, freezer);
339 if (goal_state == freezer->state) 331 if (goal_state == freezer->state)
340 goto out; 332 goto out;
341 333
diff --git a/kernel/configs.c b/kernel/configs.c
index abaee684ecbf..b4066b44a99d 100644
--- a/kernel/configs.c
+++ b/kernel/configs.c
@@ -66,6 +66,7 @@ ikconfig_read_current(struct file *file, char __user *buf,
66static const struct file_operations ikconfig_file_ops = { 66static const struct file_operations ikconfig_file_ops = {
67 .owner = THIS_MODULE, 67 .owner = THIS_MODULE,
68 .read = ikconfig_read_current, 68 .read = ikconfig_read_current,
69 .llseek = default_llseek,
69}; 70};
70 71
71static int __init ikconfig_init(void) 72static int __init ikconfig_init(void)
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index 51b143e2a07a..4349935c2ad8 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -231,18 +231,17 @@ static DEFINE_SPINLOCK(cpuset_buffer_lock);
231 * users. If someone tries to mount the "cpuset" filesystem, we 231 * users. If someone tries to mount the "cpuset" filesystem, we
232 * silently switch it to mount "cgroup" instead 232 * silently switch it to mount "cgroup" instead
233 */ 233 */
234static int cpuset_get_sb(struct file_system_type *fs_type, 234static struct dentry *cpuset_mount(struct file_system_type *fs_type,
235 int flags, const char *unused_dev_name, 235 int flags, const char *unused_dev_name, void *data)
236 void *data, struct vfsmount *mnt)
237{ 236{
238 struct file_system_type *cgroup_fs = get_fs_type("cgroup"); 237 struct file_system_type *cgroup_fs = get_fs_type("cgroup");
239 int ret = -ENODEV; 238 struct dentry *ret = ERR_PTR(-ENODEV);
240 if (cgroup_fs) { 239 if (cgroup_fs) {
241 char mountopts[] = 240 char mountopts[] =
242 "cpuset,noprefix," 241 "cpuset,noprefix,"
243 "release_agent=/sbin/cpuset_release_agent"; 242 "release_agent=/sbin/cpuset_release_agent";
244 ret = cgroup_fs->get_sb(cgroup_fs, flags, 243 ret = cgroup_fs->mount(cgroup_fs, flags,
245 unused_dev_name, mountopts, mnt); 244 unused_dev_name, mountopts);
246 put_filesystem(cgroup_fs); 245 put_filesystem(cgroup_fs);
247 } 246 }
248 return ret; 247 return ret;
@@ -250,7 +249,7 @@ static int cpuset_get_sb(struct file_system_type *fs_type,
250 249
251static struct file_system_type cpuset_fs_type = { 250static struct file_system_type cpuset_fs_type = {
252 .name = "cpuset", 251 .name = "cpuset",
253 .get_sb = cpuset_get_sb, 252 .mount = cpuset_mount,
254}; 253};
255 254
256/* 255/*
diff --git a/kernel/cred.c b/kernel/cred.c
index 9a3e22641fe7..6a1aa004e376 100644
--- a/kernel/cred.c
+++ b/kernel/cred.c
@@ -325,7 +325,7 @@ EXPORT_SYMBOL(prepare_creds);
325 325
326/* 326/*
327 * Prepare credentials for current to perform an execve() 327 * Prepare credentials for current to perform an execve()
328 * - The caller must hold current->cred_guard_mutex 328 * - The caller must hold ->cred_guard_mutex
329 */ 329 */
330struct cred *prepare_exec_creds(void) 330struct cred *prepare_exec_creds(void)
331{ 331{
@@ -384,8 +384,6 @@ int copy_creds(struct task_struct *p, unsigned long clone_flags)
384 struct cred *new; 384 struct cred *new;
385 int ret; 385 int ret;
386 386
387 mutex_init(&p->cred_guard_mutex);
388
389 if ( 387 if (
390#ifdef CONFIG_KEYS 388#ifdef CONFIG_KEYS
391 !p->cred->thread_keyring && 389 !p->cred->thread_keyring &&
diff --git a/kernel/debug/debug_core.c b/kernel/debug/debug_core.c
index de407c78178d..cefd4a11f6d9 100644
--- a/kernel/debug/debug_core.c
+++ b/kernel/debug/debug_core.c
@@ -47,6 +47,7 @@
47#include <linux/pid.h> 47#include <linux/pid.h>
48#include <linux/smp.h> 48#include <linux/smp.h>
49#include <linux/mm.h> 49#include <linux/mm.h>
50#include <linux/rcupdate.h>
50 51
51#include <asm/cacheflush.h> 52#include <asm/cacheflush.h>
52#include <asm/byteorder.h> 53#include <asm/byteorder.h>
@@ -109,13 +110,15 @@ static struct kgdb_bkpt kgdb_break[KGDB_MAX_BREAKPOINTS] = {
109 */ 110 */
110atomic_t kgdb_active = ATOMIC_INIT(-1); 111atomic_t kgdb_active = ATOMIC_INIT(-1);
111EXPORT_SYMBOL_GPL(kgdb_active); 112EXPORT_SYMBOL_GPL(kgdb_active);
113static DEFINE_RAW_SPINLOCK(dbg_master_lock);
114static DEFINE_RAW_SPINLOCK(dbg_slave_lock);
112 115
113/* 116/*
114 * We use NR_CPUs not PERCPU, in case kgdb is used to debug early 117 * We use NR_CPUs not PERCPU, in case kgdb is used to debug early
115 * bootup code (which might not have percpu set up yet): 118 * bootup code (which might not have percpu set up yet):
116 */ 119 */
117static atomic_t passive_cpu_wait[NR_CPUS]; 120static atomic_t masters_in_kgdb;
118static atomic_t cpu_in_kgdb[NR_CPUS]; 121static atomic_t slaves_in_kgdb;
119static atomic_t kgdb_break_tasklet_var; 122static atomic_t kgdb_break_tasklet_var;
120atomic_t kgdb_setting_breakpoint; 123atomic_t kgdb_setting_breakpoint;
121 124
@@ -206,18 +209,6 @@ int __weak kgdb_skipexception(int exception, struct pt_regs *regs)
206 return 0; 209 return 0;
207} 210}
208 211
209/**
210 * kgdb_disable_hw_debug - Disable hardware debugging while we in kgdb.
211 * @regs: Current &struct pt_regs.
212 *
213 * This function will be called if the particular architecture must
214 * disable hardware debugging while it is processing gdb packets or
215 * handling exception.
216 */
217void __weak kgdb_disable_hw_debug(struct pt_regs *regs)
218{
219}
220
221/* 212/*
222 * Some architectures need cache flushes when we set/clear a 213 * Some architectures need cache flushes when we set/clear a
223 * breakpoint: 214 * breakpoint:
@@ -457,26 +448,34 @@ static int kgdb_reenter_check(struct kgdb_state *ks)
457 return 1; 448 return 1;
458} 449}
459 450
460static void dbg_cpu_switch(int cpu, int next_cpu) 451static void dbg_touch_watchdogs(void)
461{ 452{
462 /* Mark the cpu we are switching away from as a slave when it 453 touch_softlockup_watchdog_sync();
463 * holds the kgdb_active token. This must be done so that the 454 clocksource_touch_watchdog();
464 * that all the cpus wait in for the debug core will not enter 455 rcu_cpu_stall_reset();
465 * again as the master. */
466 if (cpu == atomic_read(&kgdb_active)) {
467 kgdb_info[cpu].exception_state |= DCPU_IS_SLAVE;
468 kgdb_info[cpu].exception_state &= ~DCPU_WANT_MASTER;
469 }
470 kgdb_info[next_cpu].exception_state |= DCPU_NEXT_MASTER;
471} 456}
472 457
473static int kgdb_cpu_enter(struct kgdb_state *ks, struct pt_regs *regs) 458static int kgdb_cpu_enter(struct kgdb_state *ks, struct pt_regs *regs,
459 int exception_state)
474{ 460{
475 unsigned long flags; 461 unsigned long flags;
476 int sstep_tries = 100; 462 int sstep_tries = 100;
477 int error; 463 int error;
478 int i, cpu; 464 int cpu;
479 int trace_on = 0; 465 int trace_on = 0;
466 int online_cpus = num_online_cpus();
467
468 kgdb_info[ks->cpu].enter_kgdb++;
469 kgdb_info[ks->cpu].exception_state |= exception_state;
470
471 if (exception_state == DCPU_WANT_MASTER)
472 atomic_inc(&masters_in_kgdb);
473 else
474 atomic_inc(&slaves_in_kgdb);
475
476 if (arch_kgdb_ops.disable_hw_break)
477 arch_kgdb_ops.disable_hw_break(regs);
478
480acquirelock: 479acquirelock:
481 /* 480 /*
482 * Interrupts will be restored by the 'trap return' code, except when 481 * Interrupts will be restored by the 'trap return' code, except when
@@ -489,14 +488,15 @@ acquirelock:
489 kgdb_info[cpu].task = current; 488 kgdb_info[cpu].task = current;
490 kgdb_info[cpu].ret_state = 0; 489 kgdb_info[cpu].ret_state = 0;
491 kgdb_info[cpu].irq_depth = hardirq_count() >> HARDIRQ_SHIFT; 490 kgdb_info[cpu].irq_depth = hardirq_count() >> HARDIRQ_SHIFT;
492 /*
493 * Make sure the above info reaches the primary CPU before
494 * our cpu_in_kgdb[] flag setting does:
495 */
496 atomic_inc(&cpu_in_kgdb[cpu]);
497 491
498 if (exception_level == 1) 492 /* Make sure the above info reaches the primary CPU */
493 smp_mb();
494
495 if (exception_level == 1) {
496 if (raw_spin_trylock(&dbg_master_lock))
497 atomic_xchg(&kgdb_active, cpu);
499 goto cpu_master_loop; 498 goto cpu_master_loop;
499 }
500 500
501 /* 501 /*
502 * CPU will loop if it is a slave or request to become a kgdb 502 * CPU will loop if it is a slave or request to become a kgdb
@@ -508,10 +508,12 @@ cpu_loop:
508 kgdb_info[cpu].exception_state &= ~DCPU_NEXT_MASTER; 508 kgdb_info[cpu].exception_state &= ~DCPU_NEXT_MASTER;
509 goto cpu_master_loop; 509 goto cpu_master_loop;
510 } else if (kgdb_info[cpu].exception_state & DCPU_WANT_MASTER) { 510 } else if (kgdb_info[cpu].exception_state & DCPU_WANT_MASTER) {
511 if (atomic_cmpxchg(&kgdb_active, -1, cpu) == cpu) 511 if (raw_spin_trylock(&dbg_master_lock)) {
512 atomic_xchg(&kgdb_active, cpu);
512 break; 513 break;
514 }
513 } else if (kgdb_info[cpu].exception_state & DCPU_IS_SLAVE) { 515 } else if (kgdb_info[cpu].exception_state & DCPU_IS_SLAVE) {
514 if (!atomic_read(&passive_cpu_wait[cpu])) 516 if (!raw_spin_is_locked(&dbg_slave_lock))
515 goto return_normal; 517 goto return_normal;
516 } else { 518 } else {
517return_normal: 519return_normal:
@@ -522,9 +524,12 @@ return_normal:
522 arch_kgdb_ops.correct_hw_break(); 524 arch_kgdb_ops.correct_hw_break();
523 if (trace_on) 525 if (trace_on)
524 tracing_on(); 526 tracing_on();
525 atomic_dec(&cpu_in_kgdb[cpu]); 527 kgdb_info[cpu].exception_state &=
526 touch_softlockup_watchdog_sync(); 528 ~(DCPU_WANT_MASTER | DCPU_IS_SLAVE);
527 clocksource_touch_watchdog(); 529 kgdb_info[cpu].enter_kgdb--;
530 smp_mb__before_atomic_dec();
531 atomic_dec(&slaves_in_kgdb);
532 dbg_touch_watchdogs();
528 local_irq_restore(flags); 533 local_irq_restore(flags);
529 return 0; 534 return 0;
530 } 535 }
@@ -541,8 +546,8 @@ return_normal:
541 (kgdb_info[cpu].task && 546 (kgdb_info[cpu].task &&
542 kgdb_info[cpu].task->pid != kgdb_sstep_pid) && --sstep_tries) { 547 kgdb_info[cpu].task->pid != kgdb_sstep_pid) && --sstep_tries) {
543 atomic_set(&kgdb_active, -1); 548 atomic_set(&kgdb_active, -1);
544 touch_softlockup_watchdog_sync(); 549 raw_spin_unlock(&dbg_master_lock);
545 clocksource_touch_watchdog(); 550 dbg_touch_watchdogs();
546 local_irq_restore(flags); 551 local_irq_restore(flags);
547 552
548 goto acquirelock; 553 goto acquirelock;
@@ -563,16 +568,12 @@ return_normal:
563 if (dbg_io_ops->pre_exception) 568 if (dbg_io_ops->pre_exception)
564 dbg_io_ops->pre_exception(); 569 dbg_io_ops->pre_exception();
565 570
566 kgdb_disable_hw_debug(ks->linux_regs);
567
568 /* 571 /*
569 * Get the passive CPU lock which will hold all the non-primary 572 * Get the passive CPU lock which will hold all the non-primary
570 * CPU in a spin state while the debugger is active 573 * CPU in a spin state while the debugger is active
571 */ 574 */
572 if (!kgdb_single_step) { 575 if (!kgdb_single_step)
573 for (i = 0; i < NR_CPUS; i++) 576 raw_spin_lock(&dbg_slave_lock);
574 atomic_inc(&passive_cpu_wait[i]);
575 }
576 577
577#ifdef CONFIG_SMP 578#ifdef CONFIG_SMP
578 /* Signal the other CPUs to enter kgdb_wait() */ 579 /* Signal the other CPUs to enter kgdb_wait() */
@@ -583,10 +584,9 @@ return_normal:
583 /* 584 /*
584 * Wait for the other CPUs to be notified and be waiting for us: 585 * Wait for the other CPUs to be notified and be waiting for us:
585 */ 586 */
586 for_each_online_cpu(i) { 587 while (kgdb_do_roundup && (atomic_read(&masters_in_kgdb) +
587 while (kgdb_do_roundup && !atomic_read(&cpu_in_kgdb[i])) 588 atomic_read(&slaves_in_kgdb)) != online_cpus)
588 cpu_relax(); 589 cpu_relax();
589 }
590 590
591 /* 591 /*
592 * At this point the primary processor is completely 592 * At this point the primary processor is completely
@@ -615,7 +615,8 @@ cpu_master_loop:
615 if (error == DBG_PASS_EVENT) { 615 if (error == DBG_PASS_EVENT) {
616 dbg_kdb_mode = !dbg_kdb_mode; 616 dbg_kdb_mode = !dbg_kdb_mode;
617 } else if (error == DBG_SWITCH_CPU_EVENT) { 617 } else if (error == DBG_SWITCH_CPU_EVENT) {
618 dbg_cpu_switch(cpu, dbg_switch_cpu); 618 kgdb_info[dbg_switch_cpu].exception_state |=
619 DCPU_NEXT_MASTER;
619 goto cpu_loop; 620 goto cpu_loop;
620 } else { 621 } else {
621 kgdb_info[cpu].ret_state = error; 622 kgdb_info[cpu].ret_state = error;
@@ -627,24 +628,11 @@ cpu_master_loop:
627 if (dbg_io_ops->post_exception) 628 if (dbg_io_ops->post_exception)
628 dbg_io_ops->post_exception(); 629 dbg_io_ops->post_exception();
629 630
630 atomic_dec(&cpu_in_kgdb[ks->cpu]);
631
632 if (!kgdb_single_step) { 631 if (!kgdb_single_step) {
633 for (i = NR_CPUS-1; i >= 0; i--) 632 raw_spin_unlock(&dbg_slave_lock);
634 atomic_dec(&passive_cpu_wait[i]); 633 /* Wait till all the CPUs have quit from the debugger. */
635 /* 634 while (kgdb_do_roundup && atomic_read(&slaves_in_kgdb))
636 * Wait till all the CPUs have quit from the debugger, 635 cpu_relax();
637 * but allow a CPU that hit an exception and is
638 * waiting to become the master to remain in the debug
639 * core.
640 */
641 for_each_online_cpu(i) {
642 while (kgdb_do_roundup &&
643 atomic_read(&cpu_in_kgdb[i]) &&
644 !(kgdb_info[i].exception_state &
645 DCPU_WANT_MASTER))
646 cpu_relax();
647 }
648 } 636 }
649 637
650kgdb_restore: 638kgdb_restore:
@@ -655,12 +643,20 @@ kgdb_restore:
655 else 643 else
656 kgdb_sstep_pid = 0; 644 kgdb_sstep_pid = 0;
657 } 645 }
646 if (arch_kgdb_ops.correct_hw_break)
647 arch_kgdb_ops.correct_hw_break();
658 if (trace_on) 648 if (trace_on)
659 tracing_on(); 649 tracing_on();
650
651 kgdb_info[cpu].exception_state &=
652 ~(DCPU_WANT_MASTER | DCPU_IS_SLAVE);
653 kgdb_info[cpu].enter_kgdb--;
654 smp_mb__before_atomic_dec();
655 atomic_dec(&masters_in_kgdb);
660 /* Free kgdb_active */ 656 /* Free kgdb_active */
661 atomic_set(&kgdb_active, -1); 657 atomic_set(&kgdb_active, -1);
662 touch_softlockup_watchdog_sync(); 658 raw_spin_unlock(&dbg_master_lock);
663 clocksource_touch_watchdog(); 659 dbg_touch_watchdogs();
664 local_irq_restore(flags); 660 local_irq_restore(flags);
665 661
666 return kgdb_info[cpu].ret_state; 662 return kgdb_info[cpu].ret_state;
@@ -678,7 +674,6 @@ kgdb_handle_exception(int evector, int signo, int ecode, struct pt_regs *regs)
678{ 674{
679 struct kgdb_state kgdb_var; 675 struct kgdb_state kgdb_var;
680 struct kgdb_state *ks = &kgdb_var; 676 struct kgdb_state *ks = &kgdb_var;
681 int ret;
682 677
683 ks->cpu = raw_smp_processor_id(); 678 ks->cpu = raw_smp_processor_id();
684 ks->ex_vector = evector; 679 ks->ex_vector = evector;
@@ -689,11 +684,10 @@ kgdb_handle_exception(int evector, int signo, int ecode, struct pt_regs *regs)
689 684
690 if (kgdb_reenter_check(ks)) 685 if (kgdb_reenter_check(ks))
691 return 0; /* Ouch, double exception ! */ 686 return 0; /* Ouch, double exception ! */
692 kgdb_info[ks->cpu].exception_state |= DCPU_WANT_MASTER; 687 if (kgdb_info[ks->cpu].enter_kgdb != 0)
693 ret = kgdb_cpu_enter(ks, regs); 688 return 0;
694 kgdb_info[ks->cpu].exception_state &= ~(DCPU_WANT_MASTER | 689
695 DCPU_IS_SLAVE); 690 return kgdb_cpu_enter(ks, regs, DCPU_WANT_MASTER);
696 return ret;
697} 691}
698 692
699int kgdb_nmicallback(int cpu, void *regs) 693int kgdb_nmicallback(int cpu, void *regs)
@@ -706,12 +700,9 @@ int kgdb_nmicallback(int cpu, void *regs)
706 ks->cpu = cpu; 700 ks->cpu = cpu;
707 ks->linux_regs = regs; 701 ks->linux_regs = regs;
708 702
709 if (!atomic_read(&cpu_in_kgdb[cpu]) && 703 if (kgdb_info[ks->cpu].enter_kgdb == 0 &&
710 atomic_read(&kgdb_active) != -1 && 704 raw_spin_is_locked(&dbg_master_lock)) {
711 atomic_read(&kgdb_active) != cpu) { 705 kgdb_cpu_enter(ks, regs, DCPU_IS_SLAVE);
712 kgdb_info[cpu].exception_state |= DCPU_IS_SLAVE;
713 kgdb_cpu_enter(ks, regs);
714 kgdb_info[cpu].exception_state &= ~DCPU_IS_SLAVE;
715 return 0; 706 return 0;
716 } 707 }
717#endif 708#endif
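
The debug_core.c rework above replaces the per-CPU cpu_in_kgdb[]/passive_cpu_wait[] arrays with two raw spinlocks and two counters: the CPU that wins dbg_master_lock via raw_spin_trylock() becomes the master, and the other CPUs spin as slaves while dbg_slave_lock is held. A reduced, hypothetical model of that election; the demo_* names are not from the diff:

#include <linux/spinlock.h>
#include <asm/atomic.h>

static DEFINE_RAW_SPINLOCK(demo_master_lock);
static DEFINE_RAW_SPINLOCK(demo_slave_lock);
static atomic_t demo_active = ATOMIC_INIT(-1);  /* CPU owning the master role */

static int demo_try_become_master(int cpu)
{
        if (!raw_spin_trylock(&demo_master_lock))
                return 0;                       /* lost the race: stay a slave */
        atomic_xchg(&demo_active, cpu);         /* advertise who the master is */
        return 1;
}

static void demo_release_master(void)
{
        atomic_set(&demo_active, -1);
        raw_spin_unlock(&demo_master_lock);
}

/* Slaves poll raw_spin_is_locked(&demo_slave_lock) and resume normal
 * operation once the master drops it, as kgdb_cpu_enter() does above. */
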
diff --git a/kernel/debug/debug_core.h b/kernel/debug/debug_core.h
index c5d753d80f67..3494c28a7e7a 100644
--- a/kernel/debug/debug_core.h
+++ b/kernel/debug/debug_core.h
@@ -40,6 +40,7 @@ struct debuggerinfo_struct {
40 int exception_state; 40 int exception_state;
41 int ret_state; 41 int ret_state;
42 int irq_depth; 42 int irq_depth;
43 int enter_kgdb;
43}; 44};
44 45
45extern struct debuggerinfo_struct kgdb_info[]; 46extern struct debuggerinfo_struct kgdb_info[];
diff --git a/kernel/debug/kdb/kdb_debugger.c b/kernel/debug/kdb/kdb_debugger.c
index bf6e8270e957..dd0b1b7dd02c 100644
--- a/kernel/debug/kdb/kdb_debugger.c
+++ b/kernel/debug/kdb/kdb_debugger.c
@@ -86,7 +86,7 @@ int kdb_stub(struct kgdb_state *ks)
86 } 86 }
87 /* Set initial kdb state variables */ 87 /* Set initial kdb state variables */
88 KDB_STATE_CLEAR(KGDB_TRANS); 88 KDB_STATE_CLEAR(KGDB_TRANS);
89 kdb_initial_cpu = ks->cpu; 89 kdb_initial_cpu = atomic_read(&kgdb_active);
90 kdb_current_task = kgdb_info[ks->cpu].task; 90 kdb_current_task = kgdb_info[ks->cpu].task;
91 kdb_current_regs = kgdb_info[ks->cpu].debuggerinfo; 91 kdb_current_regs = kgdb_info[ks->cpu].debuggerinfo;
92 /* Remove any breakpoints as needed by kdb and clear single step */ 92 /* Remove any breakpoints as needed by kdb and clear single step */
@@ -105,7 +105,6 @@ int kdb_stub(struct kgdb_state *ks)
105 ks->pass_exception = 1; 105 ks->pass_exception = 1;
106 KDB_FLAG_SET(CATASTROPHIC); 106 KDB_FLAG_SET(CATASTROPHIC);
107 } 107 }
108 kdb_initial_cpu = ks->cpu;
109 if (KDB_STATE(SSBPT) && reason == KDB_REASON_SSTEP) { 108 if (KDB_STATE(SSBPT) && reason == KDB_REASON_SSTEP) {
110 KDB_STATE_CLEAR(SSBPT); 109 KDB_STATE_CLEAR(SSBPT);
111 KDB_STATE_CLEAR(DOING_SS); 110 KDB_STATE_CLEAR(DOING_SS);
diff --git a/kernel/debug/kdb/kdb_io.c b/kernel/debug/kdb/kdb_io.c
index c9b7f4f90bba..96fdaac46a80 100644
--- a/kernel/debug/kdb/kdb_io.c
+++ b/kernel/debug/kdb/kdb_io.c
@@ -823,4 +823,4 @@ int kdb_printf(const char *fmt, ...)
823 823
824 return r; 824 return r;
825} 825}
826 826EXPORT_SYMBOL_GPL(kdb_printf);
diff --git a/kernel/debug/kdb/kdb_main.c b/kernel/debug/kdb/kdb_main.c
index caf057a3de0e..37755d621924 100644
--- a/kernel/debug/kdb/kdb_main.c
+++ b/kernel/debug/kdb/kdb_main.c
@@ -1127,7 +1127,7 @@ static int kdb_local(kdb_reason_t reason, int error, struct pt_regs *regs,
1127 /* special case below */ 1127 /* special case below */
1128 } else { 1128 } else {
1129 kdb_printf("\nEntering kdb (current=0x%p, pid %d) ", 1129 kdb_printf("\nEntering kdb (current=0x%p, pid %d) ",
1130 kdb_current, kdb_current->pid); 1130 kdb_current, kdb_current ? kdb_current->pid : 0);
1131#if defined(CONFIG_SMP) 1131#if defined(CONFIG_SMP)
1132 kdb_printf("on processor %d ", raw_smp_processor_id()); 1132 kdb_printf("on processor %d ", raw_smp_processor_id());
1133#endif 1133#endif
@@ -1749,13 +1749,13 @@ static int kdb_go(int argc, const char **argv)
1749 int nextarg; 1749 int nextarg;
1750 long offset; 1750 long offset;
1751 1751
1752 if (raw_smp_processor_id() != kdb_initial_cpu) {
1753 kdb_printf("go must execute on the entry cpu, "
1754 "please use \"cpu %d\" and then execute go\n",
1755 kdb_initial_cpu);
1756 return KDB_BADCPUNUM;
1757 }
1752 if (argc == 1) { 1758 if (argc == 1) {
1753 if (raw_smp_processor_id() != kdb_initial_cpu) {
1754 kdb_printf("go <address> must be issued from the "
1755 "initial cpu, do cpu %d first\n",
1756 kdb_initial_cpu);
1757 return KDB_ARGCOUNT;
1758 }
1759 nextarg = 1; 1759 nextarg = 1;
1760 diag = kdbgetaddrarg(argc, argv, &nextarg, 1760 diag = kdbgetaddrarg(argc, argv, &nextarg,
1761 &addr, &offset, NULL); 1761 &addr, &offset, NULL);
@@ -2603,20 +2603,17 @@ static int kdb_summary(int argc, const char **argv)
2603 */ 2603 */
2604static int kdb_per_cpu(int argc, const char **argv) 2604static int kdb_per_cpu(int argc, const char **argv)
2605{ 2605{
2606 char buf[256], fmtstr[64]; 2606 char fmtstr[64];
2607 kdb_symtab_t symtab; 2607 int cpu, diag, nextarg = 1;
2608 cpumask_t suppress = CPU_MASK_NONE; 2608 unsigned long addr, symaddr, val, bytesperword = 0, whichcpu = ~0UL;
2609 int cpu, diag;
2610 unsigned long addr, val, bytesperword = 0, whichcpu = ~0UL;
2611 2609
2612 if (argc < 1 || argc > 3) 2610 if (argc < 1 || argc > 3)
2613 return KDB_ARGCOUNT; 2611 return KDB_ARGCOUNT;
2614 2612
2615 snprintf(buf, sizeof(buf), "per_cpu__%s", argv[1]); 2613 diag = kdbgetaddrarg(argc, argv, &nextarg, &symaddr, NULL, NULL);
2616 if (!kdbgetsymval(buf, &symtab)) { 2614 if (diag)
2617 kdb_printf("%s is not a per_cpu variable\n", argv[1]); 2615 return diag;
2618 return KDB_BADADDR; 2616
2619 }
2620 if (argc >= 2) { 2617 if (argc >= 2) {
2621 diag = kdbgetularg(argv[2], &bytesperword); 2618 diag = kdbgetularg(argv[2], &bytesperword);
2622 if (diag) 2619 if (diag)
@@ -2649,46 +2646,25 @@ static int kdb_per_cpu(int argc, const char **argv)
2649#define KDB_PCU(cpu) 0 2646#define KDB_PCU(cpu) 0
2650#endif 2647#endif
2651#endif 2648#endif
2652
2653 for_each_online_cpu(cpu) { 2649 for_each_online_cpu(cpu) {
2650 if (KDB_FLAG(CMD_INTERRUPT))
2651 return 0;
2652
2654 if (whichcpu != ~0UL && whichcpu != cpu) 2653 if (whichcpu != ~0UL && whichcpu != cpu)
2655 continue; 2654 continue;
2656 addr = symtab.sym_start + KDB_PCU(cpu); 2655 addr = symaddr + KDB_PCU(cpu);
2657 diag = kdb_getword(&val, addr, bytesperword); 2656 diag = kdb_getword(&val, addr, bytesperword);
2658 if (diag) { 2657 if (diag) {
2659 kdb_printf("%5d " kdb_bfd_vma_fmt0 " - unable to " 2658 kdb_printf("%5d " kdb_bfd_vma_fmt0 " - unable to "
2660 "read, diag=%d\n", cpu, addr, diag); 2659 "read, diag=%d\n", cpu, addr, diag);
2661 continue; 2660 continue;
2662 } 2661 }
2663#ifdef CONFIG_SMP
2664 if (!val) {
2665 cpu_set(cpu, suppress);
2666 continue;
2667 }
2668#endif /* CONFIG_SMP */
2669 kdb_printf("%5d ", cpu); 2662 kdb_printf("%5d ", cpu);
2670 kdb_md_line(fmtstr, addr, 2663 kdb_md_line(fmtstr, addr,
2671 bytesperword == KDB_WORD_SIZE, 2664 bytesperword == KDB_WORD_SIZE,
2672 1, bytesperword, 1, 1, 0); 2665 1, bytesperword, 1, 1, 0);
2673 } 2666 }
2674 if (cpus_weight(suppress) == 0)
2675 return 0;
2676 kdb_printf("Zero suppressed cpu(s):");
2677 for (cpu = first_cpu(suppress); cpu < num_possible_cpus();
2678 cpu = next_cpu(cpu, suppress)) {
2679 kdb_printf(" %d", cpu);
2680 if (cpu == num_possible_cpus() - 1 ||
2681 next_cpu(cpu, suppress) != cpu + 1)
2682 continue;
2683 while (cpu < num_possible_cpus() &&
2684 next_cpu(cpu, suppress) == cpu + 1)
2685 ++cpu;
2686 kdb_printf("-%d", cpu);
2687 }
2688 kdb_printf("\n");
2689
2690#undef KDB_PCU 2667#undef KDB_PCU
2691
2692 return 0; 2668 return 0;
2693} 2669}
2694 2670
@@ -2783,6 +2759,8 @@ int kdb_register_repeat(char *cmd,
2783 2759
2784 return 0; 2760 return 0;
2785} 2761}
2762EXPORT_SYMBOL_GPL(kdb_register_repeat);
2763
2786 2764
2787/* 2765/*
2788 * kdb_register - Compatibility register function for commands that do 2766 * kdb_register - Compatibility register function for commands that do
@@ -2805,6 +2783,7 @@ int kdb_register(char *cmd,
2805 return kdb_register_repeat(cmd, func, usage, help, minlen, 2783 return kdb_register_repeat(cmd, func, usage, help, minlen,
2806 KDB_REPEAT_NONE); 2784 KDB_REPEAT_NONE);
2807} 2785}
2786EXPORT_SYMBOL_GPL(kdb_register);
2808 2787
2809/* 2788/*
2810 * kdb_unregister - This function is used to unregister a kernel 2789 * kdb_unregister - This function is used to unregister a kernel
@@ -2823,7 +2802,7 @@ int kdb_unregister(char *cmd)
2823 /* 2802 /*
2824 * find the command. 2803 * find the command.
2825 */ 2804 */
2826 for (i = 0, kp = kdb_commands; i < kdb_max_commands; i++, kp++) { 2805 for_each_kdbcmd(kp, i) {
2827 if (kp->cmd_name && (strcmp(kp->cmd_name, cmd) == 0)) { 2806 if (kp->cmd_name && (strcmp(kp->cmd_name, cmd) == 0)) {
2828 kp->cmd_name = NULL; 2807 kp->cmd_name = NULL;
2829 return 0; 2808 return 0;
@@ -2833,6 +2812,7 @@ int kdb_unregister(char *cmd)
2833 /* Couldn't find it. */ 2812 /* Couldn't find it. */
2834 return 1; 2813 return 1;
2835} 2814}
2815EXPORT_SYMBOL_GPL(kdb_unregister);
2836 2816
2837/* Initialize the kdb command table. */ 2817/* Initialize the kdb command table. */
2838static void __init kdb_inittab(void) 2818static void __init kdb_inittab(void)
diff --git a/kernel/debug/kdb/kdb_private.h b/kernel/debug/kdb/kdb_private.h
index be775f7e81e0..35d69ed1dfb5 100644
--- a/kernel/debug/kdb/kdb_private.h
+++ b/kernel/debug/kdb/kdb_private.h
@@ -15,29 +15,6 @@
15#include <linux/kgdb.h> 15#include <linux/kgdb.h>
16#include "../debug_core.h" 16#include "../debug_core.h"
17 17
18/* Kernel Debugger Error codes. Must not overlap with command codes. */
19#define KDB_NOTFOUND (-1)
20#define KDB_ARGCOUNT (-2)
21#define KDB_BADWIDTH (-3)
22#define KDB_BADRADIX (-4)
23#define KDB_NOTENV (-5)
24#define KDB_NOENVVALUE (-6)
25#define KDB_NOTIMP (-7)
26#define KDB_ENVFULL (-8)
27#define KDB_ENVBUFFULL (-9)
28#define KDB_TOOMANYBPT (-10)
29#define KDB_TOOMANYDBREGS (-11)
30#define KDB_DUPBPT (-12)
31#define KDB_BPTNOTFOUND (-13)
32#define KDB_BADMODE (-14)
33#define KDB_BADINT (-15)
34#define KDB_INVADDRFMT (-16)
35#define KDB_BADREG (-17)
36#define KDB_BADCPUNUM (-18)
37#define KDB_BADLENGTH (-19)
38#define KDB_NOBP (-20)
39#define KDB_BADADDR (-21)
40
41/* Kernel Debugger Command codes. Must not overlap with error codes. */ 18/* Kernel Debugger Command codes. Must not overlap with error codes. */
42#define KDB_CMD_GO (-1001) 19#define KDB_CMD_GO (-1001)
43#define KDB_CMD_CPU (-1002) 20#define KDB_CMD_CPU (-1002)
@@ -93,17 +70,6 @@
93 */ 70 */
94#define KDB_MAXBPT 16 71#define KDB_MAXBPT 16
95 72
96/* Maximum number of arguments to a function */
97#define KDB_MAXARGS 16
98
99typedef enum {
100 KDB_REPEAT_NONE = 0, /* Do not repeat this command */
101 KDB_REPEAT_NO_ARGS, /* Repeat the command without arguments */
102 KDB_REPEAT_WITH_ARGS, /* Repeat the command including its arguments */
103} kdb_repeat_t;
104
105typedef int (*kdb_func_t)(int, const char **);
106
107/* Symbol table format returned by kallsyms. */ 73/* Symbol table format returned by kallsyms. */
108typedef struct __ksymtab { 74typedef struct __ksymtab {
109 unsigned long value; /* Address of symbol */ 75 unsigned long value; /* Address of symbol */
@@ -123,11 +89,6 @@ extern int kallsyms_symbol_next(char *prefix_name, int flag);
123extern int kallsyms_symbol_complete(char *prefix_name, int max_len); 89extern int kallsyms_symbol_complete(char *prefix_name, int max_len);
124 90
125/* Exported Symbols for kernel loadable modules to use. */ 91/* Exported Symbols for kernel loadable modules to use. */
126extern int kdb_register(char *, kdb_func_t, char *, char *, short);
127extern int kdb_register_repeat(char *, kdb_func_t, char *, char *,
128 short, kdb_repeat_t);
129extern int kdb_unregister(char *);
130
131extern int kdb_getarea_size(void *, unsigned long, size_t); 92extern int kdb_getarea_size(void *, unsigned long, size_t);
132extern int kdb_putarea_size(unsigned long, void *, size_t); 93extern int kdb_putarea_size(unsigned long, void *, size_t);
133 94
@@ -144,6 +105,7 @@ extern int kdb_getword(unsigned long *, unsigned long, size_t);
144extern int kdb_putword(unsigned long, unsigned long, size_t); 105extern int kdb_putword(unsigned long, unsigned long, size_t);
145 106
146extern int kdbgetularg(const char *, unsigned long *); 107extern int kdbgetularg(const char *, unsigned long *);
108extern int kdbgetu64arg(const char *, u64 *);
147extern char *kdbgetenv(const char *); 109extern char *kdbgetenv(const char *);
148extern int kdbgetaddrarg(int, const char **, int*, unsigned long *, 110extern int kdbgetaddrarg(int, const char **, int*, unsigned long *,
149 long *, char **); 111 long *, char **);
@@ -255,14 +217,6 @@ extern void kdb_ps1(const struct task_struct *p);
255extern void kdb_print_nameval(const char *name, unsigned long val); 217extern void kdb_print_nameval(const char *name, unsigned long val);
256extern void kdb_send_sig_info(struct task_struct *p, struct siginfo *info); 218extern void kdb_send_sig_info(struct task_struct *p, struct siginfo *info);
257extern void kdb_meminfo_proc_show(void); 219extern void kdb_meminfo_proc_show(void);
258#ifdef CONFIG_KALLSYMS
259extern const char *kdb_walk_kallsyms(loff_t *pos);
260#else /* ! CONFIG_KALLSYMS */
261static inline const char *kdb_walk_kallsyms(loff_t *pos)
262{
263 return NULL;
264}
265#endif /* ! CONFIG_KALLSYMS */
266extern char *kdb_getstr(char *, size_t, char *); 220extern char *kdb_getstr(char *, size_t, char *);
267 221
268/* Defines for kdb_symbol_print */ 222/* Defines for kdb_symbol_print */
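The kdb_main.c and kdb_private.h hunks above export kdb_register()/kdb_unregister() and move their prototypes (plus kdb_func_t, KDB_MAXARGS and the KDB_REPEAT_* values) out of the private header, so loadable modules can add shell commands. A hedged sketch of a module-side caller, assuming those declarations are now reachable through a public <linux/kdb.h>; the command name and handler below are invented for illustration.

#include <linux/module.h>
#include <linux/kdb.h>

/* trivial handler matching kdb_func_t: int (*)(int argc, const char **argv) */
static int kdb_hello(int argc, const char **argv)
{
        kdb_printf("hello from a module (argc=%d)\n", argc);
        return 0;
}

static int __init kdb_hello_init(void)
{
        /* cmd, handler, usage string, help text, minimum match length */
        return kdb_register("hello", kdb_hello, "", "print a greeting", 0);
}

static void __exit kdb_hello_exit(void)
{
        kdb_unregister("hello");
}

module_init(kdb_hello_init);
module_exit(kdb_hello_exit);
MODULE_LICENSE("GPL");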
diff --git a/kernel/exit.c b/kernel/exit.c
index e2bdf37f9fde..21aa7b3001fb 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -50,6 +50,7 @@
50#include <linux/perf_event.h> 50#include <linux/perf_event.h>
51#include <trace/events/sched.h> 51#include <trace/events/sched.h>
52#include <linux/hw_breakpoint.h> 52#include <linux/hw_breakpoint.h>
53#include <linux/oom.h>
53 54
54#include <asm/uaccess.h> 55#include <asm/uaccess.h>
55#include <asm/unistd.h> 56#include <asm/unistd.h>
@@ -95,6 +96,14 @@ static void __exit_signal(struct task_struct *tsk)
95 sig->tty = NULL; 96 sig->tty = NULL;
96 } else { 97 } else {
97 /* 98 /*
99 * This can only happen if the caller is de_thread().
100 * FIXME: this is the temporary hack, we should teach
101 * posix-cpu-timers to handle this case correctly.
102 */
103 if (unlikely(has_group_leader_pid(tsk)))
104 posix_cpu_timers_exit_group(tsk);
105
106 /*
98 * If there is any task waiting for the group exit 107 * If there is any task waiting for the group exit
99 * then notify it: 108 * then notify it:
100 */ 109 */
@@ -687,6 +696,8 @@ static void exit_mm(struct task_struct * tsk)
687 enter_lazy_tlb(mm, current); 696 enter_lazy_tlb(mm, current);
688 /* We don't want this task to be frozen prematurely */ 697 /* We don't want this task to be frozen prematurely */
689 clear_freeze_flag(tsk); 698 clear_freeze_flag(tsk);
699 if (tsk->signal->oom_score_adj == OOM_SCORE_ADJ_MIN)
700 atomic_dec(&mm->oom_disable_count);
690 task_unlock(tsk); 701 task_unlock(tsk);
691 mm_update_next_owner(mm); 702 mm_update_next_owner(mm);
692 mmput(mm); 703 mmput(mm);
@@ -700,6 +711,8 @@ static void exit_mm(struct task_struct * tsk)
700 * space. 711 * space.
701 */ 712 */
702static struct task_struct *find_new_reaper(struct task_struct *father) 713static struct task_struct *find_new_reaper(struct task_struct *father)
714 __releases(&tasklist_lock)
715 __acquires(&tasklist_lock)
703{ 716{
704 struct pid_namespace *pid_ns = task_active_pid_ns(father); 717 struct pid_namespace *pid_ns = task_active_pid_ns(father);
705 struct task_struct *thread; 718 struct task_struct *thread;
diff --git a/kernel/fork.c b/kernel/fork.c
index c445f8cc408d..3b159c5991b7 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -65,6 +65,7 @@
65#include <linux/perf_event.h> 65#include <linux/perf_event.h>
66#include <linux/posix-timers.h> 66#include <linux/posix-timers.h>
67#include <linux/user-return-notifier.h> 67#include <linux/user-return-notifier.h>
68#include <linux/oom.h>
68 69
69#include <asm/pgtable.h> 70#include <asm/pgtable.h>
70#include <asm/pgalloc.h> 71#include <asm/pgalloc.h>
@@ -488,6 +489,7 @@ static struct mm_struct * mm_init(struct mm_struct * mm, struct task_struct *p)
488 mm->cached_hole_size = ~0UL; 489 mm->cached_hole_size = ~0UL;
489 mm_init_aio(mm); 490 mm_init_aio(mm);
490 mm_init_owner(mm, p); 491 mm_init_owner(mm, p);
492 atomic_set(&mm->oom_disable_count, 0);
491 493
492 if (likely(!mm_alloc_pgd(mm))) { 494 if (likely(!mm_alloc_pgd(mm))) {
493 mm->def_flags = 0; 495 mm->def_flags = 0;
@@ -741,6 +743,8 @@ good_mm:
741 /* Initializing for Swap token stuff */ 743 /* Initializing for Swap token stuff */
742 mm->token_priority = 0; 744 mm->token_priority = 0;
743 mm->last_interval = 0; 745 mm->last_interval = 0;
746 if (tsk->signal->oom_score_adj == OOM_SCORE_ADJ_MIN)
747 atomic_inc(&mm->oom_disable_count);
744 748
745 tsk->mm = mm; 749 tsk->mm = mm;
746 tsk->active_mm = mm; 750 tsk->active_mm = mm;
@@ -904,6 +908,8 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk)
904 sig->oom_adj = current->signal->oom_adj; 908 sig->oom_adj = current->signal->oom_adj;
905 sig->oom_score_adj = current->signal->oom_score_adj; 909 sig->oom_score_adj = current->signal->oom_score_adj;
906 910
911 mutex_init(&sig->cred_guard_mutex);
912
907 return 0; 913 return 0;
908} 914}
909 915
@@ -1299,8 +1305,13 @@ bad_fork_cleanup_io:
1299bad_fork_cleanup_namespaces: 1305bad_fork_cleanup_namespaces:
1300 exit_task_namespaces(p); 1306 exit_task_namespaces(p);
1301bad_fork_cleanup_mm: 1307bad_fork_cleanup_mm:
1302 if (p->mm) 1308 if (p->mm) {
1309 task_lock(p);
1310 if (p->signal->oom_score_adj == OOM_SCORE_ADJ_MIN)
1311 atomic_dec(&p->mm->oom_disable_count);
1312 task_unlock(p);
1303 mmput(p->mm); 1313 mmput(p->mm);
1314 }
1304bad_fork_cleanup_signal: 1315bad_fork_cleanup_signal:
1305 if (!(clone_flags & CLONE_THREAD)) 1316 if (!(clone_flags & CLONE_THREAD))
1306 free_signal_struct(p->signal); 1317 free_signal_struct(p->signal);
@@ -1693,6 +1704,10 @@ SYSCALL_DEFINE1(unshare, unsigned long, unshare_flags)
1693 active_mm = current->active_mm; 1704 active_mm = current->active_mm;
1694 current->mm = new_mm; 1705 current->mm = new_mm;
1695 current->active_mm = new_mm; 1706 current->active_mm = new_mm;
1707 if (current->signal->oom_score_adj == OOM_SCORE_ADJ_MIN) {
1708 atomic_dec(&mm->oom_disable_count);
1709 atomic_inc(&new_mm->oom_disable_count);
1710 }
1696 activate_mm(active_mm, new_mm); 1711 activate_mm(active_mm, new_mm);
1697 new_mm = mm; 1712 new_mm = mm;
1698 } 1713 }
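The oom_disable_count additions in exit.c and fork.c above all defend one invariant: the per-mm counter equals the number of tasks sharing that mm whose oom_score_adj is OOM_SCORE_ADJ_MIN, so it must be adjusted wherever such a task attaches to or drops an mm (fork, unshare, exit, and the fork error path). A small userspace model of that bookkeeping; the structs are simplified stand-ins, not the kernel's task_struct/mm_struct.

#include <assert.h>
#include <stdio.h>

#define OOM_SCORE_ADJ_MIN (-1000)

struct mm   { int oom_disable_count; };
struct task { struct mm *mm; int oom_score_adj; };

static void attach_mm(struct task *t, struct mm *mm)    /* ~ copy_mm()/unshare */
{
        t->mm = mm;
        if (t->oom_score_adj == OOM_SCORE_ADJ_MIN)
                mm->oom_disable_count++;
}

static void detach_mm(struct task *t)                   /* ~ exit_mm()/error path */
{
        if (t->oom_score_adj == OOM_SCORE_ADJ_MIN)
                t->mm->oom_disable_count--;
        t->mm = NULL;
}

int main(void)
{
        struct mm mm = { 0 };
        struct task a = { NULL, OOM_SCORE_ADJ_MIN };    /* OOM-disabled task */
        struct task b = { NULL, 0 };

        attach_mm(&a, &mm);
        attach_mm(&b, &mm);
        assert(mm.oom_disable_count == 1);              /* only task a counts */

        detach_mm(&a);
        detach_mm(&b);
        assert(mm.oom_disable_count == 0);              /* balanced at exit */
        printf("oom_disable_count invariant held\n");
        return 0;
}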
diff --git a/kernel/futex.c b/kernel/futex.c
index a118bf160e0b..6c683b37f2ce 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -169,7 +169,7 @@ static void get_futex_key_refs(union futex_key *key)
169 169
170 switch (key->both.offset & (FUT_OFF_INODE|FUT_OFF_MMSHARED)) { 170 switch (key->both.offset & (FUT_OFF_INODE|FUT_OFF_MMSHARED)) {
171 case FUT_OFF_INODE: 171 case FUT_OFF_INODE:
172 atomic_inc(&key->shared.inode->i_count); 172 ihold(key->shared.inode);
173 break; 173 break;
174 case FUT_OFF_MMSHARED: 174 case FUT_OFF_MMSHARED:
175 atomic_inc(&key->private.mm->mm_count); 175 atomic_inc(&key->private.mm->mm_count);
diff --git a/kernel/gcov/fs.c b/kernel/gcov/fs.c
index f83972b16564..9bd0934f6c33 100644
--- a/kernel/gcov/fs.c
+++ b/kernel/gcov/fs.c
@@ -561,6 +561,7 @@ static ssize_t reset_read(struct file *file, char __user *addr, size_t len,
561static const struct file_operations gcov_reset_fops = { 561static const struct file_operations gcov_reset_fops = {
562 .write = reset_write, 562 .write = reset_write,
563 .read = reset_read, 563 .read = reset_read,
564 .llseek = noop_llseek,
564}; 565};
565 566
566/* 567/*
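This one-line .llseek = noop_llseek addition is one of several identical hunks in the series (fops_kp, pm_qos_power_fops and proc_profile_operations below get .llseek entries too); they come from the tree-wide effort to spell out each file_operations' llseek behaviour explicitly instead of relying on the implicit BKL-era default. A hedged sketch of a minimal debugfs file making that choice; the file name and read handler are invented, only the fops fields and helpers are real kernel APIs.

#include <linux/module.h>
#include <linux/debugfs.h>
#include <linux/fs.h>

static ssize_t demo_read(struct file *file, char __user *buf,
                         size_t count, loff_t *ppos)
{
        static const char msg[] = "hello\n";

        return simple_read_from_buffer(buf, count, ppos, msg, sizeof(msg) - 1);
}

static const struct file_operations demo_fops = {
        .owner  = THIS_MODULE,
        .read   = demo_read,
        .llseek = default_llseek,  /* seekable; noop_llseek would ignore seeks */
};

static struct dentry *demo_dentry;

static int __init demo_init(void)
{
        demo_dentry = debugfs_create_file("llseek_demo", 0444, NULL, NULL,
                                          &demo_fops);
        return demo_dentry ? 0 : -ENOMEM;
}

static void __exit demo_exit(void)
{
        debugfs_remove(demo_dentry);
}

module_init(demo_init);
module_exit(demo_exit);
MODULE_LICENSE("GPL");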
diff --git a/kernel/irq/irqdesc.c b/kernel/irq/irqdesc.c
index 9d917ff72675..9988d03797f5 100644
--- a/kernel/irq/irqdesc.c
+++ b/kernel/irq/irqdesc.c
@@ -393,3 +393,18 @@ unsigned int kstat_irqs_cpu(unsigned int irq, int cpu)
393 struct irq_desc *desc = irq_to_desc(irq); 393 struct irq_desc *desc = irq_to_desc(irq);
394 return desc ? desc->kstat_irqs[cpu] : 0; 394 return desc ? desc->kstat_irqs[cpu] : 0;
395} 395}
396
397#ifdef CONFIG_GENERIC_HARDIRQS
398unsigned int kstat_irqs(unsigned int irq)
399{
400 struct irq_desc *desc = irq_to_desc(irq);
401 int cpu;
402 int sum = 0;
403
404 if (!desc)
405 return 0;
406 for_each_possible_cpu(cpu)
407 sum += desc->kstat_irqs[cpu];
408 return sum;
409}
410#endif /* CONFIG_GENERIC_HARDIRQS */
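The new kstat_irqs() above just folds one interrupt's per-CPU counters into a single total. A userspace model of that summation, with a fixed NR_CPUS standing in for for_each_possible_cpu() and made-up counter values.

#include <stdio.h>

#define NR_CPUS 4

struct irq_desc { unsigned int kstat_irqs[NR_CPUS]; };

static unsigned int kstat_irqs_total(const struct irq_desc *desc)
{
        unsigned int cpu, sum = 0;

        if (!desc)                      /* mirrors the !desc check above */
                return 0;
        for (cpu = 0; cpu < NR_CPUS; cpu++)
                sum += desc->kstat_irqs[cpu];
        return sum;
}

int main(void)
{
        struct irq_desc d = { { 3, 0, 7, 1 } };

        printf("%u\n", kstat_irqs_total(&d));   /* prints 11 */
        return 0;
}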
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index 850f030fa0c2..91a5fa25054e 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -324,6 +324,10 @@ void enable_irq(unsigned int irq)
324 if (!desc) 324 if (!desc)
325 return; 325 return;
326 326
327 if (WARN(!desc->irq_data.chip || !desc->irq_data.chip->irq_enable,
328 KERN_ERR "enable_irq before setup/request_irq: irq %u\n", irq))
329 return;
330
327 chip_bus_lock(desc); 331 chip_bus_lock(desc);
328 raw_spin_lock_irqsave(&desc->lock, flags); 332 raw_spin_lock_irqsave(&desc->lock, flags);
329 __enable_irq(desc, irq, false); 333 __enable_irq(desc, irq, false);
diff --git a/kernel/jump_label.c b/kernel/jump_label.c
index 7be868bf25c6..3b79bd938330 100644
--- a/kernel/jump_label.c
+++ b/kernel/jump_label.c
@@ -39,6 +39,16 @@ struct jump_label_module_entry {
39 struct module *mod; 39 struct module *mod;
40}; 40};
41 41
42void jump_label_lock(void)
43{
44 mutex_lock(&jump_label_mutex);
45}
46
47void jump_label_unlock(void)
48{
49 mutex_unlock(&jump_label_mutex);
50}
51
42static int jump_label_cmp(const void *a, const void *b) 52static int jump_label_cmp(const void *a, const void *b)
43{ 53{
44 const struct jump_entry *jea = a; 54 const struct jump_entry *jea = a;
@@ -152,7 +162,7 @@ void jump_label_update(unsigned long key, enum jump_label_type type)
152 struct jump_label_module_entry *e_module; 162 struct jump_label_module_entry *e_module;
153 int count; 163 int count;
154 164
155 mutex_lock(&jump_label_mutex); 165 jump_label_lock();
156 entry = get_jump_label_entry((jump_label_t)key); 166 entry = get_jump_label_entry((jump_label_t)key);
157 if (entry) { 167 if (entry) {
158 count = entry->nr_entries; 168 count = entry->nr_entries;
@@ -168,13 +178,14 @@ void jump_label_update(unsigned long key, enum jump_label_type type)
168 count = e_module->nr_entries; 178 count = e_module->nr_entries;
169 iter = e_module->table; 179 iter = e_module->table;
170 while (count--) { 180 while (count--) {
171 if (kernel_text_address(iter->code)) 181 if (iter->key &&
182 kernel_text_address(iter->code))
172 arch_jump_label_transform(iter, type); 183 arch_jump_label_transform(iter, type);
173 iter++; 184 iter++;
174 } 185 }
175 } 186 }
176 } 187 }
177 mutex_unlock(&jump_label_mutex); 188 jump_label_unlock();
178} 189}
179 190
180static int addr_conflict(struct jump_entry *entry, void *start, void *end) 191static int addr_conflict(struct jump_entry *entry, void *start, void *end)
@@ -231,6 +242,7 @@ out:
231 * overlaps with any of the jump label patch addresses. Code 242 * overlaps with any of the jump label patch addresses. Code
232 * that wants to modify kernel text should first verify that 243 * that wants to modify kernel text should first verify that
233 * it does not overlap with any of the jump label addresses. 244 * it does not overlap with any of the jump label addresses.
245 * Caller must hold jump_label_mutex.
234 * 246 *
235 * returns 1 if there is an overlap, 0 otherwise 247 * returns 1 if there is an overlap, 0 otherwise
236 */ 248 */
@@ -241,7 +253,6 @@ int jump_label_text_reserved(void *start, void *end)
241 struct jump_entry *iter_stop = __start___jump_table; 253 struct jump_entry *iter_stop = __start___jump_table;
242 int conflict = 0; 254 int conflict = 0;
243 255
244 mutex_lock(&jump_label_mutex);
245 iter = iter_start; 256 iter = iter_start;
246 while (iter < iter_stop) { 257 while (iter < iter_stop) {
247 if (addr_conflict(iter, start, end)) { 258 if (addr_conflict(iter, start, end)) {
@@ -256,10 +267,16 @@ int jump_label_text_reserved(void *start, void *end)
256 conflict = module_conflict(start, end); 267 conflict = module_conflict(start, end);
257#endif 268#endif
258out: 269out:
259 mutex_unlock(&jump_label_mutex);
260 return conflict; 270 return conflict;
261} 271}
262 272
273/*
274 * Not all archs need this.
275 */
276void __weak arch_jump_label_text_poke_early(jump_label_t addr)
277{
278}
279
263static __init int init_jump_label(void) 280static __init int init_jump_label(void)
264{ 281{
265 int ret; 282 int ret;
@@ -267,7 +284,7 @@ static __init int init_jump_label(void)
267 struct jump_entry *iter_stop = __stop___jump_table; 284 struct jump_entry *iter_stop = __stop___jump_table;
268 struct jump_entry *iter; 285 struct jump_entry *iter;
269 286
270 mutex_lock(&jump_label_mutex); 287 jump_label_lock();
271 ret = build_jump_label_hashtable(__start___jump_table, 288 ret = build_jump_label_hashtable(__start___jump_table,
272 __stop___jump_table); 289 __stop___jump_table);
273 iter = iter_start; 290 iter = iter_start;
@@ -275,7 +292,7 @@ static __init int init_jump_label(void)
275 arch_jump_label_text_poke_early(iter->code); 292 arch_jump_label_text_poke_early(iter->code);
276 iter++; 293 iter++;
277 } 294 }
278 mutex_unlock(&jump_label_mutex); 295 jump_label_unlock();
279 return ret; 296 return ret;
280} 297}
281early_initcall(init_jump_label); 298early_initcall(init_jump_label);
@@ -366,6 +383,39 @@ static void remove_jump_label_module(struct module *mod)
366 } 383 }
367} 384}
368 385
386static void remove_jump_label_module_init(struct module *mod)
387{
388 struct hlist_head *head;
389 struct hlist_node *node, *node_next, *module_node, *module_node_next;
390 struct jump_label_entry *e;
391 struct jump_label_module_entry *e_module;
392 struct jump_entry *iter;
393 int i, count;
394
395 /* if the module doesn't have jump label entries, just return */
396 if (!mod->num_jump_entries)
397 return;
398
399 for (i = 0; i < JUMP_LABEL_TABLE_SIZE; i++) {
400 head = &jump_label_table[i];
401 hlist_for_each_entry_safe(e, node, node_next, head, hlist) {
402 hlist_for_each_entry_safe(e_module, module_node,
403 module_node_next,
404 &(e->modules), hlist) {
405 if (e_module->mod != mod)
406 continue;
407 count = e_module->nr_entries;
408 iter = e_module->table;
409 while (count--) {
410 if (within_module_init(iter->code, mod))
411 iter->key = 0;
412 iter++;
413 }
414 }
415 }
416 }
417}
418
369static int 419static int
370jump_label_module_notify(struct notifier_block *self, unsigned long val, 420jump_label_module_notify(struct notifier_block *self, unsigned long val,
371 void *data) 421 void *data)
@@ -375,16 +425,21 @@ jump_label_module_notify(struct notifier_block *self, unsigned long val,
375 425
376 switch (val) { 426 switch (val) {
377 case MODULE_STATE_COMING: 427 case MODULE_STATE_COMING:
378 mutex_lock(&jump_label_mutex); 428 jump_label_lock();
379 ret = add_jump_label_module(mod); 429 ret = add_jump_label_module(mod);
380 if (ret) 430 if (ret)
381 remove_jump_label_module(mod); 431 remove_jump_label_module(mod);
382 mutex_unlock(&jump_label_mutex); 432 jump_label_unlock();
383 break; 433 break;
384 case MODULE_STATE_GOING: 434 case MODULE_STATE_GOING:
385 mutex_lock(&jump_label_mutex); 435 jump_label_lock();
386 remove_jump_label_module(mod); 436 remove_jump_label_module(mod);
387 mutex_unlock(&jump_label_mutex); 437 jump_label_unlock();
438 break;
439 case MODULE_STATE_LIVE:
440 jump_label_lock();
441 remove_jump_label_module_init(mod);
442 jump_label_unlock();
388 break; 443 break;
389 } 444 }
390 return ret; 445 return ret;
diff --git a/kernel/kexec.c b/kernel/kexec.c
index c0613f7d6730..b55045bc7563 100644
--- a/kernel/kexec.c
+++ b/kernel/kexec.c
@@ -816,7 +816,7 @@ static int kimage_load_normal_segment(struct kimage *image,
816 816
817 ptr = kmap(page); 817 ptr = kmap(page);
818 /* Start with a clear page */ 818 /* Start with a clear page */
819 memset(ptr, 0, PAGE_SIZE); 819 clear_page(ptr);
820 ptr += maddr & ~PAGE_MASK; 820 ptr += maddr & ~PAGE_MASK;
821 mchunk = PAGE_SIZE - (maddr & ~PAGE_MASK); 821 mchunk = PAGE_SIZE - (maddr & ~PAGE_MASK);
822 if (mchunk > mbytes) 822 if (mchunk > mbytes)
diff --git a/kernel/kprobes.c b/kernel/kprobes.c
index ec4210c6501e..9737a76e106f 100644
--- a/kernel/kprobes.c
+++ b/kernel/kprobes.c
@@ -74,7 +74,8 @@ static struct hlist_head kretprobe_inst_table[KPROBE_TABLE_SIZE];
74/* NOTE: change this value only with kprobe_mutex held */ 74/* NOTE: change this value only with kprobe_mutex held */
75static bool kprobes_all_disarmed; 75static bool kprobes_all_disarmed;
76 76
77static DEFINE_MUTEX(kprobe_mutex); /* Protects kprobe_table */ 77/* This protects kprobe_table and optimizing_list */
78static DEFINE_MUTEX(kprobe_mutex);
78static DEFINE_PER_CPU(struct kprobe *, kprobe_instance) = NULL; 79static DEFINE_PER_CPU(struct kprobe *, kprobe_instance) = NULL;
79static struct { 80static struct {
80 spinlock_t lock ____cacheline_aligned_in_smp; 81 spinlock_t lock ____cacheline_aligned_in_smp;
@@ -595,6 +596,7 @@ static __kprobes void try_to_optimize_kprobe(struct kprobe *p)
595} 596}
596 597
597#ifdef CONFIG_SYSCTL 598#ifdef CONFIG_SYSCTL
599/* This should be called with kprobe_mutex locked */
598static void __kprobes optimize_all_kprobes(void) 600static void __kprobes optimize_all_kprobes(void)
599{ 601{
600 struct hlist_head *head; 602 struct hlist_head *head;
@@ -607,17 +609,16 @@ static void __kprobes optimize_all_kprobes(void)
607 return; 609 return;
608 610
609 kprobes_allow_optimization = true; 611 kprobes_allow_optimization = true;
610 mutex_lock(&text_mutex);
611 for (i = 0; i < KPROBE_TABLE_SIZE; i++) { 612 for (i = 0; i < KPROBE_TABLE_SIZE; i++) {
612 head = &kprobe_table[i]; 613 head = &kprobe_table[i];
613 hlist_for_each_entry_rcu(p, node, head, hlist) 614 hlist_for_each_entry_rcu(p, node, head, hlist)
614 if (!kprobe_disabled(p)) 615 if (!kprobe_disabled(p))
615 optimize_kprobe(p); 616 optimize_kprobe(p);
616 } 617 }
617 mutex_unlock(&text_mutex);
618 printk(KERN_INFO "Kprobes globally optimized\n"); 618 printk(KERN_INFO "Kprobes globally optimized\n");
619} 619}
620 620
621/* This should be called with kprobe_mutex locked */
621static void __kprobes unoptimize_all_kprobes(void) 622static void __kprobes unoptimize_all_kprobes(void)
622{ 623{
623 struct hlist_head *head; 624 struct hlist_head *head;
@@ -1144,14 +1145,13 @@ int __kprobes register_kprobe(struct kprobe *p)
1144 if (ret) 1145 if (ret)
1145 return ret; 1146 return ret;
1146 1147
1148 jump_label_lock();
1147 preempt_disable(); 1149 preempt_disable();
1148 if (!kernel_text_address((unsigned long) p->addr) || 1150 if (!kernel_text_address((unsigned long) p->addr) ||
1149 in_kprobes_functions((unsigned long) p->addr) || 1151 in_kprobes_functions((unsigned long) p->addr) ||
1150 ftrace_text_reserved(p->addr, p->addr) || 1152 ftrace_text_reserved(p->addr, p->addr) ||
1151 jump_label_text_reserved(p->addr, p->addr)) { 1153 jump_label_text_reserved(p->addr, p->addr))
1152 preempt_enable(); 1154 goto fail_with_jump_label;
1153 return -EINVAL;
1154 }
1155 1155
1156 /* User can pass only KPROBE_FLAG_DISABLED to register_kprobe */ 1156 /* User can pass only KPROBE_FLAG_DISABLED to register_kprobe */
1157 p->flags &= KPROBE_FLAG_DISABLED; 1157 p->flags &= KPROBE_FLAG_DISABLED;
@@ -1165,10 +1165,9 @@ int __kprobes register_kprobe(struct kprobe *p)
1165 * We must hold a refcount of the probed module while updating 1165 * We must hold a refcount of the probed module while updating
1166 * its code to prohibit unexpected unloading. 1166 * its code to prohibit unexpected unloading.
1167 */ 1167 */
1168 if (unlikely(!try_module_get(probed_mod))) { 1168 if (unlikely(!try_module_get(probed_mod)))
1169 preempt_enable(); 1169 goto fail_with_jump_label;
1170 return -EINVAL; 1170
1171 }
1172 /* 1171 /*
1173 * If the module freed .init.text, we couldn't insert 1172 * If the module freed .init.text, we couldn't insert
1174 * kprobes in there. 1173 * kprobes in there.
@@ -1176,16 +1175,18 @@ int __kprobes register_kprobe(struct kprobe *p)
1176 if (within_module_init((unsigned long)p->addr, probed_mod) && 1175 if (within_module_init((unsigned long)p->addr, probed_mod) &&
1177 probed_mod->state != MODULE_STATE_COMING) { 1176 probed_mod->state != MODULE_STATE_COMING) {
1178 module_put(probed_mod); 1177 module_put(probed_mod);
1179 preempt_enable(); 1178 goto fail_with_jump_label;
1180 return -EINVAL;
1181 } 1179 }
1182 } 1180 }
1183 preempt_enable(); 1181 preempt_enable();
1182 jump_label_unlock();
1184 1183
1185 p->nmissed = 0; 1184 p->nmissed = 0;
1186 INIT_LIST_HEAD(&p->list); 1185 INIT_LIST_HEAD(&p->list);
1187 mutex_lock(&kprobe_mutex); 1186 mutex_lock(&kprobe_mutex);
1188 1187
1188 jump_label_lock(); /* needed to call jump_label_text_reserved() */
1189
1189 get_online_cpus(); /* For avoiding text_mutex deadlock. */ 1190 get_online_cpus(); /* For avoiding text_mutex deadlock. */
1190 mutex_lock(&text_mutex); 1191 mutex_lock(&text_mutex);
1191 1192
@@ -1213,12 +1214,18 @@ int __kprobes register_kprobe(struct kprobe *p)
1213out: 1214out:
1214 mutex_unlock(&text_mutex); 1215 mutex_unlock(&text_mutex);
1215 put_online_cpus(); 1216 put_online_cpus();
1217 jump_label_unlock();
1216 mutex_unlock(&kprobe_mutex); 1218 mutex_unlock(&kprobe_mutex);
1217 1219
1218 if (probed_mod) 1220 if (probed_mod)
1219 module_put(probed_mod); 1221 module_put(probed_mod);
1220 1222
1221 return ret; 1223 return ret;
1224
1225fail_with_jump_label:
1226 preempt_enable();
1227 jump_label_unlock();
1228 return -EINVAL;
1222} 1229}
1223EXPORT_SYMBOL_GPL(register_kprobe); 1230EXPORT_SYMBOL_GPL(register_kprobe);
1224 1231
@@ -2000,6 +2007,7 @@ static ssize_t write_enabled_file_bool(struct file *file,
2000static const struct file_operations fops_kp = { 2007static const struct file_operations fops_kp = {
2001 .read = read_enabled_file_bool, 2008 .read = read_enabled_file_bool,
2002 .write = write_enabled_file_bool, 2009 .write = write_enabled_file_bool,
2010 .llseek = default_llseek,
2003}; 2011};
2004 2012
2005static int __kprobes debugfs_kprobe_init(void) 2013static int __kprobes debugfs_kprobe_init(void)
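register_kprobe() above now takes jump_label_lock() around the text-reservation checks (and again, nested inside kprobe_mutex, around the arming), and the repeated "preempt_enable(); return -EINVAL;" exits collapse into a single fail_with_jump_label label that unwinds both. A userspace sketch of that single-exit unwind pattern; the pthread mutexes merely stand in for jump_label_lock() and preempt_disable(), they are not the kernel primitives.

#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t lock_a = PTHREAD_MUTEX_INITIALIZER; /* ~ jump_label_lock */
static pthread_mutex_t lock_b = PTHREAD_MUTEX_INITIALIZER; /* ~ preempt_disable */

static int address_is_reserved(long addr)       /* stand-in for the checks */
{
        return addr == 0;
}

static int register_probe(long addr)
{
        pthread_mutex_lock(&lock_a);
        pthread_mutex_lock(&lock_b);

        if (address_is_reserved(addr))
                goto fail;                      /* one exit point undoes both */

        /* ... the real registration work would happen here ... */

        pthread_mutex_unlock(&lock_b);
        pthread_mutex_unlock(&lock_a);
        return 0;

fail:
        pthread_mutex_unlock(&lock_b);
        pthread_mutex_unlock(&lock_a);
        return -1;                              /* -EINVAL in the kernel */
}

int main(void)
{
        printf("ok=%d fail=%d\n", register_probe(4096), register_probe(0));
        return 0;
}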
diff --git a/kernel/latencytop.c b/kernel/latencytop.c
index 877fb306d415..17110a4a4fc2 100644
--- a/kernel/latencytop.c
+++ b/kernel/latencytop.c
@@ -194,14 +194,7 @@ __account_scheduler_latency(struct task_struct *tsk, int usecs, int inter)
194 194
195 account_global_scheduler_latency(tsk, &lat); 195 account_global_scheduler_latency(tsk, &lat);
196 196
197 /* 197 for (i = 0; i < tsk->latency_record_count; i++) {
198 * short term hack; if we're > 32 we stop; future we recycle:
199 */
200 tsk->latency_record_count++;
201 if (tsk->latency_record_count >= LT_SAVECOUNT)
202 goto out_unlock;
203
204 for (i = 0; i < LT_SAVECOUNT; i++) {
205 struct latency_record *mylat; 198 struct latency_record *mylat;
206 int same = 1; 199 int same = 1;
207 200
@@ -227,8 +220,14 @@ __account_scheduler_latency(struct task_struct *tsk, int usecs, int inter)
227 } 220 }
228 } 221 }
229 222
223 /*
224 * short term hack; if we're > 32 we stop; future we recycle:
225 */
226 if (tsk->latency_record_count >= LT_SAVECOUNT)
227 goto out_unlock;
228
230 /* Allocated a new one: */ 229 /* Allocated a new one: */
231 i = tsk->latency_record_count; 230 i = tsk->latency_record_count++;
232 memcpy(&tsk->latency_record[i], &lat, sizeof(struct latency_record)); 231 memcpy(&tsk->latency_record[i], &lat, sizeof(struct latency_record));
233 232
234out_unlock: 233out_unlock:
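The latencytop rework above scans the existing records first and only advances latency_record_count when a genuinely new slot is claimed; the old code bumped the counter before looking for a duplicate, so repeated backtraces still consumed slots. A userspace model of the corrected bookkeeping, with a simplified record type in place of struct latency_record.

#include <stdio.h>

#define LT_SAVECOUNT 32

struct latency_record { unsigned long backtrace; unsigned count; unsigned long time; };

static struct latency_record records[LT_SAVECOUNT];
static int latency_record_count;

static void account_latency(unsigned long backtrace, unsigned long usecs)
{
        int i;

        for (i = 0; i < latency_record_count; i++) {
                if (records[i].backtrace == backtrace) {        /* merge duplicate */
                        records[i].count++;
                        records[i].time += usecs;
                        return;
                }
        }

        if (latency_record_count >= LT_SAVECOUNT)               /* table full: drop */
                return;

        i = latency_record_count++;                             /* claim a new slot */
        records[i].backtrace = backtrace;
        records[i].count = 1;
        records[i].time = usecs;
}

int main(void)
{
        account_latency(0xdead, 100);
        account_latency(0xdead, 50);            /* merged, no new slot */
        account_latency(0xbeef, 10);
        printf("slots used: %d\n", latency_record_count);       /* prints 2 */
        return 0;
}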
diff --git a/kernel/module.c b/kernel/module.c
index 2df46301a7a4..437a74a7524a 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -2037,7 +2037,7 @@ static inline void layout_symtab(struct module *mod, struct load_info *info)
2037{ 2037{
2038} 2038}
2039 2039
2040static void add_kallsyms(struct module *mod, struct load_info *info) 2040static void add_kallsyms(struct module *mod, const struct load_info *info)
2041{ 2041{
2042} 2042}
2043#endif /* CONFIG_KALLSYMS */ 2043#endif /* CONFIG_KALLSYMS */
diff --git a/kernel/ns_cgroup.c b/kernel/ns_cgroup.c
index 2a5dfec8efe0..2c98ad94ba0e 100644
--- a/kernel/ns_cgroup.c
+++ b/kernel/ns_cgroup.c
@@ -85,6 +85,14 @@ static struct cgroup_subsys_state *ns_create(struct cgroup_subsys *ss,
85 return ERR_PTR(-EPERM); 85 return ERR_PTR(-EPERM);
86 if (!cgroup_is_descendant(cgroup, current)) 86 if (!cgroup_is_descendant(cgroup, current))
87 return ERR_PTR(-EPERM); 87 return ERR_PTR(-EPERM);
88 if (test_bit(CGRP_CLONE_CHILDREN, &cgroup->flags)) {
89 printk("ns_cgroup can't be created with parent "
90 "'clone_children' set.\n");
91 return ERR_PTR(-EINVAL);
92 }
93
94 printk_once("ns_cgroup deprecated: consider using the "
95 "'clone_children' flag without the ns_cgroup.\n");
88 96
89 ns_cgroup = kzalloc(sizeof(*ns_cgroup), GFP_KERNEL); 97 ns_cgroup = kzalloc(sizeof(*ns_cgroup), GFP_KERNEL);
90 if (!ns_cgroup) 98 if (!ns_cgroup)
diff --git a/kernel/perf_event.c b/kernel/perf_event.c
index f309e8014c78..cb6c0d2af68f 100644
--- a/kernel/perf_event.c
+++ b/kernel/perf_event.c
@@ -417,8 +417,8 @@ event_filter_match(struct perf_event *event)
417 return event->cpu == -1 || event->cpu == smp_processor_id(); 417 return event->cpu == -1 || event->cpu == smp_processor_id();
418} 418}
419 419
420static int 420static void
421__event_sched_out(struct perf_event *event, 421event_sched_out(struct perf_event *event,
422 struct perf_cpu_context *cpuctx, 422 struct perf_cpu_context *cpuctx,
423 struct perf_event_context *ctx) 423 struct perf_event_context *ctx)
424{ 424{
@@ -437,13 +437,14 @@ __event_sched_out(struct perf_event *event,
437 } 437 }
438 438
439 if (event->state != PERF_EVENT_STATE_ACTIVE) 439 if (event->state != PERF_EVENT_STATE_ACTIVE)
440 return 0; 440 return;
441 441
442 event->state = PERF_EVENT_STATE_INACTIVE; 442 event->state = PERF_EVENT_STATE_INACTIVE;
443 if (event->pending_disable) { 443 if (event->pending_disable) {
444 event->pending_disable = 0; 444 event->pending_disable = 0;
445 event->state = PERF_EVENT_STATE_OFF; 445 event->state = PERF_EVENT_STATE_OFF;
446 } 446 }
447 event->tstamp_stopped = ctx->time;
447 event->pmu->del(event, 0); 448 event->pmu->del(event, 0);
448 event->oncpu = -1; 449 event->oncpu = -1;
449 450
@@ -452,19 +453,6 @@ __event_sched_out(struct perf_event *event,
452 ctx->nr_active--; 453 ctx->nr_active--;
453 if (event->attr.exclusive || !cpuctx->active_oncpu) 454 if (event->attr.exclusive || !cpuctx->active_oncpu)
454 cpuctx->exclusive = 0; 455 cpuctx->exclusive = 0;
455 return 1;
456}
457
458static void
459event_sched_out(struct perf_event *event,
460 struct perf_cpu_context *cpuctx,
461 struct perf_event_context *ctx)
462{
463 int ret;
464
465 ret = __event_sched_out(event, cpuctx, ctx);
466 if (ret)
467 event->tstamp_stopped = ctx->time;
468} 456}
469 457
470static void 458static void
@@ -664,7 +652,7 @@ retry:
664} 652}
665 653
666static int 654static int
667__event_sched_in(struct perf_event *event, 655event_sched_in(struct perf_event *event,
668 struct perf_cpu_context *cpuctx, 656 struct perf_cpu_context *cpuctx,
669 struct perf_event_context *ctx) 657 struct perf_event_context *ctx)
670{ 658{
@@ -684,6 +672,10 @@ __event_sched_in(struct perf_event *event,
684 return -EAGAIN; 672 return -EAGAIN;
685 } 673 }
686 674
675 event->tstamp_running += ctx->time - event->tstamp_stopped;
676
677 event->shadow_ctx_time = ctx->time - ctx->timestamp;
678
687 if (!is_software_event(event)) 679 if (!is_software_event(event))
688 cpuctx->active_oncpu++; 680 cpuctx->active_oncpu++;
689 ctx->nr_active++; 681 ctx->nr_active++;
@@ -694,35 +686,6 @@ __event_sched_in(struct perf_event *event,
694 return 0; 686 return 0;
695} 687}
696 688
697static inline int
698event_sched_in(struct perf_event *event,
699 struct perf_cpu_context *cpuctx,
700 struct perf_event_context *ctx)
701{
702 int ret = __event_sched_in(event, cpuctx, ctx);
703 if (ret)
704 return ret;
705 event->tstamp_running += ctx->time - event->tstamp_stopped;
706 return 0;
707}
708
709static void
710group_commit_event_sched_in(struct perf_event *group_event,
711 struct perf_cpu_context *cpuctx,
712 struct perf_event_context *ctx)
713{
714 struct perf_event *event;
715 u64 now = ctx->time;
716
717 group_event->tstamp_running += now - group_event->tstamp_stopped;
718 /*
719 * Schedule in siblings as one group (if any):
720 */
721 list_for_each_entry(event, &group_event->sibling_list, group_entry) {
722 event->tstamp_running += now - event->tstamp_stopped;
723 }
724}
725
726static int 689static int
727group_sched_in(struct perf_event *group_event, 690group_sched_in(struct perf_event *group_event,
728 struct perf_cpu_context *cpuctx, 691 struct perf_cpu_context *cpuctx,
@@ -730,19 +693,15 @@ group_sched_in(struct perf_event *group_event,
730{ 693{
731 struct perf_event *event, *partial_group = NULL; 694 struct perf_event *event, *partial_group = NULL;
732 struct pmu *pmu = group_event->pmu; 695 struct pmu *pmu = group_event->pmu;
696 u64 now = ctx->time;
697 bool simulate = false;
733 698
734 if (group_event->state == PERF_EVENT_STATE_OFF) 699 if (group_event->state == PERF_EVENT_STATE_OFF)
735 return 0; 700 return 0;
736 701
737 pmu->start_txn(pmu); 702 pmu->start_txn(pmu);
738 703
739 /* 704 if (event_sched_in(group_event, cpuctx, ctx)) {
740 * use __event_sched_in() to delay updating tstamp_running
741 * until the transaction is committed. In case of failure
742 * we will keep an unmodified tstamp_running which is a
743 * requirement to get correct timing information
744 */
745 if (__event_sched_in(group_event, cpuctx, ctx)) {
746 pmu->cancel_txn(pmu); 705 pmu->cancel_txn(pmu);
747 return -EAGAIN; 706 return -EAGAIN;
748 } 707 }
@@ -751,31 +710,42 @@ group_sched_in(struct perf_event *group_event,
751 * Schedule in siblings as one group (if any): 710 * Schedule in siblings as one group (if any):
752 */ 711 */
753 list_for_each_entry(event, &group_event->sibling_list, group_entry) { 712 list_for_each_entry(event, &group_event->sibling_list, group_entry) {
754 if (__event_sched_in(event, cpuctx, ctx)) { 713 if (event_sched_in(event, cpuctx, ctx)) {
755 partial_group = event; 714 partial_group = event;
756 goto group_error; 715 goto group_error;
757 } 716 }
758 } 717 }
759 718
760 if (!pmu->commit_txn(pmu)) { 719 if (!pmu->commit_txn(pmu))
761 /* commit tstamp_running */
762 group_commit_event_sched_in(group_event, cpuctx, ctx);
763 return 0; 720 return 0;
764 } 721
765group_error: 722group_error:
766 /* 723 /*
767 * Groups can be scheduled in as one unit only, so undo any 724 * Groups can be scheduled in as one unit only, so undo any
768 * partial group before returning: 725 * partial group before returning:
726 * The events up to the failed event are scheduled out normally,
727 * tstamp_stopped will be updated.
769 * 728 *
770 * use __event_sched_out() to avoid updating tstamp_stopped 729 * The failed events and the remaining siblings need to have
771 * because the event never actually ran 730 * their timings updated as if they had gone thru event_sched_in()
731 * and event_sched_out(). This is required to get consistent timings
732 * across the group. This also takes care of the case where the group
733 * could never be scheduled by ensuring tstamp_stopped is set to mark
734 * the time the event was actually stopped, such that time delta
735 * calculation in update_event_times() is correct.
772 */ 736 */
773 list_for_each_entry(event, &group_event->sibling_list, group_entry) { 737 list_for_each_entry(event, &group_event->sibling_list, group_entry) {
774 if (event == partial_group) 738 if (event == partial_group)
775 break; 739 simulate = true;
776 __event_sched_out(event, cpuctx, ctx); 740
741 if (simulate) {
742 event->tstamp_running += now - event->tstamp_stopped;
743 event->tstamp_stopped = now;
744 } else {
745 event_sched_out(event, cpuctx, ctx);
746 }
777 } 747 }
778 __event_sched_out(group_event, cpuctx, ctx); 748 event_sched_out(group_event, cpuctx, ctx);
779 749
780 pmu->cancel_txn(pmu); 750 pmu->cancel_txn(pmu);
781 751
@@ -3428,7 +3398,8 @@ static u32 perf_event_tid(struct perf_event *event, struct task_struct *p)
3428} 3398}
3429 3399
3430static void perf_output_read_one(struct perf_output_handle *handle, 3400static void perf_output_read_one(struct perf_output_handle *handle,
3431 struct perf_event *event) 3401 struct perf_event *event,
3402 u64 enabled, u64 running)
3432{ 3403{
3433 u64 read_format = event->attr.read_format; 3404 u64 read_format = event->attr.read_format;
3434 u64 values[4]; 3405 u64 values[4];
@@ -3436,11 +3407,11 @@ static void perf_output_read_one(struct perf_output_handle *handle,
3436 3407
3437 values[n++] = perf_event_count(event); 3408 values[n++] = perf_event_count(event);
3438 if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) { 3409 if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) {
3439 values[n++] = event->total_time_enabled + 3410 values[n++] = enabled +
3440 atomic64_read(&event->child_total_time_enabled); 3411 atomic64_read(&event->child_total_time_enabled);
3441 } 3412 }
3442 if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) { 3413 if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) {
3443 values[n++] = event->total_time_running + 3414 values[n++] = running +
3444 atomic64_read(&event->child_total_time_running); 3415 atomic64_read(&event->child_total_time_running);
3445 } 3416 }
3446 if (read_format & PERF_FORMAT_ID) 3417 if (read_format & PERF_FORMAT_ID)
@@ -3453,7 +3424,8 @@ static void perf_output_read_one(struct perf_output_handle *handle,
3453 * XXX PERF_FORMAT_GROUP vs inherited events seems difficult. 3424 * XXX PERF_FORMAT_GROUP vs inherited events seems difficult.
3454 */ 3425 */
3455static void perf_output_read_group(struct perf_output_handle *handle, 3426static void perf_output_read_group(struct perf_output_handle *handle,
3456 struct perf_event *event) 3427 struct perf_event *event,
3428 u64 enabled, u64 running)
3457{ 3429{
3458 struct perf_event *leader = event->group_leader, *sub; 3430 struct perf_event *leader = event->group_leader, *sub;
3459 u64 read_format = event->attr.read_format; 3431 u64 read_format = event->attr.read_format;
@@ -3463,10 +3435,10 @@ static void perf_output_read_group(struct perf_output_handle *handle,
3463 values[n++] = 1 + leader->nr_siblings; 3435 values[n++] = 1 + leader->nr_siblings;
3464 3436
3465 if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) 3437 if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED)
3466 values[n++] = leader->total_time_enabled; 3438 values[n++] = enabled;
3467 3439
3468 if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) 3440 if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING)
3469 values[n++] = leader->total_time_running; 3441 values[n++] = running;
3470 3442
3471 if (leader != event) 3443 if (leader != event)
3472 leader->pmu->read(leader); 3444 leader->pmu->read(leader);
@@ -3491,13 +3463,35 @@ static void perf_output_read_group(struct perf_output_handle *handle,
3491 } 3463 }
3492} 3464}
3493 3465
3466#define PERF_FORMAT_TOTAL_TIMES (PERF_FORMAT_TOTAL_TIME_ENABLED|\
3467 PERF_FORMAT_TOTAL_TIME_RUNNING)
3468
3494static void perf_output_read(struct perf_output_handle *handle, 3469static void perf_output_read(struct perf_output_handle *handle,
3495 struct perf_event *event) 3470 struct perf_event *event)
3496{ 3471{
3472 u64 enabled = 0, running = 0, now, ctx_time;
3473 u64 read_format = event->attr.read_format;
3474
3475 /*
3476 * compute total_time_enabled, total_time_running
3477 * based on snapshot values taken when the event
3478 * was last scheduled in.
3479 *
3480 * we cannot simply call update_context_time()
3481 * because of a locking issue: we are called in
3482 * NMI context
3483 */
3484 if (read_format & PERF_FORMAT_TOTAL_TIMES) {
3485 now = perf_clock();
3486 ctx_time = event->shadow_ctx_time + now;
3487 enabled = ctx_time - event->tstamp_enabled;
3488 running = ctx_time - event->tstamp_running;
3489 }
3490
3497 if (event->attr.read_format & PERF_FORMAT_GROUP) 3491 if (event->attr.read_format & PERF_FORMAT_GROUP)
3498 perf_output_read_group(handle, event); 3492 perf_output_read_group(handle, event, enabled, running);
3499 else 3493 else
3500 perf_output_read_one(handle, event); 3494 perf_output_read_one(handle, event, enabled, running);
3501} 3495}
3502 3496
3503void perf_output_sample(struct perf_output_handle *handle, 3497void perf_output_sample(struct perf_output_handle *handle,
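The perf_event rework above folds __event_sched_in/out back into event_sched_in/out, snapshots shadow_ctx_time at sched-in, and lets perf_output_read() derive the enabled/running times from those snapshots, since it can run in NMI context where update_context_time() must not be called. A plain arithmetic model of that derivation; the field names follow the patch, the numbers are invented.

#include <stdio.h>

typedef unsigned long long u64;

struct event {
        u64 shadow_ctx_time;    /* ctx->time - ctx->timestamp at sched-in */
        u64 tstamp_enabled;
        u64 tstamp_running;
};

static void output_read_times(const struct event *e, u64 now,
                              u64 *enabled, u64 *running)
{
        u64 ctx_time = e->shadow_ctx_time + now;        /* reconstructed ctx->time */

        *enabled = ctx_time - e->tstamp_enabled;
        *running = ctx_time - e->tstamp_running;
}

int main(void)
{
        struct event e = { .shadow_ctx_time = 40,
                           .tstamp_enabled = 100,
                           .tstamp_running = 120 };
        u64 enabled, running;

        output_read_times(&e, 200, &enabled, &running);
        printf("enabled=%llu running=%llu\n", enabled, running); /* 140, 120 */
        return 0;
}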
diff --git a/kernel/pm_qos_params.c b/kernel/pm_qos_params.c
index 645e541a45f6..c7a8f453919e 100644
--- a/kernel/pm_qos_params.c
+++ b/kernel/pm_qos_params.c
@@ -110,6 +110,7 @@ static const struct file_operations pm_qos_power_fops = {
110 .write = pm_qos_power_write, 110 .write = pm_qos_power_write,
111 .open = pm_qos_power_open, 111 .open = pm_qos_power_open,
112 .release = pm_qos_power_release, 112 .release = pm_qos_power_release,
113 .llseek = noop_llseek,
113}; 114};
114 115
115/* unlocked internal variant */ 116/* unlocked internal variant */
@@ -398,7 +399,7 @@ static ssize_t pm_qos_power_write(struct file *filp, const char __user *buf,
398 } else 399 } else
399 return -EINVAL; 400 return -EINVAL;
400 401
401 pm_qos_req = (struct pm_qos_request_list *)filp->private_data; 402 pm_qos_req = filp->private_data;
402 pm_qos_update_request(pm_qos_req, value); 403 pm_qos_update_request(pm_qos_req, value);
403 404
404 return count; 405 return count;
diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c
index ac7eb109f196..0dac75ea4456 100644
--- a/kernel/power/snapshot.c
+++ b/kernel/power/snapshot.c
@@ -984,8 +984,8 @@ static void copy_data_page(unsigned long dst_pfn, unsigned long src_pfn)
984 src = kmap_atomic(s_page, KM_USER0); 984 src = kmap_atomic(s_page, KM_USER0);
985 dst = kmap_atomic(d_page, KM_USER1); 985 dst = kmap_atomic(d_page, KM_USER1);
986 do_copy_page(dst, src); 986 do_copy_page(dst, src);
987 kunmap_atomic(src, KM_USER0);
988 kunmap_atomic(dst, KM_USER1); 987 kunmap_atomic(dst, KM_USER1);
988 kunmap_atomic(src, KM_USER0);
989 } else { 989 } else {
990 if (PageHighMem(d_page)) { 990 if (PageHighMem(d_page)) {
991 /* Page pointed to by src may contain some kernel 991 /* Page pointed to by src may contain some kernel
@@ -993,7 +993,7 @@ static void copy_data_page(unsigned long dst_pfn, unsigned long src_pfn)
993 */ 993 */
994 safe_copy_page(buffer, s_page); 994 safe_copy_page(buffer, s_page);
995 dst = kmap_atomic(d_page, KM_USER0); 995 dst = kmap_atomic(d_page, KM_USER0);
996 memcpy(dst, buffer, PAGE_SIZE); 996 copy_page(dst, buffer);
997 kunmap_atomic(dst, KM_USER0); 997 kunmap_atomic(dst, KM_USER0);
998 } else { 998 } else {
999 safe_copy_page(page_address(d_page), s_page); 999 safe_copy_page(page_address(d_page), s_page);
@@ -1687,7 +1687,7 @@ int snapshot_read_next(struct snapshot_handle *handle)
1687 memory_bm_position_reset(&orig_bm); 1687 memory_bm_position_reset(&orig_bm);
1688 memory_bm_position_reset(&copy_bm); 1688 memory_bm_position_reset(&copy_bm);
1689 } else if (handle->cur <= nr_meta_pages) { 1689 } else if (handle->cur <= nr_meta_pages) {
1690 memset(buffer, 0, PAGE_SIZE); 1690 clear_page(buffer);
1691 pack_pfns(buffer, &orig_bm); 1691 pack_pfns(buffer, &orig_bm);
1692 } else { 1692 } else {
1693 struct page *page; 1693 struct page *page;
@@ -1701,7 +1701,7 @@ int snapshot_read_next(struct snapshot_handle *handle)
1701 void *kaddr; 1701 void *kaddr;
1702 1702
1703 kaddr = kmap_atomic(page, KM_USER0); 1703 kaddr = kmap_atomic(page, KM_USER0);
1704 memcpy(buffer, kaddr, PAGE_SIZE); 1704 copy_page(buffer, kaddr);
1705 kunmap_atomic(kaddr, KM_USER0); 1705 kunmap_atomic(kaddr, KM_USER0);
1706 handle->buffer = buffer; 1706 handle->buffer = buffer;
1707 } else { 1707 } else {
@@ -1984,7 +1984,7 @@ static void copy_last_highmem_page(void)
1984 void *dst; 1984 void *dst;
1985 1985
1986 dst = kmap_atomic(last_highmem_page, KM_USER0); 1986 dst = kmap_atomic(last_highmem_page, KM_USER0);
1987 memcpy(dst, buffer, PAGE_SIZE); 1987 copy_page(dst, buffer);
1988 kunmap_atomic(dst, KM_USER0); 1988 kunmap_atomic(dst, KM_USER0);
1989 last_highmem_page = NULL; 1989 last_highmem_page = NULL;
1990 } 1990 }
@@ -2270,11 +2270,11 @@ swap_two_pages_data(struct page *p1, struct page *p2, void *buf)
2270 2270
2271 kaddr1 = kmap_atomic(p1, KM_USER0); 2271 kaddr1 = kmap_atomic(p1, KM_USER0);
2272 kaddr2 = kmap_atomic(p2, KM_USER1); 2272 kaddr2 = kmap_atomic(p2, KM_USER1);
2273 memcpy(buf, kaddr1, PAGE_SIZE); 2273 copy_page(buf, kaddr1);
2274 memcpy(kaddr1, kaddr2, PAGE_SIZE); 2274 copy_page(kaddr1, kaddr2);
2275 memcpy(kaddr2, buf, PAGE_SIZE); 2275 copy_page(kaddr2, buf);
2276 kunmap_atomic(kaddr1, KM_USER0);
2277 kunmap_atomic(kaddr2, KM_USER1); 2276 kunmap_atomic(kaddr2, KM_USER1);
2277 kunmap_atomic(kaddr1, KM_USER0);
2278} 2278}
2279 2279
2280/** 2280/**
diff --git a/kernel/power/swap.c b/kernel/power/swap.c
index 916eaa790399..a0e4a86ccf94 100644
--- a/kernel/power/swap.c
+++ b/kernel/power/swap.c
@@ -251,7 +251,7 @@ static int write_page(void *buf, sector_t offset, struct bio **bio_chain)
251 if (bio_chain) { 251 if (bio_chain) {
252 src = (void *)__get_free_page(__GFP_WAIT | __GFP_HIGH); 252 src = (void *)__get_free_page(__GFP_WAIT | __GFP_HIGH);
253 if (src) { 253 if (src) {
254 memcpy(src, buf, PAGE_SIZE); 254 copy_page(src, buf);
255 } else { 255 } else {
256 WARN_ON_ONCE(1); 256 WARN_ON_ONCE(1);
257 bio_chain = NULL; /* Go synchronous */ 257 bio_chain = NULL; /* Go synchronous */
@@ -325,7 +325,7 @@ static int swap_write_page(struct swap_map_handle *handle, void *buf,
325 error = write_page(handle->cur, handle->cur_swap, NULL); 325 error = write_page(handle->cur, handle->cur_swap, NULL);
326 if (error) 326 if (error)
327 goto out; 327 goto out;
328 memset(handle->cur, 0, PAGE_SIZE); 328 clear_page(handle->cur);
329 handle->cur_swap = offset; 329 handle->cur_swap = offset;
330 handle->k = 0; 330 handle->k = 0;
331 } 331 }
@@ -910,7 +910,7 @@ int swsusp_check(void)
910 hib_resume_bdev = open_by_devnum(swsusp_resume_device, FMODE_READ); 910 hib_resume_bdev = open_by_devnum(swsusp_resume_device, FMODE_READ);
911 if (!IS_ERR(hib_resume_bdev)) { 911 if (!IS_ERR(hib_resume_bdev)) {
912 set_blocksize(hib_resume_bdev, PAGE_SIZE); 912 set_blocksize(hib_resume_bdev, PAGE_SIZE);
913 memset(swsusp_header, 0, PAGE_SIZE); 913 clear_page(swsusp_header);
914 error = hib_bio_read_page(swsusp_resume_block, 914 error = hib_bio_read_page(swsusp_resume_block,
915 swsusp_header, NULL); 915 swsusp_header, NULL);
916 if (error) 916 if (error)
diff --git a/kernel/printk.c b/kernel/printk.c
index 2531017795f6..9a2264fc42ca 100644
--- a/kernel/printk.c
+++ b/kernel/printk.c
@@ -210,7 +210,7 @@ __setup("log_buf_len=", log_buf_len_setup);
210 210
211#ifdef CONFIG_BOOT_PRINTK_DELAY 211#ifdef CONFIG_BOOT_PRINTK_DELAY
212 212
213static unsigned int boot_delay; /* msecs delay after each printk during bootup */ 213static int boot_delay; /* msecs delay after each printk during bootup */
214static unsigned long long loops_per_msec; /* based on boot_delay */ 214static unsigned long long loops_per_msec; /* based on boot_delay */
215 215
216static int __init boot_delay_setup(char *str) 216static int __init boot_delay_setup(char *str)
@@ -261,6 +261,12 @@ static inline void boot_delay_msec(void)
261} 261}
262#endif 262#endif
263 263
264#ifdef CONFIG_SECURITY_DMESG_RESTRICT
265int dmesg_restrict = 1;
266#else
267int dmesg_restrict;
268#endif
269
264int do_syslog(int type, char __user *buf, int len, bool from_file) 270int do_syslog(int type, char __user *buf, int len, bool from_file)
265{ 271{
266 unsigned i, j, limit, count; 272 unsigned i, j, limit, count;
@@ -268,7 +274,20 @@ int do_syslog(int type, char __user *buf, int len, bool from_file)
268 char c; 274 char c;
269 int error = 0; 275 int error = 0;
270 276
271 error = security_syslog(type, from_file); 277 /*
278 * If this is from /proc/kmsg we only do the capabilities checks
279 * at open time.
280 */
281 if (type == SYSLOG_ACTION_OPEN || !from_file) {
282 if (dmesg_restrict && !capable(CAP_SYS_ADMIN))
283 return -EPERM;
284 if ((type != SYSLOG_ACTION_READ_ALL &&
285 type != SYSLOG_ACTION_SIZE_BUFFER) &&
286 !capable(CAP_SYS_ADMIN))
287 return -EPERM;
288 }
289
290 error = security_syslog(type);
272 if (error) 291 if (error)
273 return error; 292 return error;
274 293
@@ -647,6 +666,7 @@ static inline int can_use_console(unsigned int cpu)
647 * released but interrupts still disabled. 666 * released but interrupts still disabled.
648 */ 667 */
649static int acquire_console_semaphore_for_printk(unsigned int cpu) 668static int acquire_console_semaphore_for_printk(unsigned int cpu)
669 __releases(&logbuf_lock)
650{ 670{
651 int retval = 0; 671 int retval = 0;
652 672
@@ -1511,7 +1531,7 @@ int kmsg_dump_unregister(struct kmsg_dumper *dumper)
1511} 1531}
1512EXPORT_SYMBOL_GPL(kmsg_dump_unregister); 1532EXPORT_SYMBOL_GPL(kmsg_dump_unregister);
1513 1533
1514static const char const *kmsg_reasons[] = { 1534static const char * const kmsg_reasons[] = {
1515 [KMSG_DUMP_OOPS] = "oops", 1535 [KMSG_DUMP_OOPS] = "oops",
1516 [KMSG_DUMP_PANIC] = "panic", 1536 [KMSG_DUMP_PANIC] = "panic",
1517 [KMSG_DUMP_KEXEC] = "kexec", 1537 [KMSG_DUMP_KEXEC] = "kexec",
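Beyond the dmesg_restrict checks, the last printk hunk fixes a const-placement slip: "const char const *" only repeats the qualifier on the pointee, leaving the array of pointers writable, whereas the intended "const char * const" makes both the strings and the pointers read-only. A standalone illustration of the difference.

#include <stdio.h>

static const char * const reasons[] = { "oops", "panic", "kexec" };

int main(void)
{
        unsigned i;

        /* reasons[0] = "other";   would not compile: the pointers are const */
        /* reasons[0][0] = 'X';    would not compile: the strings are const  */
        for (i = 0; i < sizeof(reasons) / sizeof(reasons[0]); i++)
                printf("%s\n", reasons[i]);
        return 0;
}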
diff --git a/kernel/profile.c b/kernel/profile.c
index b22a899934cc..66f841b7fbd3 100644
--- a/kernel/profile.c
+++ b/kernel/profile.c
@@ -555,6 +555,7 @@ static ssize_t write_profile(struct file *file, const char __user *buf,
555static const struct file_operations proc_profile_operations = { 555static const struct file_operations proc_profile_operations = {
556 .read = read_profile, 556 .read = read_profile,
557 .write = write_profile, 557 .write = write_profile,
558 .llseek = default_llseek,
558}; 559};
559 560
560#ifdef CONFIG_SMP 561#ifdef CONFIG_SMP
diff --git a/kernel/ptrace.c b/kernel/ptrace.c
index f34d798ef4a2..99bbaa3e5b0d 100644
--- a/kernel/ptrace.c
+++ b/kernel/ptrace.c
@@ -181,7 +181,7 @@ int ptrace_attach(struct task_struct *task)
181 * under ptrace. 181 * under ptrace.
182 */ 182 */
183 retval = -ERESTARTNOINTR; 183 retval = -ERESTARTNOINTR;
184 if (mutex_lock_interruptible(&task->cred_guard_mutex)) 184 if (mutex_lock_interruptible(&task->signal->cred_guard_mutex))
185 goto out; 185 goto out;
186 186
187 task_lock(task); 187 task_lock(task);
@@ -208,7 +208,7 @@ int ptrace_attach(struct task_struct *task)
208unlock_tasklist: 208unlock_tasklist:
209 write_unlock_irq(&tasklist_lock); 209 write_unlock_irq(&tasklist_lock);
210unlock_creds: 210unlock_creds:
211 mutex_unlock(&task->cred_guard_mutex); 211 mutex_unlock(&task->signal->cred_guard_mutex);
212out: 212out:
213 return retval; 213 return retval;
214} 214}
@@ -329,6 +329,8 @@ int ptrace_detach(struct task_struct *child, unsigned int data)
329 * and reacquire the lock. 329 * and reacquire the lock.
330 */ 330 */
331void exit_ptrace(struct task_struct *tracer) 331void exit_ptrace(struct task_struct *tracer)
332 __releases(&tasklist_lock)
333 __acquires(&tasklist_lock)
332{ 334{
333 struct task_struct *p, *n; 335 struct task_struct *p, *n;
334 LIST_HEAD(ptrace_dead); 336 LIST_HEAD(ptrace_dead);
@@ -402,7 +404,7 @@ int ptrace_writedata(struct task_struct *tsk, char __user *src, unsigned long ds
402 return copied; 404 return copied;
403} 405}
404 406
405static int ptrace_setoptions(struct task_struct *child, long data) 407static int ptrace_setoptions(struct task_struct *child, unsigned long data)
406{ 408{
407 child->ptrace &= ~PT_TRACE_MASK; 409 child->ptrace &= ~PT_TRACE_MASK;
408 410
@@ -481,7 +483,8 @@ static int ptrace_setsiginfo(struct task_struct *child, const siginfo_t *info)
481#define is_sysemu_singlestep(request) 0 483#define is_sysemu_singlestep(request) 0
482#endif 484#endif
483 485
484static int ptrace_resume(struct task_struct *child, long request, long data) 486static int ptrace_resume(struct task_struct *child, long request,
487 unsigned long data)
485{ 488{
486 if (!valid_signal(data)) 489 if (!valid_signal(data))
487 return -EIO; 490 return -EIO;
@@ -558,10 +561,12 @@ static int ptrace_regset(struct task_struct *task, int req, unsigned int type,
558#endif 561#endif
559 562
560int ptrace_request(struct task_struct *child, long request, 563int ptrace_request(struct task_struct *child, long request,
561 long addr, long data) 564 unsigned long addr, unsigned long data)
562{ 565{
563 int ret = -EIO; 566 int ret = -EIO;
564 siginfo_t siginfo; 567 siginfo_t siginfo;
568 void __user *datavp = (void __user *) data;
569 unsigned long __user *datalp = datavp;
565 570
566 switch (request) { 571 switch (request) {
567 case PTRACE_PEEKTEXT: 572 case PTRACE_PEEKTEXT:
@@ -578,19 +583,17 @@ int ptrace_request(struct task_struct *child, long request,
578 ret = ptrace_setoptions(child, data); 583 ret = ptrace_setoptions(child, data);
579 break; 584 break;
580 case PTRACE_GETEVENTMSG: 585 case PTRACE_GETEVENTMSG:
581 ret = put_user(child->ptrace_message, (unsigned long __user *) data); 586 ret = put_user(child->ptrace_message, datalp);
582 break; 587 break;
583 588
584 case PTRACE_GETSIGINFO: 589 case PTRACE_GETSIGINFO:
585 ret = ptrace_getsiginfo(child, &siginfo); 590 ret = ptrace_getsiginfo(child, &siginfo);
586 if (!ret) 591 if (!ret)
587 ret = copy_siginfo_to_user((siginfo_t __user *) data, 592 ret = copy_siginfo_to_user(datavp, &siginfo);
588 &siginfo);
589 break; 593 break;
590 594
591 case PTRACE_SETSIGINFO: 595 case PTRACE_SETSIGINFO:
592 if (copy_from_user(&siginfo, (siginfo_t __user *) data, 596 if (copy_from_user(&siginfo, datavp, sizeof siginfo))
593 sizeof siginfo))
594 ret = -EFAULT; 597 ret = -EFAULT;
595 else 598 else
596 ret = ptrace_setsiginfo(child, &siginfo); 599 ret = ptrace_setsiginfo(child, &siginfo);
@@ -621,7 +624,7 @@ int ptrace_request(struct task_struct *child, long request,
621 } 624 }
622 mmput(mm); 625 mmput(mm);
623 626
624 ret = put_user(tmp, (unsigned long __user *) data); 627 ret = put_user(tmp, datalp);
625 break; 628 break;
626 } 629 }
627#endif 630#endif
@@ -650,7 +653,7 @@ int ptrace_request(struct task_struct *child, long request,
650 case PTRACE_SETREGSET: 653 case PTRACE_SETREGSET:
651 { 654 {
652 struct iovec kiov; 655 struct iovec kiov;
653 struct iovec __user *uiov = (struct iovec __user *) data; 656 struct iovec __user *uiov = datavp;
654 657
655 if (!access_ok(VERIFY_WRITE, uiov, sizeof(*uiov))) 658 if (!access_ok(VERIFY_WRITE, uiov, sizeof(*uiov)))
656 return -EFAULT; 659 return -EFAULT;
@@ -691,7 +694,8 @@ static struct task_struct *ptrace_get_task_struct(pid_t pid)
691#define arch_ptrace_attach(child) do { } while (0) 694#define arch_ptrace_attach(child) do { } while (0)
692#endif 695#endif
693 696
694SYSCALL_DEFINE4(ptrace, long, request, long, pid, long, addr, long, data) 697SYSCALL_DEFINE4(ptrace, long, request, long, pid, unsigned long, addr,
698 unsigned long, data)
695{ 699{
696 struct task_struct *child; 700 struct task_struct *child;
697 long ret; 701 long ret;
@@ -732,7 +736,8 @@ SYSCALL_DEFINE4(ptrace, long, request, long, pid, long, addr, long, data)
732 return ret; 736 return ret;
733} 737}
734 738
735int generic_ptrace_peekdata(struct task_struct *tsk, long addr, long data) 739int generic_ptrace_peekdata(struct task_struct *tsk, unsigned long addr,
740 unsigned long data)
736{ 741{
737 unsigned long tmp; 742 unsigned long tmp;
738 int copied; 743 int copied;
@@ -743,7 +748,8 @@ int generic_ptrace_peekdata(struct task_struct *tsk, long addr, long data)
743 return put_user(tmp, (unsigned long __user *)data); 748 return put_user(tmp, (unsigned long __user *)data);
744} 749}
745 750
746int generic_ptrace_pokedata(struct task_struct *tsk, long addr, long data) 751int generic_ptrace_pokedata(struct task_struct *tsk, unsigned long addr,
752 unsigned long data)
747{ 753{
748 int copied; 754 int copied;
749 755
diff --git a/kernel/range.c b/kernel/range.c
index 471b66acabb5..37fa9b99ad58 100644
--- a/kernel/range.c
+++ b/kernel/range.c
@@ -119,7 +119,7 @@ static int cmp_range(const void *x1, const void *x2)
119 119
120int clean_sort_range(struct range *range, int az) 120int clean_sort_range(struct range *range, int az)
121{ 121{
122 int i, j, k = az - 1, nr_range = 0; 122 int i, j, k = az - 1, nr_range = az;
123 123
124 for (i = 0; i < k; i++) { 124 for (i = 0; i < k; i++) {
125 if (range[i].end) 125 if (range[i].end)
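The clean_sort_range() one-liner only makes sense if the function's later counting pass lowers nr_range when it meets the first empty slot, which is what the fix implies: starting from 0 made a fully populated array report zero ranges, starting from az reports them all. A userspace model of just that counting step, with the compaction and sort left out.

#include <stdio.h>

struct range { unsigned long start, end; };

static int count_ranges(const struct range *range, int az)
{
        int i, nr_range = az;           /* the fix: assume every slot is used */

        for (i = 0; i < az; i++) {
                if (!range[i].end) {    /* first empty slot ends the list */
                        nr_range = i;
                        break;
                }
        }
        return nr_range;
}

int main(void)
{
        struct range full[2]    = { { 0, 10 }, { 20, 30 } };
        struct range partial[2] = { { 0, 10 }, {  0,  0 } };

        printf("%d %d\n", count_ranges(full, 2), count_ranges(partial, 2)); /* 2 1 */
        return 0;
}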
diff --git a/kernel/relay.c b/kernel/relay.c
index c7cf397fb929..859ea5a9605f 100644
--- a/kernel/relay.c
+++ b/kernel/relay.c
@@ -70,17 +70,10 @@ static const struct vm_operations_struct relay_file_mmap_ops = {
70 */ 70 */
71static struct page **relay_alloc_page_array(unsigned int n_pages) 71static struct page **relay_alloc_page_array(unsigned int n_pages)
72{ 72{
73 struct page **array; 73 const size_t pa_size = n_pages * sizeof(struct page *);
74 size_t pa_size = n_pages * sizeof(struct page *); 74 if (pa_size > PAGE_SIZE)
75 75 return vzalloc(pa_size);
76 if (pa_size > PAGE_SIZE) { 76 return kzalloc(pa_size, GFP_KERNEL);
77 array = vmalloc(pa_size);
78 if (array)
79 memset(array, 0, pa_size);
80 } else {
81 array = kzalloc(pa_size, GFP_KERNEL);
82 }
83 return array;
84} 77}
85 78
86/* 79/*
diff --git a/kernel/resource.c b/kernel/resource.c
index 7b36976e5dea..9fad33efd0db 100644
--- a/kernel/resource.c
+++ b/kernel/resource.c
@@ -40,6 +40,23 @@ EXPORT_SYMBOL(iomem_resource);
40 40
41static DEFINE_RWLOCK(resource_lock); 41static DEFINE_RWLOCK(resource_lock);
42 42
43/*
44 * By default, we allocate free space bottom-up. The architecture can request
45 * top-down by clearing this flag. The user can override the architecture's
46 * choice with the "resource_alloc_from_bottom" kernel boot option, but that
47 * should only be a debugging tool.
48 */
49int resource_alloc_from_bottom = 1;
50
51static __init int setup_alloc_from_bottom(char *s)
52{
53 printk(KERN_INFO
54 "resource: allocating from bottom-up; please report a bug\n");
55 resource_alloc_from_bottom = 1;
56 return 0;
57}
58early_param("resource_alloc_from_bottom", setup_alloc_from_bottom);
59
43static void *r_next(struct seq_file *m, void *v, loff_t *pos) 60static void *r_next(struct seq_file *m, void *v, loff_t *pos)
44{ 61{
45 struct resource *p = v; 62 struct resource *p = v;
@@ -357,8 +374,97 @@ int __weak page_is_ram(unsigned long pfn)
357 return walk_system_ram_range(pfn, 1, NULL, __is_ram) == 1; 374 return walk_system_ram_range(pfn, 1, NULL, __is_ram) == 1;
358} 375}
359 376
377static resource_size_t simple_align_resource(void *data,
378 const struct resource *avail,
379 resource_size_t size,
380 resource_size_t align)
381{
382 return avail->start;
383}
384
385static void resource_clip(struct resource *res, resource_size_t min,
386 resource_size_t max)
387{
388 if (res->start < min)
389 res->start = min;
390 if (res->end > max)
391 res->end = max;
392}
393
394static bool resource_contains(struct resource *res1, struct resource *res2)
395{
396 return res1->start <= res2->start && res1->end >= res2->end;
397}
398
399/*
400 * Find the resource before "child" in the sibling list of "root" children.
401 */
402static struct resource *find_sibling_prev(struct resource *root, struct resource *child)
403{
404 struct resource *this;
405
406 for (this = root->child; this; this = this->sibling)
407 if (this->sibling == child)
408 return this;
409
410 return NULL;
411}
412
413/*
414 * Find empty slot in the resource tree given range and alignment.
415 * This version allocates from the end of the root resource first.
416 */
417static int find_resource_from_top(struct resource *root, struct resource *new,
418 resource_size_t size, resource_size_t min,
419 resource_size_t max, resource_size_t align,
420 resource_size_t (*alignf)(void *,
421 const struct resource *,
422 resource_size_t,
423 resource_size_t),
424 void *alignf_data)
425{
426 struct resource *this;
427 struct resource tmp, avail, alloc;
428
429 tmp.start = root->end;
430 tmp.end = root->end;
431
432 this = find_sibling_prev(root, NULL);
433 for (;;) {
434 if (this) {
435 if (this->end < root->end)
436 tmp.start = this->end + 1;
437 } else
438 tmp.start = root->start;
439
440 resource_clip(&tmp, min, max);
441
442 /* Check for overflow after ALIGN() */
443 avail = *new;
444 avail.start = ALIGN(tmp.start, align);
445 avail.end = tmp.end;
446 if (avail.start >= tmp.start) {
447 alloc.start = alignf(alignf_data, &avail, size, align);
448 alloc.end = alloc.start + size - 1;
449 if (resource_contains(&avail, &alloc)) {
450 new->start = alloc.start;
451 new->end = alloc.end;
452 return 0;
453 }
454 }
455
456 if (!this || this->start == root->start)
457 break;
458
459 tmp.end = this->start - 1;
460 this = find_sibling_prev(root, this);
461 }
462 return -EBUSY;
463}
464
360/* 465/*
361 * Find empty slot in the resource tree given range and alignment. 466 * Find empty slot in the resource tree given range and alignment.
467 * This version allocates from the beginning of the root resource first.
362 */ 468 */
363static int find_resource(struct resource *root, struct resource *new, 469static int find_resource(struct resource *root, struct resource *new,
364 resource_size_t size, resource_size_t min, 470 resource_size_t size, resource_size_t min,
@@ -370,36 +476,43 @@ static int find_resource(struct resource *root, struct resource *new,
370 void *alignf_data) 476 void *alignf_data)
371{ 477{
372 struct resource *this = root->child; 478 struct resource *this = root->child;
373 struct resource tmp = *new; 479 struct resource tmp = *new, avail, alloc;
374 480
375 tmp.start = root->start; 481 tmp.start = root->start;
376 /* 482 /*
377 * Skip past an allocated resource that starts at 0, since the assignment 483 * Skip past an allocated resource that starts at 0, since the
378 * of this->start - 1 to tmp->end below would cause an underflow. 484 * assignment of this->start - 1 to tmp->end below would cause an
485 * underflow.
379 */ 486 */
380 if (this && this->start == 0) { 487 if (this && this->start == 0) {
381 tmp.start = this->end + 1; 488 tmp.start = this->end + 1;
382 this = this->sibling; 489 this = this->sibling;
383 } 490 }
384 for(;;) { 491 for (;;) {
385 if (this) 492 if (this)
386 tmp.end = this->start - 1; 493 tmp.end = this->start - 1;
387 else 494 else
388 tmp.end = root->end; 495 tmp.end = root->end;
389 if (tmp.start < min) 496
390 tmp.start = min; 497 resource_clip(&tmp, min, max);
391 if (tmp.end > max) 498
392 tmp.end = max; 499 /* Check for overflow after ALIGN() */
393 tmp.start = ALIGN(tmp.start, align); 500 avail = *new;
394 if (alignf) 501 avail.start = ALIGN(tmp.start, align);
395 tmp.start = alignf(alignf_data, &tmp, size, align); 502 avail.end = tmp.end;
396 if (tmp.start < tmp.end && tmp.end - tmp.start >= size - 1) { 503 if (avail.start >= tmp.start) {
397 new->start = tmp.start; 504 alloc.start = alignf(alignf_data, &avail, size, align);
398 new->end = tmp.start + size - 1; 505 alloc.end = alloc.start + size - 1;
399 return 0; 506 if (resource_contains(&avail, &alloc)) {
507 new->start = alloc.start;
508 new->end = alloc.end;
509 return 0;
510 }
400 } 511 }
512
401 if (!this) 513 if (!this)
402 break; 514 break;
515
403 tmp.start = this->end + 1; 516 tmp.start = this->end + 1;
404 this = this->sibling; 517 this = this->sibling;
405 } 518 }
@@ -428,8 +541,14 @@ int allocate_resource(struct resource *root, struct resource *new,
428{ 541{
429 int err; 542 int err;
430 543
544 if (!alignf)
545 alignf = simple_align_resource;
546
431 write_lock(&resource_lock); 547 write_lock(&resource_lock);
432 err = find_resource(root, new, size, min, max, align, alignf, alignf_data); 548 if (resource_alloc_from_bottom)
549 err = find_resource(root, new, size, min, max, align, alignf, alignf_data);
550 else
551 err = find_resource_from_top(root, new, size, min, max, align, alignf, alignf_data);
433 if (err >= 0 && __request_resource(root, new)) 552 if (err >= 0 && __request_resource(root, new))
434 err = -EBUSY; 553 err = -EBUSY;
435 write_unlock(&resource_lock); 554 write_unlock(&resource_lock);
@@ -453,6 +572,8 @@ static struct resource * __insert_resource(struct resource *parent, struct resou
453 572
454 if (first == parent) 573 if (first == parent)
455 return first; 574 return first;
575 if (WARN_ON(first == new)) /* duplicated insertion */
576 return first;
456 577
457 if ((first->start > new->start) || (first->end < new->end)) 578 if ((first->start > new->start) || (first->end < new->end))
458 break; 579 break;
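
Note on the resource.c hunk above: the new find_resource_from_top() walks the root's children from the last sibling backwards, clips each candidate gap with resource_clip(), aligns its start, and accepts the first (highest) gap that resource_contains() the requested slot. Below is a standalone sketch of the same gap-scanning idea over a plain array of busy ranges; struct range, find_from_top and the sample values are hypothetical simplifications, not the kernel's struct resource API.

#include <stdio.h>
#include <stdint.h>

struct range { uint64_t start, end; };	/* inclusive bounds, like struct resource */

/* Clip a candidate gap to the allowed window (mirrors resource_clip()). */
static void clip(struct range *r, uint64_t min, uint64_t max)
{
	if (r->start < min)
		r->start = min;
	if (r->end > max)
		r->end = max;
}

/* Mirrors resource_contains(): does a fully cover b? */
static int contains(struct range a, struct range b)
{
	return a.start <= b.start && a.end >= b.end;
}

/*
 * Scan the gaps between the busy[] entries starting from the top of root,
 * returning the first (i.e. highest) gap that can hold an aligned slot.
 */
static int find_from_top(struct range root, const struct range *busy, int n,
			 uint64_t size, uint64_t align, struct range *out)
{
	for (int i = n; i >= 0; i--) {
		struct range gap = {
			.start = (i == 0) ? root.start : busy[i - 1].end + 1,
			.end   = (i == n) ? root.end   : busy[i].start - 1,
		};
		struct range alloc;

		clip(&gap, root.start, root.end);
		gap.start = (gap.start + align - 1) & ~(align - 1);
		alloc.start = gap.start;
		alloc.end = alloc.start + size - 1;
		if (contains(gap, alloc)) {
			*out = alloc;
			return 0;
		}
	}
	return -1;	/* nothing fits, like the kernel's -EBUSY */
}

int main(void)
{
	struct range root = { 0x1000, 0xffff };
	struct range busy[] = { { 0x1000, 0x1fff }, { 0x8000, 0x8fff } };
	struct range slot;

	if (!find_from_top(root, busy, 2, 0x1000, 0x1000, &slot))
		printf("top-down slot: %#llx to %#llx\n",
		       (unsigned long long)slot.start,
		       (unsigned long long)slot.end);
	return 0;
}

With the sample ranges this picks the highest gap (0x9000 to 0x9fff) rather than the lower one between the two busy regions, which is the point of the top-down variant.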
diff --git a/kernel/rtmutex-tester.c b/kernel/rtmutex-tester.c
index a56f629b057a..66cb89bc5ef1 100644
--- a/kernel/rtmutex-tester.c
+++ b/kernel/rtmutex-tester.c
@@ -76,7 +76,9 @@ static int handle_op(struct test_thread_data *td, int lockwakeup)
76 } 76 }
77 77
78 if (!lockwakeup && td->bkl == 4) { 78 if (!lockwakeup && td->bkl == 4) {
79#ifdef CONFIG_LOCK_KERNEL
79 unlock_kernel(); 80 unlock_kernel();
81#endif
80 td->bkl = 0; 82 td->bkl = 0;
81 } 83 }
82 return 0; 84 return 0;
@@ -133,14 +135,18 @@ static int handle_op(struct test_thread_data *td, int lockwakeup)
133 if (td->bkl) 135 if (td->bkl)
134 return 0; 136 return 0;
135 td->bkl = 1; 137 td->bkl = 1;
138#ifdef CONFIG_LOCK_KERNEL
136 lock_kernel(); 139 lock_kernel();
140#endif
137 td->bkl = 4; 141 td->bkl = 4;
138 return 0; 142 return 0;
139 143
140 case RTTEST_UNLOCKBKL: 144 case RTTEST_UNLOCKBKL:
141 if (td->bkl != 4) 145 if (td->bkl != 4)
142 break; 146 break;
147#ifdef CONFIG_LOCK_KERNEL
143 unlock_kernel(); 148 unlock_kernel();
149#endif
144 td->bkl = 0; 150 td->bkl = 0;
145 return 0; 151 return 0;
146 152
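
Note on the rtmutex-tester.c hunk: the BKL calls are wrapped in CONFIG_LOCK_KERNEL so the tester still builds when the big kernel lock is configured out, while the td->bkl bookkeeping stays unconditional. A minimal sketch of that guard pattern, using a hypothetical helper name (tester_drop_bkl):

/* Hypothetical helper showing the guard applied in the hunk above: the BKL
 * call only compiles when CONFIG_LOCK_KERNEL is set, the bookkeeping runs
 * either way. */
static void tester_drop_bkl(struct test_thread_data *td)
{
#ifdef CONFIG_LOCK_KERNEL
	unlock_kernel();
#endif
	td->bkl = 0;
}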
diff --git a/kernel/sched.c b/kernel/sched.c
index 51944e8c38a8..41f18695b730 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -8510,12 +8510,12 @@ void sched_move_task(struct task_struct *tsk)
8510 if (unlikely(running)) 8510 if (unlikely(running))
8511 tsk->sched_class->put_prev_task(rq, tsk); 8511 tsk->sched_class->put_prev_task(rq, tsk);
8512 8512
8513 set_task_rq(tsk, task_cpu(tsk));
8514
8515#ifdef CONFIG_FAIR_GROUP_SCHED 8513#ifdef CONFIG_FAIR_GROUP_SCHED
8516 if (tsk->sched_class->moved_group) 8514 if (tsk->sched_class->task_move_group)
8517 tsk->sched_class->moved_group(tsk, on_rq); 8515 tsk->sched_class->task_move_group(tsk, on_rq);
8516 else
8518#endif 8517#endif
8518 set_task_rq(tsk, task_cpu(tsk));
8519 8519
8520 if (unlikely(running)) 8520 if (unlikely(running))
8521 tsk->sched_class->set_curr_task(rq); 8521 tsk->sched_class->set_curr_task(rq);
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index 933f3d1b62ea..f4f6a8326dd0 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -3869,13 +3869,26 @@ static void set_curr_task_fair(struct rq *rq)
3869} 3869}
3870 3870
3871#ifdef CONFIG_FAIR_GROUP_SCHED 3871#ifdef CONFIG_FAIR_GROUP_SCHED
3872static void moved_group_fair(struct task_struct *p, int on_rq) 3872static void task_move_group_fair(struct task_struct *p, int on_rq)
3873{ 3873{
3874 struct cfs_rq *cfs_rq = task_cfs_rq(p); 3874 /*
3875 3875 * If the task was not on the rq at the time of this cgroup movement
3876 update_curr(cfs_rq); 3876 * it must have been asleep, sleeping tasks keep their ->vruntime
3877 * absolute on their old rq until wakeup (needed for the fair sleeper
3878 * bonus in place_entity()).
3879 *
3880 * If it was on the rq, we've just 'preempted' it, which does convert
3881 * ->vruntime to a relative base.
3882 *
3883 * Make sure both cases convert their relative position when migrating
3884 * to another cgroup's rq. This does somewhat interfere with the
3885 * fair sleeper stuff for the first placement, but who cares.
3886 */
3887 if (!on_rq)
3888 p->se.vruntime -= cfs_rq_of(&p->se)->min_vruntime;
3889 set_task_rq(p, task_cpu(p));
3877 if (!on_rq) 3890 if (!on_rq)
3878 place_entity(cfs_rq, &p->se, 1); 3891 p->se.vruntime += cfs_rq_of(&p->se)->min_vruntime;
3879} 3892}
3880#endif 3893#endif
3881 3894
@@ -3927,7 +3940,7 @@ static const struct sched_class fair_sched_class = {
3927 .get_rr_interval = get_rr_interval_fair, 3940 .get_rr_interval = get_rr_interval_fair,
3928 3941
3929#ifdef CONFIG_FAIR_GROUP_SCHED 3942#ifdef CONFIG_FAIR_GROUP_SCHED
3930 .moved_group = moved_group_fair, 3943 .task_move_group = task_move_group_fair,
3931#endif 3944#endif
3932}; 3945};
3933 3946
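
Note on the sched.c and sched_fair.c hunks above: the moved_group callback becomes task_move_group, set_task_rq() moves into the fair-class implementation, and the new comment explains why: a sleeping task keeps an absolute vruntime on its old cfs_rq, a queued task a relative one, and both must be rebased when the task changes group runqueues by subtracting the old queue's min_vruntime and adding the new one's. A small standalone sketch of that rebasing arithmetic, with hypothetical numbers and a hypothetical helper name (this is not the scheduler API itself):

#include <stdio.h>
#include <stdint.h>

/* Rebase a vruntime from one per-group runqueue to another: make it relative
 * to the old queue, then absolute again against the new queue's floor. */
static uint64_t rebase_vruntime(uint64_t vruntime,
				uint64_t old_min_vruntime,
				uint64_t new_min_vruntime)
{
	vruntime -= old_min_vruntime;		/* relative position in old queue */
	return vruntime + new_min_vruntime;	/* absolute in the new queue */
}

int main(void)
{
	/* hypothetical values: the task is 2ms ahead of its old queue's floor */
	uint64_t v = rebase_vruntime(102000000ULL, 100000000ULL, 500000000ULL);

	printf("rebased vruntime: %llu\n", (unsigned long long)v); /* 502000000 */
	return 0;
}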
diff --git a/kernel/sched_stats.h b/kernel/sched_stats.h
index 25c2f962f6fc..48ddf431db0e 100644
--- a/kernel/sched_stats.h
+++ b/kernel/sched_stats.h
@@ -157,15 +157,7 @@ static inline void sched_info_reset_dequeued(struct task_struct *t)
157} 157}
158 158
159/* 159/*
160 * Called when a process is dequeued from the active array and given 160 * We are interested in knowing how long it was from the *first* time a
161 * the cpu. We should note that with the exception of interactive
162 * tasks, the expired queue will become the active queue after the active
163 * queue is empty, without explicitly dequeuing and requeuing tasks in the
164 * expired queue. (Interactive tasks may be requeued directly to the
165 * active queue, thus delaying tasks in the expired queue from running;
166 * see scheduler_tick()).
167 *
168 * Though we are interested in knowing how long it was from the *first* time a
169 * task was queued to the time that it finally hit a cpu, we call this routine 161 * task was queued to the time that it finally hit a cpu, we call this routine
170 * from dequeue_task() to account for possible rq->clock skew across cpus. The 162 * from dequeue_task() to account for possible rq->clock skew across cpus. The
171 * delta taken on each cpu would annul the skew. 163 * delta taken on each cpu would annul the skew.
@@ -203,16 +195,6 @@ static void sched_info_arrive(struct task_struct *t)
203} 195}
204 196
205/* 197/*
206 * Called when a process is queued into either the active or expired
207 * array. The time is noted and later used to determine how long we
208 * had to wait for us to reach the cpu. Since the expired queue will
209 * become the active queue after active queue is empty, without dequeuing
210 * and requeuing any tasks, we are interested in queuing to either. It
211 * is unusual but not impossible for tasks to be dequeued and immediately
212 * requeued in the same or another array: this can happen in sched_yield(),
213 * set_user_nice(), and even load_balance() as it moves tasks from runqueue
214 * to runqueue.
215 *
216 * This function is only called from enqueue_task(), but also only updates 198 * This function is only called from enqueue_task(), but also only updates
217 * the timestamp if it is already not set. It's assumed that 199 * the timestamp if it is already not set. It's assumed that
218 * sched_info_dequeued() will clear that stamp when appropriate. 200 * sched_info_dequeued() will clear that stamp when appropriate.
diff --git a/kernel/signal.c b/kernel/signal.c
index 919562c3d6b7..4e3cff10fdce 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -1105,7 +1105,8 @@ int zap_other_threads(struct task_struct *p)
1105 return count; 1105 return count;
1106} 1106}
1107 1107
1108struct sighand_struct *lock_task_sighand(struct task_struct *tsk, unsigned long *flags) 1108struct sighand_struct *__lock_task_sighand(struct task_struct *tsk,
1109 unsigned long *flags)
1109{ 1110{
1110 struct sighand_struct *sighand; 1111 struct sighand_struct *sighand;
1111 1112
@@ -1617,6 +1618,8 @@ static int sigkill_pending(struct task_struct *tsk)
1617 * is gone, we keep current->exit_code unless clear_code. 1618 * is gone, we keep current->exit_code unless clear_code.
1618 */ 1619 */
1619static void ptrace_stop(int exit_code, int clear_code, siginfo_t *info) 1620static void ptrace_stop(int exit_code, int clear_code, siginfo_t *info)
1621 __releases(&current->sighand->siglock)
1622 __acquires(&current->sighand->siglock)
1620{ 1623{
1621 if (arch_ptrace_stop_needed(exit_code, info)) { 1624 if (arch_ptrace_stop_needed(exit_code, info)) {
1622 /* 1625 /*
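
Note on the signal.c hunk: lock_task_sighand() gains its double-underscore spelling and ptrace_stop() is annotated with __releases()/__acquires() so sparse knows the siglock is dropped and retaken inside the function. A small sketch of the same annotation pattern on a hypothetical helper (wait_with_lock_dropped); the annotations themselves are the standard sparse ones from <linux/compiler.h>:

#include <linux/spinlock.h>

/* Hypothetical example: a helper that temporarily drops a lock the caller
 * holds. The annotations let sparse check lock balance across the call. */
static void wait_with_lock_dropped(spinlock_t *lock)
	__releases(lock)
	__acquires(lock)
{
	spin_unlock_irq(lock);
	/* ... sleep or do work without the lock held ... */
	spin_lock_irq(lock);
}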
diff --git a/kernel/smp.c b/kernel/smp.c
index ed6aacfcb7ef..12ed8b013e2d 100644
--- a/kernel/smp.c
+++ b/kernel/smp.c
@@ -267,7 +267,7 @@ static DEFINE_PER_CPU_SHARED_ALIGNED(struct call_single_data, csd_data);
267 * 267 *
268 * Returns 0 on success, else a negative status code. 268 * Returns 0 on success, else a negative status code.
269 */ 269 */
270int smp_call_function_single(int cpu, void (*func) (void *info), void *info, 270int smp_call_function_single(int cpu, smp_call_func_t func, void *info,
271 int wait) 271 int wait)
272{ 272{
273 struct call_single_data d = { 273 struct call_single_data d = {
@@ -336,7 +336,7 @@ EXPORT_SYMBOL(smp_call_function_single);
336 * 3) any other online cpu in @mask 336 * 3) any other online cpu in @mask
337 */ 337 */
338int smp_call_function_any(const struct cpumask *mask, 338int smp_call_function_any(const struct cpumask *mask,
339 void (*func)(void *info), void *info, int wait) 339 smp_call_func_t func, void *info, int wait)
340{ 340{
341 unsigned int cpu; 341 unsigned int cpu;
342 const struct cpumask *nodemask; 342 const struct cpumask *nodemask;
@@ -416,7 +416,7 @@ void __smp_call_function_single(int cpu, struct call_single_data *data,
416 * must be disabled when calling this function. 416 * must be disabled when calling this function.
417 */ 417 */
418void smp_call_function_many(const struct cpumask *mask, 418void smp_call_function_many(const struct cpumask *mask,
419 void (*func)(void *), void *info, bool wait) 419 smp_call_func_t func, void *info, bool wait)
420{ 420{
421 struct call_function_data *data; 421 struct call_function_data *data;
422 unsigned long flags; 422 unsigned long flags;
@@ -500,7 +500,7 @@ EXPORT_SYMBOL(smp_call_function_many);
500 * You must not call this function with disabled interrupts or from a 500 * You must not call this function with disabled interrupts or from a
501 * hardware interrupt handler or from a bottom half handler. 501 * hardware interrupt handler or from a bottom half handler.
502 */ 502 */
503int smp_call_function(void (*func)(void *), void *info, int wait) 503int smp_call_function(smp_call_func_t func, void *info, int wait)
504{ 504{
505 preempt_disable(); 505 preempt_disable();
506 smp_call_function_many(cpu_online_mask, func, info, wait); 506 smp_call_function_many(cpu_online_mask, func, info, wait);
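
Note on the smp.c hunk: the cross-call entry points switch from spelling out the function-pointer type to the smp_call_func_t typedef (added by the matching <linux/smp.h> change, which is not part of this diff); presumably it is the familiar void (*)(void *info) shape. A hedged sketch of a caller against the new signature, with hypothetical names (bump_counter, run_on_cpu1):

#include <linux/smp.h>

/* A cross-call handler has the smp_call_func_t shape: void fn(void *info). */
static void bump_counter(void *info)
{
	(*(int *)info)++;	/* runs on the target CPU */
}

static int run_on_cpu1(void)
{
	int count = 0;

	/* wait=1: return only after bump_counter has run on CPU 1 */
	return smp_call_function_single(1, bump_counter, &count, 1);
}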
diff --git a/kernel/softirq.c b/kernel/softirq.c
index 081869ed3a9f..d4d918a91881 100644
--- a/kernel/softirq.c
+++ b/kernel/softirq.c
@@ -67,7 +67,7 @@ char *softirq_to_name[NR_SOFTIRQS] = {
67 * to the pending events, so lets the scheduler to balance 67 * to the pending events, so lets the scheduler to balance
68 * the softirq load for us. 68 * the softirq load for us.
69 */ 69 */
70void wakeup_softirqd(void) 70static void wakeup_softirqd(void)
71{ 71{
72 /* Interrupts are disabled: no need to stop preemption */ 72 /* Interrupts are disabled: no need to stop preemption */
73 struct task_struct *tsk = __get_cpu_var(ksoftirqd); 73 struct task_struct *tsk = __get_cpu_var(ksoftirqd);
@@ -229,18 +229,20 @@ restart:
229 229
230 do { 230 do {
231 if (pending & 1) { 231 if (pending & 1) {
232 unsigned int vec_nr = h - softirq_vec;
232 int prev_count = preempt_count(); 233 int prev_count = preempt_count();
233 kstat_incr_softirqs_this_cpu(h - softirq_vec);
234 234
235 trace_softirq_entry(h, softirq_vec); 235 kstat_incr_softirqs_this_cpu(vec_nr);
236
237 trace_softirq_entry(vec_nr);
236 h->action(h); 238 h->action(h);
237 trace_softirq_exit(h, softirq_vec); 239 trace_softirq_exit(vec_nr);
238 if (unlikely(prev_count != preempt_count())) { 240 if (unlikely(prev_count != preempt_count())) {
239 printk(KERN_ERR "huh, entered softirq %td %s %p" 241 printk(KERN_ERR "huh, entered softirq %u %s %p"
240 "with preempt_count %08x," 242 "with preempt_count %08x,"
241 " exited with %08x?\n", h - softirq_vec, 243 " exited with %08x?\n", vec_nr,
242 softirq_to_name[h - softirq_vec], 244 softirq_to_name[vec_nr], h->action,
243 h->action, prev_count, preempt_count()); 245 prev_count, preempt_count());
244 preempt_count() = prev_count; 246 preempt_count() = prev_count;
245 } 247 }
246 248
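
Note on the softirq.c hunk: wakeup_softirqd() becomes static, and the action loop now computes the vector number once as h - softirq_vec (plain pointer subtraction over the softirq_vec array) and reuses it for the per-cpu stats, the tracepoints and the preempt-count warning. A tiny standalone illustration of the pointer-difference index, with hypothetical names:

#include <stdio.h>

struct softirq_action_demo { void (*action)(void *); };

int main(void)
{
	struct softirq_action_demo vec[10];
	struct softirq_action_demo *h = &vec[3];

	/* Pointer subtraction yields the element index, i.e. the vector nr. */
	unsigned int vec_nr = h - vec;

	printf("vec_nr = %u\n", vec_nr);	/* prints 3 */
	return 0;
}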
diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c
index 090c28812ce1..2df820b03beb 100644
--- a/kernel/stop_machine.c
+++ b/kernel/stop_machine.c
@@ -262,7 +262,7 @@ repeat:
262 cpu_stop_fn_t fn = work->fn; 262 cpu_stop_fn_t fn = work->fn;
263 void *arg = work->arg; 263 void *arg = work->arg;
264 struct cpu_stop_done *done = work->done; 264 struct cpu_stop_done *done = work->done;
265 char ksym_buf[KSYM_NAME_LEN]; 265 char ksym_buf[KSYM_NAME_LEN] __maybe_unused;
266 266
267 __set_current_state(TASK_RUNNING); 267 __set_current_state(TASK_RUNNING);
268 268
@@ -304,7 +304,7 @@ static int __cpuinit cpu_stop_cpu_callback(struct notifier_block *nfb,
304 p = kthread_create(cpu_stopper_thread, stopper, "migration/%d", 304 p = kthread_create(cpu_stopper_thread, stopper, "migration/%d",
305 cpu); 305 cpu);
306 if (IS_ERR(p)) 306 if (IS_ERR(p))
307 return NOTIFY_BAD; 307 return notifier_from_errno(PTR_ERR(p));
308 get_task_struct(p); 308 get_task_struct(p);
309 kthread_bind(p, cpu); 309 kthread_bind(p, cpu);
310 sched_set_stop_task(cpu, p); 310 sched_set_stop_task(cpu, p);
@@ -372,7 +372,7 @@ static int __init cpu_stop_init(void)
372 /* start one for the boot cpu */ 372 /* start one for the boot cpu */
373 err = cpu_stop_cpu_callback(&cpu_stop_cpu_notifier, CPU_UP_PREPARE, 373 err = cpu_stop_cpu_callback(&cpu_stop_cpu_notifier, CPU_UP_PREPARE,
374 bcpu); 374 bcpu);
375 BUG_ON(err == NOTIFY_BAD); 375 BUG_ON(err != NOTIFY_OK);
376 cpu_stop_cpu_callback(&cpu_stop_cpu_notifier, CPU_ONLINE, bcpu); 376 cpu_stop_cpu_callback(&cpu_stop_cpu_notifier, CPU_ONLINE, bcpu);
377 register_cpu_notifier(&cpu_stop_cpu_notifier); 377 register_cpu_notifier(&cpu_stop_cpu_notifier);
378 378
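
Note on the stop_machine.c hunk: cpu_stop_cpu_callback() now propagates the real error with notifier_from_errno() instead of the bare NOTIFY_BAD, and the boot-time assertion is tightened to expect NOTIFY_OK. A standalone sketch of the encode/decode arithmetic; the constants and the two helpers are reproduced here as assumptions so the example compiles on its own (in the kernel they live in <linux/notifier.h>):

#include <stdio.h>
#include <errno.h>

/* Values as commonly defined in <linux/notifier.h>; reproduced here so the
 * sketch is self-contained (assumption: they match the kernel's). */
#define NOTIFY_OK	 0x0001
#define NOTIFY_STOP_MASK 0x8000
#define NOTIFY_BAD	 (NOTIFY_STOP_MASK | 0x0002)

static int notifier_from_errno_demo(int err)
{
	if (err)
		return NOTIFY_STOP_MASK | (NOTIFY_OK - err);
	return NOTIFY_OK;
}

static int notifier_to_errno_demo(int ret)
{
	ret &= ~NOTIFY_STOP_MASK;
	return ret > NOTIFY_OK ? NOTIFY_OK - ret : 0;
}

int main(void)
{
	int ret = notifier_from_errno_demo(-ENOMEM);

	/* The callback's errno survives the notifier return value. */
	printf("encoded 0x%x -> errno %d\n", ret, notifier_to_errno_demo(ret));
	return 0;
}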
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 3a45c224770f..b65bf634035e 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -161,8 +161,6 @@ extern int no_unaligned_warning;
161extern int unaligned_dump_stack; 161extern int unaligned_dump_stack;
162#endif 162#endif
163 163
164extern struct ratelimit_state printk_ratelimit_state;
165
166#ifdef CONFIG_PROC_SYSCTL 164#ifdef CONFIG_PROC_SYSCTL
167static int proc_do_cad_pid(struct ctl_table *table, int write, 165static int proc_do_cad_pid(struct ctl_table *table, int write,
168 void __user *buffer, size_t *lenp, loff_t *ppos); 166 void __user *buffer, size_t *lenp, loff_t *ppos);
@@ -706,6 +704,15 @@ static struct ctl_table kern_table[] = {
706 }, 704 },
707#endif 705#endif
708 { 706 {
707 .procname = "dmesg_restrict",
708 .data = &dmesg_restrict,
709 .maxlen = sizeof(int),
710 .mode = 0644,
711 .proc_handler = proc_dointvec_minmax,
712 .extra1 = &zero,
713 .extra2 = &one,
714 },
715 {
709 .procname = "ngroups_max", 716 .procname = "ngroups_max",
710 .data = &ngroups_max, 717 .data = &ngroups_max,
711 .maxlen = sizeof (int), 718 .maxlen = sizeof (int),
@@ -1340,28 +1347,28 @@ static struct ctl_table fs_table[] = {
1340 .data = &inodes_stat, 1347 .data = &inodes_stat,
1341 .maxlen = 2*sizeof(int), 1348 .maxlen = 2*sizeof(int),
1342 .mode = 0444, 1349 .mode = 0444,
1343 .proc_handler = proc_dointvec, 1350 .proc_handler = proc_nr_inodes,
1344 }, 1351 },
1345 { 1352 {
1346 .procname = "inode-state", 1353 .procname = "inode-state",
1347 .data = &inodes_stat, 1354 .data = &inodes_stat,
1348 .maxlen = 7*sizeof(int), 1355 .maxlen = 7*sizeof(int),
1349 .mode = 0444, 1356 .mode = 0444,
1350 .proc_handler = proc_dointvec, 1357 .proc_handler = proc_nr_inodes,
1351 }, 1358 },
1352 { 1359 {
1353 .procname = "file-nr", 1360 .procname = "file-nr",
1354 .data = &files_stat, 1361 .data = &files_stat,
1355 .maxlen = 3*sizeof(int), 1362 .maxlen = sizeof(files_stat),
1356 .mode = 0444, 1363 .mode = 0444,
1357 .proc_handler = proc_nr_files, 1364 .proc_handler = proc_nr_files,
1358 }, 1365 },
1359 { 1366 {
1360 .procname = "file-max", 1367 .procname = "file-max",
1361 .data = &files_stat.max_files, 1368 .data = &files_stat.max_files,
1362 .maxlen = sizeof(int), 1369 .maxlen = sizeof(files_stat.max_files),
1363 .mode = 0644, 1370 .mode = 0644,
1364 .proc_handler = proc_dointvec, 1371 .proc_handler = proc_doulongvec_minmax,
1365 }, 1372 },
1366 { 1373 {
1367 .procname = "nr_open", 1374 .procname = "nr_open",
@@ -1377,7 +1384,7 @@ static struct ctl_table fs_table[] = {
1377 .data = &dentry_stat, 1384 .data = &dentry_stat,
1378 .maxlen = 6*sizeof(int), 1385 .maxlen = 6*sizeof(int),
1379 .mode = 0444, 1386 .mode = 0444,
1380 .proc_handler = proc_dointvec, 1387 .proc_handler = proc_nr_dentry,
1381 }, 1388 },
1382 { 1389 {
1383 .procname = "overflowuid", 1390 .procname = "overflowuid",
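
Note on the sysctl.c hunk: kern_table[] gains a dmesg_restrict boolean clamped to 0..1 via proc_dointvec_minmax and the shared zero/one bounds, and the file-nr/file-max entries are resized to match the files_stat fields they export (file-max now goes through proc_doulongvec_minmax). A minimal sketch of a similar 0/1 knob, with hypothetical variable and table names:

#include <linux/sysctl.h>

static int my_feature_enabled;		/* hypothetical knob */
static int zero_val;
static int one_val = 1;

static struct ctl_table my_table[] = {
	{
		.procname	= "my_feature_enabled",
		.data		= &my_feature_enabled,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_minmax,
		.extra1		= &zero_val,	/* lower bound */
		.extra2		= &one_val,	/* upper bound */
	},
	{ }				/* terminator */
};

In the hunk above the new entry is added straight into the existing kern_table[], so no separate registration call is needed for it.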
diff --git a/kernel/taskstats.c b/kernel/taskstats.c
index 11281d5792bd..c8231fb15708 100644
--- a/kernel/taskstats.c
+++ b/kernel/taskstats.c
@@ -175,22 +175,8 @@ static void send_cpu_listeners(struct sk_buff *skb,
175 up_write(&listeners->sem); 175 up_write(&listeners->sem);
176} 176}
177 177
178static int fill_pid(pid_t pid, struct task_struct *tsk, 178static void fill_stats(struct task_struct *tsk, struct taskstats *stats)
179 struct taskstats *stats)
180{ 179{
181 int rc = 0;
182
183 if (!tsk) {
184 rcu_read_lock();
185 tsk = find_task_by_vpid(pid);
186 if (tsk)
187 get_task_struct(tsk);
188 rcu_read_unlock();
189 if (!tsk)
190 return -ESRCH;
191 } else
192 get_task_struct(tsk);
193
194 memset(stats, 0, sizeof(*stats)); 180 memset(stats, 0, sizeof(*stats));
195 /* 181 /*
196 * Each accounting subsystem adds calls to its functions to 182 * Each accounting subsystem adds calls to its functions to
@@ -209,17 +195,27 @@ static int fill_pid(pid_t pid, struct task_struct *tsk,
209 195
210 /* fill in extended acct fields */ 196 /* fill in extended acct fields */
211 xacct_add_tsk(stats, tsk); 197 xacct_add_tsk(stats, tsk);
198}
212 199
213 /* Define err: label here if needed */ 200static int fill_stats_for_pid(pid_t pid, struct taskstats *stats)
214 put_task_struct(tsk); 201{
215 return rc; 202 struct task_struct *tsk;
216 203
204 rcu_read_lock();
205 tsk = find_task_by_vpid(pid);
206 if (tsk)
207 get_task_struct(tsk);
208 rcu_read_unlock();
209 if (!tsk)
210 return -ESRCH;
211 fill_stats(tsk, stats);
212 put_task_struct(tsk);
213 return 0;
217} 214}
218 215
219static int fill_tgid(pid_t tgid, struct task_struct *first, 216static int fill_stats_for_tgid(pid_t tgid, struct taskstats *stats)
220 struct taskstats *stats)
221{ 217{
222 struct task_struct *tsk; 218 struct task_struct *tsk, *first;
223 unsigned long flags; 219 unsigned long flags;
224 int rc = -ESRCH; 220 int rc = -ESRCH;
225 221
@@ -228,8 +224,7 @@ static int fill_tgid(pid_t tgid, struct task_struct *first,
228 * leaders who are already counted with the dead tasks 224 * leaders who are already counted with the dead tasks
229 */ 225 */
230 rcu_read_lock(); 226 rcu_read_lock();
231 if (!first) 227 first = find_task_by_vpid(tgid);
232 first = find_task_by_vpid(tgid);
233 228
234 if (!first || !lock_task_sighand(first, &flags)) 229 if (!first || !lock_task_sighand(first, &flags))
235 goto out; 230 goto out;
@@ -268,7 +263,6 @@ out:
268 return rc; 263 return rc;
269} 264}
270 265
271
272static void fill_tgid_exit(struct task_struct *tsk) 266static void fill_tgid_exit(struct task_struct *tsk)
273{ 267{
274 unsigned long flags; 268 unsigned long flags;
@@ -360,6 +354,12 @@ static struct taskstats *mk_reply(struct sk_buff *skb, int type, u32 pid)
360 struct nlattr *na, *ret; 354 struct nlattr *na, *ret;
361 int aggr; 355 int aggr;
362 356
357 /* If we don't pad, we end up with alignment on a 4 byte boundary.
358 * This causes lots of runtime warnings on systems requiring 8 byte
359 * alignment */
360 u32 pids[2] = { pid, 0 };
361 int pid_size = ALIGN(sizeof(pid), sizeof(long));
362
363 aggr = (type == TASKSTATS_TYPE_PID) 363 aggr = (type == TASKSTATS_TYPE_PID)
364 ? TASKSTATS_TYPE_AGGR_PID 364 ? TASKSTATS_TYPE_AGGR_PID
365 : TASKSTATS_TYPE_AGGR_TGID; 365 : TASKSTATS_TYPE_AGGR_TGID;
@@ -367,7 +367,7 @@ static struct taskstats *mk_reply(struct sk_buff *skb, int type, u32 pid)
367 na = nla_nest_start(skb, aggr); 367 na = nla_nest_start(skb, aggr);
368 if (!na) 368 if (!na)
369 goto err; 369 goto err;
370 if (nla_put(skb, type, sizeof(pid), &pid) < 0) 370 if (nla_put(skb, type, pid_size, pids) < 0)
371 goto err; 371 goto err;
372 ret = nla_reserve(skb, TASKSTATS_TYPE_STATS, sizeof(struct taskstats)); 372 ret = nla_reserve(skb, TASKSTATS_TYPE_STATS, sizeof(struct taskstats));
373 if (!ret) 373 if (!ret)
@@ -424,39 +424,46 @@ err:
424 return rc; 424 return rc;
425} 425}
426 426
427static int taskstats_user_cmd(struct sk_buff *skb, struct genl_info *info) 427static int cmd_attr_register_cpumask(struct genl_info *info)
428{ 428{
429 int rc;
430 struct sk_buff *rep_skb;
431 struct taskstats *stats;
432 size_t size;
433 cpumask_var_t mask; 429 cpumask_var_t mask;
430 int rc;
434 431
435 if (!alloc_cpumask_var(&mask, GFP_KERNEL)) 432 if (!alloc_cpumask_var(&mask, GFP_KERNEL))
436 return -ENOMEM; 433 return -ENOMEM;
437
438 rc = parse(info->attrs[TASKSTATS_CMD_ATTR_REGISTER_CPUMASK], mask); 434 rc = parse(info->attrs[TASKSTATS_CMD_ATTR_REGISTER_CPUMASK], mask);
439 if (rc < 0) 435 if (rc < 0)
440 goto free_return_rc; 436 goto out;
441 if (rc == 0) { 437 rc = add_del_listener(info->snd_pid, mask, REGISTER);
442 rc = add_del_listener(info->snd_pid, mask, REGISTER); 438out:
443 goto free_return_rc; 439 free_cpumask_var(mask);
444 } 440 return rc;
441}
442
443static int cmd_attr_deregister_cpumask(struct genl_info *info)
444{
445 cpumask_var_t mask;
446 int rc;
445 447
448 if (!alloc_cpumask_var(&mask, GFP_KERNEL))
449 return -ENOMEM;
446 rc = parse(info->attrs[TASKSTATS_CMD_ATTR_DEREGISTER_CPUMASK], mask); 450 rc = parse(info->attrs[TASKSTATS_CMD_ATTR_DEREGISTER_CPUMASK], mask);
447 if (rc < 0) 451 if (rc < 0)
448 goto free_return_rc; 452 goto out;
449 if (rc == 0) { 453 rc = add_del_listener(info->snd_pid, mask, DEREGISTER);
450 rc = add_del_listener(info->snd_pid, mask, DEREGISTER); 454out:
451free_return_rc:
452 free_cpumask_var(mask);
453 return rc;
454 }
455 free_cpumask_var(mask); 455 free_cpumask_var(mask);
456 return rc;
457}
458
459static int cmd_attr_pid(struct genl_info *info)
460{
461 struct taskstats *stats;
462 struct sk_buff *rep_skb;
463 size_t size;
464 u32 pid;
465 int rc;
456 466
457 /*
458 * Size includes space for nested attributes
459 */
460 size = nla_total_size(sizeof(u32)) + 467 size = nla_total_size(sizeof(u32)) +
461 nla_total_size(sizeof(struct taskstats)) + nla_total_size(0); 468 nla_total_size(sizeof(struct taskstats)) + nla_total_size(0);
462 469
@@ -465,33 +472,64 @@ free_return_rc:
465 return rc; 472 return rc;
466 473
467 rc = -EINVAL; 474 rc = -EINVAL;
468 if (info->attrs[TASKSTATS_CMD_ATTR_PID]) { 475 pid = nla_get_u32(info->attrs[TASKSTATS_CMD_ATTR_PID]);
469 u32 pid = nla_get_u32(info->attrs[TASKSTATS_CMD_ATTR_PID]); 476 stats = mk_reply(rep_skb, TASKSTATS_TYPE_PID, pid);
470 stats = mk_reply(rep_skb, TASKSTATS_TYPE_PID, pid); 477 if (!stats)
471 if (!stats) 478 goto err;
472 goto err; 479
473 480 rc = fill_stats_for_pid(pid, stats);
474 rc = fill_pid(pid, NULL, stats); 481 if (rc < 0)
475 if (rc < 0) 482 goto err;
476 goto err; 483 return send_reply(rep_skb, info);
477 } else if (info->attrs[TASKSTATS_CMD_ATTR_TGID]) { 484err:
478 u32 tgid = nla_get_u32(info->attrs[TASKSTATS_CMD_ATTR_TGID]); 485 nlmsg_free(rep_skb);
479 stats = mk_reply(rep_skb, TASKSTATS_TYPE_TGID, tgid); 486 return rc;
480 if (!stats) 487}
481 goto err; 488
482 489static int cmd_attr_tgid(struct genl_info *info)
483 rc = fill_tgid(tgid, NULL, stats); 490{
484 if (rc < 0) 491 struct taskstats *stats;
485 goto err; 492 struct sk_buff *rep_skb;
486 } else 493 size_t size;
494 u32 tgid;
495 int rc;
496
497 size = nla_total_size(sizeof(u32)) +
498 nla_total_size(sizeof(struct taskstats)) + nla_total_size(0);
499
500 rc = prepare_reply(info, TASKSTATS_CMD_NEW, &rep_skb, size);
501 if (rc < 0)
502 return rc;
503
504 rc = -EINVAL;
505 tgid = nla_get_u32(info->attrs[TASKSTATS_CMD_ATTR_TGID]);
506 stats = mk_reply(rep_skb, TASKSTATS_TYPE_TGID, tgid);
507 if (!stats)
487 goto err; 508 goto err;
488 509
510 rc = fill_stats_for_tgid(tgid, stats);
511 if (rc < 0)
512 goto err;
489 return send_reply(rep_skb, info); 513 return send_reply(rep_skb, info);
490err: 514err:
491 nlmsg_free(rep_skb); 515 nlmsg_free(rep_skb);
492 return rc; 516 return rc;
493} 517}
494 518
519static int taskstats_user_cmd(struct sk_buff *skb, struct genl_info *info)
520{
521 if (info->attrs[TASKSTATS_CMD_ATTR_REGISTER_CPUMASK])
522 return cmd_attr_register_cpumask(info);
523 else if (info->attrs[TASKSTATS_CMD_ATTR_DEREGISTER_CPUMASK])
524 return cmd_attr_deregister_cpumask(info);
525 else if (info->attrs[TASKSTATS_CMD_ATTR_PID])
526 return cmd_attr_pid(info);
527 else if (info->attrs[TASKSTATS_CMD_ATTR_TGID])
528 return cmd_attr_tgid(info);
529 else
530 return -EINVAL;
531}
532
495static struct taskstats *taskstats_tgid_alloc(struct task_struct *tsk) 533static struct taskstats *taskstats_tgid_alloc(struct task_struct *tsk)
496{ 534{
497 struct signal_struct *sig = tsk->signal; 535 struct signal_struct *sig = tsk->signal;
@@ -555,9 +593,7 @@ void taskstats_exit(struct task_struct *tsk, int group_dead)
555 if (!stats) 593 if (!stats)
556 goto err; 594 goto err;
557 595
558 rc = fill_pid(-1, tsk, stats); 596 fill_stats(tsk, stats);
559 if (rc < 0)
560 goto err;
561 597
562 /* 598 /*
563 * Doesn't matter if tsk is the leader or the last group member leaving 599 * Doesn't matter if tsk is the leader or the last group member leaving
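
Note on the taskstats.c hunk: taskstats_user_cmd() becomes a thin dispatcher over the four command attributes, fill_pid()/fill_tgid() turn into fill_stats()/fill_stats_for_pid()/fill_stats_for_tgid(), and mk_reply() pads the pid/tgid attribute to a long-sized boundary so the taskstats blob that follows lands 8-byte aligned on 64-bit systems. A tiny standalone check of that padding arithmetic; the rounding macro is reproduced here as an assumption (ALIGN_UP) rather than pulled from kernel headers:

#include <stdio.h>

/* Round x up to a multiple of a (a power of two); mirrors the kernel's
 * ALIGN() macro, reproduced here as an assumption. */
#define ALIGN_UP(x, a)	(((x) + ((a) - 1)) & ~((a) - 1))

int main(void)
{
	unsigned int pid_attr = sizeof(unsigned int);	/* 4-byte u32 pid */

	/* On 64-bit, padding the attribute to sizeof(long) gives 8 bytes, so
	 * the taskstats payload that follows starts 8-byte aligned. */
	printf("unpadded %u, padded %lu\n",
	       pid_attr, (unsigned long)ALIGN_UP(pid_attr, sizeof(long)));
	return 0;
}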
diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c
index 959f8d6c8cc1..7b8ec0281548 100644
--- a/kernel/trace/blktrace.c
+++ b/kernel/trace/blktrace.c
@@ -23,7 +23,6 @@
23#include <linux/mutex.h> 23#include <linux/mutex.h>
24#include <linux/slab.h> 24#include <linux/slab.h>
25#include <linux/debugfs.h> 25#include <linux/debugfs.h>
26#include <linux/smp_lock.h>
27#include <linux/time.h> 26#include <linux/time.h>
28#include <linux/uaccess.h> 27#include <linux/uaccess.h>
29 28
@@ -169,7 +168,6 @@ static int act_log_check(struct blk_trace *bt, u32 what, sector_t sector,
169static const u32 ddir_act[2] = { BLK_TC_ACT(BLK_TC_READ), 168static const u32 ddir_act[2] = { BLK_TC_ACT(BLK_TC_READ),
170 BLK_TC_ACT(BLK_TC_WRITE) }; 169 BLK_TC_ACT(BLK_TC_WRITE) };
171 170
172#define BLK_TC_HARDBARRIER BLK_TC_BARRIER
173#define BLK_TC_RAHEAD BLK_TC_AHEAD 171#define BLK_TC_RAHEAD BLK_TC_AHEAD
174 172
175/* The ilog2() calls fall out because they're constant */ 173/* The ilog2() calls fall out because they're constant */
@@ -197,7 +195,6 @@ static void __blk_add_trace(struct blk_trace *bt, sector_t sector, int bytes,
197 return; 195 return;
198 196
199 what |= ddir_act[rw & WRITE]; 197 what |= ddir_act[rw & WRITE];
200 what |= MASK_TC_BIT(rw, HARDBARRIER);
201 what |= MASK_TC_BIT(rw, SYNC); 198 what |= MASK_TC_BIT(rw, SYNC);
202 what |= MASK_TC_BIT(rw, RAHEAD); 199 what |= MASK_TC_BIT(rw, RAHEAD);
203 what |= MASK_TC_BIT(rw, META); 200 what |= MASK_TC_BIT(rw, META);
@@ -326,6 +323,7 @@ static const struct file_operations blk_dropped_fops = {
326 .owner = THIS_MODULE, 323 .owner = THIS_MODULE,
327 .open = blk_dropped_open, 324 .open = blk_dropped_open,
328 .read = blk_dropped_read, 325 .read = blk_dropped_read,
326 .llseek = default_llseek,
329}; 327};
330 328
331static int blk_msg_open(struct inode *inode, struct file *filp) 329static int blk_msg_open(struct inode *inode, struct file *filp)
@@ -365,6 +363,7 @@ static const struct file_operations blk_msg_fops = {
365 .owner = THIS_MODULE, 363 .owner = THIS_MODULE,
366 .open = blk_msg_open, 364 .open = blk_msg_open,
367 .write = blk_msg_write, 365 .write = blk_msg_write,
366 .llseek = noop_llseek,
368}; 367};
369 368
370/* 369/*
@@ -639,7 +638,6 @@ int blk_trace_ioctl(struct block_device *bdev, unsigned cmd, char __user *arg)
639 if (!q) 638 if (!q)
640 return -ENXIO; 639 return -ENXIO;
641 640
642 lock_kernel();
643 mutex_lock(&bdev->bd_mutex); 641 mutex_lock(&bdev->bd_mutex);
644 642
645 switch (cmd) { 643 switch (cmd) {
@@ -667,7 +665,6 @@ int blk_trace_ioctl(struct block_device *bdev, unsigned cmd, char __user *arg)
667 } 665 }
668 666
669 mutex_unlock(&bdev->bd_mutex); 667 mutex_unlock(&bdev->bd_mutex);
670 unlock_kernel();
671 return ret; 668 return ret;
672} 669}
673 670
@@ -1652,10 +1649,9 @@ static ssize_t sysfs_blk_trace_attr_show(struct device *dev,
1652 struct block_device *bdev; 1649 struct block_device *bdev;
1653 ssize_t ret = -ENXIO; 1650 ssize_t ret = -ENXIO;
1654 1651
1655 lock_kernel();
1656 bdev = bdget(part_devt(p)); 1652 bdev = bdget(part_devt(p));
1657 if (bdev == NULL) 1653 if (bdev == NULL)
1658 goto out_unlock_kernel; 1654 goto out;
1659 1655
1660 q = blk_trace_get_queue(bdev); 1656 q = blk_trace_get_queue(bdev);
1661 if (q == NULL) 1657 if (q == NULL)
@@ -1683,8 +1679,7 @@ out_unlock_bdev:
1683 mutex_unlock(&bdev->bd_mutex); 1679 mutex_unlock(&bdev->bd_mutex);
1684out_bdput: 1680out_bdput:
1685 bdput(bdev); 1681 bdput(bdev);
1686out_unlock_kernel: 1682out:
1687 unlock_kernel();
1688 return ret; 1683 return ret;
1689} 1684}
1690 1685
@@ -1714,11 +1709,10 @@ static ssize_t sysfs_blk_trace_attr_store(struct device *dev,
1714 1709
1715 ret = -ENXIO; 1710 ret = -ENXIO;
1716 1711
1717 lock_kernel();
1718 p = dev_to_part(dev); 1712 p = dev_to_part(dev);
1719 bdev = bdget(part_devt(p)); 1713 bdev = bdget(part_devt(p));
1720 if (bdev == NULL) 1714 if (bdev == NULL)
1721 goto out_unlock_kernel; 1715 goto out;
1722 1716
1723 q = blk_trace_get_queue(bdev); 1717 q = blk_trace_get_queue(bdev);
1724 if (q == NULL) 1718 if (q == NULL)
@@ -1753,8 +1747,6 @@ out_unlock_bdev:
1753 mutex_unlock(&bdev->bd_mutex); 1747 mutex_unlock(&bdev->bd_mutex);
1754out_bdput: 1748out_bdput:
1755 bdput(bdev); 1749 bdput(bdev);
1756out_unlock_kernel:
1757 unlock_kernel();
1758out: 1750out:
1759 return ret ? ret : count; 1751 return ret ? ret : count;
1760} 1752}
@@ -1813,8 +1805,6 @@ void blk_fill_rwbs(char *rwbs, u32 rw, int bytes)
1813 1805
1814 if (rw & REQ_RAHEAD) 1806 if (rw & REQ_RAHEAD)
1815 rwbs[i++] = 'A'; 1807 rwbs[i++] = 'A';
1816 if (rw & REQ_HARDBARRIER)
1817 rwbs[i++] = 'B';
1818 if (rw & REQ_SYNC) 1808 if (rw & REQ_SYNC)
1819 rwbs[i++] = 'S'; 1809 rwbs[i++] = 'S';
1820 if (rw & REQ_META) 1810 if (rw & REQ_META)
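
Note on the blktrace.c hunk: alongside the lock_kernel()/unlock_kernel() removal and the dropped HARDBARRIER bits, the debugfs file_operations gain explicit .llseek handlers (default_llseek where seeking makes sense, noop_llseek where it does not) rather than relying on an implicit default. A minimal sketch of the pattern with hypothetical handlers (my_debug_open, my_debug_read):

#include <linux/fs.h>
#include <linux/debugfs.h>

static int my_debug_open(struct inode *inode, struct file *file)
{
	return 0;			/* nothing to set up in this sketch */
}

static ssize_t my_debug_read(struct file *file, char __user *buf,
			     size_t count, loff_t *ppos)
{
	return 0;			/* EOF; a real handler would copy data out */
}

static const struct file_operations my_debug_fops = {
	.owner	= THIS_MODULE,
	.open	= my_debug_open,
	.read	= my_debug_read,
	.llseek	= default_llseek,	/* explicit seek behaviour */
};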
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index ebd80d50c474..f3dadae83883 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -800,6 +800,7 @@ static const struct file_operations ftrace_profile_fops = {
800 .open = tracing_open_generic, 800 .open = tracing_open_generic,
801 .read = ftrace_profile_read, 801 .read = ftrace_profile_read,
802 .write = ftrace_profile_write, 802 .write = ftrace_profile_write,
803 .llseek = default_llseek,
803}; 804};
804 805
805/* used to initialize the real stat files */ 806/* used to initialize the real stat files */
@@ -2669,6 +2670,7 @@ static const struct file_operations ftrace_graph_fops = {
2669 .read = seq_read, 2670 .read = seq_read,
2670 .write = ftrace_graph_write, 2671 .write = ftrace_graph_write,
2671 .release = ftrace_graph_release, 2672 .release = ftrace_graph_release,
2673 .llseek = seq_lseek,
2672}; 2674};
2673#endif /* CONFIG_FUNCTION_GRAPH_TRACER */ 2675#endif /* CONFIG_FUNCTION_GRAPH_TRACER */
2674 2676
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index c5a632a669e1..9ed509a015d8 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -224,6 +224,9 @@ enum {
224 RB_LEN_TIME_STAMP = 16, 224 RB_LEN_TIME_STAMP = 16,
225}; 225};
226 226
227#define skip_time_extend(event) \
228 ((struct ring_buffer_event *)((char *)event + RB_LEN_TIME_EXTEND))
229
227static inline int rb_null_event(struct ring_buffer_event *event) 230static inline int rb_null_event(struct ring_buffer_event *event)
228{ 231{
229 return event->type_len == RINGBUF_TYPE_PADDING && !event->time_delta; 232 return event->type_len == RINGBUF_TYPE_PADDING && !event->time_delta;
@@ -248,8 +251,12 @@ rb_event_data_length(struct ring_buffer_event *event)
248 return length + RB_EVNT_HDR_SIZE; 251 return length + RB_EVNT_HDR_SIZE;
249} 252}
250 253
251/* inline for ring buffer fast paths */ 254/*
252static unsigned 255 * Return the length of the given event. Will return
256 * the length of the time extend if the event is a
257 * time extend.
258 */
259static inline unsigned
253rb_event_length(struct ring_buffer_event *event) 260rb_event_length(struct ring_buffer_event *event)
254{ 261{
255 switch (event->type_len) { 262 switch (event->type_len) {
@@ -274,13 +281,41 @@ rb_event_length(struct ring_buffer_event *event)
274 return 0; 281 return 0;
275} 282}
276 283
284/*
285 * Return total length of time extend and data,
286 * or just the event length for all other events.
287 */
288static inline unsigned
289rb_event_ts_length(struct ring_buffer_event *event)
290{
291 unsigned len = 0;
292
293 if (event->type_len == RINGBUF_TYPE_TIME_EXTEND) {
294 /* time extends include the data event after it */
295 len = RB_LEN_TIME_EXTEND;
296 event = skip_time_extend(event);
297 }
298 return len + rb_event_length(event);
299}
300
277/** 301/**
278 * ring_buffer_event_length - return the length of the event 302 * ring_buffer_event_length - return the length of the event
279 * @event: the event to get the length of 303 * @event: the event to get the length of
304 *
305 * Returns the size of the data load of a data event.
306 * If the event is something other than a data event, it
307 * returns the size of the event itself. With the exception
308 * of a TIME EXTEND, where it still returns the size of the
309 * data load of the data event after it.
280 */ 310 */
281unsigned ring_buffer_event_length(struct ring_buffer_event *event) 311unsigned ring_buffer_event_length(struct ring_buffer_event *event)
282{ 312{
283 unsigned length = rb_event_length(event); 313 unsigned length;
314
315 if (event->type_len == RINGBUF_TYPE_TIME_EXTEND)
316 event = skip_time_extend(event);
317
318 length = rb_event_length(event);
284 if (event->type_len > RINGBUF_TYPE_DATA_TYPE_LEN_MAX) 319 if (event->type_len > RINGBUF_TYPE_DATA_TYPE_LEN_MAX)
285 return length; 320 return length;
286 length -= RB_EVNT_HDR_SIZE; 321 length -= RB_EVNT_HDR_SIZE;
@@ -294,6 +329,8 @@ EXPORT_SYMBOL_GPL(ring_buffer_event_length);
294static void * 329static void *
295rb_event_data(struct ring_buffer_event *event) 330rb_event_data(struct ring_buffer_event *event)
296{ 331{
332 if (event->type_len == RINGBUF_TYPE_TIME_EXTEND)
333 event = skip_time_extend(event);
297 BUG_ON(event->type_len > RINGBUF_TYPE_DATA_TYPE_LEN_MAX); 334 BUG_ON(event->type_len > RINGBUF_TYPE_DATA_TYPE_LEN_MAX);
298 /* If length is in len field, then array[0] has the data */ 335 /* If length is in len field, then array[0] has the data */
299 if (event->type_len) 336 if (event->type_len)
@@ -404,9 +441,6 @@ static inline int test_time_stamp(u64 delta)
404/* Max payload is BUF_PAGE_SIZE - header (8bytes) */ 441/* Max payload is BUF_PAGE_SIZE - header (8bytes) */
405#define BUF_MAX_DATA_SIZE (BUF_PAGE_SIZE - (sizeof(u32) * 2)) 442#define BUF_MAX_DATA_SIZE (BUF_PAGE_SIZE - (sizeof(u32) * 2))
406 443
407/* Max number of timestamps that can fit on a page */
408#define RB_TIMESTAMPS_PER_PAGE (BUF_PAGE_SIZE / RB_LEN_TIME_EXTEND)
409
410int ring_buffer_print_page_header(struct trace_seq *s) 444int ring_buffer_print_page_header(struct trace_seq *s)
411{ 445{
412 struct buffer_data_page field; 446 struct buffer_data_page field;
@@ -1546,6 +1580,25 @@ static void rb_inc_iter(struct ring_buffer_iter *iter)
1546 iter->head = 0; 1580 iter->head = 0;
1547} 1581}
1548 1582
1583/* Slow path, do not inline */
1584static noinline struct ring_buffer_event *
1585rb_add_time_stamp(struct ring_buffer_event *event, u64 delta)
1586{
1587 event->type_len = RINGBUF_TYPE_TIME_EXTEND;
1588
1589 /* Not the first event on the page? */
1590 if (rb_event_index(event)) {
1591 event->time_delta = delta & TS_MASK;
1592 event->array[0] = delta >> TS_SHIFT;
1593 } else {
1594 /* nope, just zero it */
1595 event->time_delta = 0;
1596 event->array[0] = 0;
1597 }
1598
1599 return skip_time_extend(event);
1600}
1601
1549/** 1602/**
1550 * ring_buffer_update_event - update event type and data 1603 * ring_buffer_update_event - update event type and data
1551 * @event: the even to update 1604 * @event: the even to update
@@ -1558,28 +1611,31 @@ static void rb_inc_iter(struct ring_buffer_iter *iter)
1558 * data field. 1611 * data field.
1559 */ 1612 */
1560static void 1613static void
1561rb_update_event(struct ring_buffer_event *event, 1614rb_update_event(struct ring_buffer_per_cpu *cpu_buffer,
1562 unsigned type, unsigned length) 1615 struct ring_buffer_event *event, unsigned length,
1616 int add_timestamp, u64 delta)
1563{ 1617{
1564 event->type_len = type; 1618 /* Only a commit updates the timestamp */
1565 1619 if (unlikely(!rb_event_is_commit(cpu_buffer, event)))
1566 switch (type) { 1620 delta = 0;
1567
1568 case RINGBUF_TYPE_PADDING:
1569 case RINGBUF_TYPE_TIME_EXTEND:
1570 case RINGBUF_TYPE_TIME_STAMP:
1571 break;
1572 1621
1573 case 0: 1622 /*
1574 length -= RB_EVNT_HDR_SIZE; 1623 * If we need to add a timestamp, then we
 1575 if (length > RB_MAX_SMALL_DATA || RB_FORCE_8BYTE_ALIGNMENT) 1624 * add it to the start of the reserved space.
1576 event->array[0] = length; 1625 */
1577 else 1626 if (unlikely(add_timestamp)) {
1578 event->type_len = DIV_ROUND_UP(length, RB_ALIGNMENT); 1627 event = rb_add_time_stamp(event, delta);
1579 break; 1628 length -= RB_LEN_TIME_EXTEND;
1580 default: 1629 delta = 0;
1581 BUG();
1582 } 1630 }
1631
1632 event->time_delta = delta;
1633 length -= RB_EVNT_HDR_SIZE;
1634 if (length > RB_MAX_SMALL_DATA || RB_FORCE_8BYTE_ALIGNMENT) {
1635 event->type_len = 0;
1636 event->array[0] = length;
1637 } else
1638 event->type_len = DIV_ROUND_UP(length, RB_ALIGNMENT);
1583} 1639}
1584 1640
1585/* 1641/*
@@ -1823,10 +1879,13 @@ rb_reset_tail(struct ring_buffer_per_cpu *cpu_buffer,
1823 local_sub(length, &tail_page->write); 1879 local_sub(length, &tail_page->write);
1824} 1880}
1825 1881
1826static struct ring_buffer_event * 1882/*
1883 * This is the slow path, force gcc not to inline it.
1884 */
1885static noinline struct ring_buffer_event *
1827rb_move_tail(struct ring_buffer_per_cpu *cpu_buffer, 1886rb_move_tail(struct ring_buffer_per_cpu *cpu_buffer,
1828 unsigned long length, unsigned long tail, 1887 unsigned long length, unsigned long tail,
1829 struct buffer_page *tail_page, u64 *ts) 1888 struct buffer_page *tail_page, u64 ts)
1830{ 1889{
1831 struct buffer_page *commit_page = cpu_buffer->commit_page; 1890 struct buffer_page *commit_page = cpu_buffer->commit_page;
1832 struct ring_buffer *buffer = cpu_buffer->buffer; 1891 struct ring_buffer *buffer = cpu_buffer->buffer;
@@ -1909,8 +1968,8 @@ rb_move_tail(struct ring_buffer_per_cpu *cpu_buffer,
1909 * Nested commits always have zero deltas, so 1968 * Nested commits always have zero deltas, so
1910 * just reread the time stamp 1969 * just reread the time stamp
1911 */ 1970 */
1912 *ts = rb_time_stamp(buffer); 1971 ts = rb_time_stamp(buffer);
1913 next_page->page->time_stamp = *ts; 1972 next_page->page->time_stamp = ts;
1914 } 1973 }
1915 1974
1916 out_again: 1975 out_again:
@@ -1929,12 +1988,21 @@ rb_move_tail(struct ring_buffer_per_cpu *cpu_buffer,
1929 1988
1930static struct ring_buffer_event * 1989static struct ring_buffer_event *
1931__rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer, 1990__rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer,
1932 unsigned type, unsigned long length, u64 *ts) 1991 unsigned long length, u64 ts,
1992 u64 delta, int add_timestamp)
1933{ 1993{
1934 struct buffer_page *tail_page; 1994 struct buffer_page *tail_page;
1935 struct ring_buffer_event *event; 1995 struct ring_buffer_event *event;
1936 unsigned long tail, write; 1996 unsigned long tail, write;
1937 1997
1998 /*
1999 * If the time delta since the last event is too big to
2000 * hold in the time field of the event, then we append a
2001 * TIME EXTEND event ahead of the data event.
2002 */
2003 if (unlikely(add_timestamp))
2004 length += RB_LEN_TIME_EXTEND;
2005
1938 tail_page = cpu_buffer->tail_page; 2006 tail_page = cpu_buffer->tail_page;
1939 write = local_add_return(length, &tail_page->write); 2007 write = local_add_return(length, &tail_page->write);
1940 2008
@@ -1943,7 +2011,7 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer,
1943 tail = write - length; 2011 tail = write - length;
1944 2012
1945 /* See if we shot pass the end of this buffer page */ 2013 /* See if we shot pass the end of this buffer page */
1946 if (write > BUF_PAGE_SIZE) 2014 if (unlikely(write > BUF_PAGE_SIZE))
1947 return rb_move_tail(cpu_buffer, length, tail, 2015 return rb_move_tail(cpu_buffer, length, tail,
1948 tail_page, ts); 2016 tail_page, ts);
1949 2017
@@ -1951,18 +2019,16 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer,
1951 2019
1952 event = __rb_page_index(tail_page, tail); 2020 event = __rb_page_index(tail_page, tail);
1953 kmemcheck_annotate_bitfield(event, bitfield); 2021 kmemcheck_annotate_bitfield(event, bitfield);
1954 rb_update_event(event, type, length); 2022 rb_update_event(cpu_buffer, event, length, add_timestamp, delta);
1955 2023
1956 /* The passed in type is zero for DATA */ 2024 local_inc(&tail_page->entries);
1957 if (likely(!type))
1958 local_inc(&tail_page->entries);
1959 2025
1960 /* 2026 /*
1961 * If this is the first commit on the page, then update 2027 * If this is the first commit on the page, then update
1962 * its timestamp. 2028 * its timestamp.
1963 */ 2029 */
1964 if (!tail) 2030 if (!tail)
1965 tail_page->page->time_stamp = *ts; 2031 tail_page->page->time_stamp = ts;
1966 2032
1967 return event; 2033 return event;
1968} 2034}
@@ -1977,7 +2043,7 @@ rb_try_to_discard(struct ring_buffer_per_cpu *cpu_buffer,
1977 unsigned long addr; 2043 unsigned long addr;
1978 2044
1979 new_index = rb_event_index(event); 2045 new_index = rb_event_index(event);
1980 old_index = new_index + rb_event_length(event); 2046 old_index = new_index + rb_event_ts_length(event);
1981 addr = (unsigned long)event; 2047 addr = (unsigned long)event;
1982 addr &= PAGE_MASK; 2048 addr &= PAGE_MASK;
1983 2049
@@ -2003,76 +2069,13 @@ rb_try_to_discard(struct ring_buffer_per_cpu *cpu_buffer,
2003 return 0; 2069 return 0;
2004} 2070}
2005 2071
2006static int
2007rb_add_time_stamp(struct ring_buffer_per_cpu *cpu_buffer,
2008 u64 *ts, u64 *delta)
2009{
2010 struct ring_buffer_event *event;
2011 int ret;
2012
2013 WARN_ONCE(*delta > (1ULL << 59),
2014 KERN_WARNING "Delta way too big! %llu ts=%llu write stamp = %llu\n",
2015 (unsigned long long)*delta,
2016 (unsigned long long)*ts,
2017 (unsigned long long)cpu_buffer->write_stamp);
2018
2019 /*
2020 * The delta is too big, we to add a
2021 * new timestamp.
2022 */
2023 event = __rb_reserve_next(cpu_buffer,
2024 RINGBUF_TYPE_TIME_EXTEND,
2025 RB_LEN_TIME_EXTEND,
2026 ts);
2027 if (!event)
2028 return -EBUSY;
2029
2030 if (PTR_ERR(event) == -EAGAIN)
2031 return -EAGAIN;
2032
2033 /* Only a commited time event can update the write stamp */
2034 if (rb_event_is_commit(cpu_buffer, event)) {
2035 /*
2036 * If this is the first on the page, then it was
2037 * updated with the page itself. Try to discard it
2038 * and if we can't just make it zero.
2039 */
2040 if (rb_event_index(event)) {
2041 event->time_delta = *delta & TS_MASK;
2042 event->array[0] = *delta >> TS_SHIFT;
2043 } else {
2044 /* try to discard, since we do not need this */
2045 if (!rb_try_to_discard(cpu_buffer, event)) {
2046 /* nope, just zero it */
2047 event->time_delta = 0;
2048 event->array[0] = 0;
2049 }
2050 }
2051 cpu_buffer->write_stamp = *ts;
2052 /* let the caller know this was the commit */
2053 ret = 1;
2054 } else {
2055 /* Try to discard the event */
2056 if (!rb_try_to_discard(cpu_buffer, event)) {
2057 /* Darn, this is just wasted space */
2058 event->time_delta = 0;
2059 event->array[0] = 0;
2060 }
2061 ret = 0;
2062 }
2063
2064 *delta = 0;
2065
2066 return ret;
2067}
2068
2069static void rb_start_commit(struct ring_buffer_per_cpu *cpu_buffer) 2072static void rb_start_commit(struct ring_buffer_per_cpu *cpu_buffer)
2070{ 2073{
2071 local_inc(&cpu_buffer->committing); 2074 local_inc(&cpu_buffer->committing);
2072 local_inc(&cpu_buffer->commits); 2075 local_inc(&cpu_buffer->commits);
2073} 2076}
2074 2077
2075static void rb_end_commit(struct ring_buffer_per_cpu *cpu_buffer) 2078static inline void rb_end_commit(struct ring_buffer_per_cpu *cpu_buffer)
2076{ 2079{
2077 unsigned long commits; 2080 unsigned long commits;
2078 2081
@@ -2110,9 +2113,10 @@ rb_reserve_next_event(struct ring_buffer *buffer,
2110 unsigned long length) 2113 unsigned long length)
2111{ 2114{
2112 struct ring_buffer_event *event; 2115 struct ring_buffer_event *event;
2113 u64 ts, delta = 0; 2116 u64 ts, delta;
2114 int commit = 0;
2115 int nr_loops = 0; 2117 int nr_loops = 0;
2118 int add_timestamp;
2119 u64 diff;
2116 2120
2117 rb_start_commit(cpu_buffer); 2121 rb_start_commit(cpu_buffer);
2118 2122
@@ -2133,6 +2137,9 @@ rb_reserve_next_event(struct ring_buffer *buffer,
2133 2137
2134 length = rb_calculate_event_length(length); 2138 length = rb_calculate_event_length(length);
2135 again: 2139 again:
2140 add_timestamp = 0;
2141 delta = 0;
2142
2136 /* 2143 /*
2137 * We allow for interrupts to reenter here and do a trace. 2144 * We allow for interrupts to reenter here and do a trace.
2138 * If one does, it will cause this original code to loop 2145 * If one does, it will cause this original code to loop
@@ -2146,56 +2153,32 @@ rb_reserve_next_event(struct ring_buffer *buffer,
2146 goto out_fail; 2153 goto out_fail;
2147 2154
2148 ts = rb_time_stamp(cpu_buffer->buffer); 2155 ts = rb_time_stamp(cpu_buffer->buffer);
2156 diff = ts - cpu_buffer->write_stamp;
2149 2157
2150 /* 2158 /* make sure this diff is calculated here */
2151 * Only the first commit can update the timestamp. 2159 barrier();
2152 * Yes there is a race here. If an interrupt comes in
2153 * just after the conditional and it traces too, then it
2154 * will also check the deltas. More than one timestamp may
2155 * also be made. But only the entry that did the actual
2156 * commit will be something other than zero.
2157 */
2158 if (likely(cpu_buffer->tail_page == cpu_buffer->commit_page &&
2159 rb_page_write(cpu_buffer->tail_page) ==
2160 rb_commit_index(cpu_buffer))) {
2161 u64 diff;
2162
2163 diff = ts - cpu_buffer->write_stamp;
2164
2165 /* make sure this diff is calculated here */
2166 barrier();
2167
2168 /* Did the write stamp get updated already? */
2169 if (unlikely(ts < cpu_buffer->write_stamp))
2170 goto get_event;
2171 2160
2161 /* Did the write stamp get updated already? */
2162 if (likely(ts >= cpu_buffer->write_stamp)) {
2172 delta = diff; 2163 delta = diff;
2173 if (unlikely(test_time_stamp(delta))) { 2164 if (unlikely(test_time_stamp(delta))) {
2174 2165 WARN_ONCE(delta > (1ULL << 59),
2175 commit = rb_add_time_stamp(cpu_buffer, &ts, &delta); 2166 KERN_WARNING "Delta way too big! %llu ts=%llu write stamp = %llu\n",
2176 if (commit == -EBUSY) 2167 (unsigned long long)delta,
2177 goto out_fail; 2168 (unsigned long long)ts,
2178 2169 (unsigned long long)cpu_buffer->write_stamp);
2179 if (commit == -EAGAIN) 2170 add_timestamp = 1;
2180 goto again;
2181
2182 RB_WARN_ON(cpu_buffer, commit < 0);
2183 } 2171 }
2184 } 2172 }
2185 2173
2186 get_event: 2174 event = __rb_reserve_next(cpu_buffer, length, ts,
2187 event = __rb_reserve_next(cpu_buffer, 0, length, &ts); 2175 delta, add_timestamp);
2188 if (unlikely(PTR_ERR(event) == -EAGAIN)) 2176 if (unlikely(PTR_ERR(event) == -EAGAIN))
2189 goto again; 2177 goto again;
2190 2178
2191 if (!event) 2179 if (!event)
2192 goto out_fail; 2180 goto out_fail;
2193 2181
2194 if (!rb_event_is_commit(cpu_buffer, event))
2195 delta = 0;
2196
2197 event->time_delta = delta;
2198
2199 return event; 2182 return event;
2200 2183
2201 out_fail: 2184 out_fail:
@@ -2207,13 +2190,9 @@ rb_reserve_next_event(struct ring_buffer *buffer,
2207 2190
2208#define TRACE_RECURSIVE_DEPTH 16 2191#define TRACE_RECURSIVE_DEPTH 16
2209 2192
2210static int trace_recursive_lock(void) 2193/* Keep this code out of the fast path cache */
2194static noinline void trace_recursive_fail(void)
2211{ 2195{
2212 current->trace_recursion++;
2213
2214 if (likely(current->trace_recursion < TRACE_RECURSIVE_DEPTH))
2215 return 0;
2216
2217 /* Disable all tracing before we do anything else */ 2196 /* Disable all tracing before we do anything else */
2218 tracing_off_permanent(); 2197 tracing_off_permanent();
2219 2198
@@ -2225,10 +2204,21 @@ static int trace_recursive_lock(void)
2225 in_nmi()); 2204 in_nmi());
2226 2205
2227 WARN_ON_ONCE(1); 2206 WARN_ON_ONCE(1);
2207}
2208
2209static inline int trace_recursive_lock(void)
2210{
2211 current->trace_recursion++;
2212
2213 if (likely(current->trace_recursion < TRACE_RECURSIVE_DEPTH))
2214 return 0;
2215
2216 trace_recursive_fail();
2217
2228 return -1; 2218 return -1;
2229} 2219}
2230 2220
2231static void trace_recursive_unlock(void) 2221static inline void trace_recursive_unlock(void)
2232{ 2222{
2233 WARN_ON_ONCE(!current->trace_recursion); 2223 WARN_ON_ONCE(!current->trace_recursion);
2234 2224
@@ -2308,12 +2298,28 @@ static void
2308rb_update_write_stamp(struct ring_buffer_per_cpu *cpu_buffer, 2298rb_update_write_stamp(struct ring_buffer_per_cpu *cpu_buffer,
2309 struct ring_buffer_event *event) 2299 struct ring_buffer_event *event)
2310{ 2300{
2301 u64 delta;
2302
2311 /* 2303 /*
2312 * The event first in the commit queue updates the 2304 * The event first in the commit queue updates the
2313 * time stamp. 2305 * time stamp.
2314 */ 2306 */
2315 if (rb_event_is_commit(cpu_buffer, event)) 2307 if (rb_event_is_commit(cpu_buffer, event)) {
2316 cpu_buffer->write_stamp += event->time_delta; 2308 /*
2309 * A commit event that is first on a page
2310 * updates the write timestamp with the page stamp
2311 */
2312 if (!rb_event_index(event))
2313 cpu_buffer->write_stamp =
2314 cpu_buffer->commit_page->page->time_stamp;
2315 else if (event->type_len == RINGBUF_TYPE_TIME_EXTEND) {
2316 delta = event->array[0];
2317 delta <<= TS_SHIFT;
2318 delta += event->time_delta;
2319 cpu_buffer->write_stamp += delta;
2320 } else
2321 cpu_buffer->write_stamp += event->time_delta;
2322 }
2317} 2323}
2318 2324
2319static void rb_commit(struct ring_buffer_per_cpu *cpu_buffer, 2325static void rb_commit(struct ring_buffer_per_cpu *cpu_buffer,
@@ -2353,6 +2359,9 @@ EXPORT_SYMBOL_GPL(ring_buffer_unlock_commit);
2353 2359
2354static inline void rb_event_discard(struct ring_buffer_event *event) 2360static inline void rb_event_discard(struct ring_buffer_event *event)
2355{ 2361{
2362 if (event->type_len == RINGBUF_TYPE_TIME_EXTEND)
2363 event = skip_time_extend(event);
2364
2356 /* array[0] holds the actual length for the discarded event */ 2365 /* array[0] holds the actual length for the discarded event */
2357 event->array[0] = rb_event_data_length(event) - RB_EVNT_HDR_SIZE; 2366 event->array[0] = rb_event_data_length(event) - RB_EVNT_HDR_SIZE;
2358 event->type_len = RINGBUF_TYPE_PADDING; 2367 event->type_len = RINGBUF_TYPE_PADDING;
@@ -3049,12 +3058,12 @@ rb_buffer_peek(struct ring_buffer_per_cpu *cpu_buffer, u64 *ts,
3049 3058
3050 again: 3059 again:
3051 /* 3060 /*
3052 * We repeat when a timestamp is encountered. It is possible 3061 * We repeat when a time extend is encountered.
3053 * to get multiple timestamps from an interrupt entering just 3062 * Since the time extend is always attached to a data event,
3054 * as one timestamp is about to be written, or from discarded 3063 * we should never loop more than once.
3055 * commits. The most that we can have is the number on a single page. 3064 * (We never hit the following condition more than twice).
3056 */ 3065 */
3057 if (RB_WARN_ON(cpu_buffer, ++nr_loops > RB_TIMESTAMPS_PER_PAGE)) 3066 if (RB_WARN_ON(cpu_buffer, ++nr_loops > 2))
3058 return NULL; 3067 return NULL;
3059 3068
3060 reader = rb_get_reader_page(cpu_buffer); 3069 reader = rb_get_reader_page(cpu_buffer);
@@ -3130,14 +3139,12 @@ rb_iter_peek(struct ring_buffer_iter *iter, u64 *ts)
3130 return NULL; 3139 return NULL;
3131 3140
3132 /* 3141 /*
3133 * We repeat when a timestamp is encountered. 3142 * We repeat when a time extend is encountered.
3134 * We can get multiple timestamps by nested interrupts or also 3143 * Since the time extend is always attached to a data event,
3135 * if filtering is on (discarding commits). Since discarding 3144 * we should never loop more than once.
3136 * commits can be frequent we can get a lot of timestamps. 3145 * (We never hit the following condition more than twice).
3137 * But we limit them by not adding timestamps if they begin
3138 * at the start of a page.
3139 */ 3146 */
3140 if (RB_WARN_ON(cpu_buffer, ++nr_loops > RB_TIMESTAMPS_PER_PAGE)) 3147 if (RB_WARN_ON(cpu_buffer, ++nr_loops > 2))
3141 return NULL; 3148 return NULL;
3142 3149
3143 if (rb_per_cpu_empty(cpu_buffer)) 3150 if (rb_per_cpu_empty(cpu_buffer))
@@ -3835,7 +3842,8 @@ int ring_buffer_read_page(struct ring_buffer *buffer,
3835 if (len > (commit - read)) 3842 if (len > (commit - read))
3836 len = (commit - read); 3843 len = (commit - read);
3837 3844
3838 size = rb_event_length(event); 3845 /* Always keep the time extend and data together */
3846 size = rb_event_ts_length(event);
3839 3847
3840 if (len < size) 3848 if (len < size)
3841 goto out_unlock; 3849 goto out_unlock;
@@ -3857,7 +3865,8 @@ int ring_buffer_read_page(struct ring_buffer *buffer,
3857 break; 3865 break;
3858 3866
3859 event = rb_reader_event(cpu_buffer); 3867 event = rb_reader_event(cpu_buffer);
3860 size = rb_event_length(event); 3868 /* Always keep the time extend and data together */
3869 size = rb_event_ts_length(event);
3861 } while (len > size); 3870 } while (len > size);
3862 3871
3863 /* update bpage */ 3872 /* update bpage */
@@ -3974,6 +3983,7 @@ static const struct file_operations rb_simple_fops = {
3974 .open = tracing_open_generic, 3983 .open = tracing_open_generic,
3975 .read = rb_simple_read, 3984 .read = rb_simple_read,
3976 .write = rb_simple_write, 3985 .write = rb_simple_write,
3986 .llseek = default_llseek,
3977}; 3987};
3978 3988
3979 3989
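
[Editorial note on the ring_buffer.c hunks above: a time extend is an extra event prepended to a data event when a timestamp delta no longer fits in the 27-bit time_delta field of the event header, and the reader rebuilds the full delta by recombining the two halves, which is why the peek loops can now bound nr_loops at 2. Below is a minimal standalone model of that arithmetic; TS_SHIFT and the 27-bit field width are the values used in kernel/trace/ring_buffer.c, everything else is illustrative.]

#include <stdint.h>
#include <assert.h>

#define TS_SHIFT 27	/* width of the event header's time_delta field */

/* Writer side: a delta too large for 27 bits is split across a time-extend event. */
static void split_delta(uint64_t delta, uint32_t *array0, uint32_t *time_delta)
{
	*array0 = delta >> TS_SHIFT;			/* goes into event->array[0] */
	*time_delta = delta & ((1u << TS_SHIFT) - 1);	/* goes into the header field */
}

/* Reader side: rb_update_write_stamp() and the peek paths recombine the halves. */
static uint64_t join_delta(uint32_t array0, uint32_t time_delta)
{
	return ((uint64_t)array0 << TS_SHIFT) + time_delta;
}

int main(void)
{
	uint32_t hi, lo;

	split_delta(123456789012ULL, &hi, &lo);
	assert(join_delta(hi, lo) == 123456789012ULL);
	return 0;
}
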
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 001bcd2ccf4a..82d9b8106cd0 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -3996,13 +3996,9 @@ static void tracing_init_debugfs_percpu(long cpu)
3996{ 3996{
3997 struct dentry *d_percpu = tracing_dentry_percpu(); 3997 struct dentry *d_percpu = tracing_dentry_percpu();
3998 struct dentry *d_cpu; 3998 struct dentry *d_cpu;
3999 /* strlen(cpu) + MAX(log10(cpu)) + '\0' */ 3999 char cpu_dir[30]; /* 30 characters should be more than enough */
4000 char cpu_dir[7];
4001 4000
4002 if (cpu > 999 || cpu < 0) 4001 snprintf(cpu_dir, 30, "cpu%ld", cpu);
4003 return;
4004
4005 sprintf(cpu_dir, "cpu%ld", cpu);
4006 d_cpu = debugfs_create_dir(cpu_dir, d_percpu); 4002 d_cpu = debugfs_create_dir(cpu_dir, d_percpu);
4007 if (!d_cpu) { 4003 if (!d_cpu) {
4008 pr_warning("Could not create debugfs '%s' entry\n", cpu_dir); 4004 pr_warning("Could not create debugfs '%s' entry\n", cpu_dir);
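
[Editorial note: the trace.c hunk replaces a bounded sprintf() plus an artificial 999-CPU cap with snprintf() into a larger buffer, so the directory name can never overflow whatever the CPU number is. A minimal illustration of the safer pattern; the buffer size is just the one chosen in the hunk.]

	char cpu_dir[30];

	/* snprintf() never writes past sizeof(cpu_dir) and always terminates
	 * the string, so no separate range check on 'cpu' is needed. */
	snprintf(cpu_dir, sizeof(cpu_dir), "cpu%ld", cpu);
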
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index 398c0e8b332c..0725eeab1937 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -932,6 +932,7 @@ static const struct file_operations ftrace_enable_fops = {
932 .open = tracing_open_generic, 932 .open = tracing_open_generic,
933 .read = event_enable_read, 933 .read = event_enable_read,
934 .write = event_enable_write, 934 .write = event_enable_write,
935 .llseek = default_llseek,
935}; 936};
936 937
937static const struct file_operations ftrace_event_format_fops = { 938static const struct file_operations ftrace_event_format_fops = {
@@ -944,29 +945,34 @@ static const struct file_operations ftrace_event_format_fops = {
944static const struct file_operations ftrace_event_id_fops = { 945static const struct file_operations ftrace_event_id_fops = {
945 .open = tracing_open_generic, 946 .open = tracing_open_generic,
946 .read = event_id_read, 947 .read = event_id_read,
948 .llseek = default_llseek,
947}; 949};
948 950
949static const struct file_operations ftrace_event_filter_fops = { 951static const struct file_operations ftrace_event_filter_fops = {
950 .open = tracing_open_generic, 952 .open = tracing_open_generic,
951 .read = event_filter_read, 953 .read = event_filter_read,
952 .write = event_filter_write, 954 .write = event_filter_write,
955 .llseek = default_llseek,
953}; 956};
954 957
955static const struct file_operations ftrace_subsystem_filter_fops = { 958static const struct file_operations ftrace_subsystem_filter_fops = {
956 .open = tracing_open_generic, 959 .open = tracing_open_generic,
957 .read = subsystem_filter_read, 960 .read = subsystem_filter_read,
958 .write = subsystem_filter_write, 961 .write = subsystem_filter_write,
962 .llseek = default_llseek,
959}; 963};
960 964
961static const struct file_operations ftrace_system_enable_fops = { 965static const struct file_operations ftrace_system_enable_fops = {
962 .open = tracing_open_generic, 966 .open = tracing_open_generic,
963 .read = system_enable_read, 967 .read = system_enable_read,
964 .write = system_enable_write, 968 .write = system_enable_write,
969 .llseek = default_llseek,
965}; 970};
966 971
967static const struct file_operations ftrace_show_header_fops = { 972static const struct file_operations ftrace_show_header_fops = {
968 .open = tracing_open_generic, 973 .open = tracing_open_generic,
969 .read = show_header, 974 .read = show_header,
975 .llseek = default_llseek,
970}; 976};
971 977
972static struct dentry *event_trace_events_dir(void) 978static struct dentry *event_trace_events_dir(void)
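
[Editorial note: the one-line .llseek additions in trace_events.c, trace_stack.c and ring_buffer.c spell out the seek behaviour of these debugfs files in the file_operations instead of relying on the implicit fallback. A hedged, self-contained sketch of the same pattern for a hypothetical debugfs file; foo_read and foo_fops are made up, default_llseek and simple_read_from_buffer are the real helpers.]

#include <linux/fs.h>
#include <linux/debugfs.h>

static ssize_t foo_read(struct file *filp, char __user *ubuf,
			size_t cnt, loff_t *ppos)
{
	static const char msg[] = "hello\n";

	return simple_read_from_buffer(ubuf, cnt, ppos, msg, sizeof(msg) - 1);
}

static const struct file_operations foo_fops = {
	.read	= foo_read,
	.llseek	= default_llseek,	/* spelled out, as in the hunks above */
};
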
diff --git a/kernel/trace/trace_kdb.c b/kernel/trace/trace_kdb.c
index 7b8ecd751d93..3c5c5dfea0b3 100644
--- a/kernel/trace/trace_kdb.c
+++ b/kernel/trace/trace_kdb.c
@@ -13,7 +13,6 @@
13#include <linux/kdb.h> 13#include <linux/kdb.h>
14#include <linux/ftrace.h> 14#include <linux/ftrace.h>
15 15
16#include "../debug/kdb/kdb_private.h"
17#include "trace.h" 16#include "trace.h"
18#include "trace_output.h" 17#include "trace_output.h"
19 18
diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c
index 544301d29dee..2dec9bcde8b4 100644
--- a/kernel/trace/trace_kprobe.c
+++ b/kernel/trace/trace_kprobe.c
@@ -31,7 +31,6 @@
31#include <linux/perf_event.h> 31#include <linux/perf_event.h>
32#include <linux/stringify.h> 32#include <linux/stringify.h>
33#include <linux/limits.h> 33#include <linux/limits.h>
34#include <linux/uaccess.h>
35#include <asm/bitsperlong.h> 34#include <asm/bitsperlong.h>
36 35
37#include "trace.h" 36#include "trace.h"
@@ -648,7 +647,7 @@ static int register_trace_probe(struct trace_probe *tp)
648 } 647 }
649 ret = register_probe_event(tp); 648 ret = register_probe_event(tp);
650 if (ret) { 649 if (ret) {
651 pr_warning("Faild to register probe event(%d)\n", ret); 650 pr_warning("Failed to register probe event(%d)\n", ret);
652 goto end; 651 goto end;
653 } 652 }
654 653
diff --git a/kernel/trace/trace_stack.c b/kernel/trace/trace_stack.c
index a6b7e0e0f3eb..4c5dead0c239 100644
--- a/kernel/trace/trace_stack.c
+++ b/kernel/trace/trace_stack.c
@@ -195,6 +195,7 @@ static const struct file_operations stack_max_size_fops = {
195 .open = tracing_open_generic, 195 .open = tracing_open_generic,
196 .read = stack_max_size_read, 196 .read = stack_max_size_read,
197 .write = stack_max_size_write, 197 .write = stack_max_size_write,
198 .llseek = default_llseek,
198}; 199};
199 200
200static void * 201static void *
diff --git a/kernel/tsacct.c b/kernel/tsacct.c
index 0a67e041edf8..24dc60d9fa1f 100644
--- a/kernel/tsacct.c
+++ b/kernel/tsacct.c
@@ -63,12 +63,10 @@ void bacct_add_tsk(struct taskstats *stats, struct task_struct *tsk)
63 stats->ac_ppid = pid_alive(tsk) ? 63 stats->ac_ppid = pid_alive(tsk) ?
64 rcu_dereference(tsk->real_parent)->tgid : 0; 64 rcu_dereference(tsk->real_parent)->tgid : 0;
65 rcu_read_unlock(); 65 rcu_read_unlock();
66 stats->ac_utime = cputime_to_msecs(tsk->utime) * USEC_PER_MSEC; 66 stats->ac_utime = cputime_to_usecs(tsk->utime);
67 stats->ac_stime = cputime_to_msecs(tsk->stime) * USEC_PER_MSEC; 67 stats->ac_stime = cputime_to_usecs(tsk->stime);
68 stats->ac_utimescaled = 68 stats->ac_utimescaled = cputime_to_usecs(tsk->utimescaled);
69 cputime_to_msecs(tsk->utimescaled) * USEC_PER_MSEC; 69 stats->ac_stimescaled = cputime_to_usecs(tsk->stimescaled);
70 stats->ac_stimescaled =
71 cputime_to_msecs(tsk->stimescaled) * USEC_PER_MSEC;
72 stats->ac_minflt = tsk->min_flt; 70 stats->ac_minflt = tsk->min_flt;
73 stats->ac_majflt = tsk->maj_flt; 71 stats->ac_majflt = tsk->maj_flt;
74 72
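
[Editorial note: the tsacct.c change reports CPU times with real microsecond granularity instead of rounding to milliseconds first and then scaling back up. A toy, userspace-style illustration of the difference; here "cputime" is simply nanoseconds, whereas the real cputime_t and its conversion helpers are architecture-specific.]

#include <stdio.h>

/* old path: cputime_to_msecs() * USEC_PER_MSEC -- sub-millisecond detail is lost */
static unsigned long long old_ac_utime(unsigned long long ns)
{
	return (ns / 1000000ULL) * 1000ULL;
}

/* new path: cputime_to_usecs() -- keeps microsecond resolution */
static unsigned long long new_ac_utime(unsigned long long ns)
{
	return ns / 1000ULL;
}

int main(void)
{
	unsigned long long t = 1234567;	/* 1.234567 ms of CPU time */

	printf("old: %llu us, new: %llu us\n", old_ac_utime(t), new_ac_utime(t));
	/* prints "old: 1000 us, new: 1234 us" */
	return 0;
}
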
diff --git a/kernel/user.c b/kernel/user.c
index 7e72614b736d..2c7d8d5914b1 100644
--- a/kernel/user.c
+++ b/kernel/user.c
@@ -91,6 +91,7 @@ static struct user_struct *uid_hash_find(uid_t uid, struct hlist_head *hashent)
91 * upon function exit. 91 * upon function exit.
92 */ 92 */
93static void free_user(struct user_struct *up, unsigned long flags) 93static void free_user(struct user_struct *up, unsigned long flags)
94 __releases(&uidhash_lock)
94{ 95{
95 uid_hash_remove(up); 96 uid_hash_remove(up);
96 spin_unlock_irqrestore(&uidhash_lock, flags); 97 spin_unlock_irqrestore(&uidhash_lock, flags);
diff --git a/kernel/wait.c b/kernel/wait.c
index c4bd3d825f35..b0310eb6cc1e 100644
--- a/kernel/wait.c
+++ b/kernel/wait.c
@@ -92,7 +92,7 @@ prepare_to_wait_exclusive(wait_queue_head_t *q, wait_queue_t *wait, int state)
92} 92}
93EXPORT_SYMBOL(prepare_to_wait_exclusive); 93EXPORT_SYMBOL(prepare_to_wait_exclusive);
94 94
95/* 95/**
96 * finish_wait - clean up after waiting in a queue 96 * finish_wait - clean up after waiting in a queue
97 * @q: waitqueue waited on 97 * @q: waitqueue waited on
98 * @wait: wait descriptor 98 * @wait: wait descriptor
@@ -127,11 +127,11 @@ void finish_wait(wait_queue_head_t *q, wait_queue_t *wait)
127} 127}
128EXPORT_SYMBOL(finish_wait); 128EXPORT_SYMBOL(finish_wait);
129 129
130/* 130/**
131 * abort_exclusive_wait - abort exclusive waiting in a queue 131 * abort_exclusive_wait - abort exclusive waiting in a queue
132 * @q: waitqueue waited on 132 * @q: waitqueue waited on
133 * @wait: wait descriptor 133 * @wait: wait descriptor
134 * @state: runstate of the waiter to be woken 134 * @mode: runstate of the waiter to be woken
135 * @key: key to identify a wait bit queue or %NULL 135 * @key: key to identify a wait bit queue or %NULL
136 * 136 *
137 * Sets current thread back to running state and removes 137 * Sets current thread back to running state and removes
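
[Editorial note: the wait.c hunks only change comment syntax. Opening the blocks with two asterisks turns them into kernel-doc, which scripts/kernel-doc parses into the generated API documentation, and @mode now matches the actual parameter name. For reference, the shape such a block takes; my_helper is a made-up example.]

/**
 * my_helper - one-line summary shown in the generated documentation
 * @mode: parameter description; the name must match the prototype exactly
 *
 * Free-form description follows the parameter list.
 */
static int my_helper(int mode)
{
	return mode;
}
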
diff --git a/kernel/watchdog.c b/kernel/watchdog.c
index 94ca779aa9c2..14b8120d5232 100644
--- a/kernel/watchdog.c
+++ b/kernel/watchdog.c
@@ -43,7 +43,7 @@ static DEFINE_PER_CPU(unsigned long, hrtimer_interrupts_saved);
43static DEFINE_PER_CPU(struct perf_event *, watchdog_ev); 43static DEFINE_PER_CPU(struct perf_event *, watchdog_ev);
44#endif 44#endif
45 45
46static int __initdata no_watchdog; 46static int no_watchdog;
47 47
48 48
49/* boot commands */ 49/* boot commands */
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index f77afd939229..90db1bd1a978 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -42,9 +42,6 @@
42#include <linux/lockdep.h> 42#include <linux/lockdep.h>
43#include <linux/idr.h> 43#include <linux/idr.h>
44 44
45#define CREATE_TRACE_POINTS
46#include <trace/events/workqueue.h>
47
48#include "workqueue_sched.h" 45#include "workqueue_sched.h"
49 46
50enum { 47enum {
@@ -257,6 +254,9 @@ EXPORT_SYMBOL_GPL(system_long_wq);
257EXPORT_SYMBOL_GPL(system_nrt_wq); 254EXPORT_SYMBOL_GPL(system_nrt_wq);
258EXPORT_SYMBOL_GPL(system_unbound_wq); 255EXPORT_SYMBOL_GPL(system_unbound_wq);
259 256
257#define CREATE_TRACE_POINTS
258#include <trace/events/workqueue.h>
259
260#define for_each_busy_worker(worker, i, pos, gcwq) \ 260#define for_each_busy_worker(worker, i, pos, gcwq) \
261 for (i = 0; i < BUSY_WORKER_HASH_SIZE; i++) \ 261 for (i = 0; i < BUSY_WORKER_HASH_SIZE; i++) \
262 hlist_for_each_entry(worker, pos, &gcwq->busy_hash[i], hentry) 262 hlist_for_each_entry(worker, pos, &gcwq->busy_hash[i], hentry)
@@ -310,21 +310,6 @@ static inline int __next_wq_cpu(int cpu, const struct cpumask *mask,
310 (cpu) < WORK_CPU_NONE; \ 310 (cpu) < WORK_CPU_NONE; \
311 (cpu) = __next_wq_cpu((cpu), cpu_possible_mask, (wq))) 311 (cpu) = __next_wq_cpu((cpu), cpu_possible_mask, (wq)))
312 312
313#ifdef CONFIG_LOCKDEP
314/**
315 * in_workqueue_context() - in context of specified workqueue?
316 * @wq: the workqueue of interest
317 *
318 * Checks lockdep state to see if the current task is executing from
319 * within a workqueue item. This function exists only if lockdep is
320 * enabled.
321 */
322int in_workqueue_context(struct workqueue_struct *wq)
323{
324 return lock_is_held(&wq->lockdep_map);
325}
326#endif
327
328#ifdef CONFIG_DEBUG_OBJECTS_WORK 313#ifdef CONFIG_DEBUG_OBJECTS_WORK
329 314
330static struct debug_obj_descr work_debug_descr; 315static struct debug_obj_descr work_debug_descr;
@@ -604,7 +589,9 @@ static bool keep_working(struct global_cwq *gcwq)
604{ 589{
605 atomic_t *nr_running = get_gcwq_nr_running(gcwq->cpu); 590 atomic_t *nr_running = get_gcwq_nr_running(gcwq->cpu);
606 591
607 return !list_empty(&gcwq->worklist) && atomic_read(nr_running) <= 1; 592 return !list_empty(&gcwq->worklist) &&
593 (atomic_read(nr_running) <= 1 ||
594 gcwq->flags & GCWQ_HIGHPRI_PENDING);
608} 595}
609 596
610/* Do we need a new worker? Called from manager. */ 597/* Do we need a new worker? Called from manager. */
@@ -997,6 +984,7 @@ static void __queue_work(unsigned int cpu, struct workqueue_struct *wq,
997 984
998 /* gcwq determined, get cwq and queue */ 985 /* gcwq determined, get cwq and queue */
999 cwq = get_cwq(gcwq->cpu, wq); 986 cwq = get_cwq(gcwq->cpu, wq);
987 trace_workqueue_queue_work(cpu, cwq, work);
1000 988
1001 BUG_ON(!list_empty(&work->entry)); 989 BUG_ON(!list_empty(&work->entry));
1002 990
@@ -1004,6 +992,7 @@ static void __queue_work(unsigned int cpu, struct workqueue_struct *wq,
1004 work_flags = work_color_to_flags(cwq->work_color); 992 work_flags = work_color_to_flags(cwq->work_color);
1005 993
1006 if (likely(cwq->nr_active < cwq->max_active)) { 994 if (likely(cwq->nr_active < cwq->max_active)) {
995 trace_workqueue_activate_work(work);
1007 cwq->nr_active++; 996 cwq->nr_active++;
1008 worklist = gcwq_determine_ins_pos(gcwq, cwq); 997 worklist = gcwq_determine_ins_pos(gcwq, cwq);
1009 } else { 998 } else {
@@ -1679,6 +1668,7 @@ static void cwq_activate_first_delayed(struct cpu_workqueue_struct *cwq)
1679 struct work_struct, entry); 1668 struct work_struct, entry);
1680 struct list_head *pos = gcwq_determine_ins_pos(cwq->gcwq, cwq); 1669 struct list_head *pos = gcwq_determine_ins_pos(cwq->gcwq, cwq);
1681 1670
1671 trace_workqueue_activate_work(work);
1682 move_linked_works(work, pos, NULL); 1672 move_linked_works(work, pos, NULL);
1683 __clear_bit(WORK_STRUCT_DELAYED_BIT, work_data_bits(work)); 1673 __clear_bit(WORK_STRUCT_DELAYED_BIT, work_data_bits(work));
1684 cwq->nr_active++; 1674 cwq->nr_active++;
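
[Editorial note on the workqueue.c tracing hunks so far: the CREATE_TRACE_POINTS block is moved below the definitions of the system workqueues so that trace/events/workqueue.h is instantiated after everything it references, and two new tracepoints record when a work item is queued and when it actually becomes active, either immediately in __queue_work() or later from the delayed list. Condensed, the pattern looks like this; all identifiers are the ones visible in the hunks above.]

/* Exactly one .c file defines the tracepoint bodies, after the symbols the
 * trace events need are in scope: */
#define CREATE_TRACE_POINTS
#include <trace/events/workqueue.h>

/* Call sites elsewhere in workqueue.c then just invoke the generated stubs: */
trace_workqueue_queue_work(cpu, cwq, work);	/* __queue_work() */
trace_workqueue_activate_work(work);		/* __queue_work() and
						 * cwq_activate_first_delayed() */
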
@@ -2074,7 +2064,7 @@ static void insert_wq_barrier(struct cpu_workqueue_struct *cwq,
2074 * checks and call back into the fixup functions where we 2064 * checks and call back into the fixup functions where we
2075 * might deadlock. 2065 * might deadlock.
2076 */ 2066 */
2077 INIT_WORK_ON_STACK(&barr->work, wq_barrier_func); 2067 INIT_WORK_ONSTACK(&barr->work, wq_barrier_func);
2078 __set_bit(WORK_STRUCT_PENDING_BIT, work_data_bits(&barr->work)); 2068 __set_bit(WORK_STRUCT_PENDING_BIT, work_data_bits(&barr->work));
2079 init_completion(&barr->done); 2069 init_completion(&barr->done);
2080 2070
@@ -2326,27 +2316,17 @@ out_unlock:
2326} 2316}
2327EXPORT_SYMBOL_GPL(flush_workqueue); 2317EXPORT_SYMBOL_GPL(flush_workqueue);
2328 2318
2329/** 2319static bool start_flush_work(struct work_struct *work, struct wq_barrier *barr,
2330 * flush_work - block until a work_struct's callback has terminated 2320 bool wait_executing)
2331 * @work: the work which is to be flushed
2332 *
2333 * Returns false if @work has already terminated.
2334 *
2335 * It is expected that, prior to calling flush_work(), the caller has
2336 * arranged for the work to not be requeued, otherwise it doesn't make
2337 * sense to use this function.
2338 */
2339int flush_work(struct work_struct *work)
2340{ 2321{
2341 struct worker *worker = NULL; 2322 struct worker *worker = NULL;
2342 struct global_cwq *gcwq; 2323 struct global_cwq *gcwq;
2343 struct cpu_workqueue_struct *cwq; 2324 struct cpu_workqueue_struct *cwq;
2344 struct wq_barrier barr;
2345 2325
2346 might_sleep(); 2326 might_sleep();
2347 gcwq = get_work_gcwq(work); 2327 gcwq = get_work_gcwq(work);
2348 if (!gcwq) 2328 if (!gcwq)
2349 return 0; 2329 return false;
2350 2330
2351 spin_lock_irq(&gcwq->lock); 2331 spin_lock_irq(&gcwq->lock);
2352 if (!list_empty(&work->entry)) { 2332 if (!list_empty(&work->entry)) {
@@ -2359,28 +2339,127 @@ int flush_work(struct work_struct *work)
2359 cwq = get_work_cwq(work); 2339 cwq = get_work_cwq(work);
2360 if (unlikely(!cwq || gcwq != cwq->gcwq)) 2340 if (unlikely(!cwq || gcwq != cwq->gcwq))
2361 goto already_gone; 2341 goto already_gone;
2362 } else { 2342 } else if (wait_executing) {
2363 worker = find_worker_executing_work(gcwq, work); 2343 worker = find_worker_executing_work(gcwq, work);
2364 if (!worker) 2344 if (!worker)
2365 goto already_gone; 2345 goto already_gone;
2366 cwq = worker->current_cwq; 2346 cwq = worker->current_cwq;
2367 } 2347 } else
2348 goto already_gone;
2368 2349
2369 insert_wq_barrier(cwq, &barr, work, worker); 2350 insert_wq_barrier(cwq, barr, work, worker);
2370 spin_unlock_irq(&gcwq->lock); 2351 spin_unlock_irq(&gcwq->lock);
2371 2352
2372 lock_map_acquire(&cwq->wq->lockdep_map); 2353 lock_map_acquire(&cwq->wq->lockdep_map);
2373 lock_map_release(&cwq->wq->lockdep_map); 2354 lock_map_release(&cwq->wq->lockdep_map);
2374 2355 return true;
2375 wait_for_completion(&barr.done);
2376 destroy_work_on_stack(&barr.work);
2377 return 1;
2378already_gone: 2356already_gone:
2379 spin_unlock_irq(&gcwq->lock); 2357 spin_unlock_irq(&gcwq->lock);
2380 return 0; 2358 return false;
2359}
2360
2361/**
2362 * flush_work - wait for a work to finish executing the last queueing instance
2363 * @work: the work to flush
2364 *
2365 * Wait until @work has finished execution. This function considers
2366 * only the last queueing instance of @work. If @work has been
2367 * enqueued across different CPUs on a non-reentrant workqueue or on
2368 * multiple workqueues, @work might still be executing on return on
2369 * some of the CPUs from earlier queueing.
2370 *
2371 * If @work was queued only on a non-reentrant, ordered or unbound
2372 * workqueue, @work is guaranteed to be idle on return if it hasn't
2373 * been requeued since flush started.
2374 *
2375 * RETURNS:
2376 * %true if flush_work() waited for the work to finish execution,
2377 * %false if it was already idle.
2378 */
2379bool flush_work(struct work_struct *work)
2380{
2381 struct wq_barrier barr;
2382
2383 if (start_flush_work(work, &barr, true)) {
2384 wait_for_completion(&barr.done);
2385 destroy_work_on_stack(&barr.work);
2386 return true;
2387 } else
2388 return false;
2381} 2389}
2382EXPORT_SYMBOL_GPL(flush_work); 2390EXPORT_SYMBOL_GPL(flush_work);
2383 2391
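
[Editorial note: flush_work() now returns bool and, per the new kernel-doc above, only guarantees that the last queueing instance of the work has finished. Callers that want to know whether they actually waited can use the return value. A hedged usage sketch; my_work, my_work_fn and my_teardown are invented for the example.]

#include <linux/workqueue.h>
#include <linux/kernel.h>

static void my_work_fn(struct work_struct *work)
{
	/* do something */
}
static DECLARE_WORK(my_work, my_work_fn);

static void my_teardown(void)
{
	if (flush_work(&my_work))
		pr_debug("waited for my_work to finish\n");
	else
		pr_debug("my_work was already idle\n");
}
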
2392static bool wait_on_cpu_work(struct global_cwq *gcwq, struct work_struct *work)
2393{
2394 struct wq_barrier barr;
2395 struct worker *worker;
2396
2397 spin_lock_irq(&gcwq->lock);
2398
2399 worker = find_worker_executing_work(gcwq, work);
2400 if (unlikely(worker))
2401 insert_wq_barrier(worker->current_cwq, &barr, work, worker);
2402
2403 spin_unlock_irq(&gcwq->lock);
2404
2405 if (unlikely(worker)) {
2406 wait_for_completion(&barr.done);
2407 destroy_work_on_stack(&barr.work);
2408 return true;
2409 } else
2410 return false;
2411}
2412
2413static bool wait_on_work(struct work_struct *work)
2414{
2415 bool ret = false;
2416 int cpu;
2417
2418 might_sleep();
2419
2420 lock_map_acquire(&work->lockdep_map);
2421 lock_map_release(&work->lockdep_map);
2422
2423 for_each_gcwq_cpu(cpu)
2424 ret |= wait_on_cpu_work(get_gcwq(cpu), work);
2425 return ret;
2426}
2427
2428/**
2429 * flush_work_sync - wait until a work has finished execution
2430 * @work: the work to flush
2431 *
2432 * Wait until @work has finished execution. On return, it's
2433 * guaranteed that all queueing instances of @work which happened
2434 * before this function is called are finished. In other words, if
2435 * @work hasn't been requeued since this function was called, @work is
2436 * guaranteed to be idle on return.
2437 *
2438 * RETURNS:
2439 * %true if flush_work_sync() waited for the work to finish execution,
2440 * %false if it was already idle.
2441 */
2442bool flush_work_sync(struct work_struct *work)
2443{
2444 struct wq_barrier barr;
2445 bool pending, waited;
2446
2447 /* we'll wait for executions separately, queue barr only if pending */
2448 pending = start_flush_work(work, &barr, false);
2449
2450 /* wait for executions to finish */
2451 waited = wait_on_work(work);
2452
2453 /* wait for the pending one */
2454 if (pending) {
2455 wait_for_completion(&barr.done);
2456 destroy_work_on_stack(&barr.work);
2457 }
2458
2459 return pending || waited;
2460}
2461EXPORT_SYMBOL_GPL(flush_work_sync);
2462
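
[Editorial note: flush_work_sync() is the stronger variant introduced above. It queues a barrier for a still-pending instance and additionally waits, via wait_on_work(), for any execution of the work on every gcwq, so every queueing that happened before the call is finished on return. A hedged sketch of where that distinction matters; struct my_obj and my_obj_free are invented.]

#include <linux/workqueue.h>
#include <linux/slab.h>

struct my_obj {
	struct work_struct work;
	/* ... */
};

static void my_obj_free(struct my_obj *obj)
{
	/* The work may have been queued from several CPUs on a reentrant
	 * workqueue; flush_work() alone could return while an older
	 * instance is still running, flush_work_sync() cannot. */
	flush_work_sync(&obj->work);
	kfree(obj);
}
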
2384/* 2463/*
2385 * Upon a successful return (>= 0), the caller "owns" WORK_STRUCT_PENDING bit, 2464 * Upon a successful return (>= 0), the caller "owns" WORK_STRUCT_PENDING bit,
2386 * so this work can't be re-armed in any way. 2465 * so this work can't be re-armed in any way.
@@ -2423,39 +2502,7 @@ static int try_to_grab_pending(struct work_struct *work)
2423 return ret; 2502 return ret;
2424} 2503}
2425 2504
2426static void wait_on_cpu_work(struct global_cwq *gcwq, struct work_struct *work) 2505static bool __cancel_work_timer(struct work_struct *work,
2427{
2428 struct wq_barrier barr;
2429 struct worker *worker;
2430
2431 spin_lock_irq(&gcwq->lock);
2432
2433 worker = find_worker_executing_work(gcwq, work);
2434 if (unlikely(worker))
2435 insert_wq_barrier(worker->current_cwq, &barr, work, worker);
2436
2437 spin_unlock_irq(&gcwq->lock);
2438
2439 if (unlikely(worker)) {
2440 wait_for_completion(&barr.done);
2441 destroy_work_on_stack(&barr.work);
2442 }
2443}
2444
2445static void wait_on_work(struct work_struct *work)
2446{
2447 int cpu;
2448
2449 might_sleep();
2450
2451 lock_map_acquire(&work->lockdep_map);
2452 lock_map_release(&work->lockdep_map);
2453
2454 for_each_gcwq_cpu(cpu)
2455 wait_on_cpu_work(get_gcwq(cpu), work);
2456}
2457
2458static int __cancel_work_timer(struct work_struct *work,
2459 struct timer_list* timer) 2506 struct timer_list* timer)
2460{ 2507{
2461 int ret; 2508 int ret;
@@ -2472,42 +2519,81 @@ static int __cancel_work_timer(struct work_struct *work,
2472} 2519}
2473 2520
2474/** 2521/**
2475 * cancel_work_sync - block until a work_struct's callback has terminated 2522 * cancel_work_sync - cancel a work and wait for it to finish
2476 * @work: the work which is to be flushed 2523 * @work: the work to cancel
2477 *
2478 * Returns true if @work was pending.
2479 * 2524 *
2480 * cancel_work_sync() will cancel the work if it is queued. If the work's 2525 * Cancel @work and wait for its execution to finish. This function
2481 * callback appears to be running, cancel_work_sync() will block until it 2526 * can be used even if the work re-queues itself or migrates to
2482 * has completed. 2527 * another workqueue. On return from this function, @work is
2528 * guaranteed to be not pending or executing on any CPU.
2483 * 2529 *
2484 * It is possible to use this function if the work re-queues itself. It can 2530 * cancel_work_sync(&delayed_work->work) must not be used for
2485 * cancel the work even if it migrates to another workqueue, however in that 2531 * delayed_work's. Use cancel_delayed_work_sync() instead.
2486 * case it only guarantees that work->func() has completed on the last queued
2487 * workqueue.
2488 * 2532 *
2489 * cancel_work_sync(&delayed_work->work) should be used only if ->timer is not 2533 * The caller must ensure that the workqueue on which @work was last
2490 * pending, otherwise it goes into a busy-wait loop until the timer expires.
2491 *
2492 * The caller must ensure that workqueue_struct on which this work was last
2493 * queued can't be destroyed before this function returns. 2534 * queued can't be destroyed before this function returns.
2535 *
2536 * RETURNS:
2537 * %true if @work was pending, %false otherwise.
2494 */ 2538 */
2495int cancel_work_sync(struct work_struct *work) 2539bool cancel_work_sync(struct work_struct *work)
2496{ 2540{
2497 return __cancel_work_timer(work, NULL); 2541 return __cancel_work_timer(work, NULL);
2498} 2542}
2499EXPORT_SYMBOL_GPL(cancel_work_sync); 2543EXPORT_SYMBOL_GPL(cancel_work_sync);
2500 2544
2501/** 2545/**
2502 * cancel_delayed_work_sync - reliably kill off a delayed work. 2546 * flush_delayed_work - wait for a dwork to finish executing the last queueing
2503 * @dwork: the delayed work struct 2547 * @dwork: the delayed work to flush
2548 *
2549 * Delayed timer is cancelled and the pending work is queued for
2550 * immediate execution. Like flush_work(), this function only
2551 * considers the last queueing instance of @dwork.
2552 *
2553 * RETURNS:
2554 * %true if flush_work() waited for the work to finish execution,
2555 * %false if it was already idle.
2556 */
2557bool flush_delayed_work(struct delayed_work *dwork)
2558{
2559 if (del_timer_sync(&dwork->timer))
2560 __queue_work(raw_smp_processor_id(),
2561 get_work_cwq(&dwork->work)->wq, &dwork->work);
2562 return flush_work(&dwork->work);
2563}
2564EXPORT_SYMBOL(flush_delayed_work);
2565
2566/**
2567 * flush_delayed_work_sync - wait for a dwork to finish
2568 * @dwork: the delayed work to flush
2504 * 2569 *
2505 * Returns true if @dwork was pending. 2570 * Delayed timer is cancelled and the pending work is queued for
2571 * execution immediately. Other than timer handling, its behavior
2572 * is identical to flush_work_sync().
2506 * 2573 *
2507 * It is possible to use this function if @dwork rearms itself via queue_work() 2574 * RETURNS:
2508 * or queue_delayed_work(). See also the comment for cancel_work_sync(). 2575 * %true if flush_work_sync() waited for the work to finish execution,
2576 * %false if it was already idle.
2509 */ 2577 */
2510int cancel_delayed_work_sync(struct delayed_work *dwork) 2578bool flush_delayed_work_sync(struct delayed_work *dwork)
2579{
2580 if (del_timer_sync(&dwork->timer))
2581 __queue_work(raw_smp_processor_id(),
2582 get_work_cwq(&dwork->work)->wq, &dwork->work);
2583 return flush_work_sync(&dwork->work);
2584}
2585EXPORT_SYMBOL(flush_delayed_work_sync);
2586
2587/**
2588 * cancel_delayed_work_sync - cancel a delayed work and wait for it to finish
 2589 * @dwork: the delayed work to cancel
2590 *
2591 * This is cancel_work_sync() for delayed works.
2592 *
2593 * RETURNS:
2594 * %true if @dwork was pending, %false otherwise.
2595 */
2596bool cancel_delayed_work_sync(struct delayed_work *dwork)
2511{ 2597{
2512 return __cancel_work_timer(&dwork->work, &dwork->timer); 2598 return __cancel_work_timer(&dwork->work, &dwork->timer);
2513} 2599}
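
[Editorial note: for delayed work the series ends up with two complementary shutdown helpers. cancel_delayed_work_sync() drops a pending timer or queueing and waits for a running instance, while flush_delayed_work() and flush_delayed_work_sync() fire the timer immediately and then wait for that execution. A hedged sketch of choosing between them; the identifiers are invented.]

#include <linux/workqueue.h>

static void my_dwork_fn(struct work_struct *work)
{
	/* periodic housekeeping */
}
static DECLARE_DELAYED_WORK(my_dwork, my_dwork_fn);

static void my_shutdown(bool run_once_more)
{
	if (run_once_more)
		flush_delayed_work_sync(&my_dwork);	/* run now, then wait */
	else
		cancel_delayed_work_sync(&my_dwork);	/* drop pending, wait for running */
}
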
@@ -2559,23 +2645,6 @@ int schedule_delayed_work(struct delayed_work *dwork,
2559EXPORT_SYMBOL(schedule_delayed_work); 2645EXPORT_SYMBOL(schedule_delayed_work);
2560 2646
2561/** 2647/**
2562 * flush_delayed_work - block until a dwork_struct's callback has terminated
2563 * @dwork: the delayed work which is to be flushed
2564 *
2565 * Any timeout is cancelled, and any pending work is run immediately.
2566 */
2567void flush_delayed_work(struct delayed_work *dwork)
2568{
2569 if (del_timer_sync(&dwork->timer)) {
2570 __queue_work(get_cpu(), get_work_cwq(&dwork->work)->wq,
2571 &dwork->work);
2572 put_cpu();
2573 }
2574 flush_work(&dwork->work);
2575}
2576EXPORT_SYMBOL(flush_delayed_work);
2577
2578/**
2579 * schedule_delayed_work_on - queue work in global workqueue on CPU after delay 2648 * schedule_delayed_work_on - queue work in global workqueue on CPU after delay
2580 * @cpu: cpu to use 2649 * @cpu: cpu to use
2581 * @dwork: job to be done 2650 * @dwork: job to be done
@@ -2592,13 +2661,15 @@ int schedule_delayed_work_on(int cpu,
2592EXPORT_SYMBOL(schedule_delayed_work_on); 2661EXPORT_SYMBOL(schedule_delayed_work_on);
2593 2662
2594/** 2663/**
2595 * schedule_on_each_cpu - call a function on each online CPU from keventd 2664 * schedule_on_each_cpu - execute a function synchronously on each online CPU
2596 * @func: the function to call 2665 * @func: the function to call
2597 * 2666 *
2598 * Returns zero on success. 2667 * schedule_on_each_cpu() executes @func on each online CPU using the
2599 * Returns -ve errno on failure. 2668 * system workqueue and blocks until all CPUs have completed.
2600 *
2601 * schedule_on_each_cpu() is very slow. 2669 * schedule_on_each_cpu() is very slow.
2670 *
2671 * RETURNS:
2672 * 0 on success, -errno on failure.
2602 */ 2673 */
2603int schedule_on_each_cpu(work_func_t func) 2674int schedule_on_each_cpu(work_func_t func)
2604{ 2675{
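
[Editorial note: schedule_on_each_cpu(), whose kernel-doc is tightened above, runs the function once on every online CPU via the system workqueue and only returns after all of them have completed, which is why it is documented as very slow. A hedged sketch; my_counter and the bump_* helpers are invented.]

#include <linux/workqueue.h>
#include <linux/percpu.h>

static DEFINE_PER_CPU(unsigned long, my_counter);

static void bump_counter(struct work_struct *unused)
{
	this_cpu_inc(my_counter);
}

static int bump_all_cpus(void)
{
	/* blocks until every online CPU has run bump_counter(); 0 or -errno */
	return schedule_on_each_cpu(bump_counter);
}
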
@@ -2720,7 +2791,9 @@ static int alloc_cwqs(struct workqueue_struct *wq)
2720 } 2791 }
2721 } 2792 }
2722 2793
2723 /* just in case, make sure it's actually aligned */ 2794 /* just in case, make sure it's actually aligned
2795 * - this is affected by PERCPU() alignment in vmlinux.lds.S
2796 */
2724 BUG_ON(!IS_ALIGNED(wq->cpu_wq.v, align)); 2797 BUG_ON(!IS_ALIGNED(wq->cpu_wq.v, align));
2725 return wq->cpu_wq.v ? 0 : -ENOMEM; 2798 return wq->cpu_wq.v ? 0 : -ENOMEM;
2726} 2799}
@@ -2764,6 +2837,13 @@ struct workqueue_struct *__alloc_workqueue_key(const char *name,
2764 unsigned int cpu; 2837 unsigned int cpu;
2765 2838
2766 /* 2839 /*
2840 * Workqueues which may be used during memory reclaim should
2841 * have a rescuer to guarantee forward progress.
2842 */
2843 if (flags & WQ_MEM_RECLAIM)
2844 flags |= WQ_RESCUER;
2845
2846 /*
2767 * Unbound workqueues aren't concurrency managed and should be 2847 * Unbound workqueues aren't concurrency managed and should be
2768 * dispatched to workers immediately. 2848 * dispatched to workers immediately.
2769 */ 2849 */